Explorar el Código

add tabix region fetch: fetch_tabix_lines_with, fetch_bed, fetch_vcf

Core: fetch_tabix_lines_with in readers.rs — zero-allocation tabix fetch.
Reuses a single String buffer across all lines; the callback receives &str
so no intermediate Vec<String> is ever built. Avoids the double-allocation
that a Vec<String> + Vec<T> collect() pattern would incur.

  fetch_tabix_lines_with(bgz_path, region, |line| { ... })  -- core, callback
  fetch_tabix_lines(bgz_path, region)                       -- convenience Vec<String>

Typed wrappers (call fetch_tabix_lines_with directly, single allocation):
  fetch_bed(bgz_path, region) -> Vec<BedRow>    in bed.rs
    - parses each line as BedRow
    - applies secondary overlap filter to discard tabix-bin false positives
  fetch_vcf(bgz_path, region) -> Vec<VcfVariant> in vcf.rs
    - parses each line as VcfVariant

Coordinate conversion in fetch_tabix_lines_with (0-based half-open → 1-based):
  tabix_start = region.range.start + 1
  tabix_end   = region.range.end      (0-based exclusive = 1-based inclusive)
Uses noodles_core::region::Interval::from(start..=end) for the query call.
Missing contig in the index returns an empty result (not an error).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas hace 1 mes
padre
commit
8806fc5b0a
Se han modificado 3 ficheros con 213 adiciones y 5 borrados
  1. 40 1
      src/io/bed.rs
  2. 134 3
      src/io/readers.rs
  3. 39 1
      src/io/vcf.rs

+ 40 - 1
src/io/bed.rs

@@ -10,6 +10,7 @@
 //! |------|---------|
 //! | [`BedRow`] | One parsed BED line (up to BED6) |
 //! | [`read_bed`] | Load a BED file into memory, skipping headers and blank lines |
+//! | [`fetch_bed`] | Random-access region fetch from a Tabix-indexed `.bed.gz` |
 //! | [`annotate_with_bed`] | Annotate a [`Variants`] collection from a BED region set |
 //! | [`bedrow_overlaps_par`] | Parallel overlap query: which BED rows hit a set of query ranges |
 //! | [`GenesBedIndex`] | Per-contig indexed structure for fast gene-name lookup |
@@ -26,7 +27,7 @@ use rayon::prelude::*;
 
 use crate::{annotation::Annotation, io::writers::{BgzTabixWriter, IndexFormat}, positions::{GenomePosition, GenomeRange, GetGenomeRange, extract_contig_indices, find_contig_indices, overlaps_par}, variant::variant_collection::Variants};
 
-use super::readers::get_reader;
+use super::readers::{fetch_tabix_lines_with, get_reader};
 
 use std::io::Write;
 
@@ -349,6 +350,44 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
     writer.finish()
 }
 
+/// Fetch BED rows overlapping `region` from a Tabix-indexed BGZF file.
+///
+/// Delegates the BGZF seek and line extraction to [`fetch_tabix_lines`], then
+/// parses each returned line as a [`BedRow`] and applies a secondary overlap
+/// filter to discard any false positives from the Tabix bin query.
+///
+/// # Coordinate system
+///
+/// `region` is **0-based half-open** `[start, end)`. The secondary filter uses
+/// the same convention: a row `[rs, re)` is kept iff `rs < region.end && re > region.start`.
+///
+/// # Arguments
+///
+/// * `bgz_path` - Path to the `.bed.gz` file; index expected at `<bgz_path>.tbi`
+/// * `region` - Genomic region to query (0-based half-open)
+///
+/// # Returns
+///
+/// Parsed [`BedRow`]s that genuinely overlap `region`, in file order.
+///
+/// # Errors
+///
+/// Returns an error if the index or file cannot be read, or if a fetched line
+/// fails to parse as a BED row.
+pub fn fetch_bed(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<BedRow>> {
+    let mut rows = Vec::new();
+    fetch_tabix_lines_with(bgz_path, region, |line| {
+        let row: BedRow = line.parse()
+            .with_context(|| format!("failed to parse BED line: {line}"))?;
+        // Secondary overlap filter: discard tabix-bin false positives.
+        if row.range.range.end > region.range.start && row.range.range.start < region.range.end {
+            rows.push(row);
+        }
+        Ok(())
+    })?;
+    Ok(rows)
+}
+
 /// Parse centromeric intervals from a UCSC cytoband BED file.
 ///
 /// Identifies rows whose stain field (column 5) equals `acen` and returns their

+ 134 - 3
src/io/readers.rs

@@ -1,20 +1,34 @@
-//! File reader helpers for plain-text and BGZF-compressed genomic files.
+//! File reader helpers for plain-text, BGZF, and Tabix-indexed genomic files.
 //!
 //! **Important:** `.gz` files are assumed to be **BGZF** (block gzip), not standard gzip.
 //! BGZF is the format produced by `bgzip` and used by BAM, VCF, BED.gz, etc.
 //! Plain `gzip` output will not decompress correctly here.
+//!
+//! # Tabix region fetch
+//!
+//! [`fetch_tabix_lines`] performs random-access region queries against any
+//! Tabix-indexed BGZF file, returning the raw data lines that overlap the
+//! requested [`GenomeRange`]. Format-specific wrappers in `bed.rs` and `vcf.rs`
+//! parse those lines into typed structures.
 
 use std::{
     fs::{self, File},
-    io::{BufReader, Read, Write},
+    io::{BufRead, BufReader, Read, Write},
     path::Path,
 };
 
 use anyhow::Context;
 use log::debug;
 use noodles_bgzf as bgzf;
+use noodles_core::{region::Interval, Position};
+use noodles_csi::BinningIndex;
+use noodles_tabix as tabix;
 
-use crate::{helpers::TempFileGuard, io::writers::{finalize_bgzf_file, get_gz_writer}};
+use crate::{
+    helpers::TempFileGuard,
+    io::writers::{finalize_bgzf_file, get_gz_writer},
+    positions::GenomeRange,
+};
 
 /// Type alias for a BGZF reader wrapping any `Read` source.
 pub type BGZFReader<R> = bgzf::io::Reader<R>;
@@ -158,3 +172,120 @@ pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
     guard.disarm();
     Ok(output_path)
 }
+
+/// Visit every data line in a Tabix-indexed BGZF file that overlaps `region`,
+/// calling `on_line` for each one.
+///
+/// This is the zero-allocation core of the tabix fetch family. A single `String`
+/// buffer is reused across all lines; `on_line` receives a `&str` reference into
+/// that buffer — no intermediate `Vec<String>` is ever built. Callers that need
+/// a collected result (e.g. [`fetch_tabix_lines`], [`fetch_bed`](super::bed::fetch_bed))
+/// simply push into their own accumulator inside the closure.
+///
+/// Header lines (`#`) and blank lines are skipped before `on_line` is called.
+/// Lines are newline-stripped before being passed to the callback.
+///
+/// # Coordinate system
+///
+/// `region` is **0-based half-open** `[start, end)`. The conversion to 1-based
+/// inclusive Tabix positions is performed internally:
+/// - `tabix_start = region.range.start + 1`
+/// - `tabix_end   = region.range.end`
+///
+/// # Arguments
+///
+/// * `bgz_path` - Path to the BGZF file; index expected at `<bgz_path>.tbi`
+/// * `region` - Genomic interval to query (0-based half-open)
+/// * `on_line` - Callback invoked for each matching data line; returning `Err`
+///   aborts the fetch immediately and propagates the error
+///
+/// # Errors
+///
+/// Returns an error if the index or file cannot be read, a seek fails, or
+/// `on_line` returns an error.
+pub fn fetch_tabix_lines_with<F>(
+    bgz_path: &str,
+    region: &GenomeRange,
+    mut on_line: F,
+) -> anyhow::Result<()>
+where
+    F: FnMut(&str) -> anyhow::Result<()>,
+{
+    let tbi_path = format!("{bgz_path}.tbi");
+
+    let index = tabix::fs::read(&tbi_path)
+        .with_context(|| format!("cannot read tabix index: {tbi_path}"))?;
+
+    let rname = region.contig();
+
+    let ref_seq_id = match index
+        .header()
+        .ok_or_else(|| anyhow::anyhow!("tabix index has no header: {tbi_path}"))?
+        .reference_sequence_names()
+        .get_index_of(rname.as_bytes())
+    {
+        Some(id) => id,
+        None => return Ok(()),
+    };
+
+    // 0-based half-open [start, end) → 1-based inclusive [start+1, end]
+    let interval_start = Position::try_from(region.range.start as usize + 1)
+        .context("region start overflow")?;
+    let interval_end = Position::try_from(region.range.end as usize)
+        .context("region end overflow")?;
+
+    let interval = Interval::from(interval_start..=interval_end);
+    let chunks = index
+        .query(ref_seq_id, interval)
+        .context("tabix region query failed")?;
+
+    if chunks.is_empty() {
+        return Ok(());
+    }
+
+    let mut reader = bgzf::io::Reader::new(
+        File::open(bgz_path).with_context(|| format!("cannot open {bgz_path}"))?,
+    );
+
+    // Single buffer reused across all lines — no per-line allocation.
+    let mut buf = String::new();
+
+    for chunk in chunks {
+        reader.seek(chunk.start()).context("BGZF seek failed")?;
+
+        loop {
+            if reader.virtual_position() >= chunk.end() {
+                break;
+            }
+            buf.clear();
+            let n = reader.read_line(&mut buf).context("BGZF read_line failed")?;
+            if n == 0 {
+                break;
+            }
+            let line = buf.trim_end_matches(|c| c == '\n' || c == '\r');
+            if !line.is_empty() && !line.starts_with('#') {
+                on_line(line)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Convenience wrapper over [`fetch_tabix_lines_with`] that collects results
+/// into a `Vec<String>`.
+///
+/// Prefer [`fetch_tabix_lines_with`] directly when you need to parse or filter
+/// lines, to avoid the intermediate string allocation.
+///
+/// # Errors
+///
+/// Returns an error if the index or file cannot be read.
+pub fn fetch_tabix_lines(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<String>> {
+    let mut lines = Vec::new();
+    fetch_tabix_lines_with(bgz_path, region, |line| {
+        lines.push(line.to_owned());
+        Ok(())
+    })?;
+    Ok(lines)
+}

+ 39 - 1
src/io/vcf.rs

@@ -15,10 +15,11 @@ use noodles_core::Position;
 
 use crate::{
     io::writers::{BgzTabixWriter, IndexFormat},
+    positions::GenomeRange,
     variant::vcf_variant::VcfVariant,
 };
 
-use super::{dict::read_dict, readers::get_reader};
+use super::{dict::read_dict, readers::{fetch_tabix_lines_with, get_reader}};
 
 /// Load a BGZF-compressed or plain VCF file into memory.
 ///
@@ -98,6 +99,43 @@ pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
     writer.finish()
 }
 
+/// Fetch variants overlapping `region` from a Tabix-indexed BGZF VCF file.
+///
+/// Delegates BGZF seeking and line extraction to [`fetch_tabix_lines`], then
+/// parses each line as a [`VcfVariant`]. No secondary position filter is applied
+/// beyond what Tabix provides — VCF variants are point features (POS = both
+/// tabix start and end), so false positives from the bin query are rare.
+///
+/// # Coordinate system
+///
+/// `region` is **0-based half-open** `[start, end)`. The Tabix index was built
+/// with VCF POS (1-based), so [`fetch_tabix_lines`] converts internally:
+/// `tabix_start = region.start + 1`, `tabix_end = region.end`.
+///
+/// # Arguments
+///
+/// * `bgz_path` - Path to the `.vcf.gz` file; index expected at `<bgz_path>.tbi`
+/// * `region` - Genomic region to query (0-based half-open)
+///
+/// # Returns
+///
+/// Parsed [`VcfVariant`]s overlapping `region`, in file order.
+///
+/// # Errors
+///
+/// Returns an error if the index or file cannot be read, or if a fetched line
+/// fails to parse as a VCF variant.
+pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<VcfVariant>> {
+    let mut variants = Vec::new();
+    fetch_tabix_lines_with(bgz_path, region, |line| {
+        let v: VcfVariant = line.parse()
+            .with_context(|| format!("failed to parse VCF line: {line}"))?;
+        variants.push(v);
+        Ok(())
+    })?;
+    Ok(variants)
+}
+
 /// A flat VCF row for CSV/TSV deserialization (e.g. via `serde`).
 ///
 /// Use [`VcfVariant`] for richer parsed access. This struct exists for