hace 2 meses · 8806fc5b0a
--- a/src/io/bed.rs
+++ b/src/io/bed.rs
@@ -10,6 +10,7 @@
 
				 //! |------|---------|
			
 
				 //! | [`BedRow`] | One parsed BED line (up to BED6) |
			
 
				 //! | [`read_bed`] | Load a BED file into memory, skipping headers and blank lines |
			
 
				+//! | [`fetch_bed`] | Random-access region fetch from a Tabix-indexed `.bed.gz` |
			
 
				 //! | [`annotate_with_bed`] | Annotate a [`Variants`] collection from a BED region set |
			
 
				 //! | [`bedrow_overlaps_par`] | Parallel overlap query: which BED rows hit a set of query ranges |
			
 
				 //! | [`GenesBedIndex`] | Per-contig indexed structure for fast gene-name lookup |
			
@@ -26,7 +27,7 @@ use rayon::prelude::*;
 
				 
			
 
				 use crate::{annotation::Annotation, io::writers::{BgzTabixWriter, IndexFormat}, positions::{GenomePosition, GenomeRange, GetGenomeRange, extract_contig_indices, find_contig_indices, overlaps_par}, variant::variant_collection::Variants};
			
 
				 
			
 
				-use super::readers::get_reader;
			
 
				+use super::readers::{fetch_tabix_lines_with, get_reader};
			
 
				 
			
 
				 use std::io::Write;
			
 
				 
			
@@ -349,6 +350,44 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
 
				     writer.finish()
			
 
				 }
			
 
				 
			
 
				+/// Fetch BED rows overlapping `region` from a Tabix-indexed BGZF file.
			
 
				+///
			
 
				+/// Delegates the BGZF seek and line extraction to [`fetch_tabix_lines`], then
			
 
				+/// parses each returned line as a [`BedRow`] and applies a secondary overlap
			
 
				+/// filter to discard any false positives from the Tabix bin query.
			
 
				+///
			
 
				+/// # Coordinate system
			
 
				+///
			
 
				+/// `region` is **0-based half-open** `[start, end)`. The secondary filter uses
			
 
				+/// the same convention: a row `[rs, re)` is kept iff `rs < region.end && re > region.start`.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `bgz_path` - Path to the `.bed.gz` file; index expected at `<bgz_path>.tbi`
			
 
				+/// * `region` - Genomic region to query (0-based half-open)
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// Parsed [`BedRow`]s that genuinely overlap `region`, in file order.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the index or file cannot be read, or if a fetched line
			
 
				+/// fails to parse as a BED row.
			
 
				+pub fn fetch_bed(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<BedRow>> {
			
 
				+    let mut rows = Vec::new();
			
 
				+    fetch_tabix_lines_with(bgz_path, region, |line| {
			
 
				+        let row: BedRow = line.parse()
			
 
				+            .with_context(|| format!("failed to parse BED line: {line}"))?;
			
 
				+        // Secondary overlap filter: discard tabix-bin false positives.
			
 
				+        if row.range.range.end > region.range.start && row.range.range.start < region.range.end {
			
 
				+            rows.push(row);
			
 
				+        }
			
 
				+        Ok(())
			
 
				+    })?;
			
 
				+    Ok(rows)
			
 
				+}
			
 
				+
			
 
				 /// Parse centromeric intervals from a UCSC cytoband BED file.
			
 
				 ///
			
 
				 /// Identifies rows whose stain field (column 5) equals `acen` and returns their
			
--- a/src/io/readers.rs
+++ b/src/io/readers.rs
@@ -1,20 +1,34 @@
 
				-//! File reader helpers for plain-text and BGZF-compressed genomic files.
			
 
				+//! File reader helpers for plain-text, BGZF, and Tabix-indexed genomic files.
			
 
				 //!
			
 
				 //! **Important:** `.gz` files are assumed to be **BGZF** (block gzip), not standard gzip.
			
 
				 //! BGZF is the format produced by `bgzip` and used by BAM, VCF, BED.gz, etc.
			
 
				 //! Plain `gzip` output will not decompress correctly here.
			
 
				+//!
			
 
				+//! # Tabix region fetch
			
 
				+//!
			
 
				+//! [`fetch_tabix_lines`] performs random-access region queries against any
			
 
				+//! Tabix-indexed BGZF file, returning the raw data lines that overlap the
			
 
				+//! requested [`GenomeRange`]. Format-specific wrappers in `bed.rs` and `vcf.rs`
			
 
				+//! parse those lines into typed structures.
			
 
				 
			
 
				 use std::{
			
 
				     fs::{self, File},
			
 
				-    io::{BufReader, Read, Write},
			
 
				+    io::{BufRead, BufReader, Read, Write},
			
 
				     path::Path,
			
 
				 };
			
 
				 
			
 
				 use anyhow::Context;
			
 
				 use log::debug;
			
 
				 use noodles_bgzf as bgzf;
			
 
				+use noodles_core::{region::Interval, Position};
			
 
				+use noodles_csi::BinningIndex;
			
 
				+use noodles_tabix as tabix;
			
 
				 
			
 
				-use crate::{helpers::TempFileGuard, io::writers::{finalize_bgzf_file, get_gz_writer}};
			
 
				+use crate::{
			
 
				+    helpers::TempFileGuard,
			
 
				+    io::writers::{finalize_bgzf_file, get_gz_writer},
			
 
				+    positions::GenomeRange,
			
 
				+};
			
 
				 
			
 
				 /// Type alias for a BGZF reader wrapping any `Read` source.
			
 
				 pub type BGZFReader<R> = bgzf::io::Reader<R>;
			
@@ -158,3 +172,120 @@ pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
 
				     guard.disarm();
			
 
				     Ok(output_path)
			
 
				 }
			
 
				+
			
 
				+/// Visit every data line in a Tabix-indexed BGZF file that overlaps `region`,
			
 
				+/// calling `on_line` for each one.
			
 
				+///
			
 
				+/// This is the zero-allocation core of the tabix fetch family. A single `String`
			
 
				+/// buffer is reused across all lines; `on_line` receives a `&str` reference into
			
 
				+/// that buffer — no intermediate `Vec<String>` is ever built. Callers that need
			
 
				+/// a collected result (e.g. [`fetch_tabix_lines`], [`fetch_bed`](super::bed::fetch_bed))
			
 
				+/// simply push into their own accumulator inside the closure.
			
 
				+///
			
 
				+/// Header lines (`#`) and blank lines are skipped before `on_line` is called.
			
 
				+/// Lines are newline-stripped before being passed to the callback.
			
 
				+///
			
 
				+/// # Coordinate system
			
 
				+///
			
 
				+/// `region` is **0-based half-open** `[start, end)`. The conversion to 1-based
			
 
				+/// inclusive Tabix positions is performed internally:
			
 
				+/// - `tabix_start = region.range.start + 1`
			
 
				+/// - `tabix_end   = region.range.end`
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `bgz_path` - Path to the BGZF file; index expected at `<bgz_path>.tbi`
			
 
				+/// * `region` - Genomic interval to query (0-based half-open)
			
 
				+/// * `on_line` - Callback invoked for each matching data line; returning `Err`
			
 
				+///   aborts the fetch immediately and propagates the error
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the index or file cannot be read, a seek fails, or
			
 
				+/// `on_line` returns an error.
			
 
				+pub fn fetch_tabix_lines_with<F>(
			
 
				+    bgz_path: &str,
			
 
				+    region: &GenomeRange,
			
 
				+    mut on_line: F,
			
 
				+) -> anyhow::Result<()>
			
 
				+where
			
 
				+    F: FnMut(&str) -> anyhow::Result<()>,
			
 
				+{
			
 
				+    let tbi_path = format!("{bgz_path}.tbi");
			
 
				+
			
 
				+    let index = tabix::fs::read(&tbi_path)
			
 
				+        .with_context(|| format!("cannot read tabix index: {tbi_path}"))?;
			
 
				+
			
 
				+    let rname = region.contig();
			
 
				+
			
 
				+    let ref_seq_id = match index
			
 
				+        .header()
			
 
				+        .ok_or_else(|| anyhow::anyhow!("tabix index has no header: {tbi_path}"))?
			
 
				+        .reference_sequence_names()
			
 
				+        .get_index_of(rname.as_bytes())
			
 
				+    {
			
 
				+        Some(id) => id,
			
 
				+        None => return Ok(()),
			
 
				+    };
			
 
				+
			
 
				+    // 0-based half-open [start, end) → 1-based inclusive [start+1, end]
			
 
				+    let interval_start = Position::try_from(region.range.start as usize + 1)
			
 
				+        .context("region start overflow")?;
			
 
				+    let interval_end = Position::try_from(region.range.end as usize)
			
 
				+        .context("region end overflow")?;
			
 
				+
			
 
				+    let interval = Interval::from(interval_start..=interval_end);
			
 
				+    let chunks = index
			
 
				+        .query(ref_seq_id, interval)
			
 
				+        .context("tabix region query failed")?;
			
 
				+
			
 
				+    if chunks.is_empty() {
			
 
				+        return Ok(());
			
 
				+    }
			
 
				+
			
 
				+    let mut reader = bgzf::io::Reader::new(
			
 
				+        File::open(bgz_path).with_context(|| format!("cannot open {bgz_path}"))?,
			
 
				+    );
			
 
				+
			
 
				+    // Single buffer reused across all lines — no per-line allocation.
			
 
				+    let mut buf = String::new();
			
 
				+
			
 
				+    for chunk in chunks {
			
 
				+        reader.seek(chunk.start()).context("BGZF seek failed")?;
			
 
				+
			
 
				+        loop {
			
 
				+            if reader.virtual_position() >= chunk.end() {
			
 
				+                break;
			
 
				+            }
			
 
				+            buf.clear();
			
 
				+            let n = reader.read_line(&mut buf).context("BGZF read_line failed")?;
			
 
				+            if n == 0 {
			
 
				+                break;
			
 
				+            }
			
 
				+            let line = buf.trim_end_matches(|c| c == '\n' || c == '\r');
			
 
				+            if !line.is_empty() && !line.starts_with('#') {
			
 
				+                on_line(line)?;
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    Ok(())
			
 
				+}
			
 
				+
			
 
				+/// Convenience wrapper over [`fetch_tabix_lines_with`] that collects results
			
 
				+/// into a `Vec<String>`.
			
 
				+///
			
 
				+/// Prefer [`fetch_tabix_lines_with`] directly when you need to parse or filter
			
 
				+/// lines, to avoid the intermediate string allocation.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the index or file cannot be read.
			
 
				+pub fn fetch_tabix_lines(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<String>> {
			
 
				+    let mut lines = Vec::new();
			
 
				+    fetch_tabix_lines_with(bgz_path, region, |line| {
			
 
				+        lines.push(line.to_owned());
			
 
				+        Ok(())
			
 
				+    })?;
			
 
				+    Ok(lines)
			
 
				+}
			
--- a/src/io/vcf.rs
+++ b/src/io/vcf.rs
@@ -15,10 +15,11 @@ use noodles_core::Position;
 
				 
			
 
				 use crate::{
			
 
				     io::writers::{BgzTabixWriter, IndexFormat},
			
 
				+    positions::GenomeRange,
			
 
				     variant::vcf_variant::VcfVariant,
			
 
				 };
			
 
				 
			
 
				-use super::{dict::read_dict, readers::get_reader};
			
 
				+use super::{dict::read_dict, readers::{fetch_tabix_lines_with, get_reader}};
			
 
				 
			
 
				 /// Load a BGZF-compressed or plain VCF file into memory.
			
 
				 ///
			
@@ -98,6 +99,43 @@ pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
 
				     writer.finish()
			
 
				 }
			
 
				 
			
 
				+/// Fetch variants overlapping `region` from a Tabix-indexed BGZF VCF file.
			
 
				+///
			
 
				+/// Delegates BGZF seeking and line extraction to [`fetch_tabix_lines`], then
			
 
				+/// parses each line as a [`VcfVariant`]. No secondary position filter is applied
			
 
				+/// beyond what Tabix provides — VCF variants are point features (POS = both
			
 
				+/// tabix start and end), so false positives from the bin query are rare.
			
 
				+///
			
 
				+/// # Coordinate system
			
 
				+///
			
 
				+/// `region` is **0-based half-open** `[start, end)`. The Tabix index was built
			
 
				+/// with VCF POS (1-based), so [`fetch_tabix_lines`] converts internally:
			
 
				+/// `tabix_start = region.start + 1`, `tabix_end = region.end`.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `bgz_path` - Path to the `.vcf.gz` file; index expected at `<bgz_path>.tbi`
			
 
				+/// * `region` - Genomic region to query (0-based half-open)
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// Parsed [`VcfVariant`]s overlapping `region`, in file order.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the index or file cannot be read, or if a fetched line
			
 
				+/// fails to parse as a VCF variant.
			
 
				+pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<VcfVariant>> {
			
 
				+    let mut variants = Vec::new();
			
 
				+    fetch_tabix_lines_with(bgz_path, region, |line| {
			
 
				+        let v: VcfVariant = line.parse()
			
 
				+            .with_context(|| format!("failed to parse VCF line: {line}"))?;
			
 
				+        variants.push(v);
			
 
				+        Ok(())
			
 
				+    })?;
			
 
				+    Ok(variants)
			
 
				+}
			
 
				+
			
 
				 /// A flat VCF row for CSV/TSV deserialization (e.g. via `serde`).
			
 
				 ///
			
 
				 /// Use [`VcfVariant`] for richer parsed access. This struct exists for