|
@@ -1,20 +1,34 @@
|
|
|
-//! File reader helpers for plain-text and BGZF-compressed genomic files.
|
|
|
|
|
|
|
+//! File reader helpers for plain-text, BGZF, and Tabix-indexed genomic files.
|
|
|
//!
|
|
//!
|
|
|
//! **Important:** `.gz` files are assumed to be **BGZF** (block gzip), not standard gzip.
|
|
//! **Important:** `.gz` files are assumed to be **BGZF** (block gzip), not standard gzip.
|
|
|
//! BGZF is the format produced by `bgzip` and used by BAM, VCF, BED.gz, etc.
|
|
//! BGZF is the format produced by `bgzip` and used by BAM, VCF, BED.gz, etc.
|
|
|
//! Plain `gzip` output will not decompress correctly here.
|
|
//! Plain `gzip` output will not decompress correctly here.
|
|
|
|
|
+//!
|
|
|
|
|
+//! # Tabix region fetch
|
|
|
|
|
+//!
|
|
|
|
|
+//! [`fetch_tabix_lines`] performs random-access region queries against any
|
|
|
|
|
+//! Tabix-indexed BGZF file, returning the raw data lines that overlap the
|
|
|
|
|
+//! requested [`GenomeRange`]. Format-specific wrappers in `bed.rs` and `vcf.rs`
|
|
|
|
|
+//! parse those lines into typed structures.
|
|
|
|
|
|
|
|
use std::{
|
|
use std::{
|
|
|
fs::{self, File},
|
|
fs::{self, File},
|
|
|
- io::{BufReader, Read, Write},
|
|
|
|
|
|
|
+ io::{BufRead, BufReader, Read, Write},
|
|
|
path::Path,
|
|
path::Path,
|
|
|
};
|
|
};
|
|
|
|
|
|
|
|
use anyhow::Context;
|
|
use anyhow::Context;
|
|
|
use log::debug;
|
|
use log::debug;
|
|
|
use noodles_bgzf as bgzf;
|
|
use noodles_bgzf as bgzf;
|
|
|
|
|
+use noodles_core::{region::Interval, Position};
|
|
|
|
|
+use noodles_csi::BinningIndex;
|
|
|
|
|
+use noodles_tabix as tabix;
|
|
|
|
|
|
|
|
-use crate::{helpers::TempFileGuard, io::writers::{finalize_bgzf_file, get_gz_writer}};
|
|
|
|
|
|
|
+use crate::{
|
|
|
|
|
+ helpers::TempFileGuard,
|
|
|
|
|
+ io::writers::{finalize_bgzf_file, get_gz_writer},
|
|
|
|
|
+ positions::GenomeRange,
|
|
|
|
|
+};
|
|
|
|
|
|
|
|
/// Type alias for a BGZF reader wrapping any `Read` source.
|
|
/// Type alias for a BGZF reader wrapping any `Read` source.
|
|
|
pub type BGZFReader<R> = bgzf::io::Reader<R>;
|
|
pub type BGZFReader<R> = bgzf::io::Reader<R>;
|
|
@@ -158,3 +172,120 @@ pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
|
|
|
guard.disarm();
|
|
guard.disarm();
|
|
|
Ok(output_path)
|
|
Ok(output_path)
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
|
|
+/// Visit every data line in a Tabix-indexed BGZF file that overlaps `region`,
|
|
|
|
|
+/// calling `on_line` for each one.
|
|
|
|
|
+///
|
|
|
|
|
+/// This is the zero-allocation core of the tabix fetch family. A single `String`
|
|
|
|
|
+/// buffer is reused across all lines; `on_line` receives a `&str` reference into
|
|
|
|
|
+/// that buffer — no intermediate `Vec<String>` is ever built. Callers that need
|
|
|
|
|
+/// a collected result (e.g. [`fetch_tabix_lines`], [`fetch_bed`](super::bed::fetch_bed))
|
|
|
|
|
+/// simply push into their own accumulator inside the closure.
|
|
|
|
|
+///
|
|
|
|
|
+/// Header lines (`#`) and blank lines are skipped before `on_line` is called.
|
|
|
|
|
+/// Lines are newline-stripped before being passed to the callback.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Coordinate system
|
|
|
|
|
+///
|
|
|
|
|
+/// `region` is **0-based half-open** `[start, end)`. The conversion to 1-based
|
|
|
|
|
+/// inclusive Tabix positions is performed internally:
|
|
|
|
|
+/// - `tabix_start = region.range.start + 1`
|
|
|
|
|
+/// - `tabix_end = region.range.end`
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `bgz_path` - Path to the BGZF file; index expected at `<bgz_path>.tbi`
|
|
|
|
|
+/// * `region` - Genomic interval to query (0-based half-open)
|
|
|
|
|
+/// * `on_line` - Callback invoked for each matching data line; returning `Err`
|
|
|
|
|
+/// aborts the fetch immediately and propagates the error
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the index or file cannot be read, a seek fails, or
|
|
|
|
|
+/// `on_line` returns an error.
|
|
|
|
|
+pub fn fetch_tabix_lines_with<F>(
|
|
|
|
|
+ bgz_path: &str,
|
|
|
|
|
+ region: &GenomeRange,
|
|
|
|
|
+ mut on_line: F,
|
|
|
|
|
+) -> anyhow::Result<()>
|
|
|
|
|
+where
|
|
|
|
|
+ F: FnMut(&str) -> anyhow::Result<()>,
|
|
|
|
|
+{
|
|
|
|
|
+ let tbi_path = format!("{bgz_path}.tbi");
|
|
|
|
|
+
|
|
|
|
|
+ let index = tabix::fs::read(&tbi_path)
|
|
|
|
|
+ .with_context(|| format!("cannot read tabix index: {tbi_path}"))?;
|
|
|
|
|
+
|
|
|
|
|
+ let rname = region.contig();
|
|
|
|
|
+
|
|
|
|
|
+ let ref_seq_id = match index
|
|
|
|
|
+ .header()
|
|
|
|
|
+ .ok_or_else(|| anyhow::anyhow!("tabix index has no header: {tbi_path}"))?
|
|
|
|
|
+ .reference_sequence_names()
|
|
|
|
|
+ .get_index_of(rname.as_bytes())
|
|
|
|
|
+ {
|
|
|
|
|
+ Some(id) => id,
|
|
|
|
|
+ None => return Ok(()),
|
|
|
|
|
+ };
|
|
|
|
|
+
|
|
|
|
|
+ // 0-based half-open [start, end) → 1-based inclusive [start+1, end]
|
|
|
|
|
+ let interval_start = Position::try_from(region.range.start as usize + 1)
|
|
|
|
|
+ .context("region start overflow")?;
|
|
|
|
|
+ let interval_end = Position::try_from(region.range.end as usize)
|
|
|
|
|
+ .context("region end overflow")?;
|
|
|
|
|
+
|
|
|
|
|
+ let interval = Interval::from(interval_start..=interval_end);
|
|
|
|
|
+ let chunks = index
|
|
|
|
|
+ .query(ref_seq_id, interval)
|
|
|
|
|
+ .context("tabix region query failed")?;
|
|
|
|
|
+
|
|
|
|
|
+ if chunks.is_empty() {
|
|
|
|
|
+ return Ok(());
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let mut reader = bgzf::io::Reader::new(
|
|
|
|
|
+ File::open(bgz_path).with_context(|| format!("cannot open {bgz_path}"))?,
|
|
|
|
|
+ );
|
|
|
|
|
+
|
|
|
|
|
+ // Single buffer reused across all lines — no per-line allocation.
|
|
|
|
|
+ let mut buf = String::new();
|
|
|
|
|
+
|
|
|
|
|
+ for chunk in chunks {
|
|
|
|
|
+ reader.seek(chunk.start()).context("BGZF seek failed")?;
|
|
|
|
|
+
|
|
|
|
|
+ loop {
|
|
|
|
|
+ if reader.virtual_position() >= chunk.end() {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ buf.clear();
|
|
|
|
|
+ let n = reader.read_line(&mut buf).context("BGZF read_line failed")?;
|
|
|
|
|
+ if n == 0 {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ let line = buf.trim_end_matches(|c| c == '\n' || c == '\r');
|
|
|
|
|
+ if !line.is_empty() && !line.starts_with('#') {
|
|
|
|
|
+ on_line(line)?;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ Ok(())
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+/// Convenience wrapper over [`fetch_tabix_lines_with`] that collects results
|
|
|
|
|
+/// into a `Vec<String>`.
|
|
|
|
|
+///
|
|
|
|
|
+/// Prefer [`fetch_tabix_lines_with`] directly when you need to parse or filter
|
|
|
|
|
+/// lines, to avoid the intermediate string allocation.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the index or file cannot be read.
|
|
|
|
|
+pub fn fetch_tabix_lines(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<String>> {
|
|
|
|
|
+ let mut lines = Vec::new();
|
|
|
|
|
+ fetch_tabix_lines_with(bgz_path, region, |line| {
|
|
|
|
|
+ lines.push(line.to_owned());
|
|
|
|
|
+ Ok(())
|
|
|
|
|
+ })?;
|
|
|
|
|
+ Ok(lines)
|
|
|
|
|
+}
|