| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291 |
- //! File reader helpers for plain-text, BGZF, and Tabix-indexed genomic files.
- //!
- //! **Important:** `.gz` files are assumed to be **BGZF** (block gzip), not standard gzip.
- //! BGZF is the format produced by `bgzip` and used by BAM, VCF, BED.gz, etc.
- //! Plain `gzip` output will not decompress correctly here.
- //!
- //! # Tabix region fetch
- //!
- //! [`fetch_tabix_lines`] performs random-access region queries against any
- //! Tabix-indexed BGZF file, returning the raw data lines that overlap the
- //! requested [`GenomeRange`]. Format-specific wrappers in `bed.rs` and `vcf.rs`
- //! parse those lines into typed structures.
- use std::{
- fs::{self, File},
- io::{BufRead, BufReader, Read, Write},
- path::Path,
- };
- use anyhow::Context;
- use log::debug;
- use noodles_bgzf as bgzf;
- use noodles_core::{region::Interval, Position};
- use noodles_csi::BinningIndex;
- use noodles_tabix as tabix;
- use crate::{
- helpers::TempFileGuard,
- io::writers::{finalize_bgzf_file, get_gz_writer},
- positions::GenomeRange,
- };
- /// Type alias for a BGZF reader wrapping any `Read` source.
- pub type BGZFReader<R> = bgzf::io::Reader<R>;
- /// Open a plain-text or BGZF-compressed file for reading.
- ///
- /// The file format is detected from the extension. Supported extensions:
- /// `gz` (BGZF), `vcf`, `bed`, `tsv`, `json`, `chain`.
- ///
- /// `.gz` files are opened with [`BGZFReader`] — they must be BGZF-encoded,
- /// not plain gzip.
- ///
- /// # Arguments
- ///
- /// * `path` - Path to the file
- ///
- /// # Returns
- ///
- /// A boxed `Read` impl, buffered internally.
- ///
- /// # Errors
- ///
- /// Returns an error if the extension is missing, unrecognised, or the file
- /// cannot be opened.
- pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn Read>> {
- debug!("Reading: {path}");
- let file_type = Path::new(path)
- .extension()
- .and_then(|s| s.to_str())
- .with_context(|| format!("can't parse extension from {path}"))?;
- anyhow::ensure!(
- matches!(file_type, "gz" | "vcf" | "bed" | "tsv" | "json" | "chain"),
- "unsupported file extension: {file_type}"
- );
- let raw_reader = File::open(path).with_context(|| format!("failed to open {path}"))?;
- match file_type {
- "gz" => Ok(Box::new(BufReader::new(BGZFReader::new(raw_reader)))),
- _ => Ok(Box::new(BufReader::new(raw_reader))),
- }
- }
- /// Open a file as a BGZF reader, compressing it first if it is not already `.gz`.
- ///
- /// If `path` does not end in `.gz`, [`compress_to_bgzip`] is called to produce
- /// `<path>.gz` on disk before opening it. This is a **side-effecting** operation:
- /// the compressed file is created and cached for subsequent calls.
- ///
- /// # Arguments
- ///
- /// * `path` - Path to the input file (plain text or BGZF)
- ///
- /// # Returns
- ///
- /// A [`BGZFReader`] over the (possibly newly created) `.gz` file.
- ///
- /// # Errors
- ///
- /// Returns an error if the extension cannot be determined, compression fails,
- /// or the resulting `.gz` file cannot be opened.
- pub fn get_gz_reader(path: &str) -> anyhow::Result<BGZFReader<File>> {
- debug!("Reading: {path}");
- let file_type = Path::new(path)
- .extension()
- .and_then(|s| s.to_str())
- .with_context(|| format!("can't parse extension from {path}"))?;
- let path = if file_type != "gz" {
- compress_to_bgzip(path)?
- } else {
- path.to_string()
- };
- let file = File::open(&path).with_context(|| format!("failed to open BGZF file: {path}"))?;
- Ok(BGZFReader::new(file))
- }
- /// Compress a plain-text file to BGZF, returning the path of the `.gz` output.
- ///
- /// If `<input_path>.gz` already exists it is returned immediately (cached).
- /// Otherwise the file is written atomically: a UUID-named temp file is created
- /// in the same directory as the output (same filesystem, guaranteeing an atomic
- /// rename), then renamed to `<input_path>.gz` on success. A [`TempFileGuard`]
- /// ensures the temp file is removed automatically if the function fails or panics.
- ///
- /// # Arguments
- ///
- /// * `input_path` - Path to the plain-text input file
- ///
- /// # Returns
- ///
- /// The path of the BGZF-compressed output file (`<input_path>.gz`).
- ///
- /// # Errors
- ///
- /// Returns an error if the input cannot be read, the output cannot be written,
- /// or the atomic rename fails.
- pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
- let output_path = format!("{input_path}.gz");
- if Path::new(&output_path).exists() {
- return Ok(output_path);
- }
- debug!("Compressing {input_path} → {output_path}");
- let tmp_dir = Path::new(input_path)
- .parent()
- .unwrap_or(Path::new("."));
- let mut guard = TempFileGuard::new();
- let tmp_path = guard.tmp_path(".gz", tmp_dir);
- let tmp_str = tmp_path.to_string_lossy();
- let input_file = File::open(input_path)
- .with_context(|| format!("failed to open input file: {input_path}"))?;
- let mut reader = BufReader::new(input_file);
- let mut writer = get_gz_writer(&tmp_str, false)?;
- let mut buffer = [0u8; 8192];
- loop {
- let n = reader
- .read(&mut buffer)
- .with_context(|| format!("failed reading {input_path}"))?;
- if n == 0 { break; }
- writer
- .write_all(&buffer[..n])
- .with_context(|| format!("failed writing {tmp_str}"))?;
- }
- finalize_bgzf_file(writer, &tmp_str)?;
- fs::rename(&tmp_path, &output_path)
- .with_context(|| format!("failed to rename {tmp_str} → {output_path}"))?;
- guard.disarm();
- Ok(output_path)
- }
- /// Visit every data line in a Tabix-indexed BGZF file that overlaps `region`,
- /// calling `on_line` for each one.
- ///
- /// This is the zero-allocation core of the tabix fetch family. A single `String`
- /// buffer is reused across all lines; `on_line` receives a `&str` reference into
- /// that buffer — no intermediate `Vec<String>` is ever built. Callers that need
- /// a collected result (e.g. [`fetch_tabix_lines`], [`fetch_bed`](super::bed::fetch_bed))
- /// simply push into their own accumulator inside the closure.
- ///
- /// Header lines (`#`) and blank lines are skipped before `on_line` is called.
- /// Lines are newline-stripped before being passed to the callback.
- ///
- /// # Coordinate system
- ///
- /// `region` is **0-based half-open** `[start, end)`. The conversion to 1-based
- /// inclusive Tabix positions is performed internally:
- /// - `tabix_start = region.range.start + 1`
- /// - `tabix_end = region.range.end`
- ///
- /// # Arguments
- ///
- /// * `bgz_path` - Path to the BGZF file; index expected at `<bgz_path>.tbi`
- /// * `region` - Genomic interval to query (0-based half-open)
- /// * `on_line` - Callback invoked for each matching data line; returning `Err`
- /// aborts the fetch immediately and propagates the error
- ///
- /// # Errors
- ///
- /// Returns an error if the index or file cannot be read, a seek fails, or
- /// `on_line` returns an error.
- pub fn fetch_tabix_lines_with<F>(
- bgz_path: &str,
- region: &GenomeRange,
- mut on_line: F,
- ) -> anyhow::Result<()>
- where
- F: FnMut(&str) -> anyhow::Result<()>,
- {
- let tbi_path = format!("{bgz_path}.tbi");
- let index = tabix::fs::read(&tbi_path)
- .with_context(|| format!("cannot read tabix index: {tbi_path}"))?;
- let rname = region.contig();
- let ref_seq_id = match index
- .header()
- .ok_or_else(|| anyhow::anyhow!("tabix index has no header: {tbi_path}"))?
- .reference_sequence_names()
- .get_index_of(rname.as_bytes())
- {
- Some(id) => id,
- None => return Ok(()),
- };
- // 0-based half-open [start, end) → 1-based inclusive [start+1, end]
- let interval_start = Position::try_from(region.range.start as usize + 1)
- .context("region start overflow")?;
- let interval_end = Position::try_from(region.range.end as usize)
- .context("region end overflow")?;
- let interval = Interval::from(interval_start..=interval_end);
- let chunks = index
- .query(ref_seq_id, interval)
- .context("tabix region query failed")?;
- if chunks.is_empty() {
- return Ok(());
- }
- let mut reader = bgzf::io::Reader::new(
- File::open(bgz_path).with_context(|| format!("cannot open {bgz_path}"))?,
- );
- // Single buffer reused across all lines — no per-line allocation.
- let mut buf = String::new();
- for chunk in chunks {
- reader.seek(chunk.start()).context("BGZF seek failed")?;
- loop {
- if reader.virtual_position() >= chunk.end() {
- break;
- }
- buf.clear();
- let n = reader.read_line(&mut buf).context("BGZF read_line failed")?;
- if n == 0 {
- break;
- }
- let line = buf.trim_end_matches(|c| c == '\n' || c == '\r');
- if !line.is_empty() && !line.starts_with('#') {
- on_line(line)?;
- }
- }
- }
- Ok(())
- }
- /// Convenience wrapper over [`fetch_tabix_lines_with`] that collects results
- /// into a `Vec<String>`.
- ///
- /// Prefer [`fetch_tabix_lines_with`] directly when you need to parse or filter
- /// lines, to avoid the intermediate string allocation.
- ///
- /// # Errors
- ///
- /// Returns an error if the index or file cannot be read.
- pub fn fetch_tabix_lines(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<String>> {
- let mut lines = Vec::new();
- fetch_tabix_lines_with(bgz_path, region, |line| {
- lines.push(line.to_owned());
- Ok(())
- })?;
- Ok(lines)
- }
|