//! File reader helpers for plain-text, BGZF, and Tabix-indexed genomic files. //! //! **Important:** `.gz` files are assumed to be **BGZF** (block gzip), not standard gzip. //! BGZF is the format produced by `bgzip` and used by BAM, VCF, BED.gz, etc. //! Plain `gzip` output will not decompress correctly here. //! //! # Tabix region fetch //! //! [`fetch_tabix_lines`] performs random-access region queries against any //! Tabix-indexed BGZF file, returning the raw data lines that overlap the //! requested [`GenomeRange`]. Format-specific wrappers in `bed.rs` and `vcf.rs` //! parse those lines into typed structures. use std::{ fs::{self, File}, io::{BufRead, BufReader, Read, Write}, path::Path, }; use anyhow::Context; use log::debug; use noodles_bgzf as bgzf; use noodles_core::{region::Interval, Position}; use noodles_csi::BinningIndex; use noodles_tabix as tabix; use crate::{ helpers::TempFileGuard, io::writers::{finalize_bgzf_file, get_gz_writer}, positions::GenomeRange, }; /// Type alias for a BGZF reader wrapping any `Read` source. pub type BGZFReader = bgzf::io::Reader; /// Open a plain-text or BGZF-compressed file for reading. /// /// The file format is detected from the extension. Supported extensions: /// `gz` (BGZF), `vcf`, `bed`, `tsv`, `json`, `chain`. /// /// `.gz` files are opened with [`BGZFReader`] — they must be BGZF-encoded, /// not plain gzip. /// /// # Arguments /// /// * `path` - Path to the file /// /// # Returns /// /// A boxed `Read` impl, buffered internally. /// /// # Errors /// /// Returns an error if the extension is missing, unrecognised, or the file /// cannot be opened. pub fn get_reader(path: &str) -> anyhow::Result> { debug!("Reading: {path}"); let file_type = Path::new(path) .extension() .and_then(|s| s.to_str()) .with_context(|| format!("can't parse extension from {path}"))?; anyhow::ensure!( matches!(file_type, "gz" | "vcf" | "bed" | "tsv" | "json" | "chain"), "unsupported file extension: {file_type}" ); let raw_reader = File::open(path).with_context(|| format!("failed to open {path}"))?; match file_type { "gz" => Ok(Box::new(BufReader::new(BGZFReader::new(raw_reader)))), _ => Ok(Box::new(BufReader::new(raw_reader))), } } /// Open a file as a BGZF reader, compressing it first if it is not already `.gz`. /// /// If `path` does not end in `.gz`, [`compress_to_bgzip`] is called to produce /// `.gz` on disk before opening it. This is a **side-effecting** operation: /// the compressed file is created and cached for subsequent calls. /// /// # Arguments /// /// * `path` - Path to the input file (plain text or BGZF) /// /// # Returns /// /// A [`BGZFReader`] over the (possibly newly created) `.gz` file. /// /// # Errors /// /// Returns an error if the extension cannot be determined, compression fails, /// or the resulting `.gz` file cannot be opened. pub fn get_gz_reader(path: &str) -> anyhow::Result> { debug!("Reading: {path}"); let file_type = Path::new(path) .extension() .and_then(|s| s.to_str()) .with_context(|| format!("can't parse extension from {path}"))?; let path = if file_type != "gz" { compress_to_bgzip(path)? } else { path.to_string() }; let file = File::open(&path).with_context(|| format!("failed to open BGZF file: {path}"))?; Ok(BGZFReader::new(file)) } /// Compress a plain-text file to BGZF, returning the path of the `.gz` output. /// /// If `.gz` already exists it is returned immediately (cached). /// Otherwise the file is written atomically: a UUID-named temp file is created /// in the same directory as the output (same filesystem, guaranteeing an atomic /// rename), then renamed to `.gz` on success. A [`TempFileGuard`] /// ensures the temp file is removed automatically if the function fails or panics. /// /// # Arguments /// /// * `input_path` - Path to the plain-text input file /// /// # Returns /// /// The path of the BGZF-compressed output file (`.gz`). /// /// # Errors /// /// Returns an error if the input cannot be read, the output cannot be written, /// or the atomic rename fails. pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result { let output_path = format!("{input_path}.gz"); if Path::new(&output_path).exists() { return Ok(output_path); } debug!("Compressing {input_path} → {output_path}"); let tmp_dir = Path::new(input_path) .parent() .unwrap_or(Path::new(".")); let mut guard = TempFileGuard::new(); let tmp_path = guard.tmp_path(".gz", tmp_dir); let tmp_str = tmp_path.to_string_lossy(); let input_file = File::open(input_path) .with_context(|| format!("failed to open input file: {input_path}"))?; let mut reader = BufReader::new(input_file); let mut writer = get_gz_writer(&tmp_str, false)?; let mut buffer = [0u8; 8192]; loop { let n = reader .read(&mut buffer) .with_context(|| format!("failed reading {input_path}"))?; if n == 0 { break; } writer .write_all(&buffer[..n]) .with_context(|| format!("failed writing {tmp_str}"))?; } finalize_bgzf_file(writer, &tmp_str)?; fs::rename(&tmp_path, &output_path) .with_context(|| format!("failed to rename {tmp_str} → {output_path}"))?; guard.disarm(); Ok(output_path) } /// Visit every data line in a Tabix-indexed BGZF file that overlaps `region`, /// calling `on_line` for each one. /// /// This is the zero-allocation core of the tabix fetch family. A single `String` /// buffer is reused across all lines; `on_line` receives a `&str` reference into /// that buffer — no intermediate `Vec` is ever built. Callers that need /// a collected result (e.g. [`fetch_tabix_lines`], [`fetch_bed`](super::bed::fetch_bed)) /// simply push into their own accumulator inside the closure. /// /// Header lines (`#`) and blank lines are skipped before `on_line` is called. /// Lines are newline-stripped before being passed to the callback. /// /// # Coordinate system /// /// `region` is **0-based half-open** `[start, end)`. The conversion to 1-based /// inclusive Tabix positions is performed internally: /// - `tabix_start = region.range.start + 1` /// - `tabix_end = region.range.end` /// /// # Arguments /// /// * `bgz_path` - Path to the BGZF file; index expected at `.tbi` /// * `region` - Genomic interval to query (0-based half-open) /// * `on_line` - Callback invoked for each matching data line; returning `Err` /// aborts the fetch immediately and propagates the error /// /// # Errors /// /// Returns an error if the index or file cannot be read, a seek fails, or /// `on_line` returns an error. pub fn fetch_tabix_lines_with( bgz_path: &str, region: &GenomeRange, mut on_line: F, ) -> anyhow::Result<()> where F: FnMut(&str) -> anyhow::Result<()>, { let tbi_path = format!("{bgz_path}.tbi"); let index = tabix::fs::read(&tbi_path) .with_context(|| format!("cannot read tabix index: {tbi_path}"))?; let rname = region.contig(); let ref_seq_id = match index .header() .ok_or_else(|| anyhow::anyhow!("tabix index has no header: {tbi_path}"))? .reference_sequence_names() .get_index_of(rname.as_bytes()) { Some(id) => id, None => return Ok(()), }; // 0-based half-open [start, end) → 1-based inclusive [start+1, end] let interval_start = Position::try_from(region.range.start as usize + 1) .context("region start overflow")?; let interval_end = Position::try_from(region.range.end as usize) .context("region end overflow")?; let interval = Interval::from(interval_start..=interval_end); let chunks = index .query(ref_seq_id, interval) .context("tabix region query failed")?; if chunks.is_empty() { return Ok(()); } let mut reader = bgzf::io::Reader::new( File::open(bgz_path).with_context(|| format!("cannot open {bgz_path}"))?, ); // Single buffer reused across all lines — no per-line allocation. let mut buf = String::new(); for chunk in chunks { reader.seek(chunk.start()).context("BGZF seek failed")?; loop { if reader.virtual_position() >= chunk.end() { break; } buf.clear(); let n = reader.read_line(&mut buf).context("BGZF read_line failed")?; if n == 0 { break; } let line = buf.trim_end_matches(|c| c == '\n' || c == '\r'); if !line.is_empty() && !line.starts_with('#') { on_line(line)?; } } } Ok(()) } /// Convenience wrapper over [`fetch_tabix_lines_with`] that collects results /// into a `Vec`. /// /// Prefer [`fetch_tabix_lines_with`] directly when you need to parse or filter /// lines, to avoid the intermediate string allocation. /// /// # Errors /// /// Returns an error if the index or file cannot be read. pub fn fetch_tabix_lines(bgz_path: &str, region: &GenomeRange) -> anyhow::Result> { let mut lines = Vec::new(); fetch_tabix_lines_with(bgz_path, region, |line| { lines.push(line.to_owned()); Ok(()) })?; Ok(lines) }