//! VCF file I/O utilities. //! //! Write operations produce BGZF-compressed output with an accompanying Tabix //! index (`.tbi`), enabling direct use with `bcftools`, `tabix`, and IGV without //! a separate indexing step. Read operations support both sequential loading and //! random-access region queries via the Tabix index. //! //! | Function | Purpose | //! |----------|---------| //! | [`read_vcf`] | Load a full VCF file into memory | //! | [`write_vcf`] | Write variants to `.vcf.gz` + `.tbi` atomically | //! | [`fetch_vcf`] | Random-access region fetch from a Tabix-indexed `.vcf.gz` | //! | [`vcf_header`] | Build a full VCF header with contig lines and INFO definitions | //! //! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is //! 1-based. The conversion `vcf_pos = position + 1` is applied internally. use std::io::{BufRead, BufReader}; use anyhow::Context; use log::{info, warn}; use noodles_core::Position; use crate::{ io::writers::{BgzTabixWriter, IndexFormat}, positions::GenomeRange, variant::vcf_variant::VcfVariant, }; use super::{ dict::read_dict, readers::{fetch_tabix_lines_with, get_reader}, }; /// Load a BGZF-compressed or plain VCF file into memory. /// /// Header lines (starting with `#`) and blank lines are skipped. /// I/O errors on individual lines are logged as warnings and skipped. /// /// # Arguments /// /// * `path` - Path to the VCF file (plain text or BGZF) /// /// # Returns /// /// A vector of parsed variants in file order. /// /// # Errors /// /// Returns an error if the file cannot be opened or a data line fails to parse. pub fn read_vcf(path: &str) -> anyhow::Result> { let reader = BufReader::new(get_reader(path)?); let mut res = Vec::new(); for (i, line) in reader.lines().enumerate() { let line_no = i + 1; match line { Ok(line) => { if line.starts_with('#') || line.is_empty() { continue; } res.push(line.parse().with_context(|| { format!("failed parsing VCF record at {path}:{line_no}: {line}") })?); } Err(e) => warn!("Can't read {path}:{line_no}: {e}"), } } Ok(res) } /// Write variants to a BGZF-compressed VCF with a Tabix index. /// /// Produces `path` (`.vcf.gz`) and `path.tbi` in a single atomic pass using /// [`BgzTabixWriter`]. The output is immediately usable with `bcftools view`, /// `tabix`, and IGV without a separate indexing step. /// /// Tabix positions are derived from [`VcfVariant::position`]: the 0-based /// `GenomePosition.position` is converted to 1-based VCF POS internally. /// POS is used as both the tabix start and end (sufficient for all query types; /// a more precise end would require the REF length or INFO/END field). /// /// The output header is minimal (`##fileformat` + column line). Use /// [`vcf_header`] to build a full header with contig lines and INFO definitions. /// /// # Arguments /// /// * `variants` - Variants to write, in coordinate-sorted order /// * `path` - Destination path (must end in `.gz`) /// /// # Errors /// /// Returns an error if writing fails or if the variants are not coordinate-sorted /// (required by Tabix). pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> { info!("Writing: {path}"); let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?; writer.write_header(b"##fileformat=VCFv4.2\n")?; writer.write_header(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?; for variant in variants { let row = format!("{}\n", variant.commun_deepvariant_clairs().into_vcf_row()); let rname = variant.position.contig(); // GenomePosition.position is 0-based; VCF POS is 1-based let vcf_pos = variant.position.position as usize + 1; let pos = Position::try_from(vcf_pos) .with_context(|| format!("invalid VCF position: {vcf_pos}"))?; writer.write_record(row.as_bytes(), &rname, pos, pos)?; } writer.finish() } /// Fetch variants whose POS falls inside `region` from a Tabix-indexed BGZF VCF file. /// /// Delegates BGZF seeking and line extraction to [`fetch_tabix_lines`], then /// parses each line as a [`VcfVariant`] and applies a secondary POS filter to /// discard any false positives from the Tabix bin query. /// /// TODO: this is POS-based, not true interval-overlap fetching. Deletions and /// symbolic SVs with `INFO/END` can overlap a region even when their POS is /// outside it. Supporting that correctly should be done alongside explicit VCF /// span parsing, not inside this wrapper. /// /// TODO: VCF allows `POS=0` as a telomere sentinel. Parsing preserves that as /// internal position `0`, but tabix region queries use 1-based coordinates and /// this wrapper does not currently perform a separate sentinel lookup. /// /// # Coordinate system /// /// `region` is **0-based half-open** `[start, end)`. The Tabix index was built /// with VCF POS (1-based), so [`fetch_tabix_lines`] converts internally: /// `tabix_start = region.start + 1`, `tabix_end = region.end`. /// /// # Arguments /// /// * `bgz_path` - Path to the `.vcf.gz` file; index expected at `.tbi` /// * `region` - Genomic region to query (0-based half-open) /// /// # Returns /// /// Parsed [`VcfVariant`]s whose POS is inside `region`, in file order. /// /// # Errors /// /// Returns an error if the index or file cannot be read, or if a fetched line /// fails to parse as a VCF variant. pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result> { let mut variants = Vec::new(); fetch_tabix_lines_with(bgz_path, region, |line| { let v: VcfVariant = line .parse() .with_context(|| format!("failed to parse VCF line: {line}"))?; if v.position.contig == region.contig && region.range.contains(&v.position.position) { variants.push(v); } Ok(()) })?; Ok(variants) } /// A flat VCF row for tab-separated line parsing. /// /// Use [`VcfVariant`] for richer parsed access. This struct exists for /// simple tabular ingestion where field-by-field access is not needed. /// /// `pos` is **1-based** as stored in the VCF file. Parse from a tab-separated /// line via `FromStr`; the `serde::Deserialize` derive is kept for callers /// that still use serde-based deserialization. #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)] pub struct VCFRow { pub chr: String, /// 1-based position (VCF convention) pub pos: u32, pub id: String, pub reference: String, pub alt: String, pub qual: String, pub filter: String, pub info: String, pub format: String, pub value: String, } impl std::str::FromStr for VCFRow { type Err = anyhow::Error; /// Parse a tab-separated VCF data line (no header, 10 positional columns). fn from_str(s: &str) -> anyhow::Result { let f: Vec<&str> = s.split('\t').collect(); let get = |i: usize, name: &str| -> anyhow::Result<&str> { f.get(i) .copied() .ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})")) }; Ok(Self { chr: get(0, "chr")?.to_string(), pos: get(1, "pos")?.parse().context("bad pos")?, id: get(2, "id")?.to_string(), reference: get(3, "reference")?.to_string(), alt: get(4, "alt")?.to_string(), qual: get(5, "qual")?.to_string(), filter: get(6, "filter")?.to_string(), info: get(7, "info")?.to_string(), format: get(8, "format")?.to_string(), value: get(9, "value")?.to_string(), }) } } /// Build a full VCF header with INFO definitions, contig lines, and Pandora version. /// /// Includes SV-specific INFO fields (`SVTYPE`, `SVLEN`, `END`) and reads contig /// lengths from a `.dict` file. The last element is always the column header line /// (`#CHROM\tPOS\t...`). /// /// Note: [`write_vcf`] uses a minimal inline header and does not call this function. /// Use this when constructing a header separately before writing with a custom writer. /// /// # Arguments /// /// * `dict` - Path to a sequence dictionary (`.dict`) file /// /// # Returns /// /// Ordered header lines, each without a trailing newline. /// /// # Errors /// /// Returns an error if the dictionary file cannot be read. pub fn vcf_header(dict: &str) -> anyhow::Result> { let mut header = Vec::new(); header.push("##fileformat=VCFv4.2".to_string()); header.push( "##INFO=" .to_string(), ); header.push("##INFO=".to_string()); header.push("##INFO=".to_string()); header.push(format!( "##Pandora_lib_version={}", env!("CARGO_PKG_VERSION") )); read_dict(dict)? .into_iter() .for_each(|(id, len)| header.push(format!("##contig="))); header.push("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE".to_string()); Ok(header) } #[cfg(test)] mod tests { use crate::config::Config; use super::*; #[test] fn vcf_dbsnp() { let config = Config::default(); fetch_vcf(conf.db_snp, &GenomeRange::new("chr10", , end)) } }