Thomas
/
pandora_lib_promethion


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
							//! VCF file I/O utilities.
//!
//! Write operations produce BGZF-compressed output with an accompanying Tabix
//! index (`.tbi`), enabling direct use with `bcftools`, `tabix`, and IGV without
//! a separate indexing step. Read operations support both sequential loading and
//! random-access region queries via the Tabix index.
//!
//! | Function | Purpose |
//! |----------|---------|
//! | [`read_vcf`] | Load a full VCF file into memory |
//! | [`write_vcf`] | Write variants to `.vcf.gz` + `.tbi` atomically |
//! | [`fetch_vcf`] | Random-access region fetch from a Tabix-indexed `.vcf.gz` |
//! | [`vcf_header`] | Build a full VCF header with contig lines and INFO definitions |
//!
//! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is
//! 1-based. The conversion `vcf_pos = position + 1` is applied internally.

use std::{
    borrow::Borrow,
    io::{BufRead, BufReader},
};

use anyhow::Context;
use log::{info, warn};
use noodles_core::Position;

use crate::{
    io::writers::{BgzTabixWriter, IndexFormat},
    positions::GenomeRange,
    variant::vcf_variant::{from_multiallelic, VcfVariant},
};

use super::{
    dict::read_dict,
    readers::{fetch_tabix_lines_with, get_reader, TabixReader},
};

/// Load a BGZF-compressed or plain VCF file into memory.
///
/// Header lines (starting with `#`) and blank lines are skipped.
/// I/O errors on individual lines are logged as warnings and skipped.
///
/// # Arguments
///
/// * `path` - Path to the VCF file (plain text or BGZF)
///
/// # Returns
///
/// A vector of parsed variants in file order.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or a data line fails to parse.
pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
    let reader = BufReader::new(get_reader(path)?);

    let mut res = Vec::new();
    for (i, line) in reader.lines().enumerate() {
        let line_no = i + 1;
        match line {
            Ok(line) => {
                if line.starts_with('#') || line.is_empty() {
                    continue;
                }
                res.push(line.parse().with_context(|| {
                    format!("failed parsing VCF record at {path}:{line_no}: {line}")
                })?);
            }
            Err(e) => warn!("Can't read {path}:{line_no}: {e}"),
        }
    }

    Ok(res)
}

/// Write variants to a BGZF-compressed VCF with a Tabix index.
///
/// Produces `path` (`.vcf.gz`) and `path.tbi` in a single atomic pass using
/// [`BgzTabixWriter`]. The output is immediately usable with `bcftools view`,
/// `tabix`, and IGV without a separate indexing step.
///
/// Tabix positions are derived from [`VcfVariant::position`]: the 0-based
/// `GenomePosition.position` is converted to 1-based VCF POS internally.
/// POS is used as both the tabix start and end (sufficient for all query types;
/// a more precise end would require the REF length or INFO/END field).
///
/// The output header is minimal (`##fileformat` + column line). Use
/// [`vcf_header`] to build a full header with contig lines and INFO definitions.
///
/// # Arguments
///
/// * `variants` - Variants to write, in coordinate-sorted order
/// * `path` - Destination path (must end in `.gz`)
///
/// # Errors
///
/// Returns an error if writing fails or if the variants are not coordinate-sorted
/// (required by Tabix).
pub fn write_vcf<B: Borrow<VcfVariant>>(variants: &[B], path: &str) -> anyhow::Result<()> {
    info!("Writing: {path}");

    let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?;
    writer.write_header(b"##fileformat=VCFv4.2\n")?;
    writer.write_header(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;

    for variant in variants {
        let variant_ref = variant.borrow();
        let row = format!(
            "{}\n",
            variant_ref.commun_deepvariant_clairs().into_vcf_row()
        );
        let rname = variant_ref.position.contig();

        // GenomePosition.position is 0-based; VCF POS is 1-based
        let vcf_pos = variant_ref.position.position as usize + 1;
        let pos = Position::try_from(vcf_pos)
            .with_context(|| format!("invalid VCF position: {vcf_pos}"))?;
        writer.write_record(row.as_bytes(), &rname, pos, pos)?;
    }

    writer.finish()
}

/// Fetch variants whose POS falls inside `region` from a Tabix-indexed BGZF VCF file.
///
/// Delegates BGZF seeking and line extraction to [`fetch_tabix_lines`], then
/// parses each line as a [`VcfVariant`] and applies a secondary POS filter to
/// discard any false positives from the Tabix bin query.
///
/// TODO: this is POS-based, not true interval-overlap fetching. Deletions and
/// symbolic SVs with `INFO/END` can overlap a region even when their POS is
/// outside it. Supporting that correctly should be done alongside explicit VCF
/// span parsing, not inside this wrapper.
///
/// TODO: VCF allows `POS=0` as a telomere sentinel. Parsing preserves that as
/// internal position `0`, but tabix region queries use 1-based coordinates and
/// this wrapper does not currently perform a separate sentinel lookup.
///
/// # Coordinate system
///
/// `region` is **0-based half-open** `[start, end)`. The Tabix index was built
/// with VCF POS (1-based), so [`fetch_tabix_lines`] converts internally:
/// `tabix_start = region.start + 1`, `tabix_end = region.end`.
///
/// # Arguments
///
/// * `bgz_path` - Path to the `.vcf.gz` file; index expected at `<bgz_path>.tbi`
/// * `region` - Genomic region to query (0-based half-open)
///
/// # Returns
///
/// Parsed [`VcfVariant`]s whose POS is inside `region`, in file order.
///
/// # Errors
///
/// Returns an error if the index or file cannot be read, or if a fetched line
/// fails to parse as a VCF variant.
pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<VcfVariant>> {
    let mut variants = Vec::new();
    fetch_tabix_lines_with(bgz_path, region, |line| {
        let v: VcfVariant = line
            .parse()
            .with_context(|| format!("failed to parse VCF line: {line}"))?;
        if v.position.contig == region.contig && region.range.contains(&v.position.position) {
            for split in from_multiallelic(v)? {
                variants.push(split);
            }
        }
        Ok(())
    })?;
    Ok(variants)
}

/// Like [`fetch_vcf`] but reuses a pre-opened [`TabixReader`] instead of
/// reopening the index and BGZF file on every call.
///
/// Use this inside `par_chunks` / `for_each_init` workers where one reader
/// is initialised per thread and shared across all variants in that chunk.
pub fn fetch_vcf_with_reader(
    reader: &mut TabixReader,
    region: &GenomeRange,
) -> anyhow::Result<Vec<VcfVariant>> {
    let mut variants = Vec::new();
    reader.fetch_with(region, |line| {
        let v: VcfVariant = line
            .parse()
            .with_context(|| format!("failed to parse VCF line: {line}"))?;
        if v.position.contig == region.contig && region.range.contains(&v.position.position) {
            for split in from_multiallelic(v)? {
                variants.push(split);
            }
        }
        Ok(())
    })?;
    Ok(variants)
}

/// A flat VCF row for tab-separated line parsing.
///
/// Use [`VcfVariant`] for richer parsed access. This struct exists for
/// simple tabular ingestion where field-by-field access is not needed.
///
/// `pos` is **1-based** as stored in the VCF file. Parse from a tab-separated
/// line via `FromStr`; the `serde::Deserialize` derive is kept for callers
/// that still use serde-based deserialization.
#[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
pub struct VCFRow {
    pub chr: String,
    /// 1-based position (VCF convention)
    pub pos: u32,
    pub id: String,
    pub reference: String,
    pub alt: String,
    pub qual: String,
    pub filter: String,
    pub info: String,
    pub format: String,
    pub value: String,
}

impl std::str::FromStr for VCFRow {
    type Err = anyhow::Error;

    /// Parse a tab-separated VCF data line (no header, 10 positional columns).
    fn from_str(s: &str) -> anyhow::Result<Self> {
        let f: Vec<&str> = s.split('\t').collect();
        let get = |i: usize, name: &str| -> anyhow::Result<&str> {
            f.get(i)
                .copied()
                .ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})"))
        };
        Ok(Self {
            chr: get(0, "chr")?.to_string(),
            pos: get(1, "pos")?.parse().context("bad pos")?,
            id: get(2, "id")?.to_string(),
            reference: get(3, "reference")?.to_string(),
            alt: get(4, "alt")?.to_string(),
            qual: get(5, "qual")?.to_string(),
            filter: get(6, "filter")?.to_string(),
            info: get(7, "info")?.to_string(),
            format: get(8, "format")?.to_string(),
            value: get(9, "value")?.to_string(),
        })
    }
}

/// Build a full VCF header with INFO definitions, contig lines, and Pandora version.
///
/// Includes SV-specific INFO fields (`SVTYPE`, `SVLEN`, `END`) and reads contig
/// lengths from a `.dict` file. The last element is always the column header line
/// (`#CHROM\tPOS\t...`).
///
/// Note: [`write_vcf`] uses a minimal inline header and does not call this function.
/// Use this when constructing a header separately before writing with a custom writer.
///
/// # Arguments
///
/// * `dict` - Path to a sequence dictionary (`.dict`) file
///
/// # Returns
///
/// Ordered header lines, each without a trailing newline.
///
/// # Errors
///
/// Returns an error if the dictionary file cannot be read.
pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
    let mut header = Vec::new();
    header.push("##fileformat=VCFv4.2".to_string());
    header.push(
        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
            .to_string(),
    );
    header.push("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">".to_string());
    header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
    header.push(format!(
        "##Pandora_lib_version={}",
        env!("CARGO_PKG_VERSION")
    ));

    read_dict(dict)?
        .into_iter()
        .for_each(|(id, len)| header.push(format!("##contig=<ID={id},length={len}>")));

    header.push("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE".to_string());

    Ok(header)
}

#[cfg(test)]
mod tests {
    use crate::config::Config;

    use super::*;

    #[test]
    fn vcf_dbsnp() -> anyhow::Result<()> {
        let config = Config::default();
        let v = fetch_vcf(&config.db_snp, &GenomeRange::new("chr1", 5656, 5660))?;
        println!("{v:#?}");
        Ok(())
    }
}