преди 2 месеца · bceb3174ad
--- a/src/io/vcf.rs
+++ b/src/io/vcf.rs
@@ -1,15 +1,41 @@
 
															-use std::{
														
 
															-    fs::File,
														
 
															-    io::{BufRead, BufReader, Write},
														
 
															-};
														
 
															+//! VCF file I/O utilities.
														
 
															+//!
														
 
															+//! All write operations produce BGZF-compressed output with an accompanying
														
 
															+//! Tabix index (`.tbi`), enabling direct use with `bcftools`, `tabix`, and IGV
														
 
															+//! without a separate indexing step.
														
 
															+//!
														
 
															+//! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is
														
 
															+//! 1-based. The conversion `vcf_pos = position + 1` is applied internally.
														
 
															+
														
 
															+use std::io::{BufRead, BufReader};
														
 
															 use anyhow::Context;
														
 
															 use log::{info, warn};
														
 
															+use noodles_core::Position;
														
 
															-use crate::{io::writers::{finalize_bgzf_file, get_gz_writer}, variant::vcf_variant::VcfVariant};
														
 
															+use crate::{
														
 
															+    io::writers::{BgzTabixWriter, IndexFormat},
														
 
															+    variant::vcf_variant::VcfVariant,
														
 
															+};
														
 
															 use super::{dict::read_dict, readers::get_reader};
														
 
															+/// Load a BGZF-compressed or plain VCF file into memory.
														
 
															+///
														
 
															+/// Header lines (starting with `#`) and blank lines are skipped.
														
 
															+/// I/O errors on individual lines are logged as warnings and skipped.
														
 
															+///
														
 
															+/// # Arguments
														
 
															+///
														
 
															+/// * `path` - Path to the VCF file (plain text or BGZF)
														
 
															+///
														
 
															+/// # Returns
														
 
															+///
														
 
															+/// A vector of parsed variants in file order.
														
 
															+///
														
 
															+/// # Errors
														
 
															+///
														
 
															+/// Returns an error if the file cannot be opened or a data line fails to parse.
														
 
															 pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
														
 
															     let reader = BufReader::new(get_reader(path)?);
														
@@ -17,7 +43,7 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
 
															     for (i, line) in reader.lines().enumerate() {
														
 
															         match line {
														
 
															             Ok(line) => {
														
 
															-                if line.starts_with("#") {
														
 
															+                if line.starts_with('#') || line.is_empty() {
														
 
															                     continue;
														
 
															                 }
														
 
															                 res.push(line.parse().context(format!("Can't parse {line}"))?);
														
@@ -29,26 +55,59 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
 
															     Ok(res)
														
 
															 }
														
 
															+/// Write variants to a BGZF-compressed VCF with a Tabix index.
														
 
															+///
														
 
															+/// Produces `path` (`.vcf.gz`) and `path.tbi` in a single atomic pass using
														
 
															+/// [`BgzTabixWriter`]. The output is immediately usable with `bcftools view`,
														
 
															+/// `tabix`, and IGV without a separate indexing step.
														
 
															+///
														
 
															+/// Tabix positions are derived from [`VcfVariant::position`]: the 0-based
														
 
															+/// `GenomePosition.position` is converted to 1-based VCF POS internally.
														
 
															+/// POS is used as both the tabix start and end (sufficient for all query types;
														
 
															+/// a more precise end would require the REF length or INFO/END field).
														
 
															+///
														
 
															+/// The output header is minimal (`##fileformat` + column line). Use
														
 
															+/// [`vcf_header`] to build a full header with contig lines and INFO definitions.
														
 
															+///
														
 
															+/// # Arguments
														
 
															+///
														
 
															+/// * `variants` - Variants to write, in coordinate-sorted order
														
 
															+/// * `path` - Destination path (must end in `.gz`)
														
 
															+///
														
 
															+/// # Errors
														
 
															+///
														
 
															+/// Returns an error if writing fails or if the variants are not coordinate-sorted
														
 
															+/// (required by Tabix).
														
 
															 pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
														
 
															     info!("Writing: {path}");
														
 
															-    let mut writer = get_gz_writer(path, true)?;
														
 
															-    // write!(writer, b"##fileformat=VCFv4.2\n")
														
 
															-    writer.write_all(b"##fileformat=VCFv4.2\n")?;
														
 
															-    writer.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
														
 
															+
														
 
															+    let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?;
														
 
															+    writer.write_header(b"##fileformat=VCFv4.2\n")?;
														
 
															+    writer.write_header(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
														
 
															     for variant in variants {
														
 
															-        writer.write_fmt(format_args!(
														
 
															-            "{}\n",
														
 
															-            variant.commun_deepvariant_clairs().into_vcf_row()
														
 
															-        ))?;
														
 
															+        let row = format!("{}\n", variant.commun_deepvariant_clairs().into_vcf_row());
														
 
															+        let rname = variant.position.contig();
														
 
															+        // GenomePosition.position is 0-based; VCF POS is 1-based
														
 
															+        let vcf_pos = variant.position.position as usize + 1;
														
 
															+        let pos = Position::try_from(vcf_pos)
														
 
															+            .with_context(|| format!("invalid VCF position: {vcf_pos}"))?;
														
 
															+        writer.write_record(row.as_bytes(), &rname, pos, pos)?;
														
 
															     }
														
 
															-    finalize_bgzf_file(writer, path)
														
 
															+    writer.finish()
														
 
															 }
														
 
															+/// A flat VCF row for CSV/TSV deserialization (e.g. via `serde`).
														
 
															+///
														
 
															+/// Use [`VcfVariant`] for richer parsed access. This struct exists for
														
 
															+/// simple tabular ingestion where field-by-field access is not needed.
														
 
															+///
														
 
															+/// `pos` is **1-based** as stored in the VCF file.
														
 
															 #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
														
 
															 pub struct VCFRow {
														
 
															     pub chr: String,
														
 
															+    /// 1-based position (VCF convention)
														
 
															     pub pos: u32,
														
 
															     pub id: String,
														
 
															     pub reference: String,
														
@@ -60,24 +119,37 @@ pub struct VCFRow {
 
															     pub value: String,
														
 
															 }
														
 
															+/// Build a full VCF header with INFO definitions, contig lines, and Pandora version.
														
 
															+///
														
 
															+/// Includes SV-specific INFO fields (`SVTYPE`, `SVLEN`, `END`) and reads contig
														
 
															+/// lengths from a `.dict` file. The last element is always the column header line
														
 
															+/// (`#CHROM\tPOS\t...`).
														
 
															+///
														
 
															+/// Note: [`write_vcf`] uses a minimal inline header and does not call this function.
														
 
															+/// Use this when constructing a header separately before writing with a custom writer.
														
 
															+///
														
 
															+/// # Arguments
														
 
															+///
														
 
															+/// * `dict` - Path to a sequence dictionary (`.dict`) file
														
 
															+///
														
 
															+/// # Returns
														
 
															+///
														
 
															+/// Ordered header lines, each without a trailing newline.
														
 
															+///
														
 
															+/// # Errors
														
 
															+///
														
 
															+/// Returns an error if the dictionary file cannot be read.
														
 
															 pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
														
 
															     let mut header = Vec::new();
														
 
															-    // file format
														
 
															     header.push("##fileformat=VCFv4.2".to_string());
														
 
															-    // Format
														
 
															-    // Filter
														
 
															-    // Info
														
 
															     header.push(
														
 
															         "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
														
 
															             .to_string(),
														
 
															     );
														
 
															     header.push("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">".to_string());
														
 
															     header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
														
 
															-
														
 
															-    // version
														
 
															     header.push(format!("##Pandora_lib_version={}", env!("CARGO_PKG_VERSION")));
														
 
															-    // contig
														
 
															     read_dict(dict)?
														
 
															         .into_iter()
														
 
															         .for_each(|(id, len)| header.push(format!("##contig=<ID={id},length={len}>")));