2 місяців тому · bceb3174ad
--- a/src/io/vcf.rs
+++ b/src/io/vcf.rs
@@ -1,15 +1,41 @@
 
				-use std::{
			
 
				-    fs::File,
			
 
				-    io::{BufRead, BufReader, Write},
			
 
				-};
			
 
				+//! VCF file I/O utilities.
			
 
				+//!
			
 
				+//! All write operations produce BGZF-compressed output with an accompanying
			
 
				+//! Tabix index (`.tbi`), enabling direct use with `bcftools`, `tabix`, and IGV
			
 
				+//! without a separate indexing step.
			
 
				+//!
			
 
				+//! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is
			
 
				+//! 1-based. The conversion `vcf_pos = position + 1` is applied internally.
			
 
				+
			
 
				+use std::io::{BufRead, BufReader};
			
 
				 
			
 
				 use anyhow::Context;
			
 
				 use log::{info, warn};
			
 
				+use noodles_core::Position;
			
 
				 
			
 
				-use crate::{io::writers::{finalize_bgzf_file, get_gz_writer}, variant::vcf_variant::VcfVariant};
			
 
				+use crate::{
			
 
				+    io::writers::{BgzTabixWriter, IndexFormat},
			
 
				+    variant::vcf_variant::VcfVariant,
			
 
				+};
			
 
				 
			
 
				 use super::{dict::read_dict, readers::get_reader};
			
 
				 
			
 
				+/// Load a BGZF-compressed or plain VCF file into memory.
			
 
				+///
			
 
				+/// Header lines (starting with `#`) and empty lines are skipped.
			
 
				+/// I/O errors on individual lines are logged as warnings and skipped.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `path` - Path to the VCF file (plain text or BGZF)
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// A vector of parsed variants in file order.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the file cannot be opened or a data line fails to parse.
			
 
				 pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
			
 
				     let reader = BufReader::new(get_reader(path)?);
			
 
				 
			
@@ -17,7 +43,7 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
 
				     for (i, line) in reader.lines().enumerate() {
			
 
				         match line {
			
 
				             Ok(line) => {
			
 
				-                if line.starts_with("#") {
			
 
				+                if line.starts_with('#') || line.is_empty() {
			
 
				                     continue;
			
 
				                 }
			
 
				                 res.push(line.parse().context(format!("Can't parse {line}"))?);
			
@@ -29,26 +55,59 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
 
				     Ok(res)
			
 
				 }
			
 
				 
			
 
				+/// Write variants to a BGZF-compressed VCF with a Tabix index.
			
 
				+///
			
 
				+/// Produces `path` (`.vcf.gz`) and `path.tbi` in a single atomic pass using
			
 
				+/// [`BgzTabixWriter`]. The output is immediately usable with `bcftools view`,
			
 
				+/// `tabix`, and IGV without a separate indexing step.
			
 
				+///
			
 
				+/// Tabix positions are derived from [`VcfVariant::position`]: the 0-based
			
 
				+/// `GenomePosition.position` is converted to 1-based VCF POS internally.
			
 
				+/// POS is used as both the tabix start and end, so only POS itself is indexed;
			
 
				+/// covering a multi-base record's full span would need the REF length or INFO/END.
			
 
				+///
			
 
				+/// The output header is minimal (`##fileformat` + column line). Use
			
 
				+/// [`vcf_header`] to build a full header with contig lines and INFO definitions.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `variants` - Variants to write, in coordinate-sorted order
			
 
				+/// * `path` - Destination path (must end in `.gz`)
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if writing fails or if the variants are not coordinate-sorted
			
 
				+/// (required by Tabix).
			
 
				 pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
			
 
				     info!("Writing: {path}");
			
 
				-    let mut writer = get_gz_writer(path, true)?;
			
 
				-    // write!(writer, b"##fileformat=VCFv4.2\n")
			
 
				-    writer.write_all(b"##fileformat=VCFv4.2\n")?;
			
 
				-    writer.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
			
 
				+
			
 
				+    let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?;
			
 
				+    writer.write_header(b"##fileformat=VCFv4.2\n")?;
			
 
				+    writer.write_header(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
			
 
				 
			
 
				     for variant in variants {
			
 
				-        writer.write_fmt(format_args!(
			
 
				-            "{}\n",
			
 
				-            variant.commun_deepvariant_clairs().into_vcf_row()
			
 
				-        ))?;
			
 
				+        let row = format!("{}\n", variant.commun_deepvariant_clairs().into_vcf_row());
			
 
				+        let rname = variant.position.contig();
			
 
				+        // GenomePosition.position is 0-based; VCF POS is 1-based
			
 
				+        let vcf_pos = variant.position.position as usize + 1;
			
 
				+        let pos = Position::try_from(vcf_pos)
			
 
				+            .with_context(|| format!("invalid VCF position: {vcf_pos}"))?;
			
 
				+        writer.write_record(row.as_bytes(), &rname, pos, pos)?;
			
 
				     }
			
 
				 
			
 
				-    finalize_bgzf_file(writer, path)
			
 
				+    writer.finish()
			
 
				 }
			
 
				 
			
 
				+/// A flat VCF row for CSV/TSV deserialization (e.g. via `serde`).
			
 
				+///
			
 
				+/// Use [`VcfVariant`] for richer parsed access. This struct exists for
			
 
				+/// simple tabular ingestion where that richer parsing is not needed.
			
 
				+///
			
 
				+/// `pos` is **1-based** as stored in the VCF file.
			
 
				 #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
			
 
				 pub struct VCFRow {
			
 
				     pub chr: String,
			
 
				+    /// 1-based position (VCF convention)
			
 
				     pub pos: u32,
			
 
				     pub id: String,
			
 
				     pub reference: String,
			
@@ -60,24 +119,37 @@ pub struct VCFRow {
 
				     pub value: String,
			
 
				 }
			
 
				 
			
 
				+/// Build a full VCF header with INFO definitions, contig lines, and Pandora version.
			
 
				+///
			
 
				+/// Includes SV-specific INFO fields (`SVTYPE`, `SVLEN`, `END`) and reads contig
			
 
				+/// lengths from a `.dict` file. The last element is always the column header line
			
 
				+/// (`#CHROM\tPOS\t...`).
			
 
				+///
			
 
				+/// Note: [`write_vcf`] uses a minimal inline header and does not call this function.
			
 
				+/// Use this when constructing a header separately before writing with a custom writer.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `dict` - Path to a sequence dictionary (`.dict`) file
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// Ordered header lines, each without a trailing newline.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the dictionary file cannot be read.
			
 
				 pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
			
 
				     let mut header = Vec::new();
			
 
				-    // file format
			
 
				     header.push("##fileformat=VCFv4.2".to_string());
			
 
				-    // Format
			
 
				-    // Filter
			
 
				-    // Info
			
 
				     header.push(
			
 
				         "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
			
 
				             .to_string(),
			
 
				     );
			
 
				     header.push("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">".to_string());
			
 
				     header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
			
 
				-
			
 
				-    // version
			
 
				     header.push(format!("##Pandora_lib_version={}", env!("CARGO_PKG_VERSION")));
			
 
				 
			
 
				-    // contig
			
 
				     read_dict(dict)?
			
 
				         .into_iter()
			
 
				         .for_each(|(id, len)| header.push(format!("##contig=<ID={id},length={len}>")));