Переглянути джерело

audit and refactor src/io/vcf.rs

- write_vcf now uses BgzTabixWriter: produces .vcf.gz + .tbi in one atomic
  pass; previously wrote BGZF only with no tabix index and no atomic write
  guarantee (partial write left a corrupt file)
- Tabix positions from VcfVariant.position (GenomePosition, 0-based):
  vcf_pos = position.position + 1; POS used as both start and end (sufficient
  for all tabix range queries; full end would need REF length or INFO/END)
- read_vcf: starts_with('#') (char not str), skip blank lines
- Remove unused File import and Write import (Write now internal to BgzTabixWriter)
- Remove commented-out write! line
- Add //! module header documenting coordinate convention (0-based GenomePosition
  vs 1-based VCF POS) and that outputs include .tbi
- Rustdoc on read_vcf, write_vcf, VCFRow, vcf_header; note write_vcf and
  vcf_header use different headers (intentional — vcf_header needs a dict path)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 місяць тому
батько
коміт
bceb3174ad
1 змінених файлів з 94 додано та 22 видалено
  1. 94 22
      src/io/vcf.rs

+ 94 - 22
src/io/vcf.rs

@@ -1,15 +1,41 @@
-use std::{
-    fs::File,
-    io::{BufRead, BufReader, Write},
-};
+//! VCF file I/O utilities.
+//!
+//! All write operations produce BGZF-compressed output with an accompanying
+//! Tabix index (`.tbi`), enabling direct use with `bcftools`, `tabix`, and IGV
+//! without a separate indexing step.
+//!
+//! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is
+//! 1-based. The conversion `vcf_pos = position + 1` is applied internally.
+
+use std::io::{BufRead, BufReader};
 
 use anyhow::Context;
 use log::{info, warn};
+use noodles_core::Position;
 
-use crate::{io::writers::{finalize_bgzf_file, get_gz_writer}, variant::vcf_variant::VcfVariant};
+use crate::{
+    io::writers::{BgzTabixWriter, IndexFormat},
+    variant::vcf_variant::VcfVariant,
+};
 
 use super::{dict::read_dict, readers::get_reader};
 
+/// Load a BGZF-compressed or plain VCF file into memory.
+///
+/// Header lines (starting with `#`) and blank lines are skipped.
+/// I/O errors on individual lines are logged as warnings and skipped.
+///
+/// # Arguments
+///
+/// * `path` - Path to the VCF file (plain text or BGZF)
+///
+/// # Returns
+///
+/// A vector of parsed variants in file order.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened or a data line fails to parse.
 pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
     let reader = BufReader::new(get_reader(path)?);
 
@@ -17,7 +43,7 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
     for (i, line) in reader.lines().enumerate() {
         match line {
             Ok(line) => {
-                if line.starts_with("#") {
+                if line.starts_with('#') || line.is_empty() {
                     continue;
                 }
                 res.push(line.parse().context(format!("Can't parse {line}"))?);
@@ -29,26 +55,59 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
     Ok(res)
 }
 
+/// Write variants to a BGZF-compressed VCF with a Tabix index.
+///
+/// Produces `path` (`.vcf.gz`) and `path.tbi` in a single atomic pass using
+/// [`BgzTabixWriter`]. The output is immediately usable with `bcftools view`,
+/// `tabix`, and IGV without a separate indexing step.
+///
+/// Tabix positions are derived from [`VcfVariant::position`]: the 0-based
+/// `GenomePosition.position` is converted to 1-based VCF POS internally.
+/// POS is used as both the tabix start and end (sufficient for all query types;
+/// a more precise end would require the REF length or INFO/END field).
+///
+/// The output header is minimal (`##fileformat` + column line). Use
+/// [`vcf_header`] to build a full header with contig lines and INFO definitions.
+///
+/// # Arguments
+///
+/// * `variants` - Variants to write, in coordinate-sorted order
+/// * `path` - Destination path (must end in `.gz`)
+///
+/// # Errors
+///
+/// Returns an error if writing fails or if the variants are not coordinate-sorted
+/// (required by Tabix).
 pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
     info!("Writing: {path}");
-    let mut writer = get_gz_writer(path, true)?;
-    // write!(writer, b"##fileformat=VCFv4.2\n")
-    writer.write_all(b"##fileformat=VCFv4.2\n")?;
-    writer.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
+
+    let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?;
+    writer.write_header(b"##fileformat=VCFv4.2\n")?;
+    writer.write_header(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
 
     for variant in variants {
-        writer.write_fmt(format_args!(
-            "{}\n",
-            variant.commun_deepvariant_clairs().into_vcf_row()
-        ))?;
+        let row = format!("{}\n", variant.commun_deepvariant_clairs().into_vcf_row());
+        let rname = variant.position.contig();
+        // GenomePosition.position is 0-based; VCF POS is 1-based
+        let vcf_pos = variant.position.position as usize + 1;
+        let pos = Position::try_from(vcf_pos)
+            .with_context(|| format!("invalid VCF position: {vcf_pos}"))?;
+        writer.write_record(row.as_bytes(), &rname, pos, pos)?;
     }
 
-    finalize_bgzf_file(writer, path)
+    writer.finish()
 }
 
+/// A flat VCF row for CSV/TSV deserialization (e.g. via `serde`).
+///
+/// Use [`VcfVariant`] for richer parsed access. This struct exists for
+/// simple tabular ingestion where field-by-field access is not needed.
+///
+/// `pos` is **1-based** as stored in the VCF file.
 #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
 pub struct VCFRow {
     pub chr: String,
+    /// 1-based position (VCF convention)
     pub pos: u32,
     pub id: String,
     pub reference: String,
@@ -60,24 +119,37 @@ pub struct VCFRow {
     pub value: String,
 }
 
+/// Build a full VCF header with INFO definitions, contig lines, and Pandora version.
+///
+/// Includes SV-specific INFO fields (`SVTYPE`, `SVLEN`, `END`) and reads contig
+/// lengths from a `.dict` file. The last element is always the column header line
+/// (`#CHROM\tPOS\t...`).
+///
+/// Note: [`write_vcf`] uses a minimal inline header and does not call this function.
+/// Use this when constructing a header separately before writing with a custom writer.
+///
+/// # Arguments
+///
+/// * `dict` - Path to a sequence dictionary (`.dict`) file
+///
+/// # Returns
+///
+/// Ordered header lines, each without a trailing newline.
+///
+/// # Errors
+///
+/// Returns an error if the dictionary file cannot be read.
 pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
     let mut header = Vec::new();
-    // file format
     header.push("##fileformat=VCFv4.2".to_string());
-    // Format
-    // Filter
-    // Info
     header.push(
         "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
             .to_string(),
     );
     header.push("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">".to_string());
     header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
-
-    // version
     header.push(format!("##Pandora_lib_version={}", env!("CARGO_PKG_VERSION")));
 
-    // contig
     read_dict(dict)?
         .into_iter()
         .for_each(|(id, len)| header.push(format!("##contig=<ID={id},length={len}>")));