Browse Source

VCF write

Thomas 8 months ago
parent
commit
86513cba22
3 changed files with 185 additions and 28 deletions
  1. 3 0
      src/pipes/somatic.rs
  2. 72 26
      src/variant/variant.rs
  3. 110 2
      src/variant/variant_collection.rs

+ 3 - 0
src/pipes/somatic.rs

@@ -185,6 +185,7 @@ impl Run for SomaticPipe {
         // Define output paths for the final somatic variants
         let result_json = format!("{}/{id}_somatic_variants.json.gz", config.tumoral_dir(&id));
         let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id));
+        let result_vcf = format!("{}/{id}_somatic_variants.vcf.gz", config.tumoral_dir(&id));
 
         if Path::new(&result_bit).exists() && !config.somatic_pipe_force {
             return Err(anyhow::anyhow!(
@@ -458,6 +459,8 @@ impl Run for SomaticPipe {
         info!("Final unique variants: {}", variants.data.len());
         variants.save_to_json(&result_json)?;
         variants.save_to_file(&result_bit)?;
+        variants.write_vcf(&result_vcf, &config.dict_file, config.somatic_pipe_force)?;
+
 
         Ok(())
     }

+ 72 - 26
src/variant/variant.rs

@@ -346,40 +346,66 @@ impl VcfVariant {
         }
     }
 
+    /// Returns the length of the deletion if the variant is a deletion (`DEL`).
+    ///
+    /// The length is determined using the following priority:
+    /// 1. If an `END` field is present in the `INFO` section, it is used to compute the length as `END - POS`.
+    /// 2. If an `SVLEN` field is present, its absolute value is used.
+    /// 3. If both `reference` and `alternative` alleles are available and compatible,
+    ///    the length is inferred from the length of the deleted sequence.
+    ///
+    /// Returns `None` if the variant is not a deletion or if no valid length information can be derived.
     pub fn deletion_len(&self) -> Option<u32> {
         if self.alteration_category() != AlterationCategory::DEL {
             return None;
         }
 
-        self.infos
-            .0
-            .iter()
-            .find_map(|i| {
-                if let Info::SVLEN(len) = i {
-                    Some(*len as u32)
-                } else {
-                    None
-                }
-            })
-            .or_else(|| {
-                if let (
-                    ReferenceAlternative::Nucleotides(nt),
-                    ReferenceAlternative::Nucleotide(_),
-                ) = (&self.reference, &self.alternative)
-                {
-                    Some(nt.len().saturating_sub(1) as u32)
-                } else {
-                    None
-                }
-            })
+        // Try using the END field
+        if let Some(len) = self.infos.0.iter().find_map(|i| {
+            if let Info::END(end) = i {
+                Some(end.saturating_sub(self.position.position))
+            } else {
+                None
+            }
+        }) {
+            return Some(len);
+        }
+
+        // Fallback to SVLEN field
+        if let Some(len) = self.infos.0.iter().find_map(|i| {
+            if let Info::SVLEN(len) = i {
+                Some(len.unsigned_abs())
+            } else {
+                None
+            }
+        }) {
+            return Some(len);
+        }
+
+        // Final fallback: infer from reference and alternative
+        if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_)) =
+            (&self.reference, &self.alternative)
+        {
+            return Some(nt.len().saturating_sub(1) as u32);
+        }
+
+        None
     }
 
     pub fn deletion_desc(&self) -> Option<DeletionDesc> {
-        self.deletion_len().map(|len| DeletionDesc {
-            contig: self.position.contig(),
-            start: self.position.position + 1,
-            end: self.position.position.checked_add(len).unwrap_or(u32::MAX), // TODO
-        })
+        if let Some(len) = self.deletion_len() {
+            Some(DeletionDesc {
+                contig: self.position.contig(),
+                start: self.position.position + 1,
+                end: self
+                    .position
+                    .position
+                    .checked_add(len)
+                    .unwrap_or(self.position.position + 1),
+            })
+        } else {
+            None
+        }
     }
 }
 
@@ -827,6 +853,18 @@ pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
 }
 
 impl Info {
+    /// Returns the VCF key name (e.g. "DP", "SVTYPE", "PRECISE") for this INFO field.
+    ///
+    /// This uses the string representation (`Display`) and extracts the part before `=`,
+    /// which is valid for both key=value and flag-only entries.
+    pub fn key(&self) -> String {
+        self.to_string()
+            .split('=')
+            .next()
+            .unwrap_or(".")
+            .to_string()
+    }
+
     /// Returns the complete set of known VCF `INFO` header definitions used by `Info` variants.
     ///
     /// # Example
@@ -1561,6 +1599,14 @@ impl Formats {
         );
         headers.insert(r#"##FORMAT=<ID=hVAF,Number=3,Type=Float,Description="Haplotype specific variant Allele frequency (H0,H1,H2)">"#.to_string());
 
+        headers.insert(
+            r#"##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">"#.to_string(),
+        );
+        headers.insert(
+            r#"##FORMAT=<ID=NAF,Number=1,Type=Float,Description="Normal Allele Frequency">"#
+                .to_string(),
+        );
+
         // headers.insert(
         //     r#"##FORMAT=<ID=Other,Number=.,Type=String,Description="Unspecified FORMAT field">"#
         //         .to_string(),

+ 110 - 2
src/variant/variant_collection.rs

@@ -14,7 +14,7 @@ use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
 
-use super::variant::{AlterationCategory, Info, ReferenceAlternative, VcfVariant};
+use super::variant::{AlterationCategory, Formats, Info, Infos, ReferenceAlternative, VcfVariant};
 use crate::{
     annotation::{
         cosmic::Cosmic,
@@ -29,7 +29,7 @@ use crate::{
         vcf::Vcf,
     },
     helpers::{app_storage_dir, estimate_shannon_entropy, mean, temp_file_path, Hash128},
-    io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header},
+    io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header, writers::get_gz_writer},
     positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
 };
 
@@ -587,6 +587,80 @@ impl Variant {
         callers.sort();
         callers.join(", ")
     }
+
+    /// Merge all `Infos` from the list of `VcfVariant`s.
+    pub fn merge_infos(&self) -> Infos {
+        let mut seen_keys = HashSet::new();
+        let mut merged = Vec::new();
+
+        for vcf in self.vcf_variants.iter() {
+            for info in &vcf.infos.0 {
+                let key = info.key();
+                if seen_keys.insert(key.to_string()) {
+                    merged.push(info.clone());
+                }
+            }
+        }
+
+        Infos(merged)
+    }
+
+    pub fn merge_formats(&self) -> Formats {
+        let mut seen_keys = HashSet::new();
+        let mut merged = Vec::new();
+
+        for vcf in self.vcf_variants.iter() {
+            for format in &vcf.formats.0 {
+                let (key, _) = format.clone().into();
+                if seen_keys.insert(key) {
+                    merged.push(format.clone());
+                }
+            }
+        }
+
+        Formats(merged)
+    }
+
+    /// Writes the merged VCF representation of this `Variant` to the provided writer.
+    ///
+    /// Merges INFO and FORMAT fields from all underlying `VcfVariant`s.
+    /// Assumes a single-sample VCF structure.
+    pub fn write_vcf<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
+        if self.vcf_variants.is_empty() {
+            return Ok(());
+        }
+
+        let vcf = &self.vcf_variants[0];
+        let merged_infos = self.merge_infos();
+        let merged_formats = self.merge_formats();
+
+        let (format_keys, format_values): (String, String) = merged_formats.into();
+        
+        writeln!(
+            writer,
+            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
+            self.position.contig(),
+            self.position.position + 1,
+            if vcf.id.is_empty() { "." } else { &vcf.id },
+            self.reference,
+            self.alternative,
+            vcf.quality
+                .map(|q| format!("{:.2}", q))
+                .unwrap_or_else(|| ".".to_string()),
+            vcf.filter,
+            merged_infos,
+            if format_keys.is_empty() {
+                "."
+            } else {
+                &format_keys
+            },
+            if format_values.is_empty() {
+                "."
+            } else {
+                &format_values
+            },
+        )
+    }
 }
 
 /// A collection of genomic variants.
@@ -953,6 +1027,40 @@ impl Variants {
         let decoded: Self = bitcode::decode(&buffer)?;
         Ok(decoded)
     }
+
+    /// Write the complete VCF to the given output path, using a dict file for contig headers.
+    pub fn write_vcf(&self, output_path: &str, dict_path: &str, force: bool) -> anyhow::Result<()> {
+        let contigs = crate::io::dict::read_dict(dict_path)?;
+
+        let mut writer = get_gz_writer(output_path, force)?;
+
+        // File format and contig headers
+        writeln!(writer, "##fileformat=VCFv4.3")?;
+        for (name, len) in contigs {
+            writeln!(writer, "##contig=<ID={},length={}>", name, len)?;
+        }
+
+        // INFO and FORMAT headers
+        for info_header in Info::header_definitions() {
+            writeln!(writer, "{info_header}")?;
+        }
+        for format_header in Formats::format_headers() {
+            writeln!(writer, "{format_header}")?;
+        }
+
+        // Column header
+        writeln!(
+            writer,
+            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE"
+        )?;
+
+        // Write all variants
+        for variant in &self.data {
+            variant.write_vcf(&mut writer)?;
+        }
+
+        Ok(())
+    }
 }
 
 /// Creates a new Variant instance from a collection of VcfVariants and annotations.