Преглед изворног кода

variants merging and json save

Thomas пре 1 година
родитељ
комит
c95d7f33a0
4 измењених фајлова са 143 додато и 15 уклоњено
  1. 4 4
      src/annotation/mod.rs
  2. 14 7
      src/pipes/somatic.rs
  3. 1 1
      src/positions.rs
  4. 124 3
      src/variant/variant_collection.rs

+ 4 - 4
src/annotation/mod.rs

@@ -20,7 +20,7 @@ use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use vep::{get_best_vep, VepConsequence, VepImpact, VEP};
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub enum Annotation {
     Callers(Caller, Sample),
     AlterationCategory(AlterationCategory),
@@ -35,7 +35,7 @@ pub enum Annotation {
     VEP(Vec<VEP>),
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub enum Sample {
     SoloTumor,
     SoloConstit,
@@ -114,7 +114,7 @@ impl FromStr for Annotation {
     }
 }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub enum Caller {
     DeepVariant,
     ClairS,
@@ -156,7 +156,7 @@ impl FromStr for Caller {
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Annotations {
     pub store: DashMap<Hash128, Vec<Annotation>, Blake3BuildHasher>,
 }

+ 14 - 7
src/pipes/somatic.rs

@@ -20,7 +20,7 @@ use crate::{
     runners::Run,
     variant::{
         variant::{load_variants, CallerBox},
-        variant_collection::{ExternalAnnotation, VariantCollection},
+        variant_collection::{ExternalAnnotation, VariantCollection, Variants},
     },
 };
 
@@ -250,13 +250,9 @@ impl Run for Somatic {
         info!("Initialization of callers...");
 
         let mut callers = init_somatic_callers!(
-            &id,
-            &config,
-            ClairS,
-            NanomonSV,
-            // Severus,
+            &id, &config, ClairS, NanomonSV, // Severus,
             Savana,
-            DeepSomatic
+            // DeepSomatic
         );
 
         callers.extend(init_solo_callers!(
@@ -481,6 +477,17 @@ impl Run for Somatic {
 
         annotations.vep_stats()?;
 
+        let variants = variants_collections.into_iter().fold(
+            Variants::default(),
+            |mut acc, variants_collection| {
+                acc.merge(variants_collection, &annotations);
+                acc
+            },
+        );
+
+        info!("Final unique variants: {}", variants.data.len());
+        variants.save_to_json(&format!("{}/somatic_variants.json.gz", config.tumoral_dir(&id)))?;
+
         Ok(())
     }
 }

+ 1 - 1
src/positions.rs

@@ -5,7 +5,7 @@ use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 
 // 0-based
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash, Default)]
 pub struct GenomePosition {
     pub contig: u8,
     pub position: u32,

+ 124 - 3
src/variant/variant_collection.rs

@@ -1,16 +1,18 @@
 use std::{
     collections::{HashMap, HashSet},
     fs::{self, File},
-    io::Write,
+    io::{Read, Write},
 };
 
 use anyhow::Context;
+use bgzip::{BGZFReader, BGZFWriter};
 use csv::ReaderBuilder;
 use log::{debug, info, warn};
 use rayon::prelude::*;
+use serde::{Deserialize, Serialize};
 use uuid::Uuid;
 
-use super::variant::{AlterationCategory, VcfVariant};
+use super::variant::{AlterationCategory, ReferenceAlternative, VcfVariant};
 use crate::{
     annotation::{
         cosmic::Cosmic,
@@ -26,6 +28,7 @@ use crate::{
     helpers::{app_storage_dir, estimate_shannon_entropy, temp_file_path, Hash128},
     io::{readers::get_reader, vcf::vcf_header},
     pipes::somatic::sequence_at,
+    positions::GenomePosition,
 };
 
 #[derive(Debug, Clone)]
@@ -184,8 +187,126 @@ impl VariantCollection {
             self.variants.len()
         );
     }
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Variant {
+    pub hash: Hash128,
+    pub position: GenomePosition,
+    pub reference: ReferenceAlternative,
+    pub alternative: ReferenceAlternative,
+    pub vcf_variants: Vec<VcfVariant>,
+    pub annotations: Vec<Annotation>,
+}
+
+impl PartialEq for Variant {
+    fn eq(&self, other: &Self) -> bool {
+        self.position == other.position
+            && self.reference == other.reference
+            && self.alternative == other.alternative
+    }
+}
+
+#[derive(Debug, Default, Serialize, Deserialize)]
+pub struct Variants {
+    pub data: Vec<Variant>,
+}
+
+impl Variants {
+    pub fn sort(&mut self) {
+        self.data
+            .sort_unstable_by(|a, b| a.position.cmp(&b.position));
+    }
+
+    pub fn merge(&mut self, others: VariantCollection, annotations: &Annotations) {
+        let mut result = Vec::new();
+        let mut n_merged = 0;
+
+        let mut self_iter = self.data.drain(..).peekable(); // Iterator for self.data
+        let mut others_iter = others.variants.into_iter().peekable(); // Iterator for others.variants
+
+        // Merge using two-pointer technique
+        while let (Some(self_variant), Some(other_variant)) = (self_iter.peek(), others_iter.peek())
+        {
+            match self_variant.position.cmp(&other_variant.position) {
+                std::cmp::Ordering::Less => {
+                    result.push(self_iter.next().unwrap());
+                }
+                std::cmp::Ordering::Greater => {
+                    result.push(create_variant(
+                        vec![others_iter.next().unwrap()],
+                        annotations,
+                    ));
+                }
+                std::cmp::Ordering::Equal => {
+                    match (
+                        self_variant.reference == other_variant.reference,
+                        self_variant.alternative == other_variant.alternative,
+                    ) {
+                        (true, true) => {
+                            let mut merged_variant = self_iter.next().unwrap();
+
+                            merged_variant
+                                .vcf_variants
+                                .push(others_iter.next().unwrap());
+                            n_merged += 1;
+                            result.push(merged_variant);
+                        }
+                        _ => {
+                            result.push(self_iter.next().unwrap());
+                            result.push(create_variant(
+                                vec![others_iter.next().unwrap()],
+                                annotations,
+                            ));
+                        }
+                    }
+                }
+            }
+        }
+
+        // Drain remaining elements from iterators
+        result.extend(self_iter);
+        result.extend(others_iter.map(|v| create_variant(vec![v], annotations)));
+
+        info!("n merged: {}", n_merged);
+        self.data = result;
+    }
 
-    pub fn external_annotation() {}
+    pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
+        let json = serde_json::to_string(self)?;
+        let file = File::create(filename)?;
+        let mut writer = BGZFWriter::new(file, bgzip::Compression::default());
+        writer.write_all(json.as_bytes())?;
+        writer.close()?;
+        Ok(())
+    }
+
+    pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
+        let file = File::open(filename)?;
+        let mut reader = BGZFReader::new(file)?;
+        let mut json = String::new();
+        reader.read_to_string(&mut json)?;
+        let variants: Variants = serde_json::from_str(&json)?;
+        Ok(variants)
+    }
+
+}
+
+fn create_variant(vcf_variants: Vec<VcfVariant>, annotations: &Annotations) -> Variant {
+    let first = &vcf_variants[0];
+    let annotations = annotations
+        .store
+        .get(&first.hash)
+        .map(|v| v.value().to_vec())
+        .unwrap_or_default();
+    Variant {
+        hash: first.hash,
+        position: first.position.clone(),
+        reference: first.reference.clone(),
+        alternative: first.alternative.clone(),
+        vcf_variants,
+        annotations,
+    }
 }
 
 pub enum ExtAnnotationSource {