Thomas 9 сар өмнө
parent
commit
e09d58cbdb

+ 14 - 0
Cargo.lock

@@ -2766,6 +2766,17 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
 
+[[package]]
+name = "ordered-float"
+version = "5.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2c1f9f56e534ac6a9b8a4600bdf0f530fb393b5f393e7b4d03489c3cf0c3f01"
+dependencies = [
+ "num-traits",
+ "rand 0.8.5",
+ "serde",
+]
+
 [[package]]
 name = "os_pipe"
 version = "1.2.1"
@@ -2889,6 +2900,7 @@ dependencies = [
  "noodles-fasta 0.48.0",
  "noodles-gff 0.43.0",
  "num-format",
+ "ordered-float",
  "pandora_lib_assembler",
  "pandora_lib_scan",
  "pandora_lib_variants",
@@ -3286,6 +3298,7 @@ dependencies = [
  "libc",
  "rand_chacha 0.3.1",
  "rand_core 0.6.4",
+ "serde",
 ]
 
 [[package]]
@@ -3326,6 +3339,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
 dependencies = [
  "getrandom 0.2.15",
+ "serde",
 ]
 
 [[package]]

+ 1 - 0
Cargo.toml

@@ -42,6 +42,7 @@ itertools = "0.14.0"
 rand = "0.9.0"
 tar = "0.4.43"
 flatbuffers = "25.2.10"
+ordered-float = { version = "5.0.0", features = ["serde"] }
 
 [profile.dev]
 opt-level = 0

+ 1 - 1
src/annotation/mod.rs

@@ -400,7 +400,7 @@ impl Annotations {
         self.store.retain(|key, _| !keys_to_remove.contains(key));
     }
 
-    pub fn solo_constit_boundaries(&self, max_alt_constit: u16, min_constit_depth: u16) {
+    pub fn somatic_constit_boundaries(&self, max_alt_constit: u16, min_constit_depth: u16) {
         self.store
             .iter_mut()
             .filter(|anns| {

+ 22 - 0
src/annotation/vep.rs

@@ -5,6 +5,7 @@ use log::{debug, warn};
 use serde::{Deserialize, Serialize};
 use std::{
     cmp::{Ordering, Reverse},
+    fmt::Display,
     io::{BufRead, BufReader},
     process::{Command, Stdio},
     str::FromStr,
@@ -236,6 +237,21 @@ pub enum VepImpact {
     MODIFIER,
 }
 
+impl Display for VepImpact {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}",
+            match self {
+                VepImpact::HIGH => "HIGH",
+                VepImpact::MODERATE => "MODERATE",
+                VepImpact::LOW => "LOW",
+                VepImpact::MODIFIER => "MODIFIER",
+            }
+        )
+    }
+}
+
 impl From<VepImpact> for String {
     /// Converts a VepImpact enum variant to its string representation.
     fn from(value: VepImpact) -> Self {
@@ -400,6 +416,12 @@ impl From<VepConsequence> for String {
     }
 }
 
+impl Display for VepConsequence {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", String::from(self.clone()))
+    }
+}
+
 impl FromStr for VepConsequence {
     type Err = anyhow::Error;
 

+ 4 - 4
src/config.rs

@@ -52,8 +52,8 @@ pub struct Config {
     pub clairs_platform: String,
     pub clairs_output_dir: String,
     pub mask_bed: String,
-    pub solo_min_constit_depth: u16,
-    pub solo_max_alt_constit: u16,
+    pub somatic_min_constit_depth: u16,
+    pub somatic_max_alt_constit: u16,
     pub min_shannon_entropy: f64,
     pub nanomonsv_bin: String,
     pub nanomonsv_output_dir: String,
@@ -175,8 +175,8 @@ impl Default for Config {
 
             // Pipe
             somatic_pipe_force: true,
-            solo_min_constit_depth: 5,
-            solo_max_alt_constit: 1,
+            somatic_min_constit_depth: 5,
+            somatic_max_alt_constit: 1,
             min_shannon_entropy: 1.0,
         }
     }

+ 59 - 0
src/helpers.rs

@@ -292,6 +292,65 @@ where
     sum.into() / count as f64
 }
 
+pub fn bin_data<V>(data: Vec<V>, bin_size: V) -> Vec<(V, usize)>
+where
+    V: Copy + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
+{
+    // Sort the data
+    let mut sorted_data = data.clone();
+    sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
+
+    // Initialize bins
+    let mut bins: Vec<(V, usize)> = Vec::new();
+    let mut current_bin_start = sorted_data[0];
+    let mut count = 0;
+
+    for &value in &sorted_data {
+        if value < current_bin_start + bin_size {
+            count += 1;
+        } else {
+            bins.push((current_bin_start, count));
+            current_bin_start += bin_size;
+            count = 1;
+        }
+    }
+
+    // Push the last bin
+    bins.push((current_bin_start, count));
+
+    bins
+}
+
+fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> {
+    if data.is_empty() || num_bins == 0 {
+        return vec![];
+    }
+
+    let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize;
+
+    (0..num_bins)
+        .map(|i| {
+            let start = i * bin_size;
+            let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()`
+
+            // If `start` is out of bounds, return (0, 0)
+            if start >= data.len() {
+                return (0, 0);
+            }
+
+            let bin = &data[start..end];
+
+            let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum();
+            let count = bin.len() as u128;
+            let mean_x = (sum_x / count) as u32; // Rounded down to nearest u32
+
+            let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum();
+
+            (mean_x, sum_n)
+        })
+        .collect()
+}
+
 pub fn app_storage_dir() -> anyhow::Result<PathBuf> {
     let app_name = env!("CARGO_PKG_NAME");
     let app_dir = dirs::data_dir()

+ 1 - 1
src/lib.rs

@@ -704,7 +704,7 @@ mod tests {
     #[test]
     fn run_somatic_case() -> anyhow::Result<()> {
         init();
-        let id = "TAHIR";
+        let id = "ADJAGBA";
         let mut config = Config::default();
         config.somatic_pipe_force = true;
         match Somatic::initialize(id, config)?.run() {

+ 8 - 8
src/pipes/somatic.rs

@@ -333,9 +333,9 @@ impl Run for Somatic {
         variants_collections.iter().try_for_each(|c| {
             c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150)
         })?;
-        annotations.solo_constit_boundaries(
-            self.config.solo_max_alt_constit,
-            self.config.solo_min_constit_depth,
+        annotations.somatic_constit_boundaries(
+            self.config.somatic_max_alt_constit,
+            self.config.somatic_min_constit_depth,
         );
         annotations
             .callers_stat(Some(Box::new(|v| {
@@ -353,7 +353,7 @@ impl Run for Somatic {
         // Filter: Remove LowConstitDepth from annotations and variants collections
         info!(
             "Removing variants when depth in constit bam < {}.",
-            self.config.solo_min_constit_depth
+            self.config.somatic_min_constit_depth
         );
         somatic_stats.n_low_constit = annotations
             .retain_variants(&mut variants_collections, |anns| {
@@ -361,13 +361,13 @@ impl Run for Somatic {
             });
         info!(
             "{} variants removed when depth in constit bam < {}.",
-            somatic_stats.n_low_constit, self.config.solo_min_constit_depth
+            somatic_stats.n_low_constit, self.config.somatic_min_constit_depth
         );
 
         // Filter: remove HighConstitAlt
         info!(
             "Removing variants with SNP/indel inside the constit alignements > {}",
-            self.config.solo_max_alt_constit
+            self.config.somatic_max_alt_constit
         );
         somatic_stats.n_high_alt_constit = annotations
             .retain_variants(&mut variants_collections, |anns| {
@@ -375,7 +375,7 @@ impl Run for Somatic {
             });
         info!(
             "{} variants removed with SNP/indel inside the constit alignements > {}",
-            somatic_stats.n_high_alt_constit, self.config.solo_max_alt_constit
+            somatic_stats.n_high_alt_constit, self.config.somatic_max_alt_constit
         );
 
         annotations
@@ -445,7 +445,7 @@ impl Run for Somatic {
 
         info!(
             "{} variants filtered, with constit alt <= max contig alt ({}) and in GnomAD.",
-            somatic_stats.n_high_alt_constit_gnomad, self.config.solo_max_alt_constit
+            somatic_stats.n_high_alt_constit_gnomad, self.config.somatic_max_alt_constit
         );
         // TODO: These stats doesn't capture filter metrics !!!
         annotations

+ 87 - 14
src/variant/variant_collection.rs

@@ -1,5 +1,5 @@
 use std::{
-    collections::{HashMap, HashSet},
+    collections::{BTreeMap, HashMap, HashSet},
     fs::{self, File},
     io::Write,
     path::Path,
@@ -10,6 +10,7 @@ use bgzip::{BGZFReader, BGZFWriter};
 use csv::ReaderBuilder;
 use dashmap::DashMap;
 use log::{debug, error, info, warn};
+use ordered_float::OrderedFloat;
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
@@ -27,7 +28,7 @@ use crate::{
         bam::{counts_at, counts_ins_at},
         vcf::Vcf,
     },
-    helpers::{app_storage_dir, estimate_shannon_entropy, mean, temp_file_path, Hash128},
+    helpers::{app_storage_dir, bin_data, estimate_shannon_entropy, mean, temp_file_path, Hash128},
     io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header},
     positions::GenomePosition,
 };
@@ -742,19 +743,83 @@ impl Variants {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct VariantsStats {
     pub n: u32,
+    #[serde(serialize_with = "serialize_dashmap_sort")]
     pub alteration_categories: DashMap<String, u32>,
+    #[serde(serialize_with = "serialize_dashmap_sort")]
     pub cosmic: DashMap<u64, u32>,
-    pub gnomad: DashMap<String, Vec<f64>>,
+    #[serde(serialize_with = "serialize_dashmap_sort")]
+    pub n_alts: DashMap<u32, u32>,
+    #[serde(serialize_with = "serialize_dashmap_sort")]
+    pub depths: DashMap<u32, u32>,
+    #[serde(serialize_with = "serialize_dashmap_sort")]
+    pub vafs: DashMap<OrderedFloat<f32>, u32>,
+    #[serde(serialize_with = "serialize_dashmap_sort")]
+    pub vep_impact: DashMap<String, u32>,
+    #[serde(serialize_with = "serialize_dashmap_sort")]
+    pub consequences: DashMap<String, u32>,
+    #[serde(serialize_with = "serialize_dashmap_sort")]
+    pub genes: DashMap<String, u32>,
+    pub n_gnomad: usize,
+    pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
+}
+
+use serde::Serializer;
+
+pub fn serialize_dashmap_sort<S, T>(
+    data: &DashMap<T, u32>,
+    serializer: S,
+) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+    T: Serialize + Ord + std::hash::Hash + Clone,
+{
+    let ordered: BTreeMap<_, _> = data
+        .iter()
+        .map(|entry| (entry.key().clone(), *entry.value()))
+        .collect();
+
+    ordered.serialize(serializer)
 }
 
 impl VariantsStats {
     pub fn new(variants: &Variants) -> Self {
         let n = variants.data.len() as u32;
         let alteration_categories: DashMap<String, u32> = DashMap::new();
+        let vep_impact: DashMap<String, u32> = DashMap::new();
+        let genes: DashMap<String, u32> = DashMap::new();
+        let consequences: DashMap<String, u32> = DashMap::new();
+        let n_alts: DashMap<u32, u32> = DashMap::new();
+        let depths: DashMap<u32, u32> = DashMap::new();
+        let vafs: DashMap<OrderedFloat<f32>, u32> = DashMap::new();
         let cosmic: DashMap<u64, u32> = DashMap::new();
-        let gnomad: DashMap<String, Vec<f64>> = DashMap::new();
+        let gnomads: DashMap<String, Vec<f64>> = DashMap::new();
 
         variants.data.par_iter().for_each(|v| {
+            if let Ok(best_vep) = v.best_vep() {
+                if let Some(impact) = best_vep.extra.impact {
+                    *vep_impact.entry(impact.to_string()).or_default() += 1;
+                }
+                if let Some(gene) = best_vep.extra.symbol {
+                    *genes.entry(gene).or_default() += 1;
+                }
+                if let Some(csqs) = best_vep.consequence {
+                    let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
+                    csq.sort();
+                    csq.dedup();
+                    *consequences.entry(csq.join(", ")).or_default() += 1;
+                }
+            }
+
+            let (n_alt, depth) = v.n_alt_depth();
+            *n_alts.entry(n_alt as u32).or_default() += 1;
+            *depths.entry(depth as u32).or_default() += 1;
+
+            
+            let vaf = OrderedFloat::from(((n_alt * 1_000.0 / depth).round() / 10.0) as f32);
+            if !vaf.is_nan() {
+                *vafs.entry(vaf).or_default() += 1;
+            }
+
             v.annotations.iter().for_each(|annotation| {
                 match annotation {
                     Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
@@ -763,7 +828,7 @@ impl VariantsStats {
                             .iter()
                             .map(|e| e.to_string_value_pair())
                             .for_each(|(key, value)| {
-                                gnomad.entry(key).or_default().push(value);
+                                gnomads.entry(key).or_default().push(value);
                             });
                     }
                     _ => (),
@@ -781,30 +846,40 @@ impl VariantsStats {
                 .entry(alteration_category_str.join(", "))
                 .or_default() += 1;
         });
-
-        gnomad.iter().for_each(|e| {
-            println!("{}\t{}", e.key(), mean(e.value()));
-        });
+        
+        let mut n_gnomad = 0;
+        let gnomad = gnomads.iter().map(|e| {
+            let data = e.value().to_vec();
+            n_gnomad = data.len();
+            (e.key().to_string(), bin_data(data, 0.0001))
+        }).collect();
 
         Self {
             n,
             alteration_categories,
             cosmic,
             gnomad,
+            vafs,
+            n_alts,
+            depths,
+            vep_impact,
+            consequences,
+            genes,
+            n_gnomad,
         }
     }
 
     pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
         let file = File::create(filename)
-            .with_context(|| format!("Failed to create file: {}", filename))?;
+            .map_err(|e| anyhow::anyhow!("Failed to create file: {}\n{e}", filename))?;
         let mut writer = BGZFWriter::new(file, bgzip::Compression::default());
 
         serde_json::to_writer(&mut writer, self)
-            .with_context(|| format!("Failed to serialize JSON to file: {}", filename))?;
+            .map_err(|e| anyhow::anyhow!("Failed to serialize JSON to file: {}\n{e}", filename))?;
 
         writer
             .close()
-            .with_context(|| format!("Failed to close BGZF writer for file: {}", filename))?;
+            .map_err(|e| anyhow::anyhow!("Failed to close BGZF writer for file: {}\n{e}", filename))?;
 
         debug!("Successfully saved variants to {}", filename);
         Ok(())
@@ -1139,8 +1214,6 @@ impl ExternalAnnotation {
         let (sv, unfound): (Vec<VcfVariant>, Vec<VcfVariant>) =
             unfound.into_iter().partition(|v| v.has_svtype());
 
-        warn!("SV {}", sv.len());
-
         let min_chunk_size = 1000;
         let max_chunks = 150;