Thomas vor 10 Monaten
Ursprung
Commit
fde4568495
6 geänderte Dateien mit 122 neuen und 62 gelöschten Zeilen
  1. 94 47
      src/annotation/mod.rs
  2. 11 0
      src/annotation/vep.rs
  3. 1 0
      src/collection/bam.rs
  4. 12 12
      src/lib.rs
  5. 2 1
      src/pipes/somatic.rs
  6. 2 2
      src/variant/variant_collection.rs

+ 94 - 47
src/annotation/mod.rs

@@ -5,7 +5,15 @@ pub mod ncbi;
 pub mod vep;
 
 use std::{
-    collections::{HashMap, HashSet}, fmt, fs::File, io::{Read, Write}, str::FromStr, sync::Arc
+    collections::{HashMap, HashSet},
+    fmt,
+    fs::File,
+    io::{Read, Write},
+    str::FromStr,
+    sync::{
+        atomic::{AtomicU32, Ordering},
+        Arc,
+    },
 };
 
 use crate::{
@@ -253,7 +261,7 @@ impl Annotations {
                     .push(v_num);
             }
         });
-        
+
         println!("\nCallers stats:");
         println!("\tn categories: {}", map.len());
         let mut n = 0;
@@ -284,45 +292,60 @@ impl Annotations {
         }
     }
 
-    pub fn vep_stats(&self) -> anyhow::Result<()> {
-        let (n_coding, n_syn, total) = self
-            .store
-            .par_iter()
-            .map(|v| {
-                v.value()
-                    .iter()
-                    .find_map(|ann| {
-                        if let Annotation::VEP(veps) = ann {
-                            get_best_vep(veps).ok()
-                        } else {
-                            None
-                        }
-                    })
-                    .and_then(|vep| vep.consequence.clone())
-                    .map_or((0, 0, 0), |consequences| {
-                        let impact = consequences
-                            .iter()
-                            .map(VepImpact::from)
-                            .min()
-                            .unwrap_or(VepImpact::MODIFIER);
-                        match impact {
-                            VepImpact::HIGH | VepImpact::MODERATE => (1, 0, 1),
-                            _ => {
-                                if consequences.contains(&VepConsequence::SynonymousVariant) {
-                                    (0, 1, 1)
-                                } else {
-                                    (0, 0, 1)
-                                }
-                            }
-                        }
-                    })
-            })
-            .reduce(|| (0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2));
+    pub fn vep_stats(&self) -> anyhow::Result<VepStats> {
+        let genes: DashMap<String, u32> = DashMap::new();
+        let genes_distant: DashMap<String, u32> = DashMap::new();
+        let features: DashMap<String, u32> = DashMap::new();
+        let consequences: DashMap<String, u32> = DashMap::new();
+        let impact: DashMap<String, u32> = DashMap::new();
+        let n_all = Arc::new(AtomicU32::new(0));
+        let n_vep = Arc::new(AtomicU32::new(0));
 
-        println!(
-            "VEP annotations\n\t- Coding: {n_coding}\n\t- Synonymous: {n_syn}\nTotal: {total}"
-        );
-        Ok(())
+        self.store.par_iter().for_each(|e| {
+            n_all.fetch_add(1, Ordering::SeqCst);
+
+            let best_vep_opt = e.value().iter().find_map(|a| {
+                if let Annotation::VEP(veps) = a {
+                    get_best_vep(veps).ok()
+                } else {
+                    None
+                }
+            });
+
+            if let Some(best_vep) = best_vep_opt {
+                n_vep.fetch_add(1, Ordering::SeqCst);
+                if let Some(gene) = best_vep.gene {
+                    if best_vep.extra.distance.is_some() {
+                        *genes_distant.entry(gene).or_default() += 1;
+                    } else {
+                        *genes.entry(gene).or_default() += 1;
+                    }
+                }
+                if let Some(feature) = best_vep.feature {
+                    *features.entry(feature).or_default() += 1;
+                }
+                if let Some(cs) = best_vep.consequence {
+                    let mut cs = cs.into_iter().map(String::from).collect::<Vec<String>>();
+                    cs.sort();
+                    *consequences.entry(cs.join(" ")).or_default() += 1;
+                }
+                if let Some(imp) = best_vep.extra.impact {
+                    *impact.entry(String::from(imp)).or_default() += 1;
+                }
+            }
+        });
+
+        let vep_stats = VepStats {
+            genes: genes.into_iter().collect(),
+            genes_distant: genes_distant.into_iter().collect(),
+            features: features.into_iter().collect(),
+            consequences: consequences.into_iter().collect(),
+            impact: impact.into_iter().collect(),
+            n_all: n_all.load(Ordering::SeqCst),
+            n_vep: n_vep.load(Ordering::SeqCst),
+        };
+
+        Ok(vep_stats)
     }
 
     pub fn get_keys_filter(
@@ -358,12 +381,7 @@ impl Annotations {
                 let before = c.variants.len();
                 c.variants.retain(|a| keys.contains(&a.hash()));
                 let after = c.variants.len();
-                info!(
-                    "\t- {}\t{}/{}",
-                    c.caller,
-                    before - after,
-                    before
-                );
+                info!("\t- {}\t{}/{}", c.caller, before - after, before);
                 before - after
             })
             .sum();
@@ -468,13 +486,42 @@ impl Annotations {
                     && !anns
                         .iter()
                         .any(|a| matches!(a, Annotation::Callers(_, Sample::Somatic)))
-            }) && !anns.contains(&Annotation::LowEntropy) {
+            }) && !anns.contains(&Annotation::LowEntropy)
+            {
                 anns.push(Annotation::LowEntropy);
             }
         });
     }
 }
 
+#[derive(Debug, Serialize, Deserialize)]
+pub struct VepStats {
+    pub genes: Vec<(String, u32)>,
+    pub genes_distant: Vec<(String, u32)>,
+    pub features: Vec<(String, u32)>,
+    pub consequences: Vec<(String, u32)>,
+    pub impact: Vec<(String, u32)>,
+    pub n_all: u32,
+    pub n_vep: u32,
+}
+
+impl VepStats {
+    pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> {
+        let json = serde_json::to_string_pretty(self)?;
+        let mut file = File::create(file_path)?;
+        file.write_all(json.as_bytes())?;
+        Ok(())
+    }
+
+    pub fn load_from_json(file_path: &str) -> anyhow::Result<Self> {
+        let mut file = File::open(file_path)?;
+        let mut contents = String::new();
+        file.read_to_string(&mut contents)?;
+        let stats: Self = serde_json::from_str(&contents)?;
+        Ok(stats)
+    }
+}
+
 pub trait CallerCat {
     fn caller_cat(&self) -> Annotation;
 }

+ 11 - 0
src/annotation/vep.rs

@@ -126,6 +126,17 @@ pub enum VepImpact {
     MODIFIER,
 }
 
+impl From<VepImpact> for String {
+    fn from(value: VepImpact) -> Self {
+        match value {
+            VepImpact::HIGH => "High",
+            VepImpact::MODERATE => "Moderate",
+            VepImpact::LOW => "Low",
+            VepImpact::MODIFIER => "Modifier",
+        }.to_string()
+    }
+}
+
 impl PartialOrd for VepImpact {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))

+ 1 - 0
src/collection/bam.rs

@@ -608,6 +608,7 @@ impl WGSBamStats {
         let mut by_lengths: Vec<(u128, u32)> =
             by_lengths.iter().map(|e| (*e.key(), *e.value())).collect();
         by_lengths.par_sort_by(|a, b| a.0.cmp(&b.0));
+        // This is not n50
         let n50 = by_lengths[by_lengths.len() / 2].0;
 
         let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid

+ 12 - 12
src/lib.rs

@@ -666,18 +666,18 @@ mod tests {
         Ok(())
     }
 
-    // #[test]
-    // fn run_somatic() -> anyhow::Result<()> {
-    //     init();
-    //     let id = "ADJAGBA";
-    //     let mut config = Config::default();
-    //     config.somatic_pipe_force = true;
-    //     match Somatic::initialize(id, config)?.run() {
-    //         Ok(_) => (),
-    //         Err(e) => error!("{id} {e}"),
-    //     };
-    //     Ok(())
-    // }
+    #[test]
+    fn run_somatic_case() -> anyhow::Result<()> {
+        init();
+        let id = "ADJAGBA";
+        let mut config = Config::default();
+        config.somatic_pipe_force = true;
+        match Somatic::initialize(id, config)?.run() {
+            Ok(_) => (),
+            Err(e) => error!("{id} {e}"),
+        };
+        Ok(())
+    }
     
     #[test]
     fn load_variants() -> anyhow::Result<()> {

+ 2 - 1
src/pipes/somatic.rs

@@ -504,7 +504,8 @@ impl Run for Somatic {
             },
         );
 
-        annotations.vep_stats()?;
+        let vep_stats = annotations.vep_stats()?;
+        vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?;
 
         info!("Final unique variants: {}", variants.data.len());
         variants.save_to_json(&result_json)?;

+ 2 - 2
src/variant/variant_collection.rs

@@ -676,7 +676,7 @@ impl ExternalAnnotation {
                         lines.entry(key).or_default().push(line);
                     }
 
-                    // fs::remove_file(&in_tmp).context(format!("Can't remove file {in_tmp}"))?;
+                    fs::remove_file(&in_tmp).context(format!("Can't remove file {in_tmp}"))?;
 
                     let mut n_not_vep = 0;
                     let mut chunk_results: Vec<(Hash128, Vec<VEP>)> = Vec::new();
@@ -698,7 +698,7 @@ impl ExternalAnnotation {
                             n_not_vep += 1;
                         }
                     });
-                    // fs::remove_file(&out_vep).context(format!("Can't remove file {out_vep}"))?;
+                    fs::remove_file(&out_vep).context(format!("Can't remove file {out_vep}"))?;
 
                     if n_not_vep > 0 {
                         debug!("{n_not_vep} variants not annotated by VEP.");