ソースを参照

sankey charming

Thomas 1 年間 前
コミット
4c2b6e5fa5
10 ファイル変更462 行追加301 行削除
  1. 204 280
      Cargo.lock
  2. 2 1
      Cargo.toml
  3. BIN
      prob.vcf.gz
  4. 64 4
      src/annotation/mod.rs
  5. 1 1
      src/callers/clairs.rs
  6. 18 7
      src/callers/mod.rs
  7. 2 2
      src/collection/bam.rs
  8. 1 1
      src/lib.rs
  9. 166 5
      src/pipes/somatic.rs
  10. 4 0
      src/variant/variant_collection.rs

ファイルの差分が大きいため隠しています
+ 204 - 280
Cargo.lock


+ 2 - 1
Cargo.toml

@@ -41,10 +41,11 @@ features = "0.10.0"
 full = "0.3.0"
 rust-htslib = "0.49.0"
 podders = "0.1.4"
-arrow = "53.3.0"
+arrow = "54.0.0"
 bgzip = "0.3.1"
 tempfile = "3.14.0"
 dashmap = { version = "6.1.0", features = ["rayon"] }
 noodles-fasta = "0.46.0"
 noodles-core = "0.15.0"
 blake3 = "1.5.5"
+charming = { version = "0.4.0", features = ["ssr"] }

BIN
prob.vcf.gz


+ 64 - 4
src/annotation/mod.rs

@@ -1,6 +1,5 @@
 use std::{
-    collections::{HashMap, HashSet},
-    fmt,
+    collections::{HashMap, HashSet}, fmt, str::FromStr, sync::Arc
 };
 
 use crate::helpers::mean;
@@ -39,6 +38,27 @@ impl fmt::Display for Annotation {
     }
 }
 
+impl FromStr for Annotation {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> anyhow::Result<Self> {
+        match s {
+            "SoloDiag" => Ok(Annotation::SoloDiag),
+            "SoloConstit" => Ok(Annotation::SoloConstit),
+            "DeepVariant" => Ok(Annotation::Callers(Caller::DeepVariant)),
+            "ClairS" => Ok(Annotation::Callers(Caller::ClairS)),
+            "Germline" => Ok(Annotation::Germline),
+            "Somatic" => Ok(Annotation::Somatic),
+            s if s.starts_with("ShannonEntropy") => Ok(Annotation::ShannonEntropy(0.0)),
+            s if s.starts_with("ConstitDepth") => Ok(Annotation::ConstitDepth(0)),
+            s if s.starts_with("ConstitAlt") => Ok(Annotation::ConstitAlt(0)),
+            "LowConstitDepth" => Ok(Annotation::LowConstitDepth),
+            "HighConstitAlt" => Ok(Annotation::HighConstitAlt),
+            _ => Err(anyhow::anyhow!("Unknown Annotation: {}", s)),
+        }
+    }
+}
+
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum Caller {
     DeepVariant,
@@ -59,6 +79,12 @@ pub struct Annotations {
     pub store: DashMap<u128, Vec<Annotation>>,
 }
 
+#[derive(Debug, Default, Clone)]
+pub struct AnnotationsStats {
+    pub categorical: DashMap<String, u64>,
+    pub numeric: DashMap<String, HashMap<String, Vec<f64>>>,
+}
+
 impl Annotations {
     pub fn insert_update(&self, key: u128, add: &[Annotation]) {
         self.store
@@ -67,7 +93,7 @@ impl Annotations {
             .extend(add.iter().cloned())
     }
 
-    pub fn callers_stat(&self) {
+    pub fn callers_stat(&self) -> AnnotationsStats {
         let map: DashMap<String, u64> = DashMap::new();
         let num_maps: DashMap<String, HashMap<String, Vec<f64>>> = DashMap::new();
 
@@ -126,6 +152,11 @@ impl Annotations {
         });
 
         println!("Total\t{n}");
+
+        AnnotationsStats {
+            categorical: map,
+            numeric: num_maps,
+        }
     }
 
     pub fn get_keys_filter(
@@ -165,10 +196,39 @@ impl Annotations {
                         if *v > max_alt_constit {
                             to_add.push(Annotation::HighConstitAlt);
                         }
-                    },
+                    }
                     _ => (),
                 });
                 v.extend(to_add);
             });
     }
+
+    pub fn count_annotations(&self, annotation_types: Vec<Annotation>) -> Vec<usize> {
+        let annotation_types = Arc::new(annotation_types);
+
+        self.store
+            .par_iter()
+            .fold(
+                || vec![0; annotation_types.len()],
+                |mut counts, r| {
+                    let annotations = r.value();
+                    for (index, annotation_type) in annotation_types.iter().enumerate() {
+                        counts[index] += annotations
+                            .iter()
+                            .filter(|a| *a == annotation_type)
+                            .count();
+                    }
+                    counts
+                },
+            )
+            .reduce(
+                || vec![0; annotation_types.len()],
+                |mut a, b| {
+                    for i in 0..a.len() {
+                        a[i] += b[i];
+                    }
+                    a
+                },
+            )
+    }
 }

+ 1 - 1
src/callers/clairs.rs

@@ -216,6 +216,7 @@ impl Variants for ClairS {
         })
     }
 }
+
 impl ClairS {
     pub fn germline(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
         let add = vec![Annotation::Callers(Caller::ClairS), Annotation::Germline];
@@ -232,4 +233,3 @@ impl ClairS {
 }
 
 impl RunnerVariants for ClairS {}
-

+ 18 - 7
src/callers/mod.rs

@@ -1,5 +1,6 @@
-// use deep_variant::DeepVariant;
-// use nanomonsv::NanomonSV;
+use deep_variant::DeepVariant;
+use nanomonsv::NanomonSV;
+use clairs::ClairS;
 
 pub mod clairs;
 pub mod deep_variant;
@@ -7,9 +8,19 @@ pub mod nanomonsv;
 pub mod savana;
 pub mod severus;
 
-// #[derive(Debug)]
-// pub enum CallersSolo {
-//     DeepVariant(DeepVariant),
-//     NanomonSV(NanomonSV),
-// }
+#[derive(Debug)]
+pub enum CallersSolo {
+    DeepVariant(DeepVariant),
+    NanomonSV(NanomonSV),
+}
 
+#[derive(Debug)]
+pub enum CallersSomatic {
+    ClairS(ClairS)
+}
+
+#[derive(Debug)]
+pub enum Callers {
+    CallersSomatic,
+    CallersSolo,
+}

+ 2 - 2
src/collection/bam.rs

@@ -8,7 +8,7 @@ use std::{
 use anyhow::{anyhow, Context};
 use chrono::{DateTime, Utc};
 use glob::glob;
-use log::{debug, warn};
+use log::warn;
 use pandora_lib_bindings::{
     progs::cramino::{Cramino, CraminoRes},
     utils::RunBin,
@@ -390,7 +390,7 @@ pub fn ins_at(
     //     String::from_utf8(record.qname().to_vec()).unwrap()
     // );
 
-    for (id, op) in cigar.iter().enumerate() {
+    for op in cigar.iter() {
         let (add_read, add_ref) = match *op {
             Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
             Cigar::Ins(len) => (len, 0),

+ 1 - 1
src/lib.rs

@@ -554,7 +554,7 @@ mod tests {
         let c = Config::default();
         let chr = "chr1";
         let position = 52232; // 1-based like in vcf
-        let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "diag"))?;
+        let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "mrd"))?;
         // let p = ins_pileup(&mut bam, chr, position - 1, true)?.iter().map(|e| String::from_utf8(vec![*e]).unwrap()).collect::<Vec<_>>();
         let counts = counts_ins_at(&mut bam, chr, position -1)?;
         for (key, value) in &counts {

+ 166 - 5
src/pipes/somatic.rs

@@ -1,15 +1,24 @@
+use charming::{
+    element::{Emphasis, EmphasisFocus},
+    series::{Sankey, SankeyLink, SankeyNode},
+    Chart,
+};
+use hashbrown::HashMap;
 use log::info;
 use rayon::prelude::*;
 use std::{collections::HashSet, fs::File, sync::Arc};
 
 use crate::{
-    annotation::{Annotation, Annotations},
+    annotation::{Annotation, Annotations, AnnotationsStats, Caller},
     callers::{clairs::ClairS, deep_variant::DeepVariant},
     collection::{Initialize, InitializeSolo},
     config::Config,
     io::vcf::write_vcf,
     runners::Run,
-    variant::variant::{load_variants, parallel_intersection, RunnerVariants, VcfVariant},
+    variant::{
+        variant::{load_variants, parallel_intersection, RunnerVariants, VcfVariant},
+        variant_collection::VariantCollection,
+    },
 };
 
 pub struct Somatic {
@@ -29,10 +38,157 @@ impl Initialize for Somatic {
     }
 }
 
+#[derive(Debug, Default, Clone)]
+struct SomaticStats {
+    initial: HashMap<String, usize>,
+    annotations_stats: Vec<AnnotationsStats>,
+}
+
+impl SomaticStats {
+    pub fn init(collections: &[VariantCollection]) -> Self {
+        let mut initial = HashMap::new();
+
+        for collection in collections.iter() {
+            let name = format!("{}_{}", collection.vcf.caller, collection.vcf.time);
+            initial.insert(name, collection.variants.len());
+        }
+
+        Self {
+            initial,
+            annotations_stats: Default::default(),
+        }
+    }
+
+    pub fn push_annotations_stats(&mut self, annotations_stats: AnnotationsStats) {
+        self.annotations_stats.push(annotations_stats);
+    }
+
+    pub fn aggregation(&mut self) -> anyhow::Result<()> {
+        let annotations_stats = self
+            .annotations_stats
+            .get(0)
+            .ok_or(anyhow::anyhow!("Can't find stats"))?;
+        let step_cat = vec![
+            (Annotation::Germline, 0),
+            (Annotation::Somatic, 0),
+            (Annotation::SoloDiag, 0),
+            (Annotation::SoloConstit, 0),
+        ];
+
+        let mut callers_cats: Vec<((Annotation, Annotation), Vec<(Annotation, u64)>)> = self
+            .initial
+            .keys()
+            .map(|s| (to_callers_cat(s), step_cat.clone()))
+            .collect();
+
+        let mut node_names = Vec::new();
+
+        node_names.extend(step_cat.clone().into_iter().map(|(cat, _)| cat.to_string()));
+
+        let stats: anyhow::Result<Vec<()>> = annotations_stats
+            .categorical
+            .iter()
+            .map(|e| {
+                let v = e.value();
+                let keys: Vec<&str> = e.key().split(" + ").collect();
+                let k_a: Vec<Annotation> = keys
+                    .into_iter()
+                    .map(|e| e.parse())
+                    .collect::<anyhow::Result<_>>()?;
+
+                for ((caller, cat), counts) in callers_cats.iter_mut() {
+                    node_names.push(format!("{} {}", caller, cat));
+                    if k_a.contains(caller) && k_a.contains(cat) {
+                        for (c_annot, value) in counts.iter_mut() {
+                            if k_a.contains(c_annot) {
+                                *value += v;
+                            }
+                        }
+                    }
+                }
+
+                Ok(())
+            })
+            .collect();
+        stats?;
+
+        println!("{callers_cats:#?}");
+        let mut links: Vec<(String, String, f64)> = callers_cats
+            .iter()
+            .flat_map(|((caller, cat), counts)| {
+                let from = format!("{} {}", caller, cat);
+                counts
+                    .iter()
+                    .map(move |(annot, count)| (from.clone(), annot.to_string(), *count as f64))
+            })
+            .collect();
+        links.sort_by(|a, b| {
+            a.2.partial_cmp(&b.2)
+                .unwrap()
+                .then(a.0.cmp(&b.0))
+                .then(a.1.cmp(&b.1))
+        });
+
+        links.dedup();
+
+        node_names.sort();
+        node_names.dedup();
+
+        println!("{links:?}");
+        println!("{node_names:?}");
+
+        let chart = Chart::new().series(
+            Sankey::new()
+                .emphasis(Emphasis::new().focus(EmphasisFocus::Adjacency))
+                // .data(vec!["a", "b", "a1", "a2", "b1", "c"])
+                .data(node_names)
+                .links(links), // .links(vec![
+                               //     ("a", "a1", 5),
+                               //     ("a", "a2", 3),
+                               //     ("b", "b1", 8),
+                               //     ("a", "b1", 3),
+                               //     ("b1", "a1", 1),
+                               //     ("b1", "c", 2),
+                               // ]),
+                               // .data(node_names)
+                               // .links(vec![("ClairS Somatic", "Germline", 10)])
+                               //     ,
+                               // ),
+        );
+
+        let mut renderer = charming::ImageRenderer::new(1000, 800);
+        renderer.save(&chart, "/data/sankey.svg")?;
+
+        Ok(())
+    }
+}
+
+pub fn to_callers_cat(s: &str) -> (Annotation, Annotation) {
+    let splits: Vec<&str> = s.split("_").collect();
+
+    if splits.contains(&"clairs") {
+        (Annotation::Callers(Caller::ClairS), Annotation::Somatic)
+    } else if splits.contains(&"clair3-germline") {
+        (Annotation::Callers(Caller::ClairS), Annotation::Germline)
+    } else if splits.contains(&"DeepVariant") && splits.contains(&"mrd") {
+        (
+            Annotation::Callers(Caller::DeepVariant),
+            Annotation::SoloConstit,
+        )
+    } else if splits.contains(&"DeepVariant") && splits.contains(&"diag") {
+        (
+            Annotation::Callers(Caller::DeepVariant),
+            Annotation::SoloDiag,
+        )
+    } else {
+        panic!("unknown caller: {s}");
+    }
+}
+
 impl Run for Somatic {
     fn run(&mut self) -> anyhow::Result<()> {
-        info!("Running somatic pipe for {}.", self.id);
         let id = self.id.clone();
+        info!("Running somatic pipe for {id}.");
 
         info!("Initialization...");
         let mut v: Vec<Box<dyn RunnerVariants + Send + Sync>> = vec![
@@ -49,9 +205,14 @@ impl Run for Somatic {
         variants_collection.push(clairs_germline);
         info!("Variants sources loaded: {}", variants_collection.len());
 
+        let mut somatic_stats = SomaticStats::init(&variants_collection);
+
         // Annotations Stats
-        let mut annotations = Arc::unwrap_or_clone(annotations);
-        annotations.callers_stat();
+        // let mut annotations = Arc::unwrap_or_clone(annotations);
+        somatic_stats.push_annotations_stats(annotations.callers_stat());
+        somatic_stats.aggregation()?;
+
+        return Ok(());
         // TODO: look at variants: ClairS + DeepVariant + SoloConstit + SoloDiag + Somatic (error
         // in ClairS somatic)
 

+ 4 - 0
src/variant/variant_collection.rs

@@ -100,9 +100,13 @@ impl VariantCollection {
                         let mut alt_seq = var.alternative.to_string();
 
                         let c = if var.alteration_category() == AlterationCategory::Ins {
+                            // TODO: Add stretch comparison
                             alt_seq = alt_seq.split_off(1);
                             counts_ins_at(&mut bam, &var.position.contig(), var.position.position)?
                         } else {
+                            if var.alteration_category() == AlterationCategory::Del {
+                                alt_seq = "D".to_string();
+                            }
                             counts_at(&mut bam, &var.position.contig(), var.position.position)?
                         };
 

この差分においてかなりの量のファイルが変更されているため、一部のファイルを表示していません