Pārlūkot izejas kodu

filters BAM finished (del pos + 1) // begin ExternalAnnotations

Thomas 1 gadu atpakaļ
vecāks
revīzija
45e1063e97

+ 13 - 0
Cargo.lock

@@ -1528,6 +1528,15 @@ dependencies = [
  "dirs-sys",
 ]
 
+[[package]]
+name = "dirs"
+version = "5.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
+dependencies = [
+ "dirs-sys",
+]
+
 [[package]]
 name = "dirs-next"
 version = "2.0.0"
@@ -3598,6 +3607,7 @@ dependencies = [
  "csv",
  "ctrlc",
  "dashmap",
+ "dirs",
  "duct",
  "env_logger 0.11.6",
  "expectrl",
@@ -3624,6 +3634,7 @@ dependencies = [
  "ptyprocess",
  "rayon",
  "regex",
+ "rusqlite",
  "rust-htslib 0.49.0",
  "serde",
  "serde_json",
@@ -4406,10 +4417,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e"
 dependencies = [
  "bitflags 2.6.0",
+ "chrono",
  "fallible-iterator",
  "fallible-streaming-iterator",
  "hashlink",
  "libsqlite3-sys",
+ "serde_json",
  "smallvec",
 ]
 

+ 2 - 0
Cargo.toml

@@ -49,3 +49,5 @@ noodles-fasta = "0.46.0"
 noodles-core = "0.15.0"
 blake3 = "1.5.5"
 charming = { version = "0.4.0", features = ["ssr"] }
+rusqlite = { version = "0.32.1", features = ["chrono", "serde_json"] }
+dirs = "5.0.1"

BIN
prob.vcf.gz


+ 32 - 0
src/annotation/cosmic.rs

@@ -0,0 +1,32 @@
+use std::str::FromStr;
+
+use serde::{Serialize, Deserialize};
+
+use anyhow::{anyhow, Context, Ok, Result};
+
+#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
+pub struct Cosmic {
+    pub cosmic_cnt: u64,
+}
+
+impl FromStr for Cosmic {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        let vs: Vec<&str> = s.split(";").collect();
+        if vs.len() != 3 {
+            return Err(anyhow!("Error while parsing Cosmic results not the right number of parts for {s}"));
+        }
+
+        if vs[0].contains("MISSING") {
+            Err(anyhow!("MISSING values in Cosmic results: {s}"))
+        } else {
+            let v: Vec<&str> = vs[2].split("=").collect();
+
+            Ok(Cosmic {
+                cosmic_cnt: v[1].parse().context("parsing cosmic cnt")?,
+            })
+        }
+    }
+}
+

+ 63 - 0
src/annotation/echtvar.rs

@@ -0,0 +1,63 @@
+use std::{
+    io::{BufRead, BufReader},
+    process::{Command, Stdio},
+};
+
+use anyhow::{Context, Ok, Result};
+use log::warn;
+
+use super::{cosmic::Cosmic, gnomad::GnomAD};
+
+// /data/tools/echtvar anno -e /data/ref/hs1/CosmicCodingMuts.echtvar.zip -e /data/ref/hs1/gnomAD_4-2022_10-gnomad.echtvar.zip BENGUIRAT_diag_clairs_PASSED.vcf.gz test.bcf
+pub fn run_echtvar(in_path: &str, out_path: &str) -> Result<()> {
+    let bin_dir = "/data/tools";
+
+    let annot_sources: Vec<&str> = [
+        "/data/ref/hs1/CosmicCodingMuts.echtvar.zip",
+        "/data/ref/hs1/gnomAD_4-2022_10-gnomad.echtvar.zip",
+    ]
+    .iter()
+    .flat_map(|e| vec!["-e", e])
+    .collect();
+
+    // info!("Running echtvar anno for {}", in_path);
+    let mut cmd = Command::new(format!("{}/echtvar", bin_dir))
+        .arg("anno")
+        .args(annot_sources)
+        .arg(in_path)
+        .arg(out_path)
+        .stderr(Stdio::piped())
+        .spawn()
+        .context("echtvar anno failed to start")?;
+
+    let stderr = cmd.stderr.take().unwrap();
+    let reader = BufReader::new(stderr);
+    reader
+        .lines()
+        .map_while(Result::ok)
+        .filter(|line| line.contains("error"))
+        .for_each(|line| warn!("{}", line));
+
+    cmd.wait()?;
+    Ok(())
+}
+
+pub fn parse_echtvar_val(s: &str) -> Result<(Option<Cosmic>, Option<GnomAD>)> {
+    let mi: Vec<_> = s.match_indices(r";gnomad").collect();
+    let (i, _) = mi[0];
+    let str_cosmic = &s[..i];
+    let str_gnomad = &s[i + 1..];
+
+    let cosmic = if str_cosmic.contains("MISSING") {
+        None
+    } else {
+        Some(str_cosmic.parse::<Cosmic>()?)
+    };
+
+    let gnomad = if str_gnomad.contains("-1") {
+        None
+    } else {
+        Some(str_gnomad.parse::<GnomAD>()?)
+    };
+    Ok((cosmic, gnomad))
+}

+ 61 - 0
src/annotation/gnomad.rs

@@ -0,0 +1,61 @@
+use std::str::FromStr;
+use serde::{Serialize, Deserialize};
+use anyhow::{anyhow, Ok, Result};
+
+#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
+pub struct GnomAD {
+    pub gnomad_ac: u64,
+    pub gnomad_an: u64,
+    pub gnomad_af: f64,
+    pub gnomad_af_oth: f64,
+    pub gnomad_af_ami: f64,
+    pub gnomad_af_sas: f64,
+    pub gnomad_af_fin: f64,
+    pub gnomad_af_eas: f64,
+    pub gnomad_af_amr: f64,
+    pub gnomad_af_afr: f64,
+    pub gnomad_af_mid: f64,
+    pub gnomad_af_asj: f64,
+    pub gnomad_af_nfe: f64,
+}
+
+impl FromStr for GnomAD {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        let vs: Vec<_> = s.split(";").collect();
+        if vs.len() < 13 {
+            return Err(anyhow!("Error not the right number of parts for {:?}", s));
+        }
+        if vs[0].contains("-1") {
+            return Err(anyhow!(
+                "MISSING values check for -1 before parsing for {:?}",
+                s
+            ));
+        }
+
+        let vv: Vec<&str> = vs
+            .iter()
+            .map(|e| {
+                let v: Vec<_> = e.split("=").collect();
+                v[1]
+            })
+            .collect();
+
+        Ok(GnomAD {
+            gnomad_ac: vv[0].parse()?,
+            gnomad_an: vv[1].parse()?,
+            gnomad_af: vv[2].parse()?,
+            gnomad_af_oth: vv[3].parse()?,
+            gnomad_af_ami: vv[4].parse()?,
+            gnomad_af_sas: vv[5].parse()?,
+            gnomad_af_fin: vv[6].parse()?,
+            gnomad_af_eas: vv[7].parse()?,
+            gnomad_af_amr: vv[8].parse()?,
+            gnomad_af_afr: vv[9].parse()?,
+            gnomad_af_mid: vv[10].parse()?,
+            gnomad_af_asj: vv[11].parse()?,
+            gnomad_af_nfe: vv[12].parse()?,
+        })
+    }
+}

+ 29 - 10
src/annotation/mod.rs

@@ -1,14 +1,23 @@
+pub mod cosmic;
+pub mod echtvar;
+pub mod gnomad;
+
 use std::{
-    collections::{HashMap, HashSet}, fmt, str::FromStr, sync::Arc
+    collections::{HashMap, HashSet},
+    fmt,
+    str::FromStr,
+    sync::Arc,
 };
 
 use crate::helpers::mean;
+use cosmic::Cosmic;
 use dashmap::DashMap;
+use gnomad::GnomAD;
 use rayon::prelude::*;
 
 #[derive(Debug, Clone, PartialEq)]
 pub enum Annotation {
-    SoloDiag,
+    SoloTumor,
     SoloConstit,
     Callers(Caller),
     Germline,
@@ -18,12 +27,14 @@ pub enum Annotation {
     ConstitAlt(u16),
     LowConstitDepth,
     HighConstitAlt,
+    Cosmic(Cosmic),
+    GnomAD(GnomAD),
 }
 
 impl fmt::Display for Annotation {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let str = match self {
-            Annotation::SoloDiag => "SoloDiag",
+            Annotation::SoloTumor => "SoloTumor",
             Annotation::SoloConstit => "SoloConstit",
             Annotation::Callers(caller) => &caller.to_string(),
             Annotation::Germline => "Germline",
@@ -33,6 +44,8 @@ impl fmt::Display for Annotation {
             Annotation::ConstitAlt(_) => "ConstitAlt",
             Annotation::LowConstitDepth => "LowConstitDepth",
             Annotation::HighConstitAlt => "HighConstitAlt",
+            Annotation::Cosmic(_) => "Cosmic",
+            Annotation::GnomAD(_) => "GnomAD",
         };
         write!(f, "{}", str)
     }
@@ -43,7 +56,7 @@ impl FromStr for Annotation {
 
     fn from_str(s: &str) -> anyhow::Result<Self> {
         match s {
-            "SoloDiag" => Ok(Annotation::SoloDiag),
+            "SoloTumor" => Ok(Annotation::SoloTumor),
             "SoloConstit" => Ok(Annotation::SoloConstit),
             "DeepVariant" => Ok(Annotation::Callers(Caller::DeepVariant)),
             "ClairS" => Ok(Annotation::Callers(Caller::ClairS)),
@@ -104,17 +117,19 @@ impl Annotations {
             let mut numerical = Vec::new();
             for ann in anns {
                 match ann {
-                    Annotation::SoloDiag
+                    Annotation::SoloTumor
                     | Annotation::SoloConstit
                     | Annotation::Germline
                     | Annotation::Somatic
                     | Annotation::LowConstitDepth
+                    | Annotation::GnomAD(_)
                     | Annotation::HighConstitAlt => categorical.push(ann.to_string()),
                     Annotation::Callers(caller) => categorical.push(caller.to_string()),
                     Annotation::ShannonEntropy(v) => numerical.push((ann.to_string(), *v)),
                     Annotation::ConstitDepth(v) | Annotation::ConstitAlt(v) => {
                         numerical.push((ann.to_string(), *v as f64));
                     }
+                    Annotation::Cosmic(c) => numerical.push((ann.to_string(), c.cosmic_cnt as f64)),
                 }
             }
 
@@ -174,11 +189,17 @@ impl Annotations {
         self.store.retain(|key, _| keys_to_keep.contains(key));
     }
 
+    pub fn remove_keys(&mut self, keys_to_remove: &HashSet<u128>) {
+        self.store.retain(|key, _| !keys_to_remove.contains(key));
+    }
+
     pub fn solo_constit_boundaries(&self, max_alt_constit: u16, min_constit_depth: u16) {
         self.store
             .iter_mut()
             .filter(|anns| {
-                let contains = anns.iter().any(|item| matches!(item, Annotation::SoloDiag));
+                let contains = anns
+                    .iter()
+                    .any(|item| matches!(item, Annotation::SoloTumor));
                 let contains_not = anns.iter().all(|item| !matches!(item, Annotation::Somatic));
 
                 contains && contains_not
@@ -213,10 +234,8 @@ impl Annotations {
                 |mut counts, r| {
                     let annotations = r.value();
                     for (index, annotation_type) in annotation_types.iter().enumerate() {
-                        counts[index] += annotations
-                            .iter()
-                            .filter(|a| *a == annotation_type)
-                            .count();
+                        counts[index] +=
+                            annotations.iter().filter(|a| *a == annotation_type).count();
                     }
                     counts
                 },

+ 1 - 1
src/callers/deep_variant.rs

@@ -127,7 +127,7 @@ impl Run for DeepVariant {
 impl Variants for DeepVariant {
     fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
         let solo = match self.time.as_str() {
-            "diag" => Annotation::SoloDiag,
+            "diag" => Annotation::SoloTumor,
             "mrd" => Annotation::SoloConstit,
             _ => return Err(anyhow::anyhow!("Invalid time point.")),
         };

+ 15 - 0
src/helpers.rs

@@ -288,3 +288,18 @@ where
     // Calculate and return the average as f64
    sum.into() / count as f64
 }
+
+pub fn app_storage_dir() -> anyhow::Result<PathBuf> {
+    let app_name = env!("CARGO_PKG_NAME");
+    let app_dir = dirs::data_dir()
+        .context("Failed to get data directory")?
+        .join(app_name);
+    
+    if !app_dir.exists() {
+        fs::create_dir_all(&app_dir)
+            .context("Failed to create application directory")?;
+    }
+    
+    Ok(app_dir)
+}
+

+ 8 - 1
src/lib.rs

@@ -29,7 +29,7 @@ mod tests {
 
     use annotation::Annotations;
     use callers::{nanomonsv::nanomonsv_create_pon, savana::Savana, severus::{Severus, SeverusSolo}};
-    use collection::{bam::{counts_ins_at, ins_pileup, nt_pileup}, Initialize, InitializeSolo, Version};
+    use collection::{bam::{counts_at, counts_ins_at, ins_pileup, nt_pileup}, Initialize, InitializeSolo, Version};
     use commands::{longphase::{LongphaseConfig, LongphaseHap, LongphaseModcallSolo, LongphasePhase}, modkit::{bed_methyl, ModkitConfig}};
     use functions::assembler::{Assembler, AssemblerConfig};
     use helpers::estimate_shannon_entropy;
@@ -531,6 +531,13 @@ mod tests {
         assert_eq!(13, *counts.get("G").unwrap());
         assert_eq!(6, *counts.get("D").unwrap());
 
+        let chr = "chr1";
+        let position = 3220; // 1-based
+
+        let mut bam = rust_htslib::bam::IndexedReader::from_path(c.solo_bam(id, "mrd"))?;
+        let p = counts_at(&mut bam, chr, position - 1)?;
+
+        println!("{p:#?}");
         Ok(())
     }
 

+ 128 - 137
src/pipes/somatic.rs

@@ -39,128 +39,48 @@ impl Initialize for Somatic {
 }
 
 #[derive(Debug, Default, Clone)]
-struct SomaticStats {
-    initial: HashMap<String, usize>,
-    annotations_stats: Vec<AnnotationsStats>,
+pub struct SomaticStats {
+    pub input: InputStats,
+    pub n_constit_germline: usize,
+    pub n_low_constit: usize,
+    pub n_high_alt_constit: usize,
 }
 
-impl SomaticStats {
-    pub fn init(collections: &[VariantCollection]) -> Self {
-        let mut initial = HashMap::new();
+#[derive(Debug, Default, Clone)]
+pub struct InputStats {
+    pub solo_tumor: Vec<(Annotation, usize)>,
+    pub solo_constit: Vec<(Annotation, usize)>,
+    pub germline: Vec<(Annotation, usize)>,
+    pub somatic: Vec<(Annotation, usize)>,
+}
 
+impl InputStats {
+    pub fn from_collections(collections: &[VariantCollection]) -> Self {
+        let mut stats = Self::default();
         for collection in collections.iter() {
             let name = format!("{}_{}", collection.vcf.caller, collection.vcf.time);
-            initial.insert(name, collection.variants.len());
+            let (caller, cat) = to_callers_cat(&name);
+            match cat {
+                Annotation::SoloTumor => stats.solo_tumor.push((caller, collection.variants.len())),
+                Annotation::SoloConstit => {
+                    stats.solo_constit.push((caller, collection.variants.len()))
+                }
+                Annotation::Germline => stats.germline.push((caller, collection.variants.len())),
+                Annotation::Somatic => stats.somatic.push((caller, collection.variants.len())),
+                _ => (),
+            };
         }
+        stats
+    }
+}
 
+impl SomaticStats {
+    pub fn init(collections: &[VariantCollection]) -> Self {
         Self {
-            initial,
-            annotations_stats: Default::default(),
+            input: InputStats::from_collections(collections),
+            ..Default::default()
         }
     }
-
-    pub fn push_annotations_stats(&mut self, annotations_stats: AnnotationsStats) {
-        self.annotations_stats.push(annotations_stats);
-    }
-
-    pub fn aggregation(&mut self) -> anyhow::Result<()> {
-        let annotations_stats = self
-            .annotations_stats
-            .get(0)
-            .ok_or(anyhow::anyhow!("Can't find stats"))?;
-        let step_cat = vec![
-            (Annotation::Germline, 0),
-            (Annotation::Somatic, 0),
-            (Annotation::SoloDiag, 0),
-            (Annotation::SoloConstit, 0),
-        ];
-
-        let mut callers_cats: Vec<((Annotation, Annotation), Vec<(Annotation, u64)>)> = self
-            .initial
-            .keys()
-            .map(|s| (to_callers_cat(s), step_cat.clone()))
-            .collect();
-
-        let mut node_names = Vec::new();
-
-        node_names.extend(step_cat.clone().into_iter().map(|(cat, _)| cat.to_string()));
-
-        let stats: anyhow::Result<Vec<()>> = annotations_stats
-            .categorical
-            .iter()
-            .map(|e| {
-                let v = e.value();
-                let keys: Vec<&str> = e.key().split(" + ").collect();
-                let k_a: Vec<Annotation> = keys
-                    .into_iter()
-                    .map(|e| e.parse())
-                    .collect::<anyhow::Result<_>>()?;
-
-                for ((caller, cat), counts) in callers_cats.iter_mut() {
-                    node_names.push(format!("{} {}", caller, cat));
-                    if k_a.contains(caller) && k_a.contains(cat) {
-                        for (c_annot, value) in counts.iter_mut() {
-                            if k_a.contains(c_annot) {
-                                *value += v;
-                            }
-                        }
-                    }
-                }
-
-                Ok(())
-            })
-            .collect();
-        stats?;
-
-        println!("{callers_cats:#?}");
-        let mut links: Vec<(String, String, f64)> = callers_cats
-            .iter()
-            .flat_map(|((caller, cat), counts)| {
-                let from = format!("{} {}", caller, cat);
-                counts
-                    .iter()
-                    .map(move |(annot, count)| (from.clone(), annot.to_string(), *count as f64))
-            })
-            .collect();
-        links.sort_by(|a, b| {
-            a.2.partial_cmp(&b.2)
-                .unwrap()
-                .then(a.0.cmp(&b.0))
-                .then(a.1.cmp(&b.1))
-        });
-
-        links.dedup();
-
-        node_names.sort();
-        node_names.dedup();
-
-        println!("{links:?}");
-        println!("{node_names:?}");
-
-        let chart = Chart::new().series(
-            Sankey::new()
-                .emphasis(Emphasis::new().focus(EmphasisFocus::Adjacency))
-                // .data(vec!["a", "b", "a1", "a2", "b1", "c"])
-                .data(node_names)
-                .links(links), // .links(vec![
-                               //     ("a", "a1", 5),
-                               //     ("a", "a2", 3),
-                               //     ("b", "b1", 8),
-                               //     ("a", "b1", 3),
-                               //     ("b1", "a1", 1),
-                               //     ("b1", "c", 2),
-                               // ]),
-                               // .data(node_names)
-                               // .links(vec![("ClairS Somatic", "Germline", 10)])
-                               //     ,
-                               // ),
-        );
-
-        let mut renderer = charming::ImageRenderer::new(1000, 800);
-        renderer.save(&chart, "/data/sankey.svg")?;
-
-        Ok(())
-    }
 }
 
 pub fn to_callers_cat(s: &str) -> (Annotation, Annotation) {
@@ -178,7 +98,7 @@ pub fn to_callers_cat(s: &str) -> (Annotation, Annotation) {
     } else if splits.contains(&"DeepVariant") && splits.contains(&"diag") {
         (
             Annotation::Callers(Caller::DeepVariant),
-            Annotation::SoloDiag,
+            Annotation::SoloTumor,
         )
     } else {
         panic!("unknown caller: {s}");
@@ -209,16 +129,16 @@ impl Run for Somatic {
 
         // Annotations Stats
         // let mut annotations = Arc::unwrap_or_clone(annotations);
-        somatic_stats.push_annotations_stats(annotations.callers_stat());
-        somatic_stats.aggregation()?;
+        annotations.callers_stat();
+        // somatic_stats.push_annotations_stats();
 
-        return Ok(());
         // TODO: look at variants: ClairS + DeepVariant + SoloConstit + SoloDiag + Somatic (error
         // in ClairS somatic)
 
         // Filtering Somatic variants
         info!("Filtering somatic variants (variants not in salo callers on constit sample or germline).");
-        let germline_or_somatic_keys = annotations.get_keys_filter(|anns| {
+
+        let tumor_or_somatic_keys = annotations.get_keys_filter(|anns| {
             !anns.contains(&Annotation::Germline) && !anns.contains(&Annotation::SoloConstit)
         });
 
@@ -227,30 +147,41 @@ impl Run for Somatic {
                 .iter()
                 .flat_map(|e| e.keys())
                 .collect::<Vec<u128>>(),
-            &germline_or_somatic_keys,
+            &tumor_or_somatic_keys,
         );
         assert_eq!(0, remains.len());
 
-        info!("Somatic variants positions {}.", somatic_keys.len());
-        info!("Germline variants positions {}.", germline_keys.len());
+        info!(
+            "Germline variants positions removed {}.",
+            germline_keys.len()
+        );
+        germline_keys.len();
 
         let somatic_keys: HashSet<u128> = somatic_keys.into_iter().collect();
+        let mut annotations = Arc::try_unwrap(annotations)
+            .map_err(|e| anyhow::anyhow!("Failed to unwrap Arc: {:?}", e))?;
         annotations.retain_keys(&somatic_keys);
         annotations.callers_stat();
 
-        variants_collection.par_iter_mut().for_each(|c| {
-            let before = c.variants.len();
-            c.retain_keys(&somatic_keys);
-            let after = c.variants.len();
-            info!(
-                "Variants removed from {}: {}",
-                c.vcf.path.display(),
+        somatic_stats.n_constit_germline = variants_collection
+            .par_iter_mut()
+            .map(|c| {
+                let before = c.variants.len();
+                c.retain_keys(&somatic_keys);
+                let after = c.variants.len();
+                info!(
+                    "Variants removed from {}: {}",
+                    c.vcf.path.display(),
+                    before - after
+                );
                 before - after
-            );
-        });
+            })
+            .sum();
 
         variants_collection.retain(|e| !e.variants.is_empty());
 
+        variants_collection.iter().for_each(|c| c.stats());
+
         info!("Constit Bam annotation...");
         variants_collection.iter().try_for_each(|c| {
             c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150)
@@ -260,22 +191,81 @@ impl Run for Somatic {
             self.config.solo_min_constit_depth,
         );
 
+        annotations.callers_stat();
+
+        //
+        // Remove LowConstitDepth
+        info!(
+            "Removing low constit depth (depth < {})",
+            self.config.solo_min_constit_depth
+        );
+
+        let low_constit_keys: HashSet<u128> = annotations
+            .get_keys_filter(|anns| anns.contains(&Annotation::LowConstitDepth))
+            .into_iter()
+            .collect();
+
+        annotations.remove_keys(&low_constit_keys);
+
+        somatic_stats.n_low_constit = variants_collection
+            .par_iter_mut()
+            .map(|c| {
+                let before = c.variants.len();
+                c.remove_keys(&low_constit_keys);
+                let after = c.variants.len();
+                let rm = before - after;
+                info!("Variants removed from {}: {rm}", c.vcf.path.display(),);
+                rm
+            })
+            .sum();
+
+        variants_collection.retain(|e| !e.variants.is_empty());
+
+        variants_collection.iter().for_each(|c| c.stats());
+
+        // Remove High Constit Alt
+        info!(
+            "Removing high constit alternative allele (alt constit > {})",
+            self.config.solo_max_alt_constit
+        );
+
+        let high_alt_keys: HashSet<u128> = annotations
+            .get_keys_filter(|anns| anns.contains(&Annotation::HighConstitAlt))
+            .into_iter()
+            .collect();
+
+        annotations.remove_keys(&high_alt_keys);
+
+        somatic_stats.n_high_alt_constit = variants_collection
+            .par_iter_mut()
+            .map(|c| {
+                let before = c.variants.len();
+                c.remove_keys(&high_alt_keys);
+                let after = c.variants.len();
+                let rm = before - after;
+                info!("Variants removed from {}: {rm}", c.vcf.path.display(),);
+                rm
+            })
+            .sum();
+
+        variants_collection.retain(|e| !e.variants.is_empty());
+
+        variants_collection.iter().for_each(|c| c.stats());
+
+        annotations.callers_stat();
+
+        // Entropy
         info!("Entropy annotation...");
         variants_collection.iter().for_each(|c| {
             c.annotate_with_sequence_entropy(&annotations, &self.config.reference, 10, 150);
         });
         annotations.callers_stat();
 
+
         let prob_keys: HashSet<u128> = annotations
             .get_keys_filter(|anns| {
-                // let contains = anns.iter().any(|item| matches!(item, Annotation::SoloDiag));
-                // let contains_not = anns.iter().all(|item| !matches!(item, Annotation::Somatic));
-                //
-                // contains && contains_not
-                anns.contains(&Annotation::SoloDiag)
-                    && !anns.contains(&Annotation::Somatic)
-                    && !anns.contains(&Annotation::LowConstitDepth)
-                    && !anns.contains(&Annotation::HighConstitAlt)
+                anns.contains(&Annotation::SoloTumor)
+                    && !anns.contains(&Annotation::Callers(Caller::ClairS))
             })
             .into_iter()
             .collect();
@@ -287,6 +277,7 @@ impl Run for Somatic {
         let problematic_variants: Vec<VcfVariant> = problematic_variants
             .iter_mut()
             .flat_map(|e| {
+                info!("{} {}:\t{}", e.vcf.caller, e.vcf.time, e.variants.len());
                 e.retain_keys(&prob_keys);
                 e.variants.clone()
             })

+ 114 - 7
src/variant/variant_collection.rs

@@ -1,12 +1,17 @@
 use std::collections::HashSet;
 
+use chrono::Utc;
+use log::info;
 use rayon::prelude::*;
 
 use super::variant::{AlterationCategory, VcfVariant};
 use crate::{
-    annotation::{Annotation, Annotations},
-    collection::{bam::{counts_at, counts_ins_at}, vcf::Vcf},
-    helpers::estimate_shannon_entropy,
+    annotation::{cosmic::Cosmic, gnomad::GnomAD, Annotation, Annotations},
+    collection::{
+        bam::{counts_at, counts_ins_at},
+        vcf::Vcf,
+    },
+    helpers::{app_storage_dir, estimate_shannon_entropy},
     pipes::somatic::sequence_at,
 };
 
@@ -26,6 +31,11 @@ impl VariantCollection {
             .retain(|v| keys_to_keep.contains(&v.hash_variant()));
     }
 
+    pub fn remove_keys(&mut self, keys_to_remove: &HashSet<u128>) {
+        self.variants
+            .retain(|v| !keys_to_remove.contains(&v.hash_variant()));
+    }
+
     pub fn chunk_size(&self, max_threads: u8) -> usize {
         let total_items = self.variants.len();
         let min_chunk_size = 1000;
@@ -103,10 +113,10 @@ impl VariantCollection {
                             // TODO: Add stretch comparison
                             alt_seq = alt_seq.split_off(1);
                             counts_ins_at(&mut bam, &var.position.contig(), var.position.position)?
+                        } else if var.alteration_category() == AlterationCategory::Del {
+                            alt_seq = "D".to_string();
+                            counts_at(&mut bam, &var.position.contig(), var.position.position + 1)?
                         } else {
-                            if var.alteration_category() == AlterationCategory::Del {
-                                alt_seq = "D".to_string();
-                            }
                             counts_at(&mut bam, &var.position.contig(), var.position.position)?
                         };
 
@@ -117,7 +127,6 @@ impl VariantCollection {
                             depth += n;
                         }
 
-
                         anns.push(Annotation::ConstitAlt(n_alt as u16));
                         anns.push(Annotation::ConstitDepth(depth as u16));
                     }
@@ -126,4 +135,102 @@ impl VariantCollection {
             })?;
         Ok(())
     }
+
+    pub fn stats(&self) {
+        println!(
+            "{} {}\t{}",
+            self.vcf.caller,
+            self.vcf.time,
+            self.variants.len()
+        );
+    }
+
+    pub fn external_annotation() {}
+}
+
+pub enum ExtAnnotationSource {
+    Cosmic(Cosmic),
+    GnomAD(GnomAD),
+}
+
+use rusqlite::{params, Connection, Result as SqliteResult};
+
+pub struct ExternalAnnotation {
+    pub conn: Connection,
+}
+
+impl ExternalAnnotation {
+    pub fn init() -> anyhow::Result<Self> {
+        let mut db_path = app_storage_dir()?;
+        db_path.push("annotations_db.sqlite");
+        info!("Opening DB: {}", db_path.display());
+        let conn = Connection::open(db_path)?;
+
+        conn.execute(
+            "CREATE TABLE IF NOT EXISTS your_table (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            hash BLOB(16) UNIQUE NOT NULL,
+            source TEXT NOT NULL,
+            data BLOB,
+            created_at TEXT NOT NULL
+        )",
+            [],
+        )?;
+
+        Ok(Self { conn })
+    }
+
+    pub fn annotate(
+        &self,
+        variants: &[VcfVariant],
+        annotations: &Annotations,
+        source: ExtAnnotationSource,
+    ) -> anyhow::Result<()> {
+        let mut unfound = Vec::new();
+        let mut found_count = 0;
+
+        let source = match &source {
+            ExtAnnotationSource::Cosmic(_) => "COSMIC",
+            ExtAnnotationSource::GnomAD(_) => "GnomAD",
+        };
+
+        for variant in variants {
+            let hash: u128 = variant.hash_variant();
+
+            // Attempt to retrieve the data from the database
+            let result: SqliteResult<(Vec<u8>, String)> = self.conn.query_row(
+                "SELECT data, created_at FROM your_table WHERE hash = ? AND source = ?",
+                params![hash.to_le_bytes().to_vec(), source],
+                |row| Ok((row.get(0)?, row.get(1)?)),
+            );
+
+            match result {
+                Ok((data, created_at)) => {
+                    // Deserialize the data based on the source
+                    let annotation = match source {
+                        "COSMIC" => {
+                            let cosmic: Cosmic = serde_json::from_slice(&data)?;
+                            Annotation::Cosmic(cosmic)
+                        }
+                        "GnomAD" => {
+                            let gnomad: GnomAD = serde_json::from_slice(&data)?;
+                            Annotation::GnomAD(gnomad)
+                        }
+                        _ => return Err(anyhow::anyhow!("Unknown source: {}", source)),
+                    };
+
+                    // Update the in-memory annotations
+                    annotations.store.entry(hash).or_default().push(annotation);
+                    found_count += 1;
+                }
+                Err(rusqlite::Error::QueryReturnedNoRows) => {
+                    // Variant not found in the database
+                    unfound.push(variant.clone());
+                }
+                Err(e) => return Err(e.into()),
+            }
+        }
+
+        Ok(())
+    }
 }