Thomas 10 months ago
parent
commit
605ee768fb

+ 132 - 35
Cargo.lock

@@ -1525,16 +1525,16 @@ version = "5.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9a49173b84e034382284f27f1af4dcbbd231ffa358c0fe316541a7337f376a35"
 dependencies = [
- "dirs-sys",
+ "dirs-sys 0.4.1",
 ]
 
 [[package]]
 name = "dirs"
-version = "5.0.1"
+version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
+checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e"
 dependencies = [
- "dirs-sys",
+ "dirs-sys 0.5.0",
 ]
 
 [[package]]
@@ -1555,10 +1555,22 @@ checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
 dependencies = [
  "libc",
  "option-ext",
- "redox_users",
+ "redox_users 0.4.6",
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "dirs-sys"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab"
+dependencies = [
+ "libc",
+ "option-ext",
+ "redox_users 0.5.0",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "dirs-sys-next"
 version = "0.1.2"
@@ -1566,7 +1578,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
 dependencies = [
  "libc",
- "redox_users",
+ "redox_users 0.4.6",
  "winapi",
 ]
 
@@ -3149,9 +3161,9 @@ dependencies = [
  "byteorder",
  "indexmap",
  "memchr",
- "noodles-bgzf",
- "noodles-core",
- "noodles-csi",
+ "noodles-bgzf 0.34.0",
+ "noodles-core 0.15.0",
+ "noodles-csi 0.41.0",
  "noodles-sam",
 ]
 
@@ -3167,6 +3179,18 @@ dependencies = [
  "flate2",
 ]
 
+[[package]]
+name = "noodles-bgzf"
+version = "0.35.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6786136e224bdb8550b077ad44ef2bd5ebc8b06d07fab69aaa7f47d06f0da75"
+dependencies = [
+ "byteorder",
+ "bytes",
+ "crossbeam-channel",
+ "flate2",
+]
+
 [[package]]
 name = "noodles-core"
 version = "0.15.0"
@@ -3176,6 +3200,15 @@ dependencies = [
  "bstr",
 ]
 
+[[package]]
+name = "noodles-core"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "962b13b79312f773a12ffcb0cdaccab6327f8343b6f440a888eff10c749d52b0"
+dependencies = [
+ "bstr",
+]
+
 [[package]]
 name = "noodles-csi"
 version = "0.41.0"
@@ -3186,8 +3219,22 @@ dependencies = [
  "bstr",
  "byteorder",
  "indexmap",
- "noodles-bgzf",
- "noodles-core",
+ "noodles-bgzf 0.34.0",
+ "noodles-core 0.15.0",
+]
+
+[[package]]
+name = "noodles-csi"
+version = "0.43.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "197f4c332f233135159b62bd9a6c35d0bf8366ccf0d7b9cbed3c6ec92a8e4464"
+dependencies = [
+ "bit-vec 0.8.0",
+ "bstr",
+ "byteorder",
+ "indexmap",
+ "noodles-bgzf 0.35.0",
+ "noodles-core 0.16.0",
 ]
 
 [[package]]
@@ -3199,8 +3246,21 @@ dependencies = [
  "bstr",
  "bytes",
  "memchr",
- "noodles-bgzf",
- "noodles-core",
+ "noodles-bgzf 0.34.0",
+ "noodles-core 0.15.0",
+]
+
+[[package]]
+name = "noodles-fasta"
+version = "0.47.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad6daab7b0475167724742ce5e799de6aaac19c75ed1b56d0040ef1dab6ca079"
+dependencies = [
+ "bstr",
+ "bytes",
+ "memchr",
+ "noodles-bgzf 0.35.0",
+ "noodles-core 0.16.0",
 ]
 
 [[package]]
@@ -3210,9 +3270,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f5d1b448c6cf133a71e6370499d875ff9ad4b1d0a502dbfd8b067a1f1153f67f"
 dependencies = [
  "indexmap",
- "noodles-bgzf",
- "noodles-core",
- "noodles-csi",
+ "noodles-bgzf 0.34.0",
+ "noodles-core 0.15.0",
+ "noodles-csi 0.41.0",
+ "percent-encoding",
+]
+
+[[package]]
+name = "noodles-gff"
+version = "0.43.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d968405cf400568b24a47587b6e916d0cf118d7d38625c29472f491461f094e"
+dependencies = [
+ "indexmap",
+ "noodles-bgzf 0.35.0",
+ "noodles-core 0.16.0",
+ "noodles-csi 0.43.0",
  "percent-encoding",
 ]
 
@@ -3227,9 +3300,9 @@ dependencies = [
  "indexmap",
  "lexical-core 1.0.5",
  "memchr",
- "noodles-bgzf",
- "noodles-core",
- "noodles-csi",
+ "noodles-bgzf 0.34.0",
+ "noodles-core 0.15.0",
+ "noodles-csi 0.41.0",
 ]
 
 [[package]]
@@ -3240,9 +3313,9 @@ checksum = "fde991a31c6203845117944c1d5f697b69c382e37eb2d70f3e3f2b575fbca62d"
 dependencies = [
  "byteorder",
  "indexmap",
- "noodles-bgzf",
- "noodles-core",
- "noodles-csi",
+ "noodles-bgzf 0.34.0",
+ "noodles-core 0.15.0",
+ "noodles-csi 0.41.0",
 ]
 
 [[package]]
@@ -3253,9 +3326,9 @@ checksum = "1ae18ab19252b5f8a4fe3310a0a1d2e2875a886e81a9e64aa69510a471655921"
 dependencies = [
  "indexmap",
  "memchr",
- "noodles-bgzf",
- "noodles-core",
- "noodles-csi",
+ "noodles-bgzf 0.34.0",
+ "noodles-core 0.15.0",
+ "noodles-csi 0.41.0",
  "noodles-tabix",
  "percent-encoding",
 ]
@@ -3596,7 +3669,7 @@ dependencies = [
  "average",
  "env_logger 0.11.6",
  "log",
- "noodles-fasta",
+ "noodles-fasta 0.46.0",
  "rayon",
  "rust-htslib 0.49.0",
  "uuid",
@@ -3631,10 +3704,10 @@ dependencies = [
  "log",
  "logtest",
  "nix 0.29.0",
- "noodles-core",
- "noodles-csi",
- "noodles-fasta",
- "noodles-gff",
+ "noodles-core 0.16.0",
+ "noodles-csi 0.43.0",
+ "noodles-fasta 0.47.0",
+ "noodles-gff 0.43.0",
  "num-format",
  "pandora_lib_assembler",
  "pandora_lib_bindings",
@@ -3643,12 +3716,14 @@ dependencies = [
  "podders",
  "pty-process",
  "ptyprocess",
+ "rand",
  "rayon",
  "regex",
  "rusqlite",
  "rust-htslib 0.49.0",
  "serde",
  "serde_json",
+ "tar",
  "tempfile",
  "test-log",
  "tokio",
@@ -3702,11 +3777,11 @@ dependencies = [
  "indicatif-log-bridge",
  "log",
  "noodles-bam",
- "noodles-bgzf",
- "noodles-core",
- "noodles-csi",
- "noodles-fasta",
- "noodles-gff",
+ "noodles-bgzf 0.34.0",
+ "noodles-core 0.15.0",
+ "noodles-csi 0.41.0",
+ "noodles-fasta 0.46.0",
+ "noodles-gff 0.41.0",
  "noodles-sam",
  "noodles-tabix",
  "noodles-vcf",
@@ -4260,6 +4335,17 @@ dependencies = [
  "thiserror 1.0.69",
 ]
 
+[[package]]
+name = "redox_users"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b"
+dependencies = [
+ "getrandom",
+ "libredox",
+ "thiserror 2.0.9",
+]
+
 [[package]]
 name = "regex"
 version = "1.11.1"
@@ -5205,6 +5291,17 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
+[[package]]
+name = "tar"
+version = "0.4.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c65998313f8e17d0d553d28f91a0df93e4dbbbf770279c7bc21ca0f09ea1a1f6"
+dependencies = [
+ "filetime",
+ "libc",
+ "xattr",
+]
+
 [[package]]
 name = "tempfile"
 version = "3.15.0"

+ 8 - 6
Cargo.toml

@@ -21,7 +21,7 @@ tracing-test = "0.2.5"
 tracing = "0.1.40"
 logtest = "2.0.0"
 test-log = "0.2.16"
-noodles-csi = "0.41.0"
+noodles-csi = "0.43.0"
 num-format = "0.4.4"
 locale_config = "0.3.0"
 byte-unit = "5.1.4"
@@ -44,12 +44,14 @@ podders = "0.1.4"
 arrow = "54.0.0"
 bgzip = "0.3.1"
 tempfile = "3.14.0"
-dashmap = { version = "6.1.0", features = ["rayon"] }
-noodles-fasta = "0.46.0"
-noodles-core = "0.15.0"
+dashmap = { version = "6.1.0", features = ["rayon", "serde"] }
+noodles-fasta = "0.47.0"
+noodles-core = "0.16.0"
 blake3 = "1.5.5"
 charming = { version = "0.4.0", features = ["ssr"] }
 rusqlite = { version = "0.32.1", features = ["chrono", "serde_json"] }
-dirs = "5.0.1"
-noodles-gff = "0.41.0"
+dirs = "6.0.0"
+noodles-gff = "0.43.0"
 itertools = "0.14.0"
+rand = "0.8.5"
+tar = "0.4.43"

+ 129 - 64
src/annotation/mod.rs

@@ -5,10 +5,7 @@ pub mod ncbi;
 pub mod vep;
 
 use std::{
-    collections::{HashMap, HashSet},
-    fmt,
-    str::FromStr,
-    sync::Arc,
+    collections::{HashMap, HashSet}, fmt, fs::File, io::{Read, Write}, str::FromStr, sync::Arc
 };
 
 use crate::{
@@ -20,15 +17,12 @@ use dashmap::DashMap;
 use gnomad::GnomAD;
 use log::info;
 use rayon::prelude::*;
+use serde::{Deserialize, Serialize};
 use vep::{get_best_vep, VepConsequence, VepImpact, VEP};
 
 #[derive(Debug, Clone, PartialEq)]
 pub enum Annotation {
-    SoloTumor,
-    SoloConstit,
-    Callers(Caller),
-    Germline,
-    Somatic,
+    Callers(Caller, Sample),
     AlterationCategory(AlterationCategory),
     ShannonEntropy(f64),
     ConstitDepth(u16),
@@ -41,14 +35,45 @@ pub enum Annotation {
     VEP(Vec<VEP>),
 }
 
+#[derive(Debug, Clone, PartialEq)]
+pub enum Sample {
+    SoloTumor,
+    SoloConstit,
+    Germline,
+    Somatic,
+}
+impl fmt::Display for Sample {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{}",
+            match self {
+                Sample::SoloTumor => "SoloTumor",
+                Sample::SoloConstit => "SoloConstit",
+                Sample::Germline => "Germline",
+                Sample::Somatic => "Somatic",
+            }
+        )
+    }
+}
+
+impl FromStr for Sample {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> anyhow::Result<Self> {
+        match s {
+            "SoloTumor" => Ok(Sample::SoloTumor),
+            "SoloConstit" => Ok(Sample::SoloConstit),
+            "Germline" => Ok(Sample::Germline),
+            "Somatic" => Ok(Sample::Somatic),
+            _ => Err(anyhow::anyhow!("Can't parse Sample from {s}")),
+        }
+    }
+}
 impl fmt::Display for Annotation {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let str = match self {
-            Annotation::SoloTumor => "SoloTumor",
-            Annotation::SoloConstit => "SoloConstit",
-            Annotation::Callers(caller) => &caller.to_string(),
-            Annotation::Germline => "Germline",
-            Annotation::Somatic => "Somatic",
+            Annotation::Callers(caller, sample) => &format!("{caller} {sample}"),
             Annotation::ShannonEntropy(_) => "ShannonEntropy",
             Annotation::ConstitDepth(_) => "ConstitDepth",
             Annotation::ConstitAlt(_) => "ConstitAlt",
@@ -68,23 +93,23 @@ impl FromStr for Annotation {
     type Err = anyhow::Error;
 
     fn from_str(s: &str) -> anyhow::Result<Self> {
-        match s {
-            "SoloTumor" => Ok(Annotation::SoloTumor),
-            "SoloConstit" => Ok(Annotation::SoloConstit),
-            "DeepVariant" => Ok(Annotation::Callers(Caller::DeepVariant)),
-            "DeepSomatic" => Ok(Annotation::Callers(Caller::DeepSomatic)),
-            "Savana" => Ok(Annotation::Callers(Caller::Savana)),
-            "NanomonSV" => Ok(Annotation::Callers(Caller::NanomonSV)),
-            "Severus" => Ok(Annotation::Callers(Caller::Severus)),
-            "ClairS" => Ok(Annotation::Callers(Caller::ClairS)),
-            "Germline" => Ok(Annotation::Germline),
-            "Somatic" => Ok(Annotation::Somatic),
-            s if s.starts_with("ShannonEntropy") => Ok(Annotation::ShannonEntropy(0.0)),
-            s if s.starts_with("ConstitDepth") => Ok(Annotation::ConstitDepth(0)),
-            s if s.starts_with("ConstitAlt") => Ok(Annotation::ConstitAlt(0)),
-            "LowConstitDepth" => Ok(Annotation::LowConstitDepth),
-            "HighConstitAlt" => Ok(Annotation::HighConstitAlt),
-            _ => Err(anyhow::anyhow!("Unknown Annotation: {}", s)),
+        if s.contains(" ") {
+            let (caller_str, sample_str) = s
+                .split_once(" ")
+                .ok_or_else(|| anyhow::anyhow!("Can't parse {s}"))?;
+            Ok(Annotation::Callers(
+                caller_str.parse()?,
+                sample_str.parse()?,
+            ))
+        } else {
+            match s {
+                s if s.starts_with("ShannonEntropy") => Ok(Annotation::ShannonEntropy(0.0)),
+                s if s.starts_with("ConstitDepth") => Ok(Annotation::ConstitDepth(0)),
+                s if s.starts_with("ConstitAlt") => Ok(Annotation::ConstitAlt(0)),
+                "LowConstitDepth" => Ok(Annotation::LowConstitDepth),
+                "HighConstitAlt" => Ok(Annotation::HighConstitAlt),
+                _ => Err(anyhow::anyhow!("Unknown Annotation: {}", s)),
+            }
         }
     }
 }
@@ -114,6 +139,23 @@ impl fmt::Display for Caller {
     }
 }
 
+impl FromStr for Caller {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> anyhow::Result<Self> {
+        match s {
+            "DeepVariant" => Ok(Caller::DeepVariant),
+            "ClairS" => Ok(Caller::ClairS),
+            "NanomonSV" => Ok(Caller::NanomonSV),
+            "NanomonSV-solo" => Ok(Caller::NanomonSVSolo),
+            "Savana" => Ok(Caller::Savana),
+            "Severus" => Ok(Caller::Severus),
+            "DeepSomatic" => Ok(Caller::DeepSomatic),
+            _ => Err(anyhow::anyhow!("Can't parse Caller {s}")),
+        }
+    }
+}
+
 #[derive(Debug, Clone)]
 pub struct Annotations {
     pub store: DashMap<Hash128, Vec<Annotation>, Blake3BuildHasher>,
@@ -127,12 +169,29 @@ impl Default for Annotations {
     }
 }
 
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
 pub struct AnnotationsStats {
     pub categorical: DashMap<String, u64>,
     pub numeric: DashMap<String, HashMap<String, Vec<f64>>>,
 }
 
+impl AnnotationsStats {
+    pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> {
+        let json = serde_json::to_string_pretty(self)?;
+        let mut file = File::create(file_path)?;
+        file.write_all(json.as_bytes())?;
+        Ok(())
+    }
+
+    pub fn load_from_json(file_path: &str) -> anyhow::Result<Self> {
+        let mut file = File::open(file_path)?;
+        let mut contents = String::new();
+        file.read_to_string(&mut contents)?;
+        let stats: AnnotationsStats = serde_json::from_str(&contents)?;
+        Ok(stats)
+    }
+}
+
 #[allow(clippy::type_complexity)]
 impl Annotations {
     pub fn insert_update(&self, key: Hash128, add: &[Annotation]) {
@@ -151,11 +210,7 @@ impl Annotations {
 
         self.store.par_iter().for_each(|e| {
             let anns = if let Some(sel) = &annotations {
-                e.value()
-                    .iter()
-                    .filter(|a| sel(a))
-                    .cloned()
-                    .collect()
+                e.value().iter().filter(|a| sel(a)).cloned().collect()
             } else {
                 e.value().clone()
             };
@@ -164,16 +219,14 @@ impl Annotations {
             let mut numerical = Vec::new();
             for ann in anns.iter() {
                 match ann {
-                    Annotation::SoloTumor
-                    | Annotation::SoloConstit
-                    | Annotation::Germline
-                    | Annotation::Somatic
-                    | Annotation::LowConstitDepth
+                    Annotation::LowConstitDepth
                     | Annotation::LowEntropy
                     | Annotation::GnomAD(_)
                     | Annotation::VEP(_)
                     | Annotation::HighConstitAlt => categorical.push(ann.to_string()),
-                    Annotation::Callers(caller) => categorical.push(caller.to_string()),
+                    Annotation::Callers(caller, sample) => {
+                        categorical.push(format!("{caller} {sample}"))
+                    }
                     Annotation::ShannonEntropy(v) => numerical.push((ann.to_string(), *v)),
                     Annotation::ConstitDepth(v) | Annotation::ConstitAlt(v) => {
                         numerical.push((ann.to_string(), *v as f64));
@@ -200,6 +253,7 @@ impl Annotations {
                     .push(v_num);
             }
         });
+        
         println!("\nCallers stats:");
         println!("\tn categories: {}", map.len());
         let mut n = 0;
@@ -303,18 +357,10 @@ impl Annotations {
             .map(|c| {
                 let before = c.variants.len();
                 c.variants.retain(|a| keys.contains(&a.hash()));
-
-                // c.variants = c
-                //     .variants
-                //     .par_iter()
-                //     .filter(|a| keys.contains(&a.hash()))
-                //     .cloned()
-                //     .collect();
                 let after = c.variants.len();
                 info!(
-                    "\t- {} {}\t{}/{}",
+                    "\t- {}\t{}/{}",
                     c.caller,
-                    c.category,
                     before - after,
                     before
                 );
@@ -340,8 +386,10 @@ impl Annotations {
             .filter(|anns| {
                 let contains = anns
                     .iter()
-                    .any(|item| matches!(item, Annotation::SoloTumor));
-                let contains_not = anns.iter().all(|item| !matches!(item, Annotation::Somatic));
+                    .any(|item| matches!(item, Annotation::Callers(_, Sample::SoloTumor)));
+                let contains_not = anns
+                    .iter()
+                    .all(|item| !matches!(item, Annotation::Callers(_, Sample::Somatic)));
 
                 contains && contains_not
             })
@@ -392,18 +440,35 @@ impl Annotations {
             )
     }
 
+    // pub fn low_shannon_entropy(&mut self, min_shannon_entropy: f64) {
+    //     self.store.iter_mut().for_each(|mut e| {
+    //         let anns = e.value_mut();
+    //         let mut is_low = false;
+    //         anns.iter().for_each(|ann| {
+    //             if let Annotation::ShannonEntropy(ent) = ann {
+    //                 if *ent < min_shannon_entropy
+    //                     && !anns.contains(&Annotation::Callers(_, Sample::Somatic))
+    //                 {
+    //                     is_low = true
+    //                 }
+    //             }
+    //         });
+    //         if is_low {
+    //             anns.push(Annotation::LowEntropy);
+    //         }
+    //     });
+    // }
+    //
+    /// Annotate low Shannon entropy for solo callers (not Somatic)
     pub fn low_shannon_entropy(&mut self, min_shannon_entropy: f64) {
         self.store.iter_mut().for_each(|mut e| {
             let anns = e.value_mut();
-            let mut is_low = false;
-            anns.iter().for_each(|ann| {
-                if let Annotation::ShannonEntropy(ent) = ann {
-                    if *ent < min_shannon_entropy && !anns.contains(&Annotation::Somatic) {
-                        is_low = true
-                    }
-                }
-            });
-            if is_low {
+            if anns.iter().any(|ann| {
+                matches!(ann, Annotation::ShannonEntropy(ent) if *ent < min_shannon_entropy)
+                    && !anns
+                        .iter()
+                        .any(|a| matches!(a, Annotation::Callers(_, Sample::Somatic)))
+            }) && !anns.contains(&Annotation::LowEntropy) {
                 anns.push(Annotation::LowEntropy);
             }
         });
@@ -411,5 +476,5 @@ impl Annotations {
 }
 
 pub trait CallerCat {
-    fn caller_cat(&self) -> (Caller, Annotation);
+    fn caller_cat(&self) -> Annotation;
 }

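Not part of the commit: a minimal round-trip sketch for the refactored Annotation::Callers(Caller, Sample) variant, assuming the Display and FromStr impls added to src/annotation/mod.rs above are in scope; the demo function is illustrative only.

use std::str::FromStr;

fn demo() -> anyhow::Result<()> {
    // Display renders the variant as "<caller> <sample>", e.g. "DeepVariant SoloTumor"
    let ann = Annotation::Callers(Caller::DeepVariant, Sample::SoloTumor);
    let s = ann.to_string();

    // FromStr splits on the space and parses each half back into Caller and Sample
    let parsed = Annotation::from_str(&s)?;
    assert_eq!(parsed, ann);
    Ok(())
}
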
+ 14 - 19
src/callers/clairs.rs

@@ -1,5 +1,5 @@
 use crate::{
-    annotation::{Annotation, Annotations, Caller, CallerCat},
+    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
     collection::{vcf::Vcf, Initialize},
     commands::bcftools::{bcftools_concat, bcftools_keep_pass, BcftoolsConfig},
     config::Config,
@@ -182,27 +182,26 @@ impl Run for ClairS {
 }
 
 impl CallerCat for ClairS {
-    fn caller_cat(&self) -> (Caller, Annotation) {
-        (Caller::ClairS, Annotation::Somatic)
+    fn caller_cat(&self) -> Annotation {
+        Annotation::Callers(Caller::ClairS, Sample::Somatic)
     }
 }
 
 impl Variants for ClairS {
     fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
-        let (caller, category) = self.caller_cat();
-        let add = vec![Annotation::Callers(caller.clone()), category.clone()];
+        let caller = self.caller_cat();
+        let add = vec![caller.clone()];
         info!(
-            "Loading variants from {} {}: {}",
-            caller, category, self.vcf_passed
+            "Loading variants from {}: {}",
+            caller, self.vcf_passed
         );
         let variants = read_vcf(&self.vcf_passed)?;
         variants.par_iter().for_each(|v| {
             annotations.insert_update(v.hash(), &add);
         });
         info!(
-            "{} {}, {} variants loaded.",
+            "{}, {} variants loaded.",
             caller,
-            category,
             variants.len()
         );
 
@@ -210,19 +209,17 @@ impl Variants for ClairS {
             variants,
             vcf: Vcf::new(self.vcf_passed.clone().into())?,
             caller,
-            category,
         })
     }
 }
 
 impl ClairS {
     pub fn germline(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
-        let caller = Caller::ClairS;
-        let category = Annotation::Germline;
-        let add = vec![Annotation::Callers(caller.clone()), category.clone()];
+        let caller = Annotation::Callers(Caller::ClairS, Sample::Germline);
+        let add = vec![caller.clone()];
         info!(
-            "Loading variants from {} {}: {}",
-            caller, category, self.vcf_passed
+            "Loading variants from {}: {}",
+            caller, self.vcf_passed
         );
 
         let variants = read_vcf(&self.clair3_germline_passed)?;
@@ -230,17 +227,15 @@ impl ClairS {
             annotations.insert_update(v.hash(), &add);
         });
         info!(
-            "{} {}, {} variants loaded.",
+            "{}, {} variants loaded.",
             caller,
-            category,
             variants.len()
         );
 
         Ok(VariantCollection {
             variants,
             vcf: Vcf::new(self.clair3_germline_passed.clone().into())?,
-            caller: Caller::ClairS,
-            category: Annotation::Germline,
+            caller,
         })
     }
 }

+ 8 - 10
src/callers/deep_somatic.rs

@@ -5,7 +5,7 @@ use log::info;
 use rayon::prelude::*;
 
 use crate::{
-    annotation::{Annotation, Annotations, Caller, CallerCat},
+    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
     collection::{vcf::Vcf, Initialize},
     commands::bcftools::{bcftools_keep_pass, BcftoolsConfig},
     config::Config,
@@ -127,34 +127,32 @@ impl Run for DeepSomatic {
 }
 
 impl CallerCat for DeepSomatic {
-    fn caller_cat(&self) -> (Caller, Annotation) {
-        (Caller::DeepSomatic, Annotation::Somatic)
+    fn caller_cat(&self) -> Annotation {
+        Annotation::Callers(Caller::DeepSomatic, Sample::Somatic)
     }
 }
 
 impl Variants for DeepSomatic {
     fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
-        let (caller, category) = self.caller_cat();
-        let add = vec![Annotation::Callers(caller.clone()), category.clone()];
+        let caller = self.caller_cat();
+        let add = vec![caller.clone()];
         info!(
-            "Loading variants from {} {}: {}",
-            caller, category, self.vcf_passed
+            "Loading variants from {}: {}",
+            caller, self.vcf_passed
         );
         let variants = read_vcf(&self.vcf_passed)?;
         variants.par_iter().for_each(|v| {
             annotations.insert_update(v.hash(), &add);
         });
         info!(
-            "{} {}, {} variants loaded.",
+            "{}, {} variants loaded.",
             caller,
-            category,
             variants.len()
         );
         Ok(VariantCollection {
             variants,
             vcf: Vcf::new(self.vcf_passed.clone().into())?,
             caller,
-            category,
         })
     }
 }

+ 11 - 14
src/callers/deep_variant.rs

@@ -4,7 +4,7 @@ use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use std::{fs, path::Path};
 
 use crate::{
-    annotation::{Annotation, Annotations, Caller, CallerCat},
+    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
     collection::{vcf::Vcf, InitializeSolo},
     commands::bcftools::{bcftools_keep_pass, BcftoolsConfig},
     config::Config,
@@ -125,46 +125,43 @@ impl Run for DeepVariant {
 }
 
 impl CallerCat for DeepVariant {
-    fn caller_cat(&self) -> (Caller, Annotation) {
+    fn caller_cat(&self) -> Annotation {
         let Config {
             normal_name,
             tumoral_name,
             ..
         } = &self.config;
-        let cat = if *normal_name == self.time {
-            Annotation::SoloConstit
+        let caller = if *normal_name == self.time {
+            Annotation::Callers(Caller::DeepVariant, Sample::SoloConstit)
         } else if *tumoral_name == self.time {
-            Annotation::SoloTumor
+            Annotation::Callers(Caller::DeepVariant, Sample::SoloTumor)
         } else {
             panic!("Error in time_point name: {}", self.time);
         };
-        (Caller::DeepVariant, cat)
+        caller
     }
 }
 
 impl Variants for DeepVariant {
     fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
-        let (caller, category) = self.caller_cat();
-        let add = vec![Annotation::Callers(caller.clone()), category.clone()];
+        let caller = self.caller_cat();
         info!(
-            "Loading variants from {} {}: {}",
-            caller, category, self.vcf_passed
+            "Loading variants from {}: {}",
+            caller, self.vcf_passed
         );
         let variants = read_vcf(&self.vcf_passed)?;
         variants.par_iter().for_each(|v| {
-            annotations.insert_update(v.hash(), &add);
+            annotations.insert_update(v.hash(), &vec![caller.clone()]);
         });
         info!(
-            "{} {}, {} variants loaded.",
+            "{}, {} variants loaded.",
             caller,
-            category,
             variants.len()
         );
         Ok(VariantCollection {
             variants,
             vcf: Vcf::new(self.vcf_passed.clone().into())?,
             caller,
-            category,
         })
     }
 }

+ 17 - 25
src/callers/nanomonsv.rs

@@ -9,7 +9,7 @@ use anyhow::Context;
 use log::{debug, error, info, warn};
 
 use crate::{
-    annotation::{Annotation, Annotations, Caller, CallerCat},
+    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
     collection::{vcf::Vcf, Initialize, InitializeSolo},
     commands::bcftools::{bcftools_concat, bcftools_keep_pass, BcftoolsConfig},
     config::Config,
@@ -117,18 +117,18 @@ impl Run for NanomonSV {
 }
 
 impl CallerCat for NanomonSV {
-    fn caller_cat(&self) -> (Caller, Annotation) {
-        (Caller::NanomonSV, Annotation::Somatic)
+    fn caller_cat(&self) -> Annotation {
+        Annotation::Callers(Caller::NanomonSV, Sample::Somatic)
     }
 }
 
 impl Variants for NanomonSV {
     fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
-        let (caller, category) = self.caller_cat();
-        let add = vec![Annotation::Callers(caller.clone()), category.clone()];
+        let caller = self.caller_cat();
+        let add = vec![caller.clone()];
         info!(
-            "Loading variants from {} {}: {}",
-            caller, category, self.vcf_passed
+            "Loading variants from {}: {}",
+            caller, self.vcf_passed
         );
 
         let variants = read_vcf(&self.vcf_passed)?;
@@ -137,16 +137,14 @@ impl Variants for NanomonSV {
             annotations.insert_update(v.hash(), &add);
         });
         info!(
-            "{} {}, {} variants loaded.",
+            "{}, {} variants loaded.",
             caller,
-            category,
             variants.len()
         );
         Ok(VariantCollection {
             variants,
             vcf: Vcf::new(self.vcf_passed.clone().into())?,
             caller,
-            category,
         })
     }
 }
@@ -244,20 +242,19 @@ impl Run for NanomonSVSolo {
 }
 
 impl CallerCat for NanomonSVSolo {
-    fn caller_cat(&self) -> (Caller, Annotation) {
+    fn caller_cat(&self) -> Annotation {
         let Config {
             normal_name,
             tumoral_name,
             ..
         } = &self.config;
-        let cat = if *normal_name == self.time_point {
-            Annotation::SoloConstit
+        if *normal_name == self.time_point {
+            Annotation::Callers(Caller::NanomonSVSolo, Sample::SoloConstit)
+        } else if *tumoral_name == self.time_point {
+            Annotation::Callers(Caller::NanomonSVSolo, Sample::SoloTumor)
+            Annotation::Callers(Caller::DeepSomatic, Sample::SoloTumor)
         } else {
             panic!("Error in time_point name: {}", self.time_point);
-        };
-        (Caller::NanomonSVSolo, cat)
+        }
     }
 }
 
@@ -265,8 +262,8 @@ impl RunnerVariants for NanomonSVSolo {}
 
 impl Variants for NanomonSVSolo {
     fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
-        let (caller, category) = self.caller_cat();
-        let add = vec![Annotation::Callers(caller.clone()), category.clone()];
+        let caller = self.caller_cat();
+        let add = vec![caller.clone()];
         info!(
             "Loading variant from ClairS {} with annotations: {:?}",
             self.id, add
@@ -274,21 +271,16 @@ impl Variants for NanomonSVSolo {
         let variants = read_vcf(&self.vcf_passed)?;
 
         variants.par_iter().for_each(|v| {
-            let var_cat = v.alteration_category();
+            // let var_cat = v.alteration_category();
             annotations.insert_update(
                 v.hash(),
-                &vec![
-                    Annotation::Callers(caller.clone()),
-                    category.clone(),
-                    Annotation::AlterationCategory(var_cat),
-                ],
+                &add,
             );
         });
         Ok(VariantCollection {
             variants,
             vcf: Vcf::new(self.vcf_passed.clone().into())?,
             caller,
-            category,
         })
     }
 }

+ 220 - 18
src/callers/savana.rs

@@ -1,12 +1,13 @@
 use crate::{
-    annotation::{Annotation, Annotations, Caller, CallerCat},
+    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
     collection::{vcf::Vcf, HasOutputs, Initialize, Version},
     commands::{
         bcftools::{bcftools_keep_pass, BcftoolsConfig},
         longphase::{LongphaseConfig, LongphaseHap, LongphasePhase},
     },
     config::Config,
-    io::vcf::read_vcf,
+    io::{readers::get_gz_reader, vcf::read_vcf},
+    positions::{num_to_contig, GenomeRange},
     runners::{run_wait, CommandRun, Run},
     variant::{
         variant::{RunnerVariants, Variants},
@@ -14,9 +15,10 @@ use crate::{
     },
 };
 use anyhow::Context;
+use itertools::Itertools;
 use log::info;
 use rayon::prelude::*;
-use std::{fs, path::Path};
+use std::{fs, io::BufRead, path::Path, str::FromStr};
 
 #[derive(Debug)]
 pub struct Savana {
@@ -185,36 +187,236 @@ impl Version for Savana {
 }
 
 impl CallerCat for Savana {
-    fn caller_cat(&self) -> (Caller, Annotation) {
-        (Caller::Savana, Annotation::Somatic)
+    fn caller_cat(&self) -> Annotation {
+        Annotation::Callers(Caller::Savana, Sample::Somatic)
     }
 }
 
 impl Variants for Savana {
     fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
-        let (caller, category) = self.caller_cat();
-        let add = vec![Annotation::Callers(caller.clone()), category.clone()];
-        info!(
-            "Loading variants from {} {}: {}",
-            caller, category, self.passed_vcf
-        );
+        let caller = self.caller_cat();
+        let add = vec![caller.clone()];
+        info!("Loading variants from {}: {}", caller, self.passed_vcf);
         let variants = read_vcf(&self.passed_vcf)?;
         variants.par_iter().for_each(|v| {
             annotations.insert_update(v.hash(), &add);
         });
-        info!(
-            "{} {}, {} variants loaded.",
-            caller,
-            category,
-            variants.len()
-        );
+        info!("{}, {} variants loaded.", caller, variants.len());
         Ok(VariantCollection {
             variants,
             vcf: Vcf::new(self.passed_vcf.clone().into())?,
             caller,
-            category,
         })
     }
 }
 
 impl RunnerVariants for Savana {}
+
+pub struct SavanaCNRow {
+    pub range: GenomeRange,
+    pub segment_id: String,
+    pub bin_count: u32,
+    pub sum_of_bin_lengths: u32,
+    pub weight: f32,
+    pub copy_number: f32,
+    pub minor_allele_copy_number: Option<f64>,
+    pub mean_baf: Option<f64>,
+    pub n_het_snps: u32,
+}
+
+impl FromStr for SavanaCNRow {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> anyhow::Result<Self> {
+        let cells: Vec<&str> = s.split("\t").collect();
+        let range = GenomeRange::from_1_inclusive(
+            cells
+                .first()
+                .context(format!("Error while parsing contig {s}."))?,
+            cells
+                .get(1)
+                .context(format!("Error while parsing start {s}."))?,
+            cells
+                .get(2)
+                .context(format!("Error while parsing end {s}."))?,
+        )
+        .map_err(|e| anyhow::anyhow!("Error while parsing range {s}.\n{e}"))?;
+
+        Ok(Self {
+            range,
+            segment_id: cells
+                .get(3)
+                .context(format!("Error while parsing segment_id {s}."))?
+                .to_string(),
+            bin_count: cells
+                .get(4)
+                .context(format!("Error while parsing bin_count {s}."))?
+                .parse()
+                .map_err(|e| anyhow::anyhow!("Error while parsing bin_count {s}.\n{e}"))?,
+            sum_of_bin_lengths: cells
+                .get(5)
+                .context(format!("Error while parsing sum_of_bin_lengths {s}."))?
+                .parse()
+                .map_err(|e| anyhow::anyhow!("Error while parsing sum_of_bin_lengths {s}.\n{e}"))?,
+            weight: cells
+                .get(6)
+                .context(format!("Error while parsing weight {s}."))?
+                .parse()
+                .map_err(|e| anyhow::anyhow!("Error while parsing weight {s}.\n{e}"))?,
+            copy_number: cells
+                .get(7)
+                .context(format!("Error while parsing copy_number {s}."))?
+                .parse()
+                .map_err(|e| anyhow::anyhow!("Error while parsing copy_number {s}.\n{e}"))?,
+            minor_allele_copy_number: cells
+                .get(8)
+                .context(format!("Error while parsing minor_allele_copy_number {s}."))?
+                .parse::<f64>()
+                .map(Some)
+                .unwrap_or(None),
+            mean_baf: cells
+                .get(9)
+                .context(format!("Error while parsing mean_baf {s}."))?
+                .parse::<f64>()
+                .map(Some)
+                .unwrap_or(None),
+            n_het_snps: cells
+                .get(10)
+                .context(format!("Error while parsing n_het_snps {s}."))?
+                .parse()
+                .map_err(|e| anyhow::anyhow!("Error while parsing n_het_snps {s}.\n{e}"))?,
+        })
+    }
+}
+
+pub struct SavanaCopyNumber {
+    pub data: Vec<SavanaCNRow>,
+}
+
+impl SavanaCopyNumber {
+    pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
+        let path = config.savana_copy_number(id);
+        let reader = get_gz_reader(&path)?;
+        let mut data = Vec::new();
+        for line in reader.lines() {
+            let line = line?;
+            if line.starts_with("chromosome") {
+                continue;
+            }
+            let cn: SavanaCNRow = line.parse()?;
+            data.push(cn);
+        }
+        Ok(Self { data })
+    }
+}
+
+// bin chromosome  start   end perc_known_bases    use_bin tumour_read_count   normal_read_count
+pub struct SavanaRCRow {
+    pub range: GenomeRange,
+    pub perc_known_bases: f32,
+    pub use_bin: bool,
+    pub tumour_read_count: u32,
+    pub normal_read_count: u32,
+}
+
+impl FromStr for SavanaRCRow {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> anyhow::Result<Self> {
+        let cells: Vec<&str> = s.split("\t").collect();
+        let bin = cells
+            .first()
+            .context(format!("Error while parsing range {s}."))?;
+
+        let range = bin
+            .split_once(':')
+            .and_then(|(a, b)| {
+                b.split_once('_').map(|(b, c)| {
+                    GenomeRange::from_1_inclusive(a, b, c)
+                        .context(format!("Error while parsing range {}", bin))
+                })
+            })
+            .context(format!("Invalid range format: {}", bin))??;
+
+        Ok(Self {
+            range,
+            perc_known_bases: cells
+                .get(4)
+                .context(format!("Error while parsing perc_known_bases {s}."))?
+                .parse()
+                .map_err(|e| anyhow::anyhow!("Error while parsing perc_known_bases {s}.\n{e}"))?,
+            use_bin: cells
+                .get(5)
+                .context(format!("Error while parsing use_bin {s}."))?
+                .to_lowercase()
+                .parse()
+                .map_err(|e| anyhow::anyhow!("Error while parsing use_bin {s}.\n{e}"))?,
+            tumour_read_count: cells
+                .get(6)
+                .context(format!("Error while parsing tumour_read_count {s}."))?
+                .parse()
+                .map_err(|e| anyhow::anyhow!("Error while parsing tumour_read_count {s}.\n{e}"))?,
+            normal_read_count: cells
+                .get(7)
+                .context(format!("Error while parsing normal_read_count {s}."))?
+                .parse()
+                .map_err(|e| anyhow::anyhow!("Error while parsing normal_read_count {s}.\n{e}"))?,
+        })
+    }
+}
+
+pub struct SavanaReadCounts {
+    pub data: Vec<SavanaRCRow>,
+}
+
+impl SavanaReadCounts {
+    pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
+        let path = config.savana_read_counts(id);
+        let reader = get_gz_reader(&path)?;
+        let mut data = Vec::new();
+        for line in reader.lines() {
+            let line = line?;
+            if line.starts_with("bin") {
+                continue;
+            }
+            let cn: SavanaRCRow = line.parse()?;
+            data.push(cn);
+        }
+        Ok(Self { data })
+    }
+
+    pub fn n_tumoral_reads(&self) -> usize {
+        self.data
+            .par_iter()
+            .map(|c| c.tumour_read_count as usize)
+            .sum::<usize>()
+    }
+
+    pub fn n_normal_reads(&self) -> usize {
+        self.data
+            .par_iter()
+            .map(|c| c.normal_read_count as usize)
+            .sum::<usize>()
+    }
+
+    pub fn norm_chr_counts(&self) -> Vec<(String, f64)> {
+        let n_tum = self.n_tumoral_reads();
+        let n_bins = self.data.len();
+        self.data
+            .iter()
+            .chunk_by(|c| c.range.contig)
+            .into_iter()
+            .map(|(contig_num, chr_counts)| {
+                let chr_counts: Vec<_> = chr_counts.collect();
+
+                let chr_n_bins = chr_counts.len();
+                let chr_counts = chr_counts
+                    .par_iter()
+                    .map(|c| c.tumour_read_count as usize)
+                    .sum::<usize>();
+
+                let r = chr_counts as f64 / ((chr_n_bins as f64 * n_tum as f64) / n_bins as f64);
+                (num_to_contig(contig_num), r)
+            }).collect()
+    }
+}

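Not part of the commit: the normalization computed by SavanaReadCounts::norm_chr_counts above, restated on made-up numbers. A chromosome's ratio is its observed tumour read count divided by the count expected if reads were spread uniformly over all bins; norm_ratio and demo are hypothetical helpers.

fn norm_ratio(chr_counts: usize, chr_n_bins: usize, n_tum: usize, n_bins: usize) -> f64 {
    // observed / expected, where expected = chr_n_bins * n_tum / n_bins
    chr_counts as f64 / ((chr_n_bins as f64 * n_tum as f64) / n_bins as f64)
}

fn demo() {
    // 1_000 bins genome-wide and 100_000 tumour reads: a chromosome covering
    // 100 bins is expected to hold 10_000 reads, so 12_000 observed gives 1.2.
    let r = norm_ratio(12_000, 100, 100_000, 1_000);
    assert!((r - 1.2).abs() < 1e-9);
}
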
+ 9 - 12
src/callers/severus.rs

@@ -1,5 +1,5 @@
 use crate::{
-    annotation::{Annotation, Annotations, Caller, CallerCat},
+    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
     collection::{vcf::Vcf, HasOutputs, Initialize, InitializeSolo, Version},
     commands::{
         bcftools::{bcftools_keep_pass_precise, BcftoolsConfig},
@@ -42,7 +42,6 @@ impl Initialize for Severus {
         let phased_germline_vcf = &config.constit_phased_vcf(id);
         if !Path::new(phased_germline_vcf).exists() {
             LongphasePhase::initialize(id, config.clone())?.run()?;
-            // anyhow::bail!("Could not find required phased VCF: {phased_germline_vcf}")
         }
         fs::create_dir_all(config.severus_output_dir(id))?;
 
@@ -56,7 +55,7 @@ impl Initialize for Severus {
 
 impl Run for Severus {
     fn run(&mut self) -> anyhow::Result<()> {
-        // TODO make that work
+        // TODO make that version check work
         info!("Running Severus v{}", Severus::version(&self.config)?);
 
         let id = &self.id;
@@ -150,19 +149,19 @@ impl Version for Severus {
 }
 
 impl CallerCat for Severus {
-    fn caller_cat(&self) -> (Caller, Annotation) {
-        (Caller::Severus, Annotation::Somatic)
+    fn caller_cat(&self) -> Annotation {
+        Annotation::Callers(Caller::Severus, Sample::Somatic)
     }
 }
 
 impl Variants for Severus {
     fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
         let vcf_passed = self.config.severus_passed_vcf(&self.id);
-        let (caller, category) = self.caller_cat();
-        let add = vec![Annotation::Callers(caller.clone()), category.clone()];
+        let caller = self.caller_cat();
+        let add = vec![caller.clone()];
         info!(
-            "Loading variants from {} {}: {}",
-            caller, category, vcf_passed
+            "Loading variants from {}: {}",
+            caller, vcf_passed
         );
 
         let variants = read_vcf(&vcf_passed)?;
@@ -171,16 +170,14 @@ impl Variants for Severus {
             annotations.insert_update(v.hash(), &add);
         });
         info!(
-            "{} {}, {} variants loaded.",
+            "{}, {} variants loaded.",
             caller,
-            category,
             variants.len()
         );
         Ok(VariantCollection {
             variants,
             vcf: Vcf::new(vcf_passed.into())?,
             caller,
-            category,
         })
     }
 }

+ 99 - 0
src/cases/case.rs

@@ -0,0 +1,99 @@
+use std::path::Path;
+
+use rusqlite::{Connection, OptionalExtension};
+
+use crate::config::Config;
+
+#[derive(Debug)]
+pub struct Case {
+    pub id: String,
+    pub n_runs: CaseType,
+}
+
+#[derive(Debug)]
+pub enum CaseType {
+    Paired { n_runs_diag: u8, n_runs_mrd: u8 },
+    Diag { n_runs: u8 },
+}
+
+pub struct DBCases {
+    conn: Connection,
+}
+
+impl DBCases {
+    pub fn new(config: &Config) -> anyhow::Result<Self> {
+        let conn = Connection::open(&config.db_cases_path)?;
+        Ok(DBCases { conn })
+    }
+
+    pub fn add_case(&self, case: &Case) -> anyhow::Result<()> {
+        match case.n_runs {
+            CaseType::Paired {
+                n_runs_diag,
+                n_runs_mrd,
+            } => {
+                self.conn.execute(
+                    "INSERT INTO cases (id, type, n_runs_diag, n_runs_mrd) VALUES (?1, ?2, ?3, ?4)",
+                    (&case.id, "Paired", n_runs_diag, n_runs_mrd),
+                )?;
+            }
+            CaseType::Diag { n_runs } => {
+                self.conn.execute(
+                    "INSERT INTO cases (id, type, n_runs) VALUES (?1, ?2, ?3)",
+                    (&case.id, "Diag", n_runs),
+                )?;
+            }
+        }
+        Ok(())
+    }
+
+    pub fn remove_case(&self, id: &str) -> anyhow::Result<()> {
+        self.conn.execute("DELETE FROM cases WHERE id = ?1", [id])?;
+        Ok(())
+    }
+
+    pub fn search_case(&self, id: &str) -> anyhow::Result<Option<Case>> {
+        let mut stmt = self
+            .conn
+            .prepare("SELECT id, type, n_runs_diag, n_runs_mrd, n_runs FROM cases WHERE id = ?1")?;
+        let case = stmt
+            .query_row([id], |row| {
+                let case_type: String = row.get(1)?;
+                let n_runs = match case_type.as_str() {
+                    "Paired" => CaseType::Paired {
+                        n_runs_diag: row.get(2)?,
+                        n_runs_mrd: row.get(3)?,
+                    },
+                    "Diag" => CaseType::Diag {
+                        n_runs: row.get(4)?,
+                    },
+                    _ => {
+                        return Err(rusqlite::Error::InvalidColumnType(
+                            1,
+                            "Unknown case type".into(),
+                            rusqlite::types::Type::Text,
+                        ))
+                    }
+                };
+                Ok(Case {
+                    id: row.get(0)?,
+                    n_runs,
+                })
+            }).optional()?;
+        Ok(case)
+    }
+
+    pub fn create_table(&self) -> anyhow::Result<()> {
+        self.conn.execute(
+            "CREATE TABLE IF NOT EXISTS cases (
+                id TEXT PRIMARY KEY,
+                type TEXT NOT NULL,
+                n_runs_diag INTEGER,
+                n_runs_mrd INTEGER,
+                n_runs INTEGER
+            )",
+            [],
+        )?;
+        Ok(())
+    }
+}

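Not part of the commit: a usage sketch for the new DBCases store defined in src/cases/case.rs above, assuming a Config whose db_cases_path points at a writable SQLite file; the case id is hypothetical.

fn demo(config: &Config) -> anyhow::Result<()> {
    let db = DBCases::new(config)?;
    db.create_table()?; // CREATE TABLE IF NOT EXISTS, so safe to call repeatedly

    db.add_case(&Case {
        id: "CASE-001".to_string(),
        n_runs: CaseType::Paired { n_runs_diag: 2, n_runs_mrd: 1 },
    })?;

    if let Some(found) = db.search_case("CASE-001")? {
        println!("{found:?}");
    }
    db.remove_case("CASE-001")?;
    Ok(())
}
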
+ 2 - 0
src/cases/mod.rs

@@ -0,0 +1,2 @@
+pub mod case;
+

+ 288 - 64
src/collection/bam.rs

@@ -8,35 +8,39 @@ use std::{
 use anyhow::{anyhow, Context};
 use chrono::{DateTime, Utc};
 use glob::glob;
-use log::warn;
+use log::{debug, info, warn};
 use pandora_lib_bindings::{
     progs::cramino::{Cramino, CraminoRes},
     utils::RunBin,
 };
+use rand::{thread_rng, Rng};
 use rayon::prelude::*;
-use rust_htslib::bam::{record::Cigar, Read};
+use rust_htslib::bam::{ext::BamRecordExtensions, record::Cigar, Read};
 use serde::{Deserialize, Serialize};
 
+use crate::config::Config;
+
 #[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct Bam {
+pub struct WGSBam {
     pub id: String,
     pub time_point: String,
     pub reference_genome: String,
-    pub bam_type: BamType,
+    // pub bam_type: BamType,
     pub path: PathBuf,
     pub modified: DateTime<Utc>,
-    pub cramino: Option<CraminoRes>,
+    pub bam_stats: WGSBamStats,
+    // pub cramino: Option<CraminoRes>,
     pub composition: Vec<(String, f64)>, // acquisition id
 }
 
-#[derive(Debug, PartialEq, Clone, Deserialize, Serialize)]
-pub enum BamType {
-    WGS,
-    Panel(String),
-    ChIP(String),
-}
-
-impl Bam {
+// #[derive(Debug, PartialEq, Clone, Deserialize, Serialize)]
+// pub enum BamType {
+//     WGS,
+//     Panel(String),
+//     ChIP(String),
+// }
+//
+impl WGSBam {
     pub fn new(path: PathBuf) -> anyhow::Result<Self> {
         let stem = path
             .clone()
@@ -46,7 +50,7 @@ impl Bam {
             .to_string();
         let stem: Vec<&str> = stem.split('_').collect();
 
-        if stem.len() > 4 || stem.len() < 3 {
+        if stem.len() != 3 {
             return Err(anyhow!("Error in bam name: {}", path.display()));
         }
 
@@ -58,16 +62,16 @@ impl Bam {
             .context("Can't get last from stem {stem}")?
             .to_string();
 
-        let bam_type = if stem.len() == 4 {
-            match stem[2] {
-                "oncoT" => BamType::Panel("oncoT".to_string()),
-                "H3K27ac" => BamType::ChIP("H3K27ac".to_string()),
-                "H3K4me3" => BamType::ChIP("H3K4me3".to_string()),
-                _ => return Err(anyhow!("Error in bam name: {}", path.display())),
-            }
-        } else {
-            BamType::WGS
-        };
+        // let bam_type = if stem.len() == 4 {
+        //     match stem[2] {
+        //         "oncoT" => BamType::Panel("oncoT".to_string()),
+        //         "H3K27ac" => BamType::ChIP("H3K27ac".to_string()),
+        //         "H3K4me3" => BamType::ChIP("H3K4me3".to_string()),
+        //         _ => return Err(anyhow!("Error in bam name: {}", path.display())),
+        //     }
+        // } else {
+        //     BamType::WGS
+        // };
         let file_metadata = fs::metadata(&path)?;
         let modified = file_metadata.modified()?;
 
@@ -85,40 +89,42 @@ impl Bam {
             return Self::load_json(&json_path);
         }
 
-        let cramino_path = format!(
-            "{}/{id}_{time_point}_{reference_genome}_cramino.txt",
-            tp_dir.to_string_lossy()
-        );
-
-        let cramino = if bam_type == BamType::WGS {
-            if !PathBuf::from_str(&cramino_path)?.exists() {
-                return Err(anyhow!("Cramino file missing {cramino_path}"));
-            }
-            let mut cramino = Cramino::default().with_result_path(&cramino_path);
-            cramino
-                .parse_results()
-                .context(format!("Error while parsing cramino for {cramino_path}"))?;
-
-            if let Some(cramino) = cramino.results {
-                Some(cramino)
-            } else {
-                return Err(anyhow!("Cramino results parsing failed"));
-            }
-        } else {
-            None
-        };
-
-        let composition = bam_compo(path.to_string_lossy().as_ref(), 20000).context(format!(
+        // let cramino_path = format!(
+        //     "{}/{id}_{time_point}_{reference_genome}_cramino.txt",
+        //     tp_dir.to_string_lossy()
+        // );
+
+        // let cramino = if bam_type == BamType::WGS {
+        //     if !PathBuf::from_str(&cramino_path)?.exists() {
+        //         // return Err(anyhow!("Cramino file missing {cramino_path}"));
+        //         Cramino::default().with_bam(path.to_str().unwrap())?;
+        //     }
+        //     let mut cramino = Cramino::default().with_result_path(&cramino_path);
+        //     cramino
+        //         .parse_results()
+        //         .context(format!("Error while parsing cramino for {cramino_path}"))?;
+        //
+        //     if let Some(cramino) = cramino.results {
+        //         Some(cramino)
+        //     } else {
+        //         return Err(anyhow!("Cramino results parsing failed"));
+        //     }
+        // } else {
+        //     None
+        // };
+
+        let composition = bam_compo(path.to_string_lossy().as_ref(), 20_000).context(format!(
             "Error while reading BAM composition for {}",
             path.display()
         ))?;
 
         let s = Self {
             path,
-            cramino,
+            bam_stats: WGSBamStats::new(&id, &time_point, Config::default())?,
+            // cramino,
             id: id.to_string(),
             time_point: time_point.to_string(),
-            bam_type,
+            // bam_type,
             reference_genome,
             composition,
             modified: modified.into(),
@@ -142,7 +148,7 @@ impl Bam {
 
 #[derive(Debug)]
 pub struct BamCollection {
-    pub bams: Vec<Bam>,
+    pub bams: Vec<WGSBam>,
 }
 
 impl BamCollection {
@@ -150,8 +156,8 @@ impl BamCollection {
         load_bam_collection(result_dir)
     }
 
-    pub fn by_acquisition_id(&self) -> HashMap<String, Vec<&Bam>> {
-        let mut acq: HashMap<String, Vec<&Bam>> = HashMap::new();
+    pub fn by_acquisition_id(&self) -> HashMap<String, Vec<&WGSBam>> {
+        let mut acq: HashMap<String, Vec<&WGSBam>> = HashMap::new();
         for bam in self.bams.iter() {
             for (acq_id, _) in bam.composition.iter() {
                 if let Some(entry) = acq.get_mut(acq_id) {
@@ -164,25 +170,30 @@ impl BamCollection {
         acq
     }
 
-    pub fn get(&self, id: &str, time_point: &str) -> Vec<&Bam> {
+    pub fn get(&self, id: &str, time_point: &str) -> Vec<&WGSBam> {
         self.bams
             .iter()
             .filter(|b| b.id == id && b.time_point == time_point)
             .collect()
     }
 
-    pub fn by_id_completed(&self, min_diag_cov: f32, min_mrd_cov: f32) -> Vec<Bam> {
+    pub fn by_id_completed(&self, min_diag_cov: f32, min_mrd_cov: f32) -> Vec<WGSBam> {
         self.bams
             .iter()
-            .filter(|b| matches!(b.bam_type, BamType::WGS))
-            .filter(|b| match &b.cramino {
-                Some(cramino) => match b.time_point.as_str() {
-                    "diag" => cramino.mean_length >= min_diag_cov as f64,
-                    "mrd" => cramino.mean_length >= min_mrd_cov as f64,
-                    _ => false,
-                },
+            // .filter(|b| matches!(b.bam_type, BamType::WGS))
+            .filter(|b| match b.time_point.as_str() {
+                "diag" => b.bam_stats.global_coverage >= min_diag_cov as f64,
+                "mrd" => b.bam_stats.global_coverage >= min_mrd_cov as f64,
                 _ => false,
             })
+            // .filter(|b| match &b.cramino {
+            //     Some(cramino) => match b.time_point.as_str() {
+            //         "diag" => cramino.mean_length >= min_diag_cov as f64,
+            //         "mrd" => cramino.mean_length >= min_mrd_cov as f64,
+            //         _ => false,
+            //     },
+            //     _ => false,
+            // })
             .cloned()
             .collect()
     }
@@ -199,10 +210,10 @@ pub fn load_bam_collection(result_dir: &str) -> BamCollection {
     let pattern = format!("{}/*/*/*.bam", result_dir);
     let bams = glob(&pattern)
         .expect("Failed to read glob pattern")
-        .par_bridge()
+        // .par_bridge()
         .filter_map(|entry| {
             match entry {
-                Ok(path) => match Bam::new(path) {
+                Ok(path) => match WGSBam::new(path) {
                     Ok(bam) => return Some(bam),
                     Err(e) => warn!("{e}"),
                 },
@@ -235,6 +246,77 @@ pub fn bam_compo(file_path: &str, sample_size: usize) -> anyhow::Result<Vec<(Str
         .collect())
 }
 
+pub struct BamReadsSampler {
+    pub positions: Vec<(String, u64)>,
+    pub reader: rust_htslib::bam::IndexedReader,
+    current_index: usize,
+}
+
+impl BamReadsSampler {
+    pub fn new(path: &str, n: usize) -> anyhow::Result<Self> {
+        debug!("Reading BAM file: {path}");
+        let reader = rust_htslib::bam::IndexedReader::from_path(path)?;
+        let header = reader.header();
+
+        let contig_len: Vec<(String, u64)> = header
+            .target_names()
+            .into_iter()
+            .map(|target_name| {
+                let tid = header.tid(target_name).unwrap();
+                (
+                    String::from_utf8(target_name.to_vec()).unwrap(),
+                    header.target_len(tid).unwrap(),
+                )
+            })
+            .collect();
+
+        let positions = sample_random_positions(&contig_len, n);
+
+        Ok(Self {
+            positions,
+            reader,
+            current_index: 0,
+        })
+    }
+}
+
+impl Iterator for BamReadsSampler {
+    type Item = Result<rust_htslib::bam::Record, rust_htslib::errors::Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if self.current_index < self.positions.len() {
+                let (contig, pos) = &self.positions[self.current_index];
+
+                match self.reader.fetch((contig, *pos, pos + 1)) {
+                    Ok(_) => (),
+                    Err(e) => warn!("Error while reading BAM {e}"),
+                }
+
+                let result = self.reader.records().next();
+
+                self.current_index += 1;
+                if result.is_some() {
+                    return result;
+                }
+            } else {
+                return None;
+            }
+        }
+    }
+}
+
+pub fn n_mapped_reads(bam_path: &str) -> anyhow::Result<u64> {
+    let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path)?;
+    let mapped = bam
+        .index_stats()?
+        .into_iter()
+        .filter(|(tid, _, _, _)| *tid < 24)
+        .map(|(_, _, m, _)| m)
+        .sum();
+    Ok(mapped)
+}
+
 // 0-based inclusive
 pub fn nt_pileup(
     bam: &mut rust_htslib::bam::IndexedReader,
@@ -434,3 +516,145 @@ pub fn counts_ins_at(
     }
     Ok(counts)
 }
+
+pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Vec<(String, u64)> {
+    let mut rng = thread_rng();
+    let total_length: u64 = chromosomes.iter().map(|(_, length)| length).sum();
+
+    (0..n)
+        .map(|_| {
+            let random_position = rng.gen_range(0..total_length);
+            let mut cumulative_length = 0;
+
+            for (chr, length) in chromosomes {
+                cumulative_length += length;
+                if random_position < cumulative_length {
+                    let position_within_chr = random_position - (cumulative_length - length);
+                    return (chr.clone(), position_within_chr);
+                }
+            }
+
+            unreachable!("Should always find a chromosome")
+        })
+        .collect()
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct WGSBamStats {
+    pub all_records: u128,
+    pub n_reads: u128,
+    pub mapped_yield: u128,
+    pub global_coverage: f64,
+    pub karyotype: Vec<(i32, u64, String, u128)>,
+}
+
+impl WGSBamStats {
+    pub fn new(id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
+        let bam_path = config.solo_bam(id, time);
+        let Config {
+            bam_min_mapq,
+            bam_n_threads,
+            ..
+        } = config;
+        let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
+        let h = bam.header().clone();
+        let header = rust_htslib::bam::Header::from_template(&h);
+        bam.set_threads(bam_n_threads as usize)?;
+
+        let mut mapped_lens = Vec::new();
+        let mut all_records = 0;
+        let mut n_reads = 0;
+        let mut lengths_by_tid: HashMap<i32, Vec<u128>> = HashMap::new();
+
+        for read in bam
+            .rc_records()
+            .map(|x| x.expect("Failure parsing Bam file"))
+            .inspect(|_| all_records += 1)
+            .filter(|read| {
+                read.flags()
+                    & (rust_htslib::htslib::BAM_FUNMAP
+                        | rust_htslib::htslib::BAM_FSECONDARY
+                        | rust_htslib::htslib::BAM_FQCFAIL
+                        | rust_htslib::htslib::BAM_FDUP) as u16
+                    == 0
+            })
+            .filter(|r| r.mapq() >= bam_min_mapq)
+        {
+            n_reads += 1;
+            let mapped_len = (read.reference_end() - read.pos()) as u128;
+            mapped_lens.push(mapped_len);
+            lengths_by_tid
+                .entry(read.tid())
+                .or_default()
+                .push(mapped_len);
+            if n_reads % 500_000 == 0 {
+                info!("{n_reads} reads parsed");
+            }
+        }
+
+        let mapped_yield = mapped_lens.par_iter().sum::<u128>();
+        let genome = get_genome_sizes(&header)?;
+        let genome_size = genome.values().map(|v| *v as u128).sum::<u128>();
+        let global_coverage = mapped_yield as f64 / genome_size as f64;
+
+        let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid
+            .iter()
+            .map(|(tid, lengths)| {
+                let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec()).unwrap();
+                (
+                    *tid,
+                    *genome.get(&contig).unwrap(),
+                    contig,
+                    lengths.par_iter().sum::<u128>(),
+                )
+            })
+            .collect();
+
+        karyotype.sort_by(|(ac, _, _, _), (bc, _, _, _)| ac.cmp(bc));
+
+        Ok(Self {
+            all_records,
+            n_reads,
+            mapped_yield,
+            global_coverage,
+            karyotype,
+        })
+    }
+
+    pub fn print(&self) {
+        println!("N records\t{}", self.all_records);
+        println!("N counted reads\t{}", self.n_reads);
+
+        println!("Mapped yield [Gb]\t{:.2}", self.mapped_yield as f64 / 1e9);
+        println!("Mean coverage\t{:.2}", self.global_coverage);
+        self.karyotype
+            .iter()
+            .for_each(|(_, contig_len, contig, contig_yield)| {
+                println!(
+                    "{contig}\t{:.2}",
+                    (*contig_yield as f64 / *contig_len as f64) / self.global_coverage
+                );
+            });
+    }
+}
+
+pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
+    let mut genome = HashMap::new();
+    for (key, records) in header.to_hashmap() {
+        for record in records {
+            if key == "SQ" {
+                genome.insert(
+                    record["SN"].to_string(),
+                    record["LN"]
+                        .parse::<u64>()
+                        .expect("Failed parsing length of chromosomes"),
+                );
+            }
+        }
+    }
+    Ok(genome)
+}
+
+// fn softclipped_bases(read: &rust_htslib::bam::Record) -> u128 {
+//     (read.cigar().leading_softclips() + read.cigar().trailing_softclips()) as u128
+// }
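
A minimal usage sketch of the sampling helpers added above. The BAM path, contig length, and counts are placeholders, and an index (.bai/.csi) next to the BAM is assumed, since both helpers open an IndexedReader:

    fn sampling_sketch() -> anyhow::Result<()> {
        // Uniform random positions, weighted by contig length.
        let contigs = vec![("chr1".to_string(), 248_000_000u64)];
        println!("{:?}", sample_random_positions(&contigs, 5));

        // One record fetched at each of 1_000 random positions.
        let sampler = BamReadsSampler::new("/data/example/sample.bam", 1_000)?;
        let n_sampled = sampler.filter_map(|rec| rec.ok()).count();

        // Mapped records on the first 24 contigs (tid < 24), taken from the index stats.
        let n_mapped = n_mapped_reads("/data/example/sample.bam")?;
        println!("sampled {n_sampled} of {n_mapped} mapped reads");
        Ok(())
    }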

+ 1 - 1
src/collection/mod.rs

@@ -512,7 +512,7 @@ impl Collections {
     /// * `Vec<(bam::Bam, bam::Bam)>` - A vector of tuples, each containing a pair of BAM files
     ///   (diagnostic and MRD) for a unique ID.
     ///
-    pub fn bam_pairs(&self) -> Vec<(bam::Bam, bam::Bam)> {
+    pub fn bam_pairs(&self) -> Vec<(bam::WGSBam, bam::WGSBam)> {
         let mut ids: Vec<String> = self.bam.bams.iter().map(|b| b.id.clone()).collect();
         ids.sort();
         ids.dedup();

+ 1 - 0
src/commands/dorado.rs

@@ -60,6 +60,7 @@ impl Dorado {
             case,
         })
     }
+
     fn create_reference_mmi(&self) -> anyhow::Result<()> {
         if !std::path::Path::new(&self.config.align.ref_mmi).exists() {
             cmd!(

+ 46 - 2
src/config.rs

@@ -5,12 +5,15 @@ pub struct Config {
     pub align: AlignConfig,
     pub reference: String,
     pub reference_name: String,
+    pub dict_file: String,
     pub savana_bin: String,
     pub savana_threads: u8,
     pub tumoral_name: String,
     pub normal_name: String,
     pub haplotagged_bam_tag_name: String,
     pub savana_output_dir: String,
+    pub savana_copy_number: String,
+    pub savana_read_counts: String,
     pub germline_phased_vcf: String,
     pub savana_passed_vcf: String,
     pub conda_sh: String,
@@ -38,6 +41,10 @@ pub struct Config {
     pub deepsomatic_threads: u8,
     pub deepsomatic_bin_version: String,
     pub deepsomatic_model_type: String,
+    pub bam_min_mapq: u8,
+    pub bam_n_threads: u8,
+    pub db_cases_path: String,
+    pub somatic_pipe_stats: String,
 
     pub clairs_threads: u8,
     pub clairs_force: bool,
@@ -74,6 +81,7 @@ impl Default for Config {
             // Reference genome
             reference: "/data/ref/hs1/chm13v2.0.fa".to_string(),
             reference_name: "hs1".to_string(),
+            dict_file: "/data/ref/hs1/chm13v2.0.dict".to_string(),
 
             // File structure
             result_dir: "/data/longreads_basic_pipe".to_string(),
@@ -82,6 +90,11 @@ impl Default for Config {
             normal_name: "mrd".to_string(),
             haplotagged_bam_tag_name: "HP".to_string(),
 
+            bam_min_mapq: 40,
+            bam_n_threads: 150,
+
+            db_cases_path: "/data/cases.sqlite".to_string(),
+
             //
             mask_bed: "{result_dir}/{id}/diag/mask.bed".to_string(),
 
@@ -90,6 +103,9 @@ impl Default for Config {
             .to_string(),
             conda_sh: "/data/miniconda3/etc/profile.d/conda.sh".to_string(),
 
+            somatic_pipe_stats: "{result_dir}/{id}/diag/somatic_pipe_stats"
+                .to_string(),
+
             // DeepVariant
             deepvariant_output_dir: "{result_dir}/{id}/{time}/DeepVariant".to_string(),
             deepvariant_threads: 155,
@@ -114,6 +130,8 @@ impl Default for Config {
             savana_threads: 150,
             savana_output_dir: "{result_dir}/{id}/diag/savana".to_string(),
             savana_passed_vcf: "{output_dir}/{id}_diag_savana_PASSED.vcf.gz".to_string(),
+            savana_copy_number: "{output_dir}/{id}_diag_{reference_name}_{haplotagged_bam_tag_name}_segmented_absolute_copy_number.tsv".to_string(),
+            savana_read_counts: "{output_dir}/{id}_diag_{reference_name}_{haplotagged_bam_tag_name}_raw_read_counts.tsv".to_string(),
             savana_force: false,
 
             // Severus
@@ -170,7 +188,7 @@ pub struct AlignConfig {
 impl Default for AlignConfig {
     fn default() -> Self {
         Self {
-            dorado_bin: "/data/tools/dorado-0.9.0-linux-x64/bin/dorado".to_string(),
+            dorado_bin: "/data/tools/dorado-0.9.1-linux-x64/bin/dorado".to_string(),
             dorado_basecall_arg: "-x 'cuda:0,1,2,3' sup,5mC_5hmC".to_string(), // since v0.8.0 need
             // to specify cuda devices (exclude the T1000)
             ref_fa: "/data/ref/hs1/chm13v2.0.fa".to_string(),
@@ -258,6 +276,12 @@ impl Config {
             .replace("{id}", id)
     }
 
+    pub fn somatic_pipe_stats(&self, id: &str) -> String {
+        self.somatic_pipe_stats
+            .replace("{result_dir}", &self.result_dir)
+            .replace("{id}", id)
+    }
+
     // DeepVariant
     pub fn deepvariant_output_dir(&self, id: &str, time: &str) -> String {
         self.deepvariant_output_dir
@@ -353,7 +377,10 @@ impl Config {
     pub fn savana_output_vcf(&self, id: &str) -> String {
         let output_dir = self.savana_output_dir(id);
 
-        format!("{output_dir}/{id}_{}_{}_{}.classified.somatic.vcf", self.tumoral_name, self.reference_name, self.haplotagged_bam_tag_name)
+        format!(
+            "{output_dir}/{id}_{}_{}_{}.classified.somatic.vcf",
+            self.tumoral_name, self.reference_name, self.haplotagged_bam_tag_name
+        )
     }
 
     pub fn savana_passed_vcf(&self, id: &str) -> String {
@@ -362,6 +389,23 @@ impl Config {
             .replace("{id}", id)
     }
 
+    // {output_dir}/{id}_diag_{reference_name}_{haplotagged_bam_tag_name}
+    pub fn savana_read_counts(&self, id: &str) -> String {
+        self.savana_read_counts
+            .replace("{output_dir}", &self.savana_output_dir(id))
+            .replace("{id}", id)
+            .replace("{reference_name}", &self.reference_name)
+            .replace("{haplotagged_bam_tag_name}", &self.haplotagged_bam_tag_name)
+    }
+
+    pub fn savana_copy_number(&self, id: &str) -> String {
+        self.savana_copy_number
+            .replace("{output_dir}", &self.savana_output_dir(id))
+            .replace("{id}", id)
+            .replace("{reference_name}", &self.reference_name)
+            .replace("{haplotagged_bam_tag_name}", &self.haplotagged_bam_tag_name)
+    }
+
     // Severus
     pub fn severus_output_dir(&self, id: &str) -> String {
         self.severus_output_dir

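
A small sketch of how the new SAVANA path accessors expand with the defaults above. The id "EXAMPLE" is a placeholder, and savana_output_dir(id) is assumed to substitute {result_dir} and {id} the same way the other path helpers do:

    fn savana_paths_sketch() {
        let config = Config::default();
        // With result_dir "/data/longreads_basic_pipe", reference_name "hs1" and tag name "HP",
        // these should expand to something like:
        //   .../EXAMPLE/diag/savana/EXAMPLE_diag_hs1_HP_raw_read_counts.tsv
        //   .../EXAMPLE/diag/savana/EXAMPLE_diag_hs1_HP_segmented_absolute_copy_number.tsv
        println!("{}", config.savana_read_counts("EXAMPLE"));
        println!("{}", config.savana_copy_number("EXAMPLE"));
    }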
+ 53 - 4
src/io/readers.rs

@@ -1,7 +1,11 @@
-use std::{fs::File, io::BufReader};
+use std::{
+    fs::File,
+    io::{BufReader, Read, Write},
+    path::Path,
+};
 
 use anyhow::Context;
-use bgzip::BGZFReader;
+use bgzip::{BGZFReader, BGZFWriter, Compression};
 use log::debug;
 
 pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
@@ -11,7 +15,7 @@ pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
         .collect::<Vec<&str>>()
         .last()
         .context(format!("Can't parse {path}"))?;
-    assert!(file_type == "gz" || file_type == "vcf" || file_type == "bed");
+    assert!(file_type == "gz" || file_type == "vcf" || file_type == "bed" || file_type == "tsv");
 
     let raw_reader: Box<dyn std::io::Read> = Box::new(File::open(path)?);
 
@@ -20,9 +24,54 @@ pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
             let reader = Box::new(BGZFReader::new(raw_reader)?);
             Ok(Box::new(BufReader::new(reader)))
         }
-        "vcf" | "bed" => Ok(Box::new(BufReader::new(raw_reader))),
+        "vcf" | "bed" | "tsv" => Ok(Box::new(BufReader::new(raw_reader))),
         t => {
             panic!("unknown file type: {}", t)
         }
     }
 }
+
+pub fn get_gz_reader(path: &str) -> anyhow::Result<BGZFReader<File>> {
+    debug!("Reading: {path}");
+    let file_type = *path
+        .split(".")
+        .collect::<Vec<&str>>()
+        .last()
+        .context(format!("Can't parse {path}."))?;
+
+    let path = if file_type != "gz" {
+        compress_to_bgzip(path)?
+    } else {
+        path.to_string()
+    };
+
+    let reader = File::open(path)?;
+    Ok(BGZFReader::new(reader)?)
+}
+
+pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
+    let output_path = format!("{}.gz", input_path);
+
+    if Path::new(&output_path).exists() {
+        return Ok(output_path);
+    }
+
+    debug!("Compressing {input_path}");
+    let input_file = File::open(input_path)?;
+    let mut reader = BufReader::new(input_file);
+
+    let output_file = File::create(&output_path)?;
+    let mut writer = BGZFWriter::new(output_file, Compression::default());
+
+    let mut buffer = [0; 8192];
+    loop {
+        let bytes_read = reader.read(&mut buffer)?;
+        if bytes_read == 0 {
+            break;
+        }
+        writer.write_all(&buffer[..bytes_read])?;
+    }
+
+    writer.close()?;
+    Ok(output_path)
+}
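
A short usage sketch of the new readers: get_gz_reader BGZF-compresses a plain file into a sibling "<path>.gz" on first use (via compress_to_bgzip) and then reads the compressed copy. The TSV path is a placeholder:

    use std::io::BufRead;

    fn reader_sketch() -> anyhow::Result<()> {
        let reader = get_gz_reader("/data/example/raw_read_counts.tsv")?;
        let n_lines = std::io::BufReader::new(reader).lines().count();
        println!("{n_lines} lines");
        Ok(())
    }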

+ 73 - 8
src/lib.rs

@@ -13,7 +13,7 @@ pub mod io;
 pub mod pipes;
 pub mod positions;
 pub mod annotation;
-// pub mod vcf_reader;
+pub mod cases;
 
 #[macro_use]
 extern crate lazy_static;
@@ -28,23 +28,22 @@ mod tests {
     use std::{collections::HashMap, fs, path::Path};
 
     use annotation::{vep::{VepLine, VEP}, Annotations};
-    use callers::{nanomonsv::nanomonsv_create_pon, savana::Savana, severus::{Severus, SeverusSolo}};
-    use collection::{bam::{counts_at, counts_ins_at, nt_pileup}, Initialize, InitializeSolo, Version};
+    use callers::{nanomonsv::nanomonsv_create_pon, savana::{Savana, SavanaCopyNumber, SavanaReadCounts}, severus::{Severus, SeverusSolo}};
+    use collection::{bam::{counts_at, counts_ins_at, nt_pileup, WGSBamStats}, pod5::{Pod5, Pod5Config}, Initialize, InitializeSolo, Version};
     use commands::{longphase::{LongphaseConfig, LongphaseHap, LongphaseModcallSolo, LongphasePhase}, modkit::{bed_methyl, ModkitConfig}};
     use functions::assembler::{Assembler, AssemblerConfig};
     use helpers::estimate_shannon_entropy;
     use io::bed::read_bed;
-    use log::info;
+    use log::{error, info};
     use pipes::somatic::Somatic;
     use positions::{overlaps_par, GenomePosition, GenomeRange};
-    // use pandora_lib_assembler::assembler::AssembleConfig;
     use rayon::prelude::*;
     use runners::Run;
     use variant::variant::{VcfVariant, Variants};
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}}, collection::{bam::{self, BamType}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado};
+    use crate::{callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}}, collection::{bam, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -87,7 +86,7 @@ mod tests {
         bam_collection
             .bams
             .iter()
-            .filter(|b| matches!(b.bam_type, BamType::Panel(_)))
+            // .filter(|b| matches!(b.bam_type, BamType::Panel(_)))
             .for_each(|b| println!("{b:#?}"));
 
         let u = bam_collection.get("PARACHINI", "mrd");
@@ -333,7 +332,21 @@ mod tests {
     #[test]
     fn run_savana() -> anyhow::Result<()> {
         init();
-        Savana::initialize("LEVASSEUR", Config::default())?.run()
+        let collections = Collections::new(CollectionsConfig::default())?;
+        for bam in collections.bam.by_id_completed(15.0, 10.0).iter() {
+            let id = &bam.id;
+            match ClairS::initialize(id, Config::default())?.run() {
+                Ok(_) => match Savana::initialize(id, Config::default())?.run() {
+                    Ok(_) => (),
+                    Err(e) => error!("{e}"),
+                },
+                Err(e) => error!("{e}"),
+            };
+        }
+        Ok(())
     }
 
     #[test]
@@ -569,4 +582,56 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn savana_cn() -> anyhow::Result<()> {
+        init();
+        let id = "CAMARA";
+        let _cn = SavanaCopyNumber::load_id(id, Config::default())?;
+        let s = SavanaReadCounts::load_id(id, Config::default())?;
+        println!("tumoral reads: {}", s.n_tumoral_reads());
+        println!("normal reads: {}", s.n_normal_reads());
+        println!("tumoral:\n{:#?}", s.norm_chr_counts());
+        Ok(())
+    }
+
+    #[test]
+    fn coverage() -> anyhow::Result<()> {
+        init();
+        let id = "CAMARA";
+        let time = "diag";
+        WGSBamStats::new(id, time, Config::default())?.print();
+        Ok(())
+    }
+
+    #[test]
+    fn tar() {
+        init();
+        let mut ar = tar::Archive::new(fs::File::open("/data/lto/20241030_CUNVI-DG-N20_SEBZA-DG-N21.tar").unwrap());
+
+        for file in ar.entries().unwrap() {
+            let file = file.unwrap();
+            let p = String::from_utf8(file.path_bytes().into_owned()).unwrap();
+            if p.contains("/pod5/") {
+                let u = Pod5::from_path(&file.path().unwrap().into_owned(), &Pod5Config::default());
+                println!("{p}\n{u:#?}");
+            }
+        }
+    }
+
+    #[test]
+    fn run_somatic() -> anyhow::Result<()> {
+        init();
+        let collections = Collections::new(CollectionsConfig::default())?;
+        for bam in collections.bam.by_id_completed(15.0, 10.0).iter() {
+            let id = &bam.id;
+            match Somatic::initialize(id, Config::default())?.run() {
+                Ok(_) => (),
+                Err(e) => error!("{e}"),
+            };
+        }
+        Ok(())
+    }
 }

+ 163 - 146
src/pipes/somatic.rs

@@ -1,8 +1,15 @@
-use log::{info, kv::Source};
-use std::{collections::HashMap, fs::File, sync::Arc};
+use itertools::Itertools;
+use log::info;
+use std::{
+    collections::HashMap,
+    fs::{self, File},
+    io::Write,
+    path::Path,
+    sync::Arc,
+};
 
 use crate::{
-    annotation::{Annotation, Annotations, AnnotationsStats, Caller},
+    annotation::{Annotation, Annotations, AnnotationsStats, Sample},
     callers::{
         clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV,
         savana::Savana, severus::Severus,
@@ -46,27 +53,27 @@ pub struct SomaticStats {
 
 #[derive(Debug, Default, Clone)]
 pub struct InputStats {
-    pub solo_tumor: Vec<(Caller, usize)>,
-    pub solo_constit: Vec<(Caller, usize)>,
-    pub germline: Vec<(Caller, usize)>,
-    pub somatic: Vec<(Caller, usize)>,
+    pub solo_tumor: Vec<(Annotation, usize)>,
+    pub solo_constit: Vec<(Annotation, usize)>,
+    pub germline: Vec<(Annotation, usize)>,
+    pub somatic: Vec<(Annotation, usize)>,
 }
 
 impl InputStats {
     pub fn from_collections(collections: &[VariantCollection]) -> Self {
         let mut stats = Self::default();
         for collection in collections.iter() {
-            match collection.category {
-                Annotation::SoloTumor => stats
+            match collection.caller {
+                Annotation::Callers(_, Sample::SoloTumor) => stats
                     .solo_tumor
                     .push((collection.caller.clone(), collection.variants.len())),
-                Annotation::SoloConstit => stats
+                Annotation::Callers(_, Sample::SoloConstit) => stats
                     .solo_constit
                     .push((collection.caller.clone(), collection.variants.len())),
-                Annotation::Germline => stats
+                Annotation::Callers(_, Sample::Germline) => stats
                     .germline
                     .push((collection.caller.clone(), collection.variants.len())),
-                Annotation::Somatic => stats
+                Annotation::Callers(_, Sample::Somatic) => stats
                     .somatic
                     .push((collection.caller.clone(), collection.variants.len())),
                 _ => (),
@@ -83,7 +90,8 @@ impl SomaticStats {
             ..Default::default()
         }
     }
-    pub fn annot_init(&self, stats: AnnotationsStats) {
+
+    pub fn annot_init(&self, stats: &AnnotationsStats, json_path: &str) -> anyhow::Result<()> {
         let stats: Vec<(Vec<Annotation>, u64)> = stats
             .categorical
             .iter()
@@ -102,12 +110,12 @@ impl SomaticStats {
             self.input
                 .somatic
                 .iter()
-                .map(|(caller, _)| vec![Annotation::Callers(caller.clone()), Annotation::Somatic])
-                .collect::<Vec<Vec<Annotation>>>(),
+                .map(|(caller, _)| caller.clone())
+                .collect::<Vec<Annotation>>(),
             self.input
                 .solo_tumor
                 .iter()
-                .map(|(caller, _)| vec![Annotation::Callers(caller.clone()), Annotation::SoloTumor])
+                .map(|(caller, _)| caller.clone())
                 .collect(),
         ]
         .concat();
@@ -116,31 +124,30 @@ impl SomaticStats {
             self.input
                 .germline
                 .iter()
-                .map(|(caller, _)| vec![Annotation::Callers(caller.clone()), Annotation::Germline])
-                .collect::<Vec<Vec<Annotation>>>(),
+                .map(|(caller, _)| caller.clone())
+                .collect::<Vec<Annotation>>(),
             self.input
                 .solo_constit
                 .iter()
-                .map(|(caller, _)| {
-                    vec![Annotation::Callers(caller.clone()), Annotation::SoloConstit]
-                })
+                .map(|(caller, _)| caller.clone())
                 .collect(),
         ]
         .concat();
 
         let mut with_germline: HashMap<String, HashMap<String, u64>> = HashMap::new();
         stats.iter().for_each(|(anns, v)| {
-            if anns
-                .iter()
-                .any(|a| matches!(a, Annotation::SoloConstit | Annotation::Germline))
-            {
+            if anns.iter().any(|a| {
+                matches!(
+                    a,
+                    Annotation::Callers(_, Sample::SoloConstit)
+                        | Annotation::Callers(_, Sample::Germline)
+                )
+            }) {
                 let n_by_tumor: Vec<(String, u64)> = callers_somatic_solo_tumor
                     .iter()
                     .flat_map(|tumor| {
-                        if tumor.iter().all(|a| anns.contains(a)) {
-                            let tum_call =
-                                format!("{} {}", tumor.first().unwrap(), tumor.get(1).unwrap());
-                            vec![(tum_call, *v)]
+                        if anns.contains(tumor) {
+                            vec![(tumor.to_string(), *v)]
                         } else {
                             vec![]
                         }
@@ -150,10 +157,8 @@ impl SomaticStats {
                 let mut germline_caller: Vec<String> = callers_germline_solo_constit
                     .iter()
                     .flat_map(|germ| {
-                        if germ.iter().all(|a| anns.contains(a)) {
-                            let germ_call =
-                                format!("{} {}", germ.first().unwrap(), germ.get(1).unwrap());
-                            vec![germ_call]
+                        if anns.contains(germ) {
+                            vec![germ.to_string()]
                         } else {
                             vec![]
                         }
@@ -162,45 +167,67 @@ impl SomaticStats {
                 germline_caller.sort();
                 let germline_caller = germline_caller.join(" + ");
 
-                
                 n_by_tumor.iter().for_each(|(tumoral_caller, n)| {
                     if let Some(row) = with_germline.get_mut(tumoral_caller) {
-                        // germline_caller.iter().for_each(|germline_caller| {
-                            if tumoral_caller == "ClairS Somatic" {
-                                println!("{tumoral_caller} {germline_caller} {n}");
-                            }
-                            if let Some(col) = row.get_mut(&germline_caller) {
-                                *col += *n;
-                            } else {
-                                row.insert(germline_caller.to_string(), *n);
-                            }
-                        // });
+                        if let Some(col) = row.get_mut(&germline_caller) {
+                            *col += *n;
+                        } else {
+                            row.insert(germline_caller.to_string(), *n);
+                        }
                     } else {
                         let mut row = HashMap::new();
-                        // germline_caller.iter().for_each(|germline_caller| {
-                            row.insert(germline_caller.to_string(), *n);
-                        // });
+                        row.insert(germline_caller.to_string(), *n);
                         with_germline.insert(tumoral_caller.to_string(), row);
                     }
                 });
             }
         });
 
-        let mut germlines_callers: Vec<String> = with_germline.iter().flat_map(|(_, r)| {
-            r.iter().map(|(k,_)| k.to_string()).collect::<Vec<String>>()
-        }).collect();
+        let mut germlines_callers: Vec<String> = with_germline
+            .iter()
+            .flat_map(|(_, r)| {
+                r.iter()
+                    .map(|(k, _)| k.to_string())
+                    .collect::<Vec<String>>()
+            })
+            .collect();
         germlines_callers.sort();
         germlines_callers.dedup();
 
-        with_germline.iter().for_each(|(tumor, row)| {
-            print!("{tumor}\t");
-            germlines_callers.iter().for_each(|g| {
-                let v = row.get(g).unwrap_or(&0);
-                print!("{g}:{v}\t");
-            });
-            println!();
-        });
-        println!();
+        let mut json = Vec::new();
+        let mut lines: Vec<String> = with_germline
+            .iter()
+            .map(|(tumor, row)| {
+                json.push(format!(
+                    "{{\"caller_name\": \"{tumor}\", \"germline\": [{}]}}",
+                    germlines_callers
+                        .iter()
+                        .map(|g| {
+                            let v = row.get(g).unwrap_or(&0);
+                            format!("{{\"{g}\": {v}}}")
+                        })
+                        .join(", ")
+                ));
+                format!(
+                    "{tumor}\t{}",
+                    germlines_callers
+                        .iter()
+                        .map(|g| {
+                            let v = row.get(g).unwrap_or(&0);
+                            format!("{g}: {v}")
+                        })
+                        .join("\t")
+                )
+            })
+            .collect();
+        lines.sort();
+        println!("{}", lines.join("\n"));
+
+        let json = format!("[{}]", json.join(", "));
+        let mut file = File::create(json_path)?;
+        file.write_all(json.as_bytes())?;
+
+        Ok(())
     }
 }
 
@@ -211,6 +238,11 @@ impl Run for Somatic {
         let config = self.config.clone();
         let annotations = Arc::new(self.annotations.clone());
 
+        // Stats dir
+        let stats_dir = config.somatic_pipe_stats(&id);
+        if !Path::new(&stats_dir).exists() {
+            fs::create_dir(&stats_dir)?;
+        }
         // TODO: GZ !!!
         // LongphasePhase::initialize(&id, self.config.clone())?.run()?;
 
@@ -222,7 +254,7 @@ impl Run for Somatic {
             &config,
             ClairS,
             NanomonSV,
-            Severus,
+            // Severus,
             Savana,
             DeepSomatic
         );
@@ -256,26 +288,31 @@ impl Run for Somatic {
 
         let mut annotations = Arc::try_unwrap(annotations)
             .map_err(|e| anyhow::anyhow!("Failed to unwrap Arc: {:?}", e))?;
-        let caller_cat_anns = |v: &Annotation| {
-            matches!(
-                v,
-                Annotation::Callers(_)
-                    | Annotation::Germline
-                    | Annotation::Somatic
-                    | Annotation::SoloConstit
-                    | Annotation::SoloTumor
-            )
-        };
+        let caller_cat_anns = |v: &Annotation| matches!(v, Annotation::Callers(_, _));
         let annot_init = annotations.callers_stat(Some(Box::new(caller_cat_anns)));
-        somatic_stats.annot_init(annot_init);
+        somatic_stats.annot_init(
+            &annot_init,
+            &format!("{stats_dir}/{id}_germline_filter.json"),
+        )?;
+        annot_init.save_to_json(&format!("{stats_dir}/{id}_annotations_01.json"))?;
 
         // Filter: Variants neither Germline nor SoloConstit
         info!("Keeping somatic variants (variants neither in solo nor in germline).");
         somatic_stats.n_constit_germline =
             annotations.retain_variants(&mut variants_collections, |anns| {
-                !anns.contains(&Annotation::Germline) && !anns.contains(&Annotation::SoloConstit)
+                !anns.iter().any(|ann| {
+                    matches!(
+                        ann,
+                        Annotation::Callers(_, Sample::Germline)
+                            | Annotation::Callers(_, Sample::SoloConstit)
+                    )
+                })
             });
-        annotations.callers_stat(Some(Box::new(caller_cat_anns)));
+        annotations
+            .callers_stat(Some(Box::new(caller_cat_anns)))
+            .save_to_json(&format!(
+                "{stats_dir}/{id}_annotations_02_post_germline.json"
+            ))?;
 
         // Annotation: BAM depth, n_alt
         info!("Reading Constit BAM file for depth and pileup annotation.");
@@ -286,20 +323,18 @@ impl Run for Somatic {
             self.config.solo_max_alt_constit,
             self.config.solo_min_constit_depth,
         );
-        annotations.callers_stat(Some(Box::new(|v| {
-            matches!(
-                v,
-                Annotation::Callers(_)
-                    | Annotation::Germline
-                    | Annotation::Somatic
-                    | Annotation::SoloConstit
-                    | Annotation::SoloTumor
-                    | Annotation::ConstitAlt(_)
-                    | Annotation::ConstitDepth(_)
-                    | Annotation::HighConstitAlt
-                    | Annotation::LowConstitDepth
-            )
-        })));
+        annotations
+            .callers_stat(Some(Box::new(|v| {
+                matches!(
+                    v,
+                    Annotation::Callers(_, _)
+                        | Annotation::ConstitAlt(_)
+                        | Annotation::ConstitDepth(_)
+                        | Annotation::HighConstitAlt
+                        | Annotation::LowConstitDepth
+                )
+            })))
+            .save_to_json(&format!("{stats_dir}/{id}_annotations_03_bam.json"))?;
 
         // Filter: Remove LowConstitDepth from annotations and variants collections
         info!(
@@ -329,18 +364,16 @@ impl Run for Somatic {
             somatic_stats.n_high_alt_constit, self.config.solo_max_alt_constit
         );
 
-        annotations.callers_stat(Some(Box::new(|v| {
-            matches!(
-                v,
-                Annotation::Callers(_)
-                    | Annotation::Germline
-                    | Annotation::Somatic
-                    | Annotation::SoloConstit
-                    | Annotation::SoloTumor
-                    | Annotation::ConstitAlt(_)
-                    | Annotation::ConstitDepth(_)
-            )
-        })));
+        annotations
+            .callers_stat(Some(Box::new(|v| {
+                matches!(
+                    v,
+                    Annotation::Callers(_, _)
+                        | Annotation::ConstitAlt(_)
+                        | Annotation::ConstitDepth(_)
+                )
+            })))
+            .save_to_json(&format!("{stats_dir}/{id}_annotations_04_bam_filter.json"))?;
 
         // Annotation: Entropy
         info!(
@@ -360,18 +393,14 @@ impl Run for Somatic {
                 ext_annot.annotate(&c.variants, &annotations)?;
                 Ok(())
             })?;
-        annotations.callers_stat(Some(Box::new(|v| {
-            matches!(
-                v,
-                Annotation::Callers(_)
-                    | Annotation::Germline
-                    | Annotation::Somatic
-                    | Annotation::SoloConstit
-                    | Annotation::SoloTumor
-                    | Annotation::ConstitAlt(_)
-                    | Annotation::GnomAD(_)
-            )
-        })));
+        annotations
+            .callers_stat(Some(Box::new(|v| {
+                matches!(
+                    v,
+                    Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
+                )
+            })))
+            .save_to_json(&format!("{stats_dir}/{id}_annotations_05_gnomad.json"))?;
 
         // Filter: Remove variants in Gnomad and in constit bam
         info!("Filtering out variants in GnomAD and in constit bam at low AF.");
@@ -404,18 +433,16 @@ impl Run for Somatic {
             "{} variants filtered, with constit alt <= max constit alt ({}) and in GnomAD.",
             somatic_stats.n_high_alt_constit_gnomad, self.config.solo_max_alt_constit
         );
-        annotations.callers_stat(Some(Box::new(|v| {
-            matches!(
-                v,
-                Annotation::Callers(_)
-                    | Annotation::Germline
-                    | Annotation::Somatic
-                    | Annotation::SoloConstit
-                    | Annotation::SoloTumor
-                    | Annotation::ConstitAlt(_)
-                    | Annotation::GnomAD(_)
-            )
-        })));
+        annotations
+            .callers_stat(Some(Box::new(|v| {
+                matches!(
+                    v,
+                    Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
+                )
+            })))
+            .save_to_json(&format!(
+                "{stats_dir}/{id}_annotations_06_gnomad_filter.json"
+            ))?;
 
         // Annotation low entropy
         annotations.low_shannon_entropy(self.config.min_shannon_entropy);
@@ -423,33 +450,21 @@ impl Run for Somatic {
 
         // Filtering low entropy for solo variants.
         info!("Filtering low entropies");
-        annotations.callers_stat(Some(Box::new(|v| {
-            matches!(
-                v,
-                Annotation::Callers(_)
-                    | Annotation::Germline
-                    | Annotation::Somatic
-                    | Annotation::SoloConstit
-                    | Annotation::SoloTumor
-                    | Annotation::LowEntropy
-            )
-        })));
+        annotations
+            .callers_stat(Some(Box::new(|v| {
+                matches!(v, Annotation::Callers(_, _) | Annotation::LowEntropy)
+            })))
+            .save_to_json(&format!("{stats_dir}/{id}_annotations_07_entropy.json"))?;
 
         somatic_stats.n_low_entropies = annotations
             .retain_variants(&mut variants_collections, |anns| {
                 !anns.contains(&Annotation::LowEntropy)
             });
-        annotations.callers_stat(Some(Box::new(|v| {
-            matches!(
-                v,
-                Annotation::Callers(_)
-                    | Annotation::Germline
-                    | Annotation::Somatic
-                    | Annotation::SoloConstit
-                    | Annotation::SoloTumor
-                    | Annotation::LowEntropy
-            )
-        })));
+        annotations
+            .callers_stat(Some(Box::new(|v| matches!(v, Annotation::Callers(_, _)))))
+            .save_to_json(&format!(
+                "{stats_dir}/{id}_annotations_08_entropy_filter.json"
+            ))?;
 
         // VEP
         info!("VEP annotation.");
@@ -460,7 +475,9 @@ impl Run for Somatic {
                 ext_annot.annotate_vep(&c.variants, &annotations)?;
                 Ok(())
             })?;
-        annotations.callers_stat(Some(Box::new(caller_cat_anns)));
+        annotations
+            .callers_stat(Some(Box::new(caller_cat_anns)))
+            .save_to_json(&format!("{stats_dir}/{id}_annotations_09_vep.json"))?;
 
         annotations.vep_stats()?;
 

+ 23 - 5
src/positions.rs

@@ -13,7 +13,7 @@ pub struct GenomePosition {
 
 impl GenomePosition {
     pub fn contig(&self) -> String {
-        num_to_string(self.contig)
+        num_to_contig(self.contig)
     }
 }
 
@@ -27,7 +27,9 @@ impl FromStr for GenomePosition {
     type Err = anyhow::Error;
 
     fn from_str(s: &str) -> anyhow::Result<Self> {
-        let (c, p) = s.split_once(":").ok_or(anyhow::anyhow!("Can't split {s}"))?;
+        let (c, p) = s
+            .split_once(":")
+            .ok_or(anyhow::anyhow!("Can't split {s}"))?;
         Ok(Self {
             contig: contig_to_num(c),
             position: p.parse()?,
@@ -63,7 +65,7 @@ impl PartialOrd for GenomePosition {
 
 impl From<GenomePosition> for (String, String) {
     fn from(val: GenomePosition) -> Self {
-        (num_to_string(val.contig), val.position.to_string())
+        (num_to_contig(val.contig), val.position.to_string())
     }
 }
 
@@ -85,7 +87,7 @@ pub fn contig_to_num(contig: &str) -> u8 {
     }
 }
 
-pub fn num_to_string(num: u8) -> String {
+pub fn num_to_contig(num: u8) -> String {
     match num {
         23 => "chrX".to_string(),
         24 => "chrY".to_string(),
@@ -123,10 +125,11 @@ impl From<GenomePosition> for VcfPosition {
 
 impl From<VcfPosition> for (String, String) {
     fn from(val: VcfPosition) -> Self {
-        (num_to_string(val.contig), val.position.to_string())
+        (num_to_contig(val.contig), val.position.to_string())
     }
 }
 
+// 0-based, half-open: [0, n)
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct GenomeRange {
     pub contig: u8,
@@ -147,6 +150,21 @@ impl TryFrom<(&str, &str, &str)> for GenomeRange {
     }
 }
 
+impl GenomeRange {
+    pub fn from_1_inclusive(contig: &str, start: &str, end: &str) -> anyhow::Result<Self> {
+        Ok(Self {
+            contig: contig_to_num(contig),
+            range: Range {
+                start: start
+                    .parse::<u32>()
+                    .context(format!("Can't parse {start} into u32."))?
+                    - 1,
+                end: end.parse().context(format!("Can't parse {end} into u32."))?,
+            },
+        })
+    }
+}
+
 pub fn overlaps_par(positions: &[&GenomePosition], ranges: &[&GenomeRange]) -> Vec<usize> {
     let chunk_size = (positions.len() / rayon::current_num_threads()).max(1);
 

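
A small sketch of the intended coordinate handling: GenomeRange is 0-based and half-open, and from_1_inclusive converts 1-based inclusive input such as VCF-style coordinates. The contig and coordinates are illustrative, and overlaps_par is assumed to return the indices of the overlapping positions:

    fn position_sketch() -> anyhow::Result<()> {
        // "chr2:100-200" read as 1-based inclusive becomes the half-open range [99, 200).
        let range = GenomeRange::from_1_inclusive("chr2", "100", "200")?;
        let pos: GenomePosition = "chr2:150".parse()?;
        println!("{:?}", overlaps_par(&[&pos], &[&range]));
        Ok(())
    }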
+ 2 - 6
src/variant/variant_collection.rs

@@ -17,7 +17,7 @@ use crate::{
         echtvar::{parse_echtvar_val, run_echtvar},
         gnomad::GnomAD,
         vep::{run_vep, VepLine, VEP},
-        Annotation, Annotations, Caller,
+        Annotation, Annotations,
     },
     collection::{
         bam::{counts_at, counts_ins_at},
@@ -32,8 +32,7 @@ use crate::{
 pub struct VariantCollection {
     pub variants: Vec<VcfVariant>,
     pub vcf: Vcf,
-    pub caller: Caller,
-    pub category: Annotation,
+    pub caller: Annotation,
 }
 
 impl VariantCollection {
@@ -58,7 +57,6 @@ impl VariantCollection {
             variants,
             vcf,
             caller,
-            category,
         } = self;
         let (left_data, right_data) = variants.into_iter().partition(predicate);
         (
@@ -66,13 +64,11 @@ impl VariantCollection {
                 variants: left_data,
                 vcf: vcf.clone(),
                 caller: caller.clone(),
-                category: category.clone(),
             },
             Self {
                 variants: right_data,
                 vcf,
                 caller,
-                category,
             },
         )
     }