Thomas 1 year ago
parent
commit
af8fd4c4d4

The diff view for this file has been limited because it is too large
+ 264 - 175
Cargo.lock


+ 4 - 3
Cargo.toml

@@ -8,8 +8,6 @@ log = "^0.4.22"
 env_logger = "^0.11.3"
 anyhow = "1.0.86"
 glob = "0.3.1"
-pandora_lib_pod5 = { git = "https://git.t0m4.fr/Thomas/pandora_lib_pod5.git" }
-pandora_lib_pileup = { git = "https://git.t0m4.fr/Thomas/pandora_lib_pileup.git" }
 pandora_lib_bindings = { git = "https://git.t0m4.fr/Thomas/pandora_lib_bindings.git" }
 pandora_lib_scan = { git = "https://git.t0m4.fr/Thomas/pandora_lib_scan.git" }
 pandora_lib_variants = { git = "https://git.t0m4.fr/Thomas/pandora_lib_variants.git" }
@@ -23,7 +21,7 @@ tracing-test = "0.2.5"
 tracing = "0.1.40"
 logtest = "2.0.0"
 test-log = "0.2.16"
-noodles-csi = "0.39.0"
+noodles-csi = "0.40.0"
 num-format = "0.4.4"
 locale_config = "0.3.0"
 byte-unit = "5.1.4"
@@ -42,3 +40,6 @@ tempfile = "3.12.0"
 tokio = "1.41.1"
 features = "0.10.0"
 full = "0.3.0"
+rust-htslib = "0.49.0"
+podders = "0.1.4"
+arrow = "53.3.0"

+ 16 - 15
src/callers/clairs.rs

@@ -6,6 +6,7 @@ use crate::{
     runners::{run_wait, DockerRun},
 };
 use anyhow::Context;
+use pandora_lib_scan::Config;
 use std::{fs, path::Path};
 use tracing::info;
 
@@ -181,31 +182,31 @@ impl ClairS {
                 .context(format!("Error while writing logs into {log_file}"))?;
         }
 
-        LongphaseHap::new(
+
+        let bam = Path::new(&self.diag_bam);
+        // let new_fn = format!("{}_hp.bam", bam.file_stem().unwrap().to_str().unwrap());
+        // let bam_hp = bam.with_file_name(new_fn);
+        LongphasePhase::new(
             &self.id,
-            &self.diag_bam,
+            bam.to_str().unwrap(),
             &germline_normal_tumor,
             LongphaseConfig::default(),
-        )
+        )?
         .run()?;
         LongphaseHap::new(
             &self.id,
-            &self.mrd_bam,
-            &germline_normal_tumor,
+            &self.diag_bam,
+            &format!("{}/clair3_normal_tumoral_germline_output_PS.vcf.gz", self.output_dir),
             LongphaseConfig::default(),
         )
-        .run()?;
-
-        let bam = Path::new(&self.diag_bam);
-        let new_fn = format!("{}_hp.bam", bam.file_stem().unwrap().to_str().unwrap());
-        let bam_hp = bam.with_file_name(new_fn);
-        LongphasePhase::new(
+            .run()?;
+        LongphaseHap::new(
             &self.id,
-            bam_hp.to_str().unwrap(),
-            &germline_normal_tumor,
+            &self.mrd_bam,
+            &format!("{}/clair3_normal_tumoral_germline_output_PS.vcf.gz", self.output_dir),
             LongphaseConfig::default(),
-        )?
-        .run()?;
+        )
+            .run()?;
         Ok(())
     }
 }

+ 9 - 0
src/callers/mod.rs

@@ -1,6 +1,15 @@
+use deep_variant::DeepVariant;
+use nanomonsv::NanomonSV;
+
 pub mod deep_variant;
 pub mod clairs;
 pub mod nanomonsv;
 pub mod severus;
 pub mod savana;
 
+
+#[derive(Debug)]
+pub enum Caller {
+    DeepVariant(DeepVariant),
+    NanomonSV(NanomonSV),
+}
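
A minimal sketch of how the new Caller enum might be consumed downstream; the match arms mirror the two variants declared above, while the helper function itself is hypothetical and not part of this commit.

use crate::callers::Caller;

// Hypothetical helper: map a Caller value to a short label.
fn caller_name(caller: &Caller) -> &'static str {
    match caller {
        Caller::DeepVariant(_) => "deepvariant",
        Caller::NanomonSV(_) => "nanomonsv",
    }
}
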

+ 2 - 2
src/callers/nanomonsv.rs

@@ -180,7 +180,7 @@ pub struct NanomonSVSolo {
 
 impl NanomonSVSolo {
     pub fn new(id: &str, bam: &str, time_point: &str, config: NanomonSVConfig) -> Self {
-        let out_dir = format!("{}/{id}/{time_point}/nanomonsv", &config.result_dir);
+        let out_dir = format!("{}/{id}/{time_point}/nanomonsv-solo", &config.result_dir);
         let log_dir = format!("{}/{id}/log/nanomonsv", &config.result_dir);
         NanomonSVSolo {
             id: id.to_string(),
@@ -233,7 +233,7 @@ impl NanomonSVSolo {
                 .context(format!("Error while writing log into {log_file}"))?;
         }
 
-        let vcf_passed = format!("{out_prefix}_nanomonsv_PASSED.vcf.gz");
+        let vcf_passed = format!("{out_prefix}_nanomonsv-solo_PASSED.vcf.gz");
         if !Path::new(&vcf_passed).exists() {
             let report = bcftools_keep_pass(&result_vcf, &vcf_passed, BcftoolsConfig::default())
                 .context(format!(

+ 12 - 2
src/callers/savana.rs

@@ -1,5 +1,5 @@
 use crate::{
-    collection::{HasOutputs, InitSomatic, Version},
+    collection::{HasOutputs, Initialize, Version},
     commands::bcftools::{bcftools_keep_pass, BcftoolsConfig},
     config::Config,
     runners::{run_wait, CommandRun, Run},
@@ -14,7 +14,7 @@ pub struct Savana {
     pub log_dir: String,
 }
 
-impl InitSomatic for Savana {
+impl Initialize for Savana {
     fn initialize(id: &str, config: Config) -> anyhow::Result<Self> {
         let mut output_vcf_exists = Path::new(&config.savana_output_vcf(id)).exists();
         if config.savana_force && output_vcf_exists {
@@ -55,6 +55,9 @@ impl Run for Savana {
             &self.config.reference,
             "--phased_vcf",
             &self.config.germline_phased_vcf(id),
+            "--no_blacklist",
+            "--threads",
+            &self.config.savana_threads.to_string()
         ];
         let args = [
             "-c",
@@ -129,3 +132,10 @@ impl Version for Savana {
         Ok(log[start_index..start_index + end].to_string().trim().to_string())
     }
 }
+
+// impl LoadVariants for Savana {
+//     fn load_variants(&self) -> anyhow::Result<Variants> {
+//         let variants = read_vcf(self.savana_output_vcf, source, variant_type)?;
+//         Ok()
+//     }
+// }

+ 86 - 2
src/callers/severus.rs

@@ -1,5 +1,5 @@
 use crate::{
-    collection::{HasOutputs, InitSomatic, Version},
+    collection::{HasOutputs, Initialize, InitializeSolo, Version},
     commands::bcftools::{bcftools_keep_pass, BcftoolsConfig},
     config::Config,
     runners::{run_wait, CommandRun, Run},
@@ -14,7 +14,7 @@ pub struct Severus {
     pub log_dir: String,
 }
 
-impl InitSomatic for Severus {
+impl Initialize for Severus {
     fn initialize(id: &str, config: Config) -> anyhow::Result<Self> {
         let mut output_vcf_exists = Path::new(&config.severus_output_vcf(id)).exists();
         if config.severus_force && output_vcf_exists {
@@ -132,3 +132,87 @@ impl Version for Severus {
         Ok(v.to_string())
     }
 }
+
+#[derive(Debug)]
+pub struct SeverusSolo {
+    pub id: String,
+    pub time: String,
+    pub config: Config,
+    pub log_dir: String,
+}
+
+impl InitializeSolo for SeverusSolo {
+    fn initialize(id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
+        let log_dir = format!("{}/{}/log/severus_solo", config.result_dir, id);
+        if !Path::new(&log_dir).exists() {
+            fs::create_dir_all(&log_dir)
+                .context(format!("Failed to create {log_dir} directory"))?;
+        }
+
+        Ok(SeverusSolo {
+            id: id.to_string(),
+            time: time.to_string(),
+            config,
+            log_dir,
+        })
+    }
+    // add code here
+}
+
+impl Run for SeverusSolo {
+    fn run(&mut self) -> anyhow::Result<()> {
+        let id = &self.id;
+        let time = &self.time;
+
+        let output_vcf = &self.config.severus_solo_output_vcf(id, time);
+        let passed_vcf = &self.config.severus_solo_passed_vcf(id, time);
+
+        // Run command if output VCF doesn't exist
+        let severus_args = [
+            "--target-bam",
+            &self.config.solo_bam(id, time),
+            "--out-dir",
+            &self.config.severus_solo_output_dir(id, time),
+            "-t",
+            &self.config.severus_threads.to_string(),
+            "--write-alignments",
+            "--use-supplementary-tag",
+            "--resolve-overlaps",
+            "--between-junction-ins",
+            "--vntr-bed",
+            &self.config.vntrs_bed,
+        ];
+        let args = [
+            "-c",
+            &format!(
+                "source {} && conda activate severus_env && {} {}",
+                self.config.conda_sh,
+                self.config.severus_bin,
+                severus_args.join(" ")
+            ),
+        ];
+        let mut cmd_run = CommandRun::new("bash", &args);
+        let report = run_wait(&mut cmd_run).context(format!(
+            "Error while running `severus.py {}`",
+            args.join(" ")
+        ))?;
+
+        let log_file = format!("{}/severus_", self.log_dir);
+        report
+            .save_to_file(&log_file)
+            .context(format!("Error while writing logs into {log_file}"))?;
+
+        // Keep PASS
+        if !Path::new(passed_vcf).exists() && Path::new(output_vcf).exists() {
+            let report = bcftools_keep_pass(output_vcf, passed_vcf, BcftoolsConfig::default())
+                .context(format!(
+                    "Error while running bcftools keep PASS for {output_vcf}"
+                ))?;
+            let log_file = format!("{}/bcftools_pass", self.log_dir);
+            report
+                .save_to_file(&log_file)
+                .context(format!("Error while writing logs into {log_file}"))?;
+        }
+        Ok(())
+    }
+}

+ 23 - 2
src/collection/bam.rs

@@ -14,6 +14,7 @@ use pandora_lib_bindings::{
     utils::RunBin,
 };
 use rayon::prelude::*;
+use rust_htslib::bam::Read;
 use serde::{Deserialize, Serialize};
 
 #[derive(Debug, Clone, Deserialize, Serialize)]
@@ -25,7 +26,7 @@ pub struct Bam {
     pub path: PathBuf,
     pub modified: DateTime<Utc>,
     pub cramino: Option<CraminoRes>,
-    pub composition: Vec<(String, f64)>,
+    pub composition: Vec<(String, f64)>, // acquisition id
 }
 
 #[derive(Debug, PartialEq, Clone, Deserialize, Serialize)]
@@ -108,7 +109,7 @@ impl Bam {
         };
 
         let composition =
-            pandora_lib_pileup::bam_compo(path.to_string_lossy().as_ref(), 20000).context(
+            bam_compo(path.to_string_lossy().as_ref(), 20000).context(
                 format!("Error while reading BAM composition for {}", path.display()),
             )?;
 
@@ -213,3 +214,23 @@ pub fn load_bam_collection(result_dir: &str) -> BamCollection {
 
     BamCollection { bams }
 }
+
+pub fn bam_compo(file_path: &str, sample_size: usize) -> anyhow::Result<Vec<(String, f64)>> {
+    let mut bam = rust_htslib::bam::Reader::from_path(file_path)?;
+
+    let mut rgs: HashMap<String, u64> = HashMap::new();
+    for result in bam.records().filter_map(Result::ok).take(sample_size) {
+        if let rust_htslib::bam::record::Aux::String(s) = result.aux(b"RG")? {
+            *rgs.entry(s.to_string()).or_default() += 1;
+        }
+    }
+
+    Ok(rgs
+        .into_iter()
+        .map(|(rg, n)| (rg.to_string(), n as f64 * 100.0 / sample_size as f64))
+        .map(|(rg, p)| {
+            let (a, _) = rg.split_once('_').unwrap();
+            (a.to_string(), p)
+        })
+        .collect())
+}
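
A minimal usage sketch for the new in-crate bam_compo, which replaces pandora_lib_pileup::bam_compo. The BAM path is the test file referenced elsewhere in this repo; the function assumes each sampled record carries an RG tag of the form <acquisition_id>_<suffix>, since it keeps only the part before the first underscore.

use crate::collection::bam::bam_compo;

fn print_composition() -> anyhow::Result<()> {
    // Sample the first 20 000 records, as the existing call sites do.
    let composition = bam_compo("/data/test_data/subset.bam", 20_000)?;
    for (acquisition_id, percent) in &composition {
        println!("{acquisition_id}\t{percent:.1}%");
    }
    Ok(())
}

Note that the percentages are computed against sample_size, so they only sum to roughly 100 when the file actually contains at least that many records with RG tags.
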

+ 31 - 13
src/collection/mod.rs

@@ -13,6 +13,7 @@ use chrono::{DateTime, Utc};
 use glob::glob;
 use log::{error, info, warn};
 use modbases::{Dmr, ModBasesCollection, ModType};
+use pandora_lib_variants::variants::Variants;
 
 use self::{bam::BamCollection, pod5::Pod5Collection, vcf::VcfCollection};
 use crate::{
@@ -29,7 +30,7 @@ use crate::{
     config::Config,
     functions::{
         assembler::{Assembler, AssemblerConfig},
-        variants::{Variants, VariantsConfig},
+        variants::{RunVariantsAgg, VariantsConfig},
         whole_scan::{WholeScan, WholeScanConfig},
     },
 };
@@ -285,22 +286,22 @@ impl Collections {
         self.tasks = hs.into_values().collect();
 
         // Variants DeepVariant
-        self.tasks.extend(self.todo_deepvariants());
+        // self.tasks.extend(self.todo_deepvariants());
 
         // Variants ClairS
-        self.tasks.extend(self.todo_clairs());
+        // self.tasks.extend(self.todo_clairs());
 
         // Variants Nanomonsv
-        self.tasks.extend(self.todo_nanomonsv());
+        // self.tasks.extend(self.todo_nanomonsv());
 
         // Variants aggregation
         self.tasks.extend(self.todo_variants_agg()?);
 
         // ModPileup
-        self.tasks.extend(self.todo_mod_pileup());
+        // self.tasks.extend(self.todo_mod_pileup());
 
         // DMR C diag vs mrd
-        self.tasks.extend(self.todo_dmr_c_diag_mrd());
+        // self.tasks.extend(self.todo_dmr_c_diag_mrd());
 
         // Tasks sorting
         self.tasks.sort_by_cached_key(|task| task.get_order());
@@ -583,14 +584,14 @@ impl Collections {
                         .filter(|vcf| mtime < vcf.file_metadata.mtime())
                         .count();
                     if n_new > 0 {
-                        tasks.push(CollectionsTasks::Variants {
+                        tasks.push(CollectionsTasks::SomaticVariants {
                             id: pair.0.id.clone(),
                             config: config.clone(),
                         });
                     }
                 }
             } else {
-                tasks.push(CollectionsTasks::Variants {
+                tasks.push(CollectionsTasks::SomaticVariants {
                     id: pair.0.id.clone(),
                     config: config.clone(),
                 });
@@ -765,7 +766,7 @@ pub enum CollectionsTasks {
         mrd_bam: String,
         config: NanomonSVConfig,
     },
-    Variants {
+    SomaticVariants {
         id: String,
         config: VariantsConfig,
     },
@@ -805,7 +806,7 @@ impl CollectionsTasks {
                 bam,
                 config,
             } => WholeScan::new(id, time_point, bam, config)?.run(),
-            CollectionsTasks::Variants { id, config } => Variants::new(id, config).run(),
+            CollectionsTasks::SomaticVariants { id, config } => RunVariantsAgg::new(id, config).run(),
             CollectionsTasks::Assemble {
                 id,
                 time_point,
@@ -826,7 +827,7 @@ impl CollectionsTasks {
             CollectionsTasks::DeepVariant { .. } => 6,
             CollectionsTasks::ClairS { .. } => 7,
             CollectionsTasks::NanomonSV { .. } => 8,
-            CollectionsTasks::Variants { .. } => 9,
+            CollectionsTasks::SomaticVariants { .. } => 9,
         }
     }
 }
@@ -893,7 +894,7 @@ impl fmt::Display for CollectionsTasks {
                 time_point,
                 ..
             } => write!(f, "Whole scan for {} {}, bam: {}", id, time_point, bam),
-            Variants { id, .. } => write!(f, "Variants aggregation for {}", id),
+            SomaticVariants { id, .. } => write!(f, "Variants aggregation for {}", id),
             Assemble { id, time_point, .. } => {
                 write!(f, "De novo assemblage for {} {}", id, time_point)
             }
@@ -944,10 +945,14 @@ pub fn run_tasks(config: CollectionsConfig) -> anyhow::Result<()> {
     Ok(())
 }
 
-pub trait InitSomatic: Sized {
+pub trait Initialize: Sized {
     fn initialize(id: &str, config: Config) -> anyhow::Result<Self>;
 }
 
+pub trait InitializeSolo: Sized {
+    fn initialize(id: &str, time: &str, config: Config) -> anyhow::Result<Self>;
+}
+
 pub trait HasOutputs {
     fn has_output(&self, id: &str) -> (bool, bool);
 }
@@ -955,3 +960,16 @@ pub trait HasOutputs {
 pub trait Version {
     fn version(config: &Config) -> anyhow::Result<String>;
 }
+
+pub trait LoadVariants {
+    fn load_variants(&self) -> anyhow::Result<Variants>;
+}
+
+pub fn exists_all(paths: Vec<&str>) -> anyhow::Result<()> {
+    for path in paths.iter() {
+        if !Path::new(path).exists() {
+            anyhow::bail!("{path} should exist")
+        }
+    }
+    Ok(())
+}
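
A short sketch combining the new exists_all helper with the InitializeSolo trait before launching a per-time-point caller. SeverusSolo and the Config accessors come from this diff; the wrapper function itself is hypothetical.

use crate::{
    callers::severus::SeverusSolo,
    collection::{exists_all, InitializeSolo},
    config::Config,
    runners::Run,
};

fn run_solo(id: &str, time: &str) -> anyhow::Result<()> {
    let config = Config::default();
    // Fail early if the required inputs are missing.
    exists_all(vec![&config.solo_bam(id, time), &config.vntrs_bed])?;
    SeverusSolo::initialize(id, time, config)?.run()
}
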

+ 2 - 1
src/collection/pod5.rs

@@ -3,7 +3,6 @@ use chrono::{DateTime, Utc};
 use csv::ReaderBuilder;
 use glob::glob;
 use log::{info, warn};
-use pandora_lib_pod5::Pod5Info;
 use serde::Deserialize;
 use std::{
     fmt::Display,
@@ -14,6 +13,8 @@ use std::{
 use hashbrown::HashMap;
 use rayon::prelude::*;
 
+use crate::io::pod5_infos::Pod5Info;
+
 #[derive(Debug, Clone)]
 pub struct Pod5 {
     pub path: PathBuf,

+ 22 - 4
src/commands/bcftools.rs

@@ -24,12 +24,30 @@ pub fn bcftools_keep_pass(
     output: &str,
     config: BcftoolsConfig,
 ) -> anyhow::Result<RunReport> {
-    let tmp_file = format!("/tmp/{}", Uuid::new_v4());
-
     // First sort
+    let tmp_file = format!("/tmp/{}", Uuid::new_v4());
     let mut cmd_run = CommandRun::new(&config.bin, &["sort", input, "-o", &tmp_file]);
     let _ = run_wait(&mut cmd_run)?;
 
+    // 2. norm
+    let tmp2_file = format!("/tmp/{}", Uuid::new_v4());
+    let mut cmd_run = CommandRun::new(
+        &config.bin,
+        &[
+            "norm",
+            "--threads",
+            &config.threads.to_string(),
+            "-a",
+            "--atom-overlaps",
+            ".",
+            &tmp_file,
+            "-o",
+            &tmp2_file,
+        ],
+    );
+    let _ = run_wait(&mut cmd_run)?;
+    fs::remove_file(tmp_file)?;
+
     // Then filter
     let mut cmd_run = CommandRun::new(
         &config.bin,
@@ -40,13 +58,13 @@ pub fn bcftools_keep_pass(
             &config.threads.to_string(),
             "-i",
             "FILTER='PASS'",
-            &tmp_file,
+            &tmp2_file,
             "-o",
             output,
         ],
     );
     let res = run_wait(&mut cmd_run)?;
-    fs::remove_file(tmp_file)?;
+    fs::remove_file(tmp2_file)?;
     Ok(res)
 }
 

+ 4 - 4
src/commands/dorado.rs

@@ -9,7 +9,7 @@ use duct::cmd;
 use log::{debug, info, warn};
 use uuid::Uuid;
 
-use crate::{collection::pod5::FlowCellCase, config::Config, helpers::find_unique_file};
+use crate::{collection::{bam::bam_compo, pod5::FlowCellCase}, config::Config, helpers::find_unique_file, io::pod5_infos::Pod5Info};
 
 #[derive(Debug, Clone)]
 pub struct DoradoParams {
@@ -188,11 +188,11 @@ impl Dorado {
 
     pub fn merge_bam(&self, bam: &Path) -> anyhow::Result<()> {
         let composition_a: Vec<String> =
-            pandora_lib_pileup::bam_compo(bam.to_string_lossy().as_ref(), 20000)?
+            bam_compo(bam.to_string_lossy().as_ref(), 20000)?
                 .iter()
                 .map(|(i, _)| i.clone())
                 .collect();
-        let composition_b: Vec<String> = pandora_lib_pileup::bam_compo(&self.bam, 20000)?
+        let composition_b: Vec<String> = bam_compo(&self.bam, 20000)?
             .iter()
             .map(|(i, _)| i.clone())
             .collect();
@@ -294,7 +294,7 @@ impl Dorado {
             .collect::<Vec<PathBuf>>()
             .pop()
             .unwrap();
-        let sequencing_kit = pandora_lib_pod5::Pod5Info::from_pod5(pod_path.to_str().unwrap())
+        let sequencing_kit = Pod5Info::from_pod5(pod_path.to_str().unwrap())
             .sequencing_kit
             .to_uppercase();
 

+ 65 - 2
src/config.rs

@@ -7,6 +7,7 @@ pub struct Config {
     pub reference: String,
     pub reference_name: String,
     pub savana_bin: String,
+    pub savana_threads: u8,
     pub tumoral_name: String,
     pub normal_name: String,
     pub haplotagged_bam_tag_name: String,
@@ -15,13 +16,18 @@ pub struct Config {
     pub savana_passed_vcf: String,
     pub conda_sh: String,
     pub savana_force: bool,
-
+    pub deepvariant_output_dir: String,
     pub severus_bin: String,
     pub severus_force: bool,
     pub severus_threads: u8,
     pub vntrs_bed: String,
     pub severus_pon: String,
     pub severus_output_dir: String,
+    pub severus_solo_output_dir: String,
+}
+
+lazy_static! {
+    static ref DEEPVARIANT_OUTPUT_NAME: &'static str = "{id}_{time}_DeepVariant.vcf.gz";
 }
 
 impl Default for Config {
@@ -29,6 +35,7 @@ impl Default for Config {
         Self {
             pod_dir: "/data/run_data".to_string(),
             align: Default::default(),
+
             // Reference genome
             reference: "/data/ref/hs1/chm13v2.0.fa".to_string(),
             reference_name: "hs1".to_string(),
@@ -44,10 +51,14 @@ impl Default for Config {
                     .to_string(),
             conda_sh: "/data/miniconda3/etc/profile.d/conda.sh".to_string(),
 
+            // DeepVariant
+            deepvariant_output_dir: "{result_dir}/{id}/{time}/DeepVariant".to_string(),
+            
             // Savana
             savana_bin: "savana".to_string(),
+            savana_threads: 150,
             savana_output_dir: "{result_dir}/{id}/diag/savana".to_string(),
-            savana_passed_vcf: "{output_dir}/{id}_diag_savana_PASSED.vcf.gz".to_string(),
+            savana_passed_vcf: "{output_dir}/{id}_diag_savana_PASSED.vcf".to_string(),
             savana_force: true,
 
             // Severus
@@ -56,6 +67,7 @@ impl Default for Config {
             vntrs_bed: "/data/ref/hs1/vntrs_chm13.bed".to_string(),
             severus_pon: "/data/ref/hs1/PoN_1000G_chm13.tsv.gz".to_string(),
             severus_output_dir: "{result_dir}/{id}/diag/severus".to_string(),
+            severus_solo_output_dir: "{result_dir}/{id}/{time}/severus".to_string(),
 
             severus_force: true,
         }
@@ -95,6 +107,20 @@ impl Config {
         format!("{}/{}/{}", self.result_dir, id, self.normal_name)
     }
 
+    pub fn solo_dir(&self, id: &str, time: &str) -> String {
+        format!("{}/{}/{}", self.result_dir, id, time)
+    }
+
+    pub fn solo_bam(&self, id: &str, time: &str) -> String {
+        format!(
+            "{}/{}_{}_{}.bam",
+            self.solo_dir(id, time),
+            id,
+            time,
+            self.reference_name,
+        )
+    }
+
     pub fn tumoral_bam(&self, id: &str) -> String {
         format!(
             "{}/{}_{}_{}.bam",
@@ -144,6 +170,21 @@ impl Config {
             .replace("{id}", id)
     }
 
+    // DeepVariant
+    pub fn deepvariant_output_dir(&self, id: &str, time: &str) -> String {
+        self.deepvariant_output_dir
+            .replace("{result_dir}", &self.result_dir)
+            .replace("{id}", id)
+            .replace("{time}", time)
+    }
+
+    pub fn deepvariant_output_vcf(&self, id: &str, time: &str) -> String {
+        format!("{}/{}", self.deepvariant_output_dir(id, time), *DEEPVARIANT_OUTPUT_NAME)
+            .replace("{id}", id)
+            .replace("{time}", time)
+    }
+
+    // Savana
     pub fn savana_output_dir(&self, id: &str) -> String {
         self.savana_output_dir
             .replace("{result_dir}", &self.result_dir)
@@ -162,6 +203,7 @@ impl Config {
             .replace("{id}", id)
     }
 
+    // Severus
     pub fn severus_output_dir(&self, id: &str) -> String {
         self.severus_output_dir
             .replace("{result_dir}", &self.result_dir)
@@ -180,4 +222,25 @@ impl Config {
             id
         )
     }
+
+    // Severus solo
+    pub fn severus_solo_output_dir(&self, id: &str, time: &str) -> String {
+        self.severus_solo_output_dir
+            .replace("{result_dir}", &self.result_dir)
+            .replace("{id}", id)
+            .replace("{time}", time)
+    }
+
+    pub fn severus_solo_output_vcf(&self, id: &str, time: &str) -> String {
+        let output_dir = self.severus_solo_output_dir(id, time);
+        format!("{output_dir}/all_SVs/severus_all.vcf")
+    }
+
+    pub fn severus_solo_passed_vcf(&self, id: &str, time: &str) -> String {
+        format!(
+            "{}/{}_{}_severus-solo_PASSED.vcf.gz",
+            &self.severus_solo_output_dir(id, time),
+            id, time
+        )
+    }
 }
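
For illustration, the new template accessors expand as follows; id and time are taken from the tests later in this diff, and result_dir keeps whatever value Config::default() already assigns (not shown in this hunk).

use crate::config::Config;

#[test]
fn severus_solo_paths_sketch() {
    let config = Config::default();
    // "{result_dir}/{id}/{time}/severus" with id = "CAMEL", time = "diag"
    let out_dir = config.severus_solo_output_dir("CAMEL", "diag");
    assert!(out_dir.ends_with("/CAMEL/diag/severus"));
    // The PASSED VCF is written inside that directory:
    let passed = config.severus_solo_passed_vcf("CAMEL", "diag");
    assert!(passed.ends_with("/CAMEL/diag/severus/CAMEL_diag_severus-solo_PASSED.vcf.gz"));
}
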

+ 2 - 2
src/functions/variants.rs

@@ -15,12 +15,12 @@ impl Default for VariantsConfig {
 }
 
 #[derive(Debug)]
-pub struct Variants {
+pub struct RunVariantsAgg {
     pub id: String,
     pub config: VariantsConfig,
 }
 
-impl Variants {
+impl RunVariantsAgg {
     pub fn new(id: String, config: VariantsConfig) -> Self {
         Self { id, config }
     }

+ 3 - 0
src/io/mod.rs

@@ -0,0 +1,3 @@
+
+pub mod pod5_infos;
+

+ 262 - 0
src/io/pod5_infos.rs

@@ -0,0 +1,262 @@
+use std::{
+    fs::File,
+    io::{Read, Seek, SeekFrom},
+};
+
+use arrow::array::{ArrayRef, Int16Array, StringArray, TimestampMillisecondArray, UInt16Array};
+use arrow::{array::RecordBatch, ipc::reader::FileReader};
+use chrono::TimeZone;
+use chrono::{DateTime, Utc};
+use podders::{root_as_footer, ContentType};
+
+#[derive(Debug, Clone)]
+pub struct Pod5Info {
+    pub acquisition_id: String,
+    pub acquisition_start_time: DateTime<Utc>,
+    pub adc_max: i16,
+    pub adc_min: i16,
+    pub experiment_name: String,
+    pub flow_cell_id: String,
+    pub flow_cell_product_code: String,
+    pub protocol_name: String,
+    pub protocol_run_id: String,
+    pub protocol_start_time: DateTime<Utc>,
+    pub sample_id: String,
+    pub sample_rate: u16,
+    pub sequencing_kit: String,
+    pub sequencer_position: String,
+    pub sequencer_position_type: String,
+    pub software: String,
+    pub system_name: String,
+    pub system_type: String,
+}
+
+impl Pod5Info {
+    pub fn from_pod5(file_path: &str) -> Self {
+        let mut file = File::open(file_path).unwrap();
+        let _end = file.seek(SeekFrom::End(0)).unwrap();
+        file.seek(SeekFrom::Current(-32)).unwrap(); // Signature + Section marker + 8 bytes for footer length
+        let mut buffer = [0; 8]; // Buffer for 8 bytes
+
+        file.read_exact(&mut buffer).unwrap(); // Read 8 bytes
+
+        // Convert bytes to little-endian i64
+        let value = i64::from_le_bytes(buffer);
+
+        // Seek to the footer position
+        file.seek(SeekFrom::Current(-(8 + value))).unwrap();
+
+        // Read the footer data
+        let mut buf = vec![0; value as usize];
+        file.read_exact(&mut buf).unwrap();
+
+        // Deserialize the FlatBuffer
+        let footer = root_as_footer(&buf).unwrap();
+
+        let mut acquisition_id = String::new();
+        let mut acquisition_start_time = Utc::now();
+        let mut adc_max = 0;
+        let mut adc_min = 0;
+        let mut experiment_name = String::new();
+        let mut flow_cell_id = String::new();
+        let mut flow_cell_product_code = String::new();
+        let mut protocol_name = String::new();
+        let mut protocol_run_id = String::new();
+        let mut protocol_start_time = Utc::now();
+        let mut sample_id = String::new();
+        let mut sample_rate = 0;
+        let mut sequencing_kit = String::new();
+        let mut sequencer_position = String::new();
+        let mut sequencer_position_type = String::new();
+        let mut software = String::new();
+        let mut system_name = String::new();
+        let mut system_type = String::new();
+
+        if let Some(contents) = footer.contents() {
+            for content in contents.iter() {
+                if let ContentType::RunInfoTable = content.content_type() {
+                    // println!("{content:#?}");
+                    let batch = read_arrow_table(
+                        file_path,
+                        content.offset() as u64,
+                        content.length() as u64,
+                    )
+                    .unwrap();
+                    let schema = batch[0].schema();
+                    for column in 0..batch[0].num_columns() {
+                        let array: ArrayRef = batch[0].column(column).clone();
+
+                        // Print column name and values
+                        let column_name = schema.field(column).name().to_string();
+                        // println!("Column: {}", column_name);
+
+                        // Match the type of the array to extract values
+                        match array.data_type() {
+                            arrow::datatypes::DataType::Int16 => {
+                                let int_array =
+                                    array.as_any().downcast_ref::<Int16Array>().unwrap();
+                                for i in 0..int_array.len() {
+                                    // println!("{}: i16,", column_name);
+                                    match column_name.as_str() {
+                                        "adc_max" => adc_max = int_array.value(i),
+                                        "adc_min" => adc_min = int_array.value(i),
+                                        _ => (),
+                                    }
+
+                                    // println!("{}", int_array.value(i));
+                                }
+                            }
+                            arrow::datatypes::DataType::UInt16 => {
+                                let int_array =
+                                    array.as_any().downcast_ref::<UInt16Array>().unwrap();
+                                for i in 0..int_array.len() {
+                                    // println!("{}: u16,", column_name);
+                                    if let "sample_rate" = column_name.as_str() {
+                                        sample_rate = int_array.value(i)
+                                    }
+                                }
+                            }
+
+                            // arrow::datatypes::DataType::Int32 => {
+                            //     let int_array =
+                            //         array.as_any().downcast_ref::<Int32Array>().unwrap();
+                            //     for i in 0..int_array.len() {
+                            //         println!("{}: i32,", column_name);
+                            //
+                            //         // println!("{}", int_array.value(i));
+                            //     }
+                            // }
+                            // arrow::datatypes::DataType::UInt32 => {
+                            //     let int_array =
+                            //         array.as_any().downcast_ref::<UInt32Array>().unwrap();
+                            //     for i in 0..int_array.len() {
+                            //         println!("{}: u32,", column_name);
+                            //
+                            //         // println!("{}", int_array.value(i));
+                            //     }
+                            // }
+                            // arrow::datatypes::DataType::Float64 => {
+                            //     let float_array =
+                            //         array.as_any().downcast_ref::<Float64Array>().unwrap();
+                            //     for i in 0..float_array.len() {
+                            //         println!("{}: f64,", column_name);
+                            //
+                            //         // println!("{}", float_array.value(i));
+                            //     }
+                            // }
+                            arrow::datatypes::DataType::Utf8 => {
+                                let string_array =
+                                    array.as_any().downcast_ref::<StringArray>().unwrap();
+                                let string_array: Vec<String> = string_array
+                                    .iter()
+                                    .flat_map(|v| match v {
+                                        Some(v) => vec![v.to_string()],
+                                        None => vec![],
+                                    })
+                                    .collect();
+
+                                let value = string_array.join(" ");
+
+                                match column_name.as_str() {
+                                    "acquisition_id" => acquisition_id = value,
+                                    "experiment_name" => experiment_name = value,
+                                    "flow_cell_id" => flow_cell_id = value,
+                                    "flow_cell_product_code" => flow_cell_product_code = value,
+                                    "protocol_name" => protocol_name = value,
+                                    "protocol_run_id" => protocol_run_id = value,
+                                    "sample_id" => sample_id = value,
+                                    "sequencing_kit" => sequencing_kit = value,
+                                    "sequencer_position" => sequencer_position = value,
+                                    "sequencer_position_type" => sequencer_position_type = value,
+                                    "software" => software = value,
+                                    "system_name" => system_name = value,
+                                    "system_type" => system_type = value,
+                                    _ => (),
+                                }
+                                // println!("{}: String,", column_name);
+                                // println!("{}", string_array.join(" "));
+                            }
+                            arrow::datatypes::DataType::Timestamp(
+                                arrow::datatypes::TimeUnit::Millisecond,
+                                Some(timezone),
+                            ) => {
+                                if &timezone.to_string() == "UTC" {
+                                    let timestamp_array = array
+                                        .as_any()
+                                        .downcast_ref::<TimestampMillisecondArray>()
+                                        .unwrap();
+                                    for i in 0..timestamp_array.len() {
+                                        let timestamp = timestamp_array.value(i);
+                                        let datetime: DateTime<Utc> =
+                                            Utc.timestamp_millis_opt(timestamp).unwrap();
+                                        // println!("{}: DateTime<Utc>,", column_name);
+
+                                        match column_name.as_str() {
+                                            "acquisition_start_time" => {
+                                                acquisition_start_time = datetime
+                                            }
+                                            "protocol_start_time" => protocol_start_time = datetime,
+                                            _ => (),
+                                        }
+
+                                        // println!("{}", datetime.to_rfc3339());
+                                    }
+                                }
+                            }
+
+                            _ => {
+                                // println!("Unsupported data type: {:?}", array.data_type());
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        Pod5Info {
+            acquisition_id,
+            acquisition_start_time,
+            adc_max,
+            adc_min,
+            experiment_name,
+            flow_cell_id,
+            flow_cell_product_code,
+            protocol_name,
+            protocol_run_id,
+            protocol_start_time,
+            sample_id,
+            sample_rate,
+            sequencing_kit,
+            sequencer_position,
+            sequencer_position_type,
+            software,
+            system_name,
+            system_type,
+        }
+    }
+}
+
+fn read_arrow_table(
+    file_path: &str,
+    offset: u64,
+    length: u64,
+) -> arrow::error::Result<Vec<RecordBatch>> {
+    let mut file = File::open(file_path)?;
+
+    // Seek to the start of the embedded table
+    file.seek(SeekFrom::Start(offset))?;
+
+    // Read the specified length of bytes
+    let mut buffer = vec![0; length as usize];
+    file.read_exact(&mut buffer)?;
+
+    // Deserialize bytes into Arrow RecordBatch
+    let cursor = std::io::Cursor::new(buffer);
+    let reader = FileReader::try_new(cursor, None)?;
+
+    let mut batches = Vec::new();
+    for batch in reader {
+        batches.push(batch?);
+    }
+
+    Ok(batches)
+}
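
A minimal usage sketch for the new Pod5Info reader; the directory is Config::default().pod_dir and the file name is hypothetical. from_pod5 relies on the POD5 footer layout: it reads the 8-byte little-endian footer length located 32 bytes before the end of the file, then parses the FlatBuffers footer with podders::root_as_footer and extracts the RunInfoTable via the Arrow IPC FileReader.

use crate::io::pod5_infos::Pod5Info;

fn show_run_info() {
    // Hypothetical file under the default pod_dir; any real .pod5 works here.
    let info = Pod5Info::from_pod5("/data/run_data/example.pod5");
    println!(
        "flow cell {} ({}), kit {}, acquired {}",
        info.flow_cell_id,
        info.flow_cell_product_code,
        info.sequencing_kit.to_uppercase(),
        info.acquisition_start_time.to_rfc3339(),
    );
}

As written, from_pod5 unwraps on every I/O and parse error, so a missing or truncated file panics instead of returning a Result.
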

+ 64 - 20
src/lib.rs

@@ -8,6 +8,9 @@ pub mod runners;
 pub mod collection;
 pub mod functions;
 pub mod helpers;
+pub mod variant;
+pub mod io;
+// pub mod vcf_reader;
 
 #[macro_use]
 extern crate lazy_static;
@@ -21,9 +24,9 @@ lazy_static! {
 mod tests {
     use std::{fs, path::Path};
 
-    use callers::{nanomonsv::nanomonsv_create_pon, savana::Savana, severus::Severus};
-    use collection::{InitSomatic, Version};
-    use commands::{longphase::{LongphaseHap, LongphaseConfig}, modkit::{bed_methyl, ModkitConfig}};
+    use callers::{nanomonsv::nanomonsv_create_pon, savana::Savana, severus::{Severus, SeverusSolo}};
+    use collection::{Initialize, InitializeSolo, Version};
+    use commands::{longphase::{LongphaseConfig, LongphaseHap, LongphasePhase}, modkit::{bed_methyl, ModkitConfig}};
     use functions::assembler::{Assembler, AssemblerConfig};
     use log::info;
     // use pandora_lib_assembler::assembler::AssembleConfig;
@@ -148,11 +151,12 @@ mod tests {
         // NanomonSV::new("test_a", "/data/test_data/subset.bam", "/data/test_data/subset_mrd.bam", config).run()
 
         let bam = |id:&str, time_point: &str| format!("/data/longreads_basic_pipe/{id}/{time_point}/{id}_{time_point}_hs1.bam");
-        NanomonSV::new("CAZIER", &bam("CAZIER", "diag"), &bam("CAZIER", "mrd"), NanomonSVConfig::default()).run()
+        let id = "HAMROUNE";
+        NanomonSV::new(id, &bam(id, "diag"), &bam(id, "mrd"), NanomonSVConfig::default()).run()
     }
 
     #[test]
-    fn nanomonsv_solo() -> anyhow::Result<()> {
+    fn nanomddonsv_solo() -> anyhow::Result<()> {
         init();
         let id = "BRETON";
         let time_point = "diag";
@@ -222,14 +226,20 @@ mod tests {
         Ok(())
     }
 
-    #[test_log::test]
-    fn bcftools_pass() {
-        let config = BcftoolsConfig::default();
-        let id = "RICCO";
-        let i = format!("/data/longreads_basic_pipe/{id}/diag/nanomonsv/{id}_diag.nanomonsv.result.vcf");
-        let o = format!("/data/longreads_basic_pipe/{id}/diag/nanomonsv/{id}_diag_nanomonsv_PASSED.vcf.gz");
-        bcftools_keep_pass(&i, &o, config).unwrap();
-    }
+    // #[test_log::test]
+    // fn bcftools_pass() {
+    //     let config = BcftoolsConfig::default();
+    //     let id = "RICCO";
+    //     let time = "diag";
+    //     let caller = "DeepVariant";
+    //
+    //     Config::default();
+    //
+    //     // let (i, o) = 
+    //     // let i = format!("/data/longreads_basic_pipe/{id}/{time}/nanomonsv/{id}_diag.nanomonsv.result.vcf");
+    //     // let o = format!("/data/longreads_basic_pipe/{id}/{time}/nanomonsv/{id}_diag_nanomonsv_PASSED.vcf.gz");
+    //     bcftools_keep_pass(&i, &o, config).unwrap();
+    // }
 
     #[test_log::test]
     fn bam_ok() -> anyhow::Result<()> {
@@ -338,13 +348,19 @@ mod tests {
     #[test]
     fn run_severus() -> anyhow::Result<()> {
         init();
-        Severus::initialize("ACHITE", Config::default())?.run()
+        Severus::initialize("CAMEL", Config::default())?.run()
+    }
+
+    #[test]
+    fn run_severus_solo() -> anyhow::Result<()> {
+        init();
+        SeverusSolo::initialize("CAMEL","diag", Config::default())?.run()
     }
 
     #[test]
     fn run_savana() -> anyhow::Result<()> {
         init();
-        Savana::initialize("BECERRA", Config::default())?.run()
+        Savana::initialize("LEVASSEUR", Config::default())?.run()
     }
 
     #[test]
@@ -370,16 +386,26 @@ mod tests {
     #[test]
     fn run_clairs() -> anyhow::Result<()> {
         init();
-        let collections = Collections::new(
-            CollectionsConfig::default()
-        )?;
-        collections.run_clairs()
+        let id = "HAMROUNE";
+        let diag_bam = format!("/data/longreads_basic_pipe/{id}/diag/{id}_diag_hs1.bam");
+        let mrd_bam = format!("/data/longreads_basic_pipe/{id}/mrd/{id}_mrd_hs1.bam");
+        ClairS::new(id, &diag_bam, &mrd_bam, ClairSConfig::default()).run()
     }
 
+    #[test]
+    fn run_deepvariant() -> anyhow::Result<()> {
+        init();
+        let id = "HAMROUNE";
+        let diag_bam = format!("/data/longreads_basic_pipe/{id}/diag/{id}_diag_hs1.bam");
+        // let mrd_bam = format!("/data/longreads_basic_pipe/{id}/mrd/{id}_mrd_hs1.bam");
+        DeepVariant::new(id, "diag", &diag_bam, DeepVariantConfig::default()).run()
+    }
+
+
     #[test]
     fn run_longphase() -> anyhow::Result<()> {
         init();
-        let id = "CUNY";
+        let id = "BECERRA";
         let diag_bam = format!("/data/longreads_basic_pipe/{id}/diag/{id}_diag_hs1.bam");
         let vcf = format!("/data/longreads_basic_pipe/{id}/diag/ClairS/clair3_normal_tumoral_germline_output.vcf.gz");
         let mrd_bam = format!("/data/longreads_basic_pipe/{id}/mrd/{id}_mrd_hs1.bam");
@@ -387,4 +413,22 @@ mod tests {
         LongphaseHap::new(id, &diag_bam, &vcf, LongphaseConfig::default()).run()?;
         LongphaseHap::new(id, &mrd_bam, &vcf, LongphaseConfig::default()).run()
     }
+
+    #[test]
+    fn run_longphase_phase() -> anyhow::Result<()> {
+        init();
+        let id = "BECERRA";
+        // let config = Config::default();
+        
+        let mrd_bam = format!("/data/longreads_basic_pipe/{id}/mrd/{id}_mrd_hs1.bam");
+        let diag_bam = format!("/data/longreads_basic_pipe/{id}/diag/{id}_diag_hs1.bam");
+        let mrd_bam_hp = format!("/data/longreads_basic_pipe/{id}/mrd/{id}_mrd_hs1_hp.bam");
+        let vcf = format!("/data/longreads_basic_pipe/{id}/mrd/DeepVariant/BECERRA_mrd_DeepVariant_PASSED.vcf.gz");
+        let hp_vcf = format!("/data/longreads_basic_pipe/{id}/mrd/DeepVariant/BECERRA_mrd_DeepVariant_PASSED_PS.vcf.gz");
+
+
+        // LongphasePhase::new(id, &mrd_bam_hp, &vcf, LongphaseConfig::default())?.run();
+        // LongphaseHap::new(id, &mrd_bam, &hp_vcf, LongphaseConfig::default()).run();
+        LongphaseHap::new(id, &diag_bam, &hp_vcf, LongphaseConfig::default()).run()
+    }
 }

+ 2 - 0
src/variant/mod.rs

@@ -0,0 +1,2 @@
+// pub mod variant;
+

+ 407 - 0
src/variant/variant.rs

@@ -0,0 +1,407 @@
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Clone, Serialize, Eq, PartialEq, Deserialize)]
+pub enum VariantType {
+    Somatic,
+    Constitutionnal,
+}
+
+
+
+impl Variant {
+    pub fn from_vcfrow(row: &VCFRow, source: VCFSource, variant_type: VariantType) -> anyhow::Result<Self> {
+        let callers_data = vec![CallerData {
+            qual: row.qual.parse::<f32>().ok(),
+            info: parse_info(&row.info, &source).context(anyhow!(
+                "Can't parse {:?} info for {}",
+                source,
+                row.info
+            ))?,
+            format: parse_format(&source, &row.value).context(anyhow!(
+                "Can't parse {:?} format for {}",
+                source,
+                row.value
+            ))?,
+        }];
+
+        Ok(Variant {
+            contig: row.chr.to_string(),
+            position: row.pos,
+            reference: row
+                .reference
+                .parse()
+                .context(anyhow!("Error while parsing {}", row.reference))?,
+            alternative: row
+                .alt
+                .parse()
+                .context(anyhow!("Error while parsing {}", row.alt))?,
+            n_ref: None,
+            n_alt: None,
+            vaf: None,
+            depth: None,
+            callers_data,
+            source: vec![source],
+            variant_type,
+            annotations: Vec::new(),
+        })
+    }
+
+    pub fn get_depth(&mut self) -> u32 {
+        if let Some(depth) = self.depth {
+            depth
+        } else {
+            let depth = self
+                .callers_data
+                .iter_mut()
+                .map(|v| v.get_depth())
+                .max()
+                .unwrap();
+            self.depth = Some(depth);
+            depth
+        }
+    }
+
+    pub fn get_n_alt(&mut self) -> u32 {
+        if let Some(n_alt) = self.n_alt {
+            n_alt
+        } else {
+            let n_alt = self
+                .callers_data
+                .iter_mut()
+                .map(|v| v.get_n_alt())
+                .max()
+                .unwrap();
+            self.n_alt = Some(n_alt);
+            n_alt
+        }
+    }
+
+    pub fn vaf(&mut self) -> f32 {
+        let n_alt = self.get_n_alt() as f32;
+        let depth = self.get_depth() as f32;
+        self.vaf = Some(n_alt / depth);
+        self.vaf.unwrap()
+    }
+
+    pub fn is_ins(&self) -> bool {
+        matches!(
+            (&self.reference, &self.alternative),
+            (
+                ReferenceAlternative::Nucleotide(_),
+                ReferenceAlternative::Nucleotides(_)
+            )
+        )
+    }
+
+    pub fn alteration_category(&self) -> AlterationCategory {
+        match (&self.reference, &self.alternative) {
+            (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
+                AlterationCategory::Snv
+            }
+            (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
+                AlterationCategory::Ins
+            }
+            (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
+                AlterationCategory::Other
+            }
+            (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
+                AlterationCategory::Del
+            }
+            (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
+                if a.len() < b.len() =>
+            {
+                AlterationCategory::Ins
+            }
+            (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
+                if a.len() > b.len() =>
+            {
+                AlterationCategory::Del
+            }
+            (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotides(_)) => {
+                AlterationCategory::Rep
+            }
+            (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Unstructured(_)) => {
+                AlterationCategory::Other
+            }
+            (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotide(_)) => {
+                AlterationCategory::Other
+            }
+            (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotides(_)) => {
+                AlterationCategory::Other
+            }
+            (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Unstructured(_)) => {
+                AlterationCategory::Other
+            }
+        }
+    }
+
+    pub fn to_min_string(&mut self) -> String {
+        let depth = self.get_depth();
+        let n_alt = self.get_n_alt();
+
+        format!(
+            "DP:AD\t{}:{}",
+            depth,
+            [(depth - n_alt).to_string(), n_alt.to_string()].join(",")
+        )
+    }
+
+    pub fn get_veps(&self) -> Vec<VEP> {
+        self.annotations
+            .iter()
+            .flat_map(|e| {
+                if let AnnotationType::VEP(e) = e {
+                    e.clone()
+                } else {
+                    vec![]
+                }
+            })
+            .collect()
+    }
+    pub fn get_best_vep(&self) -> Result<VEP> {
+        get_best_vep(&self.get_veps())
+    }
+
+    pub fn is_from_category(&self, and_categories: &[Category]) -> bool {
+        let mut vec_bools = Vec::new();
+        for category in and_categories.iter() {
+            match category {
+                Category::VariantCategory(vc) => {
+                    for annotations in self.annotations.iter() {
+                        if let AnnotationType::VariantCategory(vvc) = annotations {
+                            if vc == vvc {
+                                vec_bools.push(true);
+                                break;
+                            }
+                        }
+                    }
+                }
+                Category::PositionRange { contig, from, to } => {
+                    if self.contig == *contig {
+                        match (from, to) {
+                            (None, None) => vec_bools.push(true),
+                            (None, Some(to)) => vec_bools.push(self.position <= *to),
+                            (Some(from), None) => vec_bools.push(self.position >= *from),
+                            (Some(from), Some(to)) => {
+                                vec_bools.push(self.position >= *from && self.position <= *to)
+                            }
+                        }
+                    } else {
+                        vec_bools.push(false);
+                    }
+                }
+                Category::VCFSource(_) => (),
+                Category::NCosmic(n) => {
+                    let mut bools = Vec::new();
+                    for annotations in self.annotations.iter() {
+                        if let AnnotationType::Cosmic(c) = annotations {
+                            bools.push(c.cosmic_cnt >= *n);
+                            break;
+                        }
+                    }
+                    vec_bools.push(bools.iter().any(|&b| b));
+                }
+                Category::NCBIFeature(ncbi_feature) => {
+                    let mut bools = Vec::new();
+                    for annotations in self.annotations.iter() {
+                        if let AnnotationType::NCBIGFF(v) = annotations {
+                            bools.push(v.feature == *ncbi_feature);
+                        }
+                    }
+                    vec_bools.push(bools.iter().any(|&b| b));
+                }
+                Category::VAF { min, max } => {
+                    let v = if self.vaf.is_none() {
+                        let mut s = self.clone();
+                        s.vaf()
+                    } else {
+                        self.vaf.unwrap()
+                    };
+                    vec_bools.push(v >= *min && v <= *max);
+                }
+                Category::Pangolin => {
+                    vec_bools.push(
+                        self.annotations
+                            .iter()
+                            .filter(|a| matches!(a, AnnotationType::Pangolin(_)))
+                            .count()
+                            > 0,
+                    );
+                }
+            }
+        }
+        vec_bools.iter().all(|&x| x)
+    }
+
+    pub fn callers(&self) -> Vec<String> {
+        self.source
+            .iter()
+            .map(|source| source.to_string())
+            .collect()
+    }
+}
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+pub enum AlterationCategory {
+    Snv,
+    Ins,
+    Del,
+    Rep,
+    Other,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
+pub enum AnnotationType {
+    VariantCategory(VariantCategory),
+    VEP(Vec<VEP>),
+    Cluster(i32),
+    Cosmic(Cosmic),
+    GnomAD(GnomAD),
+    NCBIGFF(NCBIGFF),
+    Pangolin(Pangolin),
+    Phase(PhaseAnnotation),
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
+pub enum VariantCategory {
+    Somatic,
+    LowMRDDepth,
+    LOH,
+    Constit,
+    LowDiversity,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, ToSchema)]
+pub enum ReferenceAlternative {
+    Nucleotide(Base),
+    Nucleotides(Vec<Base>),
+    Unstructured(String),
+}
+
+impl FromStr for ReferenceAlternative {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        let possible_bases = s.as_bytes().iter();
+        let mut res: Vec<Base> = Vec::new();
+        for &base in possible_bases {
+            match base.try_into() {
+                std::result::Result::Ok(b) => res.push(b),
+                Err(_) => {
+                    return Ok(Self::Unstructured(s.to_string()));
+                }
+            }
+        }
+
+        if res.len() == 1 {
+            Ok(Self::Nucleotide(res.pop().unwrap()))
+        } else {
+            Ok(Self::Nucleotides(res))
+        }
+    }
+}
+
+impl fmt::Display for ReferenceAlternative {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let string = match self {
+            ReferenceAlternative::Nucleotide(b) => b.to_string(),
+            ReferenceAlternative::Nucleotides(bases) => bases
+                .iter()
+                .fold(String::new(), |acc, e| format!("{}{}", acc, e)),
+            ReferenceAlternative::Unstructured(s) => s.to_string(),
+        };
+        write!(f, "{}", string)
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, ToSchema)]
+pub enum Base {
+    A,
+    T,
+    C,
+    G,
+    N,
+}
+
+impl TryFrom<u8> for Base {
+    type Error = anyhow::Error;
+    fn try_from(base: u8) -> Result<Self> {
+        match base {
+            b'A' => Ok(Base::A),
+            b'T' => Ok(Base::T),
+            b'C' => Ok(Base::C),
+            b'G' => Ok(Base::G),
+            b'N' => Ok(Base::N),
+            _ => Err(anyhow!(
+                "Unknown base: {}",
+                String::from_utf8_lossy(&[base])
+            )),
+        }
+    }
+}
+
+impl Base {
+    pub fn into_u8(self) -> u8 {
+        match self {
+            Base::A => b'A',
+            Base::T => b'T',
+            Base::C => b'C',
+            Base::G => b'G',
+            Base::N => b'N',
+        }
+    }
+}
+
+impl fmt::Display for Base {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // Use `self.number` to refer to each positional data point.
+        let str = match self {
+            Base::A => "A",
+            Base::T => "T",
+            Base::C => "C",
+            Base::G => "G",
+            Base::N => "N",
+        };
+        write!(f, "{}", str)
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, ToSchema)]
+pub enum Format {
+    DeepVariant(DeepVariantFormat),
+    ClairS(ClairSFormat),
+    Sniffles(SnifflesFormat),
+    Nanomonsv(NanomonsvFormat),
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, ToSchema)]
+pub enum Info {
+    #[schema(value_type=String)]
+    DeepVariant(DeepVariantInfo),
+    #[schema(value_type=String)]
+    ClairS(ClairSInfo),
+    #[schema(value_type=String)]
+    Sniffles(SnifflesInfo),
+    #[schema(value_type=String)]
+    Nanomonsv(NanomonsvInfo),
+}
+
+fn parse_info(s: &str, source: &VCFSource) -> Result<Info> {
+    match source {
+        VCFSource::DeepVariant => Ok(Info::DeepVariant(s.parse()?)),
+        VCFSource::ClairS => Ok(Info::ClairS(s.parse()?)),
+        VCFSource::Sniffles => Ok(Info::Sniffles(s.parse()?)),
+        VCFSource::Nanomonsv => Ok(Info::Nanomonsv(s.parse()?)),
+    }
+}
+
+fn parse_format(vcf_source: &VCFSource, data: &str) -> Result<Format> {
+    let res = match vcf_source {
+        VCFSource::DeepVariant => Format::DeepVariant(data.parse()?),
+        VCFSource::ClairS => Format::ClairS(data.parse()?),
+        VCFSource::Sniffles => Format::Sniffles(data.parse()?),
+        VCFSource::Nanomonsv => Format::Nanomonsv(data.parse()?),
+    };
+    Ok(res)
+}
+
+
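
A small sketch of the ReferenceAlternative parser defined above, assuming the variant submodule is enabled in src/variant/mod.rs (it is still commented out in this commit). The behaviour follows directly from the FromStr impl: a single recognised base becomes Nucleotide, several become Nucleotides, and anything containing an unknown byte falls back to Unstructured.

use std::str::FromStr;
use crate::variant::variant::ReferenceAlternative;

fn demo() -> anyhow::Result<()> {
    let snv = ReferenceAlternative::from_str("A")?;       // Nucleotide(Base::A)
    let indel = ReferenceAlternative::from_str("ACGT")?;  // Nucleotides([A, C, G, T])
    let sv = ReferenceAlternative::from_str("<DUP>")?;    // Unstructured("<DUP>")
    println!("{snv} {indel} {sv}");
    Ok(())
}
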

+ 118 - 0
src/vcf_reader.rs

@@ -0,0 +1,118 @@
+use std::{fs::File, io::BufReader};
+
+use csv::ReaderBuilder;
+use pandora_lib_variants::variants::Variant;
+
+use crate::{callers::Caller, variant::variant::VariantType};
+
+#[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
+pub struct VCFRow {
+    pub chr: String,
+    pub pos: u32,
+    pub id: String,
+    pub reference: String,
+    pub alt: String,
+    pub qual: String,
+    pub filter: String,
+    pub info: String,
+    pub format: String,
+    pub value: String,
+}
+
+pub fn read_vcf(
+    path: &str,
+    caller: &Caller,
+    variant_type: &VariantType,
+) -> anyhow::Result<Vec<Variant>> {
+    let mut reader = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .comment(Some(b'#'))
+        .has_headers(false)
+        .flexible(true)
+        .from_reader(get_reader(path)?);
+    let iter = reader.deserialize();
+
+    let mut all = Vec::new();
+
+    // should be replaced with bcftools
+    for result in iter {
+        let record: VCFRow = result?;
+
+        // Normalize into multirows
+        if record.alt.contains(",") {
+            let alts = record.alt.split(',').collect::<Vec<&str>>();
+            let n = alts.len();
+
+            let vals = record.value.split(':').collect::<Vec<&str>>();
+            let ads = vals.get(3).unwrap().split(',').collect::<Vec<&str>>();
+            let vafs = vals.get(4).unwrap().split(',').collect::<Vec<&str>>();
+            let pls = vals.get(5).unwrap().split(',').collect::<Vec<&str>>();
+
+            for i in 0..n {
+                let cp = &pls[(i * 3)..(i * 3 + 3)];
+                let nval = format!(
+                    "{}:{}:{}:{}:{}:{}",
+                    vals[0],
+                    vals[1],
+                    vals[2],
+                    [ads[0], ads[i + 1]].join(","),
+                    vafs[i],
+                    cp.join(",")
+                );
+
+                let mut rec = record.clone();
+                rec.value = nval.clone();
+                rec.alt = alts[i].to_string();
+
+                all.push(rec);
+            }
+        } else {
+            all.push(record);
+        }
+    }
+
+    let base_n = "N".to_string();
+    let res: Vec<Variant> = all
+        .par_iter_mut()
+        .map(|row| {
+            // for Sniffles normalize insertion/deletion position (after the pos)
+            if caller == &VCFSource::Sniffles && row.reference == base_n && row.alt.len() > 1 {
+                row.pos -= 1;
+            }
+            let mut v = Variant::from_vcfrow(row, caller.clone(), variant_type.clone()).unwrap();
+            v.get_depth();
+            v.get_n_alt();
+            v
+        })
+        .filter(|v| {
+            for cd in v.callers_data.iter() {
+                if cd.should_filter() {
+                    return false;
+                }
+            }
+            true
+        })
+        .collect();
+
+    Ok(res)
+}
+
+pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
+    let file_type = *path.split(".").collect::<Vec<&str>>().last()?;
+    assert!(file_type == "gz" || file_type == "vcf");
+
+    let raw_reader: Box<dyn std::io::Read> = Box::new(File::open(path)?);
+
+    match file_type {
+        "gz" => {
+            let reader = Box::new(BGZFReader::new(raw_reader)?);
+            Ok(Box::new(BufReader::new(reader)))
+        }
+        "vcf" => {
+            Ok(Box::new(BufReader::new(raw_reader)))
+        }
+        t => {
+            panic!("unknown file type: {}", t)
+        }
+    }
+}

Not all files are shown because too many files changed in this diff