Forráskód Böngészése

entropy filter // echtvar annotate

Thomas 11 hónapja
szülő
commit
add4a2ecc9

+ 1 - 0
Cargo.lock

@@ -3624,6 +3624,7 @@ dependencies = [
  "noodles-core",
  "noodles-csi",
  "noodles-fasta",
+ "noodles-gff",
  "num-format",
  "pandora_lib_assembler",
  "pandora_lib_bindings",

+ 1 - 0
Cargo.toml

@@ -51,3 +51,4 @@ blake3 = "1.5.5"
 charming = { version = "0.4.0", features = ["ssr"] }
 rusqlite = { version = "0.32.1", features = ["chrono", "serde_json"] }
 dirs = "5.0.1"
+noodles-gff = "0.41.0"

BIN
prob.vcf.gz


+ 1 - 0
src/annotation/echtvar.rs

@@ -11,6 +11,7 @@ use super::{cosmic::Cosmic, gnomad::GnomAD};
 // /data/tools/echtvar anno -e /data/ref/hs1/CosmicCodingMuts.echtvar.zip -e /data/ref/hs1/gnomAD_4-2022_10-gnomad.echtvar.zip BENGUIRAT_diag_clairs_PASSED.vcf.gz test.bcf
 pub fn run_echtvar(in_path: &str, out_path: &str) -> Result<()> {
     let bin_dir = "/data/tools";
+    // let _ = Command::new("tabix").arg(in_path).spawn()?.wait()?;
 
     let annot_sources: Vec<&str> = [
         "/data/ref/hs1/CosmicCodingMuts.echtvar.zip",

+ 20 - 0
src/annotation/mod.rs

@@ -1,6 +1,8 @@
 pub mod cosmic;
 pub mod echtvar;
 pub mod gnomad;
+pub mod vep;
+pub mod ncbi;
 
 use std::{
     collections::{HashMap, HashSet},
@@ -29,6 +31,7 @@ pub enum Annotation {
     HighConstitAlt,
     Cosmic(Cosmic),
     GnomAD(GnomAD),
+    LowEntropy,
 }
 
 impl fmt::Display for Annotation {
@@ -46,6 +49,7 @@ impl fmt::Display for Annotation {
             Annotation::HighConstitAlt => "HighConstitAlt",
             Annotation::Cosmic(_) => "Cosmic",
             Annotation::GnomAD(_) => "GnomAD",
+            Annotation::LowEntropy => "LowEntropy",
         };
         write!(f, "{}", str)
     }
@@ -122,6 +126,7 @@ impl Annotations {
                     | Annotation::Germline
                     | Annotation::Somatic
                     | Annotation::LowConstitDepth
+                    | Annotation::LowEntropy
                     | Annotation::GnomAD(_)
                     | Annotation::HighConstitAlt => categorical.push(ann.to_string()),
                     Annotation::Callers(caller) => categorical.push(caller.to_string()),
@@ -250,4 +255,19 @@ impl Annotations {
                 },
             )
     }
+
+    pub fn low_shannon_entropy(&mut self, min_shannon_entropy: f64) {
+        self.store.iter_mut().for_each(|mut e| {
+            let anns = e.value_mut();
+            let mut is_low = false;
+            anns.iter().for_each(|ann| if let Annotation::ShannonEntropy(ent) = ann {
+                if *ent < min_shannon_entropy && !anns.contains(&Annotation::Somatic) {
+                    is_low = true
+                }
+            });
+            if is_low {
+                anns.push(Annotation::LowEntropy);
+            }
+        });
+    }
 }

+ 86 - 0
src/annotation/ncbi.rs

@@ -0,0 +1,86 @@
+use anyhow::{Context, Ok, Result};
+use serde::{Deserialize, Serialize};
+use std::str::FromStr;
+
/// Subset of attributes extracted from one NCBI GFF record
/// (populated by the `From<noodles_gff::RecordBuf>` impl below).
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
pub struct NCBIGFF {
    // GFF `type` column (feature kind).
    pub feature: String,
    // `Name` attribute, when present.
    pub name: Option<String>,
    // `standard_name` attribute, when present.
    pub standard_name: Option<String>,
    // `function` attribute, when present.
    pub function: Option<String>,
    // `experiment` attribute, when present.
    pub experiment: Option<String>,
    // `Note` attribute, when present.
    pub note: Option<String>,
    // `regulatory_class` attribute, when present.
    pub regulatory_class: Option<String>,
}
+
+impl From<noodles_gff::RecordBuf> for NCBIGFF {
+    fn from(r: noodles_gff::RecordBuf) -> Self {
+        let attr = r.attributes();
+
+        let inner_string = |name: &str| {
+            attr.get(name).map(|e| match e {
+                noodles_gff::record_buf::attributes::field::Value::String(s) => s.to_string(),
+                noodles_gff::record_buf::attributes::field::Value::Array(v) => v.join(" "),
+            })
+        };
+
+        NCBIGFF {
+            feature: r.ty().to_string(),
+            name: inner_string("Name"),
+            standard_name: inner_string("standard_name"),
+            function: inner_string("function"),
+            experiment: inner_string("experiment"),
+            note: inner_string("Note"),
+            regulatory_class: inner_string("regulatory_class"),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct NCBIAcc {
+    pub prefix: String,
+    pub number: u64,
+    pub version: f32,
+}
+
+impl FromStr for NCBIAcc {
+    type Err = anyhow::Error;
+    fn from_str(s: &str) -> Result<Self> {
+        if s.contains("unassigned_transcript_") {
+            let s = s.replace("unassigned_transcript_", "");
+            let (num, v) = if s.contains("_") {
+                s.split_once("_").context("first split error")?
+            } else {
+                (s.as_str(), "0")
+            };
+            Ok(NCBIAcc {
+                prefix: "unassigned_transcript".to_string(),
+                number: num.parse().context("Error parsing NCBI accession number")?,
+                version: v.parse().context("Error parsing NCBI accession version")?,
+            })
+        } else if s.contains("_") {
+            if s.contains(".") {
+                let (rest, v) = s.split_once(".").context("first split error")?;
+                let (pref, num) = rest.split_once("_").context("second split error")?;
+                let v = v.replace("_", ".");
+                Ok(NCBIAcc {
+                    prefix: pref.to_string(),
+                    number: num.parse().context("Error parsing NCBI accession number")?,
+                    version: v.parse().context("Error parsing NCBI accession version")?,
+                })
+            } else {
+                Ok(NCBIAcc {
+                    prefix: s.to_string(),
+                    number: u64::MAX,
+                    version: 0.0,
+                })
+            }
+        } else {
+            Ok(NCBIAcc {
+                prefix: s.to_string(),
+                number: u64::MAX,
+                version: 0.0,
+            })
+        }
+    }
+}

+ 367 - 0
src/annotation/vep.rs

@@ -0,0 +1,367 @@
+use anyhow::{anyhow, Context, Ok, Result};
+use csv::ReaderBuilder;
+use hashbrown::HashMap;
+use log::warn;
+use serde::{Deserialize, Serialize};
+use std::io::Write;
+use std::{
+    env::temp_dir,
+    fs::{self, File},
+    io::{BufRead, BufReader},
+    process::{Command, Stdio},
+    str::FromStr,
+};
+
+use crate::io::vcf::vcf_header;
+
+use super::ncbi::NCBIAcc;
+
/// One raw row of VEP's tab-separated output, with every column kept as a
/// plain string. VEP writes `-` for empty columns; the conversion to the
/// typed [`VEP`] struct maps those to `None`.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct VEPLine {
    // `#Uploaded_variation` column (used as the variant key downstream).
    pub uploaded_variation: String,
    pub location: String,
    pub allele: String,
    pub gene: String,
    pub feature: String,
    pub feature_type: String,
    // Comma-separated consequence terms.
    pub consequence: String,
    pub cdna_position: String,
    pub cds_position: String,
    pub protein_position: String,
    pub amino_acids: String,
    pub codons: String,
    pub existing_variation: String,
    // Semicolon-separated `KEY=VALUE` pairs, parsed into `VEPExtra`.
    pub extra: String,
}
+
/// Typed VEP annotation for a single variant/feature pair.
/// `None` marks columns that were `-` (empty) in the raw output.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct VEP {
    pub gene: Option<String>,
    // Feature accession (parsed into `NCBIAcc` when ranking candidates).
    pub feature: Option<String>,
    pub feature_type: Option<String>,
    // Consequence terms, split on `,` from the raw column.
    pub consequence: Option<Vec<String>>,
    pub cdna_position: Option<String>,
    pub cds_position: Option<String>,
    pub protein_position: Option<String>,
    pub amino_acids: Option<String>,
    pub codons: Option<String>,
    pub existing_variation: Option<String>,
    // Parsed `Extra` column (IMPACT, SYMBOL, HGVS, ...).
    pub extra: VEPExtra,
}
+
// ensembl.org/info/genome/variation/prediction/predicted_data.html
/// VEP consequence terms; the order appears to follow the severity ranking
/// of the Ensembl predicted-data documentation (most severe first).
// Variant names mirror the Sequence Ontology terms verbatim, hence the
// snake_case spelling; silence the lint explicitly instead of letting the
// compiler warn on every build.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq, Eq)]
pub enum VepConsequence {
    Transcript_ablation,
    Splice_acceptor_variant,
    Splice_donor_variant,
    Stop_gained,
    Frameshift_variant,
    Stop_lost,
    Start_lost,
    Transcript_amplification,
    Inframe_insertion,
    Inframe_deletion,
    Missense_variant,
    Protein_altering_variant,
    Splice_region_variant,
    Incomplete_terminal_codon_variant,
    Start_retained_variant,
    Stop_retained_variant,
    Synonymous_variant,
    Coding_sequence_variant,
    Mature_miRNA_variant,
    Five_prime_UTR_variant,
    Three_prime_UTR_variant,
    Non_coding_transcript_exon_variant,
    Intron_variant,
    NMD_transcript_variant,
    Non_coding_transcript_variant,
    Upstream_gene_variant,
    Downstream_gene_variant,
    TFBS_ablation,
    TFBS_amplification,
    TF_binding_site_variant,
    Regulatory_region_ablation,
    Regulatory_region_amplification,
    Feature_elongation,
    Regulatory_region_variant,
    Feature_truncation,
    Intergenic_variant,
}
+
+impl VEP {
+    fn from_vep_line(d: &VEPLine) -> Result<VEP> {
+        let or_opt = |s: &str| match s {
+            "-" => None,
+            _ => Some(s.to_string()),
+        };
+
+        let consequence = or_opt(&d.consequence)
+            .map(|c| c.split(",").map(|e| e.to_string()).collect::<Vec<String>>());
+
+        Ok(VEP {
+            gene: or_opt(&d.gene),
+            feature: or_opt(&d.feature),
+            feature_type: or_opt(&d.feature_type),
+            consequence,
+            cdna_position: or_opt(&d.feature_type),
+            cds_position: or_opt(&d.cds_position),
+            protein_position: or_opt(&d.protein_position),
+            amino_acids: or_opt(&d.amino_acids),
+            codons: or_opt(&d.codons),
+            existing_variation: or_opt(&d.existing_variation),
+            extra: d.extra.parse()?,
+        })
+    }
+}
+
/// Fields of interest pulled from VEP's semicolon-separated `Extra` column.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct VEPExtra {
    // `IMPACT=` severity rating, when present.
    pub impact: Option<VEPImpact>,
    // `SYMBOL=` gene symbol.
    pub symbol: Option<String>,
    // `DISTANCE=` value, parsed as an integer.
    pub distance: Option<u32>,
    // `HGVSc=` coding-sequence HGVS notation.
    pub hgvs_c: Option<String>,
    // `HGVSp=` protein HGVS notation.
    pub hgvs_p: Option<String>,
}
+impl FromStr for VEPExtra {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        let err = |c| anyhow!("Error {} parsing VEP Extra field {}", c, s);
+
+        let elements = s.split(";").collect::<Vec<&str>>();
+
+        let mut kv = HashMap::new();
+
+        for e in elements.iter() {
+            let (k, v) = e.split_once("=").ok_or(err("in split '='"))?;
+            if kv.insert(k, v).is_some() {
+                return Err(err("kv insert"));
+            };
+        }
+
+        let impact: Option<VEPImpact> = if let Some(v) = kv.get("IMPACT") {
+            Some(v.parse()?)
+        } else {
+            None
+        };
+        let symbol: Option<String> = kv.get("SYMBOL").map(|v| v.to_string());
+        let distance: Option<u32> = if let Some(v) = kv.get("DISTANCE") {
+            Some(v.parse()?)
+        } else {
+            None
+        };
+        let hgvs_c: Option<String> = kv.get("HGVSc").map(|v| v.to_string());
+        let hgvs_p: Option<String> = kv.get("HGVSp").map(|v| v.to_string());
+
+        Ok(VEPExtra {
+            impact,
+            symbol,
+            distance,
+            hgvs_c,
+            hgvs_p,
+        })
+    }
+}
+
/// VEP IMPACT rating, parsed from the upper-case strings
/// LOW / MODERATE / HIGH / MODIFIER in the Extra column.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum VEPImpact {
    Low,
    Moderate,
    High,
    Modifier,
}
+
+impl FromStr for VEPImpact {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        match s {
+            "LOW" => Ok(VEPImpact::Low),
+            "MODERATE" => Ok(VEPImpact::Moderate),
+            "HIGH" => Ok(VEPImpact::High),
+            "MODIFIER" => Ok(VEPImpact::Modifier),
+            _ => Err(anyhow!("Unexpected VEP Impact value")),
+        }
+    }
+}
+// pub fn vep_chunk(data: &mut [Variant]) -> Result<()> {
+//     let in_vcf = format!(
+//         "{}/vep_{}.vcf",
+//         temp_dir().to_str().unwrap(),
+//         uuid::Uuid::new_v4()
+//     );
+//     let out_vep = format!(
+//         "{}/vep_{}.txt",
+//         temp_dir().to_str().unwrap(),
+//         uuid::Uuid::new_v4()
+//     );
+//
+//     let mut vcf = File::create(&in_vcf).unwrap();
+//     let vcf_header = vcf_header("/data/ref/hs1/chm13v2.0.dict")?;
+//
+//     writeln!(vcf, "{}", vcf_header.join("\n")).unwrap();
+//
+//     for (i, row) in data.iter().enumerate() {
+//         writeln!(
+//             vcf,
+//             "{}\t{}\t{}\t{}\t{}\t.\tPASS\t.\t.\t.",
+//             row.contig,
+//             row.position,
+//             i + 1,
+//             row.reference,
+//             row.alternative
+//         )?;
+//     }
+//
+//     if let Err(err) = run_vep(&in_vcf, &out_vep) {
+//         panic!("{err}");
+//     };
+//
+//     // read the results in txt file, parse and add to HashMap
+//     let mut reader_vep = ReaderBuilder::new()
+//         .delimiter(b'\t')
+//         .has_headers(false)
+//         .comment(Some(b'#'))
+//         .flexible(true)
+//         .from_reader(fs::File::open(out_vep.clone())?);
+//
+//     let mut lines: HashMap<u64, Vec<VEPLine>> = HashMap::new();
+//     for line in reader_vep.deserialize::<VEPLine>() {
+//         if let std::result::Result::Ok(line) = line {
+//             if let std::result::Result::Ok(k) = line.uploaded_variation.parse::<u64>() {
+//                 lines
+//                     .raw_entry_mut()
+//                     .from_key(&k)
+//                     .or_insert_with(|| (k, vec![]))
+//                     .1
+//                     .push(line);
+//             } else {
+//                 return Err(anyhow!("Error while parsing: {:?}", line));
+//             }
+//         } else {
+//             return Err(anyhow!("Error while parsing: {:?}", line));
+//         }
+//     }
+//
+//     // remove input and result file
+//     fs::remove_file(in_vcf)?;
+//     fs::remove_file(out_vep)?;
+//
+//     let mut n_not_vep = 0;
+//     data.iter_mut().enumerate().for_each(|(i, entry)| {
+//         let k = (i + 1) as u64;
+//
+//         match lines.get(&k) {
+//             Some(vep_lines) => {
+//                 let vep: Vec<VEP> = vep_lines
+//                     .iter()
+//                     .map(|e| match VEP::from_vep_line(e) {
+//                         std::result::Result::Ok(r) => r,
+//                         Err(err) => panic!("Error while parsing: {} line: {:?}", err, e),
+//                     })
+//                     .collect();
+//                 entry.annotations.push(AnnotationType::VEP(vep.to_vec()));
+//             }
+//             None => {
+//                 n_not_vep += 1;
+//             }
+//         };
+//     });
+//
+//     if n_not_vep > 0 {
+//         warn!("{} variants not annotated by VEP", n_not_vep);
+//     }
+//
+//     Ok(())
+// }
+//
+// VEP need plugin Downstream and SpliceRegion /home/prom/.vep/Plugins
+fn run_vep(in_path: &str, out_path: &str) -> Result<()> {
+    let bin_dir = "/data/tools/ensembl-vep";
+    let dir_cache = "/data/ref/hs1/vepcache/";
+    let fasta = "/data/ref/hs1/chm13v2.0.fa";
+    let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz";
+    // let gff = "/data/ref/hs1/ncbi_dataset/data/GCF_009914755.1/genomic_chr_sorted.gff.gz";
+
+    // info!("Running VEP for {}", in_path);
+    let mut cmd = Command::new(format!("{}/vep", bin_dir))
+        .arg("--dir_cache")
+        .arg(dir_cache)
+        .arg("--cache")
+        .arg("--offline")
+        .arg("--fasta")
+        .arg(fasta)
+        .arg("--gff")
+        .arg(gff)
+        .arg("--symbol")
+        .arg("--plugin")
+        .arg("SpliceRegion")
+        .arg("--plugin")
+        .arg("Downstream")
+        .arg("--hgvs")
+        .arg("-i")
+        .arg(in_path)
+        .arg("-o")
+        .arg(out_path)
+        .stderr(Stdio::piped())
+        // .stderr(Stdio::null())
+        .spawn()
+        .expect("VEP failed to start");
+    // .stderr
+    // .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "Could not capture standard output.")).unwrap();
+
+    let stderr = cmd.stderr.take().unwrap();
+    let reader = BufReader::new(stderr);
+    reader
+        .lines()
+        .map_while(Result::ok)
+        // .inspect(|y| println!("{y}"))
+        .filter(|line| line.contains("error"))
+        .for_each(|line| warn!("{}", line));
+
+    cmd.wait()?;
+    Ok(())
+}
+
+pub fn get_best_vep(d: &[VEP]) -> Result<VEP> {
+    d.into_iter().filter(|v| v.)
+
+    if d.is_empty() {
+        return Err(anyhow!("No element in VEP vector"));
+    }
+    if d.len() == 1 {
+        return Ok(d.first().unwrap().clone());
+    }
+
+    let mut parsed: Vec<(usize, NCBIAcc)> = Vec::new();
+    for (i, vep) in d.iter().enumerate() {
+        if let Some(feat) = &vep.feature {
+            if let std::result::Result::Ok(f) = feat
+                .parse::<NCBIAcc>()
+                .context("Error parsing NCBI accession")
+            {
+                parsed.push((i, f));
+            } else {
+                warn!("Can't parse {}", feat);
+            }
+        }
+    }
+
+    parsed.sort_by(|(_, a), (_, b)| a.number.cmp(&b.number));
+
+    let nm: Vec<(usize, NCBIAcc)> = parsed
+        .clone()
+        .into_iter()
+        .filter(|(_, e)| e.prefix == *"NM")
+        .collect();
+
+    if !nm.is_empty() {
+        let (k, _) = nm.first().unwrap();
+        return Ok(d.get(*k).unwrap().clone());
+    } else {
+        let (k, _) = parsed.first().unwrap();
+        return Ok(d.get(*k).unwrap().clone());
+    }
+}

+ 2 - 0
src/config.rs

@@ -41,6 +41,7 @@ pub struct Config {
     pub mask_bed: String,
     pub solo_min_constit_depth: u16,
     pub solo_max_alt_constit: u16,
+    pub min_shannon_entropy: f64,
 }
 
 // Here comes names that can't be changed from output of tools
@@ -121,6 +122,7 @@ impl Default for Config {
             // Pipe
             solo_min_constit_depth: 5,
             solo_max_alt_constit: 1,
+            min_shannon_entropy: 1.0,
         }
     }
 }

+ 1 - 1
src/helpers.rs

@@ -233,7 +233,7 @@ pub fn par_intersection<T: Ord + Send + Sync + Clone>(
 
 pub fn get_temp_file_path() -> std::io::Result<PathBuf> {
     let temp_path = tempfile::Builder::new()
-        .prefix("my-temp-")
+        .prefix("pandora-temp-")
         .suffix(".tmp")
         .rand_bytes(5)
         .tempfile()?

+ 33 - 0
src/io/dict.rs

@@ -0,0 +1,33 @@
+use std::fs;
+use anyhow::{Context, Ok, Result};
+use csv::ReaderBuilder;
+use log::info;
+
+pub fn read_dict(path: &str) -> Result<Vec<(String, u32)>> {
+    info!("Parsing {path}.");
+
+    let mut reader = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .flexible(true)
+        .has_headers(false)
+        .from_reader(fs::File::open(path)?);
+
+    let mut res = Vec::new();
+    for line in reader.records() {
+        let line = line.context("Can't parse dict file.")?;
+        if line.get(0).context("Can't parse dict file.")? == "@SQ" {
+            res.push((
+                line.get(1)
+                    .map(|s| s.replace("SN:", ""))
+                    .context("Can't parse dict file.")?
+                    .parse()?,
+                line.get(2)
+                    .map(|s| s.replace("LN:", ""))
+                    .context("Can't parse dict file.")?
+                    .parse()?,
+            ));
+        }
+    }
+
+    Ok(res)
+}

+ 1 - 0
src/io/mod.rs

@@ -2,3 +2,4 @@ pub mod pod5_infos;
 pub mod readers;
 pub mod vcf;
 pub mod bed;
+pub mod dict;

+ 42 - 3
src/io/vcf.rs

@@ -1,4 +1,7 @@
-use std::{fs::File, io::{BufRead, BufReader, Write}};
+use std::{
+    fs::File,
+    io::{BufRead, BufReader, Write},
+};
 
 use anyhow::Context;
 use bgzip::{write::BGZFMultiThreadWriter, Compression};
@@ -6,7 +9,7 @@ use log::{info, warn};
 
 use crate::variant::variant::VcfVariant;
 
-use super::readers::get_reader;
+use super::{dict::read_dict, readers::get_reader};
 
 pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
     let reader = BufReader::new(get_reader(path)?);
@@ -35,11 +38,47 @@ pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
     writer.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
 
     for variant in variants {
-        writer.write_fmt(format_args!("{}\n", variant.commun_deepvariant_clairs().into_vcf_row()))?;
+        writer.write_fmt(format_args!(
+            "{}\n",
+            variant.commun_deepvariant_clairs().into_vcf_row()
+        ))?;
     }
 
     writer.close()?;
     Ok(())
 }
 
/// One data row of a VCF file, deserialized from a tab-separated record.
/// Field order mirrors the VCF column order
/// (CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE).
#[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
pub struct VCFRow {
    pub chr: String,
    pub pos: u32,
    pub id: String,
    pub reference: String,
    pub alt: String,
    pub qual: String,
    pub filter: String,
    pub info: String,
    pub format: String,
    // Single sample column (this pipeline writes one SAMPLE column).
    pub value: String,
}
+
+pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
+    let mut header = Vec::new();
+    // file format
+    header.push("##fileformat=VCFv4.2".to_string());
+    // Format
+    // Filter
+    // Info
+
+    // version
+    header.push(format!("##Pandora_version={}", env!("CARGO_PKG_VERSION")));
 
+    // contig
+    read_dict(dict)?
+        .into_iter()
+        .for_each(|(id, len)| header.push(format!("##contig=<ID={id},length={len}>")));
+
+    header.push("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE".to_string());
+
+    Ok(header)
+}

+ 112 - 12
src/pipes/somatic.rs

@@ -1,15 +1,9 @@
-use charming::{
-    element::{Emphasis, EmphasisFocus},
-    series::{Sankey, SankeyLink, SankeyNode},
-    Chart,
-};
-use hashbrown::HashMap;
 use log::info;
 use rayon::prelude::*;
-use std::{collections::HashSet, fs::File, sync::Arc};
+use std::{collections::HashSet, fs::File, io::Write, sync::Arc};
 
 use crate::{
-    annotation::{Annotation, Annotations, AnnotationsStats, Caller},
+    annotation::{Annotation, Annotations, Caller},
     callers::{clairs::ClairS, deep_variant::DeepVariant},
     collection::{Initialize, InitializeSolo},
     config::Config,
@@ -17,7 +11,7 @@ use crate::{
     runners::Run,
     variant::{
         variant::{load_variants, parallel_intersection, RunnerVariants, VcfVariant},
-        variant_collection::VariantCollection,
+        variant_collection::{ExternalAnnotation, VariantCollection},
     },
 };
 
@@ -44,6 +38,8 @@ pub struct SomaticStats {
     pub n_constit_germline: usize,
     pub n_low_constit: usize,
     pub n_high_alt_constit: usize,
+    pub n_high_alt_constit_gnomad: usize,
+    pub n_low_entropies: usize,
 }
 
 #[derive(Debug, Default, Clone)]
@@ -235,7 +231,6 @@ impl Run for Somatic {
             .collect();
 
         annotations.remove_keys(&high_alt_keys);
-
         somatic_stats.n_high_alt_constit = variants_collection
             .par_iter_mut()
             .map(|c| {
@@ -247,7 +242,6 @@ impl Run for Somatic {
                 rm
             })
             .sum();
-
         variants_collection.retain(|e| !e.variants.is_empty());
 
         variants_collection.iter().for_each(|c| c.stats());
@@ -261,6 +255,65 @@ impl Run for Somatic {
         });
         annotations.callers_stat();
 
+        variants_collection
+            .iter()
+            .try_for_each(|c| -> anyhow::Result<()> {
+                let ext_annot = ExternalAnnotation::init()?;
+                ext_annot.annotate(&c.variants, &annotations)?;
+                Ok(())
+            })?;
+
+        annotations.callers_stat();
+
+        let high_alt_gnomad_keys: HashSet<u128> = annotations
+            .get_keys_filter(|anns| {
+                anns.iter()
+                    .find_map(|a| {
+                        if let Annotation::GnomAD(gnomad) = a {
+                            Some(gnomad.gnomad_af > 0.0)
+                        } else {
+                            None
+                        }
+                    })
+                    .and_then(|gnomad_condition| {
+                        anns.iter()
+                            .find_map(|a| {
+                                if let Annotation::ConstitAlt(n_alt) = a {
+                                    Some(*n_alt > 0)
+                                } else {
+                                    None
+                                }
+                            })
+                            .map(|constit_alt_condition| gnomad_condition && constit_alt_condition)
+                    })
+                    .unwrap_or(false)
+            })
+            .into_iter()
+            .collect();
+
+        info!(
+            "Filtering out {} positions, with constit alt <= max contig alt ({}) and in GnomAD.",
+            high_alt_gnomad_keys.len(),
+            self.config.solo_max_alt_constit
+        );
+
+        annotations.remove_keys(&high_alt_gnomad_keys);
+        somatic_stats.n_high_alt_constit_gnomad = variants_collection
+            .par_iter_mut()
+            .map(|c| {
+                let before = c.variants.len();
+                c.remove_keys(&high_alt_gnomad_keys);
+                let after = c.variants.len();
+                let rm = before - after;
+                info!("Variants removed from {}: {rm}", c.vcf.path.display(),);
+                rm
+            })
+            .sum();
+        variants_collection.retain(|e| !e.variants.is_empty());
+        info!(
+            "{} variants filtered, with constit alt <= max contig alt ({}) and in GnomAD.",
+            somatic_stats.n_high_alt_constit_gnomad, self.config.solo_max_alt_constit
+        );
 
         let prob_keys: HashSet<u128> = annotations
             .get_keys_filter(|anns| {
@@ -282,9 +335,56 @@ impl Run for Somatic {
                 e.variants.clone()
             })
             .collect();
-
         write_vcf(&problematic_variants, "prob.vcf.gz")?;
 
+        // Entropies
+        let entropies: Vec<f64> = problematic_variants
+            .iter()
+            .flat_map(|v| {
+                annotations
+                    .store
+                    .get(&v.hash_variant())
+                    .unwrap()
+                    .iter()
+                    .flat_map(|a| {
+                        if let Annotation::ShannonEntropy(ent) = a {
+                            vec![*ent]
+                        } else {
+                            Vec::new()
+                        }
+                    })
+                    .collect::<Vec<_>>()
+            })
+            .collect();
+        let json_string = serde_json::to_string(&entropies)?;
+        let mut file = File::create("/data/entropies.json")?;
+        file.write_all(json_string.as_bytes())?;
+
+
+        annotations.low_shannon_entropy(self.config.min_shannon_entropy);
+        let low_ent_keys: HashSet<u128> = annotations
+            .get_keys_filter(|anns| anns.contains(&Annotation::LowEntropy))
+            .into_iter()
+            .collect();
+
+        annotations.remove_keys(&low_ent_keys);
+        annotations.callers_stat();
+        somatic_stats.n_low_entropies = variants_collection
+            .par_iter_mut()
+            .map(|c| {
+                let before = c.variants.len();
+                c.remove_keys(&low_ent_keys);
+                let after = c.variants.len();
+                let rm = before - after;
+                info!("Variants removed from {}: {rm}", c.vcf.path.display());
+                rm
+            })
+            .sum();
+        variants_collection.retain(|e| !e.variants.is_empty());
+        // let tw: Vec<VcfVariant> = variants_collection.iter().flat_map(|c| c.variants.clone()).collect();
+        // write_vcf(&tw, "low_ent.vcf.gz")?;
+
+
         Ok(())
     }
 }

+ 197 - 39
src/variant/variant_collection.rs

@@ -1,17 +1,29 @@
-use std::collections::HashSet;
+use std::{collections::HashSet, fs::{self, File}, io::{Read, Write}};
 
+use anyhow::Context;
 use chrono::Utc;
+use csv::ReaderBuilder;
 use log::info;
 use rayon::prelude::*;
+use uuid::Uuid;
 
 use super::variant::{AlterationCategory, VcfVariant};
 use crate::{
-    annotation::{cosmic::Cosmic, gnomad::GnomAD, Annotation, Annotations},
+    annotation::{
+        cosmic::Cosmic,
+        echtvar::{parse_echtvar_val, run_echtvar},
+        gnomad::GnomAD,
+        Annotation, Annotations,
+    },
     collection::{
         bam::{counts_at, counts_ins_at},
         vcf::Vcf,
     },
-    helpers::{app_storage_dir, estimate_shannon_entropy},
+    helpers::{app_storage_dir, estimate_shannon_entropy, get_temp_file_path},
+    io::{
+        readers::get_reader,
+        vcf::{vcf_header, write_vcf},
+    },
     pipes::somatic::sequence_at,
 };
 
@@ -124,6 +136,9 @@ impl VariantCollection {
                             if seq == alt_seq {
                                 n_alt = n;
                             }
+                            if seq == *"D" && alt_seq != *"D" {
+                                continue;
+                            }
                             depth += n;
                         }
 
@@ -167,7 +182,7 @@ impl ExternalAnnotation {
         let conn = Connection::open(db_path)?;
 
         conn.execute(
-            "CREATE TABLE IF NOT EXISTS your_table (
+            "CREATE TABLE IF NOT EXISTS annotations (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             hash BLOB(16) UNIQUE NOT NULL,
             source TEXT NOT NULL,
@@ -184,53 +199,196 @@ impl ExternalAnnotation {
         &self,
         variants: &[VcfVariant],
         annotations: &Annotations,
-        source: ExtAnnotationSource,
     ) -> anyhow::Result<()> {
+        // Variants with at least one cache miss; they are batch-annotated via
+        // echtvar at the end instead of one external lookup per variant.
         let mut unfound = Vec::new();
-        let mut found_count = 0;
-
-        let source = match &source {
-            ExtAnnotationSource::Cosmic(_) => "COSMIC",
-            ExtAnnotationSource::GnomAD(_) => "GnomAD",
-        };
 
         for variant in variants {
             let hash: u128 = variant.hash_variant();
+            // Guard so a variant missing from BOTH sources is queued only once.
+            let mut has_pushed = false;
+
+            // Check COSMIC
+            match self.get_annotation(hash, "COSMIC")? {
+                Some(cosmic) => {
+                    annotations
+                        .store
+                        .entry(hash)
+                        .or_default()
+                        .push(Annotation::Cosmic(cosmic));
+                }
+                None => {
+                    unfound.push(variant.clone());
+                    has_pushed = true;
+                }
+            }
 
-            // Attempt to retrieve the data from the database
-            let result: SqliteResult<(Vec<u8>, String)> = self.conn.query_row(
-                "SELECT data, created_at FROM your_table WHERE hash = ? AND source = ?",
-                params![hash.to_le_bytes().to_vec(), source],
-                |row| Ok((row.get(0)?, row.get(1)?)),
-            );
-
-            match result {
-                Ok((data, created_at)) => {
-                    // Deserialize the data based on the source
-                    let annotation = match source {
-                        "COSMIC" => {
-                            let cosmic: Cosmic = serde_json::from_slice(&data)?;
-                            Annotation::Cosmic(cosmic)
-                        }
-                        "GnomAD" => {
-                            let gnomad: GnomAD = serde_json::from_slice(&data)?;
-                            Annotation::GnomAD(gnomad)
-                        }
-                        _ => return Err(anyhow::anyhow!("Unknown source: {}", source)),
-                    };
+            // Check GnomAD
+            // NOTE(review): a COSMIC hit combined with a GnomAD miss still queues
+            // the variant for echtvar, and process_unfound_echtvar pushes
+            // Annotation::Cosmic again for the same hash -- likely duplicate
+            // Cosmic entries in `annotations.store`; confirm intended.
+            match self.get_annotation(hash, "GnomAD")? {
+                Some(gnomad) => {
+                    annotations
+                        .store
+                        .entry(hash)
+                        .or_default()
+                        .push(Annotation::GnomAD(gnomad));
+                }
+                None => {
+                    if !has_pushed {
+                        unfound.push(variant.clone())
+                    }
+                }
+            }
+        }
+
+        // Batch-resolve everything the SQLite cache did not cover.
+        if !unfound.is_empty() {
+            self.process_unfound_echtvar(unfound, annotations)?;
+        }
 
-                    // Update the in-memory annotations
-                    annotations.store.entry(hash).or_default().push(annotation);
-                    found_count += 1;
+        Ok(())
+    }
+
+    /// Fetch the cached annotation blob for `(hash, source)` from the
+    /// `annotations` table and deserialize it from JSON into `T`.
+    ///
+    /// Returns `Ok(None)` when no row exists for that variant hash/source pair.
+    /// Propagates SQLite errors and `serde_json` deserialization failures.
+    fn get_annotation<T: serde::de::DeserializeOwned>(
+        &self,
+        hash: u128,
+        source: &str,
+    ) -> anyhow::Result<Option<T>> {
+        let result: SqliteResult<Vec<u8>> = self.conn.query_row(
+            "SELECT data FROM annotations WHERE hash = ? AND source = ?",
+            params![hash.to_le_bytes().to_vec(), source],
+            |row| row.get(0),
+        );
+
+        match result {
+            // Row found: decode the stored JSON into the requested type.
+            Ok(data) => Ok(Some(serde_json::from_slice(&data)?)),
+            // No cached entry -- not an error; the caller falls back to echtvar.
+            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
+            Err(e) => Err(e.into()),
+        }
+    }
+
+    /// Annotate cache-miss variants by round-tripping them through `echtvar anno`:
+    /// write each chunk to a temporary VCF, run echtvar, parse the annotated
+    /// output back, and push the resulting COSMIC/GnomAD annotations in memory.
+    pub fn process_unfound_echtvar(
+        &self,
+        mut unfound: Vec<VcfVariant>,
+        annotations: &Annotations,
+    ) -> anyhow::Result<()> {
+        let temp_dir = std::env::temp_dir();
+
+        // Sort + dedup so identical variants are annotated once.
+        unfound.par_sort();
+        unfound.dedup();
+
+        let header = vcf_header("/data/ref/hs1/chm13v2.0.dict")?.join("\n");
+
+        let results = unfound
+            // NOTE(review): `unfound.len() / 33` is 0 whenever fewer than 33
+            // variants are unfound, and `par_chunks(0)` panics -- guard with
+            // something like `(unfound.len() / 33).max(1)`.
+            .par_chunks(unfound.len() / 33)
+            // NOTE(review): flat_map over `anyhow::Result` followed by
+            // `.flatten()` silently discards any chunk whose closure returned
+            // `Err` -- I/O or echtvar failures never reach the caller.
+            .flat_map(|chunk| -> anyhow::Result<Vec<_>> {
+                // Unique temp paths so parallel chunks never collide.
+                let in_tmp = temp_dir.join(format!("echtvar_in_{}.vcf", Uuid::new_v4()));
+                let out_tmp = temp_dir.join(format!("echtvar_out_{}.vcf.gz", Uuid::new_v4()));
+
+                // Write input VCF
+                let mut vcf = File::create(&in_tmp)?;
+                writeln!(vcf, "{}", header)?;
+                for (i, row) in chunk.iter().enumerate() {
+                    // ID column carries the 1-based chunk index so output rows
+                    // can be matched back to input variants below.
+                    writeln!(
+                        vcf,
+                        "{}\t{}\t{}\t{}\t{}\t.\tPASS\t.\t.\t.",
+                        row.position.contig(),
+                        row.position.position + 1, // vcf
+                        i + 1,
+                        row.reference,
+                        row.alternative
+                    )?;
                 }
-                Err(rusqlite::Error::QueryReturnedNoRows) => {
-                    // Variant not found in the database
-                    unfound.push(variant.clone());
+
+                // Run echtvar
+                run_echtvar(in_tmp.to_str().unwrap(), out_tmp.to_str().unwrap())?;
+                fs::remove_file(in_tmp)?;
+
+                // Parse echtvar output
+                let mut reader = ReaderBuilder::new()
+                    .delimiter(b'\t')
+                    .has_headers(false)
+                    .comment(Some(b'#'))
+                    .flexible(true)
+                    .from_reader(get_reader(out_tmp.to_str().unwrap())?);
+
+                let mut chunk_results = Vec::new();
+                for (i, result) in reader.deserialize::<crate::io::vcf::VCFRow>().enumerate() {
+                    let row = result?;
+
+                    // Verify that the ID corresponds to the input
+                    // (assumes echtvar preserves row order and drops nothing --
+                    // a dropped row surfaces here as an ID mismatch error).
+                    let id: usize = row.id.parse().context("Failed to parse ID")?;
+                    if id != i + 1 {
+                        return Err(anyhow::anyhow!(
+                            "Echtvar output ID {} does not match expected ID {}",
+                            id,
+                            i + 1
+                        ));
+                    }
+
+                    let (cosmic, gnomad) = parse_echtvar_val(&row.info)?;
+
+                    let hash = chunk[i].hash_variant();
+
+                    chunk_results.push((hash, cosmic, gnomad));
                 }
-                Err(e) => return Err(e.into()),
+
+                fs::remove_file(out_tmp)?;
+                Ok(chunk_results)
+            })
+            .flatten()
+            .collect::<Vec<_>>();
+
+        // NOTE(review): the SQLite write-back below is commented out, so
+        // echtvar results are never persisted and the same variants are
+        // re-annotated on every run; `update_database` exists but is unused.
+        for (hash, cosmic, gnomad) in results {
+            // let created_at = chrono::Utc::now().to_rfc3339();
+
+            // Update COSMIC
+            // self.conn.execute(
+            //     "INSERT OR REPLACE INTO cosmic_annotations (hash, data, created_at) VALUES (?, ?, ?)",
+            //     params![
+            //         hash.to_le_bytes().to_vec(),
+            //         cosmic.as_ref().map(|c| serde_json::to_vec(c).unwrap()),
+            //         created_at
+            //     ],
+            // )?;
+            //
+            // // Update GnomAD
+            // self.conn.execute(
+            //     "INSERT OR REPLACE INTO gnomad_annotations (hash, data, created_at) VALUES (?, ?, ?)",
+            //     params![
+            //         hash.to_le_bytes().to_vec(),
+            //         gnomad.as_ref().map(|g| serde_json::to_vec(g).unwrap()),
+            //         created_at
+            //     ],
+            // )?;
+
+            // Update in-memory annotations
+            if let Some(c) = cosmic {
+                annotations
+                    .store
+                    .entry(hash)
+                    .or_default()
+                    .push(Annotation::Cosmic(c));
+            }
+            if let Some(g) = gnomad {
+                annotations
+                    .store
+                    .entry(hash)
+                    .or_default()
+                    .push(Annotation::GnomAD(g));
             }
         }
 
         Ok(())
     }
+
+    /// Upsert one serialized annotation blob keyed by `(variant hash, source)`
+    /// into the `annotations` cache table.
+    ///
+    /// `data` is the JSON-serialized annotation; `created_at` is a caller-supplied
+    /// timestamp string stored alongside it.
+    ///
+    /// NOTE(review): currently unused -- the persistence calls in
+    /// `process_unfound_echtvar` are commented out, so nothing ever writes
+    /// the cache back; wire this in or remove it.
+    pub fn update_database(
+        &self,
+        hash: u128,
+        source: &str,
+        data: &[u8],
+        created_at: &str,
+    ) -> anyhow::Result<()> {
+        self.conn.execute(
+            "INSERT OR REPLACE INTO annotations (hash, source, data, created_at) VALUES (?, ?, ?, ?)",
+            params![hash.to_le_bytes().to_vec(), source, data, created_at],
+        )?;
+        Ok(())
+    }
 }