pub mod cosmic; pub mod echtvar; pub mod gnomad; pub mod ncbi; pub mod vep; pub mod alpha_genome; use std::{ collections::{HashMap, HashSet}, fmt, fs::File, io::{Read, Write}, str::FromStr, sync::{ atomic::{AtomicU32, Ordering}, Arc, }, }; use crate::{ helpers::{mean, Blake3BuildHasher, Hash128}, variant::{variant_collection::VariantCollection, vcf_variant::AlterationCategory}, }; use bitcode::{Decode, Encode}; use cosmic::Cosmic; use dashmap::DashMap; use gnomad::GnomAD; use log::info; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use vep::{get_best_vep, VEP}; /// Represents various types of annotations that can be associated with a variant. /// /// These annotations cover caller-specific metadata, biological properties, database hits, /// and computed statistics like entropy or trinucleotide context. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)] pub enum Annotation { /// Annotation from a specific variant caller and associated sample type. Callers(Caller, Sample), /// Categorization of the alteration (e.g., SNV, indel, etc.). AlterationCategory(AlterationCategory), /// Shannon entropy of the surrounding genomic region or read distribution. ShannonEntropy(f64), /// Depth of coverage in the constitutional (normal) sample. ConstitDepth(u16), /// Alternate allele count in the constitutional (normal) sample. ConstitAlt(u16), /// Flag indicating low depth in the constitutional sample. LowConstitDepth, /// Flag indicating high alternate allele count in the constitutional sample. HighConstitAlt, /// COSMIC database hit (cancer-associated mutation). Cosmic(Cosmic), /// GnomAD population frequency database annotation. GnomAD(GnomAD), /// Flag indicating low Shannon entropy (possibly less reliable region). LowEntropy, /// Trinucleotide context surrounding the variant. TriNucleotides([Base; 3]), /// Variant Effect Predictor (VEP) annotations. VEP(Vec), /// Timing of replication for the variant's genomic position. ReplicationTiming(ReplicationClass), /// Variant in an High Depth position HighDepth, /// Variant in the given panel ranges Panel(String), /// Variant at a CpG CpG, /// Variant in a region of low alignement quality LowMAPQ, /// Variant in a variable number of tandem repeat locus VNTR, /// RepeatMasker Repeat, } /// Denotes the biological sample type associated with a variant call. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)] pub enum Sample { /// Tumor-only sample without matched normal. SoloTumor, /// Constitutional (normal) sample without matched tumor. SoloConstit, /// Variant observed in germline context. Germline, /// Variant identified as somatic (tumor-specific). Somatic, } /// A nucleotide base used for representing DNA sequence. /// /// Includes the four standard bases (A, T, C, G) and `N` for ambiguous or unknown bases. #[derive(Copy, Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)] pub enum Base { /// Adenine A, /// Thymine T, /// Cytosine C, /// Guanine G, /// Unknown or ambiguous nucleotide N, } impl fmt::Display for Base { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", match self { Base::A => "A", Base::T => "T", Base::C => "C", Base::G => "G", Base::N => "N", } ) } } /// Helper to convert a 3-base string into a [Base; 3] array. /// Returns `None` if any base is invalid. pub fn parse_trinuc(s: &str) -> [Base; 3] { fn char_to_base(c: char) -> Base { match c.to_ascii_uppercase() { 'A' => Base::A, 'T' => Base::T, 'C' => Base::C, 'G' => Base::G, _ => Base::N, } } let chars: Vec = s.chars().map(char_to_base).collect(); [chars[0], chars[1], chars[2]] } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)] pub enum ReplicationClass { Early, Late, } impl fmt::Display for Sample { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", match self { Sample::SoloTumor => "SoloTumor", Sample::SoloConstit => "SoloConstit", Sample::Germline => "Germline", Sample::Somatic => "Somatic", } ) } } impl FromStr for Sample { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "SoloTumor" => Ok(Sample::SoloTumor), "SoloConstit" => Ok(Sample::SoloConstit), "Germline" => Ok(Sample::Germline), "Somatic" => Ok(Sample::Somatic), _ => Err(anyhow::anyhow!("Can't parse Sample from {s}")), } } } /// Implements string formatting for `Annotation`, providing a human-readable summary of each variant. /// /// This is primarily used for debugging, logging, or text-based output of annotated variants. /// For most variants, a short descriptive label is used. Some variants include additional detail, /// such as base content or sample-specific information. impl fmt::Display for Annotation { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use Annotation::*; let s = match self { Callers(caller, sample) => format!("{caller} {sample}"), AlterationCategory(alt_cat) => alt_cat.to_string(), ShannonEntropy(_) => "ShannonEntropy".into(), ConstitDepth(_) => "ConstitDepth".into(), ConstitAlt(_) => "ConstitAlt".into(), LowConstitDepth => "LowConstitDepth".into(), HighConstitAlt => "HighConstitAlt".into(), Cosmic(_) => "Cosmic".into(), GnomAD(_) => "GnomAD".into(), LowEntropy => "LowEntropy".into(), VEP(_) => "VEP".into(), CpG => "CpG".into(), VNTR => "VNTR".into(), Repeat => "Repeat".into(), TriNucleotides(bases) => format!( "Trinucleotides({})", bases.iter().map(|b| b.to_string()).collect::(), ), ReplicationTiming(rt) => match rt { ReplicationClass::Early => "ReplicationEarly".into(), ReplicationClass::Late => "ReplicationLate".into(), }, HighDepth => "HighDepth".into(), Panel(name) => format!("Panel_{name}"), LowMAPQ => "LowMAPQ".to_string(), }; write!(f, "{}", s) } } impl FromStr for Annotation { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { if s.contains(" ") { let (caller_str, sample_str) = s .split_once(" ") .ok_or_else(|| anyhow::anyhow!("Can't parse {s}"))?; Ok(Annotation::Callers( caller_str.parse()?, sample_str.parse()?, )) } else { match s { s if s.starts_with("ShannonEntropy") => Ok(Annotation::ShannonEntropy(0.0)), s if s.starts_with("ConstitDepth") => Ok(Annotation::ConstitDepth(0)), s if s.starts_with("ConstitAlt") => Ok(Annotation::ConstitAlt(0)), "LowConstitDepth" => Ok(Annotation::LowConstitDepth), "HighConstitAlt" => Ok(Annotation::HighConstitAlt), _ => Err(anyhow::anyhow!("Unknown Annotation: {}", s)), } } } } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)] pub enum Caller { DeepVariant, ClairS, NanomonSV, NanomonSVSolo, Savana, Severus, DeepSomatic, } impl fmt::Display for Caller { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use Caller::*; match self { DeepVariant => write!(f, "DeepVariant"), ClairS => write!(f, "ClairS"), NanomonSV => write!(f, "NanomonSV"), NanomonSVSolo => write!(f, "NanomonSV-solo"), Savana => write!(f, "Savana"), Severus => write!(f, "Severus"), DeepSomatic => write!(f, "DeepSomatic"), } } } impl FromStr for Caller { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { use Caller::*; match s { "DeepVariant" => Ok(DeepVariant), "ClairS" => Ok(ClairS), "NanomonSV" => Ok(NanomonSV), "NanomonSV-solo" => Ok(NanomonSVSolo), "Savana" => Ok(Savana), "Severus" => Ok(Severus), "DeepSomatic" => Ok(DeepSomatic), _ => Err(anyhow::anyhow!("Can't parse Caller {s}")), } } } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Annotations { pub store: DashMap, Blake3BuildHasher>, } impl Default for Annotations { fn default() -> Self { Annotations { store: DashMap::with_hasher(Blake3BuildHasher), } } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct AnnotationsStats { pub categorical: DashMap, pub numeric: DashMap>>, } impl AnnotationsStats { pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> { let json = serde_json::to_string_pretty(self)?; let mut file = File::create(file_path)?; file.write_all(json.as_bytes())?; Ok(()) } pub fn load_from_json(file_path: &str) -> anyhow::Result { let mut file = File::open(file_path)?; let mut contents = String::new(); file.read_to_string(&mut contents)?; let stats: AnnotationsStats = serde_json::from_str(&contents)?; Ok(stats) } } #[allow(clippy::type_complexity)] impl Annotations { pub fn insert_update(&self, key: Hash128, add: &[Annotation]) { self.store .entry(key) .or_default() .extend(add.iter().cloned()) } pub fn callers_stat( &self, annotations: Option bool + Send + Sync>>, ) -> AnnotationsStats { use Annotation::*; let map: DashMap = DashMap::new(); let num_maps: DashMap>> = DashMap::new(); self.store.par_iter().for_each(|e| { let anns = if let Some(sel) = &annotations { e.value().iter().filter(|a| sel(a)).cloned().collect() } else { e.value().clone() }; let mut categorical = Vec::new(); let mut numerical = Vec::new(); for ann in anns.iter() { match ann { LowConstitDepth | LowEntropy | GnomAD(_) | VEP(_) | TriNucleotides(_) | ReplicationTiming(_) | HighDepth | CpG | VNTR | Repeat | Panel(_) | LowMAPQ | HighConstitAlt => categorical.push(ann.to_string()), Callers(caller, sample) => categorical.push(format!("{caller} {sample}")), ShannonEntropy(v) => numerical.push((ann.to_string(), *v)), ConstitDepth(v) | Annotation::ConstitAlt(v) => { numerical.push((ann.to_string(), *v as f64)); } Cosmic(c) => numerical.push((ann.to_string(), c.cosmic_cnt as f64)), AlterationCategory(alt_cat) => categorical.push(alt_cat.to_string()), } } categorical.sort(); categorical.dedup(); let k = categorical.join(" + "); *map.entry(k.clone()).or_default() += 1; for (k_num, v_num) in numerical { num_maps .entry(k.clone()) .or_default() .entry(k_num) .or_default() .push(v_num); } }); println!("\nCallers stats:"); println!("\tn categories: {}", map.len()); let mut n = 0; let lines: Vec = map .iter() .map(|e| { let k = e.key(); let v = e.value(); n += v; let mut num_str = Vec::new(); if let Some(nums) = num_maps.get(k) { num_str.extend( nums.iter() .map(|(k_n, v_n)| format!("{k_n} {:.2}", mean(v_n))), ) } num_str.sort(); format!("\t- {k}\t{v}\t{}", num_str.join("\t")) }) .collect(); println!("{}", lines.join("\n")); println!("Total\t{n}\n"); AnnotationsStats { categorical: map, numeric: num_maps, } } pub fn vep_stats(&self) -> anyhow::Result { let genes: DashMap = DashMap::new(); let genes_distant: DashMap = DashMap::new(); let features: DashMap = DashMap::new(); let consequences: DashMap = DashMap::new(); let impact: DashMap = DashMap::new(); let n_all = Arc::new(AtomicU32::new(0)); let n_vep = Arc::new(AtomicU32::new(0)); self.store.par_iter().for_each(|e| { n_all.fetch_add(1, Ordering::SeqCst); let best_vep_opt = e.value().iter().find_map(|a| { if let Annotation::VEP(veps) = a { get_best_vep(veps).ok() } else { None } }); if let Some(best_vep) = best_vep_opt { n_vep.fetch_add(1, Ordering::SeqCst); if let Some(gene) = best_vep.gene { if best_vep.extra.distance.is_some() { *genes_distant.entry(gene).or_default() += 1; } else { *genes.entry(gene).or_default() += 1; } } if let Some(feature) = best_vep.feature { *features.entry(feature).or_default() += 1; } if let Some(cs) = best_vep.consequence { let mut cs = cs.into_iter().map(String::from).collect::>(); cs.sort(); *consequences.entry(cs.join(" ")).or_default() += 1; } if let Some(imp) = best_vep.extra.impact { *impact.entry(String::from(imp)).or_default() += 1; } } }); let vep_stats = VepStats { genes: genes.into_iter().collect(), genes_distant: genes_distant.into_iter().collect(), features: features.into_iter().collect(), consequences: consequences.into_iter().collect(), impact: impact.into_iter().collect(), n_all: n_all.load(Ordering::SeqCst), n_vep: n_vep.load(Ordering::SeqCst), }; Ok(vep_stats) } pub fn get_keys_filter( &self, filter: impl Fn(&Vec) -> bool + Send + Sync, ) -> Vec { self.store .par_iter() .filter(|entry| filter(entry.value())) .map(|entry| *entry.key()) .collect() } /// Retain only the variants that pass a given filter across both the /// internal store and the provided collections. /// /// # Arguments /// /// * `variants` – Mutable reference to a list of `VariantCollection`s, /// each of which contains a set of called variants. /// * `filter` – Predicate applied to each entry of `self.store` /// (value = `Vec`). If it returns `true`, the entry and /// all variants with the same key are kept; otherwise they are removed. /// /// # Behavior /// /// 1. `self.store` is filtered in-place by applying `filter`. /// 2. The keys of surviving entries are collected into a `HashSet`. /// 3. Each `VariantCollection` in `variants` is pruned in parallel so /// that only variants whose `hash()` is in the key set remain. /// 4. Empty `VariantCollection`s are discarded. /// 5. Progress information (number of variants removed per caller) is /// logged via `info!`. /// /// # Returns /// /// The total number of variants removed across all collections. /// /// # Notes /// /// * Keys must be `Copy` or `Clone` to be inserted into the temporary /// `HashSet`. /// * Logging inside the parallel loop may interleave between threads, /// but totals remain correct. pub fn retain_variants( &mut self, variants: &mut Vec, filter: impl Fn(&Vec) -> bool + Send + Sync, ) -> usize { let mut keys = HashSet::new(); self.store.retain(|key, value| { if filter(value) { keys.insert(*key); true } else { false } }); info!("{} unique Variants to keep", keys.len()); let n_removed: usize = variants .par_iter_mut() .map(|c| { let before = c.variants.len(); c.variants.retain(|a| keys.contains(&a.hash)); let after = c.variants.len(); info!("\t- {}\t{}/{}", c.caller, before - after, before); before - after }) .sum(); variants.retain(|e| !e.variants.is_empty()); info!("{n_removed} variants removed from collections."); n_removed } pub fn retain_keys(&mut self, keys_to_keep: &HashSet) { self.store.retain(|key, _| keys_to_keep.contains(key)); } pub fn remove_keys(&mut self, keys_to_remove: &HashSet) { self.store.retain(|key, _| !keys_to_remove.contains(key)); } /// Adds constitutional-boundary annotations to variants lacking somatic evidence. /// /// This method scans all stored variant entries and identifies those that: /// - have tumor-only support (`Sample::SoloTumor`), and /// - lack any somatic call (`Sample::Somatic`). /// /// For such variants, it conditionally appends boundary annotations: /// - [`Annotation::LowConstitDepth`] if any constitutional depth /// (`Annotation::ConstitDepth`) is below `min_constit_depth`; /// - [`Annotation::HighConstitAlt`] if any constitutional alternate allele count /// (`Annotation::ConstitAlt`) exceeds `max_alt_constit`. /// /// Each boundary annotation is added at most once per variant. pub fn somatic_constit_boundaries(&self, max_alt_constit: u16, min_constit_depth: u16) { for mut entry in self.store.iter_mut() { let anns = entry.value_mut(); // tumor but no somatic // let has_tumor = anns // .iter() // .any(|a| matches!(a, Annotation::Callers(_, Sample::SoloTumor))); // let has_somatic = anns // .iter() // .any(|a| matches!(a, Annotation::Callers(_, Sample::Somatic))); // if has_tumor && !has_somatic { // push at most once if anns .iter() .any(|a| matches!(a, Annotation::ConstitDepth(d) if *d < min_constit_depth)) { anns.push(Annotation::LowConstitDepth); } if anns .iter() .any(|a| matches!(a, Annotation::ConstitAlt(a0) if *a0 > max_alt_constit)) { anns.push(Annotation::HighConstitAlt); } // } } } pub fn count_annotations(&self, annotation_types: Vec) -> Vec { let annotation_types = Arc::new(annotation_types); self.store .par_iter() .fold( || vec![0; annotation_types.len()], |mut counts, r| { let annotations = r.value(); for (index, annotation_type) in annotation_types.iter().enumerate() { counts[index] += annotations.iter().filter(|a| *a == annotation_type).count(); } counts }, ) .reduce( || vec![0; annotation_types.len()], |mut a, b| { for i in 0..a.len() { a[i] += b[i]; } a }, ) } /// Annotate low shannon ent for solo callers (not Somatic) pub fn low_shannon_entropy(&mut self, min_shannon_entropy: f64) { self.store.iter_mut().for_each(|mut e| { let anns = e.value_mut(); if anns.iter().any(|ann| { matches!(ann, Annotation::ShannonEntropy(ent) if *ent < min_shannon_entropy) && !anns .iter() .any(|a| matches!(a, Annotation::Callers(_, Sample::Somatic))) }) && !anns.contains(&Annotation::LowEntropy) { anns.push(Annotation::LowEntropy); } }); } } #[derive(Debug, Serialize, Deserialize)] pub struct VepStats { pub genes: Vec<(String, u32)>, pub genes_distant: Vec<(String, u32)>, pub features: Vec<(String, u32)>, pub consequences: Vec<(String, u32)>, pub impact: Vec<(String, u32)>, pub n_all: u32, pub n_vep: u32, } impl VepStats { pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> { let json = serde_json::to_string_pretty(self)?; let mut file = File::create(file_path)?; file.write_all(json.as_bytes())?; Ok(()) } pub fn load_from_json(file_path: &str) -> anyhow::Result { let mut file = File::open(file_path)?; let mut contents = String::new(); file.read_to_string(&mut contents)?; let stats: Self = serde_json::from_str(&contents)?; Ok(stats) } } pub trait CallerCat { fn caller_cat(&self) -> Annotation; } /// Determines whether a variant annotation meets either of the following criteria: /// /// 1. It is observed in gnomAD with a non-zero allele frequency *and* has /// at least one constitutive alternate read. /// 2. All variant calls are from `SoloTumor` samples and there is at least one /// constitutive alternate read. /// /// This function is typically used to identify variants that are either likely /// germline (based on population frequency and constitutive signal) or artifacts /// (appearing only in tumor calls but with constitutive support). /// /// # Arguments /// /// * `anns` - A slice of `Annotation` values describing variant annotations, /// such as gnomAD frequency, sample types, and alternate counts. /// /// # Returns /// /// * `true` if the variant matches either of the criteria described above. /// * `false` otherwise. /// /// # Example /// /// ```rust /// let anns = vec![ /// Annotation::GnomAD(GnomAD { gnomad_af: 0.01 }), /// Annotation::ConstitAlt(2), /// ]; /// assert!(is_gnomad_and_constit_alt(&anns)); /// ``` pub fn is_gnomad_and_constit_alt(anns: &[Annotation]) -> bool { let mut has_gnomad = false; let mut constit_alt_count = 0; let mut n_callers = 0; let mut n_tumor_callers = 0; for ann in anns { match ann { Annotation::GnomAD(g) if g.gnomad_af > 0.0 => has_gnomad = true, Annotation::ConstitAlt(n) => constit_alt_count += *n, Annotation::Callers(_, Sample::SoloTumor) => { n_callers += 1; n_tumor_callers += 1; } Annotation::Callers(..) => { n_callers += 1; } _ => {} } } let has_constit_alt = constit_alt_count > 0; let only_tumor_with_constit = n_callers > 0 && n_callers == n_tumor_callers && has_constit_alt; (has_gnomad && has_constit_alt) || only_tumor_with_constit }