mod.rs 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739
  1. pub mod cosmic;
  2. pub mod echtvar;
  3. pub mod gnomad;
  4. pub mod ncbi;
  5. pub mod vep;
  6. pub mod alpha_genome;
  7. use std::{
  8. collections::{HashMap, HashSet},
  9. fmt,
  10. fs::File,
  11. io::{Read, Write},
  12. str::FromStr,
  13. sync::{
  14. atomic::{AtomicU32, Ordering},
  15. Arc,
  16. },
  17. };
  18. use crate::{
  19. helpers::{mean, Blake3BuildHasher, Hash128},
  20. variant::{variant_collection::VariantCollection, vcf_variant::AlterationCategory},
  21. };
  22. use bitcode::{Decode, Encode};
  23. use cosmic::Cosmic;
  24. use dashmap::DashMap;
  25. use gnomad::GnomAD;
  26. use log::info;
  27. use rayon::prelude::*;
  28. use serde::{Deserialize, Serialize};
  29. use vep::{get_best_vep, VEP};
  30. /// Represents various types of annotations that can be associated with a variant.
  31. ///
  32. /// These annotations cover caller-specific metadata, biological properties, database hits,
  33. /// and computed statistics like entropy or trinucleotide context.
  34. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
  35. pub enum Annotation {
  36. /// Annotation from a specific variant caller and associated sample type.
  37. Callers(Caller, Sample),
  38. /// Categorization of the alteration (e.g., SNV, indel, etc.).
  39. AlterationCategory(AlterationCategory),
  40. /// Shannon entropy of the surrounding genomic region or read distribution.
  41. ShannonEntropy(f64),
  42. /// Depth of coverage in the constitutional (normal) sample.
  43. ConstitDepth(u16),
  44. /// Alternate allele count in the constitutional (normal) sample.
  45. ConstitAlt(u16),
  46. /// Flag indicating low depth in the constitutional sample.
  47. LowConstitDepth,
  48. /// Flag indicating high alternate allele count in the constitutional sample.
  49. HighConstitAlt,
  50. /// COSMIC database hit (cancer-associated mutation).
  51. Cosmic(Cosmic),
  52. /// GnomAD population frequency database annotation.
  53. GnomAD(GnomAD),
  54. /// Flag indicating low Shannon entropy (possibly less reliable region).
  55. LowEntropy,
  56. /// Trinucleotide context surrounding the variant.
  57. TriNucleotides([Base; 3]),
  58. /// Variant Effect Predictor (VEP) annotations.
  59. VEP(Vec<VEP>),
  60. /// Timing of replication for the variant's genomic position.
  61. ReplicationTiming(ReplicationClass),
  62. /// Variant in an High Depth position
  63. HighDepth,
  64. /// Variant in the given panel ranges
  65. Panel(String),
  66. /// Variant at a CpG
  67. CpG,
  68. /// Variant in a region of low alignement quality
  69. LowMAPQ,
  70. /// Variant in a variable number of tandem repeat locus
  71. VNTR,
  72. /// RepeatMasker
  73. Repeat,
  74. }
  75. /// Denotes the biological sample type associated with a variant call.
  76. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
  77. pub enum Sample {
  78. /// Tumor-only sample without matched normal.
  79. SoloTumor,
  80. /// Constitutional (normal) sample without matched tumor.
  81. SoloConstit,
  82. /// Variant observed in germline context.
  83. Germline,
  84. /// Variant identified as somatic (tumor-specific).
  85. Somatic,
  86. }
  87. /// A nucleotide base used for representing DNA sequence.
  88. ///
  89. /// Includes the four standard bases (A, T, C, G) and `N` for ambiguous or unknown bases.
  90. #[derive(Copy, Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
  91. pub enum Base {
  92. /// Adenine
  93. A,
  94. /// Thymine
  95. T,
  96. /// Cytosine
  97. C,
  98. /// Guanine
  99. G,
  100. /// Unknown or ambiguous nucleotide
  101. N,
  102. }
  103. impl fmt::Display for Base {
  104. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  105. write!(
  106. f,
  107. "{}",
  108. match self {
  109. Base::A => "A",
  110. Base::T => "T",
  111. Base::C => "C",
  112. Base::G => "G",
  113. Base::N => "N",
  114. }
  115. )
  116. }
  117. }
  118. /// Helper to convert a 3-base string into a [Base; 3] array.
  119. /// Returns `None` if any base is invalid.
  120. pub fn parse_trinuc(s: &str) -> [Base; 3] {
  121. fn char_to_base(c: char) -> Base {
  122. match c.to_ascii_uppercase() {
  123. 'A' => Base::A,
  124. 'T' => Base::T,
  125. 'C' => Base::C,
  126. 'G' => Base::G,
  127. _ => Base::N,
  128. }
  129. }
  130. let chars: Vec<Base> = s.chars().map(char_to_base).collect();
  131. [chars[0], chars[1], chars[2]]
  132. }
  133. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
  134. pub enum ReplicationClass {
  135. Early,
  136. Late,
  137. }
  138. impl fmt::Display for Sample {
  139. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  140. write!(
  141. f,
  142. "{}",
  143. match self {
  144. Sample::SoloTumor => "SoloTumor",
  145. Sample::SoloConstit => "SoloConstit",
  146. Sample::Germline => "Germline",
  147. Sample::Somatic => "Somatic",
  148. }
  149. )
  150. }
  151. }
  152. impl FromStr for Sample {
  153. type Err = anyhow::Error;
  154. fn from_str(s: &str) -> anyhow::Result<Self> {
  155. match s {
  156. "SoloTumor" => Ok(Sample::SoloTumor),
  157. "SoloConstit" => Ok(Sample::SoloConstit),
  158. "Germline" => Ok(Sample::Germline),
  159. "Somatic" => Ok(Sample::Somatic),
  160. _ => Err(anyhow::anyhow!("Can't parse Sample from {s}")),
  161. }
  162. }
  163. }
  164. /// Implements string formatting for `Annotation`, providing a human-readable summary of each variant.
  165. ///
  166. /// This is primarily used for debugging, logging, or text-based output of annotated variants.
  167. /// For most variants, a short descriptive label is used. Some variants include additional detail,
  168. /// such as base content or sample-specific information.
  169. impl fmt::Display for Annotation {
  170. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  171. use Annotation::*;
  172. let s = match self {
  173. Callers(caller, sample) => format!("{caller} {sample}"),
  174. AlterationCategory(alt_cat) => alt_cat.to_string(),
  175. ShannonEntropy(_) => "ShannonEntropy".into(),
  176. ConstitDepth(_) => "ConstitDepth".into(),
  177. ConstitAlt(_) => "ConstitAlt".into(),
  178. LowConstitDepth => "LowConstitDepth".into(),
  179. HighConstitAlt => "HighConstitAlt".into(),
  180. Cosmic(_) => "Cosmic".into(),
  181. GnomAD(_) => "GnomAD".into(),
  182. LowEntropy => "LowEntropy".into(),
  183. VEP(_) => "VEP".into(),
  184. CpG => "CpG".into(),
  185. VNTR => "VNTR".into(),
  186. Repeat => "Repeat".into(),
  187. TriNucleotides(bases) => format!(
  188. "Trinucleotides({})",
  189. bases.iter().map(|b| b.to_string()).collect::<String>(),
  190. ),
  191. ReplicationTiming(rt) => match rt {
  192. ReplicationClass::Early => "ReplicationEarly".into(),
  193. ReplicationClass::Late => "ReplicationLate".into(),
  194. },
  195. HighDepth => "HighDepth".into(),
  196. Panel(name) => format!("Panel_{name}"),
  197. LowMAPQ => "LowMAPQ".to_string(),
  198. };
  199. write!(f, "{}", s)
  200. }
  201. }
  202. impl FromStr for Annotation {
  203. type Err = anyhow::Error;
  204. fn from_str(s: &str) -> anyhow::Result<Self> {
  205. if s.contains(" ") {
  206. let (caller_str, sample_str) = s
  207. .split_once(" ")
  208. .ok_or_else(|| anyhow::anyhow!("Can't parse {s}"))?;
  209. Ok(Annotation::Callers(
  210. caller_str.parse()?,
  211. sample_str.parse()?,
  212. ))
  213. } else {
  214. match s {
  215. s if s.starts_with("ShannonEntropy") => Ok(Annotation::ShannonEntropy(0.0)),
  216. s if s.starts_with("ConstitDepth") => Ok(Annotation::ConstitDepth(0)),
  217. s if s.starts_with("ConstitAlt") => Ok(Annotation::ConstitAlt(0)),
  218. "LowConstitDepth" => Ok(Annotation::LowConstitDepth),
  219. "HighConstitAlt" => Ok(Annotation::HighConstitAlt),
  220. _ => Err(anyhow::anyhow!("Unknown Annotation: {}", s)),
  221. }
  222. }
  223. }
  224. }
  225. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
  226. pub enum Caller {
  227. DeepVariant,
  228. ClairS,
  229. NanomonSV,
  230. NanomonSVSolo,
  231. Savana,
  232. Severus,
  233. DeepSomatic,
  234. }
  235. impl fmt::Display for Caller {
  236. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  237. use Caller::*;
  238. match self {
  239. DeepVariant => write!(f, "DeepVariant"),
  240. ClairS => write!(f, "ClairS"),
  241. NanomonSV => write!(f, "NanomonSV"),
  242. NanomonSVSolo => write!(f, "NanomonSV-solo"),
  243. Savana => write!(f, "Savana"),
  244. Severus => write!(f, "Severus"),
  245. DeepSomatic => write!(f, "DeepSomatic"),
  246. }
  247. }
  248. }
  249. impl FromStr for Caller {
  250. type Err = anyhow::Error;
  251. fn from_str(s: &str) -> anyhow::Result<Self> {
  252. use Caller::*;
  253. match s {
  254. "DeepVariant" => Ok(DeepVariant),
  255. "ClairS" => Ok(ClairS),
  256. "NanomonSV" => Ok(NanomonSV),
  257. "NanomonSV-solo" => Ok(NanomonSVSolo),
  258. "Savana" => Ok(Savana),
  259. "Severus" => Ok(Severus),
  260. "DeepSomatic" => Ok(DeepSomatic),
  261. _ => Err(anyhow::anyhow!("Can't parse Caller {s}")),
  262. }
  263. }
  264. }
  265. #[derive(Debug, Clone, Serialize, Deserialize)]
  266. pub struct Annotations {
  267. pub store: DashMap<Hash128, Vec<Annotation>, Blake3BuildHasher>,
  268. }
  269. impl Default for Annotations {
  270. fn default() -> Self {
  271. Annotations {
  272. store: DashMap::with_hasher(Blake3BuildHasher),
  273. }
  274. }
  275. }
  276. #[derive(Debug, Default, Clone, Serialize, Deserialize)]
  277. pub struct AnnotationsStats {
  278. pub categorical: DashMap<String, u64>,
  279. pub numeric: DashMap<String, HashMap<String, Vec<f64>>>,
  280. }
  281. impl AnnotationsStats {
  282. pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> {
  283. let json = serde_json::to_string_pretty(self)?;
  284. let mut file = File::create(file_path)?;
  285. file.write_all(json.as_bytes())?;
  286. Ok(())
  287. }
  288. pub fn load_from_json(file_path: &str) -> anyhow::Result<Self> {
  289. let mut file = File::open(file_path)?;
  290. let mut contents = String::new();
  291. file.read_to_string(&mut contents)?;
  292. let stats: AnnotationsStats = serde_json::from_str(&contents)?;
  293. Ok(stats)
  294. }
  295. }
  296. #[allow(clippy::type_complexity)]
  297. impl Annotations {
  298. pub fn insert_update(&self, key: Hash128, add: &[Annotation]) {
  299. self.store
  300. .entry(key)
  301. .or_default()
  302. .extend(add.iter().cloned())
  303. }
  304. pub fn callers_stat(
  305. &self,
  306. annotations: Option<Box<dyn Fn(&Annotation) -> bool + Send + Sync>>,
  307. ) -> AnnotationsStats {
  308. use Annotation::*;
  309. let map: DashMap<String, u64> = DashMap::new();
  310. let num_maps: DashMap<String, HashMap<String, Vec<f64>>> = DashMap::new();
  311. self.store.par_iter().for_each(|e| {
  312. let anns = if let Some(sel) = &annotations {
  313. e.value().iter().filter(|a| sel(a)).cloned().collect()
  314. } else {
  315. e.value().clone()
  316. };
  317. let mut categorical = Vec::new();
  318. let mut numerical = Vec::new();
  319. for ann in anns.iter() {
  320. match ann {
  321. LowConstitDepth | LowEntropy | GnomAD(_) | VEP(_) | TriNucleotides(_)
  322. | ReplicationTiming(_) | HighDepth | CpG | VNTR | Repeat | Panel(_)
  323. | LowMAPQ | HighConstitAlt => categorical.push(ann.to_string()),
  324. Callers(caller, sample) => categorical.push(format!("{caller} {sample}")),
  325. ShannonEntropy(v) => numerical.push((ann.to_string(), *v)),
  326. ConstitDepth(v) | Annotation::ConstitAlt(v) => {
  327. numerical.push((ann.to_string(), *v as f64));
  328. }
  329. Cosmic(c) => numerical.push((ann.to_string(), c.cosmic_cnt as f64)),
  330. AlterationCategory(alt_cat) => categorical.push(alt_cat.to_string()),
  331. }
  332. }
  333. categorical.sort();
  334. categorical.dedup();
  335. let k = categorical.join(" + ");
  336. *map.entry(k.clone()).or_default() += 1;
  337. for (k_num, v_num) in numerical {
  338. num_maps
  339. .entry(k.clone())
  340. .or_default()
  341. .entry(k_num)
  342. .or_default()
  343. .push(v_num);
  344. }
  345. });
  346. println!("\nCallers stats:");
  347. println!("\tn categories: {}", map.len());
  348. let mut n = 0;
  349. let lines: Vec<String> = map
  350. .iter()
  351. .map(|e| {
  352. let k = e.key();
  353. let v = e.value();
  354. n += v;
  355. let mut num_str = Vec::new();
  356. if let Some(nums) = num_maps.get(k) {
  357. num_str.extend(
  358. nums.iter()
  359. .map(|(k_n, v_n)| format!("{k_n} {:.2}", mean(v_n))),
  360. )
  361. }
  362. num_str.sort();
  363. format!("\t- {k}\t{v}\t{}", num_str.join("\t"))
  364. })
  365. .collect();
  366. println!("{}", lines.join("\n"));
  367. println!("Total\t{n}\n");
  368. AnnotationsStats {
  369. categorical: map,
  370. numeric: num_maps,
  371. }
  372. }
  373. pub fn vep_stats(&self) -> anyhow::Result<VepStats> {
  374. let genes: DashMap<String, u32> = DashMap::new();
  375. let genes_distant: DashMap<String, u32> = DashMap::new();
  376. let features: DashMap<String, u32> = DashMap::new();
  377. let consequences: DashMap<String, u32> = DashMap::new();
  378. let impact: DashMap<String, u32> = DashMap::new();
  379. let n_all = Arc::new(AtomicU32::new(0));
  380. let n_vep = Arc::new(AtomicU32::new(0));
  381. self.store.par_iter().for_each(|e| {
  382. n_all.fetch_add(1, Ordering::SeqCst);
  383. let best_vep_opt = e.value().iter().find_map(|a| {
  384. if let Annotation::VEP(veps) = a {
  385. get_best_vep(veps).ok()
  386. } else {
  387. None
  388. }
  389. });
  390. if let Some(best_vep) = best_vep_opt {
  391. n_vep.fetch_add(1, Ordering::SeqCst);
  392. if let Some(gene) = best_vep.gene {
  393. if best_vep.extra.distance.is_some() {
  394. *genes_distant.entry(gene).or_default() += 1;
  395. } else {
  396. *genes.entry(gene).or_default() += 1;
  397. }
  398. }
  399. if let Some(feature) = best_vep.feature {
  400. *features.entry(feature).or_default() += 1;
  401. }
  402. if let Some(cs) = best_vep.consequence {
  403. let mut cs = cs.into_iter().map(String::from).collect::<Vec<String>>();
  404. cs.sort();
  405. *consequences.entry(cs.join(" ")).or_default() += 1;
  406. }
  407. if let Some(imp) = best_vep.extra.impact {
  408. *impact.entry(String::from(imp)).or_default() += 1;
  409. }
  410. }
  411. });
  412. let vep_stats = VepStats {
  413. genes: genes.into_iter().collect(),
  414. genes_distant: genes_distant.into_iter().collect(),
  415. features: features.into_iter().collect(),
  416. consequences: consequences.into_iter().collect(),
  417. impact: impact.into_iter().collect(),
  418. n_all: n_all.load(Ordering::SeqCst),
  419. n_vep: n_vep.load(Ordering::SeqCst),
  420. };
  421. Ok(vep_stats)
  422. }
  423. pub fn get_keys_filter(
  424. &self,
  425. filter: impl Fn(&Vec<Annotation>) -> bool + Send + Sync,
  426. ) -> Vec<Hash128> {
  427. self.store
  428. .par_iter()
  429. .filter(|entry| filter(entry.value()))
  430. .map(|entry| *entry.key())
  431. .collect()
  432. }
  433. /// Retain only the variants that pass a given filter across both the
  434. /// internal store and the provided collections.
  435. ///
  436. /// # Arguments
  437. ///
  438. /// * `variants` – Mutable reference to a list of `VariantCollection`s,
  439. /// each of which contains a set of called variants.
  440. /// * `filter` – Predicate applied to each entry of `self.store`
  441. /// (value = `Vec<Annotation>`). If it returns `true`, the entry and
  442. /// all variants with the same key are kept; otherwise they are removed.
  443. ///
  444. /// # Behavior
  445. ///
  446. /// 1. `self.store` is filtered in-place by applying `filter`.
  447. /// 2. The keys of surviving entries are collected into a `HashSet`.
  448. /// 3. Each `VariantCollection` in `variants` is pruned in parallel so
  449. /// that only variants whose `hash()` is in the key set remain.
  450. /// 4. Empty `VariantCollection`s are discarded.
  451. /// 5. Progress information (number of variants removed per caller) is
  452. /// logged via `info!`.
  453. ///
  454. /// # Returns
  455. ///
  456. /// The total number of variants removed across all collections.
  457. ///
  458. /// # Notes
  459. ///
  460. /// * Keys must be `Copy` or `Clone` to be inserted into the temporary
  461. /// `HashSet`.
  462. /// * Logging inside the parallel loop may interleave between threads,
  463. /// but totals remain correct.
  464. pub fn retain_variants(
  465. &mut self,
  466. variants: &mut Vec<VariantCollection>,
  467. filter: impl Fn(&Vec<Annotation>) -> bool + Send + Sync,
  468. ) -> usize {
  469. let mut keys = HashSet::new();
  470. self.store.retain(|key, value| {
  471. if filter(value) {
  472. keys.insert(*key);
  473. true
  474. } else {
  475. false
  476. }
  477. });
  478. info!("{} unique Variants to keep", keys.len());
  479. let n_removed: usize = variants
  480. .par_iter_mut()
  481. .map(|c| {
  482. let before = c.variants.len();
  483. c.variants.retain(|a| keys.contains(&a.hash));
  484. let after = c.variants.len();
  485. info!("\t- {}\t{}/{}", c.caller, before - after, before);
  486. before - after
  487. })
  488. .sum();
  489. variants.retain(|e| !e.variants.is_empty());
  490. info!("{n_removed} variants removed from collections.");
  491. n_removed
  492. }
  493. pub fn retain_keys(&mut self, keys_to_keep: &HashSet<Hash128>) {
  494. self.store.retain(|key, _| keys_to_keep.contains(key));
  495. }
  496. pub fn remove_keys(&mut self, keys_to_remove: &HashSet<Hash128>) {
  497. self.store.retain(|key, _| !keys_to_remove.contains(key));
  498. }
  499. /// Adds constitutional-boundary annotations to variants lacking somatic evidence.
  500. ///
  501. /// This method scans all stored variant entries and identifies those that:
  502. /// - have tumor-only support (`Sample::SoloTumor`), and
  503. /// - lack any somatic call (`Sample::Somatic`).
  504. ///
  505. /// For such variants, it conditionally appends boundary annotations:
  506. /// - [`Annotation::LowConstitDepth`] if any constitutional depth
  507. /// (`Annotation::ConstitDepth`) is below `min_constit_depth`;
  508. /// - [`Annotation::HighConstitAlt`] if any constitutional alternate allele count
  509. /// (`Annotation::ConstitAlt`) exceeds `max_alt_constit`.
  510. ///
  511. /// Each boundary annotation is added at most once per variant.
  512. pub fn somatic_constit_boundaries(&self, max_alt_constit: u16, min_constit_depth: u16) {
  513. for mut entry in self.store.iter_mut() {
  514. let anns = entry.value_mut();
  515. // tumor but no somatic
  516. // let has_tumor = anns
  517. // .iter()
  518. // .any(|a| matches!(a, Annotation::Callers(_, Sample::SoloTumor)));
  519. // let has_somatic = anns
  520. // .iter()
  521. // .any(|a| matches!(a, Annotation::Callers(_, Sample::Somatic)));
  522. // if has_tumor && !has_somatic {
  523. // push at most once
  524. if anns
  525. .iter()
  526. .any(|a| matches!(a, Annotation::ConstitDepth(d) if *d < min_constit_depth))
  527. {
  528. anns.push(Annotation::LowConstitDepth);
  529. }
  530. if anns
  531. .iter()
  532. .any(|a| matches!(a, Annotation::ConstitAlt(a0) if *a0 > max_alt_constit))
  533. {
  534. anns.push(Annotation::HighConstitAlt);
  535. }
  536. // }
  537. }
  538. }
  539. pub fn count_annotations(&self, annotation_types: Vec<Annotation>) -> Vec<usize> {
  540. let annotation_types = Arc::new(annotation_types);
  541. self.store
  542. .par_iter()
  543. .fold(
  544. || vec![0; annotation_types.len()],
  545. |mut counts, r| {
  546. let annotations = r.value();
  547. for (index, annotation_type) in annotation_types.iter().enumerate() {
  548. counts[index] +=
  549. annotations.iter().filter(|a| *a == annotation_type).count();
  550. }
  551. counts
  552. },
  553. )
  554. .reduce(
  555. || vec![0; annotation_types.len()],
  556. |mut a, b| {
  557. for i in 0..a.len() {
  558. a[i] += b[i];
  559. }
  560. a
  561. },
  562. )
  563. }
  564. /// Annotate low shannon ent for solo callers (not Somatic)
  565. pub fn low_shannon_entropy(&mut self, min_shannon_entropy: f64) {
  566. self.store.iter_mut().for_each(|mut e| {
  567. let anns = e.value_mut();
  568. if anns.iter().any(|ann| {
  569. matches!(ann, Annotation::ShannonEntropy(ent) if *ent < min_shannon_entropy)
  570. && !anns
  571. .iter()
  572. .any(|a| matches!(a, Annotation::Callers(_, Sample::Somatic)))
  573. }) && !anns.contains(&Annotation::LowEntropy)
  574. {
  575. anns.push(Annotation::LowEntropy);
  576. }
  577. });
  578. }
  579. }
  580. #[derive(Debug, Serialize, Deserialize)]
  581. pub struct VepStats {
  582. pub genes: Vec<(String, u32)>,
  583. pub genes_distant: Vec<(String, u32)>,
  584. pub features: Vec<(String, u32)>,
  585. pub consequences: Vec<(String, u32)>,
  586. pub impact: Vec<(String, u32)>,
  587. pub n_all: u32,
  588. pub n_vep: u32,
  589. }
  590. impl VepStats {
  591. pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> {
  592. let json = serde_json::to_string_pretty(self)?;
  593. let mut file = File::create(file_path)?;
  594. file.write_all(json.as_bytes())?;
  595. Ok(())
  596. }
  597. pub fn load_from_json(file_path: &str) -> anyhow::Result<Self> {
  598. let mut file = File::open(file_path)?;
  599. let mut contents = String::new();
  600. file.read_to_string(&mut contents)?;
  601. let stats: Self = serde_json::from_str(&contents)?;
  602. Ok(stats)
  603. }
  604. }
  605. pub trait CallerCat {
  606. fn caller_cat(&self) -> Annotation;
  607. }
  608. /// Determines whether a variant annotation meets either of the following criteria:
  609. ///
  610. /// 1. It is observed in gnomAD with a non-zero allele frequency *and* has
  611. /// at least one constitutive alternate read.
  612. /// 2. All variant calls are from `SoloTumor` samples and there is at least one
  613. /// constitutive alternate read.
  614. ///
  615. /// This function is typically used to identify variants that are either likely
  616. /// germline (based on population frequency and constitutive signal) or artifacts
  617. /// (appearing only in tumor calls but with constitutive support).
  618. ///
  619. /// # Arguments
  620. ///
  621. /// * `anns` - A slice of `Annotation` values describing variant annotations,
  622. /// such as gnomAD frequency, sample types, and alternate counts.
  623. ///
  624. /// # Returns
  625. ///
  626. /// * `true` if the variant matches either of the criteria described above.
  627. /// * `false` otherwise.
  628. ///
  629. /// # Example
  630. ///
  631. /// ```rust
  632. /// let anns = vec![
  633. /// Annotation::GnomAD(GnomAD { gnomad_af: 0.01 }),
  634. /// Annotation::ConstitAlt(2),
  635. /// ];
  636. /// assert!(is_gnomad_and_constit_alt(&anns));
  637. /// ```
  638. pub fn is_gnomad_and_constit_alt(anns: &[Annotation]) -> bool {
  639. let mut has_gnomad = false;
  640. let mut constit_alt_count = 0;
  641. let mut n_callers = 0;
  642. let mut n_tumor_callers = 0;
  643. for ann in anns {
  644. match ann {
  645. Annotation::GnomAD(g) if g.gnomad_af > 0.0 => has_gnomad = true,
  646. Annotation::ConstitAlt(n) => constit_alt_count += *n,
  647. Annotation::Callers(_, Sample::SoloTumor) => {
  648. n_callers += 1;
  649. n_tumor_callers += 1;
  650. }
  651. Annotation::Callers(..) => {
  652. n_callers += 1;
  653. }
  654. _ => {}
  655. }
  656. }
  657. let has_constit_alt = constit_alt_count > 0;
  658. let only_tumor_with_constit = n_callers > 0 && n_callers == n_tumor_callers && has_constit_alt;
  659. (has_gnomad && has_constit_alt) || only_tumor_with_constit
  660. }