mod.rs 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741
  1. pub mod alpha_genome;
  2. pub mod cosmic;
  3. pub mod echtvar;
  4. pub mod gnomad;
  5. pub mod ncbi;
  6. pub mod vep;
  7. use std::{
  8. collections::{HashMap, HashSet},
  9. fmt,
  10. fs::File,
  11. io::{Read, Write},
  12. str::FromStr,
  13. sync::{
  14. atomic::{AtomicU32, Ordering},
  15. Arc,
  16. },
  17. };
  18. use crate::{
  19. helpers::{mean, Blake3BuildHasher, Hash128},
  20. variant::{variant_collection::VariantCollection, vcf_variant::AlterationCategory},
  21. };
  22. use bitcode::{Decode, Encode};
  23. use cosmic::Cosmic;
  24. use dashmap::DashMap;
  25. use gnomad::GnomAD;
  26. use log::info;
  27. use rayon::prelude::*;
  28. use serde::{Deserialize, Serialize};
  29. use vep::{get_best_vep, VEP};
  30. /// Represents various types of annotations that can be associated with a variant.
  31. ///
  32. /// These annotations cover caller-specific metadata, biological properties, database hits,
  33. /// and computed statistics like entropy or trinucleotide context.
  34. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
  35. pub enum Annotation {
  36. /// Annotation from a specific variant caller and associated sample type.
  37. Callers(Caller, Sample),
  38. /// Categorization of the alteration (e.g., SNV, indel, etc.).
  39. AlterationCategory(AlterationCategory),
  40. /// Shannon entropy of the surrounding genomic region or read distribution.
  41. ShannonEntropy(f64),
  42. /// Depth of coverage in the constitutional (normal) sample.
  43. ConstitDepth(u16),
  44. /// Alternate allele count in the constitutional (normal) sample.
  45. ConstitAlt(u16),
  46. /// Flag indicating low depth in the constitutional sample.
  47. LowConstitDepth,
  48. /// Flag indicating high alternate allele count in the constitutional sample.
  49. HighConstitAlt,
  50. /// COSMIC database hit (cancer-associated mutation).
  51. Cosmic(Cosmic),
  52. /// GnomAD population frequency database annotation.
  53. GnomAD(GnomAD),
  54. /// Flag indicating low Shannon entropy (possibly less reliable region).
  55. LowEntropy,
  56. /// Trinucleotide context surrounding the variant.
  57. TriNucleotides([Base; 3]),
  58. /// Variant Effect Predictor (VEP) annotations.
  59. VEP(Vec<VEP>),
  60. /// Timing of replication for the variant's genomic position.
  61. ReplicationTiming(ReplicationClass),
  62. /// Variant in an High Depth position
  63. HighDepth,
  64. /// Variant in the given panel ranges
  65. Panel(String),
  66. /// Variant at a CpG
  67. CpG,
  68. /// Variant in a region of low alignement quality
  69. LowMAPQ,
  70. /// Variant in a variable number of tandem repeat locus
  71. VNTR,
  72. /// RepeatMasker
  73. Repeat,
  74. }
  75. /// Denotes the biological sample type associated with a variant call.
  76. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
  77. pub enum Sample {
  78. /// Tumor-only sample without matched normal.
  79. SoloTumor,
  80. /// Constitutional (normal) sample without matched tumor.
  81. SoloConstit,
  82. /// Variant observed in germline context.
  83. Germline,
  84. /// Variant identified as somatic (tumor-specific).
  85. Somatic,
  86. }
  87. /// A nucleotide base used for representing DNA sequence.
  88. ///
  89. /// Includes the four standard bases (A, T, C, G) and `N` for ambiguous or unknown bases.
  90. #[derive(Copy, Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
  91. pub enum Base {
  92. /// Adenine
  93. A,
  94. /// Thymine
  95. T,
  96. /// Cytosine
  97. C,
  98. /// Guanine
  99. G,
  100. /// Unknown or ambiguous nucleotide
  101. N,
  102. }
  103. impl fmt::Display for Base {
  104. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  105. write!(
  106. f,
  107. "{}",
  108. match self {
  109. Base::A => "A",
  110. Base::T => "T",
  111. Base::C => "C",
  112. Base::G => "G",
  113. Base::N => "N",
  114. }
  115. )
  116. }
  117. }
  118. /// Helper to convert a 3-base string into a [Base; 3] array.
  119. /// Returns `None` if any base is invalid.
  120. pub fn parse_trinuc(s: &str) -> [Base; 3] {
  121. fn char_to_base(c: char) -> Base {
  122. match c.to_ascii_uppercase() {
  123. 'A' => Base::A,
  124. 'T' => Base::T,
  125. 'C' => Base::C,
  126. 'G' => Base::G,
  127. _ => Base::N,
  128. }
  129. }
  130. let chars: Vec<Base> = s.chars().map(char_to_base).collect();
  131. [chars[0], chars[1], chars[2]]
  132. }
  133. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
  134. pub enum ReplicationClass {
  135. Early,
  136. Late,
  137. }
  138. impl fmt::Display for Sample {
  139. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  140. write!(
  141. f,
  142. "{}",
  143. match self {
  144. Sample::SoloTumor => "SoloTumor",
  145. Sample::SoloConstit => "SoloConstit",
  146. Sample::Germline => "Germline",
  147. Sample::Somatic => "Somatic",
  148. }
  149. )
  150. }
  151. }
  152. impl FromStr for Sample {
  153. type Err = anyhow::Error;
  154. fn from_str(s: &str) -> anyhow::Result<Self> {
  155. match s {
  156. "SoloTumor" => Ok(Sample::SoloTumor),
  157. "SoloConstit" => Ok(Sample::SoloConstit),
  158. "Germline" => Ok(Sample::Germline),
  159. "Somatic" => Ok(Sample::Somatic),
  160. _ => Err(anyhow::anyhow!("Can't parse Sample from {s}")),
  161. }
  162. }
  163. }
  164. /// Implements string formatting for `Annotation`, providing a human-readable summary of each variant.
  165. ///
  166. /// This is primarily used for debugging, logging, or text-based output of annotated variants.
  167. /// For most variants, a short descriptive label is used. Some variants include additional detail,
  168. /// such as base content or sample-specific information.
  169. impl fmt::Display for Annotation {
  170. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  171. use Annotation::*;
  172. let s = match self {
  173. Callers(caller, sample) => format!("{caller} {sample}"),
  174. AlterationCategory(alt_cat) => alt_cat.to_string(),
  175. ShannonEntropy(_) => "ShannonEntropy".into(),
  176. ConstitDepth(_) => "ConstitDepth".into(),
  177. ConstitAlt(_) => "ConstitAlt".into(),
  178. LowConstitDepth => "LowConstitDepth".into(),
  179. HighConstitAlt => "HighConstitAlt".into(),
  180. Cosmic(_) => "Cosmic".into(),
  181. GnomAD(_) => "GnomAD".into(),
  182. LowEntropy => "LowEntropy".into(),
  183. VEP(_) => "VEP".into(),
  184. CpG => "CpG".into(),
  185. VNTR => "VNTR".into(),
  186. Repeat => "Repeat".into(),
  187. TriNucleotides(bases) => format!(
  188. "Trinucleotides({})",
  189. bases.iter().map(|b| b.to_string()).collect::<String>(),
  190. ),
  191. ReplicationTiming(rt) => match rt {
  192. ReplicationClass::Early => "ReplicationEarly".into(),
  193. ReplicationClass::Late => "ReplicationLate".into(),
  194. },
  195. HighDepth => "HighDepth".into(),
  196. Panel(name) => format!("Panel_{name}"),
  197. LowMAPQ => "LowMAPQ".to_string(),
  198. };
  199. write!(f, "{}", s)
  200. }
  201. }
  202. impl FromStr for Annotation {
  203. type Err = anyhow::Error;
  204. fn from_str(s: &str) -> anyhow::Result<Self> {
  205. if s.contains(" ") {
  206. let (caller_str, sample_str) = s
  207. .split_once(" ")
  208. .ok_or_else(|| anyhow::anyhow!("Can't parse {s}"))?;
  209. Ok(Annotation::Callers(
  210. caller_str.parse()?,
  211. sample_str.parse()?,
  212. ))
  213. } else {
  214. match s {
  215. s if s.starts_with("ShannonEntropy") => Ok(Annotation::ShannonEntropy(0.0)),
  216. s if s.starts_with("ConstitDepth") => Ok(Annotation::ConstitDepth(0)),
  217. s if s.starts_with("ConstitAlt") => Ok(Annotation::ConstitAlt(0)),
  218. "LowConstitDepth" => Ok(Annotation::LowConstitDepth),
  219. "HighConstitAlt" => Ok(Annotation::HighConstitAlt),
  220. _ => Err(anyhow::anyhow!("Unknown Annotation: {}", s)),
  221. }
  222. }
  223. }
  224. }
  225. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
  226. pub enum Caller {
  227. DeepVariant,
  228. ClairS,
  229. NanomonSV,
  230. NanomonSVSolo,
  231. Savana,
  232. Severus,
  233. DeepSomatic,
  234. Mutect2,
  235. }
  236. impl fmt::Display for Caller {
  237. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  238. use Caller::*;
  239. match self {
  240. DeepVariant => write!(f, "DeepVariant"),
  241. ClairS => write!(f, "ClairS"),
  242. NanomonSV => write!(f, "NanomonSV"),
  243. NanomonSVSolo => write!(f, "NanomonSV-solo"),
  244. Savana => write!(f, "Savana"),
  245. Severus => write!(f, "Severus"),
  246. DeepSomatic => write!(f, "DeepSomatic"),
  247. Mutect2 => write!(f, "Mutect2"),
  248. }
  249. }
  250. }
  251. impl FromStr for Caller {
  252. type Err = anyhow::Error;
  253. fn from_str(s: &str) -> anyhow::Result<Self> {
  254. use Caller::*;
  255. match s {
  256. "DeepVariant" => Ok(DeepVariant),
  257. "ClairS" => Ok(ClairS),
  258. "NanomonSV" => Ok(NanomonSV),
  259. "NanomonSV-solo" => Ok(NanomonSVSolo),
  260. "Savana" => Ok(Savana),
  261. "Severus" => Ok(Severus),
  262. "DeepSomatic" => Ok(DeepSomatic),
  263. _ => Err(anyhow::anyhow!("Can't parse Caller {s}")),
  264. }
  265. }
  266. }
  267. #[derive(Debug, Clone, Serialize, Deserialize)]
  268. pub struct Annotations {
  269. pub store: DashMap<Hash128, Vec<Annotation>, Blake3BuildHasher>,
  270. }
  271. impl Default for Annotations {
  272. fn default() -> Self {
  273. Annotations {
  274. store: DashMap::with_hasher(Blake3BuildHasher),
  275. }
  276. }
  277. }
  278. #[derive(Debug, Default, Clone, Serialize, Deserialize)]
  279. pub struct AnnotationsStats {
  280. pub categorical: DashMap<String, u64>,
  281. pub numeric: DashMap<String, HashMap<String, Vec<f64>>>,
  282. }
  283. impl AnnotationsStats {
  284. pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> {
  285. let json = serde_json::to_string_pretty(self)?;
  286. let mut file = File::create(file_path)?;
  287. file.write_all(json.as_bytes())?;
  288. Ok(())
  289. }
  290. pub fn load_from_json(file_path: &str) -> anyhow::Result<Self> {
  291. let mut file = File::open(file_path)?;
  292. let mut contents = String::new();
  293. file.read_to_string(&mut contents)?;
  294. let stats: AnnotationsStats = serde_json::from_str(&contents)?;
  295. Ok(stats)
  296. }
  297. }
  298. #[allow(clippy::type_complexity)]
  299. impl Annotations {
  300. pub fn insert_update(&self, key: Hash128, add: &[Annotation]) {
  301. self.store
  302. .entry(key)
  303. .or_default()
  304. .extend(add.iter().cloned())
  305. }
  306. pub fn callers_stat(
  307. &self,
  308. annotations: Option<Box<dyn Fn(&Annotation) -> bool + Send + Sync>>,
  309. ) -> AnnotationsStats {
  310. use Annotation::*;
  311. let map: DashMap<String, u64> = DashMap::new();
  312. let num_maps: DashMap<String, HashMap<String, Vec<f64>>> = DashMap::new();
  313. self.store.par_iter().for_each(|e| {
  314. let anns = if let Some(sel) = &annotations {
  315. e.value().iter().filter(|a| sel(a)).cloned().collect()
  316. } else {
  317. e.value().clone()
  318. };
  319. let mut categorical = Vec::new();
  320. let mut numerical = Vec::new();
  321. for ann in anns.iter() {
  322. match ann {
  323. LowConstitDepth | LowEntropy | GnomAD(_) | VEP(_) | TriNucleotides(_)
  324. | ReplicationTiming(_) | HighDepth | CpG | VNTR | Repeat | Panel(_)
  325. | LowMAPQ | HighConstitAlt => categorical.push(ann.to_string()),
  326. Callers(caller, sample) => categorical.push(format!("{caller} {sample}")),
  327. ShannonEntropy(v) => numerical.push((ann.to_string(), *v)),
  328. ConstitDepth(v) | Annotation::ConstitAlt(v) => {
  329. numerical.push((ann.to_string(), *v as f64));
  330. }
  331. Cosmic(c) => numerical.push((ann.to_string(), c.cosmic_cnt as f64)),
  332. AlterationCategory(alt_cat) => categorical.push(alt_cat.to_string()),
  333. }
  334. }
  335. categorical.sort();
  336. categorical.dedup();
  337. let k = categorical.join(" + ");
  338. *map.entry(k.clone()).or_default() += 1;
  339. for (k_num, v_num) in numerical {
  340. num_maps
  341. .entry(k.clone())
  342. .or_default()
  343. .entry(k_num)
  344. .or_default()
  345. .push(v_num);
  346. }
  347. });
  348. println!("\nCallers stats:");
  349. println!("\tn categories: {}", map.len());
  350. let mut n = 0;
  351. let lines: Vec<String> = map
  352. .iter()
  353. .map(|e| {
  354. let k = e.key();
  355. let v = e.value();
  356. n += v;
  357. let mut num_str = Vec::new();
  358. if let Some(nums) = num_maps.get(k) {
  359. num_str.extend(
  360. nums.iter()
  361. .map(|(k_n, v_n)| format!("{k_n} {:.2}", mean(v_n))),
  362. )
  363. }
  364. num_str.sort();
  365. format!("\t- {k}\t{v}\t{}", num_str.join("\t"))
  366. })
  367. .collect();
  368. println!("{}", lines.join("\n"));
  369. println!("Total\t{n}\n");
  370. AnnotationsStats {
  371. categorical: map,
  372. numeric: num_maps,
  373. }
  374. }
  375. pub fn vep_stats(&self) -> anyhow::Result<VepStats> {
  376. let genes: DashMap<String, u32> = DashMap::new();
  377. let genes_distant: DashMap<String, u32> = DashMap::new();
  378. let features: DashMap<String, u32> = DashMap::new();
  379. let consequences: DashMap<String, u32> = DashMap::new();
  380. let impact: DashMap<String, u32> = DashMap::new();
  381. let n_all = Arc::new(AtomicU32::new(0));
  382. let n_vep = Arc::new(AtomicU32::new(0));
  383. self.store.par_iter().for_each(|e| {
  384. n_all.fetch_add(1, Ordering::SeqCst);
  385. let best_vep_opt = e.value().iter().find_map(|a| {
  386. if let Annotation::VEP(veps) = a {
  387. get_best_vep(veps).ok()
  388. } else {
  389. None
  390. }
  391. });
  392. if let Some(best_vep) = best_vep_opt {
  393. n_vep.fetch_add(1, Ordering::SeqCst);
  394. if let Some(gene) = best_vep.gene {
  395. if best_vep.extra.distance.is_some() {
  396. *genes_distant.entry(gene).or_default() += 1;
  397. } else {
  398. *genes.entry(gene).or_default() += 1;
  399. }
  400. }
  401. if let Some(feature) = best_vep.feature {
  402. *features.entry(feature).or_default() += 1;
  403. }
  404. if let Some(cs) = best_vep.consequence {
  405. let mut cs = cs.into_iter().map(String::from).collect::<Vec<String>>();
  406. cs.sort();
  407. *consequences.entry(cs.join(" ")).or_default() += 1;
  408. }
  409. if let Some(imp) = best_vep.extra.impact {
  410. *impact.entry(String::from(imp)).or_default() += 1;
  411. }
  412. }
  413. });
  414. let vep_stats = VepStats {
  415. genes: genes.into_iter().collect(),
  416. genes_distant: genes_distant.into_iter().collect(),
  417. features: features.into_iter().collect(),
  418. consequences: consequences.into_iter().collect(),
  419. impact: impact.into_iter().collect(),
  420. n_all: n_all.load(Ordering::SeqCst),
  421. n_vep: n_vep.load(Ordering::SeqCst),
  422. };
  423. Ok(vep_stats)
  424. }
  425. pub fn get_keys_filter(
  426. &self,
  427. filter: impl Fn(&Vec<Annotation>) -> bool + Send + Sync,
  428. ) -> Vec<Hash128> {
  429. self.store
  430. .par_iter()
  431. .filter(|entry| filter(entry.value()))
  432. .map(|entry| *entry.key())
  433. .collect()
  434. }
  435. /// Retain only the variants that pass a given filter across both the
  436. /// internal store and the provided collections.
  437. ///
  438. /// # Arguments
  439. ///
  440. /// * `variants` – Mutable reference to a list of `VariantCollection`s,
  441. /// each of which contains a set of called variants.
  442. /// * `filter` – Predicate applied to each entry of `self.store`
  443. /// (value = `Vec<Annotation>`). If it returns `true`, the entry and
  444. /// all variants with the same key are kept; otherwise they are removed.
  445. ///
  446. /// # Behavior
  447. ///
  448. /// 1. `self.store` is filtered in-place by applying `filter`.
  449. /// 2. The keys of surviving entries are collected into a `HashSet`.
  450. /// 3. Each `VariantCollection` in `variants` is pruned in parallel so
  451. /// that only variants whose `hash()` is in the key set remain.
  452. /// 4. Empty `VariantCollection`s are discarded.
  453. /// 5. Progress information (number of variants removed per caller) is
  454. /// logged via `info!`.
  455. ///
  456. /// # Returns
  457. ///
  458. /// The total number of variants removed across all collections.
  459. ///
  460. /// # Notes
  461. ///
  462. /// * Keys must be `Copy` or `Clone` to be inserted into the temporary
  463. /// `HashSet`.
  464. /// * Logging inside the parallel loop may interleave between threads,
  465. /// but totals remain correct.
  466. pub fn retain_variants(
  467. &mut self,
  468. variants: &mut Vec<VariantCollection>,
  469. filter: impl Fn(&Vec<Annotation>) -> bool + Send + Sync,
  470. ) -> usize {
  471. let mut keys = HashSet::new();
  472. self.store.retain(|key, value| {
  473. if filter(value) {
  474. keys.insert(*key);
  475. true
  476. } else {
  477. false
  478. }
  479. });
  480. info!("{} unique Variants to keep", keys.len());
  481. let n_removed: usize = variants
  482. .par_iter_mut()
  483. .map(|c| {
  484. let before = c.variants.len();
  485. c.variants.retain(|a| keys.contains(&a.hash));
  486. let after = c.variants.len();
  487. info!("\t- {}\t{}/{}", c.caller, before - after, before);
  488. before - after
  489. })
  490. .sum();
  491. variants.retain(|e| !e.variants.is_empty());
  492. info!("{n_removed} variants removed from collections.");
  493. n_removed
  494. }
  495. pub fn retain_keys(&mut self, keys_to_keep: &HashSet<Hash128>) {
  496. self.store.retain(|key, _| keys_to_keep.contains(key));
  497. }
  498. pub fn remove_keys(&mut self, keys_to_remove: &HashSet<Hash128>) {
  499. self.store.retain(|key, _| !keys_to_remove.contains(key));
  500. }
  501. /// Adds constitutional-boundary annotations to variants lacking somatic evidence.
  502. ///
  503. /// This method scans all stored variant entries and identifies those that:
  504. /// - have tumor-only support (`Sample::SoloTumor`), and
  505. /// - lack any somatic call (`Sample::Somatic`).
  506. ///
  507. /// For such variants, it conditionally appends boundary annotations:
  508. /// - [`Annotation::LowConstitDepth`] if any constitutional depth
  509. /// (`Annotation::ConstitDepth`) is below `min_constit_depth`;
  510. /// - [`Annotation::HighConstitAlt`] if any constitutional alternate allele count
  511. /// (`Annotation::ConstitAlt`) exceeds `max_alt_constit`.
  512. ///
  513. /// Each boundary annotation is added at most once per variant.
  514. pub fn somatic_constit_boundaries(&self, max_alt_constit: u16, min_constit_depth: u16) {
  515. for mut entry in self.store.iter_mut() {
  516. let anns = entry.value_mut();
  517. // tumor but no somatic
  518. // let has_tumor = anns
  519. // .iter()
  520. // .any(|a| matches!(a, Annotation::Callers(_, Sample::SoloTumor)));
  521. // let has_somatic = anns
  522. // .iter()
  523. // .any(|a| matches!(a, Annotation::Callers(_, Sample::Somatic)));
  524. // if has_tumor && !has_somatic {
  525. // push at most once
  526. if anns
  527. .iter()
  528. .any(|a| matches!(a, Annotation::ConstitDepth(d) if *d < min_constit_depth))
  529. {
  530. anns.push(Annotation::LowConstitDepth);
  531. }
  532. if anns
  533. .iter()
  534. .any(|a| matches!(a, Annotation::ConstitAlt(a0) if *a0 > max_alt_constit))
  535. {
  536. anns.push(Annotation::HighConstitAlt);
  537. }
  538. // }
  539. }
  540. }
  541. pub fn count_annotations(&self, annotation_types: Vec<Annotation>) -> Vec<usize> {
  542. let annotation_types = Arc::new(annotation_types);
  543. self.store
  544. .par_iter()
  545. .fold(
  546. || vec![0; annotation_types.len()],
  547. |mut counts, r| {
  548. let annotations = r.value();
  549. for (index, annotation_type) in annotation_types.iter().enumerate() {
  550. counts[index] +=
  551. annotations.iter().filter(|a| *a == annotation_type).count();
  552. }
  553. counts
  554. },
  555. )
  556. .reduce(
  557. || vec![0; annotation_types.len()],
  558. |mut a, b| {
  559. for i in 0..a.len() {
  560. a[i] += b[i];
  561. }
  562. a
  563. },
  564. )
  565. }
  566. /// Annotate low shannon ent for solo callers (not Somatic)
  567. pub fn low_shannon_entropy(&mut self, min_shannon_entropy: f64) {
  568. self.store.iter_mut().for_each(|mut e| {
  569. let anns = e.value_mut();
  570. if anns.iter().any(|ann| {
  571. matches!(ann, Annotation::ShannonEntropy(ent) if *ent < min_shannon_entropy)
  572. && !anns
  573. .iter()
  574. .any(|a| matches!(a, Annotation::Callers(_, Sample::Somatic)))
  575. }) && !anns.contains(&Annotation::LowEntropy)
  576. {
  577. anns.push(Annotation::LowEntropy);
  578. }
  579. });
  580. }
  581. }
  582. #[derive(Debug, Serialize, Deserialize)]
  583. pub struct VepStats {
  584. pub genes: Vec<(String, u32)>,
  585. pub genes_distant: Vec<(String, u32)>,
  586. pub features: Vec<(String, u32)>,
  587. pub consequences: Vec<(String, u32)>,
  588. pub impact: Vec<(String, u32)>,
  589. pub n_all: u32,
  590. pub n_vep: u32,
  591. }
  592. impl VepStats {
  593. pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> {
  594. let json = serde_json::to_string_pretty(self)?;
  595. let mut file = File::create(file_path)?;
  596. file.write_all(json.as_bytes())?;
  597. Ok(())
  598. }
  599. pub fn load_from_json(file_path: &str) -> anyhow::Result<Self> {
  600. let mut file = File::open(file_path)?;
  601. let mut contents = String::new();
  602. file.read_to_string(&mut contents)?;
  603. let stats: Self = serde_json::from_str(&contents)?;
  604. Ok(stats)
  605. }
  606. }
  607. pub trait CallerCat {
  608. fn caller_cat(&self) -> Annotation;
  609. }
  610. /// Determines whether a variant annotation meets either of the following criteria:
  611. ///
  612. /// 1. It is observed in gnomAD with a non-zero allele frequency *and* has
  613. /// at least one constitutive alternate read.
  614. /// 2. All variant calls are from `SoloTumor` samples and there is at least one
  615. /// constitutive alternate read.
  616. ///
  617. /// This function is typically used to identify variants that are either likely
  618. /// germline (based on population frequency and constitutive signal) or artifacts
  619. /// (appearing only in tumor calls but with constitutive support).
  620. ///
  621. /// # Arguments
  622. ///
  623. /// * `anns` - A slice of `Annotation` values describing variant annotations,
  624. /// such as gnomAD frequency, sample types, and alternate counts.
  625. ///
  626. /// # Returns
  627. ///
  628. /// * `true` if the variant matches either of the criteria described above.
  629. /// * `false` otherwise.
  630. ///
  631. /// # Example
  632. ///
  633. /// ```rust
  634. /// let anns = vec![
  635. /// Annotation::GnomAD(GnomAD { gnomad_af: 0.01 }),
  636. /// Annotation::ConstitAlt(2),
  637. /// ];
  638. /// assert!(is_gnomad_and_constit_alt(&anns));
  639. /// ```
  640. pub fn is_gnomad_and_constit_alt(anns: &[Annotation]) -> bool {
  641. let mut has_gnomad = false;
  642. let mut constit_alt_count = 0;
  643. let mut n_callers = 0;
  644. let mut n_tumor_callers = 0;
  645. for ann in anns {
  646. match ann {
  647. Annotation::GnomAD(g) if g.gnomad_af > 0.0 => has_gnomad = true,
  648. Annotation::ConstitAlt(n) => constit_alt_count += *n,
  649. Annotation::Callers(_, Sample::SoloTumor) => {
  650. n_callers += 1;
  651. n_tumor_callers += 1;
  652. }
  653. Annotation::Callers(..) => {
  654. n_callers += 1;
  655. }
  656. _ => {}
  657. }
  658. }
  659. let has_constit_alt = constit_alt_count > 0;
  660. let only_tumor_with_constit = n_callers > 0 && n_callers == n_tumor_callers && has_constit_alt;
  661. (has_gnomad && has_constit_alt) || only_tumor_with_constit
  662. }