somatic.rs 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845
  1. use crate::{
  2. annotation::is_gnomad_and_constit_alt, collection::ShouldRun, create_should_run_normal_tumoral, init_solo_callers_normal_tumoral, io::bed::read_bed, positions::GenomeRange, scan::scan::SomaticScan, variant::{
  3. variant::{run_if_required, ShouldRunBox},
  4. variants_stats::somatic_depth_quality_ranges,
  5. }
  6. };
  7. use itertools::Itertools;
  8. use log::info;
  9. use rayon::slice::ParallelSliceMut;
  10. use std::{
  11. collections::HashMap,
  12. fs::{self, File},
  13. io::Write,
  14. path::Path,
  15. };
  16. use crate::{
  17. annotation::{Annotation, Annotations, AnnotationsStats, Sample},
  18. callers::{
  19. clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV,
  20. savana::Savana, severus::Severus,
  21. },
  22. collection::{Initialize, InitializeSolo},
  23. config::Config,
  24. create_should_run, init_somatic_callers,
  25. runners::Run,
  26. variant::{
  27. variant::{load_variants, CallerBox},
  28. variant_collection::{ExternalAnnotation, VariantCollection, Variants},
  29. variants_stats::VariantsStats,
  30. },
  31. };
  32. /// Runs the full somatic variant calling pipeline for a single sample (`id`).
  33. ///
  34. /// This function orchestrates the entire somatic variant discovery process,
  35. /// starting from raw variant caller outputs (`PASSED VCF`) and applying multiple filtering
  36. /// and annotation steps to produce high-confidence somatic variants.
  37. ///
  38. /// This function orchestrates the end-to-end somatic variant discovery process:
  39. /// - Executes and verifies upstream components if necessary
  40. /// - Loads variants from multiple callers (tumor and normal samples)
  41. /// - Applies several annotation and filtering steps (e.g. depth, population frequency, entropy)
  42. /// - Tracks filtering statistics at each step
  43. /// - Outputs high-confidence somatic variants in both `.json.gz` and `.bit` formats
  44. ///
  45. /// ## Output Overview
  46. /// The final output includes:
  47. /// - `{tumoral_dir}/{id}_somatic_variants.json.gz`: annotated somatic variants
  48. /// - `{tumoral_dir}/{id}_somatic_variants.bit`: compact binary variant representation
  49. /// - `{stats_dir}/`: multiple intermediate JSON files with annotations and statistics
  50. ///
  51. /// ## Steps
  52. ///
  53. /// This pipeline performs the following high-level steps:
  54. ///
  55. /// ### 1. Output Existence Check
  56. /// If the final JSON result already exists and [`Config::somatic_pipe_force`] is not set,
  57. /// the pipeline aborts early to avoid overwriting results.
  58. ///
  59. /// ### 2. Initialization
  60. /// - Prepares statistics and output directories.
  61. /// - Initializes variant annotations.
  62. ///
  63. /// ### 3. Pre-requisite Tool Execution
  64. /// Runs any required upstream components (e.g., alignment, basecalling, variant callers) if
  65. /// their outputs are missing, using the [`run_if_required`] logic.
  66. ///
  67. /// ### 4. Load Variant Collections
  68. /// - Initializes the configured somatic variant callers and loads their output variants from PASSED VCF.
  69. /// - Also loads germline variants (from [`ClairS::germline`]) for comparative germline filtering.
  70. ///
  71. /// ### 5. Statistics Initialization
  72. /// - Initializes a [`SomaticPipeStats`] object to track the number of variants at each step.
  73. /// - Captures initial variant counts before filtering.
  74. /// - Aggregates variant counts from all sources and stores initial annotations for quality control and
  75. /// comparison before filtering.
  76. ///
  77. /// ### 6. Filter: Germline & Solo Constitutional Variants
  78. /// - Removes variants labeled as either Germline or Solo Constitutional, assuming they are unlikely to be somatic.
  79. /// - Records count of removed variants in [`SomaticPipeStats::n_constit_germline`].
  80. ///
  81. /// ### 7. Annotation: BAM Read Depth and Alt Allele Counts
  82. /// - Uses the constitutional BAM file to annotate each variant with read depth and the number of alternate reads observed.
  83. /// - Flags variants with low depth or excessive alt reads for filtering as specified in [`Config`].
  84. ///
  85. /// ### 8. Filtering: Low Depth / High Alt Alleles
  86. /// - Removes variants with low coverage in the constitutional sample, or excessive alt allele support (suggestive of germline origin).
  87. /// - Updates stats:
  88. /// - [`SomaticPipeStats::n_low_constit`]
  89. /// - [`SomaticPipeStats::n_high_alt_constit`]
  90. ///
  91. /// ### 9. Annotation: Sequence Entropy
  92. /// Adds Shannon entropy annotation based on the reference sequence context
  93. /// around each variant (cf. [`Config::entropy_seq_len`]) to flag low-complexity regions (often repetitive).
  94. ///
  95. /// ### 10. Annotation: External Databases (COSMIC, GnomAD):
  96. /// - Uses external resources to annotate variants with:
  97. /// - COSMIC hits (somatic mutation database)
  98. /// - GnomAD allele frequencies
  99. ///
  100. /// ### 11. Filtering: GnomAD + Alt Support in Constitutional Sample
  101. /// - Removes variants that are both present in GnomAD **and** show
  102. /// alternate allele support in the constitutional BAM.
  103. /// These are highly likely to be non-somatic germline polymorphisms.
  104. /// - Updates [`SomaticPipeStats::n_high_alt_constit_gnomad`] stat.
  105. ///
  106. /// ### 12. Filtering: Low Shannon Entropy:
  107. /// - Removes variants from low-complexity regions with entropy below the configured threshold
  108. /// (cf. [`Config::min_shannon_entropy`]).
  109. /// - Updates [`SomaticPipeStats::n_low_entropies`].
  110. ///
  111. /// ### 13. Annotation: VEP (Variant Effect Predictor)
  112. /// Adds transcript-level annotations from Ensembl VEP, providing functional consequences,
  113. /// impacted genes, and regulatory features.
  114. ///
  115. /// ### 14. Merging
  116. /// Merges variant collections into a unified [`Variants`] structure,
  117. /// preserving annotations and applying deduplication logic.
  118. ///
  119. /// ### 15. Final Statistics and Saving
  120. /// - Saves final annotation stats, VEP summaries, and variant-level statistics.
  121. /// - Exports the final somatic variant set to both compressed JSON and `.bit` formats.
  122. ///
  123. /// # Returns
  124. /// - `Ok(())` if all steps completed successfully.
  125. /// - `Err` if any tool fails, if file I/O fails, or if logical conditions are violated (e.g., pre-existing output).
  126. ///
  127. /// # Errors
  128. /// - Returns early if the output file already exists and [`Config::somatic_pipe_force`] is `false`.
  129. /// - Wraps all component-specific errors using `anyhow::Result` with context.
  130. ///
  131. /// # Side Effects
  132. /// - Runs external tools conditionally (e.g., [`ClairS`], [`DeepSomatic`]).
  133. /// - Creates intermediate directories and annotation JSONs for debugging/QC.
  134. /// - May consume significant compute time depending on the number of callers, annotations, and variants.
  135. ///
  136. /// # TODOs
  137. /// - Support compressed intermediate files (`// TODO: GZ !!!`)
  138. /// - Improve filtering metrics reporting (currently not all filtered variants are tracked in final stats).
  139. ///
  140. /// # Example Output Files
  141. /// - `tumoral_dir/sample123_somatic_variants.json.gz`
  142. /// - `tumoral_dir/sample123_somatic_variants.bit`
  143. /// - `stats_dir/sample123_annotations_*.json` (intermediate annotation snapshots)
  144. ///
  145. /// # See Also
  146. /// - [`Annotations`] – core structure for managing per-variant metadata
  147. /// - [`Variants`] – the merged final variant structure
  148. /// - [`SomaticPipeStats`] – used for tracking variant counts throughout filtering
  149. ///
  150. pub struct SomaticPipe {
  151. /// Unique identifier for the sample.
  152. pub id: String,
  153. /// Configuration parameters for the pipeline.
  154. pub config: Config,
  155. }
  156. impl Initialize for SomaticPipe {
  157. /// Initializes a new `Somatic` instance with default annotations.
  158. fn initialize(id: &str, config: crate::config::Config) -> anyhow::Result<Self> {
  159. let id = id.to_string();
  160. Ok(Self { id, config })
  161. }
  162. }
  163. impl ShouldRun for SomaticPipe {
  164. fn should_run(&self) -> bool {
  165. let tumoral_dir = self.config.tumoral_dir(&self.id);
  166. let path_str = format!("{}/{}_somatic_variants.bit", tumoral_dir, self.id);
  167. let path = Path::new(&path_str);
  168. !path.exists()
  169. }
  170. }
  171. impl Run for SomaticPipe {
  172. /// Executes the full somatic variant analysis pipeline.
  173. fn run(&mut self) -> anyhow::Result<()> {
  174. let config = self.config.clone();
  175. let id = self.id.clone();
  176. info!("Running somatic pipe for {id}.");
  177. // Define output paths for the final somatic variants
  178. let result_json = format!("{}/{id}_somatic_variants.json.gz", config.tumoral_dir(&id));
  179. let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id));
  180. let result_vcf = format!("{}/{id}_somatic_variants.vcf.gz", config.tumoral_dir(&id));
  181. if Path::new(&result_bit).exists() && !config.somatic_pipe_force {
  182. return Err(anyhow::anyhow!(
  183. "Somatic Pipe output already exists for {id}."
  184. ));
  185. }
  186. let mut annotations = Annotations::default();
  187. // Create stats directory if it doesn't exist
  188. let stats_dir = config.somatic_pipe_stats(&id);
  189. if !Path::new(&stats_dir).exists() {
  190. fs::create_dir(&stats_dir)?;
  191. }
  192. // Initialize and run any pre-required input if necessary
  193. info!("Initialization prerequired pipe inputs...");
  194. let mut to_run_if_req = create_should_run!(
  195. &id,
  196. &config,
  197. SomaticScan,
  198. ClairS,
  199. NanomonSV,
  200. Severus,
  201. Savana,
  202. DeepSomatic
  203. );
  204. to_run_if_req.extend(create_should_run_normal_tumoral!(&id, &config, DeepVariant));
  205. info!("Running prerequired pipe components.");
  206. run_if_required(&mut to_run_if_req).map_err(|e| {
  207. anyhow::anyhow!("Failed to run a prerequired component of somatic pipe for {id}.\n{e}")
  208. })?;
  209. // Initialize variant callers
  210. let mut callers = init_somatic_callers!(
  211. &id,
  212. &config,
  213. ClairS,
  214. NanomonSV,
  215. Severus,
  216. Savana,
  217. DeepSomatic
  218. );
  219. callers.extend(init_solo_callers_normal_tumoral!(&id, &config, DeepVariant,));
  220. // Load variants from each caller
  221. info!("Loading variants.");
  222. let mut variants_collections = load_variants(&mut callers, &annotations)
  223. .map_err(|e| anyhow::anyhow!("Error while loading variants\n{e}"))?;
  224. // Load germline variants using ClairS
  225. info!("Loading germline variants.");
  226. let clairs_germline =
  227. ClairS::initialize(&id, self.config.clone())?.germline(&annotations)?;
  228. variants_collections.push(clairs_germline);
  229. // Initialize statistics
  230. let mut somatic_stats = SomaticPipeStats::init(&variants_collections);
  231. info!(
  232. "Variants collections loaded from {} vcf (total of {} variants loaded)",
  233. variants_collections.len(),
  234. variants_collections
  235. .iter()
  236. .map(|v| v.variants.len())
  237. .sum::<usize>()
  238. );
  239. // Initial annotation stats (caller annotations only)
  240. let caller_cat_anns = |v: &Annotation| matches!(v, Annotation::Callers(_, _));
  241. let annot_init = annotations.callers_stat(Some(Box::new(caller_cat_anns)));
  242. somatic_stats.annot_init(
  243. &annot_init,
  244. &format!("{stats_dir}/{id}_germline_filter.json"),
  245. )?;
  246. annot_init.save_to_json(&format!("{stats_dir}/{id}_annotations_01.json"))?;
  247. // Filter out germline and solo constitutional variants
  248. info!(
  249. "Keeping somatic variants (variants neither in solo {} nor in germline).",
  250. config.normal_name
  251. );
  252. somatic_stats.n_constit_germline =
  253. annotations.retain_variants(&mut variants_collections, |anns| {
  254. !anns.iter().any(|ann| {
  255. matches!(
  256. ann,
  257. Annotation::Callers(_, Sample::Germline)
  258. | Annotation::Callers(_, Sample::SoloConstit)
  259. )
  260. })
  261. });
  262. annotations
  263. .callers_stat(Some(Box::new(caller_cat_anns)))
  264. .save_to_json(&format!(
  265. "{stats_dir}/{id}_annotations_02_post_germline.json"
  266. ))?;
  267. // MASK mapq
  268. let (mut high_depth_ranges, mut low_quality_ranges) =
  269. somatic_depth_quality_ranges(&id, &config)?;
  270. high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
  271. low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
  272. info!("ranges: {}", low_quality_ranges.len());
  273. variants_collections.iter_mut().for_each(|e | {
  274. let _ = e.annotate_with_ranges(&low_quality_ranges, &annotations, Annotation::LowMAPQ);
  275. });
  276. let n_masked_lowqual = annotations.retain_variants(&mut variants_collections, |anns| {
  277. anns.contains(&Annotation::LowMAPQ)
  278. });
  279. info!("N low mapq filtered: {}", n_masked_lowqual);
  280. // Annotate with depth and number of alternate reads from constitutional BAM
  281. info!("Reading Constit BAM file for depth and pileup annotation.");
  282. variants_collections.iter().try_for_each(|c| {
  283. c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150)
  284. })?;
  285. annotations.somatic_constit_boundaries(
  286. self.config.somatic_max_alt_constit,
  287. self.config.somatic_min_constit_depth,
  288. );
  289. annotations
  290. .callers_stat(Some(Box::new(|v| {
  291. matches!(
  292. v,
  293. Annotation::Callers(_, _)
  294. | Annotation::ConstitAlt(_)
  295. | Annotation::ConstitDepth(_)
  296. | Annotation::HighConstitAlt
  297. | Annotation::LowConstitDepth
  298. )
  299. })))
  300. .save_to_json(&format!("{stats_dir}/{id}_annotations_03_bam.json"))?;
  301. // Filter based on low constitutional depth
  302. info!(
  303. "Removing variants when depth in constit bam < {}.",
  304. self.config.somatic_min_constit_depth
  305. );
  306. somatic_stats.n_low_constit = annotations
  307. .retain_variants(&mut variants_collections, |anns| {
  308. !anns.contains(&Annotation::LowConstitDepth)
  309. });
  310. info!(
  311. "{} variants removed when depth in constit bam < {}.",
  312. somatic_stats.n_low_constit, self.config.somatic_min_constit_depth
  313. );
  314. // Filter: remove HighConstitAlt
  315. info!(
  316. "Removing variants with SNP/indel inside the constit alignements > {}",
  317. self.config.somatic_max_alt_constit
  318. );
  319. somatic_stats.n_high_alt_constit = annotations
  320. .retain_variants(&mut variants_collections, |anns| {
  321. !anns.contains(&Annotation::HighConstitAlt)
  322. });
  323. info!(
  324. "{} variants removed with SNP/indel inside the constit alignements > {}",
  325. somatic_stats.n_high_alt_constit, self.config.somatic_max_alt_constit
  326. );
  327. annotations
  328. .callers_stat(Some(Box::new(|v| {
  329. matches!(
  330. v,
  331. Annotation::Callers(_, _)
  332. | Annotation::ConstitAlt(_)
  333. | Annotation::ConstitDepth(_)
  334. )
  335. })))
  336. .save_to_json(&format!("{stats_dir}/{id}_annotations_04_bam_filter.json"))?;
  337. // Annotate variants with sequence entropy
  338. info!(
  339. "Entropy annotation from {} sequences.",
  340. self.config.reference
  341. );
  342. variants_collections.iter().for_each(|c| {
  343. c.annotate_with_sequence_context(
  344. &annotations,
  345. &self.config.reference,
  346. self.config.entropy_seq_len,
  347. self.config.somatic_pipe_threads,
  348. );
  349. });
  350. // Annotate with external databases like COSMIC and GnomAD
  351. info!("Annotation with external databases like COSMIC and GnomAD.");
  352. variants_collections
  353. .iter()
  354. .try_for_each(|c| -> anyhow::Result<()> {
  355. let ext_annot = ExternalAnnotation::init()?;
  356. ext_annot.annotate(&c.variants, &annotations)?;
  357. Ok(())
  358. })?;
  359. annotations
  360. .callers_stat(Some(Box::new(|v| {
  361. matches!(
  362. v,
  363. Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
  364. )
  365. })))
  366. .save_to_json(&format!("{stats_dir}/{id}_annotations_05_gnomad.json"))?;
  367. // Filter: Remove variants present in GnomAD and have alt reads in constitutional sample
  368. info!("Filtering out variants in GnomAD and in constit bam at low AF.");
  369. somatic_stats.n_high_alt_constit_gnomad = annotations
  370. .retain_variants(&mut variants_collections, |anns| {
  371. !is_gnomad_and_constit_alt(anns)
  372. });
  373. info!(
  374. "{} variants filtered, with constit alt <= max contig alt ({}) and in GnomAD.",
  375. somatic_stats.n_high_alt_constit_gnomad, self.config.somatic_max_alt_constit
  376. );
  377. // TODO: These stats doesn't capture filter metrics !!!
  378. annotations
  379. .callers_stat(Some(Box::new(|v| {
  380. matches!(
  381. v,
  382. Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
  383. )
  384. })))
  385. .save_to_json(&format!(
  386. "{stats_dir}/{id}_annotations_06_gnomad_filter.json"
  387. ))?;
  388. // Filter low entropy variants
  389. annotations.low_shannon_entropy(self.config.min_shannon_entropy);
  390. info!(
  391. "Filtering out variants with low entropies ({})",
  392. config.min_shannon_entropy
  393. );
  394. annotations
  395. .callers_stat(Some(Box::new(|v| {
  396. matches!(v, Annotation::Callers(_, _) | Annotation::LowEntropy)
  397. })))
  398. .save_to_json(&format!("{stats_dir}/{id}_annotations_07_entropy.json"))?;
  399. somatic_stats.n_low_entropies = annotations
  400. .retain_variants(&mut variants_collections, |anns| {
  401. !anns.contains(&Annotation::LowEntropy)
  402. });
  403. annotations
  404. .callers_stat(Some(Box::new(|v| matches!(v, Annotation::Callers(_, _)))))
  405. .save_to_json(&format!(
  406. "{stats_dir}/{id}_annotations_08_entropy_filter.json"
  407. ))?;
  408. // Final VEP annotation
  409. info!("Annotation with VEP.");
  410. variants_collections
  411. .iter()
  412. .try_for_each(|c| -> anyhow::Result<()> {
  413. let ext_annot = ExternalAnnotation::init()?;
  414. ext_annot.annotate_vep(&c.variants, &annotations)?;
  415. Ok(())
  416. })?;
  417. annotations
  418. .callers_stat(Some(Box::new(caller_cat_anns)))
  419. .save_to_json(&format!("{stats_dir}/{id}_annotations_09_vep.json"))?;
  420. annotations.vep_stats()?;
  421. // Merge all variants into a final collection
  422. let mut variants = variants_collections.into_iter().fold(
  423. Variants::default(),
  424. |mut acc, variants_collection| {
  425. acc.merge(variants_collection, &annotations);
  426. acc
  427. },
  428. );
  429. let vep_stats = annotations.vep_stats()?;
  430. vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?;
  431. VariantsStats::new(&mut variants, &id, &config, &high_depth_ranges)?
  432. .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?;
  433. info!("Final unique variants: {}", variants.data.len());
  434. // Ensure sorted (should already be sorted)
  435. variants.sort();
  436. let vntrs: Vec<GenomeRange> = read_bed("/data/ref/hs1/vntrs_chm13.bed")?.iter().map(|r| r.range.clone()).collect();
  437. variants.annotate_with_ranges(&vntrs, Some(Annotation::VNTR), 0, Vec::new());
  438. let repeat_masker: Vec<GenomeRange> = read_bed("/data/ref/hs1/chm13v2.0_RepeatMasker_4.1.2p1.2022Apr14.bed")?.iter().map(|r| r.range.clone()).collect();
  439. variants.annotate_with_ranges(&repeat_masker, Some(Annotation::RepeatMasker), 0, Vec::new());
  440. variants.save_to_json(&result_json)?;
  441. variants.save_to_file(&result_bit)?;
  442. variants.write_vcf(&result_vcf, &config.dict_file, config.somatic_pipe_force)?;
  443. Ok(())
  444. }
  445. }
  446. pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
  447. info!("Loading Germline");
  448. let annotations = Annotations::default();
  449. let clairs_germline = ClairS::initialize(&id, config.clone())?.germline(&annotations)?;
  450. info!("Annotation with Cosmic and GnomAD.");
  451. let ext_annot = ExternalAnnotation::init()?;
  452. ext_annot.annotate(&clairs_germline.variants, &annotations)?;
  453. let mut variants = Variants::default();
  454. variants.merge(clairs_germline, &annotations);
  455. info!("-- {}", variants.data.len());
  456. // let u = VariantsStats::new(&mut variants, &id, &config)?;
  457. // info!("++ {}", u.n);
  458. Ok(())
  459. }
  460. /// Holds statistical data for somatic variant pipeline processing,
  461. /// including summary counts and input categorization.
  462. #[derive(Debug, Default, Clone)]
  463. pub struct SomaticPipeStats {
  464. /// Summary of input variant collections grouped by sample type.
  465. pub input: InputStats,
  466. /// Number of variants labeled as both constitutional and germline.
  467. pub n_constit_germline: usize,
  468. /// Number of variants in constitutional samples with low allele frequency.
  469. pub n_low_constit: usize,
  470. /// Number of variants in constitutional samples with high alternative allele count.
  471. pub n_high_alt_constit: usize,
  472. /// Number of high-alt constitutional variants that are also found in gnomAD.
  473. pub n_high_alt_constit_gnomad: usize,
  474. /// Number of variants filtered due to low entropy (indicative of low complexity regions).
  475. pub n_low_entropies: usize,
  476. }
  477. impl SomaticPipeStats {
  478. /// Initializes a `SomaticPipeStats` object with populated `InputStats` based on the
  479. /// provided `VariantCollection`s.
  480. pub fn init(collections: &[VariantCollection]) -> Self {
  481. Self {
  482. input: InputStats::from_collections(collections),
  483. ..Default::default()
  484. }
  485. }
  486. /// Generates a tumor-vs-germline annotation matrix and writes it to a JSON file.
  487. ///
  488. /// This method analyzes co-occurrence patterns between tumor and germline/constit
  489. /// variant callers by iterating through annotation statistics from `AnnotationsStats`.
  490. /// It builds a matrix where each row corresponds to a **tumor caller** and each column
  491. /// corresponds to a **germline or constitutional caller**. Each cell contains the number
  492. /// of variant calls that were annotated by both the tumor and the germline/constit caller.
  493. ///
  494. /// In addition to writing a structured JSON file to the specified path, the function also
  495. /// prints a tab-separated human-readable matrix to stdout for convenience.
  496. ///
  497. /// # Parameters
  498. ///
  499. /// - `stats`: A reference to [`AnnotationsStats`] containing categorical annotations.
  500. /// This object holds a frequency map where keys are combinations of `Annotation`
  501. /// values (serialized as strings) and values are occurrence counts.
  502. /// - `json_path`: The path where the resulting JSON output file should be written.
  503. ///
  504. /// # Output Formats
  505. ///
  506. /// ## JSON Output (`json_path`)
  507. ///
  508. /// The output JSON is an **array of tumor caller records**, each containing:
  509. /// - `caller_name`: The name of the tumor caller (as a string).
  510. /// - `germline`: An array of JSON objects, each with one key-value pair,
  511. /// where the key is a germline/constit caller (or combination) and the value is the count.
  512. ///
  513. /// ### Example
  514. /// ```json
  515. /// [
  516. /// {
  517. /// "caller_name": "DeepVariant SoloTumor",
  518. /// "germline": [
  519. /// {"ClairS Germline": 99978},
  520. /// {"ClairS Germline + DeepVariant SoloConstit": 4710570}
  521. /// ]
  522. /// },
  523. /// {
  524. /// "caller_name": "ClairS Somatic",
  525. /// "germline": [
  526. /// {"ClairS Germline": 944},
  527. /// {"ClairS Germline + DeepVariant SoloConstit": 362}
  528. /// ]
  529. /// }
  530. /// ]
  531. /// ```
  532. ///
  533. /// ## Console Output (TSV)
  534. ///
  535. /// A tab-separated matrix is printed to stdout. Each row begins with a tumor caller,
  536. /// followed by columns showing germline/constit caller combinations and their counts.
  537. ///
  538. /// # Notes
  539. /// - Tumor callers are collected from `self.input.somatic` and `self.input.solo_tumor`.
  540. /// - Germline/constit callers are collected from `self.input.germline` and `self.input.solo_constit`.
  541. /// - Keys in the original `AnnotationsStats` map are split using `" + "` and parsed into `Annotation` enums.
  542. /// - Germline keys are sorted and joined to form canonical column labels.
  543. /// - Tumor and germline annotations are matched by exact `Annotation` equality.
  544. ///
  545. /// # Errors
  546. /// Returns an error if:
  547. /// - Any annotation key in `AnnotationsStats` fails to parse into a valid `Annotation`.
  548. /// - The JSON output file cannot be created or written to.
  549. ///
  550. /// # Dependencies
  551. /// Requires that `Annotation` implements `FromStr`, `ToString`, `PartialEq`, and `Clone`.
  552. ///
  553. /// # Example Usage
  554. /// ```rust
  555. /// let mut stats = SomaticPipeStats::init(&collections);
  556. /// stats.annot_init(&annotation_stats, "output/matrix.json")?;
  557. /// ```
  558. pub fn annot_init(&self, stats: &AnnotationsStats, json_path: &str) -> anyhow::Result<()> {
  559. // Parse annotations from stats
  560. let stats: Vec<(Vec<Annotation>, u64)> = stats
  561. .categorical
  562. .iter()
  563. .map(|e| {
  564. let anns = e
  565. .key()
  566. .split(" + ")
  567. .map(|k| k.parse())
  568. .collect::<anyhow::Result<Vec<Annotation>>>()
  569. .map_err(|err| {
  570. anyhow::anyhow!("Error while splitting key in AnnotationsStats.\n{err}")
  571. })?;
  572. Ok((anns, *e.value()))
  573. })
  574. .collect::<anyhow::Result<Vec<(Vec<Annotation>, u64)>>>()?;
  575. // Collect tumor and germline callers from input stats
  576. let callers_somatic_solo_tumor = [
  577. self.input
  578. .somatic
  579. .iter()
  580. .map(|(caller, _)| caller.clone())
  581. .collect::<Vec<Annotation>>(),
  582. self.input
  583. .solo_tumor
  584. .iter()
  585. .map(|(caller, _)| caller.clone())
  586. .collect(),
  587. ]
  588. .concat();
  589. let callers_germline_solo_constit = [
  590. self.input
  591. .germline
  592. .iter()
  593. .map(|(caller, _)| caller.clone())
  594. .collect::<Vec<Annotation>>(),
  595. self.input
  596. .solo_constit
  597. .iter()
  598. .map(|(caller, _)| caller.clone())
  599. .collect(),
  600. ]
  601. .concat();
  602. // Build a matrix of tumor vs germline hits
  603. let mut with_germline: HashMap<String, HashMap<String, u64>> = HashMap::new();
  604. stats.iter().for_each(|(anns, v)| {
  605. // Only proceed if this annotation includes a germline/constit sample
  606. if anns.iter().any(|a| {
  607. matches!(
  608. a,
  609. Annotation::Callers(_, Sample::SoloConstit)
  610. | Annotation::Callers(_, Sample::Germline)
  611. )
  612. }) {
  613. // Find all tumor callers present in this annotation set
  614. let n_by_tumor: Vec<(String, u64)> = callers_somatic_solo_tumor
  615. .iter()
  616. .flat_map(|tumor| {
  617. if anns.contains(tumor) {
  618. vec![(tumor.to_string(), *v)]
  619. } else {
  620. vec![]
  621. }
  622. })
  623. .collect();
  624. // Build a normalized germline key
  625. let mut germline_caller: Vec<String> = callers_germline_solo_constit
  626. .iter()
  627. .flat_map(|germ| {
  628. if anns.contains(germ) {
  629. vec![germ.to_string()]
  630. } else {
  631. vec![]
  632. }
  633. })
  634. .collect();
  635. germline_caller.sort();
  636. let germline_caller = germline_caller.join(" + ");
  637. // Update matrix: tumor -> germline -> count
  638. n_by_tumor.iter().for_each(|(tumoral_caller, n)| {
  639. if let Some(row) = with_germline.get_mut(tumoral_caller) {
  640. if let Some(col) = row.get_mut(&germline_caller) {
  641. *col += *n;
  642. } else {
  643. row.insert(germline_caller.to_string(), *n);
  644. }
  645. } else {
  646. let mut row = HashMap::new();
  647. row.insert(germline_caller.to_string(), *n);
  648. with_germline.insert(tumoral_caller.to_string(), row);
  649. }
  650. });
  651. }
  652. });
  653. // Extract all unique germline caller labels
  654. let mut germlines_callers: Vec<String> = with_germline
  655. .iter()
  656. .flat_map(|(_, r)| {
  657. r.iter()
  658. .map(|(k, _)| k.to_string())
  659. .collect::<Vec<String>>()
  660. })
  661. .collect();
  662. germlines_callers.sort();
  663. germlines_callers.dedup();
  664. // Print a readable tab-separated matrix
  665. let mut json = Vec::new();
  666. let mut lines: Vec<String> = with_germline
  667. .iter()
  668. .map(|(tumor, row)| {
  669. json.push(format!(
  670. "{{\"caller_name\": \"{tumor}\", \"germline\": [{}] }}",
  671. germlines_callers
  672. .iter()
  673. .map(|g| {
  674. let v = row.get(g).unwrap_or(&0);
  675. format!("{{\"{g}\": {v}}}")
  676. })
  677. .join(", ")
  678. ));
  679. format!(
  680. "{tumor}\t{}",
  681. germlines_callers
  682. .iter()
  683. .map(|g| {
  684. let v = row.get(g).unwrap_or(&0);
  685. format!("{g}: {v}")
  686. })
  687. .join("\t")
  688. )
  689. })
  690. .collect();
  691. lines.sort();
  692. println!("{}", lines.join("\n"));
  693. // Write JSON to file
  694. let json = format!("[{}]", json.join(", "));
  695. let mut file = File::create(json_path)?;
  696. file.write_all(json.as_bytes())?;
  697. Ok(())
  698. }
  699. }
  700. /// Aggregated statistics on variant collections, grouped by sample type.
  701. ///
  702. /// The `InputStats` struct stores the number of variants observed for each
  703. /// caller, categorized by biological sample type:
  704. ///
  705. /// - `solo_tumor`: Variants from tumor-only samples.
  706. /// - `solo_constit`: Variants from normal-only (constitutional) samples.
  707. /// - `germline`: Variants classified as germline (shared between normal and tumor).
  708. /// - `somatic`: Variants classified as somatic (tumor-specific).
  709. ///
  710. /// Each vector contains tuples of the form `(Annotation, usize)`, where:
  711. /// - `Annotation` identifies the caller and sample type.
  712. /// - `usize` represents the number of variants for that annotation.
  713. #[derive(Debug, Default, Clone)]
  714. pub struct InputStats {
  715. /// Variants from tumor-only samples.
  716. pub solo_tumor: Vec<(Annotation, usize)>,
  717. /// Variants from normal-only (constitutional) samples.
  718. pub solo_constit: Vec<(Annotation, usize)>,
  719. /// Germline variants (present in both normal and tumor).
  720. pub germline: Vec<(Annotation, usize)>,
  721. /// Somatic variants (specific to tumor).
  722. pub somatic: Vec<(Annotation, usize)>,
  723. }
  724. impl InputStats {
  725. /// Constructs an `InputStats` instance by aggregating variant counts
  726. /// from a list of [`VariantCollection`]s.
  727. ///
  728. /// Each `VariantCollection` is inspected to determine its sample type,
  729. /// as indicated by the `Annotation::Callers(_, Sample)` variant. The
  730. /// number of variants (`collection.variants.len()`) is recorded in the
  731. /// appropriate field of `InputStats`, grouped by `Sample`.
  732. ///
  733. /// # Parameters
  734. ///
  735. /// - `collections`: A slice of [`VariantCollection`] objects containing
  736. /// variants annotated with a caller and sample type.
  737. ///
  738. /// # Returns
  739. ///
  740. /// A populated `InputStats` instance with counts per (caller, sample type).
  741. ///
  742. /// # Examples
  743. ///
  744. /// ```
  745. /// let collections: Vec<VariantCollection> = load_collections();
  746. /// let stats = InputStats::from_collections(&collections);
  747. /// println!("Somatic calls: {:?}", stats.somatic);
  748. /// ```
  749. pub fn from_collections(collections: &[VariantCollection]) -> Self {
  750. let mut stats = Self::default();
  751. for collection in collections.iter() {
  752. match collection.caller {
  753. Annotation::Callers(_, Sample::SoloTumor) => stats
  754. .solo_tumor
  755. .push((collection.caller.clone(), collection.variants.len())),
  756. Annotation::Callers(_, Sample::SoloConstit) => stats
  757. .solo_constit
  758. .push((collection.caller.clone(), collection.variants.len())),
  759. Annotation::Callers(_, Sample::Germline) => stats
  760. .germline
  761. .push((collection.caller.clone(), collection.variants.len())),
  762. Annotation::Callers(_, Sample::Somatic) => stats
  763. .somatic
  764. .push((collection.caller.clone(), collection.variants.len())),
  765. _ => (),
  766. };
  767. }
  768. stats
  769. }
  770. }