somatic.rs 35 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900
  1. use crate::{
  2. annotation::is_gnomad_and_constit_alt,
  3. create_should_run_normal_tumoral, init_solo_callers_normal_tumoral,
  4. io::bed::read_bed,
  5. pipes::{Initialize, InitializeSolo, ShouldRun},
  6. positions::{GenomeRange, GenomeRangeExt},
  7. scan::scan::SomaticScan,
  8. variant::{
  9. variants_stats::somatic_depth_quality_ranges,
  10. vcf_variant::{run_if_required, ShouldRunBox},
  11. },
  12. };
  13. use log::info;
  14. use rayon::slice::ParallelSliceMut;
  15. use serde::Serialize;
  16. use std::{
  17. collections::BTreeMap,
  18. fs::{self, File},
  19. path::{Path, PathBuf},
  20. };
  21. use crate::{
  22. annotation::{Annotation, Annotations, AnnotationsStats, Sample},
  23. callers::{
  24. clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV,
  25. savana::Savana, severus::Severus,
  26. },
  27. config::Config,
  28. create_should_run, init_somatic_callers,
  29. runners::Run,
  30. variant::{
  31. variant_collection::{ExternalAnnotation, VariantCollection, Variants},
  32. variants_stats::VariantsStats,
  33. vcf_variant::{load_variants, CallerBox},
  34. },
  35. };
  36. /// Runs the full somatic variant calling pipeline for a single sample (`id`).
  37. ///
  38. /// This function orchestrates the entire somatic variant discovery process,
  39. /// starting from raw variant caller outputs (`PASSED VCF`) and applying multiple filtering
  40. /// and annotation steps to produce high-confidence somatic variants.
  41. ///
  42. /// This function orchestrates the end-to-end somatic variant discovery process:
  43. /// - Executes and verifies upstream components if necessary
  44. /// - Loads variants from multiple callers (tumor and normal samples)
  45. /// - Applies several annotation and filtering steps (e.g. depth, population frequency, entropy)
  46. /// - Tracks filtering statistics at each step
  47. /// - Outputs high-confidence somatic variants in both `.json.gz` and `.bit` formats
  48. ///
  49. /// ## Output Overview
  50. /// The final output includes:
  51. /// - `{tumoral_dir}/{id}_somatic_variants.json.gz`: annotated somatic variants
  52. /// - `{tumoral_dir}/{id}_somatic_variants.bit`: compact binary variant representation
  53. /// - `{stats_dir}/`: multiple intermediate JSON files with annotations and statistics
  54. ///
  55. /// ## Steps
  56. ///
  57. /// This pipeline performs the following high-level steps:
  58. ///
  59. /// ### 1. Output Existence Check
  60. /// If the final JSON result already exists and [`Config::somatic_pipe_force`] is not set,
  61. /// the pipeline aborts early to avoid overwriting results.
  62. ///
  63. /// ### 2. Initialization
  64. /// - Prepares statistics and output directories.
  65. /// - Initializes variant annotations.
  66. ///
  67. /// ### 3. Pre-requisite Tool Execution
  68. /// Runs any required upstream components (e.g., alignment, basecalling, variant callers) if
  69. /// their outputs are missing, using the [`run_if_required`] logic.
  70. ///
  71. /// ### 4. Load Variant Collections
  72. /// - Initializes the configured somatic variant callers and loads their output variants from PASSED VCF.
  73. /// - Also loads germline variants (from [`ClairS::germline`]) for comparative germline filtering.
  74. ///
  75. /// ### 5. Statistics Initialization
  76. /// - Initializes a [`SomaticPipeStats`] object to track the number of variants at each step.
  77. /// - Captures initial variant counts before filtering.
  78. /// - Aggregates variant counts from all sources and stores initial annotations for quality control and
  79. /// comparison before filtering.
  80. ///
  81. /// ### 6. Filter: Germline & Solo Constitutional Variants
  82. /// - Removes variants labeled as either Germline or Solo Constitutional, assuming they are unlikely to be somatic.
  83. /// - Records count of removed variants in [`SomaticPipeStats::n_constit_germline`].
  84. ///
  85. /// ### 7. Annotation: BAM Read Depth and Alt Allele Counts
  86. /// - Uses the constitutional BAM file to annotate each variant with read depth and the number of alternate reads observed.
  87. /// - Flags variants with low depth or excessive alt reads for filtering as specified in [`Config`].
  88. ///
  89. /// ### 8. Filtering: Low Depth / High Alt Alleles
  90. /// - Removes variants with low coverage in the constitutional sample, or excessive alt allele support (suggestive of germline origin).
  91. /// - Updates stats:
  92. /// - [`SomaticPipeStats::n_low_constit`]
  93. /// - [`SomaticPipeStats::n_high_alt_constit`]
  94. ///
  95. /// ### 9. Annotation: Sequence Entropy
  96. /// Adds Shannon entropy annotation based on the reference sequence context
  97. /// around each variant (cf. [`Config::entropy_seq_len`]) to flag low-complexity regions (often repetitive).
  98. ///
  99. /// ### 10. Annotation: External Databases (COSMIC, GnomAD):
  100. /// - Uses external resources to annotate variants with:
  101. /// - COSMIC hits (somatic mutation database)
  102. /// - GnomAD allele frequencies
  103. ///
  104. /// ### 11. Filtering: GnomAD + Alt Support in Constitutional Sample
  105. /// - Removes variants that are both present in GnomAD **and** show
  106. /// alternate allele support in the constitutional BAM.
  107. /// These are highly likely to be non-somatic germline polymorphisms.
  108. /// - Updates [`SomaticPipeStats::n_high_alt_constit_gnomad`] stat.
  109. ///
  110. /// ### 12. Filtering: Low Shannon Entropy:
  111. /// - Removes variants from low-complexity regions with entropy below the configured threshold
  112. /// (cf. [`Config::min_shannon_entropy`]).
  113. /// - Updates [`SomaticPipeStats::n_low_entropies`].
  114. ///
  115. /// ### 13. Annotation: VEP (Variant Effect Predictor)
  116. /// Adds transcript-level annotations from Ensembl VEP, providing functional consequences,
  117. /// impacted genes, and regulatory features.
  118. ///
  119. /// ### 14. Merging
  120. /// Merges variant collections into a unified [`Variants`] structure,
  121. /// preserving annotations and applying deduplication logic.
  122. ///
  123. /// ### 15. Final Statistics and Saving
  124. /// - Saves final annotation stats, VEP summaries, and variant-level statistics.
  125. /// - Exports the final somatic variant set to both compressed JSON and `.bit` formats.
  126. ///
  127. /// # Returns
  128. /// - `Ok(())` if all steps completed successfully.
  129. /// - `Err` if any tool fails, if file I/O fails, or if logical conditions are violated (e.g., pre-existing output).
  130. ///
  131. /// # Errors
  132. /// - Returns early if the output file already exists and [`Config::somatic_pipe_force`] is `false`.
  133. /// - Wraps all component-specific errors using `anyhow::Result` with context.
  134. ///
  135. /// # Side Effects
  136. /// - Runs external tools conditionally (e.g., [`ClairS`], [`DeepSomatic`]).
  137. /// - Creates intermediate directories and annotation JSONs for debugging/QC.
  138. /// - May consume significant compute time depending on the number of callers, annotations, and variants.
  139. ///
  140. /// # TODOs
  141. /// - Support compressed intermediate files (`// TODO: GZ !!!`)
  142. /// - Improve filtering metrics reporting (currently not all filtered variants are tracked in final stats).
  143. ///
  144. /// # Example Output Files
  145. /// - `tumoral_dir/sample123_somatic_variants.json.gz`
  146. /// - `tumoral_dir/sample123_somatic_variants.bit`
  147. /// - `stats_dir/sample123_annotations_*.json` (intermediate annotation snapshots)
  148. ///
  149. /// # See Also
  150. /// - [`Annotations`] – core structure for managing per-variant metadata
  151. /// - [`Variants`] – the merged final variant structure
  152. /// - [`SomaticPipeStats`] – used for tracking variant counts throughout filtering
  153. ///
  154. pub struct SomaticPipe {
  155. /// Unique identifier for the sample.
  156. pub id: String,
  157. /// Configuration parameters for the pipeline.
  158. pub config: Config,
  159. }
  160. impl Initialize for SomaticPipe {
  161. /// Initializes a new `Somatic` instance with default annotations.
  162. fn initialize(id: &str, config: &crate::config::Config) -> anyhow::Result<Self> {
  163. let id = id.to_string();
  164. Ok(Self {
  165. id,
  166. config: config.clone(),
  167. })
  168. }
  169. }
  170. impl ShouldRun for SomaticPipe {
  171. fn should_run(&self) -> bool {
  172. // path to the existing “.bit” file
  173. let tumoral_dir = self.config.tumoral_dir(&self.id);
  174. let bit_path = Path::new(&tumoral_dir).join(format!("{}_somatic_variants.bit", self.id));
  175. // if we can’t read its modification time, re-run
  176. let res_meta = match fs::metadata(&bit_path).and_then(|m| m.modified()) {
  177. Ok(ts) => ts,
  178. Err(_) => return true,
  179. };
  180. let deps: [PathBuf; 5] = [
  181. Path::new(&self.config.normal_bam(&self.id)).to_path_buf(),
  182. Path::new(&self.config.tumoral_bam(&self.id)).to_path_buf(),
  183. Path::new(&self.config.reference).to_path_buf(),
  184. Path::new(&self.config.vntrs_bed).to_path_buf(),
  185. Path::new(&self.config.repeats_bed).to_path_buf(),
  186. ];
  187. deps.iter().any(|p| {
  188. fs::metadata(p)
  189. .and_then(|m| m.modified())
  190. .map_or(true, |ts| ts > res_meta)
  191. })
  192. }
  193. }
  194. impl Run for SomaticPipe {
  195. /// Executes the full somatic variant analysis pipeline.
  196. fn run(&mut self) -> anyhow::Result<()> {
  197. let config = self.config.clone();
  198. let id = self.id.clone();
  199. info!("Running somatic pipe for {id}.");
  200. // Define output paths for the final somatic variants
  201. let result_json = format!("{}/{id}_somatic_variants.json.gz", config.tumoral_dir(&id));
  202. let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id));
  203. let result_vcf = format!("{}/{id}_somatic_variants.vcf.gz", config.tumoral_dir(&id));
  204. let outputs = [&result_json, &result_bit, &result_vcf];
  205. if !config.somatic_pipe_force && outputs.iter().any(|p| Path::new(p).exists()) {
  206. return Err(anyhow::anyhow!(
  207. "Somatic Pipe output already exists for {id}."
  208. ));
  209. }
  210. let mut annotations = Annotations::default();
  211. // Create stats directory if it doesn't exist
  212. let stats_dir = config.somatic_pipe_stats(&id);
  213. if !Path::new(&stats_dir).exists() {
  214. fs::create_dir(&stats_dir)?;
  215. }
  216. // Initialize and run any pre-required input if necessary
  217. info!("Initialization prerequired pipe inputs...");
  218. let mut to_run_if_req = create_should_run!(
  219. &id,
  220. &config,
  221. SomaticScan,
  222. ClairS,
  223. NanomonSV,
  224. Severus,
  225. Savana,
  226. DeepSomatic
  227. );
  228. to_run_if_req.extend(create_should_run_normal_tumoral!(&id, &config, DeepVariant));
  229. info!("Running prerequired pipe components.");
  230. run_if_required(&mut to_run_if_req).map_err(|e| {
  231. anyhow::anyhow!("Failed to run a prerequired component of somatic pipe for {id}.\n{e}")
  232. })?;
  233. // Initialize variant callers
  234. let mut callers = init_somatic_callers!(
  235. &id,
  236. &config,
  237. ClairS,
  238. NanomonSV,
  239. Severus,
  240. Savana,
  241. DeepSomatic
  242. );
  243. callers.extend(init_solo_callers_normal_tumoral!(&id, &config, DeepVariant,));
  244. // Load variants from each caller
  245. info!("Loading variants.");
  246. let mut variants_collections = load_variants(&mut callers, &annotations)
  247. .map_err(|e| anyhow::anyhow!("Error while loading variants\n{e}"))?;
  248. // Load germline variants using ClairS
  249. info!("Loading germline variants.");
  250. let clairs_germline =
  251. ClairS::initialize(&id, &self.config.clone())?.germline(&annotations)?;
  252. variants_collections.push(clairs_germline);
  253. // Initialize statistics
  254. let mut somatic_stats = SomaticPipeStats::init(&variants_collections);
  255. info!(
  256. "Variants collections loaded from {} vcf (total of {} variants loaded)",
  257. variants_collections.len(),
  258. variants_collections
  259. .iter()
  260. .map(|v| v.variants.len())
  261. .sum::<usize>()
  262. );
  263. // Initial annotation stats (caller annotations only)
  264. let caller_cat_anns = |v: &Annotation| matches!(v, Annotation::Callers(_, _));
  265. let annot_init = annotations.callers_stat(Some(Box::new(caller_cat_anns)));
  266. somatic_stats.annot_init(
  267. &annot_init,
  268. &format!("{stats_dir}/{id}_germline_filter.json"),
  269. )?;
  270. annot_init.save_to_json(&format!("{stats_dir}/{id}_annotations_01.json"))?;
  271. // Filter out germline and solo constitutional variants
  272. info!(
  273. "Keeping somatic variants (variants neither in solo normal {} nor in germline).",
  274. config.normal_name
  275. );
  276. somatic_stats.n_constit_germline =
  277. annotations.retain_variants(&mut variants_collections, |anns| {
  278. !anns.iter().any(|ann| {
  279. matches!(
  280. ann,
  281. Annotation::Callers(_, Sample::Germline)
  282. | Annotation::Callers(_, Sample::SoloConstit)
  283. )
  284. })
  285. });
  286. annotations
  287. .callers_stat(Some(Box::new(caller_cat_anns)))
  288. .save_to_json(&format!(
  289. "{stats_dir}/{id}_annotations_02_post_germline.json"
  290. ))?;
  291. // Remove deletions stretch
  292. // info!("Removing deletions stretchs:");
  293. // variants_collections.iter_mut().for_each(|coll| {
  294. // let rm = coll.remove_strech();
  295. // info!("\t - {} {}: {}", coll.vcf.caller, coll.vcf.time, rm);
  296. // });
  297. // MASK mapq
  298. let (mut high_depth_ranges, mut low_quality_ranges) =
  299. somatic_depth_quality_ranges(&id, &config)?;
  300. high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
  301. low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
  302. info!(
  303. "High-depth ranges: n={} bp={}",
  304. high_depth_ranges.len(),
  305. high_depth_ranges.total_len(),
  306. );
  307. info!(
  308. "LowQ ranges: n={} bp={}",
  309. low_quality_ranges.len(),
  310. low_quality_ranges.total_len(),
  311. );
  312. variants_collections.iter_mut().for_each(|e| {
  313. let _ = e.annotate_with_ranges(&low_quality_ranges, &annotations, Annotation::LowMAPQ);
  314. });
  315. let n_masked_lowqual = annotations.retain_variants(&mut variants_collections, |anns| {
  316. !anns.contains(&Annotation::LowMAPQ)
  317. });
  318. info!("N low mapq filtered: {}", n_masked_lowqual);
  319. // Annotate with depth and number of alternate reads from constitutional BAM
  320. info!("Reading Constit BAM file for depth and pileup annotation.");
  321. variants_collections.iter().try_for_each(|c| {
  322. c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150)
  323. })?;
  324. annotations.somatic_constit_boundaries(
  325. self.config.somatic_max_alt_constit,
  326. self.config.somatic_min_constit_depth,
  327. );
  328. annotations
  329. .callers_stat(Some(Box::new(|v| {
  330. matches!(
  331. v,
  332. Annotation::Callers(_, _)
  333. | Annotation::ConstitAlt(_)
  334. | Annotation::ConstitDepth(_)
  335. | Annotation::HighConstitAlt
  336. | Annotation::LowConstitDepth
  337. )
  338. })))
  339. .save_to_json(&format!("{stats_dir}/{id}_annotations_03_bam.json"))?;
  340. // Filter based on low constitutional depth
  341. info!(
  342. "Removing variants when depth in constit bam < {}.",
  343. self.config.somatic_min_constit_depth
  344. );
  345. somatic_stats.n_low_constit = annotations
  346. .retain_variants(&mut variants_collections, |anns| {
  347. !anns.contains(&Annotation::LowConstitDepth)
  348. });
  349. info!(
  350. "{} variants removed when depth in constit bam < {}.",
  351. somatic_stats.n_low_constit, self.config.somatic_min_constit_depth
  352. );
  353. // Filter: remove HighConstitAlt
  354. info!(
  355. "Removing variants with SNP/indel inside the constit alignements > {}",
  356. self.config.somatic_max_alt_constit
  357. );
  358. somatic_stats.n_high_alt_constit = annotations
  359. .retain_variants(&mut variants_collections, |anns| {
  360. !anns.contains(&Annotation::HighConstitAlt)
  361. });
  362. info!(
  363. "{} variants removed with SNP/indel inside the constit alignements > {}",
  364. somatic_stats.n_high_alt_constit, self.config.somatic_max_alt_constit
  365. );
  366. annotations
  367. .callers_stat(Some(Box::new(|v| {
  368. matches!(
  369. v,
  370. Annotation::Callers(_, _)
  371. | Annotation::ConstitAlt(_)
  372. | Annotation::ConstitDepth(_)
  373. )
  374. })))
  375. .save_to_json(&format!("{stats_dir}/{id}_annotations_04_bam_filter.json"))?;
  376. // Annotate variants with sequence context (entropy + trinucleotide)
  377. info!(
  378. "Annotating variants with sequence context using reference: {}",
  379. self.config.reference
  380. );
  381. variants_collections.iter().try_for_each(|c| {
  382. c.annotate_with_sequence_context(
  383. &annotations,
  384. &self.config.reference,
  385. self.config.entropy_seq_len,
  386. self.config.somatic_pipe_threads,
  387. )
  388. })?;
  389. // Annotate with external databases like COSMIC and GnomAD
  390. info!("Annotation with external databases like COSMIC and GnomAD.");
  391. variants_collections
  392. .iter()
  393. .try_for_each(|c| -> anyhow::Result<()> {
  394. let ext_annot = ExternalAnnotation::init(&config)?;
  395. ext_annot.annotate(&c.variants, &annotations)?;
  396. Ok(())
  397. })?;
  398. annotations
  399. .callers_stat(Some(Box::new(|v| {
  400. matches!(
  401. v,
  402. Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
  403. )
  404. })))
  405. .save_to_json(&format!("{stats_dir}/{id}_annotations_05_gnomad.json"))?;
  406. // Filter: Remove variants present in GnomAD and have alt reads in constitutional sample
  407. info!("Filtering out variants in GnomAD and in constit bam at low AF.");
  408. somatic_stats.n_high_alt_constit_gnomad = annotations
  409. .retain_variants(&mut variants_collections, |anns| {
  410. !is_gnomad_and_constit_alt(anns)
  411. });
  412. info!(
  413. "{} variants filtered, with constit alt <= max contig alt ({}) and in GnomAD.",
  414. somatic_stats.n_high_alt_constit_gnomad, self.config.somatic_max_alt_constit
  415. );
  416. // TODO: These stats doesn't capture filter metrics !!!
  417. annotations
  418. .callers_stat(Some(Box::new(|v| {
  419. matches!(
  420. v,
  421. Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
  422. )
  423. })))
  424. .save_to_json(&format!(
  425. "{stats_dir}/{id}_annotations_06_gnomad_filter.json"
  426. ))?;
  427. // Filter low entropy variants
  428. annotations.low_shannon_entropy(self.config.min_shannon_entropy);
  429. info!(
  430. "Filtering out variants with low entropies ({})",
  431. config.min_shannon_entropy
  432. );
  433. annotations
  434. .callers_stat(Some(Box::new(|v| {
  435. matches!(v, Annotation::Callers(_, _) | Annotation::LowEntropy)
  436. })))
  437. .save_to_json(&format!("{stats_dir}/{id}_annotations_07_entropy.json"))?;
  438. somatic_stats.n_low_entropies = annotations
  439. .retain_variants(&mut variants_collections, |anns| {
  440. !anns.contains(&Annotation::LowEntropy)
  441. });
  442. annotations
  443. .callers_stat(Some(Box::new(|v| matches!(v, Annotation::Callers(_, _)))))
  444. .save_to_json(&format!(
  445. "{stats_dir}/{id}_annotations_08_entropy_filter.json"
  446. ))?;
  447. // Final VEP annotation
  448. info!("Annotation with VEP.");
  449. variants_collections
  450. .iter()
  451. .try_for_each(|c| -> anyhow::Result<()> {
  452. let ext_annot = ExternalAnnotation::init(&config)?;
  453. ext_annot.annotate_vep(&c.variants, &annotations)?;
  454. Ok(())
  455. })?;
  456. annotations
  457. .callers_stat(Some(Box::new(caller_cat_anns)))
  458. .save_to_json(&format!("{stats_dir}/{id}_annotations_09_vep.json"))?;
  459. // Merge all variants into a final collection
  460. let mut variants = variants_collections.into_iter().fold(
  461. Variants::default(),
  462. |mut acc, variants_collection| {
  463. acc.merge(variants_collection, &annotations);
  464. acc
  465. },
  466. );
  467. let vep_stats = annotations.vep_stats()?;
  468. vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?;
  469. VariantsStats::new(&mut variants, &id, &config, &high_depth_ranges)?
  470. .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?;
  471. info!("Final unique variants: {}", variants.data.len());
  472. // Ensure sorted (should already be sorted)
  473. variants.sort();
  474. let vntrs: Vec<GenomeRange> = read_bed(&config.vntrs_bed)?
  475. .iter()
  476. .map(|r| r.range.clone())
  477. .collect();
  478. variants.annotate_with_ranges(&vntrs, Some(Annotation::VNTR), 0, Vec::new());
  479. let repeat_masker: Vec<GenomeRange> = read_bed(&config.repeats_bed)?
  480. .iter()
  481. .map(|r| r.range.clone())
  482. .collect();
  483. variants.annotate_with_ranges(&repeat_masker, Some(Annotation::Repeat), 0, Vec::new());
  484. variants.save_to_json(&result_json)?;
  485. variants.save_to_file(&result_bit)?;
  486. variants.write_vcf(&result_vcf, &config.dict_file, config.somatic_pipe_force)?;
  487. Ok(())
  488. }
  489. }
  490. pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
  491. info!("Loading Germline");
  492. let annotations = Annotations::default();
  493. let clairs_germline = ClairS::initialize(&id, &config.clone())?.germline(&annotations)?;
  494. info!("Annotation with Cosmic and GnomAD.");
  495. let ext_annot = ExternalAnnotation::init(&config)?;
  496. ext_annot.annotate(&clairs_germline.variants, &annotations)?;
  497. let mut variants = Variants::default();
  498. variants.merge(clairs_germline, &annotations);
  499. info!("-- {}", variants.data.len());
  500. // let u = VariantsStats::new(&mut variants, &id, &config)?;
  501. // info!("++ {}", u.n);
  502. Ok(())
  503. }
  504. /// Holds statistical data for somatic variant pipeline processing,
  505. /// including summary counts and input categorization.
  506. #[derive(Debug, Default, Clone)]
  507. pub struct SomaticPipeStats {
  508. /// Summary of input variant collections grouped by sample type.
  509. pub input: InputStats,
  510. /// Number of variants removed because they were classified as germline
  511. /// by at least one caller, or detected in the normal-only (solo constitutional)
  512. /// callset (including variants classified as both).
  513. pub n_constit_germline: usize,
  514. /// Number of variants removed due to low depth in the constitutional (normal) BAM
  515. /// (i.e. annotated with [`Annotation::LowConstitDepth`]).
  516. pub n_low_constit: usize,
  517. /// Number of variants removed due to excessive alternate-allele support in the
  518. /// constitutional (normal) BAM (i.e. annotated with [`Annotation::HighConstitAlt`]).
  519. pub n_high_alt_constit: usize,
  520. /// Number of variants removed because they are present in gnomAD and also show
  521. /// alternate-allele support in the constitutional (normal) BAM, according to
  522. /// [`is_gnomad_and_constit_alt`].
  523. pub n_high_alt_constit_gnomad: usize,
  524. /// Number of variants removed due to low sequence complexity (Shannon entropy),
  525. /// i.e. annotated with [`Annotation::LowEntropy`].
  526. pub n_low_entropies: usize,
  527. }
  528. impl SomaticPipeStats {
  529. /// Initializes a `SomaticPipeStats` object with populated `InputStats` based on the
  530. /// provided `VariantCollection`s.
  531. pub fn init(collections: &[VariantCollection]) -> Self {
  532. Self {
  533. input: InputStats::from_collections(collections),
  534. ..Default::default()
  535. }
  536. }
  537. /// Generates a tumor-vs-germline annotation matrix and writes it to a JSON file.
  538. ///
  539. /// This method analyzes co-occurrence patterns between tumor and germline/constit
  540. /// variant callers by iterating through annotation statistics from `AnnotationsStats`.
  541. /// It builds a matrix where each row corresponds to a **tumor caller** and each column
  542. /// corresponds to a **germline or constitutional caller**. Each cell contains the number
  543. /// of variant calls that were annotated by both the tumor and the germline/constit caller.
  544. ///
  545. /// In addition to writing a structured JSON file to the specified path, the function also
  546. /// prints a tab-separated human-readable matrix to stdout for convenience.
  547. ///
  548. /// # Parameters
  549. ///
  550. /// - `stats`: A reference to [`AnnotationsStats`] containing categorical annotations.
  551. /// This object holds a frequency map where keys are combinations of `Annotation`
  552. /// values (serialized as strings) and values are occurrence counts.
  553. /// - `json_path`: The path where the resulting JSON output file should be written.
  554. ///
  555. /// # Output Formats
  556. ///
  557. /// ## JSON Output (`json_path`)
  558. ///
  559. /// The output JSON is an **array of tumor caller records**, each containing:
  560. /// - `caller_name`: The name of the tumor caller (as a string).
  561. /// - `germline`: An array of JSON objects, each with one key-value pair,
  562. /// where the key is a germline/constit caller (or combination) and the value is the count.
  563. ///
  564. /// ### Example
  565. /// ```json
  566. /// [
  567. /// {
  568. /// "caller_name": "DeepVariant SoloTumor",
  569. /// "germline": {
  570. /// "ClairS Germline": 99978,
  571. /// "ClairS Germline + DeepVariant SoloConstit": 4710570
  572. /// }
  573. /// },
  574. /// {
  575. /// "caller_name": "ClairS Somatic",
  576. /// "germline": {
  577. /// "ClairS Germline": 944,
  578. /// "ClairS Germline + DeepVariant SoloConstit": 362
  579. /// }
  580. /// }
  581. /// ]
  582. /// ```
  583. ///
  584. /// ## Console Output (TSV)
  585. ///
  586. /// A tab-separated matrix is printed to stdout. Each row begins with a tumor caller,
  587. /// followed by columns showing germline/constit caller combinations and their counts.
  588. ///
  589. /// # Notes
  590. /// - Tumor callers are collected from `self.input.somatic` and `self.input.solo_tumor`.
  591. /// - Germline/constit callers are collected from `self.input.germline` and `self.input.solo_constit`.
  592. /// - Keys in the original `AnnotationsStats` map are split using `" + "` and parsed into `Annotation` enums.
  593. /// - Germline keys are sorted and joined to form canonical column labels.
  594. /// - Tumor and germline annotations are matched by exact `Annotation` equality.
  595. ///
  596. /// # Errors
  597. /// Returns an error if:
  598. /// - Any annotation key in `AnnotationsStats` fails to parse into a valid `Annotation`.
  599. /// - The JSON output file cannot be created or written to.
  600. ///
  601. /// # Dependencies
  602. /// Requires that `Annotation` implements `FromStr`, `ToString`, `PartialEq`, and `Clone`.
  603. ///
  604. /// # Example Usage
  605. /// ```rust
  606. /// let mut stats = SomaticPipeStats::init(&collections);
  607. /// stats.annot_init(&annotation_stats, "output/matrix.json")?;
  608. /// ```
  609. pub fn annot_init(&self, stats: &AnnotationsStats, json_path: &str) -> anyhow::Result<()> {
  610. #[derive(Serialize)]
  611. struct TumorRow {
  612. caller_name: String,
  613. germline: BTreeMap<String, u64>,
  614. }
  615. // Parse stats keys into Vec<Annotation> once
  616. let parsed: Vec<(Vec<Annotation>, u64)> = stats
  617. .categorical
  618. .iter()
  619. .map(|e| {
  620. let anns = e
  621. .key()
  622. .split(" + ")
  623. .map(|k| k.parse::<Annotation>())
  624. .collect::<anyhow::Result<Vec<_>>>()
  625. .map_err(|err| {
  626. anyhow::anyhow!("Error while parsing AnnotationsStats key: {err}")
  627. })?;
  628. Ok((anns, *e.value()))
  629. })
  630. .collect::<anyhow::Result<_>>()?;
  631. // Collect caller labels
  632. let tumor_callers: Vec<Annotation> = self
  633. .input
  634. .somatic
  635. .iter()
  636. .map(|(a, _)| a.clone())
  637. .chain(self.input.solo_tumor.iter().map(|(a, _)| a.clone()))
  638. .collect();
  639. let germ_callers: Vec<Annotation> = self
  640. .input
  641. .germline
  642. .iter()
  643. .map(|(a, _)| a.clone())
  644. .chain(self.input.solo_constit.iter().map(|(a, _)| a.clone()))
  645. .collect();
  646. // Matrix: tumor -> (germline label -> count), deterministic by construction
  647. let mut matrix: BTreeMap<String, BTreeMap<String, u64>> = BTreeMap::new();
  648. for (anns, v) in parsed.iter() {
  649. // only rows that include any germline/constit label
  650. if !anns.iter().any(|a| {
  651. matches!(
  652. a,
  653. Annotation::Callers(_, Sample::SoloConstit)
  654. | Annotation::Callers(_, Sample::Germline)
  655. )
  656. }) {
  657. continue;
  658. }
  659. // Which tumor callers are present in this set?
  660. let present_tumors: Vec<String> = tumor_callers
  661. .iter()
  662. .filter(|t| anns.contains(t))
  663. .map(|t| t.to_string())
  664. .collect();
  665. if present_tumors.is_empty() {
  666. continue;
  667. }
  668. // Canonical germline key: sorted labels joined by " + "
  669. let mut present_germs: Vec<String> = germ_callers
  670. .iter()
  671. .filter(|g| anns.contains(g))
  672. .map(|g| g.to_string())
  673. .collect();
  674. present_germs.sort();
  675. let germ_key = present_germs.join(" + ");
  676. for tumor in present_tumors {
  677. *matrix
  678. .entry(tumor)
  679. .or_default()
  680. .entry(germ_key.clone())
  681. .or_default() += *v;
  682. }
  683. }
  684. // Collect all germline columns (deterministic)
  685. let mut germ_cols: Vec<String> = matrix
  686. .values()
  687. .flat_map(|row| row.keys().cloned())
  688. .collect();
  689. germ_cols.sort();
  690. germ_cols.dedup();
  691. // TSV output (deterministic ordering)
  692. let mut lines: Vec<String> = Vec::with_capacity(matrix.len());
  693. for (tumor, row) in matrix.iter() {
  694. let line = format!(
  695. "{tumor}\t{}",
  696. germ_cols
  697. .iter()
  698. .map(|g| format!("{g}: {}", row.get(g).copied().unwrap_or(0)))
  699. .collect::<Vec<_>>()
  700. .join("\t")
  701. );
  702. lines.push(line);
  703. }
  704. println!("{}", lines.join("\n"));
  705. // JSON output: list of rows with all columns filled (0 if absent)
  706. let json_rows: Vec<TumorRow> = matrix
  707. .iter()
  708. .map(|(tumor, row)| {
  709. let mut full = BTreeMap::new();
  710. for g in germ_cols.iter() {
  711. full.insert(g.clone(), row.get(g).copied().unwrap_or(0));
  712. }
  713. TumorRow {
  714. caller_name: tumor.clone(),
  715. germline: full,
  716. }
  717. })
  718. .collect();
  719. let file = File::create(json_path)?;
  720. serde_json::to_writer_pretty(file, &json_rows)?;
  721. Ok(())
  722. }
  723. }
  724. /// Aggregated statistics on variant collections, grouped by sample type.
  725. ///
  726. /// The `InputStats` struct stores the number of variants observed for each
  727. /// caller, categorized by biological sample type:
  728. ///
  729. /// - `solo_tumor`: Variants from tumor-only samples.
  730. /// - `solo_constit`: Variants from normal-only (constitutional) samples.
  731. /// - `germline`: Variants classified as germline (shared between normal and tumor).
  732. /// - `somatic`: Variants classified as somatic (tumor-specific).
  733. ///
  734. /// Each vector contains tuples of the form `(Annotation, usize)`, where:
  735. /// - `Annotation` identifies the caller and sample type.
  736. /// - `usize` represents the number of variants for that annotation.
  737. #[derive(Debug, Default, Clone)]
  738. pub struct InputStats {
  739. /// Variants from tumor-only samples.
  740. pub solo_tumor: Vec<(Annotation, usize)>,
  741. /// Variants from normal-only (constitutional) samples.
  742. pub solo_constit: Vec<(Annotation, usize)>,
  743. /// Germline variants (present in both normal and tumor).
  744. pub germline: Vec<(Annotation, usize)>,
  745. /// Somatic variants (specific to tumor).
  746. pub somatic: Vec<(Annotation, usize)>,
  747. }
  748. impl InputStats {
  749. /// Constructs an `InputStats` instance by aggregating variant counts
  750. /// from a list of [`VariantCollection`]s.
  751. ///
  752. /// Each `VariantCollection` is inspected to determine its sample type,
  753. /// as indicated by the `Annotation::Callers(_, Sample)` variant. The
  754. /// number of variants (`collection.variants.len()`) is recorded in the
  755. /// appropriate field of `InputStats`, grouped by `Sample`.
  756. ///
  757. /// # Parameters
  758. ///
  759. /// - `collections`: A slice of [`VariantCollection`] objects containing
  760. /// variants annotated with a caller and sample type.
  761. ///
  762. /// # Returns
  763. ///
  764. /// A populated `InputStats` instance with counts per (caller, sample type).
  765. ///
  766. /// # Examples
  767. ///
  768. /// ```
  769. /// let collections: Vec<VariantCollection> = load_collections();
  770. /// let stats = InputStats::from_collections(&collections);
  771. /// println!("Somatic calls: {:?}", stats.somatic);
  772. /// ```
  773. pub fn from_collections(collections: &[VariantCollection]) -> Self {
  774. let mut stats = Self::default();
  775. for collection in collections.iter() {
  776. match collection.caller {
  777. Annotation::Callers(_, Sample::SoloTumor) => stats
  778. .solo_tumor
  779. .push((collection.caller.clone(), collection.variants.len())),
  780. Annotation::Callers(_, Sample::SoloConstit) => stats
  781. .solo_constit
  782. .push((collection.caller.clone(), collection.variants.len())),
  783. Annotation::Callers(_, Sample::Germline) => stats
  784. .germline
  785. .push((collection.caller.clone(), collection.variants.len())),
  786. Annotation::Callers(_, Sample::Somatic) => stats
  787. .somatic
  788. .push((collection.caller.clone(), collection.variants.len())),
  789. _ => (),
  790. };
  791. }
  792. stats
  793. }
  794. }
  795. #[cfg(test)]
  796. mod tests {
  797. use rayon::ThreadPoolBuilder;
  798. use super::*;
  799. use crate::helpers::test_init;
  800. #[test]
  801. fn somatic_pipe_run() -> anyhow::Result<()> {
  802. test_init();
  803. let config = Config::default();
  804. let pool = ThreadPoolBuilder::new()
  805. .num_threads(config.threads.into())
  806. .build()?;
  807. pool.install(|| SomaticPipe::initialize("CHAHA", &config)?.run())?;
  808. Ok(())
  809. }
  810. }