somatic.rs 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028
  1. use crate::{
  2. annotation::{is_gnomad_and_constit_alt, Caller},
  3. create_should_run_normal_tumoral, init_solo_callers_normal_tumoral,
  4. io::bed::read_bed,
  5. pipes::{Initialize, InitializeSolo, ShouldRun},
  6. positions::{GenomeRange, GenomeRangeExt},
  7. scan::scan::SomaticScan,
  8. variant::{
  9. variants_stats::somatic_depth_quality_ranges,
  10. vcf_variant::{run_if_required, ShouldRunBox},
  11. },
  12. };
  13. use log::info;
  14. use rayon::slice::ParallelSliceMut;
  15. use serde::{Deserialize, Serialize};
  16. use std::{
  17. collections::BTreeMap,
  18. fs::{self, File},
  19. path::{Path, PathBuf},
  20. };
  21. use crate::{
  22. annotation::{Annotation, Annotations, AnnotationsStats, Sample},
  23. callers::{
  24. clairs::ClairS,
  25. deep_somatic::DeepSomatic,
  26. deep_variant::DeepVariant,
  27. nanomonsv::NanomonSV,
  28. savana::{Savana, SavanaCN},
  29. severus::Severus,
  30. },
  31. collection::bam_stats::WGSBamStats,
  32. config::Config,
  33. create_should_run, init_somatic_callers,
  34. io::somaticpipe_container::{
  35. bam_qc_section, copy_number_section, pipe_qc_section, provenance_section,
  36. variant_records_and_index_sections, variants_section, BamQcPayload, CompressionMetadata,
  37. ContainerHeader, EncryptionMetadata, PipeQcPayload, ProducerMetadata, ProvenancePayload,
  38. SampleMetadata,
  39. },
  40. runners::Run,
  41. variant::{
  42. variant_collection::{ExternalAnnotation, VariantCollection, Variants},
  43. variants_stats::VariantsStats,
  44. vcf_variant::{load_variants, CallerBox},
  45. },
  46. };
  47. /// Runs the full somatic variant calling pipeline for a single sample (`id`).
  48. ///
  49. /// This function orchestrates the entire somatic variant discovery process,
  50. /// starting from raw variant caller outputs (`PASSED VCF`) and applying multiple filtering
  51. /// and annotation steps to produce high-confidence somatic variants.
  52. ///
  53. /// This function orchestrates the end-to-end somatic variant discovery process:
  54. /// - Executes and verifies upstream components if necessary
  55. /// - Loads variants from multiple callers (tumor and normal samples)
  56. /// - Applies several annotation and filtering steps (e.g. depth, population frequency, entropy)
  57. /// - Tracks filtering statistics at each step
  58. /// - Outputs high-confidence somatic variants in both `.json.gz` and `.bit` formats
  59. ///
  60. /// ## Output Overview
  61. /// The final output includes:
  62. /// - `{tumoral_dir}/{id}_somatic_variants.json.gz`: annotated somatic variants
  63. /// - `{tumoral_dir}/{id}_somatic_variants.bit`: compact binary variant representation
  64. /// - `{stats_dir}/`: multiple intermediate JSON files with annotations and statistics
  65. ///
  66. /// ## Steps
  67. ///
  68. /// This pipeline performs the following high-level steps:
  69. ///
  70. /// ### 1. Output Existence Check
  71. /// If the final JSON result already exists and [`Config::somatic_pipe_force`] is not set,
  72. /// the pipeline aborts early to avoid overwriting results.
  73. ///
  74. /// ### 2. Initialization
  75. /// - Prepares statistics and output directories.
  76. /// - Initializes variant annotations.
  77. ///
  78. /// ### 3. Pre-requisite Tool Execution
  79. /// Runs any required upstream components (e.g., alignment, basecalling, variant callers) if
  80. /// their outputs are missing, using the [`run_if_required`] logic.
  81. ///
  82. /// ### 4. Load Variant Collections
  83. /// - Initializes the configured somatic variant callers and loads their output variants from PASSED VCF.
  84. /// - Also loads germline variants (from [`ClairS::germline`]) for comparative germline filtering.
  85. ///
  86. /// ### 5. Statistics Initialization
  87. /// - Initializes a [`SomaticPipeStats`] object to track the number of variants at each step.
  88. /// - Captures initial variant counts before filtering.
  89. /// - Aggregates variant counts from all sources and stores initial annotations for quality control and
  90. /// comparison before filtering.
  91. ///
  92. /// ### 6. Filter: Germline & Solo Constitutional Variants
  93. /// - Removes variants labeled as either Germline or Solo Constitutional, assuming they are unlikely to be somatic.
  94. /// - Records count of removed variants in [`SomaticPipeStats::n_constit_germline`].
  95. ///
  96. /// ### 7. Annotation: BAM Read Depth and Alt Allele Counts
  97. /// - Uses the constitutional BAM file to annotate each variant with read depth and the number of alternate reads observed.
  98. /// - Flags variants with low depth or excessive alt reads for filtering as specified in [`Config`].
  99. ///
  100. /// ### 8. Filtering: Low Depth / High Alt Alleles
  101. /// - Removes variants with low coverage in the constitutional sample, or excessive alt allele support (suggestive of germline origin).
  102. /// - Updates stats:
  103. /// - [`SomaticPipeStats::n_low_constit`]
  104. /// - [`SomaticPipeStats::n_high_alt_constit`]
  105. ///
  106. /// ### 9. Annotation: Sequence Entropy
  107. /// Adds Shannon entropy annotation based on the reference sequence context
  108. /// around each variant (cf. [`Config::entropy_seq_len`]) to flag low-complexity regions (often repetitive).
  109. ///
  110. /// ### 10. Annotation: External Databases (COSMIC, GnomAD):
  111. /// - Uses external resources to annotate variants with:
  112. /// - COSMIC hits (somatic mutation database)
  113. /// - GnomAD allele frequencies
  114. ///
  115. /// ### 11. Filtering: GnomAD + Alt Support in Constitutional Sample
  116. /// - Removes variants that are both present in GnomAD **and** show
  117. /// alternate allele support in the constitutional BAM.
  118. /// These are highly likely to be non-somatic germline polymorphisms.
  119. /// - Updates [`SomaticPipeStats::n_high_alt_constit_gnomad`] stat.
  120. ///
  121. /// ### 12. Filtering: Low Shannon Entropy:
  122. /// - Removes variants from low-complexity regions with entropy below the configured threshold
  123. /// (cf. [`Config::min_shannon_entropy`]).
  124. /// - Updates [`SomaticPipeStats::n_low_entropies`].
  125. ///
  126. /// ### 13. Annotation: VEP (Variant Effect Predictor)
  127. /// Adds transcript-level annotations from Ensembl VEP, providing functional consequences,
  128. /// impacted genes, and regulatory features.
  129. ///
  130. /// ### 14. Merging
  131. /// Merges variant collections into a unified [`Variants`] structure,
  132. /// preserving annotations and applying deduplication logic.
  133. ///
  134. /// ### 15. Final Statistics and Saving
  135. /// - Saves final annotation stats, VEP summaries, and variant-level statistics.
  136. /// - Exports the final somatic variant set to both compressed JSON and `.bit` formats.
  137. ///
  138. /// # Returns
  139. /// - `Ok(())` if all steps completed successfully.
  140. /// - `Err` if any tool fails, if file I/O fails, or if logical conditions are violated (e.g., pre-existing output).
  141. ///
  142. /// # Errors
  143. /// - Returns early if the output file already exists and [`Config::somatic_pipe_force`] is `false`.
  144. /// - Wraps all component-specific errors using `anyhow::Result` with context.
  145. ///
  146. /// # Side Effects
  147. /// - Runs external tools conditionally (e.g., [`ClairS`], [`DeepSomatic`]).
  148. /// - Creates intermediate directories and annotation JSONs for debugging/QC.
  149. /// - May consume significant compute time depending on the number of callers, annotations, and variants.
  150. ///
  151. /// # TODOs
  152. /// - Support compressed intermediate files (`// TODO: GZ !!!`)
  153. /// - Improve filtering metrics reporting (currently not all filtered variants are tracked in final stats).
  154. ///
  155. /// # Example Output Files
  156. /// - `tumoral_dir/sample123_somatic_variants.json.gz`
  157. /// - `tumoral_dir/sample123_somatic_variants.bit`
  158. /// - `stats_dir/sample123_annotations_*.json` (intermediate annotation snapshots)
  159. ///
  160. /// # See Also
  161. /// - [`Annotations`] – core structure for managing per-variant metadata
  162. /// - [`Variants`] – the merged final variant structure
  163. /// - [`SomaticPipeStats`] – used for tracking variant counts throughout filtering
  164. ///
  165. pub struct SomaticPipe {
  166. /// Unique identifier for the sample.
  167. pub id: String,
  168. /// Configuration parameters for the pipeline.
  169. pub config: Config,
  170. }
  171. impl Initialize for SomaticPipe {
  172. /// Initializes a new `Somatic` instance with default annotations.
  173. fn initialize(id: &str, config: &crate::config::Config) -> anyhow::Result<Self> {
  174. let id = id.to_string();
  175. Ok(Self {
  176. id,
  177. config: config.clone(),
  178. })
  179. }
  180. }
  181. impl ShouldRun for SomaticPipe {
  182. fn should_run(&self) -> bool {
  183. // path to the existing “.bit” file
  184. let tumoral_dir = self.config.tumoral_dir(&self.id);
  185. let bit_path = Path::new(&tumoral_dir).join(format!("{}_somatic_variants.bit", self.id));
  186. // if we can’t read its modification time, re-run
  187. let res_meta = match fs::metadata(&bit_path).and_then(|m| m.modified()) {
  188. Ok(ts) => ts,
  189. Err(_) => return true,
  190. };
  191. let deps: [PathBuf; 5] = [
  192. Path::new(&self.config.normal_bam(&self.id)).to_path_buf(),
  193. Path::new(&self.config.tumoral_bam(&self.id)).to_path_buf(),
  194. Path::new(&self.config.reference).to_path_buf(),
  195. Path::new(&self.config.vntrs_bed).to_path_buf(),
  196. Path::new(&self.config.repeats_bed).to_path_buf(),
  197. ];
  198. deps.iter().any(|p| {
  199. fs::metadata(p)
  200. .and_then(|m| m.modified())
  201. .map_or(true, |ts| ts > res_meta)
  202. })
  203. }
  204. }
  205. impl Run for SomaticPipe {
  206. /// Executes the full somatic variant analysis pipeline.
  207. fn run(&mut self) -> anyhow::Result<()> {
  208. let config = self.config.clone();
  209. let id = self.id.clone();
  210. info!("Running somatic pipe for {id}.");
  211. // Define output paths for the final somatic variants
  212. let result_json = format!("{}/{id}_somatic_variants.json.gz", config.tumoral_dir(&id));
  213. let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id));
  214. let result_vcf = format!("{}/{id}_somatic_variants.vcf.gz", config.tumoral_dir(&id));
  215. let result_pandora = format!("{}/{id}.pandora", config.tumoral_dir(&id));
  216. let outputs = [&result_json, &result_bit, &result_vcf, &result_pandora];
  217. if !config.somatic_pipe_force && outputs.iter().any(|p| Path::new(p).exists()) {
  218. return Err(anyhow::anyhow!(
  219. "Somatic Pipe output already exists for {id}."
  220. ));
  221. }
  222. let mut annotations = Annotations::default();
  223. // Create stats directory if it doesn't exist
  224. let stats_dir = config.somatic_pipe_stats(&id);
  225. if !Path::new(&stats_dir).exists() {
  226. fs::create_dir(&stats_dir)?;
  227. }
  228. // Initialize and run any pre-required input if necessary
  229. info!("Initialization prerequired pipe inputs...");
  230. let mut to_run_if_req = create_should_run!(
  231. &id,
  232. &config,
  233. SomaticScan,
  234. ClairS,
  235. NanomonSV,
  236. Severus,
  237. Savana,
  238. DeepSomatic
  239. );
  240. to_run_if_req.extend(create_should_run_normal_tumoral!(&id, &config, DeepVariant));
  241. info!("Running prerequired pipe components.");
  242. run_if_required(&mut to_run_if_req).map_err(|e| {
  243. anyhow::anyhow!("Failed to run a prerequired component of somatic pipe for {id}.\n{e}")
  244. })?;
  245. // Initialize variant callers
  246. let mut callers = init_somatic_callers!(
  247. &id,
  248. &config,
  249. ClairS,
  250. NanomonSV,
  251. Severus,
  252. Savana,
  253. DeepSomatic
  254. );
  255. callers.extend(init_solo_callers_normal_tumoral!(&id, &config, DeepVariant,));
  256. // Load variants from each caller
  257. info!("Loading variants.");
  258. let mut variants_collections = load_variants(&mut callers, &annotations)
  259. .map_err(|e| anyhow::anyhow!("Error while loading variants\n{e}"))?;
  260. // Load germline variants using ClairS
  261. info!("Loading germline variants.");
  262. let clairs_germline =
  263. ClairS::initialize(&id, &self.config.clone())?.germline(&annotations)?;
  264. variants_collections.push(clairs_germline);
  265. // Initialize statistics
  266. let mut somatic_stats = SomaticPipeStats::init(&variants_collections);
  267. info!(
  268. "Variants collections loaded from {} vcf (total of {} variants loaded)",
  269. variants_collections.len(),
  270. variants_collections
  271. .iter()
  272. .map(|v| v.variants.len())
  273. .sum::<usize>()
  274. );
  275. // Initial annotation stats (caller annotations only)
  276. let caller_cat_anns = |v: &Annotation| matches!(v, Annotation::Callers(_, _));
  277. let annot_init = annotations.callers_stat(Some(Box::new(caller_cat_anns)));
  278. somatic_stats.annot_init(
  279. &annot_init,
  280. &format!("{stats_dir}/{id}_germline_filter.json"),
  281. )?;
  282. annot_init.save_to_json(&format!("{stats_dir}/{id}_annotations_01.json"))?;
  283. // Filter out germline and solo constitutional variants
  284. info!(
  285. "Keeping somatic variants (variants neither in solo normal {} nor in germline).",
  286. config.normal_name
  287. );
  288. somatic_stats.n_constit_germline =
  289. annotations.retain_variants(&mut variants_collections, |anns| {
  290. !anns.iter().any(|ann| {
  291. matches!(
  292. ann,
  293. Annotation::Callers(_, Sample::Germline)
  294. | Annotation::Callers(_, Sample::SoloConstit)
  295. )
  296. })
  297. });
  298. annotations
  299. .callers_stat(Some(Box::new(caller_cat_anns)))
  300. .save_to_json(&format!(
  301. "{stats_dir}/{id}_annotations_02_post_germline.json"
  302. ))?;
  303. // Remove deletions stretch
  304. // info!("Removing deletions stretchs:");
  305. // variants_collections.iter_mut().for_each(|coll| {
  306. // let rm = coll.remove_strech();
  307. // info!("\t - {} {}: {}", coll.vcf.caller, coll.vcf.time, rm);
  308. // });
  309. // MASK mapq
  310. let (mut high_depth_ranges, mut low_quality_ranges) =
  311. somatic_depth_quality_ranges(&id, &config)?;
  312. high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
  313. low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
  314. info!(
  315. "High-depth ranges: n={} bp={}",
  316. high_depth_ranges.len(),
  317. high_depth_ranges.total_len(),
  318. );
  319. info!(
  320. "LowQ ranges: n={} bp={}",
  321. low_quality_ranges.len(),
  322. low_quality_ranges.total_len(),
  323. );
  324. variants_collections.iter_mut().for_each(|e| {
  325. let _ = e.annotate_with_ranges(&low_quality_ranges, &annotations, Annotation::LowMAPQ);
  326. });
  327. let n_masked_lowqual = annotations.retain_variants(&mut variants_collections, |anns| {
  328. !anns.contains(&Annotation::LowMAPQ)
  329. });
  330. info!("N low mapq filtered: {}", n_masked_lowqual);
  331. // Annotate with depth and number of alternate reads from constitutional BAM
  332. info!("Reading Constit BAM file for depth and pileup annotation.");
  333. variants_collections.iter().try_for_each(|c| {
  334. c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150)
  335. })?;
  336. annotations.somatic_constit_boundaries(
  337. self.config.somatic_max_alt_constit,
  338. self.config.somatic_min_constit_depth,
  339. );
  340. annotations
  341. .callers_stat(Some(Box::new(|v| {
  342. matches!(
  343. v,
  344. Annotation::Callers(_, _)
  345. | Annotation::ConstitAlt(_)
  346. | Annotation::ConstitDepth(_)
  347. | Annotation::HighConstitAlt
  348. | Annotation::LowConstitDepth
  349. )
  350. })))
  351. .save_to_json(&format!("{stats_dir}/{id}_annotations_03_bam.json"))?;
  352. // Filter based on low constitutional depth
  353. info!(
  354. "Removing variants when depth in constit bam < {}.",
  355. self.config.somatic_min_constit_depth
  356. );
  357. somatic_stats.n_low_constit = annotations
  358. .retain_variants(&mut variants_collections, |anns| {
  359. !anns.contains(&Annotation::LowConstitDepth)
  360. });
  361. info!(
  362. "{} variants removed when depth in constit bam < {}.",
  363. somatic_stats.n_low_constit, self.config.somatic_min_constit_depth
  364. );
  365. // Filter: remove HighConstitAlt
  366. info!(
  367. "Removing variants with SNP/indel inside the constit alignements > {}",
  368. self.config.somatic_max_alt_constit
  369. );
  370. somatic_stats.n_high_alt_constit = annotations
  371. .retain_variants(&mut variants_collections, |anns| {
  372. !anns.contains(&Annotation::HighConstitAlt)
  373. });
  374. info!(
  375. "{} variants removed with SNP/indel inside the constit alignements > {}",
  376. somatic_stats.n_high_alt_constit, self.config.somatic_max_alt_constit
  377. );
  378. annotations
  379. .callers_stat(Some(Box::new(|v| {
  380. matches!(
  381. v,
  382. Annotation::Callers(_, _)
  383. | Annotation::ConstitAlt(_)
  384. | Annotation::ConstitDepth(_)
  385. )
  386. })))
  387. .save_to_json(&format!("{stats_dir}/{id}_annotations_04_bam_filter.json"))?;
  388. // Annotate variants with sequence context (entropy + trinucleotide)
  389. info!(
  390. "Annotating variants with sequence context using reference: {}",
  391. self.config.reference
  392. );
  393. variants_collections.iter().try_for_each(|c| {
  394. c.annotate_with_sequence_context(
  395. &annotations,
  396. &self.config.reference,
  397. self.config.entropy_seq_len,
  398. self.config.somatic_pipe_threads,
  399. )
  400. })?;
  401. // Annotate with external databases like COSMIC and GnomAD
  402. info!("Annotation with external databases like COSMIC and GnomAD.");
  403. variants_collections
  404. .iter()
  405. .try_for_each(|c| -> anyhow::Result<()> {
  406. let ext_annot = ExternalAnnotation::init(&id, &config)?;
  407. ext_annot.annotate(&c.variants, &annotations)?;
  408. Ok(())
  409. })?;
  410. annotations
  411. .callers_stat(Some(Box::new(|v| {
  412. matches!(
  413. v,
  414. Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
  415. )
  416. })))
  417. .save_to_json(&format!("{stats_dir}/{id}_annotations_05_gnomad.json"))?;
  418. // Filter: Remove variants present in GnomAD and have alt reads in constitutional sample
  419. info!("Filtering out variants in GnomAD and in constit bam at low AF.");
  420. somatic_stats.n_high_alt_constit_gnomad = annotations
  421. .retain_variants(&mut variants_collections, |anns| {
  422. !is_gnomad_and_constit_alt(anns)
  423. });
  424. info!(
  425. "{} variants filtered, with constit alt <= max contig alt ({}) and in GnomAD.",
  426. somatic_stats.n_high_alt_constit_gnomad, self.config.somatic_max_alt_constit
  427. );
  428. // TODO: These stats doesn't capture filter metrics !!!
  429. annotations
  430. .callers_stat(Some(Box::new(|v| {
  431. matches!(
  432. v,
  433. Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
  434. )
  435. })))
  436. .save_to_json(&format!(
  437. "{stats_dir}/{id}_annotations_06_gnomad_filter.json"
  438. ))?;
  439. // Filter low entropy variants
  440. annotations.low_shannon_entropy(self.config.min_shannon_entropy);
  441. info!(
  442. "Filtering out variants with low entropies ({})",
  443. config.min_shannon_entropy
  444. );
  445. annotations
  446. .callers_stat(Some(Box::new(|v| {
  447. matches!(v, Annotation::Callers(_, _) | Annotation::LowEntropy)
  448. })))
  449. .save_to_json(&format!("{stats_dir}/{id}_annotations_07_entropy.json"))?;
  450. somatic_stats.n_low_entropies = annotations
  451. .retain_variants(&mut variants_collections, |anns| {
  452. !anns.contains(&Annotation::LowEntropy)
  453. });
  454. annotations
  455. .callers_stat(Some(Box::new(|v| matches!(v, Annotation::Callers(_, _)))))
  456. .save_to_json(&format!(
  457. "{stats_dir}/{id}_annotations_08_entropy_filter.json"
  458. ))?;
  459. // Final VEP annotation
  460. info!("Annotation with VEP.");
  461. variants_collections
  462. .iter()
  463. .try_for_each(|c| -> anyhow::Result<()> {
  464. let ext_annot = ExternalAnnotation::init(&id, &config)?;
  465. ext_annot.annotate_vep(&c.variants, &annotations)?;
  466. Ok(())
  467. })?;
  468. // Insertion annotation
  469. info!("Annotation of insertions with NanomonSV (RepeatMasker and Minimap2 on reference genome).");
  470. variants_collections
  471. .iter()
  472. .filter(|c| {
  473. matches!(
  474. c.caller,
  475. Annotation::Callers(Caller::Savana, Sample::Somatic)
  476. | Annotation::Callers(Caller::Severus, Sample::Somatic)
  477. | Annotation::Callers(Caller::NanomonSV, Sample::Somatic)
  478. )
  479. })
  480. .try_for_each(|c| -> anyhow::Result<()> {
  481. c.annotate_insertions_with_nanomonsv(&annotations, &config)?;
  482. Ok(())
  483. })?;
  484. annotations
  485. .callers_stat(Some(Box::new(caller_cat_anns)))
  486. .save_to_json(&format!("{stats_dir}/{id}_annotations_09_vep.json"))?;
  487. // Merge all variants into a final collection
  488. let mut variants = variants_collections.into_iter().fold(
  489. Variants::default(),
  490. |mut acc, variants_collection| {
  491. acc.merge(variants_collection, &annotations);
  492. acc
  493. },
  494. );
  495. let vep_stats = annotations.vep_stats()?;
  496. vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?;
  497. let variant_stats = VariantsStats::new(&mut variants, &id, &config, &high_depth_ranges)?;
  498. variant_stats.save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?;
  499. info!("Final unique variants: {}", variants.data.len());
  500. // Ensure sorted (should already be sorted)
  501. variants.sort();
  502. let vntrs: Vec<GenomeRange> = read_bed(&config.vntrs_bed)?
  503. .iter()
  504. .map(|r| r.range.clone())
  505. .collect();
  506. variants.annotate_with_ranges(&vntrs, Some(Annotation::VNTR), 0, Vec::new());
  507. let repeat_masker: Vec<GenomeRange> = read_bed(&config.repeats_bed)?
  508. .iter()
  509. .map(|r| r.range.clone())
  510. .collect();
  511. variants.annotate_with_ranges(&repeat_masker, Some(Annotation::Repeat), 0, Vec::new());
  512. variants.save_to_json(&result_json)?;
  513. variants.save_to_file(&result_bit)?;
  514. variants.write_vcf(&result_vcf, &config.dict_file, config.somatic_pipe_force)?;
  515. write_pandora_output(
  516. &result_pandora,
  517. &id,
  518. &config,
  519. &variants,
  520. somatic_stats,
  521. variant_stats,
  522. vep_stats,
  523. &annotations.callers_stat(None),
  524. )?;
  525. Ok(())
  526. }
  527. }
  528. fn write_pandora_output(
  529. output_path: &str,
  530. id: &str,
  531. config: &Config,
  532. variants: &Variants,
  533. somatic_pipe_stats: SomaticPipeStats,
  534. variant_stats: VariantsStats,
  535. vep_stats: crate::annotation::VepStats,
  536. final_annotation_stats: &AnnotationsStats,
  537. ) -> anyhow::Result<()> {
  538. let mut sections = Vec::new();
  539. sections.push(variants_section(variants));
  540. let (variant_records, variant_index) = variant_records_and_index_sections(variants)?;
  541. sections.push(variant_records);
  542. sections.push(variant_index);
  543. match SavanaCN::parse_file(id, config) {
  544. Ok(copy_number) => sections.push(copy_number_section(&copy_number)?.optional()),
  545. Err(err) => info!("Skipping .pandora copy_number section for {id}: {err}"),
  546. }
  547. let mut bam_qc = BamQcPayload {
  548. by_role: BTreeMap::new(),
  549. };
  550. match WGSBamStats::open(id, &config.tumoral_name, config) {
  551. Ok(stats) => {
  552. bam_qc.by_role.insert(config.tumoral_name.clone(), stats);
  553. }
  554. Err(err) => info!("Skipping .pandora tumor BAM QC for {id}: {err}"),
  555. }
  556. match WGSBamStats::open(id, &config.normal_name, config) {
  557. Ok(stats) => {
  558. bam_qc.by_role.insert(config.normal_name.clone(), stats);
  559. }
  560. Err(err) => info!("Skipping .pandora normal BAM QC for {id}: {err}"),
  561. }
  562. if !bam_qc.by_role.is_empty() {
  563. sections.push(bam_qc_section(&bam_qc)?);
  564. }
  565. let mut annotation_stats = BTreeMap::new();
  566. annotation_stats.insert("final".to_string(), final_annotation_stats.clone());
  567. let pipe_qc = PipeQcPayload {
  568. somatic_pipe_stats,
  569. variant_stats: Some(variant_stats),
  570. annotation_stats,
  571. vep_stats: Some(vep_stats),
  572. caller_outputs: Vec::new(),
  573. filter_steps: Vec::new(),
  574. };
  575. sections.push(pipe_qc_section(&pipe_qc)?);
  576. let config_digest = serde_json::to_vec(config)
  577. .ok()
  578. .map(|bytes| format!("blake3:{}", blake3::hash(&bytes).to_hex()));
  579. let provenance = ProvenancePayload {
  580. config_digest,
  581. input_digests: BTreeMap::new(),
  582. tool_versions: BTreeMap::new(),
  583. command_lines: Vec::new(),
  584. };
  585. sections.push(provenance_section(&provenance)?);
  586. let header = ContainerHeader {
  587. format: "somaticpipe.output".to_string(),
  588. format_version: crate::io::somaticpipe_container::PANDORA_FORMAT_VERSION,
  589. producer: ProducerMetadata {
  590. name: env!("CARGO_PKG_NAME").to_string(),
  591. pipeline: "SomaticPipe".to_string(),
  592. pipeline_version: env!("CARGO_PKG_VERSION").to_string(),
  593. git_commit: option_env!("GIT_COMMIT").map(ToString::to_string),
  594. created_at: chrono::Utc::now(),
  595. },
  596. sample: SampleMetadata {
  597. sample_id: id.to_string(),
  598. tumor_timepoint: Some(config.tumoral_name.clone()),
  599. normal_timepoint: Some(config.normal_name.clone()),
  600. reference: Some(config.reference_name.clone()),
  601. reference_digest: None,
  602. },
  603. compression: CompressionMetadata::default(),
  604. encryption: EncryptionMetadata::default(),
  605. sections: Vec::new(),
  606. };
  607. crate::io::somaticpipe_container::write_container(output_path, header, sections)
  608. }
  609. pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
  610. info!("Loading Germline");
  611. let annotations = Annotations::default();
  612. let clairs_germline = ClairS::initialize(&id, &config.clone())?.germline(&annotations)?;
  613. info!("Annotation with Cosmic and GnomAD.");
  614. let ext_annot = ExternalAnnotation::init(&id, &config)?;
  615. ext_annot.annotate(&clairs_germline.variants, &annotations)?;
  616. let mut variants = Variants::default();
  617. variants.merge(clairs_germline, &annotations);
  618. info!("-- {}", variants.data.len());
  619. // let u = VariantsStats::new(&mut variants, &id, &config)?;
  620. // info!("++ {}", u.n);
  621. Ok(())
  622. }
  623. /// Holds statistical data for somatic variant pipeline processing,
  624. /// including summary counts and input categorization.
  625. #[derive(Debug, Default, Clone, Serialize, Deserialize)]
  626. pub struct SomaticPipeStats {
  627. /// Summary of input variant collections grouped by sample type.
  628. pub input: InputStats,
  629. /// Number of variants removed because they were classified as germline
  630. /// by at least one caller, or detected in the normal-only (solo constitutional)
  631. /// callset (including variants classified as both).
  632. pub n_constit_germline: usize,
  633. /// Number of variants removed due to low depth in the constitutional (normal) BAM
  634. /// (i.e. annotated with [`Annotation::LowConstitDepth`]).
  635. pub n_low_constit: usize,
  636. /// Number of variants removed due to excessive alternate-allele support in the
  637. /// constitutional (normal) BAM (i.e. annotated with [`Annotation::HighConstitAlt`]).
  638. pub n_high_alt_constit: usize,
  639. /// Number of variants removed because they are present in gnomAD and also show
  640. /// alternate-allele support in the constitutional (normal) BAM, according to
  641. /// [`is_gnomad_and_constit_alt`].
  642. pub n_high_alt_constit_gnomad: usize,
  643. /// Number of variants removed due to low sequence complexity (Shannon entropy),
  644. /// i.e. annotated with [`Annotation::LowEntropy`].
  645. pub n_low_entropies: usize,
  646. }
  647. impl SomaticPipeStats {
  648. /// Initializes a `SomaticPipeStats` object with populated `InputStats` based on the
  649. /// provided `VariantCollection`s.
  650. pub fn init(collections: &[VariantCollection]) -> Self {
  651. Self {
  652. input: InputStats::from_collections(collections),
  653. ..Default::default()
  654. }
  655. }
  656. /// Generates a tumor-vs-germline annotation matrix and writes it to a JSON file.
  657. ///
  658. /// This method analyzes co-occurrence patterns between tumor and germline/constit
  659. /// variant callers by iterating through annotation statistics from `AnnotationsStats`.
  660. /// It builds a matrix where each row corresponds to a **tumor caller** and each column
  661. /// corresponds to a **germline or constitutional caller**. Each cell contains the number
  662. /// of variant calls that were annotated by both the tumor and the germline/constit caller.
  663. ///
  664. /// In addition to writing a structured JSON file to the specified path, the function also
  665. /// prints a tab-separated human-readable matrix to stdout for convenience.
  666. ///
  667. /// # Parameters
  668. ///
  669. /// - `stats`: A reference to [`AnnotationsStats`] containing categorical annotations.
  670. /// This object holds a frequency map where keys are combinations of `Annotation`
  671. /// values (serialized as strings) and values are occurrence counts.
  672. /// - `json_path`: The path where the resulting JSON output file should be written.
  673. ///
  674. /// # Output Formats
  675. ///
  676. /// ## JSON Output (`json_path`)
  677. ///
  678. /// The output JSON is an **array of tumor caller records**, each containing:
  679. /// - `caller_name`: The name of the tumor caller (as a string).
  680. /// - `germline`: An array of JSON objects, each with one key-value pair,
  681. /// where the key is a germline/constit caller (or combination) and the value is the count.
  682. ///
  683. /// ### Example
  684. /// ```json
  685. /// [
  686. /// {
  687. /// "caller_name": "DeepVariant SoloTumor",
  688. /// "germline": {
  689. /// "ClairS Germline": 99978,
  690. /// "ClairS Germline + DeepVariant SoloConstit": 4710570
  691. /// }
  692. /// },
  693. /// {
  694. /// "caller_name": "ClairS Somatic",
  695. /// "germline": {
  696. /// "ClairS Germline": 944,
  697. /// "ClairS Germline + DeepVariant SoloConstit": 362
  698. /// }
  699. /// }
  700. /// ]
  701. /// ```
  702. ///
  703. /// ## Console Output (TSV)
  704. ///
  705. /// A tab-separated matrix is printed to stdout. Each row begins with a tumor caller,
  706. /// followed by columns showing germline/constit caller combinations and their counts.
  707. ///
  708. /// # Notes
  709. /// - Tumor callers are collected from `self.input.somatic` and `self.input.solo_tumor`.
  710. /// - Germline/constit callers are collected from `self.input.germline` and `self.input.solo_constit`.
  711. /// - Keys in the original `AnnotationsStats` map are split using `" + "` and parsed into `Annotation` enums.
  712. /// - Germline keys are sorted and joined to form canonical column labels.
  713. /// - Tumor and germline annotations are matched by exact `Annotation` equality.
  714. ///
  715. /// # Errors
  716. /// Returns an error if:
  717. /// - Any annotation key in `AnnotationsStats` fails to parse into a valid `Annotation`.
  718. /// - The JSON output file cannot be created or written to.
  719. ///
  720. /// # Dependencies
  721. /// Requires that `Annotation` implements `FromStr`, `ToString`, `PartialEq`, and `Clone`.
  722. ///
  723. /// # Example Usage
  724. /// ```rust
  725. /// let mut stats = SomaticPipeStats::init(&collections);
  726. /// stats.annot_init(&annotation_stats, "output/matrix.json")?;
  727. /// ```
  728. pub fn annot_init(&self, stats: &AnnotationsStats, json_path: &str) -> anyhow::Result<()> {
  729. #[derive(Serialize)]
  730. struct TumorRow {
  731. caller_name: String,
  732. germline: BTreeMap<String, u64>,
  733. }
  734. // Parse stats keys into Vec<Annotation> once
  735. let parsed: Vec<(Vec<Annotation>, u64)> = stats
  736. .categorical
  737. .iter()
  738. .map(|e| {
  739. let anns = e
  740. .key()
  741. .split(" + ")
  742. .map(|k| k.parse::<Annotation>())
  743. .collect::<anyhow::Result<Vec<_>>>()
  744. .map_err(|err| {
  745. anyhow::anyhow!("Error while parsing AnnotationsStats key: {err}")
  746. })?;
  747. Ok((anns, *e.value()))
  748. })
  749. .collect::<anyhow::Result<_>>()?;
  750. // Collect caller labels
  751. let tumor_callers: Vec<Annotation> = self
  752. .input
  753. .somatic
  754. .iter()
  755. .map(|(a, _)| a.clone())
  756. .chain(self.input.solo_tumor.iter().map(|(a, _)| a.clone()))
  757. .collect();
  758. let germ_callers: Vec<Annotation> = self
  759. .input
  760. .germline
  761. .iter()
  762. .map(|(a, _)| a.clone())
  763. .chain(self.input.solo_constit.iter().map(|(a, _)| a.clone()))
  764. .collect();
  765. // Matrix: tumor -> (germline label -> count), deterministic by construction
  766. let mut matrix: BTreeMap<String, BTreeMap<String, u64>> = BTreeMap::new();
  767. for (anns, v) in parsed.iter() {
  768. // only rows that include any germline/constit label
  769. if !anns.iter().any(|a| {
  770. matches!(
  771. a,
  772. Annotation::Callers(_, Sample::SoloConstit)
  773. | Annotation::Callers(_, Sample::Germline)
  774. )
  775. }) {
  776. continue;
  777. }
  778. // Which tumor callers are present in this set?
  779. let present_tumors: Vec<String> = tumor_callers
  780. .iter()
  781. .filter(|t| anns.contains(t))
  782. .map(|t| t.to_string())
  783. .collect();
  784. if present_tumors.is_empty() {
  785. continue;
  786. }
  787. // Canonical germline key: sorted labels joined by " + "
  788. let mut present_germs: Vec<String> = germ_callers
  789. .iter()
  790. .filter(|g| anns.contains(g))
  791. .map(|g| g.to_string())
  792. .collect();
  793. present_germs.sort();
  794. let germ_key = present_germs.join(" + ");
  795. for tumor in present_tumors {
  796. *matrix
  797. .entry(tumor)
  798. .or_default()
  799. .entry(germ_key.clone())
  800. .or_default() += *v;
  801. }
  802. }
  803. // Collect all germline columns (deterministic)
  804. let mut germ_cols: Vec<String> = matrix
  805. .values()
  806. .flat_map(|row| row.keys().cloned())
  807. .collect();
  808. germ_cols.sort();
  809. germ_cols.dedup();
  810. // TSV output (deterministic ordering)
  811. let mut lines: Vec<String> = Vec::with_capacity(matrix.len());
  812. for (tumor, row) in matrix.iter() {
  813. let line = format!(
  814. "{tumor}\t{}",
  815. germ_cols
  816. .iter()
  817. .map(|g| format!("{g}: {}", row.get(g).copied().unwrap_or(0)))
  818. .collect::<Vec<_>>()
  819. .join("\t")
  820. );
  821. lines.push(line);
  822. }
  823. println!("{}", lines.join("\n"));
  824. // JSON output: list of rows with all columns filled (0 if absent)
  825. let json_rows: Vec<TumorRow> = matrix
  826. .iter()
  827. .map(|(tumor, row)| {
  828. let mut full = BTreeMap::new();
  829. for g in germ_cols.iter() {
  830. full.insert(g.clone(), row.get(g).copied().unwrap_or(0));
  831. }
  832. TumorRow {
  833. caller_name: tumor.clone(),
  834. germline: full,
  835. }
  836. })
  837. .collect();
  838. let file = File::create(json_path)?;
  839. serde_json::to_writer_pretty(file, &json_rows)?;
  840. Ok(())
  841. }
  842. }
  843. /// Aggregated statistics on variant collections, grouped by sample type.
  844. ///
  845. /// The `InputStats` struct stores the number of variants observed for each
  846. /// caller, categorized by biological sample type:
  847. ///
  848. /// - `solo_tumor`: Variants from tumor-only samples.
  849. /// - `solo_constit`: Variants from normal-only (constitutional) samples.
  850. /// - `germline`: Variants classified as germline (shared between normal and tumor).
  851. /// - `somatic`: Variants classified as somatic (tumor-specific).
  852. ///
  853. /// Each vector contains tuples of the form `(Annotation, usize)`, where:
  854. /// - `Annotation` identifies the caller and sample type.
  855. /// - `usize` represents the number of variants for that annotation.
  856. #[derive(Debug, Default, Clone, Serialize, Deserialize)]
  857. pub struct InputStats {
  858. /// Variants from tumor-only samples.
  859. pub solo_tumor: Vec<(Annotation, usize)>,
  860. /// Variants from normal-only (constitutional) samples.
  861. pub solo_constit: Vec<(Annotation, usize)>,
  862. /// Germline variants (present in both normal and tumor).
  863. pub germline: Vec<(Annotation, usize)>,
  864. /// Somatic variants (specific to tumor).
  865. pub somatic: Vec<(Annotation, usize)>,
  866. }
  867. impl InputStats {
  868. /// Constructs an `InputStats` instance by aggregating variant counts
  869. /// from a list of [`VariantCollection`]s.
  870. ///
  871. /// Each `VariantCollection` is inspected to determine its sample type,
  872. /// as indicated by the `Annotation::Callers(_, Sample)` variant. The
  873. /// number of variants (`collection.variants.len()`) is recorded in the
  874. /// appropriate field of `InputStats`, grouped by `Sample`.
  875. ///
  876. /// # Parameters
  877. ///
  878. /// - `collections`: A slice of [`VariantCollection`] objects containing
  879. /// variants annotated with a caller and sample type.
  880. ///
  881. /// # Returns
  882. ///
  883. /// A populated `InputStats` instance with counts per (caller, sample type).
  884. ///
  885. /// # Examples
  886. ///
  887. /// ```
  888. /// let collections: Vec<VariantCollection> = load_collections();
  889. /// let stats = InputStats::from_collections(&collections);
  890. /// println!("Somatic calls: {:?}", stats.somatic);
  891. /// ```
  892. pub fn from_collections(collections: &[VariantCollection]) -> Self {
  893. let mut stats = Self::default();
  894. for collection in collections.iter() {
  895. match collection.caller {
  896. Annotation::Callers(_, Sample::SoloTumor) => stats
  897. .solo_tumor
  898. .push((collection.caller.clone(), collection.variants.len())),
  899. Annotation::Callers(_, Sample::SoloConstit) => stats
  900. .solo_constit
  901. .push((collection.caller.clone(), collection.variants.len())),
  902. Annotation::Callers(_, Sample::Germline) => stats
  903. .germline
  904. .push((collection.caller.clone(), collection.variants.len())),
  905. Annotation::Callers(_, Sample::Somatic) => stats
  906. .somatic
  907. .push((collection.caller.clone(), collection.variants.len())),
  908. _ => (),
  909. };
  910. }
  911. stats
  912. }
  913. }
  914. #[cfg(test)]
  915. mod tests {
  916. use rayon::ThreadPoolBuilder;
  917. use super::*;
  918. use crate::helpers::test_init;
  919. #[test]
  920. fn somatic_pipe_run() -> anyhow::Result<()> {
  921. test_init();
  922. let config = Config::default();
  923. let pool = ThreadPoolBuilder::new()
  924. .num_threads(config.threads.into())
  925. .build()?;
  926. pool.install(|| SomaticPipe::initialize("CHAHA", &config)?.run())?;
  927. Ok(())
  928. }
  929. }