| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845 |
- use crate::{
- annotation::is_gnomad_and_constit_alt, collection::ShouldRun, create_should_run_normal_tumoral, init_solo_callers_normal_tumoral, io::bed::read_bed, positions::GenomeRange, scan::scan::SomaticScan, variant::{
- variant::{run_if_required, ShouldRunBox},
- variants_stats::somatic_depth_quality_ranges,
- }
- };
- use itertools::Itertools;
- use log::info;
- use rayon::slice::ParallelSliceMut;
- use std::{
- collections::HashMap,
- fs::{self, File},
- io::Write,
- path::Path,
- };
- use crate::{
- annotation::{Annotation, Annotations, AnnotationsStats, Sample},
- callers::{
- clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV,
- savana::Savana, severus::Severus,
- },
- collection::{Initialize, InitializeSolo},
- config::Config,
- create_should_run, init_somatic_callers,
- runners::Run,
- variant::{
- variant::{load_variants, CallerBox},
- variant_collection::{ExternalAnnotation, VariantCollection, Variants},
- variants_stats::VariantsStats,
- },
- };
- /// Runs the full somatic variant calling pipeline for a single sample (`id`).
- ///
- /// This function orchestrates the entire somatic variant discovery process,
- /// starting from raw variant caller outputs (`PASSED VCF`) and applying multiple filtering
- /// and annotation steps to produce high-confidence somatic variants.
- ///
- /// This function orchestrates the end-to-end somatic variant discovery process:
- /// - Executes and verifies upstream components if necessary
- /// - Loads variants from multiple callers (tumor and normal samples)
- /// - Applies several annotation and filtering steps (e.g. depth, population frequency, entropy)
- /// - Tracks filtering statistics at each step
- /// - Outputs high-confidence somatic variants in both `.json.gz` and `.bit` formats
- ///
- /// ## Output Overview
- /// The final output includes:
- /// - `{tumoral_dir}/{id}_somatic_variants.json.gz`: annotated somatic variants
- /// - `{tumoral_dir}/{id}_somatic_variants.bit`: compact binary variant representation
- /// - `{stats_dir}/`: multiple intermediate JSON files with annotations and statistics
- ///
- /// ## Steps
- ///
- /// This pipeline performs the following high-level steps:
- ///
- /// ### 1. Output Existence Check
- /// If the final JSON result already exists and [`Config::somatic_pipe_force`] is not set,
- /// the pipeline aborts early to avoid overwriting results.
- ///
- /// ### 2. Initialization
- /// - Prepares statistics and output directories.
- /// - Initializes variant annotations.
- ///
- /// ### 3. Pre-requisite Tool Execution
- /// Runs any required upstream components (e.g., alignment, basecalling, variant callers) if
- /// their outputs are missing, using the [`run_if_required`] logic.
- ///
- /// ### 4. Load Variant Collections
- /// - Initializes the configured somatic variant callers and loads their output variants from PASSED VCF.
- /// - Also loads germline variants (from [`ClairS::germline`]) for comparative germline filtering.
- ///
- /// ### 5. Statistics Initialization
- /// - Initializes a [`SomaticPipeStats`] object to track the number of variants at each step.
- /// - Captures initial variant counts before filtering.
- /// - Aggregates variant counts from all sources and stores initial annotations for quality control and
- /// comparison before filtering.
- ///
- /// ### 6. Filter: Germline & Solo Constitutional Variants
- /// - Removes variants labeled as either Germline or Solo Constitutional, assuming they are unlikely to be somatic.
- /// - Records count of removed variants in [`SomaticPipeStats::n_constit_germline`].
- ///
- /// ### 7. Annotation: BAM Read Depth and Alt Allele Counts
- /// - Uses the constitutional BAM file to annotate each variant with read depth and the number of alternate reads observed.
- /// - Flags variants with low depth or excessive alt reads for filtering as specified in [`Config`].
- ///
- /// ### 8. Filtering: Low Depth / High Alt Alleles
- /// - Removes variants with low coverage in the constitutional sample, or excessive alt allele support (suggestive of germline origin).
- /// - Updates stats:
- /// - [`SomaticPipeStats::n_low_constit`]
- /// - [`SomaticPipeStats::n_high_alt_constit`]
- ///
- /// ### 9. Annotation: Sequence Entropy
- /// Adds Shannon entropy annotation based on the reference sequence context
- /// around each variant (cf. [`Config::entropy_seq_len`]) to flag low-complexity regions (often repetitive).
- ///
- /// ### 10. Annotation: External Databases (COSMIC, GnomAD):
- /// - Uses external resources to annotate variants with:
- /// - COSMIC hits (somatic mutation database)
- /// - GnomAD allele frequencies
- ///
- /// ### 11. Filtering: GnomAD + Alt Support in Constitutional Sample
- /// - Removes variants that are both present in GnomAD **and** show
- /// alternate allele support in the constitutional BAM.
- /// These are highly likely to be non-somatic germline polymorphisms.
- /// - Updates [`SomaticPipeStats::n_high_alt_constit_gnomad`] stat.
- ///
- /// ### 12. Filtering: Low Shannon Entropy:
- /// - Removes variants from low-complexity regions with entropy below the configured threshold
- /// (cf. [`Config::min_shannon_entropy`]).
- /// - Updates [`SomaticPipeStats::n_low_entropies`].
- ///
- /// ### 13. Annotation: VEP (Variant Effect Predictor)
- /// Adds transcript-level annotations from Ensembl VEP, providing functional consequences,
- /// impacted genes, and regulatory features.
- ///
- /// ### 14. Merging
- /// Merges variant collections into a unified [`Variants`] structure,
- /// preserving annotations and applying deduplication logic.
- ///
- /// ### 15. Final Statistics and Saving
- /// - Saves final annotation stats, VEP summaries, and variant-level statistics.
- /// - Exports the final somatic variant set to both compressed JSON and `.bit` formats.
- ///
- /// # Returns
- /// - `Ok(())` if all steps completed successfully.
- /// - `Err` if any tool fails, if file I/O fails, or if logical conditions are violated (e.g., pre-existing output).
- ///
- /// # Errors
- /// - Returns early if the output file already exists and [`Config::somatic_pipe_force`] is `false`.
- /// - Wraps all component-specific errors using `anyhow::Result` with context.
- ///
- /// # Side Effects
- /// - Runs external tools conditionally (e.g., [`ClairS`], [`DeepSomatic`]).
- /// - Creates intermediate directories and annotation JSONs for debugging/QC.
- /// - May consume significant compute time depending on the number of callers, annotations, and variants.
- ///
- /// # TODOs
- /// - Support compressed intermediate files (`// TODO: GZ !!!`)
- /// - Improve filtering metrics reporting (currently not all filtered variants are tracked in final stats).
- ///
- /// # Example Output Files
- /// - `tumoral_dir/sample123_somatic_variants.json.gz`
- /// - `tumoral_dir/sample123_somatic_variants.bit`
- /// - `stats_dir/sample123_annotations_*.json` (intermediate annotation snapshots)
- ///
- /// # See Also
- /// - [`Annotations`] – core structure for managing per-variant metadata
- /// - [`Variants`] – the merged final variant structure
- /// - [`SomaticPipeStats`] – used for tracking variant counts throughout filtering
- ///
- pub struct SomaticPipe {
- /// Unique identifier for the sample.
- pub id: String,
- /// Configuration parameters for the pipeline.
- pub config: Config,
- }
- impl Initialize for SomaticPipe {
- /// Initializes a new `Somatic` instance with default annotations.
- fn initialize(id: &str, config: crate::config::Config) -> anyhow::Result<Self> {
- let id = id.to_string();
- Ok(Self { id, config })
- }
- }
- impl ShouldRun for SomaticPipe {
- fn should_run(&self) -> bool {
- let tumoral_dir = self.config.tumoral_dir(&self.id);
- let path_str = format!("{}/{}_somatic_variants.bit", tumoral_dir, self.id);
- let path = Path::new(&path_str);
- !path.exists()
- }
- }
- impl Run for SomaticPipe {
- /// Executes the full somatic variant analysis pipeline.
- fn run(&mut self) -> anyhow::Result<()> {
- let config = self.config.clone();
- let id = self.id.clone();
- info!("Running somatic pipe for {id}.");
- // Define output paths for the final somatic variants
- let result_json = format!("{}/{id}_somatic_variants.json.gz", config.tumoral_dir(&id));
- let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id));
- let result_vcf = format!("{}/{id}_somatic_variants.vcf.gz", config.tumoral_dir(&id));
- if Path::new(&result_bit).exists() && !config.somatic_pipe_force {
- return Err(anyhow::anyhow!(
- "Somatic Pipe output already exists for {id}."
- ));
- }
- let mut annotations = Annotations::default();
- // Create stats directory if it doesn't exist
- let stats_dir = config.somatic_pipe_stats(&id);
- if !Path::new(&stats_dir).exists() {
- fs::create_dir(&stats_dir)?;
- }
- // Initialize and run any pre-required input if necessary
- info!("Initialization prerequired pipe inputs...");
- let mut to_run_if_req = create_should_run!(
- &id,
- &config,
- SomaticScan,
- ClairS,
- NanomonSV,
- Severus,
- Savana,
- DeepSomatic
- );
- to_run_if_req.extend(create_should_run_normal_tumoral!(&id, &config, DeepVariant));
- info!("Running prerequired pipe components.");
- run_if_required(&mut to_run_if_req).map_err(|e| {
- anyhow::anyhow!("Failed to run a prerequired component of somatic pipe for {id}.\n{e}")
- })?;
- // Initialize variant callers
- let mut callers = init_somatic_callers!(
- &id,
- &config,
- ClairS,
- NanomonSV,
- Severus,
- Savana,
- DeepSomatic
- );
- callers.extend(init_solo_callers_normal_tumoral!(&id, &config, DeepVariant,));
- // Load variants from each caller
- info!("Loading variants.");
- let mut variants_collections = load_variants(&mut callers, &annotations)
- .map_err(|e| anyhow::anyhow!("Error while loading variants\n{e}"))?;
- // Load germline variants using ClairS
- info!("Loading germline variants.");
- let clairs_germline =
- ClairS::initialize(&id, self.config.clone())?.germline(&annotations)?;
- variants_collections.push(clairs_germline);
- // Initialize statistics
- let mut somatic_stats = SomaticPipeStats::init(&variants_collections);
- info!(
- "Variants collections loaded from {} vcf (total of {} variants loaded)",
- variants_collections.len(),
- variants_collections
- .iter()
- .map(|v| v.variants.len())
- .sum::<usize>()
- );
- // Initial annotation stats (caller annotations only)
- let caller_cat_anns = |v: &Annotation| matches!(v, Annotation::Callers(_, _));
- let annot_init = annotations.callers_stat(Some(Box::new(caller_cat_anns)));
- somatic_stats.annot_init(
- &annot_init,
- &format!("{stats_dir}/{id}_germline_filter.json"),
- )?;
- annot_init.save_to_json(&format!("{stats_dir}/{id}_annotations_01.json"))?;
- // Filter out germline and solo constitutional variants
- info!(
- "Keeping somatic variants (variants neither in solo {} nor in germline).",
- config.normal_name
- );
- somatic_stats.n_constit_germline =
- annotations.retain_variants(&mut variants_collections, |anns| {
- !anns.iter().any(|ann| {
- matches!(
- ann,
- Annotation::Callers(_, Sample::Germline)
- | Annotation::Callers(_, Sample::SoloConstit)
- )
- })
- });
- annotations
- .callers_stat(Some(Box::new(caller_cat_anns)))
- .save_to_json(&format!(
- "{stats_dir}/{id}_annotations_02_post_germline.json"
- ))?;
- // MASK mapq
- let (mut high_depth_ranges, mut low_quality_ranges) =
- somatic_depth_quality_ranges(&id, &config)?;
- high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
- low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
- info!("ranges: {}", low_quality_ranges.len());
- variants_collections.iter_mut().for_each(|e | {
- let _ = e.annotate_with_ranges(&low_quality_ranges, &annotations, Annotation::LowMAPQ);
- });
- let n_masked_lowqual = annotations.retain_variants(&mut variants_collections, |anns| {
- anns.contains(&Annotation::LowMAPQ)
- });
- info!("N low mapq filtered: {}", n_masked_lowqual);
- // Annotate with depth and number of alternate reads from constitutional BAM
- info!("Reading Constit BAM file for depth and pileup annotation.");
- variants_collections.iter().try_for_each(|c| {
- c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150)
- })?;
- annotations.somatic_constit_boundaries(
- self.config.somatic_max_alt_constit,
- self.config.somatic_min_constit_depth,
- );
- annotations
- .callers_stat(Some(Box::new(|v| {
- matches!(
- v,
- Annotation::Callers(_, _)
- | Annotation::ConstitAlt(_)
- | Annotation::ConstitDepth(_)
- | Annotation::HighConstitAlt
- | Annotation::LowConstitDepth
- )
- })))
- .save_to_json(&format!("{stats_dir}/{id}_annotations_03_bam.json"))?;
- // Filter based on low constitutional depth
- info!(
- "Removing variants when depth in constit bam < {}.",
- self.config.somatic_min_constit_depth
- );
- somatic_stats.n_low_constit = annotations
- .retain_variants(&mut variants_collections, |anns| {
- !anns.contains(&Annotation::LowConstitDepth)
- });
- info!(
- "{} variants removed when depth in constit bam < {}.",
- somatic_stats.n_low_constit, self.config.somatic_min_constit_depth
- );
- // Filter: remove HighConstitAlt
- info!(
- "Removing variants with SNP/indel inside the constit alignements > {}",
- self.config.somatic_max_alt_constit
- );
- somatic_stats.n_high_alt_constit = annotations
- .retain_variants(&mut variants_collections, |anns| {
- !anns.contains(&Annotation::HighConstitAlt)
- });
- info!(
- "{} variants removed with SNP/indel inside the constit alignements > {}",
- somatic_stats.n_high_alt_constit, self.config.somatic_max_alt_constit
- );
- annotations
- .callers_stat(Some(Box::new(|v| {
- matches!(
- v,
- Annotation::Callers(_, _)
- | Annotation::ConstitAlt(_)
- | Annotation::ConstitDepth(_)
- )
- })))
- .save_to_json(&format!("{stats_dir}/{id}_annotations_04_bam_filter.json"))?;
- // Annotate variants with sequence entropy
- info!(
- "Entropy annotation from {} sequences.",
- self.config.reference
- );
- variants_collections.iter().for_each(|c| {
- c.annotate_with_sequence_context(
- &annotations,
- &self.config.reference,
- self.config.entropy_seq_len,
- self.config.somatic_pipe_threads,
- );
- });
- // Annotate with external databases like COSMIC and GnomAD
- info!("Annotation with external databases like COSMIC and GnomAD.");
- variants_collections
- .iter()
- .try_for_each(|c| -> anyhow::Result<()> {
- let ext_annot = ExternalAnnotation::init()?;
- ext_annot.annotate(&c.variants, &annotations)?;
- Ok(())
- })?;
- annotations
- .callers_stat(Some(Box::new(|v| {
- matches!(
- v,
- Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
- )
- })))
- .save_to_json(&format!("{stats_dir}/{id}_annotations_05_gnomad.json"))?;
- // Filter: Remove variants present in GnomAD and have alt reads in constitutional sample
- info!("Filtering out variants in GnomAD and in constit bam at low AF.");
- somatic_stats.n_high_alt_constit_gnomad = annotations
- .retain_variants(&mut variants_collections, |anns| {
- !is_gnomad_and_constit_alt(anns)
- });
- info!(
- "{} variants filtered, with constit alt <= max contig alt ({}) and in GnomAD.",
- somatic_stats.n_high_alt_constit_gnomad, self.config.somatic_max_alt_constit
- );
- // TODO: These stats doesn't capture filter metrics !!!
- annotations
- .callers_stat(Some(Box::new(|v| {
- matches!(
- v,
- Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_)
- )
- })))
- .save_to_json(&format!(
- "{stats_dir}/{id}_annotations_06_gnomad_filter.json"
- ))?;
- // Filter low entropy variants
- annotations.low_shannon_entropy(self.config.min_shannon_entropy);
- info!(
- "Filtering out variants with low entropies ({})",
- config.min_shannon_entropy
- );
- annotations
- .callers_stat(Some(Box::new(|v| {
- matches!(v, Annotation::Callers(_, _) | Annotation::LowEntropy)
- })))
- .save_to_json(&format!("{stats_dir}/{id}_annotations_07_entropy.json"))?;
- somatic_stats.n_low_entropies = annotations
- .retain_variants(&mut variants_collections, |anns| {
- !anns.contains(&Annotation::LowEntropy)
- });
- annotations
- .callers_stat(Some(Box::new(|v| matches!(v, Annotation::Callers(_, _)))))
- .save_to_json(&format!(
- "{stats_dir}/{id}_annotations_08_entropy_filter.json"
- ))?;
- // Final VEP annotation
- info!("Annotation with VEP.");
- variants_collections
- .iter()
- .try_for_each(|c| -> anyhow::Result<()> {
- let ext_annot = ExternalAnnotation::init()?;
- ext_annot.annotate_vep(&c.variants, &annotations)?;
- Ok(())
- })?;
- annotations
- .callers_stat(Some(Box::new(caller_cat_anns)))
- .save_to_json(&format!("{stats_dir}/{id}_annotations_09_vep.json"))?;
- annotations.vep_stats()?;
- // Merge all variants into a final collection
- let mut variants = variants_collections.into_iter().fold(
- Variants::default(),
- |mut acc, variants_collection| {
- acc.merge(variants_collection, &annotations);
- acc
- },
- );
- let vep_stats = annotations.vep_stats()?;
- vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?;
- VariantsStats::new(&mut variants, &id, &config, &high_depth_ranges)?
- .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?;
- info!("Final unique variants: {}", variants.data.len());
- // Ensure sorted (should already be sorted)
- variants.sort();
- let vntrs: Vec<GenomeRange> = read_bed("/data/ref/hs1/vntrs_chm13.bed")?.iter().map(|r| r.range.clone()).collect();
- variants.annotate_with_ranges(&vntrs, Some(Annotation::VNTR), 0, Vec::new());
- let repeat_masker: Vec<GenomeRange> = read_bed("/data/ref/hs1/chm13v2.0_RepeatMasker_4.1.2p1.2022Apr14.bed")?.iter().map(|r| r.range.clone()).collect();
- variants.annotate_with_ranges(&repeat_masker, Some(Annotation::RepeatMasker), 0, Vec::new());
- variants.save_to_json(&result_json)?;
- variants.save_to_file(&result_bit)?;
- variants.write_vcf(&result_vcf, &config.dict_file, config.somatic_pipe_force)?;
- Ok(())
- }
- }
- pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
- info!("Loading Germline");
- let annotations = Annotations::default();
- let clairs_germline = ClairS::initialize(&id, config.clone())?.germline(&annotations)?;
- info!("Annotation with Cosmic and GnomAD.");
- let ext_annot = ExternalAnnotation::init()?;
- ext_annot.annotate(&clairs_germline.variants, &annotations)?;
- let mut variants = Variants::default();
- variants.merge(clairs_germline, &annotations);
- info!("-- {}", variants.data.len());
- // let u = VariantsStats::new(&mut variants, &id, &config)?;
- // info!("++ {}", u.n);
- Ok(())
- }
- /// Holds statistical data for somatic variant pipeline processing,
- /// including summary counts and input categorization.
- #[derive(Debug, Default, Clone)]
- pub struct SomaticPipeStats {
- /// Summary of input variant collections grouped by sample type.
- pub input: InputStats,
- /// Number of variants labeled as both constitutional and germline.
- pub n_constit_germline: usize,
- /// Number of variants in constitutional samples with low allele frequency.
- pub n_low_constit: usize,
- /// Number of variants in constitutional samples with high alternative allele count.
- pub n_high_alt_constit: usize,
- /// Number of high-alt constitutional variants that are also found in gnomAD.
- pub n_high_alt_constit_gnomad: usize,
- /// Number of variants filtered due to low entropy (indicative of low complexity regions).
- pub n_low_entropies: usize,
- }
- impl SomaticPipeStats {
- /// Initializes a `SomaticPipeStats` object with populated `InputStats` based on the
- /// provided `VariantCollection`s.
- pub fn init(collections: &[VariantCollection]) -> Self {
- Self {
- input: InputStats::from_collections(collections),
- ..Default::default()
- }
- }
- /// Generates a tumor-vs-germline annotation matrix and writes it to a JSON file.
- ///
- /// This method analyzes co-occurrence patterns between tumor and germline/constit
- /// variant callers by iterating through annotation statistics from `AnnotationsStats`.
- /// It builds a matrix where each row corresponds to a **tumor caller** and each column
- /// corresponds to a **germline or constitutional caller**. Each cell contains the number
- /// of variant calls that were annotated by both the tumor and the germline/constit caller.
- ///
- /// In addition to writing a structured JSON file to the specified path, the function also
- /// prints a tab-separated human-readable matrix to stdout for convenience.
- ///
- /// # Parameters
- ///
- /// - `stats`: A reference to [`AnnotationsStats`] containing categorical annotations.
- /// This object holds a frequency map where keys are combinations of `Annotation`
- /// values (serialized as strings) and values are occurrence counts.
- /// - `json_path`: The path where the resulting JSON output file should be written.
- ///
- /// # Output Formats
- ///
- /// ## JSON Output (`json_path`)
- ///
- /// The output JSON is an **array of tumor caller records**, each containing:
- /// - `caller_name`: The name of the tumor caller (as a string).
- /// - `germline`: An array of JSON objects, each with one key-value pair,
- /// where the key is a germline/constit caller (or combination) and the value is the count.
- ///
- /// ### Example
- /// ```json
- /// [
- /// {
- /// "caller_name": "DeepVariant SoloTumor",
- /// "germline": [
- /// {"ClairS Germline": 99978},
- /// {"ClairS Germline + DeepVariant SoloConstit": 4710570}
- /// ]
- /// },
- /// {
- /// "caller_name": "ClairS Somatic",
- /// "germline": [
- /// {"ClairS Germline": 944},
- /// {"ClairS Germline + DeepVariant SoloConstit": 362}
- /// ]
- /// }
- /// ]
- /// ```
- ///
- /// ## Console Output (TSV)
- ///
- /// A tab-separated matrix is printed to stdout. Each row begins with a tumor caller,
- /// followed by columns showing germline/constit caller combinations and their counts.
- ///
- /// # Notes
- /// - Tumor callers are collected from `self.input.somatic` and `self.input.solo_tumor`.
- /// - Germline/constit callers are collected from `self.input.germline` and `self.input.solo_constit`.
- /// - Keys in the original `AnnotationsStats` map are split using `" + "` and parsed into `Annotation` enums.
- /// - Germline keys are sorted and joined to form canonical column labels.
- /// - Tumor and germline annotations are matched by exact `Annotation` equality.
- ///
- /// # Errors
- /// Returns an error if:
- /// - Any annotation key in `AnnotationsStats` fails to parse into a valid `Annotation`.
- /// - The JSON output file cannot be created or written to.
- ///
- /// # Dependencies
- /// Requires that `Annotation` implements `FromStr`, `ToString`, `PartialEq`, and `Clone`.
- ///
- /// # Example Usage
- /// ```rust
- /// let mut stats = SomaticPipeStats::init(&collections);
- /// stats.annot_init(&annotation_stats, "output/matrix.json")?;
- /// ```
- pub fn annot_init(&self, stats: &AnnotationsStats, json_path: &str) -> anyhow::Result<()> {
- // Parse annotations from stats
- let stats: Vec<(Vec<Annotation>, u64)> = stats
- .categorical
- .iter()
- .map(|e| {
- let anns = e
- .key()
- .split(" + ")
- .map(|k| k.parse())
- .collect::<anyhow::Result<Vec<Annotation>>>()
- .map_err(|err| {
- anyhow::anyhow!("Error while splitting key in AnnotationsStats.\n{err}")
- })?;
- Ok((anns, *e.value()))
- })
- .collect::<anyhow::Result<Vec<(Vec<Annotation>, u64)>>>()?;
- // Collect tumor and germline callers from input stats
- let callers_somatic_solo_tumor = [
- self.input
- .somatic
- .iter()
- .map(|(caller, _)| caller.clone())
- .collect::<Vec<Annotation>>(),
- self.input
- .solo_tumor
- .iter()
- .map(|(caller, _)| caller.clone())
- .collect(),
- ]
- .concat();
- let callers_germline_solo_constit = [
- self.input
- .germline
- .iter()
- .map(|(caller, _)| caller.clone())
- .collect::<Vec<Annotation>>(),
- self.input
- .solo_constit
- .iter()
- .map(|(caller, _)| caller.clone())
- .collect(),
- ]
- .concat();
- // Build a matrix of tumor vs germline hits
- let mut with_germline: HashMap<String, HashMap<String, u64>> = HashMap::new();
- stats.iter().for_each(|(anns, v)| {
- // Only proceed if this annotation includes a germline/constit sample
- if anns.iter().any(|a| {
- matches!(
- a,
- Annotation::Callers(_, Sample::SoloConstit)
- | Annotation::Callers(_, Sample::Germline)
- )
- }) {
- // Find all tumor callers present in this annotation set
- let n_by_tumor: Vec<(String, u64)> = callers_somatic_solo_tumor
- .iter()
- .flat_map(|tumor| {
- if anns.contains(tumor) {
- vec![(tumor.to_string(), *v)]
- } else {
- vec![]
- }
- })
- .collect();
- // Build a normalized germline key
- let mut germline_caller: Vec<String> = callers_germline_solo_constit
- .iter()
- .flat_map(|germ| {
- if anns.contains(germ) {
- vec![germ.to_string()]
- } else {
- vec![]
- }
- })
- .collect();
- germline_caller.sort();
- let germline_caller = germline_caller.join(" + ");
- // Update matrix: tumor -> germline -> count
- n_by_tumor.iter().for_each(|(tumoral_caller, n)| {
- if let Some(row) = with_germline.get_mut(tumoral_caller) {
- if let Some(col) = row.get_mut(&germline_caller) {
- *col += *n;
- } else {
- row.insert(germline_caller.to_string(), *n);
- }
- } else {
- let mut row = HashMap::new();
- row.insert(germline_caller.to_string(), *n);
- with_germline.insert(tumoral_caller.to_string(), row);
- }
- });
- }
- });
- // Extract all unique germline caller labels
- let mut germlines_callers: Vec<String> = with_germline
- .iter()
- .flat_map(|(_, r)| {
- r.iter()
- .map(|(k, _)| k.to_string())
- .collect::<Vec<String>>()
- })
- .collect();
- germlines_callers.sort();
- germlines_callers.dedup();
- // Print a readable tab-separated matrix
- let mut json = Vec::new();
- let mut lines: Vec<String> = with_germline
- .iter()
- .map(|(tumor, row)| {
- json.push(format!(
- "{{\"caller_name\": \"{tumor}\", \"germline\": [{}] }}",
- germlines_callers
- .iter()
- .map(|g| {
- let v = row.get(g).unwrap_or(&0);
- format!("{{\"{g}\": {v}}}")
- })
- .join(", ")
- ));
- format!(
- "{tumor}\t{}",
- germlines_callers
- .iter()
- .map(|g| {
- let v = row.get(g).unwrap_or(&0);
- format!("{g}: {v}")
- })
- .join("\t")
- )
- })
- .collect();
- lines.sort();
- println!("{}", lines.join("\n"));
- // Write JSON to file
- let json = format!("[{}]", json.join(", "));
- let mut file = File::create(json_path)?;
- file.write_all(json.as_bytes())?;
- Ok(())
- }
- }
- /// Aggregated statistics on variant collections, grouped by sample type.
- ///
- /// The `InputStats` struct stores the number of variants observed for each
- /// caller, categorized by biological sample type:
- ///
- /// - `solo_tumor`: Variants from tumor-only samples.
- /// - `solo_constit`: Variants from normal-only (constitutional) samples.
- /// - `germline`: Variants classified as germline (shared between normal and tumor).
- /// - `somatic`: Variants classified as somatic (tumor-specific).
- ///
- /// Each vector contains tuples of the form `(Annotation, usize)`, where:
- /// - `Annotation` identifies the caller and sample type.
- /// - `usize` represents the number of variants for that annotation.
- #[derive(Debug, Default, Clone)]
- pub struct InputStats {
- /// Variants from tumor-only samples.
- pub solo_tumor: Vec<(Annotation, usize)>,
- /// Variants from normal-only (constitutional) samples.
- pub solo_constit: Vec<(Annotation, usize)>,
- /// Germline variants (present in both normal and tumor).
- pub germline: Vec<(Annotation, usize)>,
- /// Somatic variants (specific to tumor).
- pub somatic: Vec<(Annotation, usize)>,
- }
- impl InputStats {
- /// Constructs an `InputStats` instance by aggregating variant counts
- /// from a list of [`VariantCollection`]s.
- ///
- /// Each `VariantCollection` is inspected to determine its sample type,
- /// as indicated by the `Annotation::Callers(_, Sample)` variant. The
- /// number of variants (`collection.variants.len()`) is recorded in the
- /// appropriate field of `InputStats`, grouped by `Sample`.
- ///
- /// # Parameters
- ///
- /// - `collections`: A slice of [`VariantCollection`] objects containing
- /// variants annotated with a caller and sample type.
- ///
- /// # Returns
- ///
- /// A populated `InputStats` instance with counts per (caller, sample type).
- ///
- /// # Examples
- ///
- /// ```
- /// let collections: Vec<VariantCollection> = load_collections();
- /// let stats = InputStats::from_collections(&collections);
- /// println!("Somatic calls: {:?}", stats.somatic);
- /// ```
- pub fn from_collections(collections: &[VariantCollection]) -> Self {
- let mut stats = Self::default();
- for collection in collections.iter() {
- match collection.caller {
- Annotation::Callers(_, Sample::SoloTumor) => stats
- .solo_tumor
- .push((collection.caller.clone(), collection.variants.len())),
- Annotation::Callers(_, Sample::SoloConstit) => stats
- .solo_constit
- .push((collection.caller.clone(), collection.variants.len())),
- Annotation::Callers(_, Sample::Germline) => stats
- .germline
- .push((collection.caller.clone(), collection.variants.len())),
- Annotation::Callers(_, Sample::Somatic) => stats
- .somatic
- .push((collection.caller.clone(), collection.variants.len())),
- _ => (),
- };
- }
- stats
- }
- }
|