use crate::{ annotation::is_gnomad_and_constit_alt, create_should_run_normal_tumoral, init_solo_callers_normal_tumoral, io::bed::read_bed, pipes::{Initialize, InitializeSolo, ShouldRun}, positions::{GenomeRange, GenomeRangeExt}, scan::scan::SomaticScan, variant::{ variants_stats::somatic_depth_quality_ranges, vcf_variant::{run_if_required, ShouldRunBox}, }, }; use log::info; use rayon::slice::ParallelSliceMut; use serde::Serialize; use std::{ collections::BTreeMap, fs::{self, File}, path::{Path, PathBuf}, }; use crate::{ annotation::{Annotation, Annotations, AnnotationsStats, Sample}, callers::{ clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV, savana::Savana, severus::Severus, }, config::Config, create_should_run, init_somatic_callers, runners::Run, variant::{ variant_collection::{ExternalAnnotation, VariantCollection, Variants}, variants_stats::VariantsStats, vcf_variant::{load_variants, CallerBox}, }, }; /// Runs the full somatic variant calling pipeline for a single sample (`id`). /// /// This function orchestrates the entire somatic variant discovery process, /// starting from raw variant caller outputs (`PASSED VCF`) and applying multiple filtering /// and annotation steps to produce high-confidence somatic variants. /// /// This function orchestrates the end-to-end somatic variant discovery process: /// - Executes and verifies upstream components if necessary /// - Loads variants from multiple callers (tumor and normal samples) /// - Applies several annotation and filtering steps (e.g. depth, population frequency, entropy) /// - Tracks filtering statistics at each step /// - Outputs high-confidence somatic variants in both `.json.gz` and `.bit` formats /// /// ## Output Overview /// The final output includes: /// - `{tumoral_dir}/{id}_somatic_variants.json.gz`: annotated somatic variants /// - `{tumoral_dir}/{id}_somatic_variants.bit`: compact binary variant representation /// - `{stats_dir}/`: multiple intermediate JSON files with annotations and statistics /// /// ## Steps /// /// This pipeline performs the following high-level steps: /// /// ### 1. Output Existence Check /// If the final JSON result already exists and [`Config::somatic_pipe_force`] is not set, /// the pipeline aborts early to avoid overwriting results. /// /// ### 2. Initialization /// - Prepares statistics and output directories. /// - Initializes variant annotations. /// /// ### 3. Pre-requisite Tool Execution /// Runs any required upstream components (e.g., alignment, basecalling, variant callers) if /// their outputs are missing, using the [`run_if_required`] logic. /// /// ### 4. Load Variant Collections /// - Initializes the configured somatic variant callers and loads their output variants from PASSED VCF. /// - Also loads germline variants (from [`ClairS::germline`]) for comparative germline filtering. /// /// ### 5. Statistics Initialization /// - Initializes a [`SomaticPipeStats`] object to track the number of variants at each step. /// - Captures initial variant counts before filtering. /// - Aggregates variant counts from all sources and stores initial annotations for quality control and /// comparison before filtering. /// /// ### 6. Filter: Germline & Solo Constitutional Variants /// - Removes variants labeled as either Germline or Solo Constitutional, assuming they are unlikely to be somatic. /// - Records count of removed variants in [`SomaticPipeStats::n_constit_germline`]. /// /// ### 7. Annotation: BAM Read Depth and Alt Allele Counts /// - Uses the constitutional BAM file to annotate each variant with read depth and the number of alternate reads observed. /// - Flags variants with low depth or excessive alt reads for filtering as specified in [`Config`]. /// /// ### 8. Filtering: Low Depth / High Alt Alleles /// - Removes variants with low coverage in the constitutional sample, or excessive alt allele support (suggestive of germline origin). /// - Updates stats: /// - [`SomaticPipeStats::n_low_constit`] /// - [`SomaticPipeStats::n_high_alt_constit`] /// /// ### 9. Annotation: Sequence Entropy /// Adds Shannon entropy annotation based on the reference sequence context /// around each variant (cf. [`Config::entropy_seq_len`]) to flag low-complexity regions (often repetitive). /// /// ### 10. Annotation: External Databases (COSMIC, GnomAD): /// - Uses external resources to annotate variants with: /// - COSMIC hits (somatic mutation database) /// - GnomAD allele frequencies /// /// ### 11. Filtering: GnomAD + Alt Support in Constitutional Sample /// - Removes variants that are both present in GnomAD **and** show /// alternate allele support in the constitutional BAM. /// These are highly likely to be non-somatic germline polymorphisms. /// - Updates [`SomaticPipeStats::n_high_alt_constit_gnomad`] stat. /// /// ### 12. Filtering: Low Shannon Entropy: /// - Removes variants from low-complexity regions with entropy below the configured threshold /// (cf. [`Config::min_shannon_entropy`]). /// - Updates [`SomaticPipeStats::n_low_entropies`]. /// /// ### 13. Annotation: VEP (Variant Effect Predictor) /// Adds transcript-level annotations from Ensembl VEP, providing functional consequences, /// impacted genes, and regulatory features. /// /// ### 14. Merging /// Merges variant collections into a unified [`Variants`] structure, /// preserving annotations and applying deduplication logic. /// /// ### 15. Final Statistics and Saving /// - Saves final annotation stats, VEP summaries, and variant-level statistics. /// - Exports the final somatic variant set to both compressed JSON and `.bit` formats. /// /// # Returns /// - `Ok(())` if all steps completed successfully. /// - `Err` if any tool fails, if file I/O fails, or if logical conditions are violated (e.g., pre-existing output). /// /// # Errors /// - Returns early if the output file already exists and [`Config::somatic_pipe_force`] is `false`. /// - Wraps all component-specific errors using `anyhow::Result` with context. /// /// # Side Effects /// - Runs external tools conditionally (e.g., [`ClairS`], [`DeepSomatic`]). /// - Creates intermediate directories and annotation JSONs for debugging/QC. /// - May consume significant compute time depending on the number of callers, annotations, and variants. /// /// # TODOs /// - Support compressed intermediate files (`// TODO: GZ !!!`) /// - Improve filtering metrics reporting (currently not all filtered variants are tracked in final stats). /// /// # Example Output Files /// - `tumoral_dir/sample123_somatic_variants.json.gz` /// - `tumoral_dir/sample123_somatic_variants.bit` /// - `stats_dir/sample123_annotations_*.json` (intermediate annotation snapshots) /// /// # See Also /// - [`Annotations`] – core structure for managing per-variant metadata /// - [`Variants`] – the merged final variant structure /// - [`SomaticPipeStats`] – used for tracking variant counts throughout filtering /// pub struct SomaticPipe { /// Unique identifier for the sample. pub id: String, /// Configuration parameters for the pipeline. pub config: Config, } impl Initialize for SomaticPipe { /// Initializes a new `Somatic` instance with default annotations. fn initialize(id: &str, config: &crate::config::Config) -> anyhow::Result { let id = id.to_string(); Ok(Self { id, config: config.clone(), }) } } impl ShouldRun for SomaticPipe { fn should_run(&self) -> bool { // path to the existing “.bit” file let tumoral_dir = self.config.tumoral_dir(&self.id); let bit_path = Path::new(&tumoral_dir).join(format!("{}_somatic_variants.bit", self.id)); // if we can’t read its modification time, re-run let res_meta = match fs::metadata(&bit_path).and_then(|m| m.modified()) { Ok(ts) => ts, Err(_) => return true, }; let deps: [PathBuf; 5] = [ Path::new(&self.config.normal_bam(&self.id)).to_path_buf(), Path::new(&self.config.tumoral_bam(&self.id)).to_path_buf(), Path::new(&self.config.reference).to_path_buf(), Path::new(&self.config.vntrs_bed).to_path_buf(), Path::new(&self.config.repeats_bed).to_path_buf(), ]; deps.iter().any(|p| { fs::metadata(p) .and_then(|m| m.modified()) .map_or(true, |ts| ts > res_meta) }) } } impl Run for SomaticPipe { /// Executes the full somatic variant analysis pipeline. fn run(&mut self) -> anyhow::Result<()> { let config = self.config.clone(); let id = self.id.clone(); info!("Running somatic pipe for {id}."); // Define output paths for the final somatic variants let result_json = format!("{}/{id}_somatic_variants.json.gz", config.tumoral_dir(&id)); let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id)); let result_vcf = format!("{}/{id}_somatic_variants.vcf.gz", config.tumoral_dir(&id)); let outputs = [&result_json, &result_bit, &result_vcf]; if !config.somatic_pipe_force && outputs.iter().any(|p| Path::new(p).exists()) { return Err(anyhow::anyhow!( "Somatic Pipe output already exists for {id}." )); } let mut annotations = Annotations::default(); // Create stats directory if it doesn't exist let stats_dir = config.somatic_pipe_stats(&id); if !Path::new(&stats_dir).exists() { fs::create_dir(&stats_dir)?; } // Initialize and run any pre-required input if necessary info!("Initialization prerequired pipe inputs..."); let mut to_run_if_req = create_should_run!( &id, &config, SomaticScan, ClairS, NanomonSV, Severus, Savana, DeepSomatic ); to_run_if_req.extend(create_should_run_normal_tumoral!(&id, &config, DeepVariant)); info!("Running prerequired pipe components."); run_if_required(&mut to_run_if_req).map_err(|e| { anyhow::anyhow!("Failed to run a prerequired component of somatic pipe for {id}.\n{e}") })?; // Initialize variant callers let mut callers = init_somatic_callers!( &id, &config, ClairS, NanomonSV, Severus, Savana, DeepSomatic ); callers.extend(init_solo_callers_normal_tumoral!(&id, &config, DeepVariant,)); // Load variants from each caller info!("Loading variants."); let mut variants_collections = load_variants(&mut callers, &annotations) .map_err(|e| anyhow::anyhow!("Error while loading variants\n{e}"))?; // Load germline variants using ClairS info!("Loading germline variants."); let clairs_germline = ClairS::initialize(&id, &self.config.clone())?.germline(&annotations)?; variants_collections.push(clairs_germline); // Initialize statistics let mut somatic_stats = SomaticPipeStats::init(&variants_collections); info!( "Variants collections loaded from {} vcf (total of {} variants loaded)", variants_collections.len(), variants_collections .iter() .map(|v| v.variants.len()) .sum::() ); // Initial annotation stats (caller annotations only) let caller_cat_anns = |v: &Annotation| matches!(v, Annotation::Callers(_, _)); let annot_init = annotations.callers_stat(Some(Box::new(caller_cat_anns))); somatic_stats.annot_init( &annot_init, &format!("{stats_dir}/{id}_germline_filter.json"), )?; annot_init.save_to_json(&format!("{stats_dir}/{id}_annotations_01.json"))?; // Filter out germline and solo constitutional variants info!( "Keeping somatic variants (variants neither in solo normal {} nor in germline).", config.normal_name ); somatic_stats.n_constit_germline = annotations.retain_variants(&mut variants_collections, |anns| { !anns.iter().any(|ann| { matches!( ann, Annotation::Callers(_, Sample::Germline) | Annotation::Callers(_, Sample::SoloConstit) ) }) }); annotations .callers_stat(Some(Box::new(caller_cat_anns))) .save_to_json(&format!( "{stats_dir}/{id}_annotations_02_post_germline.json" ))?; // Remove deletions stretch // info!("Removing deletions stretchs:"); // variants_collections.iter_mut().for_each(|coll| { // let rm = coll.remove_strech(); // info!("\t - {} {}: {}", coll.vcf.caller, coll.vcf.time, rm); // }); // MASK mapq let (mut high_depth_ranges, mut low_quality_ranges) = somatic_depth_quality_ranges(&id, &config)?; high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start)); low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start)); info!( "High-depth ranges: n={} bp={}", high_depth_ranges.len(), high_depth_ranges.total_len(), ); info!( "LowQ ranges: n={} bp={}", low_quality_ranges.len(), low_quality_ranges.total_len(), ); variants_collections.iter_mut().for_each(|e| { let _ = e.annotate_with_ranges(&low_quality_ranges, &annotations, Annotation::LowMAPQ); }); let n_masked_lowqual = annotations.retain_variants(&mut variants_collections, |anns| { !anns.contains(&Annotation::LowMAPQ) }); info!("N low mapq filtered: {}", n_masked_lowqual); // Annotate with depth and number of alternate reads from constitutional BAM info!("Reading Constit BAM file for depth and pileup annotation."); variants_collections.iter().try_for_each(|c| { c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150) })?; annotations.somatic_constit_boundaries( self.config.somatic_max_alt_constit, self.config.somatic_min_constit_depth, ); annotations .callers_stat(Some(Box::new(|v| { matches!( v, Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_) | Annotation::HighConstitAlt | Annotation::LowConstitDepth ) }))) .save_to_json(&format!("{stats_dir}/{id}_annotations_03_bam.json"))?; // Filter based on low constitutional depth info!( "Removing variants when depth in constit bam < {}.", self.config.somatic_min_constit_depth ); somatic_stats.n_low_constit = annotations .retain_variants(&mut variants_collections, |anns| { !anns.contains(&Annotation::LowConstitDepth) }); info!( "{} variants removed when depth in constit bam < {}.", somatic_stats.n_low_constit, self.config.somatic_min_constit_depth ); // Filter: remove HighConstitAlt info!( "Removing variants with SNP/indel inside the constit alignements > {}", self.config.somatic_max_alt_constit ); somatic_stats.n_high_alt_constit = annotations .retain_variants(&mut variants_collections, |anns| { !anns.contains(&Annotation::HighConstitAlt) }); info!( "{} variants removed with SNP/indel inside the constit alignements > {}", somatic_stats.n_high_alt_constit, self.config.somatic_max_alt_constit ); annotations .callers_stat(Some(Box::new(|v| { matches!( v, Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_) ) }))) .save_to_json(&format!("{stats_dir}/{id}_annotations_04_bam_filter.json"))?; // Annotate variants with sequence context (entropy + trinucleotide) info!( "Annotating variants with sequence context using reference: {}", self.config.reference ); variants_collections.iter().try_for_each(|c| { c.annotate_with_sequence_context( &annotations, &self.config.reference, self.config.entropy_seq_len, self.config.somatic_pipe_threads, ) })?; // Annotate with external databases like COSMIC and GnomAD info!("Annotation with external databases like COSMIC and GnomAD."); variants_collections .iter() .try_for_each(|c| -> anyhow::Result<()> { let ext_annot = ExternalAnnotation::init(&config)?; ext_annot.annotate(&c.variants, &annotations)?; Ok(()) })?; annotations .callers_stat(Some(Box::new(|v| { matches!( v, Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_) ) }))) .save_to_json(&format!("{stats_dir}/{id}_annotations_05_gnomad.json"))?; // Filter: Remove variants present in GnomAD and have alt reads in constitutional sample info!("Filtering out variants in GnomAD and in constit bam at low AF."); somatic_stats.n_high_alt_constit_gnomad = annotations .retain_variants(&mut variants_collections, |anns| { !is_gnomad_and_constit_alt(anns) }); info!( "{} variants filtered, with constit alt <= max contig alt ({}) and in GnomAD.", somatic_stats.n_high_alt_constit_gnomad, self.config.somatic_max_alt_constit ); // TODO: These stats doesn't capture filter metrics !!! annotations .callers_stat(Some(Box::new(|v| { matches!( v, Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_) ) }))) .save_to_json(&format!( "{stats_dir}/{id}_annotations_06_gnomad_filter.json" ))?; // Filter low entropy variants annotations.low_shannon_entropy(self.config.min_shannon_entropy); info!( "Filtering out variants with low entropies ({})", config.min_shannon_entropy ); annotations .callers_stat(Some(Box::new(|v| { matches!(v, Annotation::Callers(_, _) | Annotation::LowEntropy) }))) .save_to_json(&format!("{stats_dir}/{id}_annotations_07_entropy.json"))?; somatic_stats.n_low_entropies = annotations .retain_variants(&mut variants_collections, |anns| { !anns.contains(&Annotation::LowEntropy) }); annotations .callers_stat(Some(Box::new(|v| matches!(v, Annotation::Callers(_, _))))) .save_to_json(&format!( "{stats_dir}/{id}_annotations_08_entropy_filter.json" ))?; // Final VEP annotation info!("Annotation with VEP."); variants_collections .iter() .try_for_each(|c| -> anyhow::Result<()> { let ext_annot = ExternalAnnotation::init(&config)?; ext_annot.annotate_vep(&c.variants, &annotations)?; Ok(()) })?; annotations .callers_stat(Some(Box::new(caller_cat_anns))) .save_to_json(&format!("{stats_dir}/{id}_annotations_09_vep.json"))?; // Merge all variants into a final collection let mut variants = variants_collections.into_iter().fold( Variants::default(), |mut acc, variants_collection| { acc.merge(variants_collection, &annotations); acc }, ); let vep_stats = annotations.vep_stats()?; vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?; VariantsStats::new(&mut variants, &id, &config, &high_depth_ranges)? .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?; info!("Final unique variants: {}", variants.data.len()); // Ensure sorted (should already be sorted) variants.sort(); let vntrs: Vec = read_bed(&config.vntrs_bed)? .iter() .map(|r| r.range.clone()) .collect(); variants.annotate_with_ranges(&vntrs, Some(Annotation::VNTR), 0, Vec::new()); let repeat_masker: Vec = read_bed(&config.repeats_bed)? .iter() .map(|r| r.range.clone()) .collect(); variants.annotate_with_ranges(&repeat_masker, Some(Annotation::Repeat), 0, Vec::new()); variants.save_to_json(&result_json)?; variants.save_to_file(&result_bit)?; variants.write_vcf(&result_vcf, &config.dict_file, config.somatic_pipe_force)?; Ok(()) } } pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> { info!("Loading Germline"); let annotations = Annotations::default(); let clairs_germline = ClairS::initialize(&id, &config.clone())?.germline(&annotations)?; info!("Annotation with Cosmic and GnomAD."); let ext_annot = ExternalAnnotation::init(&config)?; ext_annot.annotate(&clairs_germline.variants, &annotations)?; let mut variants = Variants::default(); variants.merge(clairs_germline, &annotations); info!("-- {}", variants.data.len()); // let u = VariantsStats::new(&mut variants, &id, &config)?; // info!("++ {}", u.n); Ok(()) } /// Holds statistical data for somatic variant pipeline processing, /// including summary counts and input categorization. #[derive(Debug, Default, Clone)] pub struct SomaticPipeStats { /// Summary of input variant collections grouped by sample type. pub input: InputStats, /// Number of variants removed because they were classified as germline /// by at least one caller, or detected in the normal-only (solo constitutional) /// callset (including variants classified as both). pub n_constit_germline: usize, /// Number of variants removed due to low depth in the constitutional (normal) BAM /// (i.e. annotated with [`Annotation::LowConstitDepth`]). pub n_low_constit: usize, /// Number of variants removed due to excessive alternate-allele support in the /// constitutional (normal) BAM (i.e. annotated with [`Annotation::HighConstitAlt`]). pub n_high_alt_constit: usize, /// Number of variants removed because they are present in gnomAD and also show /// alternate-allele support in the constitutional (normal) BAM, according to /// [`is_gnomad_and_constit_alt`]. pub n_high_alt_constit_gnomad: usize, /// Number of variants removed due to low sequence complexity (Shannon entropy), /// i.e. annotated with [`Annotation::LowEntropy`]. pub n_low_entropies: usize, } impl SomaticPipeStats { /// Initializes a `SomaticPipeStats` object with populated `InputStats` based on the /// provided `VariantCollection`s. pub fn init(collections: &[VariantCollection]) -> Self { Self { input: InputStats::from_collections(collections), ..Default::default() } } /// Generates a tumor-vs-germline annotation matrix and writes it to a JSON file. /// /// This method analyzes co-occurrence patterns between tumor and germline/constit /// variant callers by iterating through annotation statistics from `AnnotationsStats`. /// It builds a matrix where each row corresponds to a **tumor caller** and each column /// corresponds to a **germline or constitutional caller**. Each cell contains the number /// of variant calls that were annotated by both the tumor and the germline/constit caller. /// /// In addition to writing a structured JSON file to the specified path, the function also /// prints a tab-separated human-readable matrix to stdout for convenience. /// /// # Parameters /// /// - `stats`: A reference to [`AnnotationsStats`] containing categorical annotations. /// This object holds a frequency map where keys are combinations of `Annotation` /// values (serialized as strings) and values are occurrence counts. /// - `json_path`: The path where the resulting JSON output file should be written. /// /// # Output Formats /// /// ## JSON Output (`json_path`) /// /// The output JSON is an **array of tumor caller records**, each containing: /// - `caller_name`: The name of the tumor caller (as a string). /// - `germline`: An array of JSON objects, each with one key-value pair, /// where the key is a germline/constit caller (or combination) and the value is the count. /// /// ### Example /// ```json /// [ /// { /// "caller_name": "DeepVariant SoloTumor", /// "germline": { /// "ClairS Germline": 99978, /// "ClairS Germline + DeepVariant SoloConstit": 4710570 /// } /// }, /// { /// "caller_name": "ClairS Somatic", /// "germline": { /// "ClairS Germline": 944, /// "ClairS Germline + DeepVariant SoloConstit": 362 /// } /// } /// ] /// ``` /// /// ## Console Output (TSV) /// /// A tab-separated matrix is printed to stdout. Each row begins with a tumor caller, /// followed by columns showing germline/constit caller combinations and their counts. /// /// # Notes /// - Tumor callers are collected from `self.input.somatic` and `self.input.solo_tumor`. /// - Germline/constit callers are collected from `self.input.germline` and `self.input.solo_constit`. /// - Keys in the original `AnnotationsStats` map are split using `" + "` and parsed into `Annotation` enums. /// - Germline keys are sorted and joined to form canonical column labels. /// - Tumor and germline annotations are matched by exact `Annotation` equality. /// /// # Errors /// Returns an error if: /// - Any annotation key in `AnnotationsStats` fails to parse into a valid `Annotation`. /// - The JSON output file cannot be created or written to. /// /// # Dependencies /// Requires that `Annotation` implements `FromStr`, `ToString`, `PartialEq`, and `Clone`. /// /// # Example Usage /// ```rust /// let mut stats = SomaticPipeStats::init(&collections); /// stats.annot_init(&annotation_stats, "output/matrix.json")?; /// ``` pub fn annot_init(&self, stats: &AnnotationsStats, json_path: &str) -> anyhow::Result<()> { #[derive(Serialize)] struct TumorRow { caller_name: String, germline: BTreeMap, } // Parse stats keys into Vec once let parsed: Vec<(Vec, u64)> = stats .categorical .iter() .map(|e| { let anns = e .key() .split(" + ") .map(|k| k.parse::()) .collect::>>() .map_err(|err| { anyhow::anyhow!("Error while parsing AnnotationsStats key: {err}") })?; Ok((anns, *e.value())) }) .collect::>()?; // Collect caller labels let tumor_callers: Vec = self .input .somatic .iter() .map(|(a, _)| a.clone()) .chain(self.input.solo_tumor.iter().map(|(a, _)| a.clone())) .collect(); let germ_callers: Vec = self .input .germline .iter() .map(|(a, _)| a.clone()) .chain(self.input.solo_constit.iter().map(|(a, _)| a.clone())) .collect(); // Matrix: tumor -> (germline label -> count), deterministic by construction let mut matrix: BTreeMap> = BTreeMap::new(); for (anns, v) in parsed.iter() { // only rows that include any germline/constit label if !anns.iter().any(|a| { matches!( a, Annotation::Callers(_, Sample::SoloConstit) | Annotation::Callers(_, Sample::Germline) ) }) { continue; } // Which tumor callers are present in this set? let present_tumors: Vec = tumor_callers .iter() .filter(|t| anns.contains(t)) .map(|t| t.to_string()) .collect(); if present_tumors.is_empty() { continue; } // Canonical germline key: sorted labels joined by " + " let mut present_germs: Vec = germ_callers .iter() .filter(|g| anns.contains(g)) .map(|g| g.to_string()) .collect(); present_germs.sort(); let germ_key = present_germs.join(" + "); for tumor in present_tumors { *matrix .entry(tumor) .or_default() .entry(germ_key.clone()) .or_default() += *v; } } // Collect all germline columns (deterministic) let mut germ_cols: Vec = matrix .values() .flat_map(|row| row.keys().cloned()) .collect(); germ_cols.sort(); germ_cols.dedup(); // TSV output (deterministic ordering) let mut lines: Vec = Vec::with_capacity(matrix.len()); for (tumor, row) in matrix.iter() { let line = format!( "{tumor}\t{}", germ_cols .iter() .map(|g| format!("{g}: {}", row.get(g).copied().unwrap_or(0))) .collect::>() .join("\t") ); lines.push(line); } println!("{}", lines.join("\n")); // JSON output: list of rows with all columns filled (0 if absent) let json_rows: Vec = matrix .iter() .map(|(tumor, row)| { let mut full = BTreeMap::new(); for g in germ_cols.iter() { full.insert(g.clone(), row.get(g).copied().unwrap_or(0)); } TumorRow { caller_name: tumor.clone(), germline: full, } }) .collect(); let file = File::create(json_path)?; serde_json::to_writer_pretty(file, &json_rows)?; Ok(()) } } /// Aggregated statistics on variant collections, grouped by sample type. /// /// The `InputStats` struct stores the number of variants observed for each /// caller, categorized by biological sample type: /// /// - `solo_tumor`: Variants from tumor-only samples. /// - `solo_constit`: Variants from normal-only (constitutional) samples. /// - `germline`: Variants classified as germline (shared between normal and tumor). /// - `somatic`: Variants classified as somatic (tumor-specific). /// /// Each vector contains tuples of the form `(Annotation, usize)`, where: /// - `Annotation` identifies the caller and sample type. /// - `usize` represents the number of variants for that annotation. #[derive(Debug, Default, Clone)] pub struct InputStats { /// Variants from tumor-only samples. pub solo_tumor: Vec<(Annotation, usize)>, /// Variants from normal-only (constitutional) samples. pub solo_constit: Vec<(Annotation, usize)>, /// Germline variants (present in both normal and tumor). pub germline: Vec<(Annotation, usize)>, /// Somatic variants (specific to tumor). pub somatic: Vec<(Annotation, usize)>, } impl InputStats { /// Constructs an `InputStats` instance by aggregating variant counts /// from a list of [`VariantCollection`]s. /// /// Each `VariantCollection` is inspected to determine its sample type, /// as indicated by the `Annotation::Callers(_, Sample)` variant. The /// number of variants (`collection.variants.len()`) is recorded in the /// appropriate field of `InputStats`, grouped by `Sample`. /// /// # Parameters /// /// - `collections`: A slice of [`VariantCollection`] objects containing /// variants annotated with a caller and sample type. /// /// # Returns /// /// A populated `InputStats` instance with counts per (caller, sample type). /// /// # Examples /// /// ``` /// let collections: Vec = load_collections(); /// let stats = InputStats::from_collections(&collections); /// println!("Somatic calls: {:?}", stats.somatic); /// ``` pub fn from_collections(collections: &[VariantCollection]) -> Self { let mut stats = Self::default(); for collection in collections.iter() { match collection.caller { Annotation::Callers(_, Sample::SoloTumor) => stats .solo_tumor .push((collection.caller.clone(), collection.variants.len())), Annotation::Callers(_, Sample::SoloConstit) => stats .solo_constit .push((collection.caller.clone(), collection.variants.len())), Annotation::Callers(_, Sample::Germline) => stats .germline .push((collection.caller.clone(), collection.variants.len())), Annotation::Callers(_, Sample::Somatic) => stats .somatic .push((collection.caller.clone(), collection.variants.len())), _ => (), }; } stats } } #[cfg(test)] mod tests { use rayon::ThreadPoolBuilder; use super::*; use crate::helpers::test_init; #[test] fn somatic_pipe_run() -> anyhow::Result<()> { test_init(); let config = Config::default(); let pool = ThreadPoolBuilder::new() .num_threads(config.threads.into()) .build()?; pool.install(|| SomaticPipe::initialize("CHAHA", &config)?.run())?; Ok(()) } }