use crate::{ annotation::is_gnomad_and_constit_alt, collection::ShouldRun, create_should_run_normal_tumoral, init_solo_callers_normal_tumoral, io::bed::read_bed, positions::GenomeRange, scan::scan::SomaticScan, variant::{ variant::{run_if_required, ShouldRunBox}, variants_stats::somatic_depth_quality_ranges, } }; use itertools::Itertools; use log::info; use rayon::slice::ParallelSliceMut; use std::{ collections::HashMap, fs::{self, File}, io::Write, path::Path, }; use crate::{ annotation::{Annotation, Annotations, AnnotationsStats, Sample}, callers::{ clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV, savana::Savana, severus::Severus, }, collection::{Initialize, InitializeSolo}, config::Config, create_should_run, init_somatic_callers, runners::Run, variant::{ variant::{load_variants, CallerBox}, variant_collection::{ExternalAnnotation, VariantCollection, Variants}, variants_stats::VariantsStats, }, }; /// Runs the full somatic variant calling pipeline for a single sample (`id`). /// /// This function orchestrates the entire somatic variant discovery process, /// starting from raw variant caller outputs (`PASSED VCF`) and applying multiple filtering /// and annotation steps to produce high-confidence somatic variants. /// /// This function orchestrates the end-to-end somatic variant discovery process: /// - Executes and verifies upstream components if necessary /// - Loads variants from multiple callers (tumor and normal samples) /// - Applies several annotation and filtering steps (e.g. depth, population frequency, entropy) /// - Tracks filtering statistics at each step /// - Outputs high-confidence somatic variants in both `.json.gz` and `.bit` formats /// /// ## Output Overview /// The final output includes: /// - `{tumoral_dir}/{id}_somatic_variants.json.gz`: annotated somatic variants /// - `{tumoral_dir}/{id}_somatic_variants.bit`: compact binary variant representation /// - `{stats_dir}/`: multiple intermediate JSON files with annotations and statistics /// /// ## Steps /// /// This pipeline performs the following high-level steps: /// /// ### 1. Output Existence Check /// If the final JSON result already exists and [`Config::somatic_pipe_force`] is not set, /// the pipeline aborts early to avoid overwriting results. /// /// ### 2. Initialization /// - Prepares statistics and output directories. /// - Initializes variant annotations. /// /// ### 3. Pre-requisite Tool Execution /// Runs any required upstream components (e.g., alignment, basecalling, variant callers) if /// their outputs are missing, using the [`run_if_required`] logic. /// /// ### 4. Load Variant Collections /// - Initializes the configured somatic variant callers and loads their output variants from PASSED VCF. /// - Also loads germline variants (from [`ClairS::germline`]) for comparative germline filtering. /// /// ### 5. Statistics Initialization /// - Initializes a [`SomaticPipeStats`] object to track the number of variants at each step. /// - Captures initial variant counts before filtering. /// - Aggregates variant counts from all sources and stores initial annotations for quality control and /// comparison before filtering. /// /// ### 6. Filter: Germline & Solo Constitutional Variants /// - Removes variants labeled as either Germline or Solo Constitutional, assuming they are unlikely to be somatic. /// - Records count of removed variants in [`SomaticPipeStats::n_constit_germline`]. /// /// ### 7. Annotation: BAM Read Depth and Alt Allele Counts /// - Uses the constitutional BAM file to annotate each variant with read depth and the number of alternate reads observed. /// - Flags variants with low depth or excessive alt reads for filtering as specified in [`Config`]. /// /// ### 8. Filtering: Low Depth / High Alt Alleles /// - Removes variants with low coverage in the constitutional sample, or excessive alt allele support (suggestive of germline origin). /// - Updates stats: /// - [`SomaticPipeStats::n_low_constit`] /// - [`SomaticPipeStats::n_high_alt_constit`] /// /// ### 9. Annotation: Sequence Entropy /// Adds Shannon entropy annotation based on the reference sequence context /// around each variant (cf. [`Config::entropy_seq_len`]) to flag low-complexity regions (often repetitive). /// /// ### 10. Annotation: External Databases (COSMIC, GnomAD): /// - Uses external resources to annotate variants with: /// - COSMIC hits (somatic mutation database) /// - GnomAD allele frequencies /// /// ### 11. Filtering: GnomAD + Alt Support in Constitutional Sample /// - Removes variants that are both present in GnomAD **and** show /// alternate allele support in the constitutional BAM. /// These are highly likely to be non-somatic germline polymorphisms. /// - Updates [`SomaticPipeStats::n_high_alt_constit_gnomad`] stat. /// /// ### 12. Filtering: Low Shannon Entropy: /// - Removes variants from low-complexity regions with entropy below the configured threshold /// (cf. [`Config::min_shannon_entropy`]). /// - Updates [`SomaticPipeStats::n_low_entropies`]. /// /// ### 13. Annotation: VEP (Variant Effect Predictor) /// Adds transcript-level annotations from Ensembl VEP, providing functional consequences, /// impacted genes, and regulatory features. /// /// ### 14. Merging /// Merges variant collections into a unified [`Variants`] structure, /// preserving annotations and applying deduplication logic. /// /// ### 15. Final Statistics and Saving /// - Saves final annotation stats, VEP summaries, and variant-level statistics. /// - Exports the final somatic variant set to both compressed JSON and `.bit` formats. /// /// # Returns /// - `Ok(())` if all steps completed successfully. /// - `Err` if any tool fails, if file I/O fails, or if logical conditions are violated (e.g., pre-existing output). /// /// # Errors /// - Returns early if the output file already exists and [`Config::somatic_pipe_force`] is `false`. /// - Wraps all component-specific errors using `anyhow::Result` with context. /// /// # Side Effects /// - Runs external tools conditionally (e.g., [`ClairS`], [`DeepSomatic`]). /// - Creates intermediate directories and annotation JSONs for debugging/QC. /// - May consume significant compute time depending on the number of callers, annotations, and variants. /// /// # TODOs /// - Support compressed intermediate files (`// TODO: GZ !!!`) /// - Improve filtering metrics reporting (currently not all filtered variants are tracked in final stats). /// /// # Example Output Files /// - `tumoral_dir/sample123_somatic_variants.json.gz` /// - `tumoral_dir/sample123_somatic_variants.bit` /// - `stats_dir/sample123_annotations_*.json` (intermediate annotation snapshots) /// /// # See Also /// - [`Annotations`] – core structure for managing per-variant metadata /// - [`Variants`] – the merged final variant structure /// - [`SomaticPipeStats`] – used for tracking variant counts throughout filtering /// pub struct SomaticPipe { /// Unique identifier for the sample. pub id: String, /// Configuration parameters for the pipeline. pub config: Config, } impl Initialize for SomaticPipe { /// Initializes a new `Somatic` instance with default annotations. fn initialize(id: &str, config: crate::config::Config) -> anyhow::Result { let id = id.to_string(); Ok(Self { id, config }) } } impl ShouldRun for SomaticPipe { fn should_run(&self) -> bool { let tumoral_dir = self.config.tumoral_dir(&self.id); let path_str = format!("{}/{}_somatic_variants.bit", tumoral_dir, self.id); let path = Path::new(&path_str); !path.exists() } } impl Run for SomaticPipe { /// Executes the full somatic variant analysis pipeline. fn run(&mut self) -> anyhow::Result<()> { let config = self.config.clone(); let id = self.id.clone(); info!("Running somatic pipe for {id}."); // Define output paths for the final somatic variants let result_json = format!("{}/{id}_somatic_variants.json.gz", config.tumoral_dir(&id)); let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id)); let result_vcf = format!("{}/{id}_somatic_variants.vcf.gz", config.tumoral_dir(&id)); if Path::new(&result_bit).exists() && !config.somatic_pipe_force { return Err(anyhow::anyhow!( "Somatic Pipe output already exists for {id}." )); } let mut annotations = Annotations::default(); // Create stats directory if it doesn't exist let stats_dir = config.somatic_pipe_stats(&id); if !Path::new(&stats_dir).exists() { fs::create_dir(&stats_dir)?; } // Initialize and run any pre-required input if necessary info!("Initialization prerequired pipe inputs..."); let mut to_run_if_req = create_should_run!( &id, &config, SomaticScan, ClairS, NanomonSV, Severus, Savana, DeepSomatic ); to_run_if_req.extend(create_should_run_normal_tumoral!(&id, &config, DeepVariant)); info!("Running prerequired pipe components."); run_if_required(&mut to_run_if_req).map_err(|e| { anyhow::anyhow!("Failed to run a prerequired component of somatic pipe for {id}.\n{e}") })?; // Initialize variant callers let mut callers = init_somatic_callers!( &id, &config, ClairS, NanomonSV, Severus, Savana, DeepSomatic ); callers.extend(init_solo_callers_normal_tumoral!(&id, &config, DeepVariant,)); // Load variants from each caller info!("Loading variants."); let mut variants_collections = load_variants(&mut callers, &annotations) .map_err(|e| anyhow::anyhow!("Error while loading variants\n{e}"))?; // Load germline variants using ClairS info!("Loading germline variants."); let clairs_germline = ClairS::initialize(&id, self.config.clone())?.germline(&annotations)?; variants_collections.push(clairs_germline); // Initialize statistics let mut somatic_stats = SomaticPipeStats::init(&variants_collections); info!( "Variants collections loaded from {} vcf (total of {} variants loaded)", variants_collections.len(), variants_collections .iter() .map(|v| v.variants.len()) .sum::() ); // Initial annotation stats (caller annotations only) let caller_cat_anns = |v: &Annotation| matches!(v, Annotation::Callers(_, _)); let annot_init = annotations.callers_stat(Some(Box::new(caller_cat_anns))); somatic_stats.annot_init( &annot_init, &format!("{stats_dir}/{id}_germline_filter.json"), )?; annot_init.save_to_json(&format!("{stats_dir}/{id}_annotations_01.json"))?; // Filter out germline and solo constitutional variants info!( "Keeping somatic variants (variants neither in solo {} nor in germline).", config.normal_name ); somatic_stats.n_constit_germline = annotations.retain_variants(&mut variants_collections, |anns| { !anns.iter().any(|ann| { matches!( ann, Annotation::Callers(_, Sample::Germline) | Annotation::Callers(_, Sample::SoloConstit) ) }) }); annotations .callers_stat(Some(Box::new(caller_cat_anns))) .save_to_json(&format!( "{stats_dir}/{id}_annotations_02_post_germline.json" ))?; // MASK mapq let (mut high_depth_ranges, mut low_quality_ranges) = somatic_depth_quality_ranges(&id, &config)?; high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start)); low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start)); info!("ranges: {}", low_quality_ranges.len()); variants_collections.iter_mut().for_each(|e | { let _ = e.annotate_with_ranges(&low_quality_ranges, &annotations, Annotation::LowMAPQ); }); let n_masked_lowqual = annotations.retain_variants(&mut variants_collections, |anns| { anns.contains(&Annotation::LowMAPQ) }); info!("N low mapq filtered: {}", n_masked_lowqual); // Annotate with depth and number of alternate reads from constitutional BAM info!("Reading Constit BAM file for depth and pileup annotation."); variants_collections.iter().try_for_each(|c| { c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150) })?; annotations.somatic_constit_boundaries( self.config.somatic_max_alt_constit, self.config.somatic_min_constit_depth, ); annotations .callers_stat(Some(Box::new(|v| { matches!( v, Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_) | Annotation::HighConstitAlt | Annotation::LowConstitDepth ) }))) .save_to_json(&format!("{stats_dir}/{id}_annotations_03_bam.json"))?; // Filter based on low constitutional depth info!( "Removing variants when depth in constit bam < {}.", self.config.somatic_min_constit_depth ); somatic_stats.n_low_constit = annotations .retain_variants(&mut variants_collections, |anns| { !anns.contains(&Annotation::LowConstitDepth) }); info!( "{} variants removed when depth in constit bam < {}.", somatic_stats.n_low_constit, self.config.somatic_min_constit_depth ); // Filter: remove HighConstitAlt info!( "Removing variants with SNP/indel inside the constit alignements > {}", self.config.somatic_max_alt_constit ); somatic_stats.n_high_alt_constit = annotations .retain_variants(&mut variants_collections, |anns| { !anns.contains(&Annotation::HighConstitAlt) }); info!( "{} variants removed with SNP/indel inside the constit alignements > {}", somatic_stats.n_high_alt_constit, self.config.somatic_max_alt_constit ); annotations .callers_stat(Some(Box::new(|v| { matches!( v, Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_) ) }))) .save_to_json(&format!("{stats_dir}/{id}_annotations_04_bam_filter.json"))?; // Annotate variants with sequence entropy info!( "Entropy annotation from {} sequences.", self.config.reference ); variants_collections.iter().for_each(|c| { c.annotate_with_sequence_context( &annotations, &self.config.reference, self.config.entropy_seq_len, self.config.somatic_pipe_threads, ); }); // Annotate with external databases like COSMIC and GnomAD info!("Annotation with external databases like COSMIC and GnomAD."); variants_collections .iter() .try_for_each(|c| -> anyhow::Result<()> { let ext_annot = ExternalAnnotation::init()?; ext_annot.annotate(&c.variants, &annotations)?; Ok(()) })?; annotations .callers_stat(Some(Box::new(|v| { matches!( v, Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_) ) }))) .save_to_json(&format!("{stats_dir}/{id}_annotations_05_gnomad.json"))?; // Filter: Remove variants present in GnomAD and have alt reads in constitutional sample info!("Filtering out variants in GnomAD and in constit bam at low AF."); somatic_stats.n_high_alt_constit_gnomad = annotations .retain_variants(&mut variants_collections, |anns| { !is_gnomad_and_constit_alt(anns) }); info!( "{} variants filtered, with constit alt <= max contig alt ({}) and in GnomAD.", somatic_stats.n_high_alt_constit_gnomad, self.config.somatic_max_alt_constit ); // TODO: These stats doesn't capture filter metrics !!! annotations .callers_stat(Some(Box::new(|v| { matches!( v, Annotation::Callers(_, _) | Annotation::ConstitAlt(_) | Annotation::GnomAD(_) ) }))) .save_to_json(&format!( "{stats_dir}/{id}_annotations_06_gnomad_filter.json" ))?; // Filter low entropy variants annotations.low_shannon_entropy(self.config.min_shannon_entropy); info!( "Filtering out variants with low entropies ({})", config.min_shannon_entropy ); annotations .callers_stat(Some(Box::new(|v| { matches!(v, Annotation::Callers(_, _) | Annotation::LowEntropy) }))) .save_to_json(&format!("{stats_dir}/{id}_annotations_07_entropy.json"))?; somatic_stats.n_low_entropies = annotations .retain_variants(&mut variants_collections, |anns| { !anns.contains(&Annotation::LowEntropy) }); annotations .callers_stat(Some(Box::new(|v| matches!(v, Annotation::Callers(_, _))))) .save_to_json(&format!( "{stats_dir}/{id}_annotations_08_entropy_filter.json" ))?; // Final VEP annotation info!("Annotation with VEP."); variants_collections .iter() .try_for_each(|c| -> anyhow::Result<()> { let ext_annot = ExternalAnnotation::init()?; ext_annot.annotate_vep(&c.variants, &annotations)?; Ok(()) })?; annotations .callers_stat(Some(Box::new(caller_cat_anns))) .save_to_json(&format!("{stats_dir}/{id}_annotations_09_vep.json"))?; annotations.vep_stats()?; // Merge all variants into a final collection let mut variants = variants_collections.into_iter().fold( Variants::default(), |mut acc, variants_collection| { acc.merge(variants_collection, &annotations); acc }, ); let vep_stats = annotations.vep_stats()?; vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?; VariantsStats::new(&mut variants, &id, &config, &high_depth_ranges)? .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?; info!("Final unique variants: {}", variants.data.len()); // Ensure sorted (should already be sorted) variants.sort(); let vntrs: Vec = read_bed("/data/ref/hs1/vntrs_chm13.bed")?.iter().map(|r| r.range.clone()).collect(); variants.annotate_with_ranges(&vntrs, Some(Annotation::VNTR), 0, Vec::new()); let repeat_masker: Vec = read_bed("/data/ref/hs1/chm13v2.0_RepeatMasker_4.1.2p1.2022Apr14.bed")?.iter().map(|r| r.range.clone()).collect(); variants.annotate_with_ranges(&repeat_masker, Some(Annotation::RepeatMasker), 0, Vec::new()); variants.save_to_json(&result_json)?; variants.save_to_file(&result_bit)?; variants.write_vcf(&result_vcf, &config.dict_file, config.somatic_pipe_force)?; Ok(()) } } pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> { info!("Loading Germline"); let annotations = Annotations::default(); let clairs_germline = ClairS::initialize(&id, config.clone())?.germline(&annotations)?; info!("Annotation with Cosmic and GnomAD."); let ext_annot = ExternalAnnotation::init()?; ext_annot.annotate(&clairs_germline.variants, &annotations)?; let mut variants = Variants::default(); variants.merge(clairs_germline, &annotations); info!("-- {}", variants.data.len()); // let u = VariantsStats::new(&mut variants, &id, &config)?; // info!("++ {}", u.n); Ok(()) } /// Holds statistical data for somatic variant pipeline processing, /// including summary counts and input categorization. #[derive(Debug, Default, Clone)] pub struct SomaticPipeStats { /// Summary of input variant collections grouped by sample type. pub input: InputStats, /// Number of variants labeled as both constitutional and germline. pub n_constit_germline: usize, /// Number of variants in constitutional samples with low allele frequency. pub n_low_constit: usize, /// Number of variants in constitutional samples with high alternative allele count. pub n_high_alt_constit: usize, /// Number of high-alt constitutional variants that are also found in gnomAD. pub n_high_alt_constit_gnomad: usize, /// Number of variants filtered due to low entropy (indicative of low complexity regions). pub n_low_entropies: usize, } impl SomaticPipeStats { /// Initializes a `SomaticPipeStats` object with populated `InputStats` based on the /// provided `VariantCollection`s. pub fn init(collections: &[VariantCollection]) -> Self { Self { input: InputStats::from_collections(collections), ..Default::default() } } /// Generates a tumor-vs-germline annotation matrix and writes it to a JSON file. /// /// This method analyzes co-occurrence patterns between tumor and germline/constit /// variant callers by iterating through annotation statistics from `AnnotationsStats`. /// It builds a matrix where each row corresponds to a **tumor caller** and each column /// corresponds to a **germline or constitutional caller**. Each cell contains the number /// of variant calls that were annotated by both the tumor and the germline/constit caller. /// /// In addition to writing a structured JSON file to the specified path, the function also /// prints a tab-separated human-readable matrix to stdout for convenience. /// /// # Parameters /// /// - `stats`: A reference to [`AnnotationsStats`] containing categorical annotations. /// This object holds a frequency map where keys are combinations of `Annotation` /// values (serialized as strings) and values are occurrence counts. /// - `json_path`: The path where the resulting JSON output file should be written. /// /// # Output Formats /// /// ## JSON Output (`json_path`) /// /// The output JSON is an **array of tumor caller records**, each containing: /// - `caller_name`: The name of the tumor caller (as a string). /// - `germline`: An array of JSON objects, each with one key-value pair, /// where the key is a germline/constit caller (or combination) and the value is the count. /// /// ### Example /// ```json /// [ /// { /// "caller_name": "DeepVariant SoloTumor", /// "germline": [ /// {"ClairS Germline": 99978}, /// {"ClairS Germline + DeepVariant SoloConstit": 4710570} /// ] /// }, /// { /// "caller_name": "ClairS Somatic", /// "germline": [ /// {"ClairS Germline": 944}, /// {"ClairS Germline + DeepVariant SoloConstit": 362} /// ] /// } /// ] /// ``` /// /// ## Console Output (TSV) /// /// A tab-separated matrix is printed to stdout. Each row begins with a tumor caller, /// followed by columns showing germline/constit caller combinations and their counts. /// /// # Notes /// - Tumor callers are collected from `self.input.somatic` and `self.input.solo_tumor`. /// - Germline/constit callers are collected from `self.input.germline` and `self.input.solo_constit`. /// - Keys in the original `AnnotationsStats` map are split using `" + "` and parsed into `Annotation` enums. /// - Germline keys are sorted and joined to form canonical column labels. /// - Tumor and germline annotations are matched by exact `Annotation` equality. /// /// # Errors /// Returns an error if: /// - Any annotation key in `AnnotationsStats` fails to parse into a valid `Annotation`. /// - The JSON output file cannot be created or written to. /// /// # Dependencies /// Requires that `Annotation` implements `FromStr`, `ToString`, `PartialEq`, and `Clone`. /// /// # Example Usage /// ```rust /// let mut stats = SomaticPipeStats::init(&collections); /// stats.annot_init(&annotation_stats, "output/matrix.json")?; /// ``` pub fn annot_init(&self, stats: &AnnotationsStats, json_path: &str) -> anyhow::Result<()> { // Parse annotations from stats let stats: Vec<(Vec, u64)> = stats .categorical .iter() .map(|e| { let anns = e .key() .split(" + ") .map(|k| k.parse()) .collect::>>() .map_err(|err| { anyhow::anyhow!("Error while splitting key in AnnotationsStats.\n{err}") })?; Ok((anns, *e.value())) }) .collect::, u64)>>>()?; // Collect tumor and germline callers from input stats let callers_somatic_solo_tumor = [ self.input .somatic .iter() .map(|(caller, _)| caller.clone()) .collect::>(), self.input .solo_tumor .iter() .map(|(caller, _)| caller.clone()) .collect(), ] .concat(); let callers_germline_solo_constit = [ self.input .germline .iter() .map(|(caller, _)| caller.clone()) .collect::>(), self.input .solo_constit .iter() .map(|(caller, _)| caller.clone()) .collect(), ] .concat(); // Build a matrix of tumor vs germline hits let mut with_germline: HashMap> = HashMap::new(); stats.iter().for_each(|(anns, v)| { // Only proceed if this annotation includes a germline/constit sample if anns.iter().any(|a| { matches!( a, Annotation::Callers(_, Sample::SoloConstit) | Annotation::Callers(_, Sample::Germline) ) }) { // Find all tumor callers present in this annotation set let n_by_tumor: Vec<(String, u64)> = callers_somatic_solo_tumor .iter() .flat_map(|tumor| { if anns.contains(tumor) { vec![(tumor.to_string(), *v)] } else { vec![] } }) .collect(); // Build a normalized germline key let mut germline_caller: Vec = callers_germline_solo_constit .iter() .flat_map(|germ| { if anns.contains(germ) { vec![germ.to_string()] } else { vec![] } }) .collect(); germline_caller.sort(); let germline_caller = germline_caller.join(" + "); // Update matrix: tumor -> germline -> count n_by_tumor.iter().for_each(|(tumoral_caller, n)| { if let Some(row) = with_germline.get_mut(tumoral_caller) { if let Some(col) = row.get_mut(&germline_caller) { *col += *n; } else { row.insert(germline_caller.to_string(), *n); } } else { let mut row = HashMap::new(); row.insert(germline_caller.to_string(), *n); with_germline.insert(tumoral_caller.to_string(), row); } }); } }); // Extract all unique germline caller labels let mut germlines_callers: Vec = with_germline .iter() .flat_map(|(_, r)| { r.iter() .map(|(k, _)| k.to_string()) .collect::>() }) .collect(); germlines_callers.sort(); germlines_callers.dedup(); // Print a readable tab-separated matrix let mut json = Vec::new(); let mut lines: Vec = with_germline .iter() .map(|(tumor, row)| { json.push(format!( "{{\"caller_name\": \"{tumor}\", \"germline\": [{}] }}", germlines_callers .iter() .map(|g| { let v = row.get(g).unwrap_or(&0); format!("{{\"{g}\": {v}}}") }) .join(", ") )); format!( "{tumor}\t{}", germlines_callers .iter() .map(|g| { let v = row.get(g).unwrap_or(&0); format!("{g}: {v}") }) .join("\t") ) }) .collect(); lines.sort(); println!("{}", lines.join("\n")); // Write JSON to file let json = format!("[{}]", json.join(", ")); let mut file = File::create(json_path)?; file.write_all(json.as_bytes())?; Ok(()) } } /// Aggregated statistics on variant collections, grouped by sample type. /// /// The `InputStats` struct stores the number of variants observed for each /// caller, categorized by biological sample type: /// /// - `solo_tumor`: Variants from tumor-only samples. /// - `solo_constit`: Variants from normal-only (constitutional) samples. /// - `germline`: Variants classified as germline (shared between normal and tumor). /// - `somatic`: Variants classified as somatic (tumor-specific). /// /// Each vector contains tuples of the form `(Annotation, usize)`, where: /// - `Annotation` identifies the caller and sample type. /// - `usize` represents the number of variants for that annotation. #[derive(Debug, Default, Clone)] pub struct InputStats { /// Variants from tumor-only samples. pub solo_tumor: Vec<(Annotation, usize)>, /// Variants from normal-only (constitutional) samples. pub solo_constit: Vec<(Annotation, usize)>, /// Germline variants (present in both normal and tumor). pub germline: Vec<(Annotation, usize)>, /// Somatic variants (specific to tumor). pub somatic: Vec<(Annotation, usize)>, } impl InputStats { /// Constructs an `InputStats` instance by aggregating variant counts /// from a list of [`VariantCollection`]s. /// /// Each `VariantCollection` is inspected to determine its sample type, /// as indicated by the `Annotation::Callers(_, Sample)` variant. The /// number of variants (`collection.variants.len()`) is recorded in the /// appropriate field of `InputStats`, grouped by `Sample`. /// /// # Parameters /// /// - `collections`: A slice of [`VariantCollection`] objects containing /// variants annotated with a caller and sample type. /// /// # Returns /// /// A populated `InputStats` instance with counts per (caller, sample type). /// /// # Examples /// /// ``` /// let collections: Vec = load_collections(); /// let stats = InputStats::from_collections(&collections); /// println!("Somatic calls: {:?}", stats.somatic); /// ``` pub fn from_collections(collections: &[VariantCollection]) -> Self { let mut stats = Self::default(); for collection in collections.iter() { match collection.caller { Annotation::Callers(_, Sample::SoloTumor) => stats .solo_tumor .push((collection.caller.clone(), collection.variants.len())), Annotation::Callers(_, Sample::SoloConstit) => stats .solo_constit .push((collection.caller.clone(), collection.variants.len())), Annotation::Callers(_, Sample::Germline) => stats .germline .push((collection.caller.clone(), collection.variants.len())), Annotation::Callers(_, Sample::Somatic) => stats .somatic .push((collection.caller.clone(), collection.variants.len())), _ => (), }; } stats } }