| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016 |
- //! # Variant Statistics and Analysis
- //!
- //! This module provides statistical analysis, rate calculations, and data preparation
- //! for genomic variant collections, particularly focused on somatic variant analysis
- //! and generalized linear modeling (GLM) of mutation rates.
- //!
- //! ## Core Types
- //!
- //! ### Statistics Aggregation
- //!
- //! - [`VariantsStats`] - Comprehensive statistics for a variant collection including:
- //! - Variant counts by category (SNV, insertion, deletion, etc.)
- //! - Variant allele frequency (VAF) distributions
- //! - Read depth distributions
- //! - VEP consequence and impact distributions
- //! - Gene-level mutation counts
- //! - COSMIC and gnomAD frequency overlaps
- //! - Somatic mutation rate calculations
- //!
- //! ### Mutation Rate Analysis
- //!
- //! - [`SomaticVariantRates`] - Somatic mutation burden metrics:
- //! - Whole-genome mutation rate (mutations per megabase)
- //! - Coding region mutation rate
- //! - Nonsynonymous mutation rate
- //! - Exon coverage statistics
- //!
- //! ### GLM Data Preparation
- //!
- //! - [`GlmRow`] - Genomic region with mutation count and feature annotations for statistical modeling
- //! - Used to build datasets for generalized linear models (GLMs)
- //! - Enables analysis of mutation rate drivers (e.g., replication timing, chromatin state)
- //!
- //! ## Key Functions
- //!
- //! ### Mutation Rate Calculation
- //!
- //! - [`somatic_rates()`] - Calculate somatic mutation rates across WGS and coding regions
- //! - [`somatic_depth_quality_ranges()`] - Identify high-depth and low-quality genomic regions
- //!
- //! ### GLM Dataset Generation
- //!
- //! - [`make_glm_rows_from_regions_par()`] - Build GLM-ready dataset from variants and genomic bins
- //! - [`write_glm_rows()`] - Export GLM rows to CSV for downstream statistical analysis
- //!
- //! ### Utilities
- //!
- //! - [`ranges_from_consecutive_true_iter()`] - Convert boolean iterator to genomic ranges
- //! - [`merge_adjacent_ranges()`] - Merge adjacent genomic ranges
- //!
- //! ## Usage Examples
- //!
- //! ### Computing Variant Statistics
- //!
- //! ```ignore
- //! use pandora_lib_promethion::variant::variants_stats::VariantsStats;
- //! use pandora_lib_promethion::variant::variant_collection::Variants;
- //!
- //! let mut variants = clairs.variants(&annotations)?;
- //! let high_depth_ranges = Vec::new(); // or from somatic_depth_quality_ranges()
- //!
- //! let stats = VariantsStats::new(&mut variants, "sample_001", &config, &high_depth_ranges)?;
- //!
- //! println!("Total variants: {}", stats.n);
- //! println!("WGS mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_wgs);
- //! println!("Coding mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_coding);
- //! # Ok::<(), anyhow::Error>(())
- //! ```
- //!
- //! ### Calculating Somatic Mutation Rates
- //!
- //! ```ignore
- //! use pandora_lib_promethion::variant::variants_stats::somatic_rates;
- //! use pandora_lib_promethion::io::gff::features_ranges;
- //!
- //! // Load exon ranges from GFF
- //! let exon_ranges = features_ranges(&config.genes_gtf, "exon")?;
- //!
- //! // Calculate mutation rates
- //! let rates = somatic_rates(&variants.data, &exon_ranges, &config)?;
- //!
- //! println!("Total variants: {}", rates.total_variants);
- //! println!("Variants in coding: {}", rates.variants_in_coding);
- //! println!("Nonsynonymous rate: {:.2} mutations/Mb", rates.somatic_nonsynonymous_rate_coding);
- //! # Ok::<(), anyhow::Error>(())
- //! ```
- //!
- //! ### Identifying High-Depth Regions
- //!
- //! ```ignore
- //! use pandora_lib_promethion::variant::variants_stats::somatic_depth_quality_ranges;
- //!
- //! let (high_depth, low_quality) = somatic_depth_quality_ranges("sample_001", &config)?;
- //!
- //! println!("High-depth regions: {}", high_depth.len());
- //! println!("Low-quality regions: {}", low_quality.len());
- //! # Ok::<(), anyhow::Error>(())
- //! ```
- //!
- //! ### Preparing GLM Dataset
- //!
- //! ```ignore
- //! use pandora_lib_promethion::variant::variants_stats::{
- //! make_glm_rows_from_regions_par, write_glm_rows
- //! };
- //! use pandora_lib_promethion::annotation::{Annotation, ReplicationClass};
- //! use pandora_lib_promethion::io::bed::read_bed;
- //!
- //! // Load genomic bins (e.g., 1 Mb windows)
- //! let bins = read_bed("genome_1mb_bins.bed")?;
- //!
- //! // Define features to track
- //! let tracked_features = vec![
- //! Annotation::ReplicationTiming(ReplicationClass::Early),
- //! Annotation::ReplicationTiming(ReplicationClass::Late),
- //! Annotation::HighDepths,
- //! ];
- //!
- //! // Generate GLM dataset
- //! let (glm_rows, mutation_rates) = make_glm_rows_from_regions_par(
- //! &variants,
- //! &bins,
- //! &tracked_features,
- //! 2, // min_callers
- //! );
- //!
- //! // Save to CSV for R/Python analysis
- //! write_glm_rows(&glm_rows, "glm_data.csv")?;
- //!
- //! // Print per-feature mutation rates
- //! for (feature, rate) in mutation_rates {
- //! println!("{}: {:.2} mutations/Mb", feature, rate);
- //! }
- //! # Ok::<(), anyhow::Error>(())
- //! ```
- //!
- //! ## Statistical Modeling Workflow
- //!
- //! This module supports a complete workflow for modeling somatic mutation rates:
- //!
- //! 1. **Load variants** from callers (ClairS, DeepSomatic, etc.)
- //! 2. **Annotate variants** with VEP, COSMIC, gnomAD
- //! 3. **Compute statistics** using `VariantsStats::new()`
- //! 4. **Identify quality regions** using `somatic_depth_quality_ranges()`
- //! 5. **Prepare GLM dataset** using `make_glm_rows_from_regions_par()`
- //! 6. **Export to CSV** using `write_glm_rows()`
- //! 7. **Model in R/Python** using GLM/GAM frameworks
- //!
- //! ## Performance
- //!
- //! All major functions use Rayon for parallel processing:
- //! - `VariantsStats::new()` parallelizes per-variant aggregation
- //! - `somatic_depth_quality_ranges()` processes contigs in parallel
- //! - `make_glm_rows_from_regions_par()` parallelizes region processing with efficient binary search
- use std::collections::{BTreeMap, BTreeSet};
- use anyhow::Context;
- use csv::ByteRecord;
- use dashmap::DashMap;
- use log::debug;
- use ordered_float::OrderedFloat;
- use rayon::prelude::*;
- use serde::{Deserialize, Serialize, Serializer};
- use crate::{
- annotation::{vep::VepImpact, Annotation},
- config::Config,
- helpers::bin_data,
- io::{
- bed::read_bed, dict::read_dict, gff::features_ranges, readers::get_gz_reader,
- tsv::tsv_reader, writers::get_gz_writer,
- },
- positions::{
- contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par,
- GenomeRange,
- },
- scan::bin::{parse_bin_record_into, BinRowBuf},
- };
- use super::variant_collection::{Variant, Variants};
- #[derive(Debug, Clone, Serialize, Deserialize)]
- pub struct VariantsStats {
- pub n: u32,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub alteration_categories: DashMap<String, u32>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub cosmic: DashMap<u64, u32>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub n_alts: DashMap<u32, u32>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub depths: DashMap<u32, u32>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub vafs: DashMap<OrderedFloat<f32>, u32>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub vep_impact: DashMap<String, u32>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub consequences: DashMap<String, u32>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub genes: DashMap<String, u32>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub genes_nonsynonymus: DashMap<String, u32>,
- pub n_gnomad: usize,
- pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
- pub somatic_rates: SomaticVariantRates,
- pub high_depth_somatic_rates: SomaticVariantRates,
- pub mutation_rates: Vec<(String, (u32, usize))>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub insertions_len: DashMap<u32, u32>,
- #[serde(serialize_with = "serialize_dashmap_sort")]
- pub deletions_len: DashMap<u32, u32>,
- }
- pub fn serialize_dashmap_sort<S, T>(
- data: &DashMap<T, u32>,
- serializer: S,
- ) -> Result<S::Ok, S::Error>
- where
- S: Serializer,
- T: Serialize + Ord + std::hash::Hash + Clone,
- {
- let ordered: BTreeMap<_, _> = data
- .iter()
- .map(|entry| (entry.key().clone(), *entry.value()))
- .collect();
- ordered.serialize(serializer)
- }
- impl VariantsStats {
- pub fn new(
- variants: &mut Variants,
- _id: &str,
- config: &Config,
- high_depth_ranges: &[GenomeRange],
- ) -> anyhow::Result<Self> {
- let n = variants.data.len() as u32;
- let alteration_categories: DashMap<String, u32> = DashMap::new();
- let vep_impact: DashMap<String, u32> = DashMap::new();
- let genes: DashMap<String, u32> = DashMap::new();
- let genes_nonsynonymus: DashMap<String, u32> = DashMap::new();
- let consequences: DashMap<String, u32> = DashMap::new();
- let n_alts: DashMap<u32, u32> = DashMap::new();
- let depths: DashMap<u32, u32> = DashMap::new();
- let vafs: DashMap<OrderedFloat<f32>, u32> = DashMap::new();
- let cosmic: DashMap<u64, u32> = DashMap::new();
- let gnomads: DashMap<String, Vec<f64>> = DashMap::new();
- let context: DashMap<String, Vec<String>> = DashMap::new();
- let insertions_len: DashMap<u32, u32> = DashMap::new();
- let deletions_len: DashMap<u32, u32> = DashMap::new();
- variants.data.par_iter().for_each(|v| {
- // VEP
- if let Ok(best_vep) = v.best_vep() {
- if let Some(ref impact) = best_vep.extra.impact {
- *vep_impact.entry(impact.to_string()).or_default() += 1;
- }
- if let Some(ref gene) = best_vep.extra.symbol {
- *genes.entry(gene.to_string()).or_default() += 1;
- }
- if let (Some(impact), Some(gene)) = (best_vep.extra.impact, best_vep.extra.symbol) {
- if impact <= VepImpact::MODERATE {
- *genes_nonsynonymus.entry(gene).or_default() += 1;
- }
- }
- if let Some(csqs) = best_vep.consequence {
- let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
- csq.sort();
- csq.dedup();
- *consequences.entry(csq.join(", ")).or_default() += 1;
- }
- }
- // VAF
- let (n_alt, depth) = v.n_alt_depth();
- *n_alts.entry(n_alt as u32).or_default() += 1;
- *depths.entry(depth as u32).or_default() += 1;
- let vaf = OrderedFloat::from(((n_alt * 1_000.0 / depth).round() / 10.0) as f32);
- if !vaf.is_nan() {
- *vafs.entry(vaf).or_default() += 1;
- }
- let callers = v.callers();
- // Annotations
- v.annotations.iter().for_each(|annotation| {
- match annotation {
- Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
- Annotation::GnomAD(v) => {
- v.to_vec()
- .iter()
- .map(|e| e.to_string_value_pair())
- .for_each(|(key, value)| {
- gnomads.entry(key).or_default().push(value);
- });
- }
- Annotation::TriNucleotides(bases) => context
- .entry(bases.iter().map(|v| v.to_string()).collect())
- .or_default()
- .push(callers.clone()),
- _ => (),
- };
- });
- // Alteration category
- let mut alteration_category_str = v
- .alteration_category()
- .iter()
- .map(|c| c.to_string())
- .collect::<Vec<String>>();
- alteration_category_str.sort();
- alteration_category_str.dedup();
- *alteration_categories
- .entry(alteration_category_str.join(", "))
- .or_default() += 1;
- if let Some(len) = v.insertion_length() {
- *insertions_len.entry(len).or_default() += 1;
- }
- if let Some(len) = v.deletion_length() {
- *deletions_len.entry(len).or_default() += 1;
- }
- });
- let mut n_gnomad = 0;
- let gnomad = gnomads
- .iter()
- .map(|e| {
- let data = e.value().to_vec();
- n_gnomad = data.len();
- (e.key().to_string(), bin_data(data, 0.0001))
- })
- .collect();
- let exon_ranges = features_ranges("exon", config)?;
- let exon_ranges = merge_overlapping_genome_ranges(&exon_ranges);
- let all_somatic_rates = somatic_rates(&variants.data, &exon_ranges, config)?;
- let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
- let exons_high_depth = range_intersection_par(
- &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
- &exon_ranges_ref,
- );
- let high_depth = somatic_rates(&variants.data, &exons_high_depth, config)?;
- let mut mutation_rates = Vec::new();
- // HighDepths
- let ann = Annotation::HighDepth;
- let res = variants.annotate_with_ranges(
- high_depth_ranges,
- Some(ann.clone()),
- config.min_n_callers,
- Vec::new(),
- );
- mutation_rates.push((ann.to_string(), res));
- // Exons
- let res =
- variants.annotate_with_ranges(&exon_ranges, None, config.min_n_callers, Vec::new());
- mutation_rates.push(("Exons".to_string(), res));
- // ExonsHighDepths
- let res = variants.annotate_with_ranges(
- &exons_high_depth,
- None,
- config.min_n_callers,
- Vec::new(),
- );
- mutation_rates.push(("Exons HighDepths".to_string(), res));
- for (name, path) in config.panels.iter() {
- let panel_ranges: Vec<GenomeRange> =
- read_bed(path)?.into_iter().map(|e| e.range).collect();
- let ann = Annotation::Panel(name.to_string());
- let res = variants.annotate_with_ranges(
- &panel_ranges,
- Some(ann.clone()),
- config.min_n_callers,
- Vec::new(),
- );
- mutation_rates.push((ann.to_string(), res));
- let panel_ranges: Vec<GenomeRange> = range_intersection_par(
- &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
- &panel_ranges.iter().collect::<Vec<&GenomeRange>>(),
- );
- let ann = Annotation::Panel(format!("{}_HighDepths", name));
- let res = variants.annotate_with_ranges(
- &panel_ranges,
- Some(ann.clone()),
- config.min_n_callers,
- Vec::new(),
- );
- mutation_rates.push((ann.to_string(), res));
- }
- // mutation_rates.sort_by(|(_, (_, an)), (_, (_, bn))| an.cmp(bn));
- // Output rates per megabase
- for (feature, (bp, n)) in &mutation_rates {
- let rate = (*n as f64) / ((*bp as f64) / 1e6);
- println!("{feature}: {rate:.2} mutations/Mb ({n} / {bp})");
- }
- // let (glm_rows, rates_per_mb) =
- // make_glm_rows_from_regions_par(variants, &high_depth_ranges, &tracked_annotations, config.min_n_callers);
- //
- // // Output rates per megabase
- // for (feature, rate) in &rates_per_mb {
- // println!("{feature}: {rate:.2} mutations/Mb");
- // }
- //
- // write_glm_rows(
- // &glm_rows,
- // &format!("{}/{id}_glm_rows.csv", config.somatic_pipe_stats(id)),
- // )?;
- Ok(Self {
- n,
- alteration_categories,
- cosmic,
- gnomad,
- vafs,
- n_alts,
- depths,
- vep_impact,
- consequences,
- genes,
- genes_nonsynonymus,
- n_gnomad,
- somatic_rates: all_somatic_rates,
- high_depth_somatic_rates: high_depth,
- mutation_rates,
- insertions_len,
- deletions_len,
- })
- }
- pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
- let mut writer = get_gz_writer(filename, true)
- .with_context(|| anyhow::anyhow!("Failed to create file: {}", filename))?;
- serde_json::to_writer(&mut writer, self)
- .with_context(|| anyhow::anyhow!("Failed to serialize JSON to file: {}", filename))?;
- writer.close().with_context(|| {
- anyhow::anyhow!("Failed to close BGZF writer for file: {}", filename)
- })?;
- debug!("Successfully saved variants to {}", filename);
- Ok(())
- }
- pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
- let mut reader = get_gz_reader(filename)
- .with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;
- let variants: Self = serde_json::from_reader(&mut reader)
- .with_context(|| format!("Failed to deserialize JSON from file: {}", filename))?;
- debug!("Successfully loaded variants from {}", filename);
- Ok(variants)
- }
- }
- #[derive(Debug, Serialize, Deserialize, Clone)]
- pub struct SomaticVariantRates {
- pub wgs_length: u32,
- pub total_variants: usize,
- pub somatic_mutation_rate_wgs: f64,
- pub exon_count: usize,
- pub total_exon_bases: u32,
- pub variants_in_coding: usize,
- pub somatic_mutation_rate_coding: f64,
- pub nonsynonymous_variants: usize,
- pub somatic_nonsynonymous_rate_coding: f64,
- }
- pub fn somatic_rates(
- variants: &[Variant],
- exon_ranges: &Vec<GenomeRange>,
- config: &Config,
- ) -> anyhow::Result<SomaticVariantRates> {
- let ol = par_overlaps(variants, exon_ranges);
- let n_coding = ol
- .iter()
- .filter_map(|i| variants[*i].best_vep().ok())
- .filter_map(|bv| bv.impact())
- .filter(|impact| *impact <= VepImpact::MODERATE)
- .count();
- let n_bases_m: u32 = exon_ranges.par_iter().map(|gr| gr.length()).sum();
- let mega_base_m = n_bases_m as f64 / 10.0e6;
- let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
- let rate_wgs = variants.len() as f64 / (wgs_len as f64 / 10.0e6);
- let n_exons_mb = ol.len() as f64 / mega_base_m;
- let n_coding_mb = n_coding as f64 / mega_base_m;
- Ok(SomaticVariantRates {
- total_variants: variants.len(),
- exon_count: exon_ranges.len(),
- variants_in_coding: ol.len(),
- nonsynonymous_variants: n_coding,
- total_exon_bases: n_bases_m,
- wgs_length: wgs_len,
- somatic_mutation_rate_wgs: rate_wgs,
- somatic_mutation_rate_coding: n_exons_mb,
- somatic_nonsynonymous_rate_coding: n_coding_mb,
- })
- }
- /// Computes genomic ranges with sufficient depth and with excessive low-quality reads
- /// by jointly scanning normal and tumoral per-contig depth TSV files.
- ///
- /// For each contig, the function reads the corresponding gzipped TSV files from the
- /// normal and tumoral directories, iterates over bins in lockstep, and validates that
- /// both files are perfectly aligned (same contig, same start position, same bin layout).
- ///
- /// Two kinds of ranges are produced:
- /// - **High-quality depth ranges**: consecutive bins where *both* normal and tumoral
- /// depths are greater than or equal to `config.min_high_quality_depth`.
- /// - **Low-quality excess ranges**: consecutive bins where *both* normal and tumoral
- /// bins contain more low-quality reads than `config.max_depth_low_quality`.
- ///
- /// Contigs are processed in parallel. Within each contig, adjacent or overlapping
- /// ranges are merged before being returned.
- ///
- /// # Errors
- ///
- /// Returns an error if:
- /// - A normal or tumoral TSV file cannot be opened or decompressed.
- /// - A TSV record cannot be parsed.
- /// - The normal and tumoral files differ in number of lines.
- /// - Corresponding records disagree on contig name, start position, or bin structure
- /// (depth or low-quality vector length).
- ///
- /// # Returns
- ///
- /// A tuple `(high_depth_ranges, lowq_excess_ranges)` where each element is a `Vec<GenomeRange>`
- /// spanning all contigs.
- ///
- /// # Notes
- ///
- /// - Input TSV files are expected to be tab-delimited, headerless, and sorted by
- /// genomic position.
- /// - Low-quality ranges represent **excess low-quality signal** (bins exceeding the
- /// configured threshold), not acceptable regions.
- /// - The implementation reuses parsing buffers and avoids per-line allocations for
- /// performance.
- ///
- /// # Panics
- ///
- /// This function does not intentionally panic.
- pub fn somatic_depth_quality_ranges(
- id: &str,
- config: &Config,
- ) -> anyhow::Result<(Vec<GenomeRange>, Vec<GenomeRange>)> {
- // chr1..chr22 + X,Y,M
- let contigs: Vec<String> = read_dict(&config.dict_file)?
- .into_iter()
- .map(|(sn, _ln)| sn)
- .collect();
- let cfg = config;
- let per_contig = contigs
- .into_par_iter()
- .map(|contig| {
- let normal_path = format!("{}/{}_count.tsv.gz", cfg.normal_dir_count(id), contig);
- let tumor_path = format!("{}/{}_count.tsv.gz", cfg.tumoral_dir_count(id), contig);
- let normal_rdr = get_gz_reader(&normal_path)
- .with_context(|| format!("Failed to open normal file: {}", normal_path))?;
- let tumor_rdr = get_gz_reader(&tumor_path)
- .with_context(|| format!("Failed to open tumor file: {}", tumor_path))?;
- let mut high_runs: Vec<GenomeRange> = Vec::new();
- let mut lowq_runs: Vec<GenomeRange> = Vec::new();
- let mut n_tsv = tsv_reader(normal_rdr); // normal_rdr: impl Read
- let mut t_tsv = tsv_reader(tumor_rdr);
- let mut n_rec = ByteRecord::new();
- let mut t_rec = ByteRecord::new();
- let mut n_buf = BinRowBuf::default();
- let mut t_buf = BinRowBuf::default();
- let mut line_no = 0usize;
- loop {
- let n_ok = n_tsv.read_byte_record(&mut n_rec)?;
- let t_ok = t_tsv.read_byte_record(&mut t_rec)?;
- if n_ok || t_ok {
- line_no += 1;
- }
- match (n_ok, t_ok) {
- (false, false) => break,
- (true, false) => {
- anyhow::bail!("{} has extra lines at {}", normal_path, line_no)
- }
- (false, true) => anyhow::bail!("{} has extra lines at {}", tumor_path, line_no),
- (true, true) => {
- let (n_start, n_depths, n_lowq) =
- parse_bin_record_into(&n_rec, &mut n_buf, &contig)
- .with_context(|| format!("{} line {}", normal_path, line_no))?;
- let (t_start, t_depths, t_lowq) =
- parse_bin_record_into(&t_rec, &mut t_buf, &contig)
- .with_context(|| format!("{} line {}", tumor_path, line_no))?;
- anyhow::ensure!(n_start == t_start, "start mismatch at line {}", line_no);
- anyhow::ensure!(
- n_depths.len() == t_depths.len(),
- "depth len mismatch at line {}",
- line_no
- );
- anyhow::ensure!(
- n_lowq.len() == t_lowq.len(),
- "lowq len mismatch at line {}",
- line_no
- );
- let high_mask_iter = n_depths.iter().zip(t_depths).map(|(&nd, &td)| {
- nd >= cfg.min_high_quality_depth && td >= cfg.min_high_quality_depth
- });
- let lowq_mask_iter = n_lowq.iter().zip(t_lowq).map(|(&nq, &tq)| {
- nq > cfg.max_depth_low_quality && tq > cfg.max_depth_low_quality
- });
- high_runs.extend(ranges_from_consecutive_true_iter(
- high_mask_iter,
- n_start,
- &contig,
- ));
- lowq_runs.extend(ranges_from_consecutive_true_iter(
- lowq_mask_iter,
- n_start,
- &contig,
- ));
- }
- }
- }
- // Merge adjacent/overlapping ranges within this contig
- high_runs = merge_adjacent_ranges(high_runs);
- lowq_runs = merge_adjacent_ranges(lowq_runs);
- Ok((high_runs, lowq_runs))
- })
- .collect::<anyhow::Result<Vec<_>>>()?;
- // Flatten
- let (high_all, low_all): (Vec<_>, Vec<_>) = per_contig.into_iter().unzip();
- Ok((
- high_all.into_iter().flatten().collect(),
- low_all.into_iter().flatten().collect(),
- ))
- }
- /// Iterator-based version (no temporary Vec<bool>).
- /// Produces end-exclusive ranges in the same shape as your old function.
- pub fn ranges_from_consecutive_true_iter<I>(mask: I, start0: u32, contig: &str) -> Vec<GenomeRange>
- where
- I: IntoIterator<Item = bool>,
- {
- let contig = contig_to_num(contig);
- let mut ranges = Vec::new();
- let mut current_start: Option<u32> = None;
- let mut i: u32 = 0;
- for v in mask {
- if v {
- if current_start.is_none() {
- current_start = Some(start0 + i);
- }
- } else if let Some(s) = current_start.take() {
- ranges.push(GenomeRange {
- contig,
- range: s..(start0 + i),
- });
- }
- i += 1;
- }
- if let Some(s) = current_start {
- ranges.push(GenomeRange {
- contig,
- range: s..(start0 + i),
- });
- }
- ranges
- }
- /// Merge overlapping or touching ranges per contig (end-exclusive).
- pub fn merge_adjacent_ranges(mut ranges: Vec<GenomeRange>) -> Vec<GenomeRange> {
- if ranges.is_empty() {
- return ranges;
- }
- ranges.sort_by(|a, b| (a.contig, a.range.start).cmp(&(b.contig, b.range.start)));
- let mut merged = Vec::with_capacity(ranges.len());
- let mut cur = ranges[0].clone();
- for r in ranges.into_iter().skip(1) {
- if r.contig == cur.contig && r.range.start <= cur.range.end {
- if r.range.end > cur.range.end {
- cur.range.end = r.range.end;
- }
- } else {
- merged.push(cur);
- cur = r;
- }
- }
- merged.push(cur);
- merged
- }
- /// A region-level data structure for modeling mutation rates using GLMs or other statistical models.
- ///
- /// Each `GlmRow` represents a genomic interval (e.g., from a BED file) and contains:
- /// - The region's coordinates (`contig`, `start`, `end`)
- /// - Its length in base pairs
- /// - The number of somatic mutations overlapping the region
- /// - A list of binary feature labels (e.g., "Early", "HighGC")
- ///
- /// This structure is designed to be serializable (e.g., to CSV) for downstream statistical modeling.
- ///
- /// # Example
- /// ```
- /// GlmRow {
- /// contig: "chr1".to_string(),
- /// start: 100_000,
- /// end: 101_000,
- /// length: 1000,
- /// mutation_count: 3,
- /// features: vec!["Early".into(), "HighGC".into()],
- /// }
- /// ```
- #[derive(Debug, Serialize)]
- pub struct GlmRow {
- /// Chromosome name (e.g., "chr1")
- pub contig: String,
- /// Start coordinate (0-based, inclusive)
- pub start: u32,
- /// End coordinate (0-based, exclusive)
- pub end: u32,
- /// Length of the region in base pairs (end - start)
- pub length: usize,
- /// Number of variants overlapping this region
- pub mutation_count: usize,
- /// Binary feature labels associated with this region
- pub features: Vec<String>,
- }
- /// Builds a GLM-ready table from a set of fixed genomic regions and a list of annotated variants,
- /// in parallel, using efficient binary search-based double-pointer traversal.
- ///
- /// For each region, this function:
- /// - Counts how many variants fall within the region
- /// - Extracts the relevant annotations (features) from overlapping variants
- /// - Produces one `GlmRow` with coordinates, mutation count, and feature labels
- ///
- /// Additionally, it computes the **mutation rate per megabase** for each tracked annotation
- /// by aggregating counts and total base coverage across regions where each feature is present.
- ///
- /// # Assumptions
- /// - `variants.data` must be sorted by `position.contig` then `position.position`
- /// - `regions` must be sorted by `range.contig` then `range.start`
- ///
- /// # Arguments
- /// * `variants` - A list of annotated somatic variants (sorted).
- /// * `regions` - A set of genomic bins or intervals (e.g., from BED) used as modeling units (sorted).
- /// * `tracked_annotations` - A list of `Annotation` values to track as binary features.
- ///
- /// # Returns
- /// A tuple:
- /// * `Vec<GlmRow>` — one per region, with mutation count and feature labels.
- /// * `Vec<(String, f64)>` — per-feature mutation rate (mutations per megabase).
- ///
- /// # Example
- /// ```rust
- /// let (rows, rates) = make_glm_rows_from_regions_par(
- /// &variants,
- /// &genome_bins,
- /// &[
- /// Annotation::ReplicationTiming(ReplicationClass::Early),
- /// Annotation::HighDepths
- /// ]
- /// );
- ///
- /// for (feature, rate) in rates {
- /// println!("{feature}: {:.2} mutations/Mb", rate);
- /// }
- /// ```
- pub fn make_glm_rows_from_regions_par(
- variants: &Variants,
- regions: &[GenomeRange],
- tracked_annotations: &[Annotation],
- min_callers: u8,
- ) -> (Vec<GlmRow>, Vec<(String, f64)>) {
- let (glm_rows, feature_stats): (Vec<GlmRow>, BTreeMap<String, (usize, usize)>) = regions
- .par_iter()
- .map(|region| {
- let mut mutation_count = 0;
- let mut feature_set = BTreeSet::new();
- // Binary search into sorted variant list
- let start_idx = variants.data.partition_point(|v| {
- v.position.contig < region.contig
- || (v.position.contig == region.contig
- && v.position.position < region.range.start)
- });
- for var in &variants.data[start_idx..] {
- let pos = &var.position;
- if pos.contig != region.contig || pos.position >= region.range.end {
- break;
- }
- if region.range.contains(&pos.position) && var.n_callers() >= min_callers {
- mutation_count += 1;
- for ann in &var.annotations {
- if tracked_annotations.contains(ann) {
- feature_set.insert(ann.to_string());
- }
- }
- }
- }
- let region_len = region.length() as usize;
- let row = GlmRow {
- contig: region.contig.to_string(),
- start: region.range.start,
- end: region.range.end,
- length: region_len,
- mutation_count,
- features: feature_set.clone().into_iter().collect(),
- };
- // Local (per-thread) accumulation of stats
- let mut stats = BTreeMap::new();
- for feat in &feature_set {
- stats.insert(feat.clone(), (mutation_count, region_len));
- }
- (vec![row], stats)
- })
- .reduce(
- || (Vec::new(), BTreeMap::new()),
- |(mut rows_a, mut stats_a), (mut rows_b, stats_b)| {
- rows_a.append(&mut rows_b);
- for (feat, (n, bp)) in stats_b {
- stats_a
- .entry(feat)
- .and_modify(|(na, bpa)| {
- *na += n;
- *bpa += bp;
- })
- .or_insert((n, bp));
- }
- (rows_a, stats_a)
- },
- );
- let ranges_len: usize = regions.iter().map(|a| a.length() as usize).sum();
- assert_eq!(10e6 as usize, 1_000_000);
- // Compute mutation rates per Mb from aggregate stats
- let rates_per_mb: Vec<(String, f64)> = feature_stats
- .into_iter()
- .map(|(feat, (n, bp))| {
- debug!("{bp}");
- let rate = if bp > 0 {
- (n as f64) / ((ranges_len as f64) / 10e6)
- } else {
- 0.0
- };
- (feat, rate)
- })
- .collect();
- (glm_rows, rates_per_mb)
- }
- /// Returns a flat table: one-hot encoded feature columns.
- fn flatten_glm_rows(
- rows: &[GlmRow],
- ) -> (Vec<String>, Vec<std::collections::HashMap<String, String>>) {
- let mut all_features: BTreeSet<String> = BTreeSet::new();
- for r in rows {
- all_features.extend(r.features.iter().cloned());
- }
- let feature_list: Vec<_> = all_features.into_iter().collect();
- let mut table: Vec<_> = rows
- .iter()
- .map(|r| {
- let mut row = std::collections::HashMap::new();
- row.insert("contig".into(), r.contig.clone());
- row.insert("start".into(), r.start.to_string());
- row.insert("end".into(), r.end.to_string());
- row.insert("length".into(), r.length.to_string());
- row.insert("mutation_count".into(), r.mutation_count.to_string());
- row.insert("log_length".into(), (r.length as f64).ln().to_string());
- for f in &feature_list {
- row.insert(
- f.clone(),
- if r.features.contains(f) { "1" } else { "0" }.into(),
- );
- }
- row
- })
- .collect();
- // Parallel sort by contig and start
- table.par_sort_by(|a, b| {
- let ca = a.get("contig").unwrap();
- let cb = b.get("contig").unwrap();
- let sa = a.get("start").unwrap().parse::<u32>().unwrap();
- let sb = b.get("start").unwrap().parse::<u32>().unwrap();
- (ca, sa).cmp(&(cb, sb))
- });
- (feature_list, table)
- }
- pub fn write_glm_rows(all_rows: &[GlmRow], csv_path: &str) -> anyhow::Result<()> {
- let (features, flat_rows) = flatten_glm_rows(all_rows);
- let mut writer = csv::Writer::from_path(csv_path)?;
- let mut headers = vec![
- "contig",
- "start",
- "end",
- "length",
- "log_length",
- "mutation_count",
- ];
- headers.extend(features.iter().map(|s| s.as_str()));
- writer.write_record(&headers)?;
- for row in flat_rows {
- let values: Vec<_> = headers.iter().map(|&h| row.get(h).unwrap()).collect();
- writer.write_record(values)?;
- }
- writer.flush()?;
- Ok(())
- }
- #[cfg(test)]
- mod tests {
- use log::info;
- use super::*;
- use crate::{helpers::test_init, positions::GenomeRangeExt};
- #[test]
- fn high_depths() -> anyhow::Result<()> {
- test_init();
- let config = Config::default();
- let id = "DUMCO";
- let (mut high_depth_ranges, mut low_quality_ranges) =
- somatic_depth_quality_ranges(id, &config)?;
- high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
- low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
- // let high_depth_ranges = merge_adjacent_ranges(high_depth_ranges);
- // let low_quality_ranges = merge_adjacent_ranges(low_quality_ranges);
- info!(
- "High-depth ranges: n={} bp={}\nLowQ ranges: n={} bp={}",
- high_depth_ranges.len(),
- high_depth_ranges.total_len(),
- low_quality_ranges.len(),
- low_quality_ranges.total_len(),
- );
- high_depth_ranges.iter().take(10).for_each(|e| println!("{e:?}"));
- low_quality_ranges.iter().take(10).for_each(|e| println!("{e:?}"));
- Ok(())
- }
- }
|