|
|
@@ -1,3 +1,158 @@
|
|
|
+//! # Variant Statistics and Analysis
|
|
|
+//!
|
|
|
+//! This module provides statistical analysis, rate calculations, and data preparation
|
|
|
+//! for genomic variant collections, particularly focused on somatic variant analysis
|
|
|
+//! and generalized linear modeling (GLM) of mutation rates.
|
|
|
+//!
|
|
|
+//! ## Core Types
|
|
|
+//!
|
|
|
+//! ### Statistics Aggregation
|
|
|
+//!
|
|
|
+//! - [`VariantsStats`] - Comprehensive statistics for a variant collection including:
|
|
|
+//! - Variant counts by category (SNV, insertion, deletion, etc.)
|
|
|
+//! - Variant allele frequency (VAF) distributions
|
|
|
+//! - Read depth distributions
|
|
|
+//! - VEP consequence and impact distributions
|
|
|
+//! - Gene-level mutation counts
|
|
|
+//! - COSMIC and gnomAD frequency overlaps
|
|
|
+//! - Somatic mutation rate calculations
|
|
|
+//!
|
|
|
+//! ### Mutation Rate Analysis
|
|
|
+//!
|
|
|
+//! - [`SomaticVariantRates`] - Somatic mutation burden metrics:
|
|
|
+//! - Whole-genome mutation rate (mutations per megabase)
|
|
|
+//! - Coding region mutation rate
|
|
|
+//! - Nonsynonymous mutation rate
|
|
|
+//! - Exon coverage statistics
|
|
|
+//!
|
|
|
+//! ### GLM Data Preparation
|
|
|
+//!
|
|
|
+//! - [`GlmRow`] - Genomic region with mutation count and feature annotations for statistical modeling
|
|
|
+//! - Used to build datasets for generalized linear models (GLMs)
|
|
|
+//! - Enables analysis of mutation rate drivers (e.g., replication timing, chromatin state)
|
|
|
+//!
|
|
|
+//! ## Key Functions
|
|
|
+//!
|
|
|
+//! ### Mutation Rate Calculation
|
|
|
+//!
|
|
|
+//! - [`somatic_rates()`] - Calculate somatic mutation rates across WGS and coding regions
|
|
|
+//! - [`somatic_depth_quality_ranges()`] - Identify high-depth and low-quality genomic regions
|
|
|
+//!
|
|
|
+//! ### GLM Dataset Generation
|
|
|
+//!
|
|
|
+//! - [`make_glm_rows_from_regions_par()`] - Build GLM-ready dataset from variants and genomic bins
|
|
|
+//! - [`write_glm_rows()`] - Export GLM rows to CSV for downstream statistical analysis
|
|
|
+//!
|
|
|
+//! ### Utilities
|
|
|
+//!
|
|
|
+//! - [`ranges_from_consecutive_true_iter()`] - Convert boolean iterator to genomic ranges
|
|
|
+//! - [`merge_adjacent_ranges()`] - Merge adjacent genomic ranges
|
|
|
+//!
|
|
|
+//! ## Usage Examples
|
|
|
+//!
|
|
|
+//! ### Computing Variant Statistics
|
|
|
+//!
|
|
|
+//! ```ignore
|
|
|
+//! use pandora_lib_promethion::variant::variants_stats::VariantsStats;
|
|
|
+//! use pandora_lib_promethion::variant::variant_collection::Variants;
|
|
|
+//!
|
|
|
+//! let mut variants = clairs.variants(&annotations)?;
|
|
|
+//! let high_depth_ranges = Vec::new(); // or from somatic_depth_quality_ranges()
|
|
|
+//!
|
|
|
+//! let stats = VariantsStats::new(&mut variants, "sample_001", &config, &high_depth_ranges)?;
|
|
|
+//!
|
|
|
+//! println!("Total variants: {}", stats.n);
|
|
|
+//! println!("WGS mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_wgs);
|
|
|
+//! println!("Coding mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_coding);
|
|
|
+//! # Ok::<(), anyhow::Error>(())
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! ### Calculating Somatic Mutation Rates
|
|
|
+//!
|
|
|
+//! ```ignore
|
|
|
+//! use pandora_lib_promethion::variant::variants_stats::somatic_rates;
|
|
|
+//! use pandora_lib_promethion::io::gff::features_ranges;
|
|
|
+//!
|
|
|
+//! // Load exon ranges from GFF
|
|
|
+//! let exon_ranges = features_ranges(&config.genes_gtf, "exon")?;
|
|
|
+//!
|
|
|
+//! // Calculate mutation rates
|
|
|
+//! let rates = somatic_rates(&variants.data, &exon_ranges, &config)?;
|
|
|
+//!
|
|
|
+//! println!("Total variants: {}", rates.total_variants);
|
|
|
+//! println!("Variants in coding: {}", rates.variants_in_coding);
|
|
|
+//! println!("Nonsynonymous rate: {:.2} mutations/Mb", rates.somatic_nonsynonymous_rate_coding);
|
|
|
+//! # Ok::<(), anyhow::Error>(())
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! ### Identifying High-Depth Regions
|
|
|
+//!
|
|
|
+//! ```ignore
|
|
|
+//! use pandora_lib_promethion::variant::variants_stats::somatic_depth_quality_ranges;
|
|
|
+//!
|
|
|
+//! let (high_depth, low_quality) = somatic_depth_quality_ranges("sample_001", &config)?;
|
|
|
+//!
|
|
|
+//! println!("High-depth regions: {}", high_depth.len());
|
|
|
+//! println!("Low-quality regions: {}", low_quality.len());
|
|
|
+//! # Ok::<(), anyhow::Error>(())
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! ### Preparing GLM Dataset
|
|
|
+//!
|
|
|
+//! ```ignore
|
|
|
+//! use pandora_lib_promethion::variant::variants_stats::{
|
|
|
+//! make_glm_rows_from_regions_par, write_glm_rows
|
|
|
+//! };
|
|
|
+//! use pandora_lib_promethion::annotation::{Annotation, ReplicationClass};
|
|
|
+//! use pandora_lib_promethion::io::bed::read_bed;
|
|
|
+//!
|
|
|
+//! // Load genomic bins (e.g., 1 Mb windows)
|
|
|
+//! let bins = read_bed("genome_1mb_bins.bed")?;
|
|
|
+//!
|
|
|
+//! // Define features to track
|
|
|
+//! let tracked_features = vec![
|
|
|
+//! Annotation::ReplicationTiming(ReplicationClass::Early),
|
|
|
+//! Annotation::ReplicationTiming(ReplicationClass::Late),
|
|
|
+//! Annotation::HighDepths,
|
|
|
+//! ];
|
|
|
+//!
|
|
|
+//! // Generate GLM dataset
|
|
|
+//! let (glm_rows, mutation_rates) = make_glm_rows_from_regions_par(
|
|
|
+//! &variants,
|
|
|
+//! &bins,
|
|
|
+//! &tracked_features,
|
|
|
+//! 2, // min_callers
|
|
|
+//! );
|
|
|
+//!
|
|
|
+//! // Save to CSV for R/Python analysis
|
|
|
+//! write_glm_rows(&glm_rows, "glm_data.csv")?;
|
|
|
+//!
|
|
|
+//! // Print per-feature mutation rates
|
|
|
+//! for (feature, rate) in mutation_rates {
|
|
|
+//! println!("{}: {:.2} mutations/Mb", feature, rate);
|
|
|
+//! }
|
|
|
+//! # Ok::<(), anyhow::Error>(())
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! ## Statistical Modeling Workflow
|
|
|
+//!
|
|
|
+//! This module supports a complete workflow for modeling somatic mutation rates:
|
|
|
+//!
|
|
|
+//! 1. **Load variants** from callers (ClairS, DeepSomatic, etc.)
|
|
|
+//! 2. **Annotate variants** with VEP, COSMIC, gnomAD
|
|
|
+//! 3. **Compute statistics** using `VariantsStats::new()`
|
|
|
+//! 4. **Identify quality regions** using `somatic_depth_quality_ranges()`
|
|
|
+//! 5. **Prepare GLM dataset** using `make_glm_rows_from_regions_par()`
|
|
|
+//! 6. **Export to CSV** using `write_glm_rows()`
|
|
|
+//! 7. **Model in R/Python** using GLM/GAM frameworks
|
|
|
+//!
|
|
|
+//! ## Performance
|
|
|
+//!
|
|
|
+//! All major functions use Rayon for parallel processing:
|
|
|
+//! - `VariantsStats::new()` parallelizes per-variant aggregation
|
|
|
+//! - `somatic_depth_quality_ranges()` processes contigs in parallel
|
|
|
+//! - `make_glm_rows_from_regions_par()` parallelizes region processing with efficient binary search
|
|
|
+
|
|
|
use std::{
|
|
|
collections::{BTreeMap, BTreeSet},
|
|
|
io::BufRead,
|