Browse Source

stats variants

Thomas 1 day ago
parent
commit
453f3db650
1 changed files with 155 additions and 0 deletions
  1. 155 0
      src/variant/variants_stats.rs

+ 155 - 0
src/variant/variants_stats.rs

@@ -1,3 +1,158 @@
+//! # Variant Statistics and Analysis
+//!
+//! This module provides statistical analysis, rate calculations, and data preparation
+//! for genomic variant collections, particularly focused on somatic variant analysis
+//! and generalized linear modeling (GLM) of mutation rates.
+//!
+//! ## Core Types
+//!
+//! ### Statistics Aggregation
+//!
+//! - [`VariantsStats`] - Comprehensive statistics for a variant collection including:
+//!   - Variant counts by category (SNV, insertion, deletion, etc.)
+//!   - Variant allele frequency (VAF) distributions
+//!   - Read depth distributions
+//!   - VEP consequence and impact distributions
+//!   - Gene-level mutation counts
+//!   - COSMIC and gnomAD frequency overlaps
+//!   - Somatic mutation rate calculations
+//!
+//! ### Mutation Rate Analysis
+//!
+//! - [`SomaticVariantRates`] - Somatic mutation burden metrics:
+//!   - Whole-genome mutation rate (mutations per megabase)
+//!   - Coding region mutation rate
+//!   - Nonsynonymous mutation rate
+//!   - Exon coverage statistics
+//!
+//! ### GLM Data Preparation
+//!
+//! - [`GlmRow`] - Genomic region with mutation count and feature annotations for statistical modeling
+//!   - Used to build datasets for generalized linear models (GLMs)
+//!   - Enables analysis of mutation rate drivers (e.g., replication timing, chromatin state)
+//!
+//! ## Key Functions
+//!
+//! ### Mutation Rate Calculation
+//!
+//! - [`somatic_rates()`] - Calculate somatic mutation rates across WGS and coding regions
+//! - [`somatic_depth_quality_ranges()`] - Identify high-depth and low-quality genomic regions
+//!
+//! ### GLM Dataset Generation
+//!
+//! - [`make_glm_rows_from_regions_par()`] - Build GLM-ready dataset from variants and genomic bins
+//! - [`write_glm_rows()`] - Export GLM rows to CSV for downstream statistical analysis
+//!
+//! ### Utilities
+//!
+//! - [`ranges_from_consecutive_true_iter()`] - Convert boolean iterator to genomic ranges
+//! - [`merge_adjacent_ranges()`] - Merge adjacent genomic ranges
+//!
+//! ## Usage Examples
+//!
+//! ### Computing Variant Statistics
+//!
+//! ```ignore
+//! use pandora_lib_promethion::variant::variants_stats::VariantsStats;
+//! use pandora_lib_promethion::variant::variant_collection::Variants;
+//!
+//! let mut variants = clairs.variants(&annotations)?;
+//! let high_depth_ranges = Vec::new(); // or from somatic_depth_quality_ranges()
+//!
+//! let stats = VariantsStats::new(&mut variants, "sample_001", &config, &high_depth_ranges)?;
+//!
+//! println!("Total variants: {}", stats.n);
+//! println!("WGS mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_wgs);
+//! println!("Coding mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_coding);
+//! # Ok::<(), anyhow::Error>(())
+//! ```
+//!
+//! ### Calculating Somatic Mutation Rates
+//!
+//! ```ignore
+//! use pandora_lib_promethion::variant::variants_stats::somatic_rates;
+//! use pandora_lib_promethion::io::gff::features_ranges;
+//!
+//! // Load exon ranges from GFF
+//! let exon_ranges = features_ranges(&config.genes_gtf, "exon")?;
+//!
+//! // Calculate mutation rates
+//! let rates = somatic_rates(&variants.data, &exon_ranges, &config)?;
+//!
+//! println!("Total variants: {}", rates.total_variants);
+//! println!("Variants in coding: {}", rates.variants_in_coding);
+//! println!("Nonsynonymous rate: {:.2} mutations/Mb", rates.somatic_nonsynonymous_rate_coding);
+//! # Ok::<(), anyhow::Error>(())
+//! ```
+//!
+//! ### Identifying High-Depth Regions
+//!
+//! ```ignore
+//! use pandora_lib_promethion::variant::variants_stats::somatic_depth_quality_ranges;
+//!
+//! let (high_depth, low_quality) = somatic_depth_quality_ranges("sample_001", &config)?;
+//!
+//! println!("High-depth regions: {}", high_depth.len());
+//! println!("Low-quality regions: {}", low_quality.len());
+//! # Ok::<(), anyhow::Error>(())
+//! ```
+//!
+//! ### Preparing GLM Dataset
+//!
+//! ```ignore
+//! use pandora_lib_promethion::variant::variants_stats::{
+//!     make_glm_rows_from_regions_par, write_glm_rows
+//! };
+//! use pandora_lib_promethion::annotation::{Annotation, ReplicationClass};
+//! use pandora_lib_promethion::io::bed::read_bed;
+//!
+//! // Load genomic bins (e.g., 1 Mb windows)
+//! let bins = read_bed("genome_1mb_bins.bed")?;
+//!
+//! // Define features to track
+//! let tracked_features = vec![
+//!     Annotation::ReplicationTiming(ReplicationClass::Early),
+//!     Annotation::ReplicationTiming(ReplicationClass::Late),
+//!     Annotation::HighDepths,
+//! ];
+//!
+//! // Generate GLM dataset
+//! let (glm_rows, mutation_rates) = make_glm_rows_from_regions_par(
+//!     &variants,
+//!     &bins,
+//!     &tracked_features,
+//!     2, // min_callers
+//! );
+//!
+//! // Save to CSV for R/Python analysis
+//! write_glm_rows(&glm_rows, "glm_data.csv")?;
+//!
+//! // Print per-feature mutation rates
+//! for (feature, rate) in mutation_rates {
+//!     println!("{}: {:.2} mutations/Mb", feature, rate);
+//! }
+//! # Ok::<(), anyhow::Error>(())
+//! ```
+//!
+//! ## Statistical Modeling Workflow
+//!
+//! This module supports a complete workflow for modeling somatic mutation rates:
+//!
+//! 1. **Load variants** from callers (ClairS, DeepSomatic, etc.)
+//! 2. **Annotate variants** with VEP, COSMIC, gnomAD
+//! 3. **Compute statistics** using `VariantsStats::new()`
+//! 4. **Identify quality regions** using `somatic_depth_quality_ranges()`
+//! 5. **Prepare GLM dataset** using `make_glm_rows_from_regions_par()`
+//! 6. **Export to CSV** using `write_glm_rows()`
+//! 7. **Model in R/Python** using GLM/GAM frameworks
+//!
+//! ## Performance
+//!
+//! All major functions use Rayon for parallel processing:
+//! - `VariantsStats::new()` parallelizes per-variant aggregation
+//! - `somatic_depth_quality_ranges()` processes contigs in parallel
+//! - `make_glm_rows_from_regions_par()` parallelizes region processing with efficient binary search
+
 use std::{
     collections::{BTreeMap, BTreeSet},
     io::BufRead,