Browse Source

stats + callers() + bed compare

Thomas 8 months ago
parent
commit
4c5672cc47
7 changed files with 581 additions and 91 deletions
  1. 12 0
      src/annotation/mod.rs
  2. 15 2
      src/config.rs
  3. 15 27
      src/io/bed.rs
  4. 39 7
      src/lib.rs
  5. 19 11
      src/pipes/somatic.rs
  6. 59 38
      src/variant/variant_collection.rs
  7. 422 6
      src/variant/variants_stats.rs

+ 12 - 0
src/annotation/mod.rs

@@ -73,6 +73,12 @@ pub enum Annotation {
 
     /// Timing of replication for the variant's genomic position.
     ReplicationTiming(ReplicationClass),
+
+    HighDepths,
+
+    Panel(String),
+
+    CpG,
 }
 
 /// Denotes the biological sample type associated with a variant call.
@@ -195,6 +201,7 @@ impl fmt::Display for Annotation {
             Annotation::GnomAD(_) => "GnomAD".into(),
             Annotation::LowEntropy => "LowEntropy".into(),
             Annotation::VEP(_) => "VEP".into(),
+            Annotation::CpG => "CpG".into(),
             Annotation::TriNucleotides(bases) => format!(
                 "Trinucleotides({})",
                 bases.iter().map(|b| b.to_string()).collect::<String>(),
@@ -203,6 +210,8 @@ impl fmt::Display for Annotation {
                 ReplicationClass::Early => "ReplicationEarly".into(),
                 ReplicationClass::Late => "ReplicationLate".into(),
             },
+            Annotation::HighDepths => "HighDepths".into(),
+            Annotation::Panel(name) => format!("Panel_{name}"),
         };
         write!(f, "{}", s)
     }
@@ -344,6 +353,9 @@ impl Annotations {
                     | Annotation::VEP(_)
                     | Annotation::TriNucleotides(_)
                     | Annotation::ReplicationTiming(_)
+                    | Annotation::HighDepths
+                    | Annotation::CpG
+                    | Annotation::Panel(_)
                     | Annotation::HighConstitAlt => categorical.push(ann.to_string()),
                     Annotation::Callers(caller, sample) => {
                         categorical.push(format!("{caller} {sample}"))

+ 15 - 2
src/config.rs

@@ -71,7 +71,12 @@ pub struct Config {
     pub somatic_pipe_force: bool,
     pub somatic_pipe_threads: u8,
     pub min_high_quality_depth: u32,
+    pub min_n_callers: u8,
     pub somatic_scan_force: bool,
+    pub early_bed: String,
+    pub late_bed: String,
+    pub panels: Vec<(String, String)>,
+    pub cpg_bed: String,
 }
 
 // Here comes names that can't be changed from output of tools
@@ -193,14 +198,22 @@ impl Default for Config {
             somatic_scan_force: false,
 
             // Pipe
-            somatic_pipe_force: false,
+            somatic_pipe_force: true,
             somatic_pipe_threads: 150,
             somatic_min_constit_depth: 5,
             somatic_max_alt_constit: 1,
             entropy_seq_len: 10,
             min_shannon_entropy: 1.0,
 
-            min_high_quality_depth: 14,
+            min_high_quality_depth: 14, 
+            min_n_callers: 1,
+            early_bed: "/data/ref/hs1/replication_early_25_hs1.bed".to_string(),
+            late_bed: "/data/ref/hs1/replication_late_75_hs1.bed".to_string(),
+            panels: vec![
+                ("OncoT".to_string(), "/data/ref/hs1/V1_V2_V3_V4_V5_intersect_targets_hs1_uniq.bed".to_string()),
+                ("variable_chips".to_string(), "/data/ref/hs1/top_1500_sd_pos.bed".to_string()),
+            ],
+            cpg_bed: "/data/ref/hs1/hs1/hs1_CpG.bed".to_string(),
         }
     }
 }

+ 15 - 27
src/io/bed.rs

@@ -72,51 +72,39 @@ pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
     Ok(res)
 }
 
-/// Annotates variants with a given annotation based on overlap with a BED file.
-///
-/// This function reads a BED file, extracts genomic ranges, and finds overlaps between those
-/// ranges and the positions of the variants in the provided `Variants` object. For each
-/// overlapping variant, the specified annotation is added to its `annotations` list.
-/// Returns the total genomic length (in base pairs) covered by the BED regions useful for rate
-/// computing.
+/// Annotates variants with a given annotation based on overlap with a BED file,
+/// and returns total base count and number of overlapping mutations.
 ///
 /// # Arguments
-/// * `variants` - A mutable reference to the `Variants` object to be annotated.
-/// * `bed_path` - Path to the BED file containing genomic intervals (e.g., early or late replication).
-/// * `annotation` - The `Annotation` to apply to overlapping variants.
+/// * `variants` - Mutable reference to a `Variants` collection to annotate.
+/// * `bed_path` - Path to the BED file defining the region class.
+/// * `annotation` - The `Annotation` to assign to overlapping variants.
 ///
 /// # Returns
-/// * `Ok(total_bases)` — total number of base pairs in the BED intervals, regardless of overlap.
-///
-/// # Errors
-/// This function can return errors related to file I/O or malformed BED lines.
-///
-/// # Example
-/// ```
-/// annotate_with_bed(&mut variants, "early_25.bed", Annotation::Early)?;
-/// annotate_with_bed(&mut variants, "late_25.bed", Annotation::Late)?;
-/// ```
+/// `Ok((total_bp, overlap_count))` — where:
+///   - `total_bp`: number of base pairs in the BED regions
+///   - `overlap_count`: number of variants overlapping those regions
 pub fn annotate_with_bed(
     variants: &mut Variants,
     bed_path: &str,
     annotation: Annotation,
-) -> anyhow::Result<u32> {
+) -> anyhow::Result<(usize, usize)> {
     let bed_rows = read_bed(bed_path)?;
     let ranges: Vec<&GenomeRange> = bed_rows.iter().map(|b| &b.range).collect();
 
-    let total_bases: u32 = ranges
-        .iter()
-        .map(|r| r.length())
-        .sum();
+    // Total number of base pairs in the BED regions
+    let total_bp: usize = ranges.iter().map(|r| r.length() as usize).sum();
 
+    // Extract positions of all variants
     let positions: Vec<&GenomePosition> = variants.data.iter().map(|v| &v.position).collect();
 
+    // Find indices of variants overlapping the BED ranges
     let overlaps = overlaps_par(&positions, &ranges);
 
+    // Annotate overlapping variants
     for &idx in &overlaps {
         variants.data[idx].annotations.push(annotation.clone());
     }
 
-    Ok(total_bases)
+    Ok((total_bp, overlaps.len()))
 }
-

+ 39 - 7
src/lib.rs

@@ -177,7 +177,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::SavanaCN}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, io::gff::features_ranges, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges}, scan::scan::somatic_scan, variant::{variant::AlterationCategory, variants_stats::{self, VariantsStats}}};
+    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::SavanaCN}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig, ShouldRun}, commands::dorado::Dorado, helpers::find_files, io::gff::features_ranges, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges}, scan::scan::somatic_scan, variant::{variant::AlterationCategory, variants_stats::{self, VariantsStats}}};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -665,8 +665,24 @@ mod tests {
     #[test]
     fn pipe_somatic() -> anyhow::Result<()> {   
         init();
-        let id = "AOUF";
-        SomaticPipe::initialize(id, Config::default())?.run()
+        let collections = Collections::new(
+            CollectionsConfig::default()
+        )?;
+        for (a, _) in collections.bam_pairs().iter() {
+            if a.id.as_str() != "PASSARD" {
+                continue;
+            }
+            if let Err(e) = SomaticPipe::initialize(&a.id, Config::default()).map(|mut p| if p.should_run() {
+                if let Err(e) = p.run() {
+                    error!("{e}");
+                }
+            }) {
+                error!("{e}");
+            }
+        }
+        Ok(())
+        // let id = "ACHITE";
+        // SomaticPipe::initialize(id, Config::default())?.run()
     }
 
     #[test]
@@ -893,12 +909,28 @@ mod tests {
     #[test]
     fn variants_stats() -> anyhow::Result<()> {
         init();
-        let id = "ADJAGBA";
         let config = Config::default();
-        let path = format!("{}/{id}/diag/somatic_variants.json.gz", config.result_dir);
-        let variants = variant_collection::Variants::load_from_json(&path)?;
 
-        VariantsStats::new(&variants, id, &config)?.save_to_json(&format!("{}/{id}/diag/somatic_variants_stats.json.gz", config.result_dir))
+        let all_variants_bit = find_files(&format!("{}/*/diag/*_somatic_variants.bit", config.result_dir))?;
+        for v in all_variants_bit.into_iter() {
+            let id = v.file_name().unwrap().to_str().unwrap().split("_somatic").next().unwrap();
+            println!("{id}");
+
+            let config = Config::default();
+            let path = format!("{}/{id}/diag/{id}_somatic_variants.bit", config.result_dir);
+            match variant_collection::Variants::load_from_file(&path) {
+                Ok(mut variants) => {
+                    let res = VariantsStats::new(&mut variants, id, &config)?.save_to_json(&format!(
+                        "{}/{id}/diag/{id}_somatic_variants_stats.json.gz", config.result_dir));
+                    if res.is_err() {
+                        info!("{:#?}", res);
+                    }
+                },
+                Err(e) => error!("{e}"),
+            }
+        }
+        Ok(())
+
     }
 
     #[test]

+ 19 - 11
src/pipes/somatic.rs

@@ -1,5 +1,9 @@
 use crate::{
-    annotation::is_gnomad_and_constit_alt, collection::ShouldRun, create_should_run_normal_tumoral, init_solo_callers_normal_tumoral, scan::scan::SomaticScan, variant::variant::{run_if_required, ShouldRunBox}
+    annotation::is_gnomad_and_constit_alt,
+    collection::ShouldRun,
+    create_should_run_normal_tumoral, init_solo_callers_normal_tumoral,
+    scan::scan::SomaticScan,
+    variant::variant::{run_if_required, ShouldRunBox},
 };
 use anyhow::Context;
 use itertools::Itertools;
@@ -163,7 +167,11 @@ impl Initialize for SomaticPipe {
 
 impl ShouldRun for SomaticPipe {
     fn should_run(&self) -> bool {
-        todo!()
+        let tumoral_dir = self.config.tumoral_dir(&self.id);
+        let path_str = format!("{}/{}_somatic_variants.bit", tumoral_dir, self.id);
+        let path = Path::new(&path_str);
+
+        !path.exists()
     }
 }
 
@@ -178,7 +186,7 @@ impl Run for SomaticPipe {
         let result_json = format!("{}/{id}_somatic_variants.json.gz", config.tumoral_dir(&id));
         let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id));
 
-        if Path::new(&result_json).exists() && !config.somatic_pipe_force {
+        if Path::new(&result_bit).exists() && !config.somatic_pipe_force {
             return Err(anyhow::anyhow!(
                 "Somatic Pipe output already exists for {id}."
             ));
@@ -209,7 +217,7 @@ impl Run for SomaticPipe {
 
         info!("Running prerequired pipe components.");
         run_if_required(&mut to_run_if_req)
-            .context("Failed to run a prerequired component of somatic pipe.")?;
+            .with_context(|| format!("Failed to run a prerequired component of somatic pipe for {id}."))?;
 
         // Initialize variant callers
         let mut callers = init_somatic_callers!(
@@ -433,7 +441,7 @@ impl Run for SomaticPipe {
         annotations.vep_stats()?;
 
         // Merge all variants into a final collection
-        let variants = variants_collections.into_iter().fold(
+        let mut variants = variants_collections.into_iter().fold(
             Variants::default(),
             |mut acc, variants_collection| {
                 acc.merge(variants_collection, &annotations);
@@ -444,7 +452,7 @@ impl Run for SomaticPipe {
         let vep_stats = annotations.vep_stats()?;
         vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?;
 
-        VariantsStats::new(&variants, &id, &config)?
+        VariantsStats::new(&mut variants, &id, &config)?
             .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?;
 
         info!("Final unique variants: {}", variants.data.len());
@@ -468,7 +476,7 @@ pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
     variants.merge(clairs_germline, &annotations);
     info!("-- {}", variants.data.len());
 
-    let u = VariantsStats::new(&variants, &id, &config)?;
+    let u = VariantsStats::new(&mut variants, &id, &config)?;
     info!("++ {}", u.n);
 
     Ok(())
@@ -748,19 +756,19 @@ impl SomaticPipeStats {
 pub struct InputStats {
     /// Variants from tumor-only samples.
     pub solo_tumor: Vec<(Annotation, usize)>,
-    
+
     /// Variants from normal-only (constitutional) samples.
     pub solo_constit: Vec<(Annotation, usize)>,
-    
+
     /// Germline variants (present in both normal and tumor).
     pub germline: Vec<(Annotation, usize)>,
-    
+
     /// Somatic variants (specific to tumor).
     pub somatic: Vec<(Annotation, usize)>,
 }
 
 impl InputStats {
-/// Constructs an `InputStats` instance by aggregating variant counts
+    /// Constructs an `InputStats` instance by aggregating variant counts
     /// from a list of [`VariantCollection`]s.
     ///
     /// Each `VariantCollection` is inspected to determine its sample type,

+ 59 - 38
src/variant/variant_collection.rs

@@ -22,15 +22,15 @@ use crate::{
         gnomad::GnomAD,
         parse_trinuc,
         vep::{get_best_vep, run_vep, VepLine, VEP},
-        Annotation, Annotations, ReplicationClass,
+        Annotation, Annotations,
     },
     collection::{
         bam::{counts_at, counts_ins_at},
         vcf::Vcf,
     },
     helpers::{app_storage_dir, estimate_shannon_entropy, mean, temp_file_path, Hash128},
-    io::{bed::annotate_with_bed, fasta::sequence_at, readers::get_reader, vcf::vcf_header},
-    positions::{GenomePosition, GetGenomePosition},
+    io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header},
+    positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
 };
 
 /// A collection of VCF variants along with associated metadata.
@@ -281,7 +281,6 @@ impl VariantCollection {
                             if seq.len() >= center + 2 {
                                 let trinuc = &seq[center - 1..=center + 1];
                                 let bases = parse_trinuc(trinuc);
-                                debug!("{bases:?}");
                                 anns.push(Annotation::TriNucleotides(bases));
                             }
                         }
@@ -505,6 +504,19 @@ impl Variant {
             })
             .unwrap_or_default()
     }
+
+    pub fn n_callers(&self) -> u8 {
+        self.annotations
+            .iter()
+            .filter(|a| matches!(a, Annotation::Callers(..)))
+            .count() as u8
+    }
+
+    pub fn callers(&self) -> String {
+        let mut callers: Vec<String> = self.annotations.iter().map(|c| c.to_string()).collect();
+        callers.sort();
+        callers.join(", ")
+    }
 }
 
 /// A collection of genomic variants.
@@ -656,40 +668,6 @@ impl Variants {
         self.data = result;
     }
 
-    /// Annotates variants based on overlap with early and late replication timing regions.
-    ///
-    /// This function applies `Annotation::Early` to variants that overlap with regions in the
-    /// provided early BED file, and `Annotation::Late` for variants overlapping the late BED file.
-    ///
-    /// # Arguments
-    /// * `early_bed` - Path to a BED file representing early-replicating regions.
-    /// * `late_bed` - Path to a BED file representing late-replicating regions.
-    ///
-    /// # Returns
-    /// * `Ok(())` on success, or an error if BED files cannot be read or parsed.
-    ///
-    /// # Example
-    /// ```
-    /// variants.annotate_replication_timing("early_25.bed", "late_25.bed")?;
-    /// ```
-    pub fn annotate_replication_timing(
-        &mut self,
-        early_bed: &str,
-        late_bed: &str,
-    ) -> anyhow::Result<()> {
-        annotate_with_bed(
-            self,
-            early_bed,
-            Annotation::ReplicationTiming(ReplicationClass::Early),
-        )?;
-        annotate_with_bed(
-            self,
-            late_bed,
-            Annotation::ReplicationTiming(ReplicationClass::Late),
-        )?;
-        Ok(())
-    }
-
     /// Filters and returns variants of a specific alteration category.
     ///
     /// This method creates a new vector containing clones of all variants
@@ -723,6 +701,49 @@ impl Variants {
             .collect()
     }
 
+    /// Annotates variants overlapping a list of genome ranges and meeting specified constraints.
+    ///
+    /// # Arguments
+    /// * `ranges` - List of genomic regions to scan.
+    /// * `opt_annotation` - Optional annotation to add to overlapping variants that pass filters.
+    /// * `min_callers` - Minimum number of callers required for a variant to count.
+    /// * `required_ann` - List of annotations that **must all be present** in the variant.
+    ///
+    /// # Returns
+    /// * `total_bp` - Total base pairs spanned by the provided ranges
+    /// * `n_restricted_call` - Number of variants passing all constraints
+    pub fn annotate_with_ranges(
+        &mut self,
+        ranges: &[GenomeRange],
+        opt_annotation: Option<Annotation>,
+        min_callers: u8,
+        required_ann: Vec<Annotation>,
+    ) -> (u32, usize) {
+        let ranges_ref: Vec<&GenomeRange> = ranges.iter().collect();
+        let positions: Vec<&GenomePosition> = self.data.iter().map(|v| &v.position).collect();
+
+        let overlaps = overlaps_par(&positions, &ranges_ref);
+
+        let mut n_restricted_call = 0;
+        for &idx in &overlaps {
+            let variant = &mut self.data[idx];
+
+            let has_all_required = required_ann
+                .iter()
+                .all(|req| variant.annotations.contains(req));
+
+            if variant.n_callers() >= min_callers && has_all_required {
+                n_restricted_call += 1;
+                if let Some(ref annotation) = opt_annotation {
+                    variant.annotations.push(annotation.clone());
+                }
+            }
+        }
+
+        let total_bp: u32 = ranges.iter().map(|r| r.length()).sum();
+        (total_bp, n_restricted_call)
+    }
+
     /// Saves the Variants collection to a BGZF-compressed JSON file.
     ///
     /// This method serializes the Variants struct to JSON format and writes it to a file

+ 422 - 6
src/variant/variants_stats.rs

@@ -1,4 +1,8 @@
-use std::{collections::BTreeMap, io::BufRead, sync::Arc};
+use std::{
+    collections::{BTreeMap, BTreeSet},
+    io::BufRead,
+    sync::Arc,
+};
 
 use anyhow::Context;
 use dashmap::DashMap;
@@ -8,10 +12,13 @@ use rayon::prelude::*;
 use serde::{Deserialize, Serialize, Serializer};
 
 use crate::{
-    annotation::{vep::VepImpact, Annotation},
+    annotation::{vep::VepImpact, Annotation, ReplicationClass},
     config::Config,
     helpers::bin_data,
-    io::{dict::read_dict, gff::features_ranges, readers::get_gz_reader, writers::get_gz_writer},
+    io::{
+        bed::read_bed, dict::read_dict, gff::features_ranges, readers::get_gz_reader,
+        writers::get_gz_writer,
+    },
     positions::{
         contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par,
         GenomeRange,
@@ -48,6 +55,7 @@ pub struct VariantsStats {
 
     pub somatic_rates: SomaticVariantRates,
     pub high_depth_somatic_rates: SomaticVariantRates,
+    pub mutation_rates: Vec<(String, (u32, usize))>,
 }
 
 pub fn serialize_dashmap_sort<S, T>(
@@ -67,7 +75,7 @@ where
 }
 
 impl VariantsStats {
-    pub fn new(variants: &Variants, id: &str, config: &Config) -> anyhow::Result<Self> {
+    pub fn new(variants: &mut Variants, id: &str, config: &Config) -> anyhow::Result<Self> {
         let n = variants.data.len() as u32;
         let alteration_categories: DashMap<String, u32> = DashMap::new();
         let vep_impact: DashMap<String, u32> = DashMap::new();
@@ -79,8 +87,10 @@ impl VariantsStats {
         let vafs: DashMap<OrderedFloat<f32>, u32> = DashMap::new();
         let cosmic: DashMap<u64, u32> = DashMap::new();
         let gnomads: DashMap<String, Vec<f64>> = DashMap::new();
+        let context: DashMap<String, Vec<String>> = DashMap::new();
 
         variants.data.par_iter().for_each(|v| {
+            // VEP
             if let Ok(best_vep) = v.best_vep() {
                 if let Some(ref impact) = best_vep.extra.impact {
                     *vep_impact.entry(impact.to_string()).or_default() += 1;
@@ -102,6 +112,7 @@ impl VariantsStats {
                 }
             }
 
+            // VAF
             let (n_alt, depth) = v.n_alt_depth();
             *n_alts.entry(n_alt as u32).or_default() += 1;
             *depths.entry(depth as u32).or_default() += 1;
@@ -111,6 +122,9 @@ impl VariantsStats {
                 *vafs.entry(vaf).or_default() += 1;
             }
 
+            let callers = v.callers();
+
+            // Annotations
             v.annotations.iter().for_each(|annotation| {
                 match annotation {
                     Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
@@ -122,9 +136,15 @@ impl VariantsStats {
                                 gnomads.entry(key).or_default().push(value);
                             });
                     }
+                    Annotation::TriNucleotides(bases) => context
+                        .entry(bases.iter().map(|v| v.to_string()).collect())
+                        .or_default()
+                        .push(callers.clone()),
                     _ => (),
                 };
             });
+
+            // Alteration category
             let mut alteration_category_str = v
                 .alteration_category()
                 .iter()
@@ -155,15 +175,168 @@ impl VariantsStats {
 
         let mut high_depth_ranges = high_depth_somatic(id, config)?;
         high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
-
         let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
         let exons_high_depth = range_intersection_par(
             &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
             &exon_ranges_ref,
         );
-
         let high_depth = somatic_rates(&variants.data, &exons_high_depth, config)?;
 
+        let mut mutation_rates = Vec::new();
+
+        // HighDepths
+        let ann = Annotation::HighDepths;
+        let res = variants.annotate_with_ranges(
+            &high_depth_ranges,
+            Some(ann.clone()),
+            config.min_n_callers,
+            Vec::new(),
+        );
+        mutation_rates.push((ann.to_string(), res));
+
+        // Exons
+        let res =
+            variants.annotate_with_ranges(&exon_ranges, None, config.min_n_callers, Vec::new());
+        mutation_rates.push(("Exons".to_string(), res));
+
+        // ExonsHighDepths
+        let res = variants.annotate_with_ranges(
+            &exons_high_depth,
+            None,
+            config.min_n_callers,
+            Vec::new(),
+        );
+        mutation_rates.push(("Exons HighDepths".to_string(), res));
+
+        // CpG
+        let cpg_ranges: Vec<GenomeRange> = read_bed(&config.cpg_bed)?
+            .into_iter()
+            .map(|e| e.range)
+            .collect();
+        let ann = Annotation::CpG;
+        let res = variants.annotate_with_ranges(
+            &cpg_ranges,
+            Some(ann.clone()),
+            config.min_n_callers,
+            Vec::new(),
+        );
+        mutation_rates.push((ann.to_string(), res));
+
+        // CpG HighDepths
+        let cpg_high_depth = range_intersection_par(
+            &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
+            &cpg_ranges.iter().collect::<Vec<&GenomeRange>>(),
+        );
+        let res = variants.annotate_with_ranges(
+            &cpg_high_depth,
+            Some(ann.clone()),
+            config.min_n_callers,
+            Vec::new(),
+        );
+        mutation_rates.push(("CpG HighDepths".to_string(), res));
+
+        // Early replication
+        let early_ranges: Vec<GenomeRange> = read_bed(&config.early_bed)?
+            .into_iter()
+            .map(|e| e.range)
+            .collect();
+        let ann = Annotation::ReplicationTiming(ReplicationClass::Early);
+        let res = variants.annotate_with_ranges(
+            &early_ranges,
+            Some(ann.clone()),
+            config.min_n_callers,
+            Vec::new(),
+        );
+        mutation_rates.push((ann.to_string(), res));
+
+        // Early replication HighDepths
+        let early_ranges_high_depth = range_intersection_par(
+            &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
+            &early_ranges.iter().collect::<Vec<&GenomeRange>>(),
+        );
+        let res = variants.annotate_with_ranges(
+            &early_ranges_high_depth,
+            Some(ann.clone()),
+            config.min_n_callers,
+            Vec::new(),
+        );
+        mutation_rates.push(("Early replication HighDepths".to_string(), res));
+
+        // Late replication
+        let late_ranges: Vec<GenomeRange> = read_bed(&config.late_bed)?
+            .into_iter()
+            .map(|e| e.range)
+            .collect();
+        let ann = Annotation::ReplicationTiming(ReplicationClass::Late);
+        let res = variants.annotate_with_ranges(
+            &late_ranges,
+            Some(ann.clone()),
+            config.min_n_callers,
+            Vec::new(),
+        );
+        mutation_rates.push((ann.to_string(), res));
+
+        // Late replication HighDepths
+        let late_ranges_high_depth = range_intersection_par(
+            &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
+            &late_ranges.iter().collect::<Vec<&GenomeRange>>(),
+        );
+        let res = variants.annotate_with_ranges(
+            &late_ranges_high_depth,
+            Some(ann.clone()),
+            config.min_n_callers,
+            Vec::new(),
+        );
+        mutation_rates.push(("Late replication HighDepths".to_string(), res));
+
+        for (name, path) in config.panels.iter() {
+            let panel_ranges: Vec<GenomeRange> =
+                read_bed(path)?.into_iter().map(|e| e.range).collect();
+            let ann = Annotation::Panel(name.to_string());
+            let res = variants.annotate_with_ranges(
+                &panel_ranges,
+                Some(ann.clone()),
+                config.min_n_callers,
+                Vec::new(),
+            );
+            mutation_rates.push((ann.to_string(), res));
+
+            let panel_ranges: Vec<GenomeRange> = range_intersection_par(
+                &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
+                &panel_ranges.iter().collect::<Vec<&GenomeRange>>(),
+            );
+
+            let ann = Annotation::Panel(format!("{}_HighDepths", name));
+            let res = variants.annotate_with_ranges(
+                &panel_ranges,
+                Some(ann.clone()),
+                config.min_n_callers,
+                Vec::new(),
+            );
+            mutation_rates.push((ann.to_string(), res));
+        }
+
+        // mutation_rates.sort_by(|(_, (_, an)), (_, (_, bn))| an.cmp(bn));
+
+        // Output rates per megabase
+        for (feature, (bp, n)) in &mutation_rates {
+            let rate = (*n as f64) / ((*bp as f64) / 1e6);
+            println!("{feature}: {rate:.2} mutations/Mb ({n} / {bp})");
+        }
+
+        // let (glm_rows, rates_per_mb) =
+        //     make_glm_rows_from_regions_par(variants, &high_depth_ranges, &tracked_annotations, config.min_n_callers);
+        //
+        // // Output rates per megabase
+        // for (feature, rate) in &rates_per_mb {
+        //     println!("{feature}: {rate:.2} mutations/Mb");
+        // }
+        //
+        // write_glm_rows(
+        //     &glm_rows,
+        //     &format!("{}/{id}_glm_rows.csv", config.somatic_pipe_stats(id)),
+        // )?;
+
         Ok(Self {
             n,
             alteration_categories,
@@ -179,6 +352,7 @@ impl VariantsStats {
             n_gnomad,
             somatic_rates: all_somatic_rates,
             high_depth_somatic_rates: high_depth,
+            mutation_rates,
         })
     }
 
@@ -432,3 +606,245 @@ fn ranges_from_consecutive_true(vec: &[bool], start: u32, contig: &str) -> Vec<G
     ranges
 }
 
+/// A region-level data structure for modeling mutation rates using GLMs or other statistical models.
+///
+/// Each `GlmRow` represents a genomic interval (e.g., from a BED file) and contains:
+/// - The region's coordinates (`contig`, `start`, `end`)
+/// - Its length in base pairs
+/// - The number of somatic mutations overlapping the region
+/// - A list of binary feature labels (e.g., "Early", "HighGC")
+///
+/// This structure is designed to be serializable (e.g., to CSV) for downstream statistical modeling.
+///
+/// # Example
+/// ```
+/// GlmRow {
+///     contig: "chr1".to_string(),
+///     start: 100_000,
+///     end: 101_000,
+///     length: 1000,
+///     mutation_count: 3,
+///     features: vec!["Early".into(), "HighGC".into()],
+/// }
+/// ```
+#[derive(Debug, Serialize)]
+pub struct GlmRow {
+    /// Chromosome name (e.g., "chr1")
+    pub contig: String,
+
+    /// Start coordinate (0-based, inclusive)
+    pub start: u32,
+
+    /// End coordinate (0-based, exclusive)
+    pub end: u32,
+
+    /// Length of the region in base pairs (end - start)
+    pub length: usize,
+
+    /// Number of variants overlapping this region
+    pub mutation_count: usize,
+
+    /// Binary feature labels associated with this region
+    pub features: Vec<String>,
+}
+
+/// Builds a GLM-ready table from a set of fixed genomic regions and a list of annotated variants,
+/// in parallel, using efficient binary search-based double-pointer traversal.
+///
+/// For each region, this function:
+/// - Counts how many variants fall within the region
+/// - Extracts the relevant annotations (features) from overlapping variants
+/// - Produces one `GlmRow` with coordinates, mutation count, and feature labels
+///
+/// Additionally, it computes the **mutation rate per megabase** for each tracked annotation
+/// by aggregating counts and total base coverage across regions where each feature is present.
+///
+/// # Assumptions
+/// - `variants.data` must be sorted by `position.contig` then `position.position`
+/// - `regions` must be sorted by `range.contig` then `range.start`
+///
+/// # Arguments
+/// * `variants` - A list of annotated somatic variants (sorted).
+/// * `regions` - A set of genomic bins or intervals (e.g., from BED) used as modeling units (sorted).
+/// * `tracked_annotations` - A list of `Annotation` values to track as binary features.
+///
+/// # Returns
+/// A tuple:
+/// * `Vec<GlmRow>` — one per region, with mutation count and feature labels.
+/// * `Vec<(String, f64)>` — per-feature mutation rate (mutations per megabase).
+///
+/// # Example
+/// ```rust
+/// let (rows, rates) = make_glm_rows_from_regions_par(
+///     &variants,
+///     &genome_bins,
+///     &[
+///         Annotation::ReplicationTiming(ReplicationClass::Early),
+///         Annotation::HighDepths
+///     ]
+/// );
+///
+/// for (feature, rate) in rates {
+///     println!("{feature}: {:.2} mutations/Mb", rate);
+/// }
+/// ```
+pub fn make_glm_rows_from_regions_par(
+    variants: &Variants,
+    regions: &[GenomeRange],
+    tracked_annotations: &[Annotation],
+    min_callers: u8,
+) -> (Vec<GlmRow>, Vec<(String, f64)>) {
+    let (glm_rows, feature_stats): (Vec<GlmRow>, BTreeMap<String, (usize, usize)>) = regions
+        .par_iter()
+        .map(|region| {
+            let mut mutation_count = 0;
+            let mut feature_set = BTreeSet::new();
+
+            // Binary search into sorted variant list
+            let start_idx = variants.data.partition_point(|v| {
+                v.position.contig < region.contig
+                    || (v.position.contig == region.contig
+                        && v.position.position < region.range.start)
+            });
+
+            for var in &variants.data[start_idx..] {
+                let pos = &var.position;
+
+                if pos.contig != region.contig || pos.position >= region.range.end {
+                    break;
+                }
+
+                if region.range.contains(&pos.position) && var.n_callers() >= min_callers {
+                    mutation_count += 1;
+
+                    for ann in &var.annotations {
+                        if tracked_annotations.contains(ann) {
+                            feature_set.insert(ann.to_string());
+                        }
+                    }
+                }
+            }
+
+            let region_len = region.length() as usize;
+
+            let row = GlmRow {
+                contig: region.contig.to_string(),
+                start: region.range.start,
+                end: region.range.end,
+                length: region_len,
+                mutation_count,
+                features: feature_set.clone().into_iter().collect(),
+            };
+
+            // Local (per-thread) accumulation of stats
+            let mut stats = BTreeMap::new();
+            for feat in &feature_set {
+                stats.insert(feat.clone(), (mutation_count, region_len));
+            }
+
+            (vec![row], stats)
+        })
+        .reduce(
+            || (Vec::new(), BTreeMap::new()),
+            |(mut rows_a, mut stats_a), (mut rows_b, stats_b)| {
+                rows_a.append(&mut rows_b);
+                for (feat, (n, bp)) in stats_b {
+                    stats_a
+                        .entry(feat)
+                        .and_modify(|(na, bpa)| {
+                            *na += n;
+                            *bpa += bp;
+                        })
+                        .or_insert((n, bp));
+                }
+                (rows_a, stats_a)
+            },
+        );
+
+    let ranges_len: usize = regions.iter().map(|a| a.length() as usize).sum();
+    assert_eq!(10e6 as usize, 1_000_000);
+
+    // Compute mutation rates per Mb from aggregate stats
+    let rates_per_mb: Vec<(String, f64)> = feature_stats
+        .into_iter()
+        .map(|(feat, (n, bp))| {
+            debug!("{bp}");
+            let rate = if bp > 0 {
+                (n as f64) / ((ranges_len as f64) / 10e6)
+            } else {
+                0.0
+            };
+            (feat, rate)
+        })
+        .collect();
+
+    (glm_rows, rates_per_mb)
+}
+
+/// Returns a flat table: one-hot encoded feature columns.
+fn flatten_glm_rows(
+    rows: &[GlmRow],
+) -> (Vec<String>, Vec<std::collections::HashMap<String, String>>) {
+    let mut all_features: BTreeSet<String> = BTreeSet::new();
+    for r in rows {
+        all_features.extend(r.features.iter().cloned());
+    }
+
+    let feature_list: Vec<_> = all_features.into_iter().collect();
+
+    let mut table: Vec<_> = rows
+        .iter()
+        .map(|r| {
+            let mut row = std::collections::HashMap::new();
+            row.insert("contig".into(), r.contig.clone());
+            row.insert("start".into(), r.start.to_string());
+            row.insert("end".into(), r.end.to_string());
+            row.insert("length".into(), r.length.to_string());
+            row.insert("mutation_count".into(), r.mutation_count.to_string());
+            row.insert("log_length".into(), (r.length as f64).ln().to_string());
+
+            for f in &feature_list {
+                row.insert(
+                    f.clone(),
+                    if r.features.contains(f) { "1" } else { "0" }.into(),
+                );
+            }
+
+            row
+        })
+        .collect();
+
+    // Parallel sort by contig and start
+    table.par_sort_by(|a, b| {
+        let ca = a.get("contig").unwrap();
+        let cb = b.get("contig").unwrap();
+        let sa = a.get("start").unwrap().parse::<u32>().unwrap();
+        let sb = b.get("start").unwrap().parse::<u32>().unwrap();
+        (ca, sa).cmp(&(cb, sb))
+    });
+
+    (feature_list, table)
+}
+
+pub fn write_glm_rows(all_rows: &[GlmRow], csv_path: &str) -> anyhow::Result<()> {
+    let (features, flat_rows) = flatten_glm_rows(all_rows);
+    let mut writer = csv::Writer::from_path(csv_path)?;
+
+    let mut headers = vec![
+        "contig",
+        "start",
+        "end",
+        "length",
+        "log_length",
+        "mutation_count",
+    ];
+    headers.extend(features.iter().map(|s| s.as_str()));
+    writer.write_record(&headers)?;
+
+    for row in flat_rows {
+        let values: Vec<_> = headers.iter().map(|&h| row.get(h).unwrap()).collect();
+        writer.write_record(values)?;
+    }
+    writer.flush()?;
+    Ok(())
+}