// variants_stats.rs

//! # Variant Statistics and Analysis
//!
//! This module provides statistical analysis, rate calculations, and data preparation
//! for genomic variant collections, particularly focused on somatic variant analysis
//! and generalized linear modeling (GLM) of mutation rates.
//!
//! ## Core Types
//!
//! ### Statistics Aggregation
//!
//! - [`VariantsStats`] - Comprehensive statistics for a variant collection including:
//!   - Variant counts by category (SNV, insertion, deletion, etc.)
//!   - Variant allele frequency (VAF) distributions
//!   - Read depth distributions
//!   - VEP consequence and impact distributions
//!   - Gene-level mutation counts
//!   - COSMIC and gnomAD frequency overlaps
//!   - Somatic mutation rate calculations
//!
//! ### Mutation Rate Analysis
//!
//! - [`SomaticVariantRates`] - Somatic mutation burden metrics:
//!   - Whole-genome mutation rate (mutations per megabase)
//!   - Coding region mutation rate
//!   - Nonsynonymous mutation rate
//!   - Exon coverage statistics
//!
//! ### GLM Data Preparation
//!
//! - [`GlmRow`] - Genomic region with mutation count and feature annotations for statistical modeling
//!   - Used to build datasets for generalized linear models (GLMs)
//!   - Enables analysis of mutation rate drivers (e.g., replication timing, chromatin state)
//!
//! ## Key Functions
//!
//! ### Mutation Rate Calculation
//!
//! - [`somatic_rates()`] - Calculate somatic mutation rates across WGS and coding regions
//! - [`somatic_depth_quality_ranges()`] - Identify high-depth and low-quality genomic regions
//!
//! ### GLM Dataset Generation
//!
//! - [`make_glm_rows_from_regions_par()`] - Build GLM-ready dataset from variants and genomic bins
//! - [`write_glm_rows()`] - Export GLM rows to CSV for downstream statistical analysis
//!
//! ### Utilities
//!
//! - [`ranges_from_consecutive_true_iter()`] - Convert boolean iterator to genomic ranges
//! - [`merge_adjacent_ranges()`] - Merge adjacent genomic ranges
//!
//! ## Usage Examples
//!
//! ### Computing Variant Statistics
//!
//! ```ignore
//! use pandora_lib_promethion::variant::variants_stats::VariantsStats;
//! use pandora_lib_promethion::variant::variant_collection::Variants;
//!
//! let mut variants = clairs.variants(&annotations)?;
//! let high_depth_ranges = Vec::new(); // or from somatic_depth_quality_ranges()
//!
//! let stats = VariantsStats::new(&mut variants, "sample_001", &config, &high_depth_ranges)?;
//!
//! println!("Total variants: {}", stats.n);
//! println!("WGS mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_wgs);
//! println!("Coding mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_coding);
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ### Calculating Somatic Mutation Rates
//!
//! ```ignore
//! use pandora_lib_promethion::variant::variants_stats::somatic_rates;
//! use pandora_lib_promethion::io::gff::features_ranges;
//!
//! // Load exon ranges from the configured GFF/GTF
//! let exon_ranges = features_ranges("exon", &config)?;
//!
//! // Calculate mutation rates
//! let rates = somatic_rates(&variants.data, &exon_ranges, &config)?;
//!
//! println!("Total variants: {}", rates.total_variants);
//! println!("Variants in coding: {}", rates.variants_in_coding);
//! println!("Nonsynonymous rate: {:.2} mutations/Mb", rates.somatic_nonsynonymous_rate_coding);
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ### Identifying High-Depth Regions
//!
//! ```ignore
//! use pandora_lib_promethion::variant::variants_stats::somatic_depth_quality_ranges;
//!
//! let (high_depth, low_quality) = somatic_depth_quality_ranges("sample_001", &config)?;
//!
//! println!("High-depth regions: {}", high_depth.len());
//! println!("Low-quality regions: {}", low_quality.len());
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ### Preparing GLM Dataset
//!
//! ```ignore
//! use pandora_lib_promethion::variant::variants_stats::{
//!     make_glm_rows_from_regions_par, write_glm_rows,
//! };
//! use pandora_lib_promethion::annotation::{Annotation, ReplicationClass};
//! use pandora_lib_promethion::io::bed::read_bed;
//!
//! // Load genomic bins (e.g., 1 Mb windows) as genome ranges
//! let bins: Vec<_> = read_bed("genome_1mb_bins.bed")?
//!     .into_iter()
//!     .map(|e| e.range)
//!     .collect();
//!
//! // Define features to track
//! let tracked_features = vec![
//!     Annotation::ReplicationTiming(ReplicationClass::Early),
//!     Annotation::ReplicationTiming(ReplicationClass::Late),
//!     Annotation::HighDepth,
//! ];
//!
//! // Generate GLM dataset
//! let (glm_rows, mutation_rates) = make_glm_rows_from_regions_par(
//!     &variants,
//!     &bins,
//!     &tracked_features,
//!     2, // min_callers
//! );
//!
//! // Save to CSV for R/Python analysis
//! write_glm_rows(&glm_rows, "glm_data.csv")?;
//!
//! // Print per-feature mutation rates
//! for (feature, rate) in mutation_rates {
//!     println!("{}: {:.2} mutations/Mb", feature, rate);
//! }
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ## Statistical Modeling Workflow
//!
//! This module supports a complete workflow for modeling somatic mutation rates:
//!
//! 1. **Load variants** from callers (ClairS, DeepSomatic, etc.)
//! 2. **Annotate variants** with VEP, COSMIC, gnomAD
//! 3. **Compute statistics** using `VariantsStats::new()`
//! 4. **Identify quality regions** using `somatic_depth_quality_ranges()`
//! 5. **Prepare GLM dataset** using `make_glm_rows_from_regions_par()`
//! 6. **Export to CSV** using `write_glm_rows()`
//! 7. **Model in R/Python** using GLM/GAM frameworks
//!
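//! Steps 3-6 can be chained in a single pass; a condensed sketch is shown below, reusing
//! the `variants`, `config`, `bins`, and `tracked_features` values from the examples above
//! (output paths are hypothetical, and the quality regions are computed first because
//! `VariantsStats::new` consumes them):
//!
//! ```ignore
//! let (high_depth, _low_quality) = somatic_depth_quality_ranges("sample_001", &config)?;
//! let stats = VariantsStats::new(&mut variants, "sample_001", &config, &high_depth)?;
//! stats.save_to_json("out/sample_001_variant_stats.json.gz")?;
//!
//! let (glm_rows, _rates) =
//!     make_glm_rows_from_regions_par(&variants, &bins, &tracked_features, config.min_n_callers);
//! write_glm_rows(&glm_rows, "out/sample_001_glm_rows.csv")?;
//! # Ok::<(), anyhow::Error>(())
//! ```
//!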
//! ## Performance
//!
//! All major functions use Rayon for parallel processing:
//! - `VariantsStats::new()` parallelizes per-variant aggregation
//! - `somatic_depth_quality_ranges()` processes contigs in parallel
//! - `make_glm_rows_from_regions_par()` parallelizes region processing with efficient binary search
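//!
//! Parallelism can be capped by configuring Rayon's global thread pool before calling
//! into this module; this uses the standard Rayon builder API and is not something this
//! module does itself:
//!
//! ```ignore
//! rayon::ThreadPoolBuilder::new().num_threads(8).build_global()?;
//! ```
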
use std::collections::{BTreeMap, BTreeSet};

use anyhow::Context;
use csv::ByteRecord;
use dashmap::DashMap;
use log::debug;
use ordered_float::OrderedFloat;
use rayon::prelude::*;
use serde::{Deserialize, Serialize, Serializer};

use crate::{
    annotation::{vep::VepImpact, Annotation},
    config::Config,
    helpers::bin_data,
    io::{
        bed::read_bed, dict::read_dict, gff::features_ranges, readers::get_gz_reader,
        tsv::tsv_reader, writers::get_gz_writer,
    },
    positions::{
        contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par,
        GenomeRange,
    },
    scan::bin::{parse_bin_record_into, BinRowBuf},
};

use super::variant_collection::{Variant, Variants};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VariantsStats {
    pub n: u32,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub alteration_categories: DashMap<String, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub cosmic: DashMap<u64, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub n_alts: DashMap<u32, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub depths: DashMap<u32, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub vafs: DashMap<OrderedFloat<f32>, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub vep_impact: DashMap<String, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub consequences: DashMap<String, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub genes: DashMap<String, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub genes_nonsynonymus: DashMap<String, u32>,
    pub n_gnomad: usize,
    pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
    pub somatic_rates: SomaticVariantRates,
    pub high_depth_somatic_rates: SomaticVariantRates,
    pub mutation_rates: Vec<(String, (u32, usize))>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub insertions_len: DashMap<u32, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub deletions_len: DashMap<u32, u32>,
}
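
/// Serde helper used by [`VariantsStats`]: copies the `DashMap` into a `BTreeMap` so
/// that keys are serialized in sorted, deterministic order (plain `DashMap` iteration
/// order is unspecified).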
pub fn serialize_dashmap_sort<S, T>(
    data: &DashMap<T, u32>,
    serializer: S,
) -> Result<S::Ok, S::Error>
where
    S: Serializer,
    T: Serialize + Ord + std::hash::Hash + Clone,
{
    let ordered: BTreeMap<_, _> = data
        .iter()
        .map(|entry| (entry.key().clone(), *entry.value()))
        .collect();
    ordered.serialize(serializer)
}
impl VariantsStats {
    pub fn new(
        variants: &mut Variants,
        _id: &str,
        config: &Config,
        high_depth_ranges: &[GenomeRange],
    ) -> anyhow::Result<Self> {
        let n = variants.data.len() as u32;
        let alteration_categories: DashMap<String, u32> = DashMap::new();
        let vep_impact: DashMap<String, u32> = DashMap::new();
        let genes: DashMap<String, u32> = DashMap::new();
        let genes_nonsynonymus: DashMap<String, u32> = DashMap::new();
        let consequences: DashMap<String, u32> = DashMap::new();
        let n_alts: DashMap<u32, u32> = DashMap::new();
        let depths: DashMap<u32, u32> = DashMap::new();
        let vafs: DashMap<OrderedFloat<f32>, u32> = DashMap::new();
        let cosmic: DashMap<u64, u32> = DashMap::new();
        let gnomads: DashMap<String, Vec<f64>> = DashMap::new();
        let context: DashMap<String, Vec<String>> = DashMap::new();
        let insertions_len: DashMap<u32, u32> = DashMap::new();
        let deletions_len: DashMap<u32, u32> = DashMap::new();
        variants.data.par_iter().for_each(|v| {
            // VEP
            if let Ok(best_vep) = v.best_vep() {
                if let Some(ref impact) = best_vep.extra.impact {
                    *vep_impact.entry(impact.to_string()).or_default() += 1;
                }
                if let Some(ref gene) = best_vep.extra.symbol {
                    *genes.entry(gene.to_string()).or_default() += 1;
                }
                if let (Some(impact), Some(gene)) = (best_vep.extra.impact, best_vep.extra.symbol) {
                    if impact <= VepImpact::MODERATE {
                        *genes_nonsynonymus.entry(gene).or_default() += 1;
                    }
                }
                if let Some(csqs) = best_vep.consequence {
                    let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
                    csq.sort();
                    csq.dedup();
                    *consequences.entry(csq.join(", ")).or_default() += 1;
                }
            }
            // VAF: stored as a percentage rounded to one decimal place (0.1% bins)
            let (n_alt, depth) = v.n_alt_depth();
            *n_alts.entry(n_alt as u32).or_default() += 1;
            *depths.entry(depth as u32).or_default() += 1;
            let vaf = OrderedFloat::from(((n_alt * 1_000.0 / depth).round() / 10.0) as f32);
            if !vaf.is_nan() {
                *vafs.entry(vaf).or_default() += 1;
            }
            let callers = v.callers();
            // Annotations
            v.annotations.iter().for_each(|annotation| {
                match annotation {
                    Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
                    Annotation::GnomAD(v) => {
                        v.to_vec()
                            .iter()
                            .map(|e| e.to_string_value_pair())
                            .for_each(|(key, value)| {
                                gnomads.entry(key).or_default().push(value);
                            });
                    }
                    Annotation::TriNucleotides(bases) => context
                        .entry(bases.iter().map(|v| v.to_string()).collect())
                        .or_default()
                        .push(callers.clone()),
                    _ => (),
                };
            });
            // Alteration category
            let mut alteration_category_str = v
                .alteration_category()
                .iter()
                .map(|c| c.to_string())
                .collect::<Vec<String>>();
            alteration_category_str.sort();
            alteration_category_str.dedup();
            *alteration_categories
                .entry(alteration_category_str.join(", "))
                .or_default() += 1;
            if let Some(len) = v.insertion_length() {
                *insertions_len.entry(len).or_default() += 1;
            }
            if let Some(len) = v.deletion_length() {
                *deletions_len.entry(len).or_default() += 1;
            }
        });
        let mut n_gnomad = 0;
        let gnomad = gnomads
            .iter()
            .map(|e| {
                let data = e.value().to_vec();
                n_gnomad = data.len();
                (e.key().to_string(), bin_data(data, 0.0001))
            })
            .collect();
        let exon_ranges = features_ranges("exon", config)?;
        let exon_ranges = merge_overlapping_genome_ranges(&exon_ranges);
        let all_somatic_rates = somatic_rates(&variants.data, &exon_ranges, config)?;
        let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
        let exons_high_depth = range_intersection_par(
            &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
            &exon_ranges_ref,
        );
        let high_depth = somatic_rates(&variants.data, &exons_high_depth, config)?;
        let mut mutation_rates = Vec::new();
        // HighDepths
        let ann = Annotation::HighDepth;
        let res = variants.annotate_with_ranges(
            high_depth_ranges,
            Some(ann.clone()),
            config.min_n_callers,
            Vec::new(),
        );
        mutation_rates.push((ann.to_string(), res));
        // Exons
        let res =
            variants.annotate_with_ranges(&exon_ranges, None, config.min_n_callers, Vec::new());
        mutation_rates.push(("Exons".to_string(), res));
        // ExonsHighDepths
        let res = variants.annotate_with_ranges(
            &exons_high_depth,
            None,
            config.min_n_callers,
            Vec::new(),
        );
        mutation_rates.push(("Exons HighDepths".to_string(), res));
        for (name, path) in config.panels.iter() {
            let panel_ranges: Vec<GenomeRange> =
                read_bed(path)?.into_iter().map(|e| e.range).collect();
            let ann = Annotation::Panel(name.to_string());
            let res = variants.annotate_with_ranges(
                &panel_ranges,
                Some(ann.clone()),
                config.min_n_callers,
                Vec::new(),
            );
            mutation_rates.push((ann.to_string(), res));
            let panel_ranges: Vec<GenomeRange> = range_intersection_par(
                &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
                &panel_ranges.iter().collect::<Vec<&GenomeRange>>(),
            );
            let ann = Annotation::Panel(format!("{}_HighDepths", name));
            let res = variants.annotate_with_ranges(
                &panel_ranges,
                Some(ann.clone()),
                config.min_n_callers,
                Vec::new(),
            );
            mutation_rates.push((ann.to_string(), res));
        }
        // mutation_rates.sort_by(|(_, (_, an)), (_, (_, bn))| an.cmp(bn));
        // Output rates per megabase
        for (feature, (bp, n)) in &mutation_rates {
            let rate = (*n as f64) / ((*bp as f64) / 1e6);
            println!("{feature}: {rate:.2} mutations/Mb ({n} / {bp})");
        }
        // let (glm_rows, rates_per_mb) =
        //     make_glm_rows_from_regions_par(variants, &high_depth_ranges, &tracked_annotations, config.min_n_callers);
        //
        // // Output rates per megabase
        // for (feature, rate) in &rates_per_mb {
        //     println!("{feature}: {rate:.2} mutations/Mb");
        // }
        //
        // write_glm_rows(
        //     &glm_rows,
        //     &format!("{}/{id}_glm_rows.csv", config.somatic_pipe_stats(id)),
        // )?;
        Ok(Self {
            n,
            alteration_categories,
            cosmic,
            gnomad,
            vafs,
            n_alts,
            depths,
            vep_impact,
            consequences,
            genes,
            genes_nonsynonymus,
            n_gnomad,
            somatic_rates: all_somatic_rates,
            high_depth_somatic_rates: high_depth,
            mutation_rates,
            insertions_len,
            deletions_len,
        })
    }
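
    /// Writes the statistics as JSON to a gzip/BGZF-compressed file.
    ///
    /// A minimal round-trip sketch (the output path is hypothetical):
    ///
    /// ```ignore
    /// stats.save_to_json("out/sample_001_variant_stats.json.gz")?;
    /// let reloaded = VariantsStats::load_from_json("out/sample_001_variant_stats.json.gz")?;
    /// assert_eq!(stats.n, reloaded.n);
    /// ```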
    pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
        let mut writer = get_gz_writer(filename, true)
            .with_context(|| anyhow::anyhow!("Failed to create file: {}", filename))?;
        serde_json::to_writer(&mut writer, self)
            .with_context(|| anyhow::anyhow!("Failed to serialize JSON to file: {}", filename))?;
        writer.close().with_context(|| {
            anyhow::anyhow!("Failed to close BGZF writer for file: {}", filename)
        })?;
        debug!("Successfully saved variants to {}", filename);
        Ok(())
    }

    pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
        let mut reader = get_gz_reader(filename)
            .with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;
        let variants: Self = serde_json::from_reader(&mut reader)
            .with_context(|| format!("Failed to deserialize JSON from file: {}", filename))?;
        debug!("Successfully loaded variants from {}", filename);
        Ok(variants)
    }
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct SomaticVariantRates {
    pub wgs_length: u32,
    pub total_variants: usize,
    pub somatic_mutation_rate_wgs: f64,
    pub exon_count: usize,
    pub total_exon_bases: u32,
    pub variants_in_coding: usize,
    pub somatic_mutation_rate_coding: f64,
    pub nonsynonymous_variants: usize,
    pub somatic_nonsynonymous_rate_coding: f64,
}
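
/// Computes somatic mutation rates over the whole genome and over the supplied exon
/// ranges. All rates are mutations per megabase; in terms of the returned
/// [`SomaticVariantRates`] fields, the arithmetic below amounts to (illustrative
/// sketch, not compiled):
///
/// ```ignore
/// let somatic_mutation_rate_wgs = total_variants as f64 / (wgs_length as f64 / 1e6);
/// let somatic_mutation_rate_coding = variants_in_coding as f64 / (total_exon_bases as f64 / 1e6);
/// ```
///
/// Nonsynonymous variants are the coding variants whose best VEP impact passes the
/// `impact <= VepImpact::MODERATE` filter used below.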
pub fn somatic_rates(
    variants: &[Variant],
    exon_ranges: &Vec<GenomeRange>,
    config: &Config,
) -> anyhow::Result<SomaticVariantRates> {
    let ol = par_overlaps(variants, exon_ranges);
    let n_coding = ol
        .iter()
        .filter_map(|i| variants[*i].best_vep().ok())
        .filter_map(|bv| bv.impact())
        .filter(|impact| *impact <= VepImpact::MODERATE)
        .count();
    let n_bases_m: u32 = exon_ranges.par_iter().map(|gr| gr.length()).sum();
    // Rates are expressed per megabase (1e6 bases).
    let mega_base_m = n_bases_m as f64 / 1e6;
    let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
    let rate_wgs = variants.len() as f64 / (wgs_len as f64 / 1e6);
    let n_exons_mb = ol.len() as f64 / mega_base_m;
    let n_coding_mb = n_coding as f64 / mega_base_m;
    Ok(SomaticVariantRates {
        total_variants: variants.len(),
        exon_count: exon_ranges.len(),
        variants_in_coding: ol.len(),
        nonsynonymous_variants: n_coding,
        total_exon_bases: n_bases_m,
        wgs_length: wgs_len,
        somatic_mutation_rate_wgs: rate_wgs,
        somatic_mutation_rate_coding: n_exons_mb,
        somatic_nonsynonymous_rate_coding: n_coding_mb,
    })
}
/// Computes genomic ranges with sufficient depth and with excessive low-quality reads
/// by jointly scanning normal and tumoral per-contig depth TSV files.
///
/// For each contig, the function reads the corresponding gzipped TSV files from the
/// normal and tumoral directories, iterates over bins in lockstep, and validates that
/// both files are perfectly aligned (same contig, same start position, same bin layout).
///
/// Two kinds of ranges are produced:
/// - **High-quality depth ranges**: consecutive bins where *both* normal and tumoral
///   depths are greater than or equal to `config.min_high_quality_depth`.
/// - **Low-quality excess ranges**: consecutive bins where *both* normal and tumoral
///   bins contain more low-quality reads than `config.max_depth_low_quality`.
///
/// Contigs are processed in parallel. Within each contig, adjacent or overlapping
/// ranges are merged before being returned.
///
/// # Errors
///
/// Returns an error if:
/// - A normal or tumoral TSV file cannot be opened or decompressed.
/// - A TSV record cannot be parsed.
/// - The normal and tumoral files differ in number of lines.
/// - Corresponding records disagree on contig name, start position, or bin structure
///   (depth or low-quality vector length).
///
/// # Returns
///
/// A tuple `(high_depth_ranges, lowq_excess_ranges)` where each element is a `Vec<GenomeRange>`
/// spanning all contigs.
///
/// # Notes
///
/// - Input TSV files are expected to be tab-delimited, headerless, and sorted by
///   genomic position.
/// - Low-quality ranges represent **excess low-quality signal** (bins exceeding the
///   configured threshold), not acceptable regions.
/// - The implementation reuses parsing buffers and avoids per-line allocations for
///   performance.
///
/// # Panics
///
/// This function does not intentionally panic.
pub fn somatic_depth_quality_ranges(
    id: &str,
    config: &Config,
) -> anyhow::Result<(Vec<GenomeRange>, Vec<GenomeRange>)> {
    // chr1..chr22 + X,Y,M
    let contigs: Vec<String> = read_dict(&config.dict_file)?
        .into_iter()
        .map(|(sn, _ln)| sn)
        .collect();
    let cfg = config;
    let per_contig = contigs
        .into_par_iter()
        .map(|contig| {
            let normal_path = format!("{}/{}_count.tsv.gz", cfg.normal_dir_count(id), contig);
            let tumor_path = format!("{}/{}_count.tsv.gz", cfg.tumoral_dir_count(id), contig);
            let normal_rdr = get_gz_reader(&normal_path)
                .with_context(|| format!("Failed to open normal file: {}", normal_path))?;
            let tumor_rdr = get_gz_reader(&tumor_path)
                .with_context(|| format!("Failed to open tumor file: {}", tumor_path))?;
            let mut high_runs: Vec<GenomeRange> = Vec::new();
            let mut lowq_runs: Vec<GenomeRange> = Vec::new();
            let mut n_tsv = tsv_reader(normal_rdr); // normal_rdr: impl Read
            let mut t_tsv = tsv_reader(tumor_rdr);
            let mut n_rec = ByteRecord::new();
            let mut t_rec = ByteRecord::new();
            let mut n_buf = BinRowBuf::default();
            let mut t_buf = BinRowBuf::default();
            let mut line_no = 0usize;
            loop {
                let n_ok = n_tsv.read_byte_record(&mut n_rec)?;
                let t_ok = t_tsv.read_byte_record(&mut t_rec)?;
                if n_ok || t_ok {
                    line_no += 1;
                }
                match (n_ok, t_ok) {
                    (false, false) => break,
                    (true, false) => {
                        anyhow::bail!("{} has extra lines at {}", normal_path, line_no)
                    }
                    (false, true) => anyhow::bail!("{} has extra lines at {}", tumor_path, line_no),
                    (true, true) => {
                        let (n_start, n_depths, n_lowq) =
                            parse_bin_record_into(&n_rec, &mut n_buf, &contig)
                                .with_context(|| format!("{} line {}", normal_path, line_no))?;
                        let (t_start, t_depths, t_lowq) =
                            parse_bin_record_into(&t_rec, &mut t_buf, &contig)
                                .with_context(|| format!("{} line {}", tumor_path, line_no))?;
                        anyhow::ensure!(n_start == t_start, "start mismatch at line {}", line_no);
                        anyhow::ensure!(
                            n_depths.len() == t_depths.len(),
                            "depth len mismatch at line {}",
                            line_no
                        );
                        anyhow::ensure!(
                            n_lowq.len() == t_lowq.len(),
                            "lowq len mismatch at line {}",
                            line_no
                        );
                        let high_mask_iter = n_depths.iter().zip(t_depths).map(|(&nd, &td)| {
                            nd >= cfg.min_high_quality_depth && td >= cfg.min_high_quality_depth
                        });
                        let lowq_mask_iter = n_lowq.iter().zip(t_lowq).map(|(&nq, &tq)| {
                            nq > cfg.max_depth_low_quality && tq > cfg.max_depth_low_quality
                        });
                        high_runs.extend(ranges_from_consecutive_true_iter(
                            high_mask_iter,
                            n_start,
                            &contig,
                        ));
                        lowq_runs.extend(ranges_from_consecutive_true_iter(
                            lowq_mask_iter,
                            n_start,
                            &contig,
                        ));
                    }
                }
            }
            // Merge adjacent/overlapping ranges within this contig
            high_runs = merge_adjacent_ranges(high_runs);
            lowq_runs = merge_adjacent_ranges(lowq_runs);
            Ok((high_runs, lowq_runs))
        })
        .collect::<anyhow::Result<Vec<_>>>()?;
    // Flatten
    let (high_all, low_all): (Vec<_>, Vec<_>) = per_contig.into_iter().unzip();
    Ok((
        high_all.into_iter().flatten().collect(),
        low_all.into_iter().flatten().collect(),
    ))
}
/// Converts a boolean mask, streamed as an iterator (no temporary `Vec<bool>`), into
/// end-exclusive [`GenomeRange`]s covering each run of consecutive `true` values,
/// offset by `start0` on the given contig.
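///
/// For example, with `start0 = 100`, the mask `[true, true, false, true]` yields the
/// end-exclusive ranges `100..102` and `103..104`:
///
/// ```ignore
/// let ranges = ranges_from_consecutive_true_iter([true, true, false, true], 100, "chr1");
/// assert_eq!(ranges[0].range, 100..102);
/// assert_eq!(ranges[1].range, 103..104);
/// ```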
pub fn ranges_from_consecutive_true_iter<I>(mask: I, start0: u32, contig: &str) -> Vec<GenomeRange>
where
    I: IntoIterator<Item = bool>,
{
    let contig = contig_to_num(contig);
    let mut ranges = Vec::new();
    let mut current_start: Option<u32> = None;
    let mut i: u32 = 0;
    for v in mask {
        if v {
            if current_start.is_none() {
                current_start = Some(start0 + i);
            }
        } else if let Some(s) = current_start.take() {
            ranges.push(GenomeRange {
                contig,
                range: s..(start0 + i),
            });
        }
        i += 1;
    }
    if let Some(s) = current_start {
        ranges.push(GenomeRange {
            contig,
            range: s..(start0 + i),
        });
    }
    ranges
}
/// Merge overlapping or touching ranges per contig (end-exclusive).
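///
/// A minimal sketch (assuming `contig_to_num` maps contig names as used elsewhere in
/// this module): `100..200`, `150..250`, and `250..300` on the same contig collapse
/// into a single `100..300` range, since `250..300` touches the merged end.
///
/// ```ignore
/// let c1 = contig_to_num("chr1");
/// let ranges = vec![
///     GenomeRange { contig: c1, range: 100..200 },
///     GenomeRange { contig: c1, range: 150..250 },
///     GenomeRange { contig: c1, range: 250..300 },
/// ];
/// let merged = merge_adjacent_ranges(ranges);
/// assert_eq!(merged.len(), 1);
/// assert_eq!(merged[0].range, 100..300);
/// ```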
pub fn merge_adjacent_ranges(mut ranges: Vec<GenomeRange>) -> Vec<GenomeRange> {
    if ranges.is_empty() {
        return ranges;
    }
    ranges.sort_by(|a, b| (a.contig, a.range.start).cmp(&(b.contig, b.range.start)));
    let mut merged = Vec::with_capacity(ranges.len());
    let mut cur = ranges[0].clone();
    for r in ranges.into_iter().skip(1) {
        if r.contig == cur.contig && r.range.start <= cur.range.end {
            if r.range.end > cur.range.end {
                cur.range.end = r.range.end;
            }
        } else {
            merged.push(cur);
            cur = r;
        }
    }
    merged.push(cur);
    merged
}
/// A region-level data structure for modeling mutation rates using GLMs or other statistical models.
///
/// Each `GlmRow` represents a genomic interval (e.g., from a BED file) and contains:
/// - The region's coordinates (`contig`, `start`, `end`)
/// - Its length in base pairs
/// - The number of somatic mutations overlapping the region
/// - A list of binary feature labels (e.g., "Early", "HighGC")
///
/// This structure is designed to be serializable (e.g., to CSV) for downstream statistical modeling.
///
/// # Example
/// ```ignore
/// GlmRow {
///     contig: "chr1".to_string(),
///     start: 100_000,
///     end: 101_000,
///     length: 1000,
///     mutation_count: 3,
///     features: vec!["Early".into(), "HighGC".into()],
/// }
/// ```
#[derive(Debug, Serialize)]
pub struct GlmRow {
    /// Chromosome name (e.g., "chr1")
    pub contig: String,
    /// Start coordinate (0-based, inclusive)
    pub start: u32,
    /// End coordinate (0-based, exclusive)
    pub end: u32,
    /// Length of the region in base pairs (end - start)
    pub length: usize,
    /// Number of variants overlapping this region
    pub mutation_count: usize,
    /// Binary feature labels associated with this region
    pub features: Vec<String>,
}
/// Builds a GLM-ready table from a set of fixed genomic regions and a list of annotated variants,
/// in parallel, using efficient binary search-based double-pointer traversal.
///
/// For each region, this function:
/// - Counts how many variants fall within the region
/// - Extracts the relevant annotations (features) from overlapping variants
/// - Produces one `GlmRow` with coordinates, mutation count, and feature labels
///
/// Additionally, it computes the **mutation rate per megabase** for each tracked annotation
/// by aggregating counts and total base coverage across regions where each feature is present.
///
/// # Assumptions
/// - `variants.data` must be sorted by `position.contig` then `position.position`
/// - `regions` must be sorted by `contig` then `range.start`
///
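/// If the inputs are not already ordered, a sort along those keys is enough. A minimal
/// sketch, assuming the caller owns mutable collections and that the contig and position
/// fields are plain `Ord` values as used elsewhere in this module:
///
/// ```ignore
/// variants.data.sort_by_key(|v| (v.position.contig, v.position.position));
/// regions.sort_by_key(|r| (r.contig, r.range.start));
/// ```
///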
/// # Arguments
/// * `variants` - A list of annotated somatic variants (sorted).
/// * `regions` - A set of genomic bins or intervals (e.g., from BED) used as modeling units (sorted).
/// * `tracked_annotations` - A list of `Annotation` values to track as binary features.
/// * `min_callers` - Minimum number of callers that must support a variant for it to be counted.
///
/// # Returns
/// A tuple:
/// * `Vec<GlmRow>` — one per region, with mutation count and feature labels.
/// * `Vec<(String, f64)>` — per-feature mutation rate (mutations per megabase).
///
/// # Example
/// ```ignore
/// let (rows, rates) = make_glm_rows_from_regions_par(
///     &variants,
///     &genome_bins,
///     &[
///         Annotation::ReplicationTiming(ReplicationClass::Early),
///         Annotation::HighDepth,
///     ],
///     2, // min_callers
/// );
///
/// for (feature, rate) in rates {
///     println!("{feature}: {:.2} mutations/Mb", rate);
/// }
/// ```
pub fn make_glm_rows_from_regions_par(
    variants: &Variants,
    regions: &[GenomeRange],
    tracked_annotations: &[Annotation],
    min_callers: u8,
) -> (Vec<GlmRow>, Vec<(String, f64)>) {
    let (glm_rows, feature_stats): (Vec<GlmRow>, BTreeMap<String, (usize, usize)>) = regions
        .par_iter()
        .map(|region| {
            let mut mutation_count = 0;
            let mut feature_set = BTreeSet::new();
            // Binary search into sorted variant list
            let start_idx = variants.data.partition_point(|v| {
                v.position.contig < region.contig
                    || (v.position.contig == region.contig
                        && v.position.position < region.range.start)
            });
            for var in &variants.data[start_idx..] {
                let pos = &var.position;
                if pos.contig != region.contig || pos.position >= region.range.end {
                    break;
                }
                if region.range.contains(&pos.position) && var.n_callers() >= min_callers {
                    mutation_count += 1;
                    for ann in &var.annotations {
                        if tracked_annotations.contains(ann) {
                            feature_set.insert(ann.to_string());
                        }
                    }
                }
            }
            let region_len = region.length() as usize;
            let row = GlmRow {
                contig: region.contig.to_string(),
                start: region.range.start,
                end: region.range.end,
                length: region_len,
                mutation_count,
                features: feature_set.clone().into_iter().collect(),
            };
            // Local (per-thread) accumulation of stats
            let mut stats = BTreeMap::new();
            for feat in &feature_set {
                stats.insert(feat.clone(), (mutation_count, region_len));
            }
            (vec![row], stats)
        })
        .reduce(
            || (Vec::new(), BTreeMap::new()),
            |(mut rows_a, mut stats_a), (mut rows_b, stats_b)| {
                rows_a.append(&mut rows_b);
                for (feat, (n, bp)) in stats_b {
                    stats_a
                        .entry(feat)
                        .and_modify(|(na, bpa)| {
                            *na += n;
                            *bpa += bp;
                        })
                        .or_insert((n, bp));
                }
                (rows_a, stats_a)
            },
        );
    // Compute mutation rates per megabase (1e6 bases) from the aggregated per-feature
    // counts and base coverage.
    let rates_per_mb: Vec<(String, f64)> = feature_stats
        .into_iter()
        .map(|(feat, (n, bp))| {
            debug!("{bp}");
            let rate = if bp > 0 {
                (n as f64) / ((bp as f64) / 1e6)
            } else {
                0.0
            };
            (feat, rate)
        })
        .collect();
    (glm_rows, rates_per_mb)
}
/// Flattens `GlmRow`s into the sorted list of distinct feature names and a flat table
/// with one-hot (0/1) encoded feature columns, sorted by contig and start.
fn flatten_glm_rows(
    rows: &[GlmRow],
) -> (Vec<String>, Vec<std::collections::HashMap<String, String>>) {
    let mut all_features: BTreeSet<String> = BTreeSet::new();
    for r in rows {
        all_features.extend(r.features.iter().cloned());
    }
    let feature_list: Vec<_> = all_features.into_iter().collect();
    let mut table: Vec<_> = rows
        .iter()
        .map(|r| {
            let mut row = std::collections::HashMap::new();
            row.insert("contig".into(), r.contig.clone());
            row.insert("start".into(), r.start.to_string());
            row.insert("end".into(), r.end.to_string());
            row.insert("length".into(), r.length.to_string());
            row.insert("mutation_count".into(), r.mutation_count.to_string());
            row.insert("log_length".into(), (r.length as f64).ln().to_string());
            for f in &feature_list {
                row.insert(
                    f.clone(),
                    if r.features.contains(f) { "1" } else { "0" }.into(),
                );
            }
            row
        })
        .collect();
    // Parallel sort by contig and start
    table.par_sort_by(|a, b| {
        let ca = a.get("contig").unwrap();
        let cb = b.get("contig").unwrap();
        let sa = a.get("start").unwrap().parse::<u32>().unwrap();
        let sb = b.get("start").unwrap().parse::<u32>().unwrap();
        (ca, sa).cmp(&(cb, sb))
    });
    (feature_list, table)
}
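
/// Writes `GlmRow`s to a CSV file: the fixed columns below, followed by one binary
/// (0/1) column per distinct feature label. For two hypothetical features `Early`
/// and `HighDepth` the header would look like:
///
/// ```ignore
/// contig,start,end,length,log_length,mutation_count,Early,HighDepth
/// ```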
pub fn write_glm_rows(all_rows: &[GlmRow], csv_path: &str) -> anyhow::Result<()> {
    let (features, flat_rows) = flatten_glm_rows(all_rows);
    let mut writer = csv::Writer::from_path(csv_path)?;
    let mut headers = vec![
        "contig",
        "start",
        "end",
        "length",
        "log_length",
        "mutation_count",
    ];
    headers.extend(features.iter().map(|s| s.as_str()));
    writer.write_record(&headers)?;
    for row in flat_rows {
        let values: Vec<_> = headers.iter().map(|&h| row.get(h).unwrap()).collect();
        writer.write_record(values)?;
    }
    writer.flush()?;
    Ok(())
}
#[cfg(test)]
mod tests {
    use log::info;

    use super::*;
    use crate::{helpers::test_init, positions::GenomeRangeExt};

    #[test]
    fn high_depths() -> anyhow::Result<()> {
        test_init();
        let config = Config::default();
        let id = "DUMCO";
        let (mut high_depth_ranges, mut low_quality_ranges) =
            somatic_depth_quality_ranges(id, &config)?;
        high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
        low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
        // let high_depth_ranges = merge_adjacent_ranges(high_depth_ranges);
        // let low_quality_ranges = merge_adjacent_ranges(low_quality_ranges);
        info!(
            "High-depth ranges: n={} bp={}\nLowQ ranges: n={} bp={}",
            high_depth_ranges.len(),
            high_depth_ranges.total_len(),
            low_quality_ranges.len(),
            low_quality_ranges.total_len(),
        );
        high_depth_ranges.iter().take(10).for_each(|e| println!("{e:?}"));
        low_quality_ranges.iter().take(10).for_each(|e| println!("{e:?}"));
        Ok(())
    }
}