//! Modkit pileup parsing and gene-level epigenetic activity computation. //! //! Reads modkit bedMethyl pileup files (BGZF + Tabix indexed) and BED4 region //! files to compute a per-gene **epigenetic activity proxy** based on the //! contrast between promoter and gene-body methylation. //! //! Region methylation is fetched **per region** via the Tabix index using //! [`fetch_tabix_lines_with`], avoiding loading the full genome-wide pileup //! (hundreds of millions of CpG sites) into memory. //! //! # Pipeline //! //! ```text //! pileup.bed.gz + .tbi ──► fetch per region (fetch_tabix_lines_with) //! regions.bed ──► read_bed4 //! │ //! compute_gene_activity //! │ //! write_gene_activity_bed9_itemrgb //! ``` //! //! # Coordinate system //! //! All coordinates are **0-based half-open `[start, end)`** throughout, //! matching the BED convention. `PileupSite.start` is a single-base position. use anyhow::{anyhow, Context, Result}; use std::{ cmp::Ordering, collections::HashMap, fs, fs::File, io::{BufWriter, Write}, path::Path, }; use crate::{ helpers::{diverging_rgb, TempFileGuard}, io::{bed::read_bed, readers::fetch_tabix_lines_with}, positions::GenomeRange, }; // ─── Types ─────────────────────────────────────────────────────────────────── /// One methylation call row from a modkit pileup (bedMethyl) file. /// /// Pileup rows are single-base intervals `[start, start+1)`. /// Only the columns needed for region aggregation are stored: /// col 2 (`start`), col 10 (`nvalid_cov`), col 12 (`nmod`). #[derive(Debug, Clone)] pub struct PileupSite { /// 0-based position (half-open interval `[start, start+1)`) pub start: u64, /// Number of valid modification calls at this site (col 10) pub nvalid_cov: u64, /// Number of modified calls at this site (col 12) pub nmod: u64, } /// One region from a BED4 file: chrom, start, end, name. /// /// Coordinates are **0-based half-open** `[start, end)`. #[derive(Debug, Clone)] pub struct BedRegion { pub chrom: String, /// 0-based inclusive start pub start: u64, /// 0-based exclusive end pub end: u64, pub name: String, } /// Promoter/body classification inferred from a region name suffix. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum RegionKind { Promoter, GeneBody, Other, } /// A region name split into gene key and [`RegionKind`]. #[derive(Debug, Clone)] pub struct ParsedName { pub gene_key: String, pub kind: RegionKind, } /// Gene-level epigenetic activity proxy computed from a single sample's pileup. /// /// Score: `log2(body_frac / prom_frac)` — positive means more active-like, /// negative means more repressed-like. Output coordinates use the **promoter** /// interval (anchors the signal near the TSS in genome browsers). #[derive(Debug, Clone)] pub struct GeneActivity { pub gene_key: String, pub chrom: String, /// Promoter start, 0-based inclusive pub start: u64, /// Promoter end, 0-based exclusive pub end: u64, pub prom_frac: f64, pub body_frac: f64, /// `log2(body_frac / prom_frac)` pub score: f64, } // ─── Parsing helpers ───────────────────────────────────────────────────────── /// Parse a region name for `_prom` or `_body` suffix. /// /// - `"FOO_prom"` → gene key `"FOO"`, kind [`RegionKind::Promoter`] /// - `"FOO_body"` → gene key `"FOO"`, kind [`RegionKind::GeneBody`] /// - anything else → gene key = full name, kind [`RegionKind::Other`] pub fn parse_region_name(name: &str) -> ParsedName { if let Some(g) = name.strip_suffix("_prom") { ParsedName { gene_key: g.to_string(), kind: RegionKind::Promoter, } } else if let Some(g) = name.strip_suffix("_body") { ParsedName { gene_key: g.to_string(), kind: RegionKind::GeneBody, } } else { ParsedName { gene_key: name.to_string(), kind: RegionKind::Other, } } } /// Read a BED4 regions file into memory. /// /// Wraps [`read_bed`], inheriting its header skipping (`#`, `track`, `browser`) /// and gzip support. Coordinates are widened from `u32` to `u64`; /// contig names are recovered via `GenomeRange::contig()`. /// /// # Errors /// /// Returns an error if the file cannot be opened or any line is malformed. pub fn read_bed4(path: &str) -> Result> { Ok(read_bed(path)? .into_iter() .map(|row| BedRegion { chrom: row.range.contig(), start: row.range.range.start as u64, end: row.range.range.end as u64, name: row.name.unwrap_or_else(|| ".".to_string()), }) .collect()) } // ─── Region methylation ─────────────────────────────────────────────────────── /// Aggregate methylation fraction over `[start, end)` from a **pre-loaded** /// sorted pileup slice. /// /// Returns `(fraction, sum_nvalid_cov, sum_nmod)` where /// `fraction = Σ Nmod / Σ Nvalid_cov`. Returns `None` if no sites overlap or /// total coverage is zero. /// /// Uses [`slice::partition_point`] for O(log n) start lookup. /// Prefer [`region_fraction_from_tabix`] for production use — this variant is /// useful when the pileup is already loaded in memory (e.g. unit tests). pub fn region_fraction_modified( chrom_sites: &[PileupSite], start: u64, end: u64, ) -> Option<(f64, u64, u64)> { let i = chrom_sites.partition_point(|s| s.start < start); let mut sum_cov = 0u64; let mut sum_mod = 0u64; for s in &chrom_sites[i..] { if s.start >= end { break; } sum_cov = sum_cov.saturating_add(s.nvalid_cov); sum_mod = sum_mod.saturating_add(s.nmod); } if sum_cov == 0 { None } else { Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod)) } } /// Fetch methylation fraction for `region` directly from a Tabix-indexed pileup. /// /// Uses [`fetch_tabix_lines_with`] to seek only to the relevant BGZF chunks — /// no full-file load. Parses col 10 (`Nvalid_cov`) and col 12 (`Nmod`) from /// each overlapping pileup line. /// /// Returns `(fraction, sum_nvalid_cov, sum_nmod)`, or `None` if no sites /// overlap the region or total coverage is zero. /// /// # Arguments /// /// * `pileup_bgz` - Path to the modkit pileup `.bed.gz`; index expected at /// `.tbi` /// * `region` - Genomic region to query (0-based half-open) /// /// # Errors /// /// Returns an error if the Tabix index cannot be read or a pileup line is /// malformed. pub fn region_fraction_from_tabix( pileup_bgz: &str, region: &GenomeRange, ) -> Result> { let mut sum_cov = 0u64; let mut sum_mod = 0u64; fetch_tabix_lines_with(pileup_bgz, region, |line| { let f: Vec<&str> = line.split('\t').collect(); let nvalid_cov: u64 = f .get(9) .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10) in {pileup_bgz}: {line}"))? .parse() .with_context(|| format!("Bad Nvalid_cov in {pileup_bgz}: {line}"))?; let nmod: u64 = f .get(11) .ok_or_else(|| anyhow!("Missing Nmod (col 12) in {pileup_bgz}: {line}"))? .parse() .with_context(|| format!("Bad Nmod in {pileup_bgz}: {line}"))?; sum_cov = sum_cov.saturating_add(nvalid_cov); sum_mod = sum_mod.saturating_add(nmod); Ok(()) })?; if sum_cov == 0 { Ok(None) } else { Ok(Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))) } } // ─── Activity computation ───────────────────────────────────────────────────── /// Compute per-gene epigenetic activity from a Tabix-indexed modkit pileup. /// /// For each gene with both a `_prom` and a `_body` region, fetches methylation /// fractions via [`region_fraction_from_tabix`] (one tabix seek per region), /// then computes `score = log2(body_frac / prom_frac)`. /// /// Genes are skipped if either fraction is 0, total coverage is below /// `min_sum_cov`, or the contig is absent from the Tabix index. /// /// Results are sorted by `(chrom, start)` for IGV compatibility. /// /// # Arguments /// /// * `pileup_bgz` - Path to the modkit pileup `.bed.gz` (Tabix-indexed) /// * `regions` - BED4 regions with `_prom` / `_body` name suffixes /// * `min_sum_cov` - Minimum total `Nvalid_cov` required per region /// /// # Errors /// /// Returns an error if any region has `end <= start` or a Tabix fetch fails. pub fn compute_gene_activity( pileup_bgz: &str, regions: &[BedRegion], min_sum_cov: u64, ) -> Result> { #[derive(Clone)] struct Part { chrom: String, start: u64, end: u64, frac: f64, } #[derive(Default)] struct Pair { prom: Option, body: Option, } let mut by_gene: HashMap = HashMap::new(); for r in regions { if r.end <= r.start { return Err(anyhow!( "Invalid region end<=start {}:{}-{} ({})", r.chrom, r.start, r.end, r.name )); } let ParsedName { gene_key, kind } = parse_region_name(&r.name); if kind == RegionKind::Other { continue; } let region = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32); let (frac, sum_cov, _) = match region_fraction_from_tabix(pileup_bgz, ®ion)? { Some(x) => x, None => continue, }; if sum_cov < min_sum_cov || frac <= 0.0 { continue; } let part = Part { chrom: r.chrom.clone(), start: r.start, end: r.end, frac, }; let entry = by_gene.entry(gene_key).or_default(); match kind { RegionKind::Promoter => entry.prom = Some(part), RegionKind::GeneBody => entry.body = Some(part), RegionKind::Other => {} } } let mut out: Vec = by_gene .into_iter() .filter_map(|(gene, pair)| { let (prom, body) = (pair.prom?, pair.body?); Some(GeneActivity { gene_key: gene, chrom: prom.chrom, start: prom.start, end: prom.end, prom_frac: prom.frac, body_frac: body.frac, score: (body.frac / prom.frac).log2(), }) }) .collect(); out.sort_by(|a, b| match a.chrom.cmp(&b.chrom) { Ordering::Equal => a.start.cmp(&b.start), o => o, }); Ok(out) } // ─── Colour / score helpers ─────────────────────────────────────────────────── /// Map a log2-ratio to a BED `score` field (0–1000). /// /// `v` is clipped to `[-clip, +clip]` then linearly scaled: /// `-clip` → 0, `0` → 500, `+clip` → 1000. pub fn bed_score_from_log2_ratio(v: f64, clip: f64) -> u16 { debug_assert!(clip > 0.0, "clip must be > 0"); let t = (v.clamp(-clip, clip) + clip) / (2.0 * clip); (t * 1000.0).round().clamp(0.0, 1000.0) as u16 } // ─── Output ─────────────────────────────────────────────────────────────────── /// Write gene activity as an IGV BED9 track with `itemRgb=On`. /// /// Colour encodes `GeneActivity::score` via [`diverging_rgb`] clipped to /// `[-clip, +clip]`. BED col 5 (`score`) holds the same quantity scaled to /// `[0, 1000]` via [`bed_score_from_log2_ratio`]. /// /// Written atomically: data goes to a UUID temp file, renamed on success. /// A [`TempFileGuard`] removes the temp file automatically on failure. /// /// # Errors /// /// Returns an error if `clip <= 0`, the file cannot be written, or the rename fails. pub fn write_gene_activity_bed9( path: &str, track_name: &str, activity: &[GeneActivity], clip: f64, ) -> Result<()> { if clip <= 0.0 { return Err(anyhow!("clip must be > 0, got {clip}")); } let tmp_dir = Path::new(path).parent().unwrap_or(Path::new(".")); let mut guard = TempFileGuard::new(); let tmp_path = guard.tmp_path("", tmp_dir); let mut w = BufWriter::new( File::create(&tmp_path) .with_context(|| format!("failed to create temp file: {}", tmp_path.display()))?, ); writeln!(w, r#"track name="{}" itemRgb="On""#, track_name)?; for a in activity { let (rr, gg, bb) = diverging_rgb(a.score, clip); writeln!( w, "{}\t{}\t{}\t{}={:.4}\t{}\t.\t{}\t{}\t{},{},{}", a.chrom, a.start, a.end, a.gene_key, a.score, bed_score_from_log2_ratio(a.score, clip), a.start, a.end, rr, gg, bb, )?; } w.flush() .with_context(|| format!("failed to flush: {}", tmp_path.display()))?; drop(w); fs::rename(&tmp_path, path) .with_context(|| format!("failed to rename {} → {path}", tmp_path.display()))?; guard.disarm(); Ok(()) } // ─── Pipeline ───────────────────────────────────────────────────────────────── /// End-to-end pipeline: Tabix pileup + BED4 regions → IGV BED9 activity track. /// /// 1. [`read_bed4`] — parse the gene promoter/body regions /// 2. [`compute_gene_activity`] — fetch per-region methylation fractions via /// Tabix and compute log2 activity scores /// 3. [`write_gene_activity_bed9`] — write the BED9 track atomically /// /// # Returns /// /// The number of gene features written. /// /// # Errors /// /// Returns an error if any pipeline step fails. pub fn pileup_to_activity_bed9( pileup_bgz: &str, regions_bed4_path: &str, min_sum_cov: u64, out_bed9_path: &str, track_name: &str, clip: f64, ) -> Result { let regions = read_bed4(regions_bed4_path)?; let activity = compute_gene_activity(pileup_bgz, ®ions, min_sum_cov)?; let n = activity.len(); write_gene_activity_bed9(out_bed9_path, track_name, &activity, clip)?; Ok(n) } #[cfg(test)] mod tests { use super::*; use crate::{config::Config, helpers::test_init}; use log::info; #[test] fn modkit_activity_igv() -> anyhow::Result<()> { test_init(); let config = Config::default(); let regions_bed4_path = "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed"; for id in ["DUMCO", "CHAHA"] { let path = format!( "{}/{}_{}_modkit_pileup.bed.gz", config.tumoral_dir(id), id, config.tumoral_name ); let out = format!( "{}/{}_{}_modkit_activity.bed", config.tumoral_dir(id), id, config.tumoral_name ); let n = pileup_to_activity_bed9( &path, regions_bed4_path, 100, &out, &format!("{id} Gene Activity"), 2.0, )?; info!("{id}: {n} gene activities written"); } Ok(()) } }