//! Modkit pileup parsing and gene-level epigenetic activity computation. //! //! Reads modkit bedMethyl pileup files (BGZF-compressed) and BED4 region //! files to compute a per-gene **epigenetic activity proxy** based on the //! contrast between promoter and gene-body methylation. //! //! # Pipeline //! //! ```text //! pileup.bed.gz ──► read_pileup_index ──► HashMap //! │ //! regions.bed ──► read_bed4 ──► [BedRegion] ─┤ //! │ //! compute_gene_activity_from_pileup //! │ //! write_gene_activity_bed9_itemrgb ◄─┘ //! ``` //! //! # Coordinate system //! //! All coordinates are **0-based half-open `[start, end)`** throughout, matching //! the BED convention. `PileupSite.start` is a single-base position (the interval //! is `[start, start+1)`). use anyhow::{anyhow, Context, Result}; use std::{ cmp::Ordering, collections::HashMap, fs, fs::File, io::{BufRead, BufWriter, Write}, path::Path, }; use crate::{helpers::TempFileGuard, io::{bed::read_bed, readers::get_gz_reader}}; /// One methylation call row from a modkit pileup (bedMethyl) file. /// /// Pileup rows are single-base intervals `[start, start+1)`. /// Only the columns needed for region aggregation are kept: /// - `start`: col 2, 0-based /// - `nvalid_cov`: col 10 (Nvalid_cov) /// - `nmod`: col 12 (Nmod) #[derive(Debug, Clone)] pub struct PileupSite { /// 0-based position (half-open interval `[start, start+1)`) pub start: u64, /// Number of valid modification calls at this site (col 10) pub nvalid_cov: u64, /// Number of modified calls at this site (col 12) pub nmod: u64, } /// One region from a BED4 file: chrom, start, end, name. /// /// Coordinates are **0-based half-open** `[start, end)`. #[derive(Debug, Clone)] pub struct BedRegion { pub chrom: String, /// 0-based inclusive start pub start: u64, /// 0-based exclusive end pub end: u64, pub name: String, } /// Promoter/body classification inferred from a region name suffix. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum RegionKind { Promoter, GeneBody, Other, } /// A region name split into gene key and [`RegionKind`]. #[derive(Debug, Clone)] pub struct ParsedName { pub gene_key: String, pub kind: RegionKind, } /// Parse a region name for `_prom` or `_body` suffix. /// /// - `"FOO_prom"` → gene key `"FOO"`, kind [`RegionKind::Promoter`] /// - `"FOO_body"` → gene key `"FOO"`, kind [`RegionKind::GeneBody`] /// - anything else → gene key = full name, kind [`RegionKind::Other`] pub fn parse_region_name(name: &str) -> ParsedName { if let Some(g) = name.strip_suffix("_prom") { ParsedName { gene_key: g.to_string(), kind: RegionKind::Promoter } } else if let Some(g) = name.strip_suffix("_body") { ParsedName { gene_key: g.to_string(), kind: RegionKind::GeneBody } } else { ParsedName { gene_key: name.to_string(), kind: RegionKind::Other } } } /// Diverging blue→white→red palette for IGV `itemRgb`. /// /// Maps `v` clipped to `[-clip, +clip]` onto the range: /// - `v = -clip` → `(0, 0, 255)` pure blue /// - `v = 0` → `(255, 255, 255)` white /// - `v = +clip` → `(255, 0, 0)` pure red pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) { debug_assert!(clip > 0.0, "clip must be > 0"); let x = v.clamp(-clip, clip); let t = (x + clip) / (2.0 * clip); if t < 0.5 { let u = t / 0.5; let r = (255.0 * u).round() as u8; let g = (255.0 * u).round() as u8; (r, g, 255) } else { let u = (t - 0.5) / 0.5; let g = (255.0 * (1.0 - u)).round() as u8; let b = (255.0 * (1.0 - u)).round() as u8; (255, g, b) } } /// Read a BED4 regions file into memory. /// /// Wraps [`read_bed`] from the `bed` module, inheriting its header skipping /// (`#`, `track`, `browser`) and gzip support. Coordinates are widened from /// `u32` to `u64`; contig names are recovered via `num_to_contig`. /// /// # Arguments /// /// * `path` - Path to the BED4 file (plain or BGZF-compressed) /// /// # Errors /// /// Returns an error if the file cannot be opened or any line is malformed. pub fn read_bed4(path: &str) -> Result> { Ok(read_bed(path)? .into_iter() .map(|row| BedRegion { chrom: row.range.contig(), start: row.range.range.start as u64, end: row.range.range.end as u64, name: row.name.unwrap_or_else(|| ".".to_string()), }) .collect()) } /// Read a modkit pileup file and index sites per chromosome. /// /// Parses columns 1 (chrom), 2 (start), 10 (Nvalid_cov), 12 (Nmod). /// Sites with `Nvalid_cov = 0` are dropped. The per-chromosome vectors are /// sorted by position to enable binary search in [`region_fraction_modified`]. /// /// # Arguments /// /// * `path` - Path to the modkit bedMethyl pileup file (BGZF-compressed) /// /// # Errors /// /// Returns an error if the file cannot be opened or any data line is malformed. pub fn read_pileup_index(path: &str) -> Result>> { let mut map: HashMap> = HashMap::new(); for (i, line) in get_gz_reader(path)?.lines().enumerate() { let line = line.with_context(|| format!("I/O error at line {}", i + 1))?; if line.starts_with('#') || line.is_empty() { continue; } let f: Vec<&str> = line.split('\t').collect(); let chrom = f.get(0) .ok_or_else(|| anyhow!("Missing chrom at line {}", i + 1))?.to_string(); let start: u64 = f.get(1) .ok_or_else(|| anyhow!("Missing start at line {}", i + 1))? .parse().with_context(|| format!("Bad start at line {}", i + 1))?; let nvalid_cov: u64 = f.get(9) .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10) at line {}", i + 1))? .parse().with_context(|| format!("Bad Nvalid_cov at line {}", i + 1))?; let nmod: u64 = f.get(11) .ok_or_else(|| anyhow!("Missing Nmod (col 12) at line {}", i + 1))? .parse().with_context(|| format!("Bad Nmod at line {}", i + 1))?; if nvalid_cov == 0 { continue; } map.entry(chrom).or_default().push(PileupSite { start, nvalid_cov, nmod }); } for v in map.values_mut() { v.sort_by_key(|s| s.start); } Ok(map) } /// Aggregate methylation fraction over `[start, end)` from a sorted pileup slice. /// /// Returns `(fraction, sum_nvalid_cov, sum_nmod)` where /// `fraction = sum(Nmod) / sum(Nvalid_cov)`. Returns `None` if no sites overlap /// or if total coverage is zero. /// /// Uses [`slice::partition_point`] for O(log n) start lookup. pub fn region_fraction_modified( chrom_sites: &[PileupSite], start: u64, end: u64, ) -> Option<(f64, u64, u64)> { let i = chrom_sites.partition_point(|s| s.start < start); let mut sum_cov: u64 = 0; let mut sum_mod: u64 = 0; for s in &chrom_sites[i..] { if s.start >= end { break; } sum_cov = sum_cov.saturating_add(s.nvalid_cov); sum_mod = sum_mod.saturating_add(s.nmod); } if sum_cov == 0 { None } else { Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod)) } } /// Gene-level epigenetic activity proxy computed from a single sample's pileup. /// /// Score: `log2(body_frac / prom_frac)` where both fractions are /// coverage-weighted methylation ratios over the respective regions. /// /// - Score > 0: gene-body methylation exceeds promoter → "active-like" /// - Score < 0: promoter methylation exceeds gene-body → "repressed-like" /// /// Output coordinates use the **promoter** interval (anchors to TSS in genome browsers). #[derive(Debug, Clone)] pub struct GeneActivity { pub gene_key: String, pub chrom: String, /// Promoter start, 0-based pub start: u64, /// Promoter end, 0-based exclusive pub end: u64, pub prom_frac: f64, pub body_frac: f64, /// `log2(body_frac / prom_frac)` pub score: f64, } /// Compute per-gene epigenetic activity from a modkit pileup and BED4 regions. /// /// For each gene with both a `_prom` and a `_body` region: /// 1. Aggregate methylation fraction `f(R) = Σ Nmod / Σ Nvalid_cov` per region /// 2. Skip genes where either fraction is 0 or coverage is below `min_sum_cov` /// 3. Compute `score = log2(body_frac / prom_frac)` /// /// Results are sorted by `(chrom, start)` for IGV compatibility. /// /// # Arguments /// /// * `pileup` - Per-chromosome pileup sites from [`read_pileup_index`] /// * `regions` - BED4 regions with `_prom` / `_body` name suffixes /// * `min_sum_cov` - Minimum total `Nvalid_cov` required per region /// /// # Errors /// /// Returns an error if any region has `end <= start`. pub fn compute_gene_activity_from_pileup( pileup: &HashMap>, regions: &[BedRegion], min_sum_cov: u64, ) -> Result> { #[derive(Clone)] struct Part { chrom: String, start: u64, end: u64, frac: f64, sum_cov: u64 } #[derive(Default)] struct Pair { prom: Option, body: Option } let mut by_gene: HashMap = HashMap::new(); for r in regions { if r.end <= r.start { return Err(anyhow!( "Invalid region end<=start {}:{}-{} ({})", r.chrom, r.start, r.end, r.name )); } let ParsedName { gene_key, kind } = parse_region_name(&r.name); if kind == RegionKind::Other { continue; } let chrom_sites = match pileup.get(&r.chrom) { Some(v) => v, None => continue }; let (frac, sum_cov, _) = match region_fraction_modified(chrom_sites, r.start, r.end) { Some(x) => x, None => continue, }; if sum_cov < min_sum_cov || frac <= 0.0 { continue; } let part = Part { chrom: r.chrom.clone(), start: r.start, end: r.end, frac, sum_cov }; let entry = by_gene.entry(gene_key).or_default(); match kind { RegionKind::Promoter => entry.prom = Some(part), RegionKind::GeneBody => entry.body = Some(part), RegionKind::Other => {} } } let mut out = Vec::new(); for (gene, pair) in by_gene { let (prom, body) = match (pair.prom, pair.body) { (Some(p), Some(b)) => (p, b), _ => continue, }; out.push(GeneActivity { gene_key: gene, chrom: prom.chrom, start: prom.start, end: prom.end, prom_frac: prom.frac, body_frac: body.frac, score: (body.frac / prom.frac).log2(), }); } out.sort_by(|a, b| match a.chrom.cmp(&b.chrom) { Ordering::Equal => a.start.cmp(&b.start), o => o, }); Ok(out) } /// Convert a log2-ratio to a BED `score` field (0–1000). /// /// `v` is clipped to `[-clip, +clip]` then linearly mapped: /// `-clip` → 0, `0` → 500, `+clip` → 1000. pub fn bed_score_from_log2_ratio(v: f64, clip: f64) -> u16 { debug_assert!(clip > 0.0, "clip must be > 0"); let t = (v.clamp(-clip, clip) + clip) / (2.0 * clip); (t * 1000.0).round().clamp(0.0, 1000.0) as u16 } /// Write gene activity as an IGV BED9 track with `itemRgb=On`. /// /// Color encodes `GeneActivity::score` (`log2(body/prom)`) via /// [`diverging_rgb`] clipped to `[-clip, +clip]`. The BED `score` column /// (col 5) encodes the same quantity scaled to `[0, 1000]` via /// [`bed_score_from_log2_ratio`]. /// /// Output is written atomically: data goes to a UUID temp file in the same /// directory as `path`, then renamed on success. A [`TempFileGuard`] ensures /// the temp file is removed automatically on failure. /// /// # Arguments /// /// * `path` - Destination BED9 file path /// * `track_name` - IGV track name shown in the track label /// * `activity` - Gene activity records (produced by [`compute_gene_activity_from_pileup`]) /// * `clip` - Symmetric clip value for color and score mapping; must be > 0 /// /// # Errors /// /// Returns an error if `clip <= 0`, the file cannot be written, or the atomic /// rename fails. pub fn write_gene_activity_bed9_itemrgb( path: &str, track_name: &str, activity: &[GeneActivity], clip: f64, ) -> Result<()> { if clip <= 0.0 { return Err(anyhow!("clip must be > 0, got {clip}")); } let tmp_dir = Path::new(path).parent().unwrap_or(Path::new(".")); let mut guard = TempFileGuard::new(); let tmp_path = guard.tmp_path("", tmp_dir); let out = File::create(&tmp_path) .with_context(|| format!("failed to create temp file: {}", tmp_path.display()))?; let mut w = BufWriter::new(out); writeln!(w, r#"track name="{}" itemRgb="On""#, track_name)?; for a in activity { let (rr, gg, bb) = diverging_rgb(a.score, clip); let bed_score = bed_score_from_log2_ratio(a.score, clip); writeln!( w, "{}\t{}\t{}\t{}={:.4}\t{}\t.\t{}\t{}\t{},{},{}", a.chrom, a.start, a.end, a.gene_key, a.score, bed_score, a.start, a.end, rr, gg, bb, )?; } w.flush().with_context(|| format!("failed to flush: {}", tmp_path.display()))?; drop(w); fs::rename(&tmp_path, path) .with_context(|| format!("failed to rename {} → {path}", tmp_path.display()))?; guard.disarm(); Ok(()) } /// End-to-end helper: pileup + BED4 regions → IGV BED9 activity track. /// /// Runs the full pipeline in order: /// 1. [`read_pileup_index`] — parse and index the modkit pileup /// 2. [`read_bed4`] — parse the gene promoter/body regions /// 3. [`compute_gene_activity_from_pileup`] — compute log2 activity scores /// 4. [`write_gene_activity_bed9_itemrgb`] — write the BED9 track atomically /// /// # Returns /// /// The number of gene features written. /// /// # Errors /// /// Returns an error if any pipeline step fails. pub fn pileup_regions_to_activity_bed9_itemrgb( pileup_path: &str, regions_bed4_path: &str, min_sum_cov: u64, out_bed9_path: &str, track_name: &str, clip: f64, ) -> Result { let pile = read_pileup_index(pileup_path)?; let regions = read_bed4(regions_bed4_path)?; let activity = compute_gene_activity_from_pileup(&pile, ®ions, min_sum_cov)?; let n = activity.len(); write_gene_activity_bed9_itemrgb(out_bed9_path, track_name, &activity, clip)?; Ok(n) } #[cfg(test)] mod tests { use log::info; use super::*; use crate::{config::Config, helpers::test_init}; #[test] fn modkit_activity_igv() -> anyhow::Result<()> { test_init(); let config = Config::default(); let regions_bed4_path = "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed"; for id in ["DUMCO", "CHAHA"] { let path = format!( "{}/{}_{}_modkit_pileup.bed.gz", config.tumoral_dir(id), id, config.tumoral_name ); let out = format!( "{}/{}_{}_modkit_activity.bed", config.tumoral_dir(id), id, config.tumoral_name ); let n = pileup_regions_to_activity_bed9_itemrgb( &path, regions_bed4_path, 100, &out, &format!("{id} Gene Activity"), 2.0, )?; info!("{id}: {n} gene activities written"); } Ok(()) } }