| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474 |
- //! Modkit pileup parsing and gene-level epigenetic activity computation.
- //!
- //! Reads modkit bedMethyl pileup files (BGZF-compressed) and BED4 region
- //! files to compute a per-gene **epigenetic activity proxy** based on the
- //! contrast between promoter and gene-body methylation.
- //!
- //! # Pipeline
- //!
- //! ```text
- //! pileup.bed.gz ──► read_pileup_index ──► HashMap<chrom, [PileupSite]>
- //! │
- //! regions.bed ──► read_bed4 ──► [BedRegion] ─┤
- //! │
- //! compute_gene_activity_from_pileup
- //! │
- //! write_gene_activity_bed9_itemrgb ◄─┘
- //! ```
- //!
- //! # Coordinate system
- //!
- //! All coordinates are **0-based half-open `[start, end)`** throughout, matching
- //! the BED convention. `PileupSite.start` is a single-base position (the interval
- //! is `[start, start+1)`).
- use anyhow::{anyhow, Context, Result};
- use std::{
- cmp::Ordering,
- collections::HashMap,
- fs,
- fs::File,
- io::{BufRead, BufWriter, Write},
- path::Path,
- };
- use crate::{helpers::TempFileGuard, io::{bed::read_bed, readers::get_gz_reader}};
- /// One methylation call row from a modkit pileup (bedMethyl) file.
- ///
- /// Pileup rows are single-base intervals `[start, start+1)`.
- /// Only the columns needed for region aggregation are kept:
- /// - `start`: col 2, 0-based
- /// - `nvalid_cov`: col 10 (Nvalid_cov)
- /// - `nmod`: col 12 (Nmod)
- #[derive(Debug, Clone)]
- pub struct PileupSite {
- /// 0-based position (half-open interval `[start, start+1)`)
- pub start: u64,
- /// Number of valid modification calls at this site (col 10)
- pub nvalid_cov: u64,
- /// Number of modified calls at this site (col 12)
- pub nmod: u64,
- }
- /// One region from a BED4 file: chrom, start, end, name.
- ///
- /// Coordinates are **0-based half-open** `[start, end)`.
- #[derive(Debug, Clone)]
- pub struct BedRegion {
- pub chrom: String,
- /// 0-based inclusive start
- pub start: u64,
- /// 0-based exclusive end
- pub end: u64,
- pub name: String,
- }
- /// Promoter/body classification inferred from a region name suffix.
- #[derive(Debug, Clone, Copy, PartialEq, Eq)]
- pub enum RegionKind {
- Promoter,
- GeneBody,
- Other,
- }
- /// A region name split into gene key and [`RegionKind`].
- #[derive(Debug, Clone)]
- pub struct ParsedName {
- pub gene_key: String,
- pub kind: RegionKind,
- }
- /// Parse a region name for `_prom` or `_body` suffix.
- ///
- /// - `"FOO_prom"` → gene key `"FOO"`, kind [`RegionKind::Promoter`]
- /// - `"FOO_body"` → gene key `"FOO"`, kind [`RegionKind::GeneBody`]
- /// - anything else → gene key = full name, kind [`RegionKind::Other`]
- pub fn parse_region_name(name: &str) -> ParsedName {
- if let Some(g) = name.strip_suffix("_prom") {
- ParsedName { gene_key: g.to_string(), kind: RegionKind::Promoter }
- } else if let Some(g) = name.strip_suffix("_body") {
- ParsedName { gene_key: g.to_string(), kind: RegionKind::GeneBody }
- } else {
- ParsedName { gene_key: name.to_string(), kind: RegionKind::Other }
- }
- }
- /// Diverging blue→white→red palette for IGV `itemRgb`.
- ///
- /// Maps `v` clipped to `[-clip, +clip]` onto the range:
- /// - `v = -clip` → `(0, 0, 255)` pure blue
- /// - `v = 0` → `(255, 255, 255)` white
- /// - `v = +clip` → `(255, 0, 0)` pure red
- pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
- debug_assert!(clip > 0.0, "clip must be > 0");
- let x = v.clamp(-clip, clip);
- let t = (x + clip) / (2.0 * clip);
- if t < 0.5 {
- let u = t / 0.5;
- let r = (255.0 * u).round() as u8;
- let g = (255.0 * u).round() as u8;
- (r, g, 255)
- } else {
- let u = (t - 0.5) / 0.5;
- let g = (255.0 * (1.0 - u)).round() as u8;
- let b = (255.0 * (1.0 - u)).round() as u8;
- (255, g, b)
- }
- }
- /// Read a BED4 regions file into memory.
- ///
- /// Wraps [`read_bed`] from the `bed` module, inheriting its header skipping
- /// (`#`, `track`, `browser`) and gzip support. Coordinates are widened from
- /// `u32` to `u64`; contig names are recovered via `num_to_contig`.
- ///
- /// # Arguments
- ///
- /// * `path` - Path to the BED4 file (plain or BGZF-compressed)
- ///
- /// # Errors
- ///
- /// Returns an error if the file cannot be opened or any line is malformed.
- pub fn read_bed4(path: &str) -> Result<Vec<BedRegion>> {
- Ok(read_bed(path)?
- .into_iter()
- .map(|row| BedRegion {
- chrom: row.range.contig(),
- start: row.range.range.start as u64,
- end: row.range.range.end as u64,
- name: row.name.unwrap_or_else(|| ".".to_string()),
- })
- .collect())
- }
- /// Read a modkit pileup file and index sites per chromosome.
- ///
- /// Parses columns 1 (chrom), 2 (start), 10 (Nvalid_cov), 12 (Nmod).
- /// Sites with `Nvalid_cov = 0` are dropped. The per-chromosome vectors are
- /// sorted by position to enable binary search in [`region_fraction_modified`].
- ///
- /// # Arguments
- ///
- /// * `path` - Path to the modkit bedMethyl pileup file (BGZF-compressed)
- ///
- /// # Errors
- ///
- /// Returns an error if the file cannot be opened or any data line is malformed.
- pub fn read_pileup_index(path: &str) -> Result<HashMap<String, Vec<PileupSite>>> {
- let mut map: HashMap<String, Vec<PileupSite>> = HashMap::new();
- for (i, line) in get_gz_reader(path)?.lines().enumerate() {
- let line = line.with_context(|| format!("I/O error at line {}", i + 1))?;
- if line.starts_with('#') || line.is_empty() {
- continue;
- }
- let f: Vec<&str> = line.split('\t').collect();
- let chrom = f.get(0)
- .ok_or_else(|| anyhow!("Missing chrom at line {}", i + 1))?.to_string();
- let start: u64 = f.get(1)
- .ok_or_else(|| anyhow!("Missing start at line {}", i + 1))?
- .parse().with_context(|| format!("Bad start at line {}", i + 1))?;
- let nvalid_cov: u64 = f.get(9)
- .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10) at line {}", i + 1))?
- .parse().with_context(|| format!("Bad Nvalid_cov at line {}", i + 1))?;
- let nmod: u64 = f.get(11)
- .ok_or_else(|| anyhow!("Missing Nmod (col 12) at line {}", i + 1))?
- .parse().with_context(|| format!("Bad Nmod at line {}", i + 1))?;
- if nvalid_cov == 0 {
- continue;
- }
- map.entry(chrom).or_default().push(PileupSite { start, nvalid_cov, nmod });
- }
- for v in map.values_mut() {
- v.sort_by_key(|s| s.start);
- }
- Ok(map)
- }
- /// Aggregate methylation fraction over `[start, end)` from a sorted pileup slice.
- ///
- /// Returns `(fraction, sum_nvalid_cov, sum_nmod)` where
- /// `fraction = sum(Nmod) / sum(Nvalid_cov)`. Returns `None` if no sites overlap
- /// or if total coverage is zero.
- ///
- /// Uses [`slice::partition_point`] for O(log n) start lookup.
- pub fn region_fraction_modified(
- chrom_sites: &[PileupSite],
- start: u64,
- end: u64,
- ) -> Option<(f64, u64, u64)> {
- let i = chrom_sites.partition_point(|s| s.start < start);
- let mut sum_cov: u64 = 0;
- let mut sum_mod: u64 = 0;
- for s in &chrom_sites[i..] {
- if s.start >= end {
- break;
- }
- sum_cov = sum_cov.saturating_add(s.nvalid_cov);
- sum_mod = sum_mod.saturating_add(s.nmod);
- }
- if sum_cov == 0 {
- None
- } else {
- Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))
- }
- }
- /// Gene-level epigenetic activity proxy computed from a single sample's pileup.
- ///
- /// Score: `log2(body_frac / prom_frac)` where both fractions are
- /// coverage-weighted methylation ratios over the respective regions.
- ///
- /// - Score > 0: gene-body methylation exceeds promoter → "active-like"
- /// - Score < 0: promoter methylation exceeds gene-body → "repressed-like"
- ///
- /// Output coordinates use the **promoter** interval (anchors to TSS in genome browsers).
- #[derive(Debug, Clone)]
- pub struct GeneActivity {
- pub gene_key: String,
- pub chrom: String,
- /// Promoter start, 0-based
- pub start: u64,
- /// Promoter end, 0-based exclusive
- pub end: u64,
- pub prom_frac: f64,
- pub body_frac: f64,
- /// `log2(body_frac / prom_frac)`
- pub score: f64,
- }
- /// Compute per-gene epigenetic activity from a modkit pileup and BED4 regions.
- ///
- /// For each gene with both a `_prom` and a `_body` region:
- /// 1. Aggregate methylation fraction `f(R) = Σ Nmod / Σ Nvalid_cov` per region
- /// 2. Skip genes where either fraction is 0 or coverage is below `min_sum_cov`
- /// 3. Compute `score = log2(body_frac / prom_frac)`
- ///
- /// Results are sorted by `(chrom, start)` for IGV compatibility.
- ///
- /// # Arguments
- ///
- /// * `pileup` - Per-chromosome pileup sites from [`read_pileup_index`]
- /// * `regions` - BED4 regions with `_prom` / `_body` name suffixes
- /// * `min_sum_cov` - Minimum total `Nvalid_cov` required per region
- ///
- /// # Errors
- ///
- /// Returns an error if any region has `end <= start`.
- pub fn compute_gene_activity_from_pileup(
- pileup: &HashMap<String, Vec<PileupSite>>,
- regions: &[BedRegion],
- min_sum_cov: u64,
- ) -> Result<Vec<GeneActivity>> {
- #[derive(Clone)]
- struct Part { chrom: String, start: u64, end: u64, frac: f64, sum_cov: u64 }
- #[derive(Default)]
- struct Pair { prom: Option<Part>, body: Option<Part> }
- let mut by_gene: HashMap<String, Pair> = HashMap::new();
- for r in regions {
- if r.end <= r.start {
- return Err(anyhow!(
- "Invalid region end<=start {}:{}-{} ({})",
- r.chrom, r.start, r.end, r.name
- ));
- }
- let ParsedName { gene_key, kind } = parse_region_name(&r.name);
- if kind == RegionKind::Other { continue; }
- let chrom_sites = match pileup.get(&r.chrom) { Some(v) => v, None => continue };
- let (frac, sum_cov, _) = match region_fraction_modified(chrom_sites, r.start, r.end) {
- Some(x) => x,
- None => continue,
- };
- if sum_cov < min_sum_cov || frac <= 0.0 { continue; }
- let part = Part { chrom: r.chrom.clone(), start: r.start, end: r.end, frac, sum_cov };
- let entry = by_gene.entry(gene_key).or_default();
- match kind {
- RegionKind::Promoter => entry.prom = Some(part),
- RegionKind::GeneBody => entry.body = Some(part),
- RegionKind::Other => {}
- }
- }
- let mut out = Vec::new();
- for (gene, pair) in by_gene {
- let (prom, body) = match (pair.prom, pair.body) {
- (Some(p), Some(b)) => (p, b),
- _ => continue,
- };
- out.push(GeneActivity {
- gene_key: gene,
- chrom: prom.chrom,
- start: prom.start,
- end: prom.end,
- prom_frac: prom.frac,
- body_frac: body.frac,
- score: (body.frac / prom.frac).log2(),
- });
- }
- out.sort_by(|a, b| match a.chrom.cmp(&b.chrom) {
- Ordering::Equal => a.start.cmp(&b.start),
- o => o,
- });
- Ok(out)
- }
- /// Convert a log2-ratio to a BED `score` field (0–1000).
- ///
- /// `v` is clipped to `[-clip, +clip]` then linearly mapped:
- /// `-clip` → 0, `0` → 500, `+clip` → 1000.
- pub fn bed_score_from_log2_ratio(v: f64, clip: f64) -> u16 {
- debug_assert!(clip > 0.0, "clip must be > 0");
- let t = (v.clamp(-clip, clip) + clip) / (2.0 * clip);
- (t * 1000.0).round().clamp(0.0, 1000.0) as u16
- }
- /// Write gene activity as an IGV BED9 track with `itemRgb=On`.
- ///
- /// Color encodes `GeneActivity::score` (`log2(body/prom)`) via
- /// [`diverging_rgb`] clipped to `[-clip, +clip]`. The BED `score` column
- /// (col 5) encodes the same quantity scaled to `[0, 1000]` via
- /// [`bed_score_from_log2_ratio`].
- ///
- /// Output is written atomically: data goes to a UUID temp file in the same
- /// directory as `path`, then renamed on success. A [`TempFileGuard`] ensures
- /// the temp file is removed automatically on failure.
- ///
- /// # Arguments
- ///
- /// * `path` - Destination BED9 file path
- /// * `track_name` - IGV track name shown in the track label
- /// * `activity` - Gene activity records (produced by [`compute_gene_activity_from_pileup`])
- /// * `clip` - Symmetric clip value for color and score mapping; must be > 0
- ///
- /// # Errors
- ///
- /// Returns an error if `clip <= 0`, the file cannot be written, or the atomic
- /// rename fails.
- pub fn write_gene_activity_bed9_itemrgb(
- path: &str,
- track_name: &str,
- activity: &[GeneActivity],
- clip: f64,
- ) -> Result<()> {
- if clip <= 0.0 {
- return Err(anyhow!("clip must be > 0, got {clip}"));
- }
- let tmp_dir = Path::new(path).parent().unwrap_or(Path::new("."));
- let mut guard = TempFileGuard::new();
- let tmp_path = guard.tmp_path("", tmp_dir);
- let out = File::create(&tmp_path)
- .with_context(|| format!("failed to create temp file: {}", tmp_path.display()))?;
- let mut w = BufWriter::new(out);
- writeln!(w, r#"track name="{}" itemRgb="On""#, track_name)?;
- for a in activity {
- let (rr, gg, bb) = diverging_rgb(a.score, clip);
- let bed_score = bed_score_from_log2_ratio(a.score, clip);
- writeln!(
- w,
- "{}\t{}\t{}\t{}={:.4}\t{}\t.\t{}\t{}\t{},{},{}",
- a.chrom, a.start, a.end,
- a.gene_key, a.score,
- bed_score,
- a.start, a.end,
- rr, gg, bb,
- )?;
- }
- w.flush().with_context(|| format!("failed to flush: {}", tmp_path.display()))?;
- drop(w);
- fs::rename(&tmp_path, path)
- .with_context(|| format!("failed to rename {} → {path}", tmp_path.display()))?;
- guard.disarm();
- Ok(())
- }
- /// End-to-end helper: pileup + BED4 regions → IGV BED9 activity track.
- ///
- /// Runs the full pipeline in order:
- /// 1. [`read_pileup_index`] — parse and index the modkit pileup
- /// 2. [`read_bed4`] — parse the gene promoter/body regions
- /// 3. [`compute_gene_activity_from_pileup`] — compute log2 activity scores
- /// 4. [`write_gene_activity_bed9_itemrgb`] — write the BED9 track atomically
- ///
- /// # Returns
- ///
- /// The number of gene features written.
- ///
- /// # Errors
- ///
- /// Returns an error if any pipeline step fails.
- pub fn pileup_regions_to_activity_bed9_itemrgb(
- pileup_path: &str,
- regions_bed4_path: &str,
- min_sum_cov: u64,
- out_bed9_path: &str,
- track_name: &str,
- clip: f64,
- ) -> Result<usize> {
- let pile = read_pileup_index(pileup_path)?;
- let regions = read_bed4(regions_bed4_path)?;
- let activity = compute_gene_activity_from_pileup(&pile, ®ions, min_sum_cov)?;
- let n = activity.len();
- write_gene_activity_bed9_itemrgb(out_bed9_path, track_name, &activity, clip)?;
- Ok(n)
- }
- #[cfg(test)]
- mod tests {
- use log::info;
- use super::*;
- use crate::{config::Config, helpers::test_init};
- #[test]
- fn modkit_activity_igv() -> anyhow::Result<()> {
- test_init();
- let config = Config::default();
- let regions_bed4_path =
- "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed";
- for id in ["DUMCO", "CHAHA"] {
- let path = format!(
- "{}/{}_{}_modkit_pileup.bed.gz",
- config.tumoral_dir(id), id, config.tumoral_name
- );
- let out = format!(
- "{}/{}_{}_modkit_activity.bed",
- config.tumoral_dir(id), id, config.tumoral_name
- );
- let n = pileup_regions_to_activity_bed9_itemrgb(
- &path, regions_bed4_path, 100, &out,
- &format!("{id} Gene Activity"), 2.0,
- )?;
- info!("{id}: {n} gene activities written");
- }
- Ok(())
- }
- }
|