| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492 |
- //! Modkit pileup parsing and gene-level epigenetic activity computation.
- //!
- //! Reads modkit bedMethyl pileup files (BGZF + Tabix indexed) and BED4 region
- //! files to compute a per-gene **epigenetic activity proxy** based on the
- //! contrast between promoter and gene-body methylation.
- //!
- //! Region methylation is fetched **per region** via the Tabix index using
- //! [`fetch_tabix_lines_with`], avoiding loading the full genome-wide pileup
- //! (hundreds of millions of CpG sites) into memory.
- //!
- //! # Pipeline
- //!
- //! ```text
- //! pileup.bed.gz + .tbi ──► fetch per region (fetch_tabix_lines_with)
- //! regions.bed ──► read_bed4
- //! │
- //! compute_gene_activity
- //! │
- //! write_gene_activity_bed9_itemrgb
- //! ```
- //!
- //! # Coordinate system
- //!
- //! All coordinates are **0-based half-open `[start, end)`** throughout,
- //! matching the BED convention. `PileupSite.start` is a single-base position.
- use anyhow::{anyhow, Context, Result};
- use std::{
- cmp::Ordering,
- collections::HashMap,
- fs,
- fs::File,
- io::{BufWriter, Write},
- path::Path,
- };
- use crate::{
- helpers::{diverging_rgb, TempFileGuard},
- io::{bed::read_bed, readers::fetch_tabix_lines_with},
- positions::GenomeRange,
- };
- // ─── Types ───────────────────────────────────────────────────────────────────
- /// One methylation call row from a modkit pileup (bedMethyl) file.
- ///
- /// Pileup rows are single-base intervals `[start, start+1)`.
- /// Only the columns needed for region aggregation are stored:
- /// col 2 (`start`), col 10 (`nvalid_cov`), col 12 (`nmod`).
- #[derive(Debug, Clone)]
- pub struct PileupSite {
- /// 0-based position (half-open interval `[start, start+1)`)
- pub start: u64,
- /// Number of valid modification calls at this site (col 10)
- pub nvalid_cov: u64,
- /// Number of modified calls at this site (col 12)
- pub nmod: u64,
- }
- /// One region from a BED4 file: chrom, start, end, name.
- ///
- /// Coordinates are **0-based half-open** `[start, end)`.
- #[derive(Debug, Clone)]
- pub struct BedRegion {
- pub chrom: String,
- /// 0-based inclusive start
- pub start: u64,
- /// 0-based exclusive end
- pub end: u64,
- pub name: String,
- }
- /// Promoter/body classification inferred from a region name suffix.
- #[derive(Debug, Clone, Copy, PartialEq, Eq)]
- pub enum RegionKind {
- Promoter,
- GeneBody,
- Other,
- }
- /// A region name split into gene key and [`RegionKind`].
- #[derive(Debug, Clone)]
- pub struct ParsedName {
- pub gene_key: String,
- pub kind: RegionKind,
- }
- /// Gene-level epigenetic activity proxy computed from a single sample's pileup.
- ///
- /// Score: `log2(body_frac / prom_frac)` — positive means more active-like,
- /// negative means more repressed-like. Output coordinates use the **promoter**
- /// interval (anchors the signal near the TSS in genome browsers).
- #[derive(Debug, Clone)]
- pub struct GeneActivity {
- pub gene_key: String,
- pub chrom: String,
- /// Promoter start, 0-based inclusive
- pub start: u64,
- /// Promoter end, 0-based exclusive
- pub end: u64,
- pub prom_frac: f64,
- pub body_frac: f64,
- /// `log2(body_frac / prom_frac)`
- pub score: f64,
- }
- // ─── Parsing helpers ─────────────────────────────────────────────────────────
- /// Parse a region name for `_prom` or `_body` suffix.
- ///
- /// - `"FOO_prom"` → gene key `"FOO"`, kind [`RegionKind::Promoter`]
- /// - `"FOO_body"` → gene key `"FOO"`, kind [`RegionKind::GeneBody`]
- /// - anything else → gene key = full name, kind [`RegionKind::Other`]
- pub fn parse_region_name(name: &str) -> ParsedName {
- if let Some(g) = name.strip_suffix("_prom") {
- ParsedName {
- gene_key: g.to_string(),
- kind: RegionKind::Promoter,
- }
- } else if let Some(g) = name.strip_suffix("_body") {
- ParsedName {
- gene_key: g.to_string(),
- kind: RegionKind::GeneBody,
- }
- } else {
- ParsedName {
- gene_key: name.to_string(),
- kind: RegionKind::Other,
- }
- }
- }
- /// Read a BED4 regions file into memory.
- ///
- /// Wraps [`read_bed`], inheriting its header skipping (`#`, `track`, `browser`)
- /// and gzip support. Coordinates are widened from `u32` to `u64`;
- /// contig names are recovered via `GenomeRange::contig()`.
- ///
- /// # Errors
- ///
- /// Returns an error if the file cannot be opened or any line is malformed.
- pub fn read_bed4(path: &str) -> Result<Vec<BedRegion>> {
- Ok(read_bed(path)?
- .into_iter()
- .map(|row| BedRegion {
- chrom: row.range.contig(),
- start: row.range.range.start as u64,
- end: row.range.range.end as u64,
- name: row.name.unwrap_or_else(|| ".".to_string()),
- })
- .collect())
- }
- // ─── Region methylation ───────────────────────────────────────────────────────
- /// Aggregate methylation fraction over `[start, end)` from a **pre-loaded**
- /// sorted pileup slice.
- ///
- /// Returns `(fraction, sum_nvalid_cov, sum_nmod)` where
- /// `fraction = Σ Nmod / Σ Nvalid_cov`. Returns `None` if no sites overlap or
- /// total coverage is zero.
- ///
- /// Uses [`slice::partition_point`] for O(log n) start lookup.
- /// Prefer [`region_fraction_from_tabix`] for production use — this variant is
- /// useful when the pileup is already loaded in memory (e.g. unit tests).
- pub fn region_fraction_modified(
- chrom_sites: &[PileupSite],
- start: u64,
- end: u64,
- ) -> Option<(f64, u64, u64)> {
- let i = chrom_sites.partition_point(|s| s.start < start);
- let mut sum_cov = 0u64;
- let mut sum_mod = 0u64;
- for s in &chrom_sites[i..] {
- if s.start >= end {
- break;
- }
- sum_cov = sum_cov.saturating_add(s.nvalid_cov);
- sum_mod = sum_mod.saturating_add(s.nmod);
- }
- if sum_cov == 0 {
- None
- } else {
- Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))
- }
- }
- /// Fetch methylation fraction for `region` directly from a Tabix-indexed pileup.
- ///
- /// Uses [`fetch_tabix_lines_with`] to seek only to the relevant BGZF chunks —
- /// no full-file load. Parses col 10 (`Nvalid_cov`) and col 12 (`Nmod`) from
- /// each overlapping pileup line.
- ///
- /// Returns `(fraction, sum_nvalid_cov, sum_nmod)`, or `None` if no sites
- /// overlap the region or total coverage is zero.
- ///
- /// # Arguments
- ///
- /// * `pileup_bgz` - Path to the modkit pileup `.bed.gz`; index expected at
- /// `<pileup_bgz>.tbi`
- /// * `region` - Genomic region to query (0-based half-open)
- ///
- /// # Errors
- ///
- /// Returns an error if the Tabix index cannot be read or a pileup line is
- /// malformed.
- pub fn region_fraction_from_tabix(
- pileup_bgz: &str,
- region: &GenomeRange,
- ) -> Result<Option<(f64, u64, u64)>> {
- let mut sum_cov = 0u64;
- let mut sum_mod = 0u64;
- fetch_tabix_lines_with(pileup_bgz, region, |line| {
- let f: Vec<&str> = line.split('\t').collect();
- let nvalid_cov: u64 = f
- .get(9)
- .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10) in {pileup_bgz}: {line}"))?
- .parse()
- .with_context(|| format!("Bad Nvalid_cov in {pileup_bgz}: {line}"))?;
- let nmod: u64 = f
- .get(11)
- .ok_or_else(|| anyhow!("Missing Nmod (col 12) in {pileup_bgz}: {line}"))?
- .parse()
- .with_context(|| format!("Bad Nmod in {pileup_bgz}: {line}"))?;
- sum_cov = sum_cov.saturating_add(nvalid_cov);
- sum_mod = sum_mod.saturating_add(nmod);
- Ok(())
- })?;
- if sum_cov == 0 {
- Ok(None)
- } else {
- Ok(Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod)))
- }
- }
- // ─── Activity computation ─────────────────────────────────────────────────────
- /// Compute per-gene epigenetic activity from a Tabix-indexed modkit pileup.
- ///
- /// For each gene with both a `_prom` and a `_body` region, fetches methylation
- /// fractions via [`region_fraction_from_tabix`] (one tabix seek per region),
- /// then computes `score = log2(body_frac / prom_frac)`.
- ///
- /// Genes are skipped if either fraction is 0, total coverage is below
- /// `min_sum_cov`, or the contig is absent from the Tabix index.
- ///
- /// Results are sorted by `(chrom, start)` for IGV compatibility.
- ///
- /// # Arguments
- ///
- /// * `pileup_bgz` - Path to the modkit pileup `.bed.gz` (Tabix-indexed)
- /// * `regions` - BED4 regions with `_prom` / `_body` name suffixes
- /// * `min_sum_cov` - Minimum total `Nvalid_cov` required per region
- ///
- /// # Errors
- ///
- /// Returns an error if any region has `end <= start` or a Tabix fetch fails.
- pub fn compute_gene_activity(
- pileup_bgz: &str,
- regions: &[BedRegion],
- min_sum_cov: u64,
- ) -> Result<Vec<GeneActivity>> {
- #[derive(Clone)]
- struct Part {
- chrom: String,
- start: u64,
- end: u64,
- frac: f64,
- }
- #[derive(Default)]
- struct Pair {
- prom: Option<Part>,
- body: Option<Part>,
- }
- let mut by_gene: HashMap<String, Pair> = HashMap::new();
- for r in regions {
- if r.end <= r.start {
- return Err(anyhow!(
- "Invalid region end<=start {}:{}-{} ({})",
- r.chrom,
- r.start,
- r.end,
- r.name
- ));
- }
- let ParsedName { gene_key, kind } = parse_region_name(&r.name);
- if kind == RegionKind::Other {
- continue;
- }
- let region = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32);
- let (frac, sum_cov, _) = match region_fraction_from_tabix(pileup_bgz, ®ion)? {
- Some(x) => x,
- None => continue,
- };
- if sum_cov < min_sum_cov || frac <= 0.0 {
- continue;
- }
- let part = Part {
- chrom: r.chrom.clone(),
- start: r.start,
- end: r.end,
- frac,
- };
- let entry = by_gene.entry(gene_key).or_default();
- match kind {
- RegionKind::Promoter => entry.prom = Some(part),
- RegionKind::GeneBody => entry.body = Some(part),
- RegionKind::Other => {}
- }
- }
- let mut out: Vec<GeneActivity> = by_gene
- .into_iter()
- .filter_map(|(gene, pair)| {
- let (prom, body) = (pair.prom?, pair.body?);
- Some(GeneActivity {
- gene_key: gene,
- chrom: prom.chrom,
- start: prom.start,
- end: prom.end,
- prom_frac: prom.frac,
- body_frac: body.frac,
- score: (body.frac / prom.frac).log2(),
- })
- })
- .collect();
- out.sort_by(|a, b| match a.chrom.cmp(&b.chrom) {
- Ordering::Equal => a.start.cmp(&b.start),
- o => o,
- });
- Ok(out)
- }
- // ─── Colour / score helpers ───────────────────────────────────────────────────
- /// Map a log2-ratio to a BED `score` field (0–1000).
- ///
- /// `v` is clipped to `[-clip, +clip]` then linearly scaled:
- /// `-clip` → 0, `0` → 500, `+clip` → 1000.
- pub fn bed_score_from_log2_ratio(v: f64, clip: f64) -> u16 {
- debug_assert!(clip > 0.0, "clip must be > 0");
- let t = (v.clamp(-clip, clip) + clip) / (2.0 * clip);
- (t * 1000.0).round().clamp(0.0, 1000.0) as u16
- }
- // ─── Output ───────────────────────────────────────────────────────────────────
- /// Write gene activity as an IGV BED9 track with `itemRgb=On`.
- ///
- /// Colour encodes `GeneActivity::score` via [`diverging_rgb`] clipped to
- /// `[-clip, +clip]`. BED col 5 (`score`) holds the same quantity scaled to
- /// `[0, 1000]` via [`bed_score_from_log2_ratio`].
- ///
- /// Written atomically: data goes to a UUID temp file, renamed on success.
- /// A [`TempFileGuard`] removes the temp file automatically on failure.
- ///
- /// # Errors
- ///
- /// Returns an error if `clip <= 0`, the file cannot be written, or the rename fails.
- pub fn write_gene_activity_bed9(
- path: &str,
- track_name: &str,
- activity: &[GeneActivity],
- clip: f64,
- ) -> Result<()> {
- if clip <= 0.0 {
- return Err(anyhow!("clip must be > 0, got {clip}"));
- }
- let tmp_dir = Path::new(path).parent().unwrap_or(Path::new("."));
- let mut guard = TempFileGuard::new();
- let tmp_path = guard.tmp_path("", tmp_dir);
- let mut w = BufWriter::new(
- File::create(&tmp_path)
- .with_context(|| format!("failed to create temp file: {}", tmp_path.display()))?,
- );
- writeln!(w, r#"track name="{}" itemRgb="On""#, track_name)?;
- for a in activity {
- let (rr, gg, bb) = diverging_rgb(a.score, clip);
- writeln!(
- w,
- "{}\t{}\t{}\t{}={:.4}\t{}\t.\t{}\t{}\t{},{},{}",
- a.chrom,
- a.start,
- a.end,
- a.gene_key,
- a.score,
- bed_score_from_log2_ratio(a.score, clip),
- a.start,
- a.end,
- rr,
- gg,
- bb,
- )?;
- }
- w.flush()
- .with_context(|| format!("failed to flush: {}", tmp_path.display()))?;
- drop(w);
- fs::rename(&tmp_path, path)
- .with_context(|| format!("failed to rename {} → {path}", tmp_path.display()))?;
- guard.disarm();
- Ok(())
- }
- // ─── Pipeline ─────────────────────────────────────────────────────────────────
- /// End-to-end pipeline: Tabix pileup + BED4 regions → IGV BED9 activity track.
- ///
- /// 1. [`read_bed4`] — parse the gene promoter/body regions
- /// 2. [`compute_gene_activity`] — fetch per-region methylation fractions via
- /// Tabix and compute log2 activity scores
- /// 3. [`write_gene_activity_bed9`] — write the BED9 track atomically
- ///
- /// # Returns
- ///
- /// The number of gene features written.
- ///
- /// # Errors
- ///
- /// Returns an error if any pipeline step fails.
- pub fn pileup_to_activity_bed9(
- pileup_bgz: &str,
- regions_bed4_path: &str,
- min_sum_cov: u64,
- out_bed9_path: &str,
- track_name: &str,
- clip: f64,
- ) -> Result<usize> {
- let regions = read_bed4(regions_bed4_path)?;
- let activity = compute_gene_activity(pileup_bgz, ®ions, min_sum_cov)?;
- let n = activity.len();
- write_gene_activity_bed9(out_bed9_path, track_name, &activity, clip)?;
- Ok(n)
- }
- #[cfg(test)]
- mod tests {
- use super::*;
- use crate::{config::Config, helpers::test_init};
- use log::info;
- #[test]
- fn modkit_activity_igv() -> anyhow::Result<()> {
- test_init();
- let config = Config::default();
- let regions_bed4_path =
- "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed";
- for id in ["DUMCO", "CHAHA"] {
- let path = format!(
- "{}/{}_{}_modkit_pileup.bed.gz",
- config.tumoral_dir(id),
- id,
- config.tumoral_name
- );
- let out = format!(
- "{}/{}_{}_modkit_activity.bed",
- config.tumoral_dir(id),
- id,
- config.tumoral_name
- );
- let n = pileup_to_activity_bed9(
- &path,
- regions_bed4_path,
- 100,
- &out,
- &format!("{id} Gene Activity"),
- 2.0,
- )?;
- info!("{id}: {n} gene activities written");
- }
- Ok(())
- }
- }
|