modkit.rs 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. //! Modkit pileup parsing and gene-level epigenetic activity computation.
  2. //!
  3. //! Reads modkit bedMethyl pileup files (BGZF + Tabix indexed) and BED4 region
  4. //! files to compute a per-gene **epigenetic activity proxy** based on the
  5. //! contrast between promoter and gene-body methylation.
  6. //!
  7. //! Region methylation is fetched **per region** via the Tabix index using
  8. //! [`fetch_tabix_lines_with`], avoiding loading the full genome-wide pileup
  9. //! (hundreds of millions of CpG sites) into memory.
  10. //!
  11. //! # Pipeline
  12. //!
  13. //! ```text
  14. //! pileup.bed.gz + .tbi ──► fetch per region (fetch_tabix_lines_with)
  15. //! regions.bed ──► read_bed4
  16. //! │
  17. //! compute_gene_activity
  18. //! │
  19. //! write_gene_activity_bed9_itemrgb
  20. //! ```
  21. //!
  22. //! # Coordinate system
  23. //!
  24. //! All coordinates are **0-based half-open `[start, end)`** throughout,
  25. //! matching the BED convention. `PileupSite.start` is a single-base position.
  26. use anyhow::{anyhow, Context, Result};
  27. use std::{
  28. cmp::Ordering,
  29. collections::HashMap,
  30. fs,
  31. fs::File,
  32. io::{BufWriter, Write},
  33. path::Path,
  34. };
  35. use crate::{
  36. helpers::{diverging_rgb, TempFileGuard},
  37. io::{bed::read_bed, readers::fetch_tabix_lines_with},
  38. positions::GenomeRange,
  39. };
  40. // ─── Types ───────────────────────────────────────────────────────────────────
  41. /// One methylation call row from a modkit pileup (bedMethyl) file.
  42. ///
  43. /// Pileup rows are single-base intervals `[start, start+1)`.
  44. /// Only the columns needed for region aggregation are stored:
  45. /// col 2 (`start`), col 10 (`nvalid_cov`), col 12 (`nmod`).
  46. #[derive(Debug, Clone)]
  47. pub struct PileupSite {
  48. /// 0-based position (half-open interval `[start, start+1)`)
  49. pub start: u64,
  50. /// Number of valid modification calls at this site (col 10)
  51. pub nvalid_cov: u64,
  52. /// Number of modified calls at this site (col 12)
  53. pub nmod: u64,
  54. }
  55. /// One region from a BED4 file: chrom, start, end, name.
  56. ///
  57. /// Coordinates are **0-based half-open** `[start, end)`.
  58. #[derive(Debug, Clone)]
  59. pub struct BedRegion {
  60. pub chrom: String,
  61. /// 0-based inclusive start
  62. pub start: u64,
  63. /// 0-based exclusive end
  64. pub end: u64,
  65. pub name: String,
  66. }
  67. /// Promoter/body classification inferred from a region name suffix.
  68. #[derive(Debug, Clone, Copy, PartialEq, Eq)]
  69. pub enum RegionKind {
  70. Promoter,
  71. GeneBody,
  72. Other,
  73. }
  74. /// A region name split into gene key and [`RegionKind`].
  75. #[derive(Debug, Clone)]
  76. pub struct ParsedName {
  77. pub gene_key: String,
  78. pub kind: RegionKind,
  79. }
  80. /// Gene-level epigenetic activity proxy computed from a single sample's pileup.
  81. ///
  82. /// Score: `log2(body_frac / prom_frac)` — positive means more active-like,
  83. /// negative means more repressed-like. Output coordinates use the **promoter**
  84. /// interval (anchors the signal near the TSS in genome browsers).
  85. #[derive(Debug, Clone)]
  86. pub struct GeneActivity {
  87. pub gene_key: String,
  88. pub chrom: String,
  89. /// Promoter start, 0-based inclusive
  90. pub start: u64,
  91. /// Promoter end, 0-based exclusive
  92. pub end: u64,
  93. pub prom_frac: f64,
  94. pub body_frac: f64,
  95. /// `log2(body_frac / prom_frac)`
  96. pub score: f64,
  97. }
  98. // ─── Parsing helpers ─────────────────────────────────────────────────────────
  99. /// Parse a region name for `_prom` or `_body` suffix.
  100. ///
  101. /// - `"FOO_prom"` → gene key `"FOO"`, kind [`RegionKind::Promoter`]
  102. /// - `"FOO_body"` → gene key `"FOO"`, kind [`RegionKind::GeneBody`]
  103. /// - anything else → gene key = full name, kind [`RegionKind::Other`]
  104. pub fn parse_region_name(name: &str) -> ParsedName {
  105. if let Some(g) = name.strip_suffix("_prom") {
  106. ParsedName {
  107. gene_key: g.to_string(),
  108. kind: RegionKind::Promoter,
  109. }
  110. } else if let Some(g) = name.strip_suffix("_body") {
  111. ParsedName {
  112. gene_key: g.to_string(),
  113. kind: RegionKind::GeneBody,
  114. }
  115. } else {
  116. ParsedName {
  117. gene_key: name.to_string(),
  118. kind: RegionKind::Other,
  119. }
  120. }
  121. }
  122. /// Read a BED4 regions file into memory.
  123. ///
  124. /// Wraps [`read_bed`], inheriting its header skipping (`#`, `track`, `browser`)
  125. /// and gzip support. Coordinates are widened from `u32` to `u64`;
  126. /// contig names are recovered via `GenomeRange::contig()`.
  127. ///
  128. /// # Errors
  129. ///
  130. /// Returns an error if the file cannot be opened or any line is malformed.
  131. pub fn read_bed4(path: &str) -> Result<Vec<BedRegion>> {
  132. Ok(read_bed(path)?
  133. .into_iter()
  134. .map(|row| BedRegion {
  135. chrom: row.range.contig(),
  136. start: row.range.range.start as u64,
  137. end: row.range.range.end as u64,
  138. name: row.name.unwrap_or_else(|| ".".to_string()),
  139. })
  140. .collect())
  141. }
  142. // ─── Region methylation ───────────────────────────────────────────────────────
  143. /// Aggregate methylation fraction over `[start, end)` from a **pre-loaded**
  144. /// sorted pileup slice.
  145. ///
  146. /// Returns `(fraction, sum_nvalid_cov, sum_nmod)` where
  147. /// `fraction = Σ Nmod / Σ Nvalid_cov`. Returns `None` if no sites overlap or
  148. /// total coverage is zero.
  149. ///
  150. /// Uses [`slice::partition_point`] for O(log n) start lookup.
  151. /// Prefer [`region_fraction_from_tabix`] for production use — this variant is
  152. /// useful when the pileup is already loaded in memory (e.g. unit tests).
  153. pub fn region_fraction_modified(
  154. chrom_sites: &[PileupSite],
  155. start: u64,
  156. end: u64,
  157. ) -> Option<(f64, u64, u64)> {
  158. let i = chrom_sites.partition_point(|s| s.start < start);
  159. let mut sum_cov = 0u64;
  160. let mut sum_mod = 0u64;
  161. for s in &chrom_sites[i..] {
  162. if s.start >= end {
  163. break;
  164. }
  165. sum_cov = sum_cov.saturating_add(s.nvalid_cov);
  166. sum_mod = sum_mod.saturating_add(s.nmod);
  167. }
  168. if sum_cov == 0 {
  169. None
  170. } else {
  171. Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))
  172. }
  173. }
  174. /// Fetch methylation fraction for `region` directly from a Tabix-indexed pileup.
  175. ///
  176. /// Uses [`fetch_tabix_lines_with`] to seek only to the relevant BGZF chunks —
  177. /// no full-file load. Parses col 10 (`Nvalid_cov`) and col 12 (`Nmod`) from
  178. /// each overlapping pileup line.
  179. ///
  180. /// Returns `(fraction, sum_nvalid_cov, sum_nmod)`, or `None` if no sites
  181. /// overlap the region or total coverage is zero.
  182. ///
  183. /// # Arguments
  184. ///
  185. /// * `pileup_bgz` - Path to the modkit pileup `.bed.gz`; index expected at
  186. /// `<pileup_bgz>.tbi`
  187. /// * `region` - Genomic region to query (0-based half-open)
  188. ///
  189. /// # Errors
  190. ///
  191. /// Returns an error if the Tabix index cannot be read or a pileup line is
  192. /// malformed.
  193. pub fn region_fraction_from_tabix(
  194. pileup_bgz: &str,
  195. region: &GenomeRange,
  196. ) -> Result<Option<(f64, u64, u64)>> {
  197. let mut sum_cov = 0u64;
  198. let mut sum_mod = 0u64;
  199. fetch_tabix_lines_with(pileup_bgz, region, |line| {
  200. let f: Vec<&str> = line.split('\t').collect();
  201. let nvalid_cov: u64 = f
  202. .get(9)
  203. .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10) in {pileup_bgz}: {line}"))?
  204. .parse()
  205. .with_context(|| format!("Bad Nvalid_cov in {pileup_bgz}: {line}"))?;
  206. let nmod: u64 = f
  207. .get(11)
  208. .ok_or_else(|| anyhow!("Missing Nmod (col 12) in {pileup_bgz}: {line}"))?
  209. .parse()
  210. .with_context(|| format!("Bad Nmod in {pileup_bgz}: {line}"))?;
  211. sum_cov = sum_cov.saturating_add(nvalid_cov);
  212. sum_mod = sum_mod.saturating_add(nmod);
  213. Ok(())
  214. })?;
  215. if sum_cov == 0 {
  216. Ok(None)
  217. } else {
  218. Ok(Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod)))
  219. }
  220. }
  221. // ─── Activity computation ─────────────────────────────────────────────────────
  222. /// Compute per-gene epigenetic activity from a Tabix-indexed modkit pileup.
  223. ///
  224. /// For each gene with both a `_prom` and a `_body` region, fetches methylation
  225. /// fractions via [`region_fraction_from_tabix`] (one tabix seek per region),
  226. /// then computes `score = log2(body_frac / prom_frac)`.
  227. ///
  228. /// Genes are skipped if either fraction is 0, total coverage is below
  229. /// `min_sum_cov`, or the contig is absent from the Tabix index.
  230. ///
  231. /// Results are sorted by `(chrom, start)` for IGV compatibility.
  232. ///
  233. /// # Arguments
  234. ///
  235. /// * `pileup_bgz` - Path to the modkit pileup `.bed.gz` (Tabix-indexed)
  236. /// * `regions` - BED4 regions with `_prom` / `_body` name suffixes
  237. /// * `min_sum_cov` - Minimum total `Nvalid_cov` required per region
  238. ///
  239. /// # Errors
  240. ///
  241. /// Returns an error if any region has `end <= start` or a Tabix fetch fails.
  242. pub fn compute_gene_activity(
  243. pileup_bgz: &str,
  244. regions: &[BedRegion],
  245. min_sum_cov: u64,
  246. ) -> Result<Vec<GeneActivity>> {
  247. #[derive(Clone)]
  248. struct Part {
  249. chrom: String,
  250. start: u64,
  251. end: u64,
  252. frac: f64,
  253. }
  254. #[derive(Default)]
  255. struct Pair {
  256. prom: Option<Part>,
  257. body: Option<Part>,
  258. }
  259. let mut by_gene: HashMap<String, Pair> = HashMap::new();
  260. for r in regions {
  261. if r.end <= r.start {
  262. return Err(anyhow!(
  263. "Invalid region end<=start {}:{}-{} ({})",
  264. r.chrom,
  265. r.start,
  266. r.end,
  267. r.name
  268. ));
  269. }
  270. let ParsedName { gene_key, kind } = parse_region_name(&r.name);
  271. if kind == RegionKind::Other {
  272. continue;
  273. }
  274. let region = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32);
  275. let (frac, sum_cov, _) = match region_fraction_from_tabix(pileup_bgz, &region)? {
  276. Some(x) => x,
  277. None => continue,
  278. };
  279. if sum_cov < min_sum_cov || frac <= 0.0 {
  280. continue;
  281. }
  282. let part = Part {
  283. chrom: r.chrom.clone(),
  284. start: r.start,
  285. end: r.end,
  286. frac,
  287. };
  288. let entry = by_gene.entry(gene_key).or_default();
  289. match kind {
  290. RegionKind::Promoter => entry.prom = Some(part),
  291. RegionKind::GeneBody => entry.body = Some(part),
  292. RegionKind::Other => {}
  293. }
  294. }
  295. let mut out: Vec<GeneActivity> = by_gene
  296. .into_iter()
  297. .filter_map(|(gene, pair)| {
  298. let (prom, body) = (pair.prom?, pair.body?);
  299. Some(GeneActivity {
  300. gene_key: gene,
  301. chrom: prom.chrom,
  302. start: prom.start,
  303. end: prom.end,
  304. prom_frac: prom.frac,
  305. body_frac: body.frac,
  306. score: (body.frac / prom.frac).log2(),
  307. })
  308. })
  309. .collect();
  310. out.sort_by(|a, b| match a.chrom.cmp(&b.chrom) {
  311. Ordering::Equal => a.start.cmp(&b.start),
  312. o => o,
  313. });
  314. Ok(out)
  315. }
  316. // ─── Colour / score helpers ───────────────────────────────────────────────────
  317. /// Map a log2-ratio to a BED `score` field (0–1000).
  318. ///
  319. /// `v` is clipped to `[-clip, +clip]` then linearly scaled:
  320. /// `-clip` → 0, `0` → 500, `+clip` → 1000.
  321. pub fn bed_score_from_log2_ratio(v: f64, clip: f64) -> u16 {
  322. debug_assert!(clip > 0.0, "clip must be > 0");
  323. let t = (v.clamp(-clip, clip) + clip) / (2.0 * clip);
  324. (t * 1000.0).round().clamp(0.0, 1000.0) as u16
  325. }
  326. // ─── Output ───────────────────────────────────────────────────────────────────
  327. /// Write gene activity as an IGV BED9 track with `itemRgb=On`.
  328. ///
  329. /// Colour encodes `GeneActivity::score` via [`diverging_rgb`] clipped to
  330. /// `[-clip, +clip]`. BED col 5 (`score`) holds the same quantity scaled to
  331. /// `[0, 1000]` via [`bed_score_from_log2_ratio`].
  332. ///
  333. /// Written atomically: data goes to a UUID temp file, renamed on success.
  334. /// A [`TempFileGuard`] removes the temp file automatically on failure.
  335. ///
  336. /// # Errors
  337. ///
  338. /// Returns an error if `clip <= 0`, the file cannot be written, or the rename fails.
  339. pub fn write_gene_activity_bed9(
  340. path: &str,
  341. track_name: &str,
  342. activity: &[GeneActivity],
  343. clip: f64,
  344. ) -> Result<()> {
  345. if clip <= 0.0 {
  346. return Err(anyhow!("clip must be > 0, got {clip}"));
  347. }
  348. let tmp_dir = Path::new(path).parent().unwrap_or(Path::new("."));
  349. let mut guard = TempFileGuard::new();
  350. let tmp_path = guard.tmp_path("", tmp_dir);
  351. let mut w = BufWriter::new(
  352. File::create(&tmp_path)
  353. .with_context(|| format!("failed to create temp file: {}", tmp_path.display()))?,
  354. );
  355. writeln!(w, r#"track name="{}" itemRgb="On""#, track_name)?;
  356. for a in activity {
  357. let (rr, gg, bb) = diverging_rgb(a.score, clip);
  358. writeln!(
  359. w,
  360. "{}\t{}\t{}\t{}={:.4}\t{}\t.\t{}\t{}\t{},{},{}",
  361. a.chrom,
  362. a.start,
  363. a.end,
  364. a.gene_key,
  365. a.score,
  366. bed_score_from_log2_ratio(a.score, clip),
  367. a.start,
  368. a.end,
  369. rr,
  370. gg,
  371. bb,
  372. )?;
  373. }
  374. w.flush()
  375. .with_context(|| format!("failed to flush: {}", tmp_path.display()))?;
  376. drop(w);
  377. fs::rename(&tmp_path, path)
  378. .with_context(|| format!("failed to rename {} → {path}", tmp_path.display()))?;
  379. guard.disarm();
  380. Ok(())
  381. }
  382. // ─── Pipeline ─────────────────────────────────────────────────────────────────
  383. /// End-to-end pipeline: Tabix pileup + BED4 regions → IGV BED9 activity track.
  384. ///
  385. /// 1. [`read_bed4`] — parse the gene promoter/body regions
  386. /// 2. [`compute_gene_activity`] — fetch per-region methylation fractions via
  387. /// Tabix and compute log2 activity scores
  388. /// 3. [`write_gene_activity_bed9`] — write the BED9 track atomically
  389. ///
  390. /// # Returns
  391. ///
  392. /// The number of gene features written.
  393. ///
  394. /// # Errors
  395. ///
  396. /// Returns an error if any pipeline step fails.
  397. pub fn pileup_to_activity_bed9(
  398. pileup_bgz: &str,
  399. regions_bed4_path: &str,
  400. min_sum_cov: u64,
  401. out_bed9_path: &str,
  402. track_name: &str,
  403. clip: f64,
  404. ) -> Result<usize> {
  405. let regions = read_bed4(regions_bed4_path)?;
  406. let activity = compute_gene_activity(pileup_bgz, &regions, min_sum_cov)?;
  407. let n = activity.len();
  408. write_gene_activity_bed9(out_bed9_path, track_name, &activity, clip)?;
  409. Ok(n)
  410. }
  411. #[cfg(test)]
  412. mod tests {
  413. use super::*;
  414. use crate::{config::Config, helpers::test_init};
  415. use log::info;
  416. #[test]
  417. fn modkit_activity_igv() -> anyhow::Result<()> {
  418. test_init();
  419. let config = Config::default();
  420. let regions_bed4_path =
  421. "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed";
  422. for id in ["DUMCO", "CHAHA"] {
  423. let path = format!(
  424. "{}/{}_{}_modkit_pileup.bed.gz",
  425. config.tumoral_dir(id),
  426. id,
  427. config.tumoral_name
  428. );
  429. let out = format!(
  430. "{}/{}_{}_modkit_activity.bed",
  431. config.tumoral_dir(id),
  432. id,
  433. config.tumoral_name
  434. );
  435. let n = pileup_to_activity_bed9(
  436. &path,
  437. regions_bed4_path,
  438. 100,
  439. &out,
  440. &format!("{id} Gene Activity"),
  441. 2.0,
  442. )?;
  443. info!("{id}: {n} gene activities written");
  444. }
  445. Ok(())
  446. }
  447. }