modkit.rs 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474
  1. //! Modkit pileup parsing and gene-level epigenetic activity computation.
  2. //!
  3. //! Reads modkit bedMethyl pileup files (BGZF-compressed) and BED4 region
  4. //! files to compute a per-gene **epigenetic activity proxy** based on the
  5. //! contrast between promoter and gene-body methylation.
  6. //!
  7. //! # Pipeline
  8. //!
  9. //! ```text
  10. //! pileup.bed.gz ──► read_pileup_index ──► HashMap<chrom, [PileupSite]>
  11. //! │
  12. //! regions.bed ──► read_bed4 ──► [BedRegion] ─┤
  13. //! │
  14. //! compute_gene_activity_from_pileup
  15. //! │
  16. //! write_gene_activity_bed9_itemrgb ◄─┘
  17. //! ```
  18. //!
  19. //! # Coordinate system
  20. //!
  21. //! All coordinates are **0-based half-open `[start, end)`** throughout, matching
  22. //! the BED convention. `PileupSite.start` is a single-base position (the interval
  23. //! is `[start, start+1)`).
  24. use anyhow::{anyhow, Context, Result};
  25. use std::{
  26. cmp::Ordering,
  27. collections::HashMap,
  28. fs,
  29. fs::File,
  30. io::{BufRead, BufWriter, Write},
  31. path::Path,
  32. };
  33. use crate::{helpers::TempFileGuard, io::{bed::read_bed, readers::get_gz_reader}};
  34. /// One methylation call row from a modkit pileup (bedMethyl) file.
  35. ///
  36. /// Pileup rows are single-base intervals `[start, start+1)`.
  37. /// Only the columns needed for region aggregation are kept:
  38. /// - `start`: col 2, 0-based
  39. /// - `nvalid_cov`: col 10 (Nvalid_cov)
  40. /// - `nmod`: col 12 (Nmod)
  41. #[derive(Debug, Clone)]
  42. pub struct PileupSite {
  43. /// 0-based position (half-open interval `[start, start+1)`)
  44. pub start: u64,
  45. /// Number of valid modification calls at this site (col 10)
  46. pub nvalid_cov: u64,
  47. /// Number of modified calls at this site (col 12)
  48. pub nmod: u64,
  49. }
  50. /// One region from a BED4 file: chrom, start, end, name.
  51. ///
  52. /// Coordinates are **0-based half-open** `[start, end)`.
  53. #[derive(Debug, Clone)]
  54. pub struct BedRegion {
  55. pub chrom: String,
  56. /// 0-based inclusive start
  57. pub start: u64,
  58. /// 0-based exclusive end
  59. pub end: u64,
  60. pub name: String,
  61. }
  62. /// Promoter/body classification inferred from a region name suffix.
  63. #[derive(Debug, Clone, Copy, PartialEq, Eq)]
  64. pub enum RegionKind {
  65. Promoter,
  66. GeneBody,
  67. Other,
  68. }
  69. /// A region name split into gene key and [`RegionKind`].
  70. #[derive(Debug, Clone)]
  71. pub struct ParsedName {
  72. pub gene_key: String,
  73. pub kind: RegionKind,
  74. }
  75. /// Parse a region name for `_prom` or `_body` suffix.
  76. ///
  77. /// - `"FOO_prom"` → gene key `"FOO"`, kind [`RegionKind::Promoter`]
  78. /// - `"FOO_body"` → gene key `"FOO"`, kind [`RegionKind::GeneBody`]
  79. /// - anything else → gene key = full name, kind [`RegionKind::Other`]
  80. pub fn parse_region_name(name: &str) -> ParsedName {
  81. if let Some(g) = name.strip_suffix("_prom") {
  82. ParsedName { gene_key: g.to_string(), kind: RegionKind::Promoter }
  83. } else if let Some(g) = name.strip_suffix("_body") {
  84. ParsedName { gene_key: g.to_string(), kind: RegionKind::GeneBody }
  85. } else {
  86. ParsedName { gene_key: name.to_string(), kind: RegionKind::Other }
  87. }
  88. }
  89. /// Diverging blue→white→red palette for IGV `itemRgb`.
  90. ///
  91. /// Maps `v` clipped to `[-clip, +clip]` onto the range:
  92. /// - `v = -clip` → `(0, 0, 255)` pure blue
  93. /// - `v = 0` → `(255, 255, 255)` white
  94. /// - `v = +clip` → `(255, 0, 0)` pure red
  95. pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
  96. debug_assert!(clip > 0.0, "clip must be > 0");
  97. let x = v.clamp(-clip, clip);
  98. let t = (x + clip) / (2.0 * clip);
  99. if t < 0.5 {
  100. let u = t / 0.5;
  101. let r = (255.0 * u).round() as u8;
  102. let g = (255.0 * u).round() as u8;
  103. (r, g, 255)
  104. } else {
  105. let u = (t - 0.5) / 0.5;
  106. let g = (255.0 * (1.0 - u)).round() as u8;
  107. let b = (255.0 * (1.0 - u)).round() as u8;
  108. (255, g, b)
  109. }
  110. }
  111. /// Read a BED4 regions file into memory.
  112. ///
  113. /// Wraps [`read_bed`] from the `bed` module, inheriting its header skipping
  114. /// (`#`, `track`, `browser`) and gzip support. Coordinates are widened from
  115. /// `u32` to `u64`; contig names are recovered via `num_to_contig`.
  116. ///
  117. /// # Arguments
  118. ///
  119. /// * `path` - Path to the BED4 file (plain or BGZF-compressed)
  120. ///
  121. /// # Errors
  122. ///
  123. /// Returns an error if the file cannot be opened or any line is malformed.
  124. pub fn read_bed4(path: &str) -> Result<Vec<BedRegion>> {
  125. Ok(read_bed(path)?
  126. .into_iter()
  127. .map(|row| BedRegion {
  128. chrom: row.range.contig(),
  129. start: row.range.range.start as u64,
  130. end: row.range.range.end as u64,
  131. name: row.name.unwrap_or_else(|| ".".to_string()),
  132. })
  133. .collect())
  134. }
  135. /// Read a modkit pileup file and index sites per chromosome.
  136. ///
  137. /// Parses columns 1 (chrom), 2 (start), 10 (Nvalid_cov), 12 (Nmod).
  138. /// Sites with `Nvalid_cov = 0` are dropped. The per-chromosome vectors are
  139. /// sorted by position to enable binary search in [`region_fraction_modified`].
  140. ///
  141. /// # Arguments
  142. ///
  143. /// * `path` - Path to the modkit bedMethyl pileup file (BGZF-compressed)
  144. ///
  145. /// # Errors
  146. ///
  147. /// Returns an error if the file cannot be opened or any data line is malformed.
  148. pub fn read_pileup_index(path: &str) -> Result<HashMap<String, Vec<PileupSite>>> {
  149. let mut map: HashMap<String, Vec<PileupSite>> = HashMap::new();
  150. for (i, line) in get_gz_reader(path)?.lines().enumerate() {
  151. let line = line.with_context(|| format!("I/O error at line {}", i + 1))?;
  152. if line.starts_with('#') || line.is_empty() {
  153. continue;
  154. }
  155. let f: Vec<&str> = line.split('\t').collect();
  156. let chrom = f.get(0)
  157. .ok_or_else(|| anyhow!("Missing chrom at line {}", i + 1))?.to_string();
  158. let start: u64 = f.get(1)
  159. .ok_or_else(|| anyhow!("Missing start at line {}", i + 1))?
  160. .parse().with_context(|| format!("Bad start at line {}", i + 1))?;
  161. let nvalid_cov: u64 = f.get(9)
  162. .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10) at line {}", i + 1))?
  163. .parse().with_context(|| format!("Bad Nvalid_cov at line {}", i + 1))?;
  164. let nmod: u64 = f.get(11)
  165. .ok_or_else(|| anyhow!("Missing Nmod (col 12) at line {}", i + 1))?
  166. .parse().with_context(|| format!("Bad Nmod at line {}", i + 1))?;
  167. if nvalid_cov == 0 {
  168. continue;
  169. }
  170. map.entry(chrom).or_default().push(PileupSite { start, nvalid_cov, nmod });
  171. }
  172. for v in map.values_mut() {
  173. v.sort_by_key(|s| s.start);
  174. }
  175. Ok(map)
  176. }
  177. /// Aggregate methylation fraction over `[start, end)` from a sorted pileup slice.
  178. ///
  179. /// Returns `(fraction, sum_nvalid_cov, sum_nmod)` where
  180. /// `fraction = sum(Nmod) / sum(Nvalid_cov)`. Returns `None` if no sites overlap
  181. /// or if total coverage is zero.
  182. ///
  183. /// Uses [`slice::partition_point`] for O(log n) start lookup.
  184. pub fn region_fraction_modified(
  185. chrom_sites: &[PileupSite],
  186. start: u64,
  187. end: u64,
  188. ) -> Option<(f64, u64, u64)> {
  189. let i = chrom_sites.partition_point(|s| s.start < start);
  190. let mut sum_cov: u64 = 0;
  191. let mut sum_mod: u64 = 0;
  192. for s in &chrom_sites[i..] {
  193. if s.start >= end {
  194. break;
  195. }
  196. sum_cov = sum_cov.saturating_add(s.nvalid_cov);
  197. sum_mod = sum_mod.saturating_add(s.nmod);
  198. }
  199. if sum_cov == 0 {
  200. None
  201. } else {
  202. Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))
  203. }
  204. }
  205. /// Gene-level epigenetic activity proxy computed from a single sample's pileup.
  206. ///
  207. /// Score: `log2(body_frac / prom_frac)` where both fractions are
  208. /// coverage-weighted methylation ratios over the respective regions.
  209. ///
  210. /// - Score > 0: gene-body methylation exceeds promoter → "active-like"
  211. /// - Score < 0: promoter methylation exceeds gene-body → "repressed-like"
  212. ///
  213. /// Output coordinates use the **promoter** interval (anchors to TSS in genome browsers).
  214. #[derive(Debug, Clone)]
  215. pub struct GeneActivity {
  216. pub gene_key: String,
  217. pub chrom: String,
  218. /// Promoter start, 0-based
  219. pub start: u64,
  220. /// Promoter end, 0-based exclusive
  221. pub end: u64,
  222. pub prom_frac: f64,
  223. pub body_frac: f64,
  224. /// `log2(body_frac / prom_frac)`
  225. pub score: f64,
  226. }
  227. /// Compute per-gene epigenetic activity from a modkit pileup and BED4 regions.
  228. ///
  229. /// For each gene with both a `_prom` and a `_body` region:
  230. /// 1. Aggregate methylation fraction `f(R) = Σ Nmod / Σ Nvalid_cov` per region
  231. /// 2. Skip genes where either fraction is 0 or coverage is below `min_sum_cov`
  232. /// 3. Compute `score = log2(body_frac / prom_frac)`
  233. ///
  234. /// Results are sorted by `(chrom, start)` for IGV compatibility.
  235. ///
  236. /// # Arguments
  237. ///
  238. /// * `pileup` - Per-chromosome pileup sites from [`read_pileup_index`]
  239. /// * `regions` - BED4 regions with `_prom` / `_body` name suffixes
  240. /// * `min_sum_cov` - Minimum total `Nvalid_cov` required per region
  241. ///
  242. /// # Errors
  243. ///
  244. /// Returns an error if any region has `end <= start`.
  245. pub fn compute_gene_activity_from_pileup(
  246. pileup: &HashMap<String, Vec<PileupSite>>,
  247. regions: &[BedRegion],
  248. min_sum_cov: u64,
  249. ) -> Result<Vec<GeneActivity>> {
  250. #[derive(Clone)]
  251. struct Part { chrom: String, start: u64, end: u64, frac: f64, sum_cov: u64 }
  252. #[derive(Default)]
  253. struct Pair { prom: Option<Part>, body: Option<Part> }
  254. let mut by_gene: HashMap<String, Pair> = HashMap::new();
  255. for r in regions {
  256. if r.end <= r.start {
  257. return Err(anyhow!(
  258. "Invalid region end<=start {}:{}-{} ({})",
  259. r.chrom, r.start, r.end, r.name
  260. ));
  261. }
  262. let ParsedName { gene_key, kind } = parse_region_name(&r.name);
  263. if kind == RegionKind::Other { continue; }
  264. let chrom_sites = match pileup.get(&r.chrom) { Some(v) => v, None => continue };
  265. let (frac, sum_cov, _) = match region_fraction_modified(chrom_sites, r.start, r.end) {
  266. Some(x) => x,
  267. None => continue,
  268. };
  269. if sum_cov < min_sum_cov || frac <= 0.0 { continue; }
  270. let part = Part { chrom: r.chrom.clone(), start: r.start, end: r.end, frac, sum_cov };
  271. let entry = by_gene.entry(gene_key).or_default();
  272. match kind {
  273. RegionKind::Promoter => entry.prom = Some(part),
  274. RegionKind::GeneBody => entry.body = Some(part),
  275. RegionKind::Other => {}
  276. }
  277. }
  278. let mut out = Vec::new();
  279. for (gene, pair) in by_gene {
  280. let (prom, body) = match (pair.prom, pair.body) {
  281. (Some(p), Some(b)) => (p, b),
  282. _ => continue,
  283. };
  284. out.push(GeneActivity {
  285. gene_key: gene,
  286. chrom: prom.chrom,
  287. start: prom.start,
  288. end: prom.end,
  289. prom_frac: prom.frac,
  290. body_frac: body.frac,
  291. score: (body.frac / prom.frac).log2(),
  292. });
  293. }
  294. out.sort_by(|a, b| match a.chrom.cmp(&b.chrom) {
  295. Ordering::Equal => a.start.cmp(&b.start),
  296. o => o,
  297. });
  298. Ok(out)
  299. }
  300. /// Convert a log2-ratio to a BED `score` field (0–1000).
  301. ///
  302. /// `v` is clipped to `[-clip, +clip]` then linearly mapped:
  303. /// `-clip` → 0, `0` → 500, `+clip` → 1000.
  304. pub fn bed_score_from_log2_ratio(v: f64, clip: f64) -> u16 {
  305. debug_assert!(clip > 0.0, "clip must be > 0");
  306. let t = (v.clamp(-clip, clip) + clip) / (2.0 * clip);
  307. (t * 1000.0).round().clamp(0.0, 1000.0) as u16
  308. }
  309. /// Write gene activity as an IGV BED9 track with `itemRgb=On`.
  310. ///
  311. /// Color encodes `GeneActivity::score` (`log2(body/prom)`) via
  312. /// [`diverging_rgb`] clipped to `[-clip, +clip]`. The BED `score` column
  313. /// (col 5) encodes the same quantity scaled to `[0, 1000]` via
  314. /// [`bed_score_from_log2_ratio`].
  315. ///
  316. /// Output is written atomically: data goes to a UUID temp file in the same
  317. /// directory as `path`, then renamed on success. A [`TempFileGuard`] ensures
  318. /// the temp file is removed automatically on failure.
  319. ///
  320. /// # Arguments
  321. ///
  322. /// * `path` - Destination BED9 file path
  323. /// * `track_name` - IGV track name shown in the track label
  324. /// * `activity` - Gene activity records (produced by [`compute_gene_activity_from_pileup`])
  325. /// * `clip` - Symmetric clip value for color and score mapping; must be > 0
  326. ///
  327. /// # Errors
  328. ///
  329. /// Returns an error if `clip <= 0`, the file cannot be written, or the atomic
  330. /// rename fails.
  331. pub fn write_gene_activity_bed9_itemrgb(
  332. path: &str,
  333. track_name: &str,
  334. activity: &[GeneActivity],
  335. clip: f64,
  336. ) -> Result<()> {
  337. if clip <= 0.0 {
  338. return Err(anyhow!("clip must be > 0, got {clip}"));
  339. }
  340. let tmp_dir = Path::new(path).parent().unwrap_or(Path::new("."));
  341. let mut guard = TempFileGuard::new();
  342. let tmp_path = guard.tmp_path("", tmp_dir);
  343. let out = File::create(&tmp_path)
  344. .with_context(|| format!("failed to create temp file: {}", tmp_path.display()))?;
  345. let mut w = BufWriter::new(out);
  346. writeln!(w, r#"track name="{}" itemRgb="On""#, track_name)?;
  347. for a in activity {
  348. let (rr, gg, bb) = diverging_rgb(a.score, clip);
  349. let bed_score = bed_score_from_log2_ratio(a.score, clip);
  350. writeln!(
  351. w,
  352. "{}\t{}\t{}\t{}={:.4}\t{}\t.\t{}\t{}\t{},{},{}",
  353. a.chrom, a.start, a.end,
  354. a.gene_key, a.score,
  355. bed_score,
  356. a.start, a.end,
  357. rr, gg, bb,
  358. )?;
  359. }
  360. w.flush().with_context(|| format!("failed to flush: {}", tmp_path.display()))?;
  361. drop(w);
  362. fs::rename(&tmp_path, path)
  363. .with_context(|| format!("failed to rename {} → {path}", tmp_path.display()))?;
  364. guard.disarm();
  365. Ok(())
  366. }
  367. /// End-to-end helper: pileup + BED4 regions → IGV BED9 activity track.
  368. ///
  369. /// Runs the full pipeline in order:
  370. /// 1. [`read_pileup_index`] — parse and index the modkit pileup
  371. /// 2. [`read_bed4`] — parse the gene promoter/body regions
  372. /// 3. [`compute_gene_activity_from_pileup`] — compute log2 activity scores
  373. /// 4. [`write_gene_activity_bed9_itemrgb`] — write the BED9 track atomically
  374. ///
  375. /// # Returns
  376. ///
  377. /// The number of gene features written.
  378. ///
  379. /// # Errors
  380. ///
  381. /// Returns an error if any pipeline step fails.
  382. pub fn pileup_regions_to_activity_bed9_itemrgb(
  383. pileup_path: &str,
  384. regions_bed4_path: &str,
  385. min_sum_cov: u64,
  386. out_bed9_path: &str,
  387. track_name: &str,
  388. clip: f64,
  389. ) -> Result<usize> {
  390. let pile = read_pileup_index(pileup_path)?;
  391. let regions = read_bed4(regions_bed4_path)?;
  392. let activity = compute_gene_activity_from_pileup(&pile, &regions, min_sum_cov)?;
  393. let n = activity.len();
  394. write_gene_activity_bed9_itemrgb(out_bed9_path, track_name, &activity, clip)?;
  395. Ok(n)
  396. }
  397. #[cfg(test)]
  398. mod tests {
  399. use log::info;
  400. use super::*;
  401. use crate::{config::Config, helpers::test_init};
  402. #[test]
  403. fn modkit_activity_igv() -> anyhow::Result<()> {
  404. test_init();
  405. let config = Config::default();
  406. let regions_bed4_path =
  407. "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed";
  408. for id in ["DUMCO", "CHAHA"] {
  409. let path = format!(
  410. "{}/{}_{}_modkit_pileup.bed.gz",
  411. config.tumoral_dir(id), id, config.tumoral_name
  412. );
  413. let out = format!(
  414. "{}/{}_{}_modkit_activity.bed",
  415. config.tumoral_dir(id), id, config.tumoral_name
  416. );
  417. let n = pileup_regions_to_activity_bed9_itemrgb(
  418. &path, regions_bed4_path, 100, &out,
  419. &format!("{id} Gene Activity"), 2.0,
  420. )?;
  421. info!("{id}: {n} gene activities written");
  422. }
  423. Ok(())
  424. }
  425. }