|
|
@@ -1,26 +1,28 @@
|
|
|
//! Modkit pileup parsing and gene-level epigenetic activity computation.
|
|
|
//!
|
|
|
-//! Reads modkit bedMethyl pileup files (BGZF-compressed) and BED4 region
|
|
|
+//! Reads modkit bedMethyl pileup files (BGZF + Tabix indexed) and BED4 region
|
|
|
//! files to compute a per-gene **epigenetic activity proxy** based on the
|
|
|
//! contrast between promoter and gene-body methylation.
|
|
|
//!
|
|
|
+//! Region methylation is fetched **per region** via the Tabix index using
|
|
|
+//! [`fetch_tabix_lines_with`], avoiding loading the full genome-wide pileup
|
|
|
+//! (hundreds of millions of CpG sites) into memory.
|
|
|
+//!
|
|
|
//! # Pipeline
|
|
|
//!
|
|
|
//! ```text
|
|
|
-//! pileup.bed.gz ──► read_pileup_index ──► HashMap<chrom, [PileupSite]>
|
|
|
-//! │
|
|
|
-//! regions.bed ──► read_bed4 ──► [BedRegion] ─┤
|
|
|
-//! │
|
|
|
-//! compute_gene_activity_from_pileup
|
|
|
-//! │
|
|
|
-//! write_gene_activity_bed9_itemrgb ◄─┘
|
|
|
+//! pileup.bed.gz + .tbi ──► fetch per region (fetch_tabix_lines_with)
|
|
|
+//! regions.bed ──► read_bed4
|
|
|
+//! │
|
|
|
+//! compute_gene_activity
|
|
|
+//! │
|
|
|
+//! write_gene_activity_bed9_itemrgb
|
|
|
//! ```
|
|
|
//!
|
|
|
//! # Coordinate system
|
|
|
//!
|
|
|
-//! All coordinates are **0-based half-open `[start, end)`** throughout, matching
|
|
|
-//! the BED convention. `PileupSite.start` is a single-base position (the interval
|
|
|
-//! is `[start, start+1)`).
|
|
|
+//! All coordinates are **0-based half-open `[start, end)`** throughout,
|
|
|
+//! matching the BED convention. `PileupSite.start` is a single-base position.
|
|
|
|
|
|
use anyhow::{anyhow, Context, Result};
|
|
|
use std::{
|
|
|
@@ -28,19 +30,23 @@ use std::{
|
|
|
collections::HashMap,
|
|
|
fs,
|
|
|
fs::File,
|
|
|
- io::{BufRead, BufWriter, Write},
|
|
|
+ io::{BufWriter, Write},
|
|
|
path::Path,
|
|
|
};
|
|
|
|
|
|
-use crate::{helpers::TempFileGuard, io::{bed::read_bed, readers::get_gz_reader}};
|
|
|
+use crate::{
|
|
|
+ helpers::TempFileGuard,
|
|
|
+ io::{bed::read_bed, readers::fetch_tabix_lines_with},
|
|
|
+ positions::GenomeRange,
|
|
|
+};
|
|
|
+
|
|
|
+// ─── Types ───────────────────────────────────────────────────────────────────
|
|
|
|
|
|
/// One methylation call row from a modkit pileup (bedMethyl) file.
|
|
|
///
|
|
|
/// Pileup rows are single-base intervals `[start, start+1)`.
|
|
|
-/// Only the columns needed for region aggregation are kept:
|
|
|
-/// - `start`: col 2, 0-based
|
|
|
-/// - `nvalid_cov`: col 10 (Nvalid_cov)
|
|
|
-/// - `nmod`: col 12 (Nmod)
|
|
|
+/// Only the columns needed for region aggregation are stored:
|
|
|
+/// col 2 (`start`), col 10 (`nvalid_cov`), col 12 (`nmod`).
|
|
|
#[derive(Debug, Clone)]
|
|
|
pub struct PileupSite {
|
|
|
/// 0-based position (half-open interval `[start, start+1)`)
|
|
|
@@ -79,6 +85,27 @@ pub struct ParsedName {
|
|
|
pub kind: RegionKind,
|
|
|
}
|
|
|
|
|
|
+/// Gene-level epigenetic activity proxy computed from a single sample's pileup.
|
|
|
+///
|
|
|
+/// Score: `log2(body_frac / prom_frac)` — positive means more active-like,
|
|
|
+/// negative means more repressed-like. Output coordinates use the **promoter**
|
|
|
+/// interval (anchors the signal near the TSS in genome browsers).
|
|
|
+#[derive(Debug, Clone)]
|
|
|
+pub struct GeneActivity {
|
|
|
+ pub gene_key: String,
|
|
|
+ pub chrom: String,
|
|
|
+ /// Promoter start, 0-based inclusive
|
|
|
+ pub start: u64,
|
|
|
+ /// Promoter end, 0-based exclusive
|
|
|
+ pub end: u64,
|
|
|
+ pub prom_frac: f64,
|
|
|
+ pub body_frac: f64,
|
|
|
+ /// `log2(body_frac / prom_frac)`
|
|
|
+ pub score: f64,
|
|
|
+}
|
|
|
+
|
|
|
+// ─── Parsing helpers ─────────────────────────────────────────────────────────
|
|
|
+
|
|
|
/// Parse a region name for `_prom` or `_body` suffix.
|
|
|
///
|
|
|
/// - `"FOO_prom"` → gene key `"FOO"`, kind [`RegionKind::Promoter`]
|
|
|
@@ -94,38 +121,11 @@ pub fn parse_region_name(name: &str) -> ParsedName {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-/// Diverging blue→white→red palette for IGV `itemRgb`.
|
|
|
-///
|
|
|
-/// Maps `v` clipped to `[-clip, +clip]` onto the range:
|
|
|
-/// - `v = -clip` → `(0, 0, 255)` pure blue
|
|
|
-/// - `v = 0` → `(255, 255, 255)` white
|
|
|
-/// - `v = +clip` → `(255, 0, 0)` pure red
|
|
|
-pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
|
|
|
- debug_assert!(clip > 0.0, "clip must be > 0");
|
|
|
- let x = v.clamp(-clip, clip);
|
|
|
- let t = (x + clip) / (2.0 * clip);
|
|
|
- if t < 0.5 {
|
|
|
- let u = t / 0.5;
|
|
|
- let r = (255.0 * u).round() as u8;
|
|
|
- let g = (255.0 * u).round() as u8;
|
|
|
- (r, g, 255)
|
|
|
- } else {
|
|
|
- let u = (t - 0.5) / 0.5;
|
|
|
- let g = (255.0 * (1.0 - u)).round() as u8;
|
|
|
- let b = (255.0 * (1.0 - u)).round() as u8;
|
|
|
- (255, g, b)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
/// Read a BED4 regions file into memory.
|
|
|
///
|
|
|
-/// Wraps [`read_bed`] from the `bed` module, inheriting its header skipping
|
|
|
-/// (`#`, `track`, `browser`) and gzip support. Coordinates are widened from
|
|
|
-/// `u32` to `u64`; contig names are recovered via `num_to_contig`.
|
|
|
-///
|
|
|
-/// # Arguments
|
|
|
-///
|
|
|
-/// * `path` - Path to the BED4 file (plain or BGZF-compressed)
|
|
|
+/// Wraps [`read_bed`], inheriting its header skipping (`#`, `track`, `browser`)
|
|
|
+/// and gzip support. Coordinates are widened from `u32` to `u64`;
|
|
|
+/// contig names are recovered via `GenomeRange::contig()`.
|
|
|
///
|
|
|
/// # Errors
|
|
|
///
|
|
|
@@ -142,136 +142,107 @@ pub fn read_bed4(path: &str) -> Result<Vec<BedRegion>> {
|
|
|
.collect())
|
|
|
}
|
|
|
|
|
|
-/// Read a modkit pileup file and index sites per chromosome.
|
|
|
-///
|
|
|
-/// Parses columns 1 (chrom), 2 (start), 10 (Nvalid_cov), 12 (Nmod).
|
|
|
-/// Sites with `Nvalid_cov = 0` are dropped. The per-chromosome vectors are
|
|
|
-/// sorted by position to enable binary search in [`region_fraction_modified`].
|
|
|
-///
|
|
|
-/// # Arguments
|
|
|
-///
|
|
|
-/// * `path` - Path to the modkit bedMethyl pileup file (BGZF-compressed)
|
|
|
-///
|
|
|
-/// # Errors
|
|
|
-///
|
|
|
-/// Returns an error if the file cannot be opened or any data line is malformed.
|
|
|
-pub fn read_pileup_index(path: &str) -> Result<HashMap<String, Vec<PileupSite>>> {
|
|
|
- let mut map: HashMap<String, Vec<PileupSite>> = HashMap::new();
|
|
|
-
|
|
|
- for (i, line) in get_gz_reader(path)?.lines().enumerate() {
|
|
|
- let line = line.with_context(|| format!("I/O error at line {}", i + 1))?;
|
|
|
- if line.starts_with('#') || line.is_empty() {
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- let f: Vec<&str> = line.split('\t').collect();
|
|
|
-
|
|
|
- let chrom = f.get(0)
|
|
|
- .ok_or_else(|| anyhow!("Missing chrom at line {}", i + 1))?.to_string();
|
|
|
- let start: u64 = f.get(1)
|
|
|
- .ok_or_else(|| anyhow!("Missing start at line {}", i + 1))?
|
|
|
- .parse().with_context(|| format!("Bad start at line {}", i + 1))?;
|
|
|
- let nvalid_cov: u64 = f.get(9)
|
|
|
- .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10) at line {}", i + 1))?
|
|
|
- .parse().with_context(|| format!("Bad Nvalid_cov at line {}", i + 1))?;
|
|
|
- let nmod: u64 = f.get(11)
|
|
|
- .ok_or_else(|| anyhow!("Missing Nmod (col 12) at line {}", i + 1))?
|
|
|
- .parse().with_context(|| format!("Bad Nmod at line {}", i + 1))?;
|
|
|
-
|
|
|
- if nvalid_cov == 0 {
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- map.entry(chrom).or_default().push(PileupSite { start, nvalid_cov, nmod });
|
|
|
- }
|
|
|
-
|
|
|
- for v in map.values_mut() {
|
|
|
- v.sort_by_key(|s| s.start);
|
|
|
- }
|
|
|
-
|
|
|
- Ok(map)
|
|
|
-}
|
|
|
+// ─── Region methylation ───────────────────────────────────────────────────────
|
|
|
|
|
|
-/// Aggregate methylation fraction over `[start, end)` from a sorted pileup slice.
|
|
|
+/// Aggregate methylation fraction over `[start, end)` from a **pre-loaded**
|
|
|
+/// sorted pileup slice.
|
|
|
///
|
|
|
/// Returns `(fraction, sum_nvalid_cov, sum_nmod)` where
|
|
|
-/// `fraction = sum(Nmod) / sum(Nvalid_cov)`. Returns `None` if no sites overlap
|
|
|
-/// or if total coverage is zero.
|
|
|
+/// `fraction = Σ Nmod / Σ Nvalid_cov`. Returns `None` if no sites overlap or
|
|
|
+/// total coverage is zero.
|
|
|
///
|
|
|
/// Uses [`slice::partition_point`] for O(log n) start lookup.
|
|
|
+/// Prefer [`region_fraction_from_tabix`] for production use — this variant is
|
|
|
+/// useful when the pileup is already loaded in memory (e.g. unit tests).
|
|
|
pub fn region_fraction_modified(
|
|
|
chrom_sites: &[PileupSite],
|
|
|
start: u64,
|
|
|
end: u64,
|
|
|
) -> Option<(f64, u64, u64)> {
|
|
|
let i = chrom_sites.partition_point(|s| s.start < start);
|
|
|
-
|
|
|
- let mut sum_cov: u64 = 0;
|
|
|
- let mut sum_mod: u64 = 0;
|
|
|
+ let mut sum_cov = 0u64;
|
|
|
+ let mut sum_mod = 0u64;
|
|
|
|
|
|
for s in &chrom_sites[i..] {
|
|
|
- if s.start >= end {
|
|
|
- break;
|
|
|
- }
|
|
|
+ if s.start >= end { break; }
|
|
|
sum_cov = sum_cov.saturating_add(s.nvalid_cov);
|
|
|
sum_mod = sum_mod.saturating_add(s.nmod);
|
|
|
}
|
|
|
|
|
|
- if sum_cov == 0 {
|
|
|
- None
|
|
|
- } else {
|
|
|
- Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))
|
|
|
- }
|
|
|
+ if sum_cov == 0 { None } else { Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod)) }
|
|
|
}
|
|
|
|
|
|
-/// Gene-level epigenetic activity proxy computed from a single sample's pileup.
|
|
|
+/// Fetch methylation fraction for `region` directly from a Tabix-indexed pileup.
|
|
|
///
|
|
|
-/// Score: `log2(body_frac / prom_frac)` where both fractions are
|
|
|
-/// coverage-weighted methylation ratios over the respective regions.
|
|
|
+/// Uses [`fetch_tabix_lines_with`] to seek only to the relevant BGZF chunks —
|
|
|
+/// no full-file load. Parses col 10 (`Nvalid_cov`) and col 12 (`Nmod`) from
|
|
|
+/// each overlapping pileup line.
|
|
|
///
|
|
|
-/// - Score > 0: gene-body methylation exceeds promoter → "active-like"
|
|
|
-/// - Score < 0: promoter methylation exceeds gene-body → "repressed-like"
|
|
|
+/// Returns `(fraction, sum_nvalid_cov, sum_nmod)`, or `None` if no sites
|
|
|
+/// overlap the region or total coverage is zero.
|
|
|
///
|
|
|
-/// Output coordinates use the **promoter** interval (anchors to TSS in genome browsers).
|
|
|
-#[derive(Debug, Clone)]
|
|
|
-pub struct GeneActivity {
|
|
|
- pub gene_key: String,
|
|
|
- pub chrom: String,
|
|
|
- /// Promoter start, 0-based
|
|
|
- pub start: u64,
|
|
|
- /// Promoter end, 0-based exclusive
|
|
|
- pub end: u64,
|
|
|
- pub prom_frac: f64,
|
|
|
- pub body_frac: f64,
|
|
|
- /// `log2(body_frac / prom_frac)`
|
|
|
- pub score: f64,
|
|
|
+/// # Arguments
|
|
|
+///
|
|
|
+/// * `pileup_bgz` - Path to the modkit pileup `.bed.gz`; index expected at
|
|
|
+/// `<pileup_bgz>.tbi`
|
|
|
+/// * `region` - Genomic region to query (0-based half-open)
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if the Tabix index cannot be read or a pileup line is
|
|
|
+/// malformed.
|
|
|
+pub fn region_fraction_from_tabix(
|
|
|
+ pileup_bgz: &str,
|
|
|
+ region: &GenomeRange,
|
|
|
+) -> Result<Option<(f64, u64, u64)>> {
|
|
|
+ let mut sum_cov = 0u64;
|
|
|
+ let mut sum_mod = 0u64;
|
|
|
+
|
|
|
+ fetch_tabix_lines_with(pileup_bgz, region, |line| {
|
|
|
+ let f: Vec<&str> = line.split('\t').collect();
|
|
|
+ let nvalid_cov: u64 = f.get(9)
|
|
|
+ .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10)"))?
|
|
|
+ .parse().context("Bad Nvalid_cov")?;
|
|
|
+ let nmod: u64 = f.get(11)
|
|
|
+ .ok_or_else(|| anyhow!("Missing Nmod (col 12)"))?
|
|
|
+ .parse().context("Bad Nmod")?;
|
|
|
+ sum_cov = sum_cov.saturating_add(nvalid_cov);
|
|
|
+ sum_mod = sum_mod.saturating_add(nmod);
|
|
|
+ Ok(())
|
|
|
+ })?;
|
|
|
+
|
|
|
+ if sum_cov == 0 { Ok(None) } else { Ok(Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))) }
|
|
|
}
|
|
|
|
|
|
-/// Compute per-gene epigenetic activity from a modkit pileup and BED4 regions.
|
|
|
+// ─── Activity computation ─────────────────────────────────────────────────────
|
|
|
+
|
|
|
+/// Compute per-gene epigenetic activity from a Tabix-indexed modkit pileup.
|
|
|
///
|
|
|
-/// For each gene with both a `_prom` and a `_body` region:
|
|
|
-/// 1. Aggregate methylation fraction `f(R) = Σ Nmod / Σ Nvalid_cov` per region
|
|
|
-/// 2. Skip genes where either fraction is 0 or coverage is below `min_sum_cov`
|
|
|
-/// 3. Compute `score = log2(body_frac / prom_frac)`
|
|
|
+/// For each gene with both a `_prom` and a `_body` region, fetches methylation
|
|
|
+/// fractions via [`region_fraction_from_tabix`] (one tabix seek per region),
|
|
|
+/// then computes `score = log2(body_frac / prom_frac)`.
|
|
|
+///
|
|
|
+/// Genes are skipped if either fraction is 0, total coverage is below
|
|
|
+/// `min_sum_cov`, or the contig is absent from the Tabix index.
|
|
|
///
|
|
|
/// Results are sorted by `(chrom, start)` for IGV compatibility.
|
|
|
///
|
|
|
/// # Arguments
|
|
|
///
|
|
|
-/// * `pileup` - Per-chromosome pileup sites from [`read_pileup_index`]
|
|
|
+/// * `pileup_bgz` - Path to the modkit pileup `.bed.gz` (Tabix-indexed)
|
|
|
/// * `regions` - BED4 regions with `_prom` / `_body` name suffixes
|
|
|
/// * `min_sum_cov` - Minimum total `Nvalid_cov` required per region
|
|
|
///
|
|
|
/// # Errors
|
|
|
///
|
|
|
-/// Returns an error if any region has `end <= start`.
|
|
|
-pub fn compute_gene_activity_from_pileup(
|
|
|
- pileup: &HashMap<String, Vec<PileupSite>>,
|
|
|
+/// Returns an error if any region has `end <= start` or a Tabix fetch fails.
|
|
|
+pub fn compute_gene_activity(
|
|
|
+ pileup_bgz: &str,
|
|
|
regions: &[BedRegion],
|
|
|
min_sum_cov: u64,
|
|
|
) -> Result<Vec<GeneActivity>> {
|
|
|
#[derive(Clone)]
|
|
|
- struct Part { chrom: String, start: u64, end: u64, frac: f64, sum_cov: u64 }
|
|
|
+ struct Part { chrom: String, start: u64, end: u64, frac: f64 }
|
|
|
#[derive(Default)]
|
|
|
struct Pair { prom: Option<Part>, body: Option<Part> }
|
|
|
|
|
|
@@ -288,40 +259,39 @@ pub fn compute_gene_activity_from_pileup(
|
|
|
let ParsedName { gene_key, kind } = parse_region_name(&r.name);
|
|
|
if kind == RegionKind::Other { continue; }
|
|
|
|
|
|
- let chrom_sites = match pileup.get(&r.chrom) { Some(v) => v, None => continue };
|
|
|
+ let region = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32);
|
|
|
|
|
|
- let (frac, sum_cov, _) = match region_fraction_modified(chrom_sites, r.start, r.end) {
|
|
|
+ let (frac, sum_cov, _) = match region_fraction_from_tabix(pileup_bgz, ®ion)? {
|
|
|
Some(x) => x,
|
|
|
None => continue,
|
|
|
};
|
|
|
|
|
|
if sum_cov < min_sum_cov || frac <= 0.0 { continue; }
|
|
|
|
|
|
- let part = Part { chrom: r.chrom.clone(), start: r.start, end: r.end, frac, sum_cov };
|
|
|
+ let part = Part { chrom: r.chrom.clone(), start: r.start, end: r.end, frac };
|
|
|
let entry = by_gene.entry(gene_key).or_default();
|
|
|
match kind {
|
|
|
- RegionKind::Promoter => entry.prom = Some(part),
|
|
|
- RegionKind::GeneBody => entry.body = Some(part),
|
|
|
- RegionKind::Other => {}
|
|
|
+ RegionKind::Promoter => entry.prom = Some(part),
|
|
|
+ RegionKind::GeneBody => entry.body = Some(part),
|
|
|
+ RegionKind::Other => {}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- let mut out = Vec::new();
|
|
|
- for (gene, pair) in by_gene {
|
|
|
- let (prom, body) = match (pair.prom, pair.body) {
|
|
|
- (Some(p), Some(b)) => (p, b),
|
|
|
- _ => continue,
|
|
|
- };
|
|
|
- out.push(GeneActivity {
|
|
|
- gene_key: gene,
|
|
|
- chrom: prom.chrom,
|
|
|
- start: prom.start,
|
|
|
- end: prom.end,
|
|
|
- prom_frac: prom.frac,
|
|
|
- body_frac: body.frac,
|
|
|
- score: (body.frac / prom.frac).log2(),
|
|
|
- });
|
|
|
- }
|
|
|
+ let mut out: Vec<GeneActivity> = by_gene
|
|
|
+ .into_iter()
|
|
|
+ .filter_map(|(gene, pair)| {
|
|
|
+ let (prom, body) = (pair.prom?, pair.body?);
|
|
|
+ Some(GeneActivity {
|
|
|
+ gene_key: gene,
|
|
|
+ chrom: prom.chrom,
|
|
|
+ start: prom.start,
|
|
|
+ end: prom.end,
|
|
|
+ prom_frac: prom.frac,
|
|
|
+ body_frac: body.frac,
|
|
|
+ score: (body.frac / prom.frac).log2(),
|
|
|
+ })
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
|
|
|
out.sort_by(|a, b| match a.chrom.cmp(&b.chrom) {
|
|
|
Ordering::Equal => a.start.cmp(&b.start),
|
|
|
@@ -331,9 +301,28 @@ pub fn compute_gene_activity_from_pileup(
|
|
|
Ok(out)
|
|
|
}
|
|
|
|
|
|
-/// Convert a log2-ratio to a BED `score` field (0–1000).
|
|
|
+// ─── Colour / score helpers ───────────────────────────────────────────────────
|
|
|
+
|
|
|
+/// Diverging blue→white→red palette for IGV `itemRgb`.
|
|
|
///
|
|
|
-/// `v` is clipped to `[-clip, +clip]` then linearly mapped:
|
|
|
+/// `v` is clipped to `[-clip, +clip]` and mapped to:
|
|
|
+/// `-clip` → `(0, 0, 255)`, `0` → `(255, 255, 255)`, `+clip` → `(255, 0, 0)`.
|
|
|
+pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
|
|
|
+ debug_assert!(clip > 0.0, "clip must be > 0");
|
|
|
+ let t = (v.clamp(-clip, clip) + clip) / (2.0 * clip);
|
|
|
+ if t < 0.5 {
|
|
|
+ let u = t / 0.5;
|
|
|
+ ((255.0 * u).round() as u8, (255.0 * u).round() as u8, 255)
|
|
|
+ } else {
|
|
|
+ let u = (t - 0.5) / 0.5;
|
|
|
+ let c = (255.0 * (1.0 - u)).round() as u8;
|
|
|
+ (255, c, c)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/// Map a log2-ratio to a BED `score` field (0–1000).
|
|
|
+///
|
|
|
+/// `v` is clipped to `[-clip, +clip]` then linearly scaled:
|
|
|
/// `-clip` → 0, `0` → 500, `+clip` → 1000.
|
|
|
pub fn bed_score_from_log2_ratio(v: f64, clip: f64) -> u16 {
|
|
|
debug_assert!(clip > 0.0, "clip must be > 0");
|
|
|
@@ -341,29 +330,21 @@ pub fn bed_score_from_log2_ratio(v: f64, clip: f64) -> u16 {
|
|
|
(t * 1000.0).round().clamp(0.0, 1000.0) as u16
|
|
|
}
|
|
|
|
|
|
+// ─── Output ───────────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
/// Write gene activity as an IGV BED9 track with `itemRgb=On`.
|
|
|
///
|
|
|
-/// Color encodes `GeneActivity::score` (`log2(body/prom)`) via
|
|
|
-/// [`diverging_rgb`] clipped to `[-clip, +clip]`. The BED `score` column
|
|
|
-/// (col 5) encodes the same quantity scaled to `[0, 1000]` via
|
|
|
-/// [`bed_score_from_log2_ratio`].
|
|
|
-///
|
|
|
-/// Output is written atomically: data goes to a UUID temp file in the same
|
|
|
-/// directory as `path`, then renamed on success. A [`TempFileGuard`] ensures
|
|
|
-/// the temp file is removed automatically on failure.
|
|
|
+/// Colour encodes `GeneActivity::score` via [`diverging_rgb`] clipped to
|
|
|
+/// `[-clip, +clip]`. BED col 5 (`score`) holds the same quantity scaled to
|
|
|
+/// `[0, 1000]` via [`bed_score_from_log2_ratio`].
|
|
|
///
|
|
|
-/// # Arguments
|
|
|
-///
|
|
|
-/// * `path` - Destination BED9 file path
|
|
|
-/// * `track_name` - IGV track name shown in the track label
|
|
|
-/// * `activity` - Gene activity records (produced by [`compute_gene_activity_from_pileup`])
|
|
|
-/// * `clip` - Symmetric clip value for color and score mapping; must be > 0
|
|
|
+/// Written atomically: data goes to a UUID temp file, renamed on success.
|
|
|
+/// A [`TempFileGuard`] removes the temp file automatically on failure.
|
|
|
///
|
|
|
/// # Errors
|
|
|
///
|
|
|
-/// Returns an error if `clip <= 0`, the file cannot be written, or the atomic
|
|
|
-/// rename fails.
|
|
|
-pub fn write_gene_activity_bed9_itemrgb(
|
|
|
+/// Returns an error if `clip <= 0`, the file cannot be written, or the rename fails.
|
|
|
+pub fn write_gene_activity_bed9(
|
|
|
path: &str,
|
|
|
track_name: &str,
|
|
|
activity: &[GeneActivity],
|
|
|
@@ -377,21 +358,20 @@ pub fn write_gene_activity_bed9_itemrgb(
|
|
|
let mut guard = TempFileGuard::new();
|
|
|
let tmp_path = guard.tmp_path("", tmp_dir);
|
|
|
|
|
|
- let out = File::create(&tmp_path)
|
|
|
- .with_context(|| format!("failed to create temp file: {}", tmp_path.display()))?;
|
|
|
- let mut w = BufWriter::new(out);
|
|
|
+ let mut w = BufWriter::new(
|
|
|
+ File::create(&tmp_path)
|
|
|
+ .with_context(|| format!("failed to create temp file: {}", tmp_path.display()))?,
|
|
|
+ );
|
|
|
|
|
|
writeln!(w, r#"track name="{}" itemRgb="On""#, track_name)?;
|
|
|
-
|
|
|
for a in activity {
|
|
|
let (rr, gg, bb) = diverging_rgb(a.score, clip);
|
|
|
- let bed_score = bed_score_from_log2_ratio(a.score, clip);
|
|
|
writeln!(
|
|
|
w,
|
|
|
"{}\t{}\t{}\t{}={:.4}\t{}\t.\t{}\t{}\t{},{},{}",
|
|
|
a.chrom, a.start, a.end,
|
|
|
a.gene_key, a.score,
|
|
|
- bed_score,
|
|
|
+ bed_score_from_log2_ratio(a.score, clip),
|
|
|
a.start, a.end,
|
|
|
rr, gg, bb,
|
|
|
)?;
|
|
|
@@ -407,13 +387,14 @@ pub fn write_gene_activity_bed9_itemrgb(
|
|
|
Ok(())
|
|
|
}
|
|
|
|
|
|
-/// End-to-end helper: pileup + BED4 regions → IGV BED9 activity track.
|
|
|
+// ─── Pipeline ─────────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
+/// End-to-end pipeline: Tabix pileup + BED4 regions → IGV BED9 activity track.
|
|
|
///
|
|
|
-/// Runs the full pipeline in order:
|
|
|
-/// 1. [`read_pileup_index`] — parse and index the modkit pileup
|
|
|
-/// 2. [`read_bed4`] — parse the gene promoter/body regions
|
|
|
-/// 3. [`compute_gene_activity_from_pileup`] — compute log2 activity scores
|
|
|
-/// 4. [`write_gene_activity_bed9_itemrgb`] — write the BED9 track atomically
|
|
|
+/// 1. [`read_bed4`] — parse the gene promoter/body regions
|
|
|
+/// 2. [`compute_gene_activity`] — fetch per-region methylation fractions via
|
|
|
+/// Tabix and compute log2 activity scores
|
|
|
+/// 3. [`write_gene_activity_bed9`] — write the BED9 track atomically
|
|
|
///
|
|
|
/// # Returns
|
|
|
///
|
|
|
@@ -422,33 +403,30 @@ pub fn write_gene_activity_bed9_itemrgb(
|
|
|
/// # Errors
|
|
|
///
|
|
|
/// Returns an error if any pipeline step fails.
|
|
|
-pub fn pileup_regions_to_activity_bed9_itemrgb(
|
|
|
- pileup_path: &str,
|
|
|
+pub fn pileup_to_activity_bed9(
|
|
|
+ pileup_bgz: &str,
|
|
|
regions_bed4_path: &str,
|
|
|
min_sum_cov: u64,
|
|
|
out_bed9_path: &str,
|
|
|
track_name: &str,
|
|
|
clip: f64,
|
|
|
) -> Result<usize> {
|
|
|
- let pile = read_pileup_index(pileup_path)?;
|
|
|
let regions = read_bed4(regions_bed4_path)?;
|
|
|
- let activity = compute_gene_activity_from_pileup(&pile, ®ions, min_sum_cov)?;
|
|
|
+ let activity = compute_gene_activity(pileup_bgz, ®ions, min_sum_cov)?;
|
|
|
let n = activity.len();
|
|
|
- write_gene_activity_bed9_itemrgb(out_bed9_path, track_name, &activity, clip)?;
|
|
|
+ write_gene_activity_bed9(out_bed9_path, track_name, &activity, clip)?;
|
|
|
Ok(n)
|
|
|
}
|
|
|
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
use log::info;
|
|
|
-
|
|
|
use super::*;
|
|
|
use crate::{config::Config, helpers::test_init};
|
|
|
|
|
|
#[test]
|
|
|
fn modkit_activity_igv() -> anyhow::Result<()> {
|
|
|
test_init();
|
|
|
-
|
|
|
let config = Config::default();
|
|
|
let regions_bed4_path =
|
|
|
"/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed";
|
|
|
@@ -462,13 +440,12 @@ mod tests {
|
|
|
"{}/{}_{}_modkit_activity.bed",
|
|
|
config.tumoral_dir(id), id, config.tumoral_name
|
|
|
);
|
|
|
- let n = pileup_regions_to_activity_bed9_itemrgb(
|
|
|
+ let n = pileup_to_activity_bed9(
|
|
|
&path, regions_bed4_path, 100, &out,
|
|
|
&format!("{id} Gene Activity"), 2.0,
|
|
|
)?;
|
|
|
info!("{id}: {n} gene activities written");
|
|
|
}
|
|
|
-
|
|
|
Ok(())
|
|
|
}
|
|
|
}
|