// variants_stats.rs

//! # Variant Statistics and Analysis
//!
//! This module provides statistical analysis, rate calculations, and data preparation
//! for genomic variant collections, particularly focused on somatic variant analysis
//! and generalized linear modeling (GLM) of mutation rates.
//!
//! ## Core Types
//!
//! ### Statistics Aggregation
//!
//! - [`VariantsStats`] - Comprehensive statistics for a variant collection including:
//!   - Variant counts by category (SNV, insertion, deletion, etc.)
//!   - Variant allele frequency (VAF) distributions
//!   - Read depth distributions
//!   - VEP consequence and impact distributions
//!   - Gene-level mutation counts
//!   - COSMIC and gnomAD frequency overlaps
//!   - Somatic mutation rate calculations
//!
//! ### Mutation Rate Analysis
//!
//! - [`SomaticVariantRates`] - Somatic mutation burden metrics:
//!   - Whole-genome mutation rate (mutations per megabase)
//!   - Coding region mutation rate
//!   - Nonsynonymous mutation rate
//!   - Exon coverage statistics
//!
//! ### GLM Data Preparation
//!
//! - [`GlmRow`] - Genomic region with mutation count and feature annotations for statistical modeling
//!   - Used to build datasets for generalized linear models (GLMs)
//!   - Enables analysis of mutation rate drivers (e.g., replication timing, chromatin state)
//!
//! ## Key Functions
//!
//! ### Mutation Rate Calculation
//!
//! - [`somatic_rates()`] - Calculate somatic mutation rates across WGS and coding regions
//! - [`somatic_depth_quality_ranges()`] - Identify high-depth and low-quality genomic regions
//!
//! ### GLM Dataset Generation
//!
//! - [`make_glm_rows_from_regions_par()`] - Build GLM-ready dataset from variants and genomic bins
//! - [`write_glm_rows()`] - Export GLM rows to CSV for downstream statistical analysis
//!
//! ### Utilities
//!
//! - [`ranges_from_consecutive_true_iter()`] - Convert boolean iterator to genomic ranges
//! - [`merge_adjacent_ranges()`] - Merge adjacent genomic ranges
//!
//! ## Usage Examples
//!
//! ### Computing Variant Statistics
//!
//! ```ignore
//! use pandora_lib_promethion::variant::variants_stats::VariantsStats;
//! use pandora_lib_promethion::variant::variant_collection::Variants;
//!
//! let mut variants = clairs.variants(&annotations)?;
//! let high_depth_ranges = Vec::new(); // or from somatic_depth_quality_ranges()
//!
//! let stats = VariantsStats::new(&mut variants, "sample_001", &config, &high_depth_ranges)?;
//!
//! println!("Total variants: {}", stats.n);
//! println!("WGS mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_wgs);
//! println!("Coding mutation rate: {:.2} mutations/Mb", stats.somatic_rates.somatic_mutation_rate_coding);
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ### Calculating Somatic Mutation Rates
//!
//! ```ignore
//! use pandora_lib_promethion::variant::variants_stats::somatic_rates;
//! use pandora_lib_promethion::io::gff::features_ranges;
//!
//! // Load exon ranges from the configured GFF/GTF
//! let exon_ranges = features_ranges("exon", &config)?;
//!
//! // Calculate mutation rates
//! let rates = somatic_rates(&variants.data, &exon_ranges, &config)?;
//!
//! println!("Total variants: {}", rates.total_variants);
//! println!("Variants in coding: {}", rates.variants_in_coding);
//! println!("Nonsynonymous rate: {:.2} mutations/Mb", rates.somatic_nonsynonymous_rate_coding);
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ### Identifying High-Depth Regions
//!
//! ```ignore
//! use pandora_lib_promethion::variant::variants_stats::somatic_depth_quality_ranges;
//!
//! let (high_depth, low_quality) = somatic_depth_quality_ranges("sample_001", &config)?;
//!
//! println!("High-depth regions: {}", high_depth.len());
//! println!("Low-quality regions: {}", low_quality.len());
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ### Preparing GLM Dataset
//!
//! ```ignore
//! use pandora_lib_promethion::variant::variants_stats::{
//!     make_glm_rows_from_regions_par, write_glm_rows,
//! };
//! use pandora_lib_promethion::annotation::{Annotation, ReplicationClass};
//! use pandora_lib_promethion::io::bed::read_bed;
//!
//! // Load genomic bins (e.g., 1 Mb windows) as genome ranges
//! let bins: Vec<_> = read_bed("genome_1mb_bins.bed")?
//!     .into_iter()
//!     .map(|e| e.range)
//!     .collect();
//!
//! // Define features to track
//! let tracked_features = vec![
//!     Annotation::ReplicationTiming(ReplicationClass::Early),
//!     Annotation::ReplicationTiming(ReplicationClass::Late),
//!     Annotation::HighDepth,
//! ];
//!
//! // Generate GLM dataset
//! let (glm_rows, mutation_rates) = make_glm_rows_from_regions_par(
//!     &variants,
//!     &bins,
//!     &tracked_features,
//!     2, // min_callers
//! );
//!
//! // Save to CSV for R/Python analysis
//! write_glm_rows(&glm_rows, "glm_data.csv")?;
//!
//! // Print per-feature mutation rates
//! for (feature, rate) in mutation_rates {
//!     println!("{}: {:.2} mutations/Mb", feature, rate);
//! }
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ## Statistical Modeling Workflow
//!
//! This module supports a complete workflow for modeling somatic mutation rates:
//!
//! 1. **Load variants** from callers (ClairS, DeepSomatic, etc.)
//! 2. **Annotate variants** with VEP, COSMIC, gnomAD
//! 3. **Compute statistics** using `VariantsStats::new()`
//! 4. **Identify quality regions** using `somatic_depth_quality_ranges()`
//! 5. **Prepare GLM dataset** using `make_glm_rows_from_regions_par()`
//! 6. **Export to CSV** using `write_glm_rows()`
//! 7. **Model in R/Python** using GLM/GAM frameworks
//!
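//! Steps 3-6 can be chained in a single pass; a condensed sketch is shown below, reusing
//! the `variants`, `config`, `bins`, and `tracked_features` values from the examples above
//! (output paths are hypothetical, and the quality regions are computed first because
//! `VariantsStats::new` consumes them):
//!
//! ```ignore
//! let (high_depth, _low_quality) = somatic_depth_quality_ranges("sample_001", &config)?;
//! let stats = VariantsStats::new(&mut variants, "sample_001", &config, &high_depth)?;
//! stats.save_to_json("out/sample_001_variant_stats.json.gz")?;
//!
//! let (glm_rows, _rates) =
//!     make_glm_rows_from_regions_par(&variants, &bins, &tracked_features, config.min_n_callers);
//! write_glm_rows(&glm_rows, "out/sample_001_glm_rows.csv")?;
//! # Ok::<(), anyhow::Error>(())
//! ```
//!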
//! ## Performance
//!
//! All major functions use Rayon for parallel processing:
//! - `VariantsStats::new()` parallelizes per-variant aggregation
//! - `somatic_depth_quality_ranges()` processes contigs in parallel
//! - `make_glm_rows_from_regions_par()` parallelizes region processing with efficient binary search
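//!
//! Parallelism can be capped by configuring Rayon's global thread pool before calling
//! into this module; this uses the standard Rayon builder API and is not something this
//! module does itself:
//!
//! ```ignore
//! rayon::ThreadPoolBuilder::new().num_threads(8).build_global()?;
//! ```
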
use std::collections::{BTreeMap, BTreeSet};

use anyhow::Context;
use csv::ByteRecord;
use dashmap::DashMap;
use log::debug;
use ordered_float::OrderedFloat;
use rayon::prelude::*;
use serde::{Deserialize, Serialize, Serializer};

use crate::{
    annotation::{vep::VepImpact, Annotation},
    config::Config,
    helpers::bin_data,
    io::{
        bed::read_bed, dict::read_dict, gff::features_ranges, readers::get_gz_reader,
        tsv::tsv_reader, writers::get_gz_writer,
    },
    positions::{
        contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par,
        GenomeRange,
    },
    scan::bin::{parse_bin_record_into, BinRowBuf},
};

use super::variant_collection::{Variant, Variants};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VariantsStats {
    pub n: u32,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub alteration_categories: DashMap<String, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub cosmic: DashMap<u64, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub n_alts: DashMap<u32, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub depths: DashMap<u32, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub vafs: DashMap<OrderedFloat<f32>, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub vep_impact: DashMap<String, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub consequences: DashMap<String, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub genes: DashMap<String, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub genes_nonsynonymus: DashMap<String, u32>,
    pub n_gnomad: usize,
    pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
    pub somatic_rates: SomaticVariantRates,
    pub high_depth_somatic_rates: SomaticVariantRates,
    pub mutation_rates: Vec<(String, (u32, usize))>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub insertions_len: DashMap<u32, u32>,
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub deletions_len: DashMap<u32, u32>,
}
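
/// Serde helper used by [`VariantsStats`]: copies the `DashMap` into a `BTreeMap` so
/// that keys are serialized in sorted, deterministic order (plain `DashMap` iteration
/// order is unspecified).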
pub fn serialize_dashmap_sort<S, T>(
    data: &DashMap<T, u32>,
    serializer: S,
) -> Result<S::Ok, S::Error>
where
    S: Serializer,
    T: Serialize + Ord + std::hash::Hash + Clone,
{
    let ordered: BTreeMap<_, _> = data
        .iter()
        .map(|entry| (entry.key().clone(), *entry.value()))
        .collect();
    ordered.serialize(serializer)
}
impl VariantsStats {
    pub fn new(
        variants: &mut Variants,
        _id: &str,
        config: &Config,
        high_depth_ranges: &[GenomeRange],
    ) -> anyhow::Result<Self> {
        let n = variants.data.len() as u32;
        let alteration_categories: DashMap<String, u32> = DashMap::new();
        let vep_impact: DashMap<String, u32> = DashMap::new();
        let genes: DashMap<String, u32> = DashMap::new();
        let genes_nonsynonymus: DashMap<String, u32> = DashMap::new();
        let consequences: DashMap<String, u32> = DashMap::new();
        let n_alts: DashMap<u32, u32> = DashMap::new();
        let depths: DashMap<u32, u32> = DashMap::new();
        let vafs: DashMap<OrderedFloat<f32>, u32> = DashMap::new();
        let cosmic: DashMap<u64, u32> = DashMap::new();
        let gnomads: DashMap<String, Vec<f64>> = DashMap::new();
        let context: DashMap<String, Vec<String>> = DashMap::new();
        let insertions_len: DashMap<u32, u32> = DashMap::new();
        let deletions_len: DashMap<u32, u32> = DashMap::new();
        variants.data.par_iter().for_each(|v| {
            // VEP
            if let Ok(best_vep) = v.best_vep() {
                if let Some(ref impact) = best_vep.extra.impact {
                    *vep_impact.entry(impact.to_string()).or_default() += 1;
                }
                if let Some(ref gene) = best_vep.extra.symbol {
                    *genes.entry(gene.to_string()).or_default() += 1;
                }
                if let (Some(impact), Some(gene)) = (best_vep.extra.impact, best_vep.extra.symbol) {
                    if impact <= VepImpact::MODERATE {
                        *genes_nonsynonymus.entry(gene).or_default() += 1;
                    }
                }
                if let Some(csqs) = best_vep.consequence {
                    let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
                    csq.sort();
                    csq.dedup();
                    *consequences.entry(csq.join(", ")).or_default() += 1;
                }
            }
            // VAF: stored as a percentage rounded to one decimal place (0.1% bins)
            let (n_alt, depth) = v.n_alt_depth();
            *n_alts.entry(n_alt as u32).or_default() += 1;
            *depths.entry(depth as u32).or_default() += 1;
            let vaf = OrderedFloat::from(((n_alt * 1_000.0 / depth).round() / 10.0) as f32);
            if !vaf.is_nan() {
                *vafs.entry(vaf).or_default() += 1;
            }
            let callers = v.callers();
            // Annotations
            v.annotations.iter().for_each(|annotation| {
                match annotation {
                    Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
                    Annotation::GnomAD(v) => {
                        v.to_vec()
                            .iter()
                            .map(|e| e.to_string_value_pair())
                            .for_each(|(key, value)| {
                                gnomads.entry(key).or_default().push(value);
                            });
                    }
                    Annotation::TriNucleotides(bases) => context
                        .entry(bases.iter().map(|v| v.to_string()).collect())
                        .or_default()
                        .push(callers.clone()),
                    _ => (),
                };
            });
            // Alteration category
            let mut alteration_category_str = v
                .alteration_category()
                .iter()
                .map(|c| c.to_string())
                .collect::<Vec<String>>();
            alteration_category_str.sort();
            alteration_category_str.dedup();
            *alteration_categories
                .entry(alteration_category_str.join(", "))
                .or_default() += 1;
            if let Some(len) = v.insertion_length() {
                *insertions_len.entry(len).or_default() += 1;
            }
            if let Some(len) = v.deletion_length() {
                *deletions_len.entry(len).or_default() += 1;
            }
        });
        let mut n_gnomad = 0;
        let gnomad = gnomads
            .iter()
            .map(|e| {
                let data = e.value().to_vec();
                n_gnomad = data.len();
                (e.key().to_string(), bin_data(data, 0.0001))
            })
            .collect();
        let exon_ranges = features_ranges("exon", config)?;
        let exon_ranges = merge_overlapping_genome_ranges(&exon_ranges);
        let all_somatic_rates = somatic_rates(&variants.data, &exon_ranges, config)?;
        let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
        let exons_high_depth = range_intersection_par(
            &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
            &exon_ranges_ref,
        );
        let high_depth = somatic_rates(&variants.data, &exons_high_depth, config)?;
        let mut mutation_rates = Vec::new();
        // HighDepths
        let ann = Annotation::HighDepth;
        let res = variants.annotate_with_ranges(
            high_depth_ranges,
            Some(ann.clone()),
            config.min_n_callers,
            Vec::new(),
        );
        mutation_rates.push((ann.to_string(), res));
        // Exons
        let res =
            variants.annotate_with_ranges(&exon_ranges, None, config.min_n_callers, Vec::new());
        mutation_rates.push(("Exons".to_string(), res));
        // ExonsHighDepths
        let res = variants.annotate_with_ranges(
            &exons_high_depth,
            None,
            config.min_n_callers,
            Vec::new(),
        );
        mutation_rates.push(("Exons HighDepths".to_string(), res));
        for (name, path) in config.panels.iter() {
            let panel_ranges: Vec<GenomeRange> =
                read_bed(path)?.into_iter().map(|e| e.range).collect();
            let ann = Annotation::Panel(name.to_string());
            let res = variants.annotate_with_ranges(
                &panel_ranges,
                Some(ann.clone()),
                config.min_n_callers,
                Vec::new(),
            );
            mutation_rates.push((ann.to_string(), res));
            let panel_ranges: Vec<GenomeRange> = range_intersection_par(
                &high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
                &panel_ranges.iter().collect::<Vec<&GenomeRange>>(),
            );
            let ann = Annotation::Panel(format!("{}_HighDepths", name));
            let res = variants.annotate_with_ranges(
                &panel_ranges,
                Some(ann.clone()),
                config.min_n_callers,
                Vec::new(),
            );
            mutation_rates.push((ann.to_string(), res));
        }
        // mutation_rates.sort_by(|(_, (_, an)), (_, (_, bn))| an.cmp(bn));
        // Output rates per megabase
        for (feature, (bp, n)) in &mutation_rates {
            let rate = (*n as f64) / ((*bp as f64) / 1e6);
            println!("{feature}: {rate:.2} mutations/Mb ({n} / {bp})");
        }
        // let (glm_rows, rates_per_mb) =
        //     make_glm_rows_from_regions_par(variants, &high_depth_ranges, &tracked_annotations, config.min_n_callers);
        //
        // // Output rates per megabase
        // for (feature, rate) in &rates_per_mb {
        //     println!("{feature}: {rate:.2} mutations/Mb");
        // }
        //
        // write_glm_rows(
        //     &glm_rows,
        //     &format!("{}/{id}_glm_rows.csv", config.somatic_pipe_stats(id)),
        // )?;
        Ok(Self {
            n,
            alteration_categories,
            cosmic,
            gnomad,
            vafs,
            n_alts,
            depths,
            vep_impact,
            consequences,
            genes,
            genes_nonsynonymus,
            n_gnomad,
            somatic_rates: all_somatic_rates,
            high_depth_somatic_rates: high_depth,
            mutation_rates,
            insertions_len,
            deletions_len,
        })
    }
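
    /// Writes the statistics as JSON to a gzip/BGZF-compressed file.
    ///
    /// A minimal round-trip sketch (the output path is hypothetical):
    ///
    /// ```ignore
    /// stats.save_to_json("out/sample_001_variant_stats.json.gz")?;
    /// let reloaded = VariantsStats::load_from_json("out/sample_001_variant_stats.json.gz")?;
    /// assert_eq!(stats.n, reloaded.n);
    /// ```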
    pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
        let mut writer = get_gz_writer(filename, true)
            .with_context(|| anyhow::anyhow!("Failed to create file: {}", filename))?;
        serde_json::to_writer(&mut writer, self)
            .with_context(|| anyhow::anyhow!("Failed to serialize JSON to file: {}", filename))?;
        writer.close().with_context(|| {
            anyhow::anyhow!("Failed to close BGZF writer for file: {}", filename)
        })?;
        debug!("Successfully saved variants to {}", filename);
        Ok(())
    }

    pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
        let mut reader = get_gz_reader(filename)
            .with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;
        let variants: Self = serde_json::from_reader(&mut reader)
            .with_context(|| format!("Failed to deserialize JSON from file: {}", filename))?;
        debug!("Successfully loaded variants from {}", filename);
        Ok(variants)
    }
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct SomaticVariantRates {
    pub wgs_length: u32,
    pub total_variants: usize,
    pub somatic_mutation_rate_wgs: f64,
    pub exon_count: usize,
    pub total_exon_bases: u32,
    pub variants_in_coding: usize,
    pub somatic_mutation_rate_coding: f64,
    pub nonsynonymous_variants: usize,
    pub somatic_nonsynonymous_rate_coding: f64,
}
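
/// Computes somatic mutation rates over the whole genome and over the supplied exon
/// ranges. All rates are mutations per megabase; in terms of the returned
/// [`SomaticVariantRates`] fields, the arithmetic below amounts to (illustrative
/// sketch, not compiled):
///
/// ```ignore
/// let somatic_mutation_rate_wgs = total_variants as f64 / (wgs_length as f64 / 1e6);
/// let somatic_mutation_rate_coding = variants_in_coding as f64 / (total_exon_bases as f64 / 1e6);
/// ```
///
/// Nonsynonymous variants are the coding variants whose best VEP impact passes the
/// `impact <= VepImpact::MODERATE` filter used below.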
pub fn somatic_rates(
    variants: &[Variant],
    exon_ranges: &Vec<GenomeRange>,
    config: &Config,
) -> anyhow::Result<SomaticVariantRates> {
    let ol = par_overlaps(variants, exon_ranges);
    let n_coding = ol
        .iter()
        .filter_map(|i| variants[*i].best_vep().ok())
        .filter_map(|bv| bv.impact())
        .filter(|impact| *impact <= VepImpact::MODERATE)
        .count();
    let n_bases_m: u32 = exon_ranges.par_iter().map(|gr| gr.length()).sum();
    // Rates are expressed per megabase (1e6 bases).
    let mega_base_m = n_bases_m as f64 / 1e6;
    let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
    let rate_wgs = variants.len() as f64 / (wgs_len as f64 / 1e6);
    let n_exons_mb = ol.len() as f64 / mega_base_m;
    let n_coding_mb = n_coding as f64 / mega_base_m;
    Ok(SomaticVariantRates {
        total_variants: variants.len(),
        exon_count: exon_ranges.len(),
        variants_in_coding: ol.len(),
        nonsynonymous_variants: n_coding,
        total_exon_bases: n_bases_m,
        wgs_length: wgs_len,
        somatic_mutation_rate_wgs: rate_wgs,
        somatic_mutation_rate_coding: n_exons_mb,
        somatic_nonsynonymous_rate_coding: n_coding_mb,
    })
}
/// Computes genomic ranges with sufficient depth and with excessive low-quality reads
/// by jointly scanning normal and tumoral per-contig depth TSV files.
///
/// For each contig, the function reads the corresponding gzipped TSV files from the
/// normal and tumoral directories, iterates over bins in lockstep, and validates that
/// both files are perfectly aligned (same contig, same start position, same bin layout).
///
/// Two kinds of ranges are produced:
/// - **High-quality depth ranges**: consecutive bins where *both* normal and tumoral
///   depths are greater than or equal to `config.min_high_quality_depth`.
/// - **Low-quality excess ranges**: consecutive bins where *both* normal and tumoral
///   bins contain more low-quality reads than `config.max_depth_low_quality`.
///
/// Contigs are processed in parallel. Within each contig, adjacent or overlapping
/// ranges are merged before being returned.
///
/// # Errors
///
/// Returns an error if:
/// - A normal or tumoral TSV file cannot be opened or decompressed.
/// - A TSV record cannot be parsed.
/// - The normal and tumoral files differ in number of lines.
/// - Corresponding records disagree on contig name, start position, or bin structure
///   (depth or low-quality vector length).
///
/// # Returns
///
/// A tuple `(high_depth_ranges, lowq_excess_ranges)` where each element is a `Vec<GenomeRange>`
/// spanning all contigs.
///
/// # Notes
///
/// - Input TSV files are expected to be tab-delimited, headerless, and sorted by
///   genomic position.
/// - Low-quality ranges represent **excess low-quality signal** (bins exceeding the
///   configured threshold), not acceptable regions.
/// - The implementation reuses parsing buffers and avoids per-line allocations for
///   performance.
///
/// # Panics
///
/// This function does not intentionally panic.
pub fn somatic_depth_quality_ranges(
    id: &str,
    config: &Config,
) -> anyhow::Result<(Vec<GenomeRange>, Vec<GenomeRange>)> {
    // chr1..chr22 + X,Y,M
    let contigs: Vec<String> = read_dict(&config.dict_file)?
        .into_iter()
        .map(|(sn, _ln)| sn)
        .collect();
    let cfg = config;
    let per_contig = contigs
        .into_par_iter()
        .map(|contig| {
            let normal_path = format!("{}/{}_count.tsv.gz", cfg.normal_dir_count(id), contig);
            let tumor_path = format!("{}/{}_count.tsv.gz", cfg.tumoral_dir_count(id), contig);
            let normal_rdr = get_gz_reader(&normal_path)
                .with_context(|| format!("Failed to open normal file: {}", normal_path))?;
            let tumor_rdr = get_gz_reader(&tumor_path)
                .with_context(|| format!("Failed to open tumor file: {}", tumor_path))?;
            let mut high_runs: Vec<GenomeRange> = Vec::new();
            let mut lowq_runs: Vec<GenomeRange> = Vec::new();
            let mut n_tsv = tsv_reader(normal_rdr); // normal_rdr: impl Read
            let mut t_tsv = tsv_reader(tumor_rdr);
            let mut n_rec = ByteRecord::new();
            let mut t_rec = ByteRecord::new();
            let mut n_buf = BinRowBuf::default();
            let mut t_buf = BinRowBuf::default();
            let mut line_no = 0usize;
            loop {
                let n_ok = n_tsv.read_byte_record(&mut n_rec)?;
                let t_ok = t_tsv.read_byte_record(&mut t_rec)?;
                if n_ok || t_ok {
                    line_no += 1;
                }
                match (n_ok, t_ok) {
                    (false, false) => break,
                    (true, false) => {
                        anyhow::bail!("{} has extra lines at {}", normal_path, line_no)
                    }
                    (false, true) => anyhow::bail!("{} has extra lines at {}", tumor_path, line_no),
                    (true, true) => {
                        let (n_start, n_depths, n_lowq) =
                            parse_bin_record_into(&n_rec, &mut n_buf, &contig)
                                .with_context(|| format!("{} line {}", normal_path, line_no))?;
                        let (t_start, t_depths, t_lowq) =
                            parse_bin_record_into(&t_rec, &mut t_buf, &contig)
                                .with_context(|| format!("{} line {}", tumor_path, line_no))?;
                        anyhow::ensure!(n_start == t_start, "start mismatch at line {}", line_no);
                        anyhow::ensure!(
                            n_depths.len() == t_depths.len(),
                            "depth len mismatch at line {}",
                            line_no
                        );
                        anyhow::ensure!(
                            n_lowq.len() == t_lowq.len(),
                            "lowq len mismatch at line {}",
                            line_no
                        );
                        let high_mask_iter = n_depths.iter().zip(t_depths).map(|(&nd, &td)| {
                            nd >= cfg.min_high_quality_depth && td >= cfg.min_high_quality_depth
                        });
                        let lowq_mask_iter = n_lowq.iter().zip(t_lowq).map(|(&nq, &tq)| {
                            nq > cfg.max_depth_low_quality && tq > cfg.max_depth_low_quality
                        });
                        high_runs.extend(ranges_from_consecutive_true_iter(
                            high_mask_iter,
                            n_start,
                            &contig,
                        ));
                        lowq_runs.extend(ranges_from_consecutive_true_iter(
                            lowq_mask_iter,
                            n_start,
                            &contig,
                        ));
                    }
                }
            }
            // Merge adjacent/overlapping ranges within this contig
            high_runs = merge_adjacent_ranges(high_runs);
            lowq_runs = merge_adjacent_ranges(lowq_runs);
            Ok((high_runs, lowq_runs))
        })
        .collect::<anyhow::Result<Vec<_>>>()?;
    // Flatten
    let (high_all, low_all): (Vec<_>, Vec<_>) = per_contig.into_iter().unzip();
    Ok((
        high_all.into_iter().flatten().collect(),
        low_all.into_iter().flatten().collect(),
    ))
}
/// Converts a boolean mask, streamed as an iterator (no temporary `Vec<bool>`), into
/// end-exclusive [`GenomeRange`]s covering each run of consecutive `true` values,
/// offset by `start0` on the given contig.
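///
/// For example, with `start0 = 100`, the mask `[true, true, false, true]` yields the
/// end-exclusive ranges `100..102` and `103..104`:
///
/// ```ignore
/// let ranges = ranges_from_consecutive_true_iter([true, true, false, true], 100, "chr1");
/// assert_eq!(ranges[0].range, 100..102);
/// assert_eq!(ranges[1].range, 103..104);
/// ```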
pub fn ranges_from_consecutive_true_iter<I>(mask: I, start0: u32, contig: &str) -> Vec<GenomeRange>
where
    I: IntoIterator<Item = bool>,
{
    let contig = contig_to_num(contig);
    let mut ranges = Vec::new();
    let mut current_start: Option<u32> = None;
    let mut i: u32 = 0;
    for v in mask {
        if v {
            if current_start.is_none() {
                current_start = Some(start0 + i);
            }
        } else if let Some(s) = current_start.take() {
            ranges.push(GenomeRange {
                contig,
                range: s..(start0 + i),
            });
        }
        i += 1;
    }
    if let Some(s) = current_start {
        ranges.push(GenomeRange {
            contig,
            range: s..(start0 + i),
        });
    }
    ranges
}
/// Merge overlapping or touching ranges per contig (end-exclusive).
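///
/// A minimal sketch (assuming `contig_to_num` maps contig names as used elsewhere in
/// this module): `100..200`, `150..250`, and `250..300` on the same contig collapse
/// into a single `100..300` range, since `250..300` touches the merged end.
///
/// ```ignore
/// let c1 = contig_to_num("chr1");
/// let ranges = vec![
///     GenomeRange { contig: c1, range: 100..200 },
///     GenomeRange { contig: c1, range: 150..250 },
///     GenomeRange { contig: c1, range: 250..300 },
/// ];
/// let merged = merge_adjacent_ranges(ranges);
/// assert_eq!(merged.len(), 1);
/// assert_eq!(merged[0].range, 100..300);
/// ```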
pub fn merge_adjacent_ranges(mut ranges: Vec<GenomeRange>) -> Vec<GenomeRange> {
    if ranges.is_empty() {
        return ranges;
    }
    ranges.sort_by(|a, b| (a.contig, a.range.start).cmp(&(b.contig, b.range.start)));
    let mut merged = Vec::with_capacity(ranges.len());
    let mut cur = ranges[0].clone();
    for r in ranges.into_iter().skip(1) {
        if r.contig == cur.contig && r.range.start <= cur.range.end {
            if r.range.end > cur.range.end {
                cur.range.end = r.range.end;
            }
        } else {
            merged.push(cur);
            cur = r;
        }
    }
    merged.push(cur);
    merged
}
/// A region-level data structure for modeling mutation rates using GLMs or other statistical models.
///
/// Each `GlmRow` represents a genomic interval (e.g., from a BED file) and contains:
/// - The region's coordinates (`contig`, `start`, `end`)
/// - Its length in base pairs
/// - The number of somatic mutations overlapping the region
/// - A list of binary feature labels (e.g., "Early", "HighGC")
///
/// This structure is designed to be serializable (e.g., to CSV) for downstream statistical modeling.
///
/// # Example
/// ```ignore
/// GlmRow {
///     contig: "chr1".to_string(),
///     start: 100_000,
///     end: 101_000,
///     length: 1000,
///     mutation_count: 3,
///     features: vec!["Early".into(), "HighGC".into()],
/// }
/// ```
#[derive(Debug, Serialize)]
pub struct GlmRow {
    /// Chromosome name (e.g., "chr1")
    pub contig: String,
    /// Start coordinate (0-based, inclusive)
    pub start: u32,
    /// End coordinate (0-based, exclusive)
    pub end: u32,
    /// Length of the region in base pairs (end - start)
    pub length: usize,
    /// Number of variants overlapping this region
    pub mutation_count: usize,
    /// Binary feature labels associated with this region
    pub features: Vec<String>,
}
/// Builds a GLM-ready table from a set of fixed genomic regions and a list of annotated variants,
/// in parallel, using efficient binary search-based double-pointer traversal.
///
/// For each region, this function:
/// - Counts how many variants fall within the region
/// - Extracts the relevant annotations (features) from overlapping variants
/// - Produces one `GlmRow` with coordinates, mutation count, and feature labels
///
/// Additionally, it computes the **mutation rate per megabase** for each tracked annotation
/// by aggregating counts and total base coverage across regions where each feature is present.
///
/// # Assumptions
/// - `variants.data` must be sorted by `position.contig` then `position.position`
/// - `regions` must be sorted by `contig` then `range.start`
///
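/// If the inputs are not already ordered, a sort along those keys is enough. A minimal
/// sketch, assuming the caller owns mutable collections and that the contig and position
/// fields are plain `Ord` values as used elsewhere in this module:
///
/// ```ignore
/// variants.data.sort_by_key(|v| (v.position.contig, v.position.position));
/// regions.sort_by_key(|r| (r.contig, r.range.start));
/// ```
///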
/// # Arguments
/// * `variants` - A list of annotated somatic variants (sorted).
/// * `regions` - A set of genomic bins or intervals (e.g., from BED) used as modeling units (sorted).
/// * `tracked_annotations` - A list of `Annotation` values to track as binary features.
/// * `min_callers` - Minimum number of callers that must support a variant for it to be counted.
///
/// # Returns
/// A tuple:
/// * `Vec<GlmRow>` — one per region, with mutation count and feature labels.
/// * `Vec<(String, f64)>` — per-feature mutation rate (mutations per megabase).
///
/// # Example
/// ```ignore
/// let (rows, rates) = make_glm_rows_from_regions_par(
///     &variants,
///     &genome_bins,
///     &[
///         Annotation::ReplicationTiming(ReplicationClass::Early),
///         Annotation::HighDepth,
///     ],
///     2, // min_callers
/// );
///
/// for (feature, rate) in rates {
///     println!("{feature}: {:.2} mutations/Mb", rate);
/// }
/// ```
pub fn make_glm_rows_from_regions_par(
    variants: &Variants,
    regions: &[GenomeRange],
    tracked_annotations: &[Annotation],
    min_callers: u8,
) -> (Vec<GlmRow>, Vec<(String, f64)>) {
    let (glm_rows, feature_stats): (Vec<GlmRow>, BTreeMap<String, (usize, usize)>) = regions
        .par_iter()
        .map(|region| {
            let mut mutation_count = 0;
            let mut feature_set = BTreeSet::new();
            // Binary search into sorted variant list
            let start_idx = variants.data.partition_point(|v| {
                v.position.contig < region.contig
                    || (v.position.contig == region.contig
                        && v.position.position < region.range.start)
            });
            for var in &variants.data[start_idx..] {
                let pos = &var.position;
                if pos.contig != region.contig || pos.position >= region.range.end {
                    break;
                }
                if region.range.contains(&pos.position) && var.n_callers() >= min_callers {
                    mutation_count += 1;
                    for ann in &var.annotations {
                        if tracked_annotations.contains(ann) {
                            feature_set.insert(ann.to_string());
                        }
                    }
                }
            }
            let region_len = region.length() as usize;
            let row = GlmRow {
                contig: region.contig.to_string(),
                start: region.range.start,
                end: region.range.end,
                length: region_len,
                mutation_count,
                features: feature_set.clone().into_iter().collect(),
            };
            // Local (per-thread) accumulation of stats
            let mut stats = BTreeMap::new();
            for feat in &feature_set {
                stats.insert(feat.clone(), (mutation_count, region_len));
            }
            (vec![row], stats)
        })
        .reduce(
            || (Vec::new(), BTreeMap::new()),
            |(mut rows_a, mut stats_a), (mut rows_b, stats_b)| {
                rows_a.append(&mut rows_b);
                for (feat, (n, bp)) in stats_b {
                    stats_a
                        .entry(feat)
                        .and_modify(|(na, bpa)| {
                            *na += n;
                            *bpa += bp;
                        })
                        .or_insert((n, bp));
                }
                (rows_a, stats_a)
            },
        );
    // Compute mutation rates per megabase (1e6 bases) from the aggregated per-feature
    // counts and base coverage.
    let rates_per_mb: Vec<(String, f64)> = feature_stats
        .into_iter()
        .map(|(feat, (n, bp))| {
            debug!("{bp}");
            let rate = if bp > 0 {
                (n as f64) / ((bp as f64) / 1e6)
            } else {
                0.0
            };
            (feat, rate)
        })
        .collect();
    (glm_rows, rates_per_mb)
}
/// Flattens `GlmRow`s into the sorted list of distinct feature names and a flat table
/// with one-hot (0/1) encoded feature columns, sorted by contig and start.
fn flatten_glm_rows(
    rows: &[GlmRow],
) -> (Vec<String>, Vec<std::collections::HashMap<String, String>>) {
    let mut all_features: BTreeSet<String> = BTreeSet::new();
    for r in rows {
        all_features.extend(r.features.iter().cloned());
    }
    let feature_list: Vec<_> = all_features.into_iter().collect();
    let mut table: Vec<_> = rows
        .iter()
        .map(|r| {
            let mut row = std::collections::HashMap::new();
            row.insert("contig".into(), r.contig.clone());
            row.insert("start".into(), r.start.to_string());
            row.insert("end".into(), r.end.to_string());
            row.insert("length".into(), r.length.to_string());
            row.insert("mutation_count".into(), r.mutation_count.to_string());
            row.insert("log_length".into(), (r.length as f64).ln().to_string());
            for f in &feature_list {
                row.insert(
                    f.clone(),
                    if r.features.contains(f) { "1" } else { "0" }.into(),
                );
            }
            row
        })
        .collect();
    // Parallel sort by contig and start
    table.par_sort_by(|a, b| {
        let ca = a.get("contig").unwrap();
        let cb = b.get("contig").unwrap();
        let sa = a.get("start").unwrap().parse::<u32>().unwrap();
        let sb = b.get("start").unwrap().parse::<u32>().unwrap();
        (ca, sa).cmp(&(cb, sb))
    });
    (feature_list, table)
}
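
/// Writes `GlmRow`s to a CSV file: the fixed columns below, followed by one binary
/// (0/1) column per distinct feature label. For two hypothetical features `Early`
/// and `HighDepth` the header would look like:
///
/// ```ignore
/// contig,start,end,length,log_length,mutation_count,Early,HighDepth
/// ```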
pub fn write_glm_rows(all_rows: &[GlmRow], csv_path: &str) -> anyhow::Result<()> {
    let (features, flat_rows) = flatten_glm_rows(all_rows);
    let mut writer = csv::Writer::from_path(csv_path)?;
    let mut headers = vec![
        "contig",
        "start",
        "end",
        "length",
        "log_length",
        "mutation_count",
    ];
    headers.extend(features.iter().map(|s| s.as_str()));
    writer.write_record(&headers)?;
    for row in flat_rows {
        let values: Vec<_> = headers.iter().map(|&h| row.get(h).unwrap()).collect();
        writer.write_record(values)?;
    }
    writer.flush()?;
    Ok(())
}
#[cfg(test)]
mod tests {
    use log::info;

    use super::*;
    use crate::{helpers::test_init, positions::GenomeRangeExt};

    #[test]
    fn high_depths() -> anyhow::Result<()> {
        test_init();
        let config = Config::default();
        let id = "DUMCO";
        let (mut high_depth_ranges, mut low_quality_ranges) =
            somatic_depth_quality_ranges(id, &config)?;
        high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
        low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
        // let high_depth_ranges = merge_adjacent_ranges(high_depth_ranges);
        // let low_quality_ranges = merge_adjacent_ranges(low_quality_ranges);
        info!(
            "High-depth ranges: n={} bp={}\nLowQ ranges: n={} bp={}",
            high_depth_ranges.len(),
            high_depth_ranges.total_len(),
            low_quality_ranges.len(),
            low_quality_ranges.total_len(),
        );
        high_depth_ranges.iter().take(10).for_each(|e| println!("{e:?}"));
        low_quality_ranges.iter().take(10).for_each(|e| println!("{e:?}"));
        Ok(())
    }
}