|
|
@@ -0,0 +1,306 @@
|
|
|
+use std::{collections::{BTreeMap, HashMap}, io::BufRead};
|
|
|
+
|
|
|
+use anyhow::Context;
|
|
|
+use dashmap::DashMap;
|
|
|
+use log::debug;
|
|
|
+use ordered_float::OrderedFloat;
|
|
|
+use rayon::prelude::*;
|
|
|
+use serde::{Deserialize, Serialize, Serializer};
|
|
|
+
|
|
|
+use crate::{
|
|
|
+ annotation::{vep::VepImpact, Annotation},
|
|
|
+ config::Config,
|
|
|
+ helpers::bin_data,
|
|
|
+ io::{dict::read_dict, readers::get_gz_reader, writers::get_gz_writer},
|
|
|
+ positions::{contig_to_num, par_overlaps, GenomeRange},
|
|
|
+ scan::scan::BinCount,
|
|
|
+};
|
|
|
+
|
|
|
+use super::variant_collection::{Variant, Variants};
|
|
|
+
|
|
|
+#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
+pub struct VariantsStats {
|
|
|
+ pub n: u32,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub alteration_categories: DashMap<String, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub cosmic: DashMap<u64, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub n_alts: DashMap<u32, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub depths: DashMap<u32, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub vafs: DashMap<OrderedFloat<f32>, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub vep_impact: DashMap<String, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub consequences: DashMap<String, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub genes: DashMap<String, u32>,
|
|
|
+ pub n_gnomad: usize,
|
|
|
+ pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
|
|
|
+}
|
|
|
+pub fn serialize_dashmap_sort<S, T>(
|
|
|
+ data: &DashMap<T, u32>,
|
|
|
+ serializer: S,
|
|
|
+) -> Result<S::Ok, S::Error>
|
|
|
+where
|
|
|
+ S: Serializer,
|
|
|
+ T: Serialize + Ord + std::hash::Hash + Clone,
|
|
|
+{
|
|
|
+ let ordered: BTreeMap<_, _> = data
|
|
|
+ .iter()
|
|
|
+ .map(|entry| (entry.key().clone(), *entry.value()))
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ ordered.serialize(serializer)
|
|
|
+}
|
|
|
+
|
|
|
+impl VariantsStats {
|
|
|
+ pub fn new(variants: &Variants) -> Self {
|
|
|
+ let n = variants.data.len() as u32;
|
|
|
+ let alteration_categories: DashMap<String, u32> = DashMap::new();
|
|
|
+ let vep_impact: DashMap<String, u32> = DashMap::new();
|
|
|
+ let genes: DashMap<String, u32> = DashMap::new();
|
|
|
+ let consequences: DashMap<String, u32> = DashMap::new();
|
|
|
+ let n_alts: DashMap<u32, u32> = DashMap::new();
|
|
|
+ let depths: DashMap<u32, u32> = DashMap::new();
|
|
|
+ let vafs: DashMap<OrderedFloat<f32>, u32> = DashMap::new();
|
|
|
+ let cosmic: DashMap<u64, u32> = DashMap::new();
|
|
|
+ let gnomads: DashMap<String, Vec<f64>> = DashMap::new();
|
|
|
+
|
|
|
+ variants.data.par_iter().for_each(|v| {
|
|
|
+ if let Ok(best_vep) = v.best_vep() {
|
|
|
+ if let Some(impact) = best_vep.extra.impact {
|
|
|
+ *vep_impact.entry(impact.to_string()).or_default() += 1;
|
|
|
+ }
|
|
|
+ if let Some(gene) = best_vep.extra.symbol {
|
|
|
+ *genes.entry(gene).or_default() += 1;
|
|
|
+ }
|
|
|
+ if let Some(csqs) = best_vep.consequence {
|
|
|
+ let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
|
|
|
+ csq.sort();
|
|
|
+ csq.dedup();
|
|
|
+ *consequences.entry(csq.join(", ")).or_default() += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let (n_alt, depth) = v.n_alt_depth();
|
|
|
+ *n_alts.entry(n_alt as u32).or_default() += 1;
|
|
|
+ *depths.entry(depth as u32).or_default() += 1;
|
|
|
+
|
|
|
+ let vaf = OrderedFloat::from(((n_alt * 1_000.0 / depth).round() / 10.0) as f32);
|
|
|
+ if !vaf.is_nan() {
|
|
|
+ *vafs.entry(vaf).or_default() += 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ v.annotations.iter().for_each(|annotation| {
|
|
|
+ match annotation {
|
|
|
+ Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
|
|
|
+ Annotation::GnomAD(v) => {
|
|
|
+ v.to_vec()
|
|
|
+ .iter()
|
|
|
+ .map(|e| e.to_string_value_pair())
|
|
|
+ .for_each(|(key, value)| {
|
|
|
+ gnomads.entry(key).or_default().push(value);
|
|
|
+ });
|
|
|
+ }
|
|
|
+ _ => (),
|
|
|
+ };
|
|
|
+ });
|
|
|
+ let mut alteration_category_str = v
|
|
|
+ .alteration_category()
|
|
|
+ .iter()
|
|
|
+ .map(|c| c.to_string())
|
|
|
+ .collect::<Vec<String>>();
|
|
|
+ alteration_category_str.sort();
|
|
|
+ alteration_category_str.dedup();
|
|
|
+
|
|
|
+ *alteration_categories
|
|
|
+ .entry(alteration_category_str.join(", "))
|
|
|
+ .or_default() += 1;
|
|
|
+ });
|
|
|
+
|
|
|
+ let mut n_gnomad = 0;
|
|
|
+ let gnomad = gnomads
|
|
|
+ .iter()
|
|
|
+ .map(|e| {
|
|
|
+ let data = e.value().to_vec();
|
|
|
+ n_gnomad = data.len();
|
|
|
+ (e.key().to_string(), bin_data(data, 0.0001))
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ Self {
|
|
|
+ n,
|
|
|
+ alteration_categories,
|
|
|
+ cosmic,
|
|
|
+ gnomad,
|
|
|
+ vafs,
|
|
|
+ n_alts,
|
|
|
+ depths,
|
|
|
+ vep_impact,
|
|
|
+ consequences,
|
|
|
+ genes,
|
|
|
+ n_gnomad,
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
|
|
|
+ let mut writer = get_gz_writer(filename, true)
|
|
|
+ .with_context(|| anyhow::anyhow!("Failed to create file: {}", filename))?;
|
|
|
+
|
|
|
+ serde_json::to_writer(&mut writer, self)
|
|
|
+ .with_context(|| anyhow::anyhow!("Failed to serialize JSON to file: {}", filename))?;
|
|
|
+
|
|
|
+ writer.close().with_context(|| {
|
|
|
+ anyhow::anyhow!("Failed to close BGZF writer for file: {}", filename)
|
|
|
+ })?;
|
|
|
+
|
|
|
+ debug!("Successfully saved variants to {}", filename);
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+ pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
|
|
|
+ let mut reader = get_gz_reader(filename)
|
|
|
+ .with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;
|
|
|
+
|
|
|
+ let variants: Self = serde_json::from_reader(&mut reader)
|
|
|
+ .with_context(|| format!("Failed to deserialize JSON from file: {}", filename))?;
|
|
|
+
|
|
|
+ debug!("Successfully loaded variants from {}", filename);
|
|
|
+ Ok(variants)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
/// Somatic variant counts and length-normalised rates, as produced by
/// `somatic_rates`.
#[derive(Debug)]
pub struct SomaticVariantRates {
    /// Sum of the contig lengths read from the sequence dictionary.
    pub wgs_length: u32,
    /// Total number of variants given as input.
    pub total_variants: usize,
    /// Variant count normalised by `wgs_length` (scaling defined in `somatic_rates`).
    pub somatic_mutation_rate_wgs: f64,
    /// Number of feature ranges given as input.
    pub exon_count: usize,
    /// Number of variants reported as overlapping the feature ranges.
    pub variants_in_coding: usize,
    /// Number of overlapping variants whose best VEP impact compares
    /// `<= VepImpact::MODERATE`.
    pub coding_variants: usize,
    /// Sum of the lengths of the feature ranges.
    pub total_exon_bases: u32,
    /// `variants_in_coding` normalised by `total_exon_bases`.
    pub somatic_mutation_rate_coding: f64,
    /// `coding_variants` normalised by `total_exon_bases`.
    pub somatic_nonsynonymous_rate_coding: f64,
}
|
|
|
+
|
|
|
+pub fn somatic_rates(
|
|
|
+ variants: &[Variant],
|
|
|
+ feature_ranges: &Vec<GenomeRange>,
|
|
|
+ config: &Config,
|
|
|
+) -> anyhow::Result<SomaticVariantRates> {
|
|
|
+ let ol = par_overlaps(variants, feature_ranges);
|
|
|
+
|
|
|
+ let n_coding = ol
|
|
|
+ .iter()
|
|
|
+ .filter_map(|i| variants[*i].best_vep().ok())
|
|
|
+ .filter_map(|bv| bv.impact())
|
|
|
+ .filter(|impact| *impact <= VepImpact::MODERATE)
|
|
|
+ .count();
|
|
|
+
|
|
|
+ let n_bases_m: u32 = feature_ranges.par_iter().map(|gr| gr.length()).sum();
|
|
|
+ let mega_base_m = n_bases_m as f64 / 10.0e6;
|
|
|
+
|
|
|
+ let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
|
|
|
+ let rate_wgs = variants.len() as f64 / (wgs_len as f64 / 10.0e6);
|
|
|
+
|
|
|
+ let n_exons_mb = ol.len() as f64 / mega_base_m;
|
|
|
+ let n_coding_mb = n_coding as f64 / mega_base_m;
|
|
|
+
|
|
|
+ Ok(SomaticVariantRates {
|
|
|
+ total_variants: variants.len(),
|
|
|
+ exon_count: feature_ranges.len(),
|
|
|
+ variants_in_coding: ol.len(),
|
|
|
+ coding_variants: n_coding,
|
|
|
+ total_exon_bases: n_bases_m,
|
|
|
+ wgs_length: wgs_len,
|
|
|
+ somatic_mutation_rate_wgs: rate_wgs,
|
|
|
+ somatic_mutation_rate_coding: n_exons_mb,
|
|
|
+ somatic_nonsynonymous_rate_coding: n_coding_mb,
|
|
|
+ })
|
|
|
+}
|
|
|
+
|
|
|
+pub fn high_depth_somatic(id: &str, config: &Config) -> anyhow::Result<DashMap<String, Vec<GenomeRange>>> {
|
|
|
+ let mut contigs: Vec<String> = (1..22).map(|i| format!("chr{i}")).collect();
|
|
|
+ contigs.extend(["chrX", "chrY", "chrM"].into_iter().map(String::from));
|
|
|
+
|
|
|
+ let results: DashMap<String, Vec<GenomeRange>> = DashMap::new();
|
|
|
+ contigs.into_par_iter().for_each(|contig| {
|
|
|
+ let mrd_path = format!("{}/{contig}_count.tsv.gz", config.normal_dir_count(id));
|
|
|
+ let mrd_reader =
|
|
|
+ get_gz_reader(&mrd_path).with_context(|| format!("Failed to open: {mrd_path}"));
|
|
|
+ let diag_path = format!("{}/{contig}_count.tsv.gz", config.tumoral_dir_count(id));
|
|
|
+ let diag_reader =
|
|
|
+ get_gz_reader(&diag_path).with_context(|| format!("Failed to open: {diag_path}"));
|
|
|
+
|
|
|
+ if let (Ok(mrd_reader), Ok(diag_reader)) = (mrd_reader, diag_reader) {
|
|
|
+ let ranges: Vec<GenomeRange> = mrd_reader
|
|
|
+ .lines()
|
|
|
+ .zip(diag_reader.lines())
|
|
|
+ .filter_map(|(mrd, diag)| {
|
|
|
+ if let (Ok(mrd), Ok(diag)) = (mrd, diag) {
|
|
|
+ if let (Ok(mrd), Ok(diag)) =
|
|
|
+ (BinCount::from_tsv_row(&mrd), BinCount::from_tsv_row(&diag))
|
|
|
+ {
|
|
|
+ assert_eq!(mrd.contig, diag.contig);
|
|
|
+ assert_eq!(mrd.start, diag.start);
|
|
|
+ let bools: Vec<bool> = mrd
|
|
|
+ .depths
|
|
|
+ .into_iter()
|
|
|
+ .zip(diag.depths)
|
|
|
+ .map(|(depth_mrd, depth_diag)| {
|
|
|
+ depth_mrd >= config.min_high_quality_depth
|
|
|
+ && depth_diag >= config.min_high_quality_depth
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+ let ranges = ranges_from_consecutive_true(&bools, mrd.start, &mrd.contig);
|
|
|
+ return Some(ranges);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ None
|
|
|
+ })
|
|
|
+ .flatten()
|
|
|
+ .collect();
|
|
|
+ results.insert(contig, ranges);
|
|
|
+ }
|
|
|
+ });
|
|
|
+
|
|
|
+ Ok(results)
|
|
|
+}
|
|
|
+
|
|
|
+fn ranges_from_consecutive_true(vec: &[bool], start: u32, contig: &str) -> Vec<GenomeRange> {
|
|
|
+ let contig = contig_to_num(contig);
|
|
|
+ let mut ranges = Vec::new();
|
|
|
+ let mut current_start: Option<u32> = None;
|
|
|
+
|
|
|
+ // Iterate through elements starting from specified position
|
|
|
+ for (i, &value) in vec.iter().enumerate() {
|
|
|
+ let i = i as u32 + start;
|
|
|
+ match (value, current_start) {
|
|
|
+ // Begin new range
|
|
|
+ (true, None) => current_start = Some(i),
|
|
|
+ // Finalize current range
|
|
|
+ (false, Some(start_idx)) => {
|
|
|
+ // ranges.push(start_idx..i);
|
|
|
+ ranges.push(GenomeRange {
|
|
|
+ contig,
|
|
|
+ range: start_idx..i,
|
|
|
+ });
|
|
|
+ current_start = None;
|
|
|
+ }
|
|
|
+ _ => {}
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Add any remaining active range
|
|
|
+ if let Some(start_idx) = current_start {
|
|
|
+ // ranges.push(start_idx..vec.len() as u32);
|
|
|
+ ranges.push(GenomeRange {
|
|
|
+ contig,
|
|
|
+ range: start_idx..vec.len() as u32 + start,
|
|
|
+ });
|
|
|
+ }
|
|
|
+
|
|
|
+ ranges
|
|
|
+}
|