|
|
@@ -1,5 +1,5 @@
|
|
|
use std::{
|
|
|
- collections::{HashMap, HashSet},
|
|
|
+ collections::{BTreeMap, HashMap, HashSet},
|
|
|
fs::{self, File},
|
|
|
io::Write,
|
|
|
path::Path,
|
|
|
@@ -10,6 +10,7 @@ use bgzip::{BGZFReader, BGZFWriter};
|
|
|
use csv::ReaderBuilder;
|
|
|
use dashmap::DashMap;
|
|
|
use log::{debug, error, info, warn};
|
|
|
+use ordered_float::OrderedFloat;
|
|
|
use rayon::prelude::*;
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
use uuid::Uuid;
|
|
|
@@ -27,7 +28,7 @@ use crate::{
|
|
|
bam::{counts_at, counts_ins_at},
|
|
|
vcf::Vcf,
|
|
|
},
|
|
|
- helpers::{app_storage_dir, estimate_shannon_entropy, mean, temp_file_path, Hash128},
|
|
|
+ helpers::{app_storage_dir, bin_data, estimate_shannon_entropy, mean, temp_file_path, Hash128},
|
|
|
io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header},
|
|
|
positions::GenomePosition,
|
|
|
};
|
|
|
@@ -742,19 +743,83 @@ impl Variants {
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
pub struct VariantsStats {
|
|
|
pub n: u32,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
pub alteration_categories: DashMap<String, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
pub cosmic: DashMap<u64, u32>,
|
|
|
- pub gnomad: DashMap<String, Vec<f64>>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub n_alts: DashMap<u32, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub depths: DashMap<u32, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub vafs: DashMap<OrderedFloat<f32>, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub vep_impact: DashMap<String, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub consequences: DashMap<String, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub genes: DashMap<String, u32>,
|
|
|
+ pub n_gnomad: usize,
|
|
|
+ pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
|
|
|
+}
|
|
|
+
|
|
|
+use serde::Serializer;
|
|
|
+
|
|
|
+pub fn serialize_dashmap_sort<S, T>(
|
|
|
+ data: &DashMap<T, u32>,
|
|
|
+ serializer: S,
|
|
|
+) -> Result<S::Ok, S::Error>
|
|
|
+where
|
|
|
+ S: Serializer,
|
|
|
+ T: Serialize + Ord + std::hash::Hash + Clone,
|
|
|
+{
|
|
|
+ let ordered: BTreeMap<_, _> = data
|
|
|
+ .iter()
|
|
|
+ .map(|entry| (entry.key().clone(), *entry.value()))
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ ordered.serialize(serializer)
|
|
|
}
|
|
|
|
|
|
impl VariantsStats {
|
|
|
pub fn new(variants: &Variants) -> Self {
|
|
|
let n = variants.data.len() as u32;
|
|
|
let alteration_categories: DashMap<String, u32> = DashMap::new();
|
|
|
+ let vep_impact: DashMap<String, u32> = DashMap::new();
|
|
|
+ let genes: DashMap<String, u32> = DashMap::new();
|
|
|
+ let consequences: DashMap<String, u32> = DashMap::new();
|
|
|
+ let n_alts: DashMap<u32, u32> = DashMap::new();
|
|
|
+ let depths: DashMap<u32, u32> = DashMap::new();
|
|
|
+ let vafs: DashMap<OrderedFloat<f32>, u32> = DashMap::new();
|
|
|
let cosmic: DashMap<u64, u32> = DashMap::new();
|
|
|
- let gnomad: DashMap<String, Vec<f64>> = DashMap::new();
|
|
|
+ let gnomads: DashMap<String, Vec<f64>> = DashMap::new();
|
|
|
|
|
|
variants.data.par_iter().for_each(|v| {
|
|
|
+ if let Ok(best_vep) = v.best_vep() {
|
|
|
+ if let Some(impact) = best_vep.extra.impact {
|
|
|
+ *vep_impact.entry(impact.to_string()).or_default() += 1;
|
|
|
+ }
|
|
|
+ if let Some(gene) = best_vep.extra.symbol {
|
|
|
+ *genes.entry(gene).or_default() += 1;
|
|
|
+ }
|
|
|
+ if let Some(csqs) = best_vep.consequence {
|
|
|
+ let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
|
|
|
+ csq.sort();
|
|
|
+ csq.dedup();
|
|
|
+ *consequences.entry(csq.join(", ")).or_default() += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let (n_alt, depth) = v.n_alt_depth();
|
|
|
+ *n_alts.entry(n_alt as u32).or_default() += 1;
|
|
|
+ *depths.entry(depth as u32).or_default() += 1;
|
|
|
+
|
|
|
+
|
|
|
+ let vaf = OrderedFloat::from(((n_alt * 1_000.0 / depth).round() / 10.0) as f32);
|
|
|
+ if !vaf.is_nan() {
|
|
|
+ *vafs.entry(vaf).or_default() += 1;
|
|
|
+ }
|
|
|
+
|
|
|
v.annotations.iter().for_each(|annotation| {
|
|
|
match annotation {
|
|
|
Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
|
|
|
@@ -763,7 +828,7 @@ impl VariantsStats {
|
|
|
.iter()
|
|
|
.map(|e| e.to_string_value_pair())
|
|
|
.for_each(|(key, value)| {
|
|
|
- gnomad.entry(key).or_default().push(value);
|
|
|
+ gnomads.entry(key).or_default().push(value);
|
|
|
});
|
|
|
}
|
|
|
_ => (),
|
|
|
@@ -781,30 +846,40 @@ impl VariantsStats {
|
|
|
.entry(alteration_category_str.join(", "))
|
|
|
.or_default() += 1;
|
|
|
});
|
|
|
-
|
|
|
- gnomad.iter().for_each(|e| {
|
|
|
- println!("{}\t{}", e.key(), mean(e.value()));
|
|
|
- });
|
|
|
+
|
|
|
+ let mut n_gnomad = 0;
|
|
|
+ let gnomad = gnomads.iter().map(|e| {
|
|
|
+ let data = e.value().to_vec();
|
|
|
+ n_gnomad = data.len();
|
|
|
+ (e.key().to_string(), bin_data(data, 0.0001))
|
|
|
+ }).collect();
|
|
|
|
|
|
Self {
|
|
|
n,
|
|
|
alteration_categories,
|
|
|
cosmic,
|
|
|
gnomad,
|
|
|
+ vafs,
|
|
|
+ n_alts,
|
|
|
+ depths,
|
|
|
+ vep_impact,
|
|
|
+ consequences,
|
|
|
+ genes,
|
|
|
+ n_gnomad,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
|
|
|
let file = File::create(filename)
|
|
|
- .with_context(|| format!("Failed to create file: {}", filename))?;
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to create file: {}\n{e}", filename))?;
|
|
|
let mut writer = BGZFWriter::new(file, bgzip::Compression::default());
|
|
|
|
|
|
serde_json::to_writer(&mut writer, self)
|
|
|
- .with_context(|| format!("Failed to serialize JSON to file: {}", filename))?;
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to serialize JSON to file: {}\n{e}", filename))?;
|
|
|
|
|
|
writer
|
|
|
.close()
|
|
|
- .with_context(|| format!("Failed to close BGZF writer for file: {}", filename))?;
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to close BGZF writer for file: {}\n{e}", filename))?;
|
|
|
|
|
|
debug!("Successfully saved variants to {}", filename);
|
|
|
Ok(())
|
|
|
@@ -1139,8 +1214,6 @@ impl ExternalAnnotation {
|
|
|
let (sv, unfound): (Vec<VcfVariant>, Vec<VcfVariant>) =
|
|
|
unfound.into_iter().partition(|v| v.has_svtype());
|
|
|
|
|
|
- warn!("SV {}", sv.len());
|
|
|
-
|
|
|
let min_chunk_size = 1000;
|
|
|
let max_chunks = 150;
|
|
|
|