|
|
@@ -10,7 +10,8 @@ use std::{
|
|
|
collections::{HashMap, HashSet},
|
|
|
fmt,
|
|
|
fs::File,
|
|
|
- io::{Read, Write},
|
|
|
+ io::{BufWriter, Read, Write},
|
|
|
+ path::Path,
|
|
|
str::FromStr,
|
|
|
sync::{
|
|
|
atomic::{AtomicU32, Ordering},
|
|
|
@@ -19,7 +20,7 @@ use std::{
|
|
|
};
|
|
|
|
|
|
use crate::{
|
|
|
- helpers::{mean, Blake3BuildHasher, Hash128},
|
|
|
+ helpers::{format_count, mean, Blake3BuildHasher, Hash128},
|
|
|
variant::{variant_collection::VariantCollection, vcf_variant::AlterationCategory},
|
|
|
};
|
|
|
use bitcode::{Decode, Encode};
|
|
|
@@ -93,6 +94,15 @@ pub enum Annotation {
|
|
|
|
|
|
/// RepeatMasker
|
|
|
Repeat,
|
|
|
+
|
|
|
+ /// NanomonSV Target Site Duplication
|
|
|
+ TSD(String),
|
|
|
+
|
|
|
+ /// NanomonSV insertion type
|
|
|
+ InsertionType(String),
|
|
|
+
|
|
|
+ /// query_start,query_end,class,class...
|
|
|
+ RepeatMasker(String),
|
|
|
}
|
|
|
|
|
|
/// Denotes the biological sample type associated with a variant call.
|
|
|
@@ -231,6 +241,9 @@ impl fmt::Display for Annotation {
|
|
|
HighDepth => "HighDepth".into(),
|
|
|
Panel(name) => format!("Panel_{name}"),
|
|
|
LowMAPQ => "LowMAPQ".to_string(),
|
|
|
+ TSD(_) => "TSD".to_string(),
|
|
|
+ InsertionType(v) => format!("InsertionType_{v}"),
|
|
|
+ RepeatMasker(_) => "RepeatMasker".to_string(),
|
|
|
};
|
|
|
write!(f, "{}", s)
|
|
|
}
|
|
|
@@ -326,15 +339,95 @@ pub struct AnnotationsStats {
|
|
|
pub numeric: DashMap<String, HashMap<String, Vec<f64>>>,
|
|
|
}
|
|
|
|
|
|
+impl fmt::Display for AnnotationsStats {
|
|
|
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
+ let total: u64 = self.categorical.iter().map(|e| *e.value()).sum();
|
|
|
+ let max_key_len = self
|
|
|
+ .categorical
|
|
|
+ .iter()
|
|
|
+ .map(|e| e.key().len())
|
|
|
+ .max()
|
|
|
+ .unwrap_or(0);
|
|
|
+
|
|
|
+ writeln!(f, "\nCallers stats:")?;
|
|
|
+ writeln!(f, " n categories: {}\n", self.categorical.len())?;
|
|
|
+
|
|
|
+ let mut lines: Vec<String> = self
|
|
|
+ .categorical
|
|
|
+ .iter()
|
|
|
+ .map(|e| {
|
|
|
+ let k = e.key();
|
|
|
+ let v = *e.value();
|
|
|
+ let pct = 100.0 * v as f64 / total as f64;
|
|
|
+ let mut num_str = Vec::new();
|
|
|
+ if let Some(nums) = self.numeric.get(k) {
|
|
|
+ num_str.extend(
|
|
|
+ nums.iter()
|
|
|
+ .map(|(k_n, v_n)| format!("{k_n}: {:.2}", mean(v_n))),
|
|
|
+ );
|
|
|
+ }
|
|
|
+ num_str.sort();
|
|
|
+ let num_part = if num_str.is_empty() {
|
|
|
+ String::new()
|
|
|
+ } else {
|
|
|
+ format!(" [{}]", num_str.join(", "))
|
|
|
+ };
|
|
|
+ format!(
|
|
|
+ " {:<width$} {:>10} {:>5.1}%{num_part}",
|
|
|
+ k,
|
|
|
+ format_count(v),
|
|
|
+ pct,
|
|
|
+ width = max_key_len
|
|
|
+ )
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ lines.sort();
|
|
|
+ writeln!(f, "{}", lines.join("\n"))?;
|
|
|
+ writeln!(f, " {:-<width$}", "", width = max_key_len + 22)?;
|
|
|
+ writeln!(
|
|
|
+ f,
|
|
|
+ " {:<width$} {:>10} 100.0%",
|
|
|
+ "Total",
|
|
|
+ format_count(total),
|
|
|
+ width = max_key_len
|
|
|
+ )
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+pub enum AnnotationsStatCat {
|
|
|
+ Categorical(String),
|
|
|
+ Numerical(String, f64),
|
|
|
+}
|
|
|
+
|
|
|
+impl Annotation {
|
|
|
+ pub fn stat(&self) -> AnnotationsStatCat {
|
|
|
+ use Annotation::*;
|
|
|
+ match self {
|
|
|
+ ShannonEntropy(v) => AnnotationsStatCat::Numerical(self.to_string(), *v),
|
|
|
+ ConstitDepth(v) => AnnotationsStatCat::Numerical(self.to_string(), *v as f64),
|
|
|
+ ConstitAlt(v) => AnnotationsStatCat::Numerical(self.to_string(), *v as f64),
|
|
|
+ Cosmic(c) => AnnotationsStatCat::Numerical(self.to_string(), c.cosmic_cnt as f64),
|
|
|
+ Callers(caller, sample) => {
|
|
|
+ AnnotationsStatCat::Categorical(format!("{caller} {sample}"))
|
|
|
+ }
|
|
|
+ AlterationCategory(alt_cat) => AnnotationsStatCat::Categorical(alt_cat.to_string()),
|
|
|
+ _ => AnnotationsStatCat::Categorical(self.to_string()),
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
impl AnnotationsStats {
|
|
|
- pub fn save_to_json(&self, file_path: &str) -> anyhow::Result<()> {
|
|
|
- let json = serde_json::to_string_pretty(self)?;
|
|
|
- let mut file = File::create(file_path)?;
|
|
|
- file.write_all(json.as_bytes())?;
|
|
|
+ /// Serializes the statistics to a pretty-printed JSON file.
|
|
|
+ pub fn save_to_json(&self, file_path: impl AsRef<Path>) -> anyhow::Result<()> {
|
|
|
+ let file = File::create(file_path)?;
|
|
|
+ let writer = BufWriter::new(file);
|
|
|
+ serde_json::to_writer_pretty(writer, self)?;
|
|
|
Ok(())
|
|
|
}
|
|
|
|
|
|
- pub fn load_from_json(file_path: &str) -> anyhow::Result<Self> {
|
|
|
+ /// Deserializes statistics from a JSON file produced by [`save_to_json`](Self::save_to_json).
|
|
|
+ pub fn load_from_json(file_path: impl AsRef<Path>) -> anyhow::Result<Self> {
|
|
|
let mut file = File::open(file_path)?;
|
|
|
let mut contents = String::new();
|
|
|
file.read_to_string(&mut contents)?;
|
|
|
@@ -345,6 +438,9 @@ impl AnnotationsStats {
|
|
|
|
|
|
#[allow(clippy::type_complexity)]
|
|
|
impl Annotations {
|
|
|
+ /// Inserts or updates annotations for a given variant key.
|
|
|
+ ///
|
|
|
+ /// If the key already exists, the new annotations are appended to the existing ones.
|
|
|
pub fn insert_update(&self, key: Hash128, add: &[Annotation]) {
|
|
|
self.store
|
|
|
.entry(key)
|
|
|
@@ -356,7 +452,6 @@ impl Annotations {
|
|
|
&self,
|
|
|
annotations: Option<Box<dyn Fn(&Annotation) -> bool + Send + Sync>>,
|
|
|
) -> AnnotationsStats {
|
|
|
- use Annotation::*;
|
|
|
let map: DashMap<String, u64> = DashMap::new();
|
|
|
let num_maps: DashMap<String, HashMap<String, Vec<f64>>> = DashMap::new();
|
|
|
|
|
|
@@ -370,17 +465,9 @@ impl Annotations {
|
|
|
let mut categorical = Vec::new();
|
|
|
let mut numerical = Vec::new();
|
|
|
for ann in anns.iter() {
|
|
|
- match ann {
|
|
|
- LowConstitDepth | LowEntropy | GnomAD(_) | VEP(_) | TriNucleotides(_)
|
|
|
- | ReplicationTiming(_) | HighDepth | CpG | VNTR | Repeat | Panel(_)
|
|
|
- | LowMAPQ | HighConstitAlt => categorical.push(ann.to_string()),
|
|
|
- Callers(caller, sample) => categorical.push(format!("{caller} {sample}")),
|
|
|
- ShannonEntropy(v) => numerical.push((ann.to_string(), *v)),
|
|
|
- ConstitDepth(v) | Annotation::ConstitAlt(v) => {
|
|
|
- numerical.push((ann.to_string(), *v as f64));
|
|
|
- }
|
|
|
- Cosmic(c) => numerical.push((ann.to_string(), c.cosmic_cnt as f64)),
|
|
|
- AlterationCategory(alt_cat) => categorical.push(alt_cat.to_string()),
|
|
|
+ match ann.stat() {
|
|
|
+ AnnotationsStatCat::Categorical(s) => categorical.push(s),
|
|
|
+ AnnotationsStatCat::Numerical(k, v) => numerical.push((k, v)),
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -400,34 +487,13 @@ impl Annotations {
|
|
|
}
|
|
|
});
|
|
|
|
|
|
- println!("\nCallers stats:");
|
|
|
- println!("\tn categories: {}", map.len());
|
|
|
- let mut n = 0;
|
|
|
- let lines: Vec<String> = map
|
|
|
- .iter()
|
|
|
- .map(|e| {
|
|
|
- let k = e.key();
|
|
|
- let v = e.value();
|
|
|
- n += v;
|
|
|
- let mut num_str = Vec::new();
|
|
|
- if let Some(nums) = num_maps.get(k) {
|
|
|
- num_str.extend(
|
|
|
- nums.iter()
|
|
|
- .map(|(k_n, v_n)| format!("{k_n} {:.2}", mean(v_n))),
|
|
|
- )
|
|
|
- }
|
|
|
- num_str.sort();
|
|
|
- format!("\t- {k}\t{v}\t{}", num_str.join("\t"))
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- println!("{}", lines.join("\n"));
|
|
|
- println!("Total\t{n}\n");
|
|
|
-
|
|
|
- AnnotationsStats {
|
|
|
+ let stats = AnnotationsStats {
|
|
|
categorical: map,
|
|
|
numeric: num_maps,
|
|
|
- }
|
|
|
+ };
|
|
|
+
|
|
|
+ info!("\n{stats}");
|
|
|
+ stats
|
|
|
}
|
|
|
|
|
|
pub fn vep_stats(&self) -> anyhow::Result<VepStats> {
|