|
|
@@ -0,0 +1,960 @@
|
|
|
+use anyhow::Context;
|
|
|
+use log::{info, warn};
|
|
|
+use ordered_float::Float;
|
|
|
+use pandora_lib_graph::cytoband::{svg_chromosome, AdditionalRect, RectPosition};
|
|
|
+use plotly::{color::Rgb, common::Marker, layout::BarMode, Bar, Layout, Plot, Scatter};
|
|
|
+use rand::{thread_rng, Rng};
|
|
|
+use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
|
|
+use serde::{
|
|
|
+ de::{self, Visitor},
|
|
|
+ Deserialize, Deserializer, Serialize,
|
|
|
+};
|
|
|
+use statrs::{
|
|
|
+ distribution::{Continuous, Discrete},
|
|
|
+ statistics::Statistics,
|
|
|
+};
|
|
|
+use std::{
|
|
|
+ collections::{BTreeMap, HashMap, HashSet},
|
|
|
+ f64, fmt,
|
|
|
+ fs::File,
|
|
|
+ io::{BufRead, BufReader, Write},
|
|
|
+ str::FromStr,
|
|
|
+};
|
|
|
+
|
|
|
+#[derive(Debug, Clone)]
|
|
|
+pub struct Count {
|
|
|
+ pub position: CountRange,
|
|
|
+ pub n_reads: u32,
|
|
|
+ pub n_low_mapq: u32,
|
|
|
+ pub frac_sa: f32,
|
|
|
+ pub sa_outlier: bool,
|
|
|
+ pub frac_se: f32,
|
|
|
+ pub se_outlier: bool,
|
|
|
+ pub annotation: Vec<CountAnnotation>,
|
|
|
+}
|
|
|
+
|
|
|
+impl fmt::Display for Count {
|
|
|
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
+ write!(
|
|
|
+ f,
|
|
|
+ "{}\t{}\t{}\t{:.6}\t{}\t{:.6}\t{}",
|
|
|
+ self.position,
|
|
|
+ self.n_reads,
|
|
|
+ self.n_low_mapq,
|
|
|
+ self.frac_sa,
|
|
|
+ self.sa_outlier,
|
|
|
+ self.frac_se,
|
|
|
+ self.se_outlier
|
|
|
+ )
|
|
|
+ }
|
|
|
+}
|
|
|
+// inclusive 0 based
|
|
|
+#[derive(Debug, Clone)]
|
|
|
+pub struct CountRange {
|
|
|
+ pub contig: String,
|
|
|
+ pub start: u32,
|
|
|
+ pub end: u32,
|
|
|
+}
|
|
|
+
|
|
|
+impl fmt::Display for CountRange {
|
|
|
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
+ write!(f, "{}:{}-{}", self.contig, self.start, self.end)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Clone, Hash, Eq, PartialEq)]
|
|
|
+pub enum CountAnnotation {
|
|
|
+ MaskedLowMRD,
|
|
|
+ MaskedQuality,
|
|
|
+}
|
|
|
+
|
|
|
+impl<'de> Deserialize<'de> for Count {
|
|
|
+ fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
|
|
+ where
|
|
|
+ D: Deserializer<'de>,
|
|
|
+ {
|
|
|
+ struct CountVisitor;
|
|
|
+
|
|
|
+ impl<'de> Visitor<'de> for CountVisitor {
|
|
|
+ type Value = Count;
|
|
|
+
|
|
|
+ fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
|
+ formatter.write_str("a string in the format 'chr:start-end n_reads n_low_mapq frac_sa sa_outlier frac_se se_outlier'")
|
|
|
+ }
|
|
|
+
|
|
|
+ fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
|
|
|
+ where
|
|
|
+ E: de::Error,
|
|
|
+ {
|
|
|
+ let parts: Vec<&str> = s.split_whitespace().collect();
|
|
|
+ if parts.len() != 7 {
|
|
|
+ return Err(E::custom("incorrect number of fields"));
|
|
|
+ }
|
|
|
+
|
|
|
+ let position_parts: Vec<&str> = parts[0].split(&[':', '-'][..]).collect();
|
|
|
+ if position_parts.len() != 3 {
|
|
|
+ return Err(E::custom("incorrect position format"));
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(Count {
|
|
|
+ position: CountRange {
|
|
|
+ contig: position_parts[0].to_string(),
|
|
|
+ start: u32::from_str(position_parts[1]).map_err(E::custom)?,
|
|
|
+ end: u32::from_str(position_parts[2]).map_err(E::custom)?,
|
|
|
+ },
|
|
|
+ n_reads: u32::from_str(parts[1]).map_err(E::custom)?,
|
|
|
+ n_low_mapq: u32::from_str(parts[2]).map_err(E::custom)?,
|
|
|
+ frac_sa: f32::from_str(parts[3]).map_err(E::custom)?,
|
|
|
+ sa_outlier: bool::from_str(parts[4]).map_err(E::custom)?,
|
|
|
+ frac_se: f32::from_str(parts[5]).map_err(E::custom)?,
|
|
|
+ se_outlier: bool::from_str(parts[6]).map_err(E::custom)?,
|
|
|
+ annotation: Vec::new(),
|
|
|
+ })
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ deserializer.deserialize_str(CountVisitor)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+pub fn read_counts_from_file(filename: &str) -> anyhow::Result<Vec<Count>> {
|
|
|
+ let file = File::open(filename)?;
|
|
|
+ let reader = BufReader::new(file);
|
|
|
+ let mut counts = Vec::new();
|
|
|
+
|
|
|
+ for line in reader.lines() {
|
|
|
+ let line = line?;
|
|
|
+ let count: Count = serde_json::from_str(&format!("\"{}\"", escape_control_chars(&line)))?;
|
|
|
+ counts.push(count);
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(counts)
|
|
|
+}
|
|
|
+fn escape_control_chars(s: &str) -> String {
|
|
|
+ s.chars()
|
|
|
+ .map(|c| {
|
|
|
+ if c.is_control() {
|
|
|
+ format!("\\u{:04x}", c as u32)
|
|
|
+ } else {
|
|
|
+ c.to_string()
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect()
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug)]
|
|
|
+pub struct Counts {
|
|
|
+ pub data: HashMap<String, Vec<Count>>,
|
|
|
+ pub mrd: HashMap<String, Vec<Count>>,
|
|
|
+}
|
|
|
+
|
|
|
+impl Counts {
|
|
|
+ pub fn from_files(paths: Vec<String>) -> Self {
|
|
|
+ let counts: Vec<Vec<Count>> = paths
|
|
|
+ .par_iter()
|
|
|
+ .map(|path| match read_counts_from_file(path) {
|
|
|
+ Ok(c) => c,
|
|
|
+ Err(e) => {
|
|
|
+ warn!("Couldnt load {path}: {e}");
|
|
|
+ Vec::new()
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .filter(|v| !v.is_empty())
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let mut data = HashMap::new();
|
|
|
+ for count in counts {
|
|
|
+ let contig = count.first().unwrap().position.contig.clone();
|
|
|
+ data.insert(contig, count);
|
|
|
+ }
|
|
|
+ Counts {
|
|
|
+ data,
|
|
|
+ mrd: HashMap::new(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn mrd_from_files(&mut self, paths: Vec<String>) {
|
|
|
+ let counts: Vec<Vec<Count>> = paths
|
|
|
+ .par_iter()
|
|
|
+ .map(|path| match read_counts_from_file(path) {
|
|
|
+ Ok(c) => c,
|
|
|
+ Err(e) => {
|
|
|
+ warn!("Couldnt load {path}: {e}");
|
|
|
+ Vec::new()
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .filter(|v| !v.is_empty())
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let mut data = HashMap::new();
|
|
|
+ for count in counts {
|
|
|
+ let contig = count.first().unwrap().position.contig.clone();
|
|
|
+ data.insert(contig, count);
|
|
|
+ }
|
|
|
+ self.mrd = data;
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn mask_low_mrd(&mut self, contig: &str, min_reads: u32) -> anyhow::Result<()> {
|
|
|
+ if let (Some(mrd), Some(diag)) = (self.mrd.get(contig), self.data.get_mut(contig)) {
|
|
|
+ for (m, d) in mrd.iter().zip(diag) {
|
|
|
+ if m.n_reads < min_reads {
|
|
|
+ d.annotation.push(CountAnnotation::MaskedLowMRD);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in both mrd and diag.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn mask_low_quality(&mut self, contig: &str, max_ratio: f64) -> anyhow::Result<()> {
|
|
|
+ if let Some(diag) = self.data.get_mut(contig) {
|
|
|
+ for d in diag.iter_mut() {
|
|
|
+ if (d.n_low_mapq as f64 / (d.n_reads + d.n_low_mapq) as f64) > max_ratio {
|
|
|
+ d.annotation.push(CountAnnotation::MaskedQuality);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in both mrd and diag.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn frequencies(&self, contig: &str) -> anyhow::Result<Vec<(f64, f64)>> {
|
|
|
+ let data = self.get(contig)?;
|
|
|
+
|
|
|
+ let mut frequencies = HashMap::new();
|
|
|
+ for count in data.iter() {
|
|
|
+ *frequencies.entry(*count).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut frequencies: Vec<(u32, f64)> =
|
|
|
+ frequencies.iter().map(|(x, y)| (*x, *y as f64)).collect();
|
|
|
+ frequencies.sort_by_key(|v| v.0);
|
|
|
+ Ok(frequencies.iter().map(|(x, y)| (*x as f64, *y)).collect())
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn percentile(&self, contig: &str, percentile: f64) -> anyhow::Result<u32> {
|
|
|
+ let mut data = self.get(contig)?;
|
|
|
+ data.sort_unstable();
|
|
|
+ let total_count = data.len();
|
|
|
+ let index = |percentile: f64| -> usize {
|
|
|
+ (percentile / 100.0 * (total_count - 1) as f64).round() as usize
|
|
|
+ };
|
|
|
+
|
|
|
+ Ok(*data.get(index(percentile)).context("Error in percentile")?)
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_contig(
|
|
|
+ &mut self,
|
|
|
+ contig: &str,
|
|
|
+ prefix: &str,
|
|
|
+ breaks: Vec<u32>,
|
|
|
+ ) -> anyhow::Result<CountsStats> {
|
|
|
+ self.mask_low_mrd(contig, 6)?;
|
|
|
+ self.mask_low_quality(contig, 0.1)?;
|
|
|
+
|
|
|
+ let data: Vec<f64> = self.get(contig)?.iter().map(|v| *v as f64).collect();
|
|
|
+ let n_final = data.len();
|
|
|
+
|
|
|
+ let frequencies = self.frequencies(contig)?;
|
|
|
+ let percentile_99 = self.percentile(contig, 99.0)?;
|
|
|
+
|
|
|
+ let mut data_x = Vec::new();
|
|
|
+ let mut data_y = Vec::new();
|
|
|
+ frequencies.iter().for_each(|(x, y)| {
|
|
|
+ if *x <= percentile_99 as f64 {
|
|
|
+ data_x.push(*x);
|
|
|
+ data_y.push(*y / n_final as f64);
|
|
|
+ }
|
|
|
+ });
|
|
|
+
|
|
|
+ // Distribution plot
|
|
|
+ let distribution_path = format!("{prefix}_{contig}_distrib.svg");
|
|
|
+ info!("Saving graph: {distribution_path}");
|
|
|
+ let mut plot = Plot::new();
|
|
|
+ let colors: Vec<Rgb> = data_x
|
|
|
+ .iter()
|
|
|
+ .map(|&x| match x {
|
|
|
+ x if x < 2.0 => Rgb::new(193, 18, 31),
|
|
|
+ x if x < 6.0 => Rgb::new(243, 114, 44),
|
|
|
+ x if x < 15.0 => Rgb::new(255, 202, 58),
|
|
|
+ _ => Rgb::new(138, 201, 38),
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let bars = Bar::new(data_x.clone(), data_y.clone())
|
|
|
+ .show_legend(false)
|
|
|
+ .marker(Marker::new().color_array(colors));
|
|
|
+
|
|
|
+ plot.add_trace(bars);
|
|
|
+
|
|
|
+ let sum: f64 = data.iter().sum();
|
|
|
+ let mean = (&data).mean();
|
|
|
+ let count = data.len() as f64;
|
|
|
+ let std_dev = (&data).std_dev();
|
|
|
+
|
|
|
+ // Normal
|
|
|
+ let normal = statrs::distribution::Normal::new(mean, std_dev)?;
|
|
|
+ let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ // // Gamma
|
|
|
+ // let shape = mean * mean / variance;
|
|
|
+ // let rate = mean / variance;
|
|
|
+ //
|
|
|
+ // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
|
|
|
+ // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
|
|
|
+ // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
|
|
|
+ // plot.add_trace(trace);
|
|
|
+
|
|
|
+ // Poisson
|
|
|
+ let lambda = sum / count;
|
|
|
+ let poisson = statrs::distribution::Poisson::new(lambda)?;
|
|
|
+ let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ plot.write_image(distribution_path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
+
|
|
|
+ // Fractions
|
|
|
+ let mut breaks_values = Vec::new();
|
|
|
+ for (i, b) in breaks.iter().enumerate() {
|
|
|
+ if i == 0 {
|
|
|
+ let total: f64 = frequencies
|
|
|
+ .iter()
|
|
|
+ .filter(|(x, _)| *x < *b as f64)
|
|
|
+ .map(|(_, y)| *y / count)
|
|
|
+ .sum();
|
|
|
+ breaks_values.push((format!("< {b}"), total));
|
|
|
+ } else {
|
|
|
+ let last = breaks[i - 1];
|
|
|
+ let total: f64 = frequencies
|
|
|
+ .iter()
|
|
|
+ .filter(|(x, _)| *x < *b as f64 && *x >= last as f64)
|
|
|
+ .map(|(_, y)| *y / count)
|
|
|
+ .sum();
|
|
|
+
|
|
|
+ breaks_values.push((format!("[{last} - {b}["), total));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let last = *breaks.last().unwrap();
|
|
|
+ let total: f64 = frequencies
|
|
|
+ .iter()
|
|
|
+ .filter(|(x, _)| *x >= last as f64)
|
|
|
+ .map(|(_, y)| *y / count)
|
|
|
+ .sum();
|
|
|
+ breaks_values.push((format!(">= {last}"), total));
|
|
|
+
|
|
|
+ // Chromosome
|
|
|
+ let tol = 25;
|
|
|
+ let chromosome_path = format!("{prefix}_{contig}_chromosome.svg");
|
|
|
+ info!("Saving graph: {chromosome_path}");
|
|
|
+
|
|
|
+ let target_annotations: HashSet<CountAnnotation> = vec![
|
|
|
+ CountAnnotation::MaskedLowMRD,
|
|
|
+ CountAnnotation::MaskedQuality,
|
|
|
+ ]
|
|
|
+ .into_iter()
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let d: Vec<u32> = self
|
|
|
+ .data
|
|
|
+ .get(contig)
|
|
|
+ .unwrap()
|
|
|
+ .iter()
|
|
|
+ .map(|c| {
|
|
|
+ if c.annotation
|
|
|
+ .iter()
|
|
|
+ .any(|ann| target_annotations.contains(ann))
|
|
|
+ {
|
|
|
+ 10_000u32
|
|
|
+ } else {
|
|
|
+ c.n_reads
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let hm = self.counts_annotations(contig)?;
|
|
|
+ let len = d.len();
|
|
|
+ let mut masked: Vec<(String, f64)> = hm
|
|
|
+ .iter()
|
|
|
+ .map(|(k, v)| (format!("{:?}", k), *v as f64 / len as f64))
|
|
|
+ .collect();
|
|
|
+ masked.push(("Un masked".to_string(), n_final as f64 / len as f64));
|
|
|
+
|
|
|
+ let under_6_rects: Vec<AdditionalRect> = ranges_under(&d, 5, tol)
|
|
|
+ .iter()
|
|
|
+ .filter(|(s, e)| e > s)
|
|
|
+ .map(|(start, end)| AdditionalRect {
|
|
|
+ start: *start as u32 * 1000,
|
|
|
+ end: *end as u32 * 1000,
|
|
|
+ color: String::from("red"),
|
|
|
+ position: RectPosition::Below(1),
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let over_6_rects: Vec<AdditionalRect> = ranges_between(&d, 6, 9999, tol)
|
|
|
+ .iter()
|
|
|
+ .filter(|(s, e)| e > s)
|
|
|
+ .map(|(start, end)| AdditionalRect {
|
|
|
+ start: *start as u32 * 1000,
|
|
|
+ end: *end as u32 * 1000,
|
|
|
+ color: String::from("green"),
|
|
|
+ position: RectPosition::Below(2),
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let masked_rec: Vec<AdditionalRect> = ranges_over(&d, 10000, tol)
|
|
|
+ .iter()
|
|
|
+ .filter(|(s, e)| e > s)
|
|
|
+ .map(|(start, end)| AdditionalRect {
|
|
|
+ start: *start as u32 * 1000,
|
|
|
+ end: *end as u32 * 1000,
|
|
|
+ color: String::from("grey"),
|
|
|
+ position: RectPosition::Below(0),
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let mut all = Vec::new();
|
|
|
+ all.extend(under_6_rects);
|
|
|
+ all.extend(over_6_rects);
|
|
|
+ // all.extend(over15);
|
|
|
+ all.extend(masked_rec);
|
|
|
+
|
|
|
+ svg_chromosome(
|
|
|
+ contig,
|
|
|
+ 1000,
|
|
|
+ 50,
|
|
|
+ "/data/ref/hs1/cytoBandMapped.bed",
|
|
|
+ &chromosome_path,
|
|
|
+ &all,
|
|
|
+ &Vec::new(),
|
|
|
+ )
|
|
|
+ .unwrap();
|
|
|
+
|
|
|
+ let stats = CountsStats {
|
|
|
+ sum,
|
|
|
+ mean,
|
|
|
+ std_dev,
|
|
|
+ breaks_values,
|
|
|
+ masked,
|
|
|
+ };
|
|
|
+
|
|
|
+ // Save stats
|
|
|
+ let json_path = format!("{prefix}_{contig}_stats.json");
|
|
|
+ info!("Saving stats into: {json_path}");
|
|
|
+ let json = serde_json::to_string_pretty(&stats)?;
|
|
|
+ let mut file = File::create(json_path)?;
|
|
|
+ file.write_all(json.as_bytes())?;
|
|
|
+
|
|
|
+ Ok(stats)
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_contigs(
|
|
|
+ &mut self,
|
|
|
+ contigs: &Vec<String>,
|
|
|
+ prefix: &str,
|
|
|
+ breaks: Vec<u32>,
|
|
|
+ ) -> anyhow::Result<()> {
|
|
|
+ let mut stats = Vec::new();
|
|
|
+ let mut proportions = HashMap::new();
|
|
|
+ for contig in contigs {
|
|
|
+ let stat = self.save_contig(contig, prefix, breaks.clone())?;
|
|
|
+ let un_masked: Vec<&(String, f64)> = stat.masked.iter().filter(|(s, _)| s == "Un masked").collect();
|
|
|
+ let un_masked = un_masked.first().unwrap().1;
|
|
|
+ let masked = 1.0 - un_masked;
|
|
|
+ let mut props: Vec<(String, f64)> = stat.breaks_values.iter().map(|(s, v)| (s.to_string(), *v * un_masked)).collect();
|
|
|
+ props.push(("masked".to_string(), masked));
|
|
|
+ props.iter().for_each(|(s, v)| {
|
|
|
+ proportions.entry(s.to_string()).or_insert(vec![]).push(*v);
|
|
|
+ });
|
|
|
+ stats.push(stat);
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut plot = Plot::new();
|
|
|
+ let layout = Layout::new().bar_mode(BarMode::Stack);
|
|
|
+
|
|
|
+ for (k, v) in proportions {
|
|
|
+ plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(k));
|
|
|
+ }
|
|
|
+ println!("{:?}", contigs);
|
|
|
+ plot.set_layout(layout);
|
|
|
+ plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn counts_annotations(
|
|
|
+ &self,
|
|
|
+ contig: &str,
|
|
|
+ ) -> anyhow::Result<HashMap<CountAnnotation, u64>> {
|
|
|
+ if let Some(d) = self.data.get(contig) {
|
|
|
+ let mut counts = HashMap::new();
|
|
|
+ for c in d {
|
|
|
+ for a in &c.annotation {
|
|
|
+ *counts.entry(a.clone()).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(counts)
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn get(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
|
|
|
+ if let Some(ccounts) = self.data.get(contig) {
|
|
|
+ let target_annotations: HashSet<CountAnnotation> = vec![
|
|
|
+ CountAnnotation::MaskedLowMRD,
|
|
|
+ CountAnnotation::MaskedQuality,
|
|
|
+ ]
|
|
|
+ .into_iter()
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ Ok(ccounts
|
|
|
+ .iter()
|
|
|
+ .filter(|count| {
|
|
|
+ !count
|
|
|
+ .annotation
|
|
|
+ .iter()
|
|
|
+ .any(|ann| target_annotations.contains(ann))
|
|
|
+ })
|
|
|
+ .map(|c| c.n_reads)
|
|
|
+ .collect())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn mrd(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
|
|
|
+ if let Some(ccounts) = self.mrd.get(contig) {
|
|
|
+ Ok(ccounts.iter().map(|c| c.n_reads).collect())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn calculate_percentiles(
|
|
|
+ &self,
|
|
|
+ contig: &str,
|
|
|
+ percentiles: &[f64],
|
|
|
+ ) -> anyhow::Result<Vec<f64>> {
|
|
|
+ if let Some(ccounts) = self.data.get(contig) {
|
|
|
+ let mut n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
|
|
|
+
|
|
|
+ n_reads.sort_unstable();
|
|
|
+
|
|
|
+ let cdf = ND::new(n_reads.clone());
|
|
|
+
|
|
|
+ // println!("CDF at 13: {:?}", cdf.cdf(13));
|
|
|
+ println!("Percentile at 99: {:?}", cdf.percentile(99.0));
|
|
|
+ println!("above 15X: {:?}", cdf.proportion_above(15));
|
|
|
+ // println!("above 15.1X: {:?}", cdf.fitted_proportion_above(&15.1));
|
|
|
+ println!("under 6X: {:?}", cdf.proportion_under(6));
|
|
|
+
|
|
|
+ Ok(percentiles
|
|
|
+ .iter()
|
|
|
+ .map(|&p| {
|
|
|
+ let index = (p * (n_reads.len() - 1) as f64).round() as usize;
|
|
|
+ n_reads[index] as f64
|
|
|
+ })
|
|
|
+ .collect())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn nd_reads(&self, contig: &str) -> anyhow::Result<ND> {
|
|
|
+ if let Some(ccounts) = self.data.get(contig) {
|
|
|
+ Ok(ND::new(ccounts.iter().map(|c| c.n_reads).collect()))
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn distribution(&self, contig: &str) -> anyhow::Result<ND> {
|
|
|
+ Ok(ND::new(self.get(contig)?))
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_stats(&self) -> anyhow::Result<()> {
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_global_proportions_graph(
|
|
|
+ &self,
|
|
|
+ path: &str,
|
|
|
+ contigs: &Vec<String>,
|
|
|
+ breaks: Vec<u32>,
|
|
|
+ ) {
|
|
|
+ let mut breaks_str = Vec::new();
|
|
|
+ for (i, b) in breaks.iter().enumerate() {
|
|
|
+ if i == 0 {
|
|
|
+ breaks_str.push(format!("< {b}"));
|
|
|
+ } else {
|
|
|
+ let last = breaks[i - 1];
|
|
|
+ breaks_str.push(format!("[{last} - {b}["))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ breaks_str.push(format!(">= {}", breaks.last().unwrap()));
|
|
|
+
|
|
|
+ let mut proportions = Vec::new();
|
|
|
+
|
|
|
+ for contig in contigs.iter() {
|
|
|
+ let d = self.get(contig).unwrap();
|
|
|
+ let nd = ND::new(d);
|
|
|
+ proportions.push(nd.frequencies(&breaks));
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut plot = Plot::new();
|
|
|
+ let layout = Layout::new().bar_mode(BarMode::Stack);
|
|
|
+ for (i, v) in transpose(proportions).iter().enumerate() {
|
|
|
+ plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(&breaks_str[i]));
|
|
|
+ }
|
|
|
+ println!("{:?}", contigs);
|
|
|
+ plot.set_layout(layout);
|
|
|
+ plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_global_distribution_graph(&self, path: &str, contigs: &Vec<String>) {
|
|
|
+ let d: Vec<u32> = contigs.iter().flat_map(|c| self.get(c).unwrap()).collect();
|
|
|
+ let mut data_sorted = d.clone();
|
|
|
+ data_sorted.sort_unstable();
|
|
|
+
|
|
|
+ let nd = ND::new(d.clone());
|
|
|
+ let mut plot = Plot::new();
|
|
|
+
|
|
|
+ let bar_x: Vec<u32> = (1..=nd.percentile(99.0).unwrap()).collect();
|
|
|
+ let colors: Vec<plotly::color::Rgb> = bar_x
|
|
|
+ .iter()
|
|
|
+ .map(|&x| {
|
|
|
+ if x <= 2 {
|
|
|
+ plotly::color::Rgb::new(193, 18, 31)
|
|
|
+ } else if x >= 15 {
|
|
|
+ plotly::color::Rgb::new(138, 201, 38)
|
|
|
+ } else if x <= 6 {
|
|
|
+ plotly::color::Rgb::new(243, 114, 44)
|
|
|
+ } else {
|
|
|
+ plotly::color::Rgb::new(255, 202, 58)
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let data: Vec<u32> = d.iter().filter(|x| **x >= 1).copied().collect();
|
|
|
+
|
|
|
+ // frequencies
|
|
|
+ let mut frequencies = HashMap::new();
|
|
|
+ for &value in &data {
|
|
|
+ *frequencies.entry(value).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ let bars = Bar::new(
|
|
|
+ bar_x.clone(),
|
|
|
+ bar_x
|
|
|
+ .iter()
|
|
|
+ .map(|x| *frequencies.get(x).unwrap_or(&0) as f64 / data.len() as f64)
|
|
|
+ .collect(),
|
|
|
+ )
|
|
|
+ .show_legend(false)
|
|
|
+ .marker(Marker::new().color_array(colors));
|
|
|
+
|
|
|
+ plot.add_trace(bars);
|
|
|
+
|
|
|
+ let data_x = generate_range(0.0, nd.percentile(99.0).unwrap().into(), 100);
|
|
|
+ let data_y: Vec<f64> = data_x.iter().map(|x| nd.fitted_normal.pdf(x)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Gaussian");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ // Gamma
|
|
|
+ let data: Vec<f64> = d.iter().map(|x| *x as f64).collect();
|
|
|
+
|
|
|
+ let sum: f64 = data.iter().sum();
|
|
|
+ let mean = (&data).mean();
|
|
|
+ let variance = (&data).variance();
|
|
|
+ let count = d.len() as f64;
|
|
|
+ let shape = mean * mean / variance;
|
|
|
+ let rate = mean / variance;
|
|
|
+
|
|
|
+ let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
|
|
|
+ let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ // Poisson
|
|
|
+ let lambda = sum / count;
|
|
|
+ let poisson = statrs::distribution::Poisson::new(lambda).unwrap();
|
|
|
+ let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
+ println!("> 15x: {:?}", nd.frequencies(&vec![1, 6, 15]));
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+pub struct ND {
|
|
|
+ pub data: Vec<u32>,
|
|
|
+ pub distribution: BTreeMap<u32, f64>,
|
|
|
+ pub total_count: usize,
|
|
|
+ pub frequency: HashMap<u32, usize>,
|
|
|
+ pub fitted_normal: UvNormal,
|
|
|
+}
|
|
|
+
|
|
|
+use rstat::{fitting::MLE, normal::UvNormal, ContinuousDistribution};
|
|
|
+
|
|
|
+impl ND {
|
|
|
+ fn new(mut data: Vec<u32>) -> Self {
|
|
|
+ data.sort_unstable();
|
|
|
+ let n = data.len();
|
|
|
+ info!("n values {n}");
|
|
|
+ let mut distribution = BTreeMap::new();
|
|
|
+
|
|
|
+ let mut frequency = HashMap::new();
|
|
|
+ for &value in &data {
|
|
|
+ *frequency.entry(value).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut cumulative_count = 0;
|
|
|
+ for (&value, &count) in &frequency {
|
|
|
+ cumulative_count += count;
|
|
|
+ let cumulative_prob = cumulative_count as f64 / n as f64;
|
|
|
+ distribution.insert(value, cumulative_prob);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Fit normal distribution
|
|
|
+ let fitted_normal = rstat::univariate::normal::Normal::fit_mle(
|
|
|
+ &data
|
|
|
+ .iter()
|
|
|
+ .filter(|x| *x >= &1u32)
|
|
|
+ .map(|x| *x as f64)
|
|
|
+ .collect::<Vec<f64>>(),
|
|
|
+ )
|
|
|
+ .unwrap();
|
|
|
+
|
|
|
+ Self {
|
|
|
+ data,
|
|
|
+ distribution,
|
|
|
+ frequency,
|
|
|
+ total_count: n,
|
|
|
+ fitted_normal,
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn frequency(&self, x: u32) -> usize {
|
|
|
+ *self.frequency.get(&x).unwrap_or(&0)
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn frequencies(&self, breaks: &Vec<u32>) -> Vec<f64> {
|
|
|
+ let mut last_prop_under = 0.0;
|
|
|
+ let mut res = Vec::new();
|
|
|
+ for brk in breaks {
|
|
|
+ let v = self.proportion_under(*brk) - last_prop_under;
|
|
|
+ res.push(v);
|
|
|
+ last_prop_under += v;
|
|
|
+ }
|
|
|
+ let per99 = self.percentile(99.0).unwrap();
|
|
|
+ res.push(self.proportion_under(per99) - last_prop_under);
|
|
|
+ res
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn percentile(&self, percentile: f64) -> Option<u32> {
|
|
|
+ if !(0.0..=100.0).contains(&percentile) {
|
|
|
+ return None;
|
|
|
+ }
|
|
|
+
|
|
|
+ let index = (percentile / 100.0 * (self.total_count - 1) as f64).round() as usize;
|
|
|
+ self.data.get(index).cloned()
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn proportion_under(&self, x: u32) -> f64 {
|
|
|
+ let count = self
|
|
|
+ .frequency
|
|
|
+ .iter()
|
|
|
+ .filter(|(&value, _)| value < x)
|
|
|
+ .map(|(_, &count)| count)
|
|
|
+ .sum::<usize>();
|
|
|
+ count as f64 / self.total_count as f64
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn proportion_above(&self, x: u32) -> f64 {
|
|
|
+ let count = self
|
|
|
+ .frequency
|
|
|
+ .iter()
|
|
|
+ .filter(|(&value, _)| value > x)
|
|
|
+ .map(|(_, &count)| count)
|
|
|
+ .sum::<usize>();
|
|
|
+ count as f64 / self.total_count as f64
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
|
|
|
+ if steps == 0 {
|
|
|
+ return vec![];
|
|
|
+ }
|
|
|
+ if steps == 1 {
|
|
|
+ return vec![start];
|
|
|
+ }
|
|
|
+
|
|
|
+ let step_size = (end - start) / (steps - 1) as f64;
|
|
|
+ (0..steps).map(|i| start + i as f64 * step_size).collect()
|
|
|
+}
|
|
|
+
|
|
|
+use rayon::prelude::*;
|
|
|
+
|
|
|
+pub fn ranges_under(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
|
|
|
+ get_ranges_parallel(vec, x, tolerance, |val, threshold| val <= threshold)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn ranges_over(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
|
|
|
+ get_ranges_parallel(vec, x, tolerance, |val, threshold| val >= threshold)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn ranges_between(
|
|
|
+ vec: &[u32],
|
|
|
+ lower: u32,
|
|
|
+ upper: u32,
|
|
|
+ tolerance: usize,
|
|
|
+) -> Vec<(usize, usize)> {
|
|
|
+ get_ranges_parallel(vec, (lower, upper), tolerance, |val, (l, u)| {
|
|
|
+ val >= l && val <= u
|
|
|
+ })
|
|
|
+}
|
|
|
+
|
|
|
+pub fn get_ranges_parallel<T, F>(
|
|
|
+ vec: &[u32],
|
|
|
+ threshold: T,
|
|
|
+ tolerance: usize,
|
|
|
+ compare: F,
|
|
|
+) -> Vec<(usize, usize)>
|
|
|
+where
|
|
|
+ F: Fn(u32, T) -> bool + Sync,
|
|
|
+ T: Copy + Sync,
|
|
|
+{
|
|
|
+ if vec.is_empty() {
|
|
|
+ return Vec::new();
|
|
|
+ }
|
|
|
+
|
|
|
+ let chunk_size = (vec.len() / rayon::current_num_threads()).max(1);
|
|
|
+ vec.par_chunks(chunk_size)
|
|
|
+ .enumerate()
|
|
|
+ .flat_map(|(chunk_index, chunk)| {
|
|
|
+ let mut local_ranges = Vec::new();
|
|
|
+ let mut current_range: Option<(usize, usize)> = None;
|
|
|
+ let offset = chunk_index * chunk_size;
|
|
|
+
|
|
|
+ for (i, &val) in chunk.iter().enumerate() {
|
|
|
+ let global_index = offset + i;
|
|
|
+ if compare(val, threshold) {
|
|
|
+ match current_range {
|
|
|
+ Some((start, end)) if global_index <= end + tolerance + 1 => {
|
|
|
+ current_range = Some((start, global_index));
|
|
|
+ }
|
|
|
+ Some((start, end)) => {
|
|
|
+ local_ranges.push((start, end));
|
|
|
+ current_range = Some((global_index, global_index));
|
|
|
+ }
|
|
|
+ None => {
|
|
|
+ current_range = Some((global_index, global_index));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if let Some((start, end)) = current_range {
|
|
|
+ if global_index > end + tolerance + 1 {
|
|
|
+ local_ranges.push((start, end));
|
|
|
+ current_range = None;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if let Some(range) = current_range {
|
|
|
+ local_ranges.push(range);
|
|
|
+ }
|
|
|
+
|
|
|
+ local_ranges
|
|
|
+ })
|
|
|
+ .collect()
|
|
|
+}
|
|
|
+
|
|
|
+pub fn transpose(v: Vec<Vec<f64>>) -> Vec<Vec<f64>> {
|
|
|
+ assert!(!v.is_empty());
|
|
|
+ let len = v[0].len();
|
|
|
+ let mut result = vec![Vec::with_capacity(v.len()); len];
|
|
|
+
|
|
|
+ for row in v {
|
|
|
+ for (i, val) in row.into_iter().enumerate() {
|
|
|
+ result[i].push(val);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ result
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Serialize)]
|
|
|
+pub struct CountsStats {
|
|
|
+ pub sum: f64,
|
|
|
+ pub mean: f64,
|
|
|
+ pub std_dev: f64,
|
|
|
+ pub breaks_values: Vec<(String, f64)>,
|
|
|
+ pub masked: Vec<(String, f64)>,
|
|
|
+}
|
|
|
+
|
|
|
+// pub fn save_barplota
|
|
|
+// data: Vec<f64>,
|
|
|
+// data_x: Vec<f64>,
|
|
|
+// data_y: Vec<f64>,
|
|
|
+// path: &str,
|
|
|
+// ) -> anyhow::Result<CountsStats> {
|
|
|
+// let mut plot = Plot::new();
|
|
|
+//
|
|
|
+// let colors: Vec<plotly::color::Rgb> = data_x
|
|
|
+// .iter()
|
|
|
+// .map(|&x| {
|
|
|
+// if x <= 2.0 {
|
|
|
+// plotly::color::Rgb::new(193, 18, 31)
|
|
|
+// } else if x >= 15.0 {
|
|
|
+// plotly::color::Rgb::new(138, 201, 38)
|
|
|
+// } else if x <= 6.0 {
|
|
|
+// plotly::color::Rgb::new(243, 114, 44)
|
|
|
+// } else {
|
|
|
+// plotly::color::Rgb::new(255, 202, 58)
|
|
|
+// }
|
|
|
+// })
|
|
|
+// .collect();
|
|
|
+//
|
|
|
+// let bars = Bar::new(data_x.clone(), data_y.clone())
|
|
|
+// .show_legend(false)
|
|
|
+// .marker(Marker::new().color_array(colors));
|
|
|
+//
|
|
|
+// plot.add_trace(bars);
|
|
|
+//
|
|
|
+// let sum: f64 = data.iter().sum();
|
|
|
+// let mean = (&data).mean();
|
|
|
+// let count = data.len() as f64;
|
|
|
+// let std_dev = (&data).std_dev();
|
|
|
+// println!("mean {mean}");
|
|
|
+//
|
|
|
+// // Normal
|
|
|
+// let normal = statrs::distribution::Normal::new(mean, std_dev)?;
|
|
|
+// let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
|
|
|
+// let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
|
|
|
+// plot.add_trace(trace);
|
|
|
+//
|
|
|
+// // // Gamma
|
|
|
+// // let shape = mean * mean / variance;
|
|
|
+// // let rate = mean / variance;
|
|
|
+// //
|
|
|
+// // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
|
|
|
+// // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
|
|
|
+// // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
|
|
|
+// // plot.add_trace(trace);
|
|
|
+//
|
|
|
+// // Poisson
|
|
|
+// let lambda = sum / count;
|
|
|
+// let poisson = statrs::distribution::Poisson::new(lambda)?;
|
|
|
+// let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
|
|
|
+// let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
|
|
|
+// plot.add_trace(trace);
|
|
|
+//
|
|
|
+// plot.use_local_plotly();
|
|
|
+// plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
+// Ok(CountsStats { sum, mean, std_dev })
|
|
|
+// }
|