|
|
@@ -1,960 +0,0 @@
|
|
|
-use anyhow::Context;
|
|
|
-use log::{info, warn};
|
|
|
-use ordered_float::Float;
|
|
|
-use pandora_lib_graph::cytoband::{svg_chromosome, AdditionalRect, RectPosition};
|
|
|
-use plotly::{color::Rgb, common::Marker, layout::BarMode, Bar, Layout, Plot, Scatter};
|
|
|
-use rand::{thread_rng, Rng};
|
|
|
-use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
|
|
-use serde::{
|
|
|
- de::{self, Visitor},
|
|
|
- Deserialize, Deserializer, Serialize,
|
|
|
-};
|
|
|
-use statrs::{
|
|
|
- distribution::{Continuous, Discrete},
|
|
|
- statistics::Statistics,
|
|
|
-};
|
|
|
-use std::{
|
|
|
- collections::{BTreeMap, HashMap, HashSet},
|
|
|
- f64, fmt,
|
|
|
- fs::File,
|
|
|
- io::{BufRead, BufReader, Write},
|
|
|
- str::FromStr,
|
|
|
-};
|
|
|
-
|
|
|
-#[derive(Debug, Clone)]
|
|
|
-pub struct Count {
|
|
|
- pub position: CountRange,
|
|
|
- pub n_reads: u32,
|
|
|
- pub n_low_mapq: u32,
|
|
|
- pub frac_sa: f32,
|
|
|
- pub sa_outlier: bool,
|
|
|
- pub frac_se: f32,
|
|
|
- pub se_outlier: bool,
|
|
|
- pub annotation: Vec<CountAnnotation>,
|
|
|
-}
|
|
|
-
|
|
|
-impl fmt::Display for Count {
|
|
|
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
- write!(
|
|
|
- f,
|
|
|
- "{}\t{}\t{}\t{:.6}\t{}\t{:.6}\t{}",
|
|
|
- self.position,
|
|
|
- self.n_reads,
|
|
|
- self.n_low_mapq,
|
|
|
- self.frac_sa,
|
|
|
- self.sa_outlier,
|
|
|
- self.frac_se,
|
|
|
- self.se_outlier
|
|
|
- )
|
|
|
- }
|
|
|
-}
|
|
|
-// inclusive 0 based
|
|
|
-#[derive(Debug, Clone)]
|
|
|
-pub struct CountRange {
|
|
|
- pub contig: String,
|
|
|
- pub start: u32,
|
|
|
- pub end: u32,
|
|
|
-}
|
|
|
-
|
|
|
-impl fmt::Display for CountRange {
|
|
|
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
- write!(f, "{}:{}-{}", self.contig, self.start, self.end)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-#[derive(Debug, Clone, Hash, Eq, PartialEq)]
|
|
|
-pub enum CountAnnotation {
|
|
|
- MaskedLowMRD,
|
|
|
- MaskedQuality,
|
|
|
-}
|
|
|
-
|
|
|
-impl<'de> Deserialize<'de> for Count {
|
|
|
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
|
|
- where
|
|
|
- D: Deserializer<'de>,
|
|
|
- {
|
|
|
- struct CountVisitor;
|
|
|
-
|
|
|
- impl<'de> Visitor<'de> for CountVisitor {
|
|
|
- type Value = Count;
|
|
|
-
|
|
|
- fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
|
- formatter.write_str("a string in the format 'chr:start-end n_reads n_low_mapq frac_sa sa_outlier frac_se se_outlier'")
|
|
|
- }
|
|
|
-
|
|
|
- fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
|
|
|
- where
|
|
|
- E: de::Error,
|
|
|
- {
|
|
|
- let parts: Vec<&str> = s.split_whitespace().collect();
|
|
|
- if parts.len() != 7 {
|
|
|
- return Err(E::custom("incorrect number of fields"));
|
|
|
- }
|
|
|
-
|
|
|
- let position_parts: Vec<&str> = parts[0].split(&[':', '-'][..]).collect();
|
|
|
- if position_parts.len() != 3 {
|
|
|
- return Err(E::custom("incorrect position format"));
|
|
|
- }
|
|
|
-
|
|
|
- Ok(Count {
|
|
|
- position: CountRange {
|
|
|
- contig: position_parts[0].to_string(),
|
|
|
- start: u32::from_str(position_parts[1]).map_err(E::custom)?,
|
|
|
- end: u32::from_str(position_parts[2]).map_err(E::custom)?,
|
|
|
- },
|
|
|
- n_reads: u32::from_str(parts[1]).map_err(E::custom)?,
|
|
|
- n_low_mapq: u32::from_str(parts[2]).map_err(E::custom)?,
|
|
|
- frac_sa: f32::from_str(parts[3]).map_err(E::custom)?,
|
|
|
- sa_outlier: bool::from_str(parts[4]).map_err(E::custom)?,
|
|
|
- frac_se: f32::from_str(parts[5]).map_err(E::custom)?,
|
|
|
- se_outlier: bool::from_str(parts[6]).map_err(E::custom)?,
|
|
|
- annotation: Vec::new(),
|
|
|
- })
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- deserializer.deserialize_str(CountVisitor)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-pub fn read_counts_from_file(filename: &str) -> anyhow::Result<Vec<Count>> {
|
|
|
- let file = File::open(filename)?;
|
|
|
- let reader = BufReader::new(file);
|
|
|
- let mut counts = Vec::new();
|
|
|
-
|
|
|
- for line in reader.lines() {
|
|
|
- let line = line?;
|
|
|
- let count: Count = serde_json::from_str(&format!("\"{}\"", escape_control_chars(&line)))?;
|
|
|
- counts.push(count);
|
|
|
- }
|
|
|
-
|
|
|
- Ok(counts)
|
|
|
-}
|
|
|
-fn escape_control_chars(s: &str) -> String {
|
|
|
- s.chars()
|
|
|
- .map(|c| {
|
|
|
- if c.is_control() {
|
|
|
- format!("\\u{:04x}", c as u32)
|
|
|
- } else {
|
|
|
- c.to_string()
|
|
|
- }
|
|
|
- })
|
|
|
- .collect()
|
|
|
-}
|
|
|
-
|
|
|
-#[derive(Debug)]
|
|
|
-pub struct Counts {
|
|
|
- pub data: HashMap<String, Vec<Count>>,
|
|
|
- pub mrd: HashMap<String, Vec<Count>>,
|
|
|
-}
|
|
|
-
|
|
|
-impl Counts {
|
|
|
- pub fn from_files(paths: Vec<String>) -> Self {
|
|
|
- let counts: Vec<Vec<Count>> = paths
|
|
|
- .par_iter()
|
|
|
- .map(|path| match read_counts_from_file(path) {
|
|
|
- Ok(c) => c,
|
|
|
- Err(e) => {
|
|
|
- warn!("Couldnt load {path}: {e}");
|
|
|
- Vec::new()
|
|
|
- }
|
|
|
- })
|
|
|
- .filter(|v| !v.is_empty())
|
|
|
- .collect();
|
|
|
-
|
|
|
- let mut data = HashMap::new();
|
|
|
- for count in counts {
|
|
|
- let contig = count.first().unwrap().position.contig.clone();
|
|
|
- data.insert(contig, count);
|
|
|
- }
|
|
|
- Counts {
|
|
|
- data,
|
|
|
- mrd: HashMap::new(),
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pub fn mrd_from_files(&mut self, paths: Vec<String>) {
|
|
|
- let counts: Vec<Vec<Count>> = paths
|
|
|
- .par_iter()
|
|
|
- .map(|path| match read_counts_from_file(path) {
|
|
|
- Ok(c) => c,
|
|
|
- Err(e) => {
|
|
|
- warn!("Couldnt load {path}: {e}");
|
|
|
- Vec::new()
|
|
|
- }
|
|
|
- })
|
|
|
- .filter(|v| !v.is_empty())
|
|
|
- .collect();
|
|
|
-
|
|
|
- let mut data = HashMap::new();
|
|
|
- for count in counts {
|
|
|
- let contig = count.first().unwrap().position.contig.clone();
|
|
|
- data.insert(contig, count);
|
|
|
- }
|
|
|
- self.mrd = data;
|
|
|
- }
|
|
|
-
|
|
|
- pub fn mask_low_mrd(&mut self, contig: &str, min_reads: u32) -> anyhow::Result<()> {
|
|
|
- if let (Some(mrd), Some(diag)) = (self.mrd.get(contig), self.data.get_mut(contig)) {
|
|
|
- for (m, d) in mrd.iter().zip(diag) {
|
|
|
- if m.n_reads < min_reads {
|
|
|
- d.annotation.push(CountAnnotation::MaskedLowMRD);
|
|
|
- }
|
|
|
- }
|
|
|
- Ok(())
|
|
|
- } else {
|
|
|
- anyhow::bail!("No {contig} in both mrd and diag.")
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pub fn mask_low_quality(&mut self, contig: &str, max_ratio: f64) -> anyhow::Result<()> {
|
|
|
- if let Some(diag) = self.data.get_mut(contig) {
|
|
|
- for d in diag.iter_mut() {
|
|
|
- if (d.n_low_mapq as f64 / (d.n_reads + d.n_low_mapq) as f64) > max_ratio {
|
|
|
- d.annotation.push(CountAnnotation::MaskedQuality);
|
|
|
- }
|
|
|
- }
|
|
|
- Ok(())
|
|
|
- } else {
|
|
|
- anyhow::bail!("No {contig} in both mrd and diag.")
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pub fn frequencies(&self, contig: &str) -> anyhow::Result<Vec<(f64, f64)>> {
|
|
|
- let data = self.get(contig)?;
|
|
|
-
|
|
|
- let mut frequencies = HashMap::new();
|
|
|
- for count in data.iter() {
|
|
|
- *frequencies.entry(*count).or_insert(0) += 1;
|
|
|
- }
|
|
|
-
|
|
|
- let mut frequencies: Vec<(u32, f64)> =
|
|
|
- frequencies.iter().map(|(x, y)| (*x, *y as f64)).collect();
|
|
|
- frequencies.sort_by_key(|v| v.0);
|
|
|
- Ok(frequencies.iter().map(|(x, y)| (*x as f64, *y)).collect())
|
|
|
- }
|
|
|
-
|
|
|
- pub fn percentile(&self, contig: &str, percentile: f64) -> anyhow::Result<u32> {
|
|
|
- let mut data = self.get(contig)?;
|
|
|
- data.sort_unstable();
|
|
|
- let total_count = data.len();
|
|
|
- let index = |percentile: f64| -> usize {
|
|
|
- (percentile / 100.0 * (total_count - 1) as f64).round() as usize
|
|
|
- };
|
|
|
-
|
|
|
- Ok(*data.get(index(percentile)).context("Error in percentile")?)
|
|
|
- }
|
|
|
-
|
|
|
- pub fn save_contig(
|
|
|
- &mut self,
|
|
|
- contig: &str,
|
|
|
- prefix: &str,
|
|
|
- breaks: Vec<u32>,
|
|
|
- ) -> anyhow::Result<CountsStats> {
|
|
|
- self.mask_low_mrd(contig, 6)?;
|
|
|
- self.mask_low_quality(contig, 0.1)?;
|
|
|
-
|
|
|
- let data: Vec<f64> = self.get(contig)?.iter().map(|v| *v as f64).collect();
|
|
|
- let n_final = data.len();
|
|
|
-
|
|
|
- let frequencies = self.frequencies(contig)?;
|
|
|
- let percentile_99 = self.percentile(contig, 99.0)?;
|
|
|
-
|
|
|
- let mut data_x = Vec::new();
|
|
|
- let mut data_y = Vec::new();
|
|
|
- frequencies.iter().for_each(|(x, y)| {
|
|
|
- if *x <= percentile_99 as f64 {
|
|
|
- data_x.push(*x);
|
|
|
- data_y.push(*y / n_final as f64);
|
|
|
- }
|
|
|
- });
|
|
|
-
|
|
|
- // Distribution plot
|
|
|
- let distribution_path = format!("{prefix}_{contig}_distrib.svg");
|
|
|
- info!("Saving graph: {distribution_path}");
|
|
|
- let mut plot = Plot::new();
|
|
|
- let colors: Vec<Rgb> = data_x
|
|
|
- .iter()
|
|
|
- .map(|&x| match x {
|
|
|
- x if x < 2.0 => Rgb::new(193, 18, 31),
|
|
|
- x if x < 6.0 => Rgb::new(243, 114, 44),
|
|
|
- x if x < 15.0 => Rgb::new(255, 202, 58),
|
|
|
- _ => Rgb::new(138, 201, 38),
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- let bars = Bar::new(data_x.clone(), data_y.clone())
|
|
|
- .show_legend(false)
|
|
|
- .marker(Marker::new().color_array(colors));
|
|
|
-
|
|
|
- plot.add_trace(bars);
|
|
|
-
|
|
|
- let sum: f64 = data.iter().sum();
|
|
|
- let mean = (&data).mean();
|
|
|
- let count = data.len() as f64;
|
|
|
- let std_dev = (&data).std_dev();
|
|
|
-
|
|
|
- // Normal
|
|
|
- let normal = statrs::distribution::Normal::new(mean, std_dev)?;
|
|
|
- let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
|
|
|
- let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
|
|
|
- plot.add_trace(trace);
|
|
|
-
|
|
|
- // // Gamma
|
|
|
- // let shape = mean * mean / variance;
|
|
|
- // let rate = mean / variance;
|
|
|
- //
|
|
|
- // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
|
|
|
- // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
|
|
|
- // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
|
|
|
- // plot.add_trace(trace);
|
|
|
-
|
|
|
- // Poisson
|
|
|
- let lambda = sum / count;
|
|
|
- let poisson = statrs::distribution::Poisson::new(lambda)?;
|
|
|
- let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
|
|
|
- let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
|
|
|
- plot.add_trace(trace);
|
|
|
-
|
|
|
- plot.write_image(distribution_path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
-
|
|
|
- // Fractions
|
|
|
- let mut breaks_values = Vec::new();
|
|
|
- for (i, b) in breaks.iter().enumerate() {
|
|
|
- if i == 0 {
|
|
|
- let total: f64 = frequencies
|
|
|
- .iter()
|
|
|
- .filter(|(x, _)| *x < *b as f64)
|
|
|
- .map(|(_, y)| *y / count)
|
|
|
- .sum();
|
|
|
- breaks_values.push((format!("< {b}"), total));
|
|
|
- } else {
|
|
|
- let last = breaks[i - 1];
|
|
|
- let total: f64 = frequencies
|
|
|
- .iter()
|
|
|
- .filter(|(x, _)| *x < *b as f64 && *x >= last as f64)
|
|
|
- .map(|(_, y)| *y / count)
|
|
|
- .sum();
|
|
|
-
|
|
|
- breaks_values.push((format!("[{last} - {b}["), total));
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- let last = *breaks.last().unwrap();
|
|
|
- let total: f64 = frequencies
|
|
|
- .iter()
|
|
|
- .filter(|(x, _)| *x >= last as f64)
|
|
|
- .map(|(_, y)| *y / count)
|
|
|
- .sum();
|
|
|
- breaks_values.push((format!(">= {last}"), total));
|
|
|
-
|
|
|
- // Chromosome
|
|
|
- let tol = 25;
|
|
|
- let chromosome_path = format!("{prefix}_{contig}_chromosome.svg");
|
|
|
- info!("Saving graph: {chromosome_path}");
|
|
|
-
|
|
|
- let target_annotations: HashSet<CountAnnotation> = vec![
|
|
|
- CountAnnotation::MaskedLowMRD,
|
|
|
- CountAnnotation::MaskedQuality,
|
|
|
- ]
|
|
|
- .into_iter()
|
|
|
- .collect();
|
|
|
-
|
|
|
- let d: Vec<u32> = self
|
|
|
- .data
|
|
|
- .get(contig)
|
|
|
- .unwrap()
|
|
|
- .iter()
|
|
|
- .map(|c| {
|
|
|
- if c.annotation
|
|
|
- .iter()
|
|
|
- .any(|ann| target_annotations.contains(ann))
|
|
|
- {
|
|
|
- 10_000u32
|
|
|
- } else {
|
|
|
- c.n_reads
|
|
|
- }
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- let hm = self.counts_annotations(contig)?;
|
|
|
- let len = d.len();
|
|
|
- let mut masked: Vec<(String, f64)> = hm
|
|
|
- .iter()
|
|
|
- .map(|(k, v)| (format!("{:?}", k), *v as f64 / len as f64))
|
|
|
- .collect();
|
|
|
- masked.push(("Un masked".to_string(), n_final as f64 / len as f64));
|
|
|
-
|
|
|
- let under_6_rects: Vec<AdditionalRect> = ranges_under(&d, 5, tol)
|
|
|
- .iter()
|
|
|
- .filter(|(s, e)| e > s)
|
|
|
- .map(|(start, end)| AdditionalRect {
|
|
|
- start: *start as u32 * 1000,
|
|
|
- end: *end as u32 * 1000,
|
|
|
- color: String::from("red"),
|
|
|
- position: RectPosition::Below(1),
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- let over_6_rects: Vec<AdditionalRect> = ranges_between(&d, 6, 9999, tol)
|
|
|
- .iter()
|
|
|
- .filter(|(s, e)| e > s)
|
|
|
- .map(|(start, end)| AdditionalRect {
|
|
|
- start: *start as u32 * 1000,
|
|
|
- end: *end as u32 * 1000,
|
|
|
- color: String::from("green"),
|
|
|
- position: RectPosition::Below(2),
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- let masked_rec: Vec<AdditionalRect> = ranges_over(&d, 10000, tol)
|
|
|
- .iter()
|
|
|
- .filter(|(s, e)| e > s)
|
|
|
- .map(|(start, end)| AdditionalRect {
|
|
|
- start: *start as u32 * 1000,
|
|
|
- end: *end as u32 * 1000,
|
|
|
- color: String::from("grey"),
|
|
|
- position: RectPosition::Below(0),
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- let mut all = Vec::new();
|
|
|
- all.extend(under_6_rects);
|
|
|
- all.extend(over_6_rects);
|
|
|
- // all.extend(over15);
|
|
|
- all.extend(masked_rec);
|
|
|
-
|
|
|
- svg_chromosome(
|
|
|
- contig,
|
|
|
- 1000,
|
|
|
- 50,
|
|
|
- "/data/ref/hs1/cytoBandMapped.bed",
|
|
|
- &chromosome_path,
|
|
|
- &all,
|
|
|
- &Vec::new(),
|
|
|
- )
|
|
|
- .unwrap();
|
|
|
-
|
|
|
- let stats = CountsStats {
|
|
|
- sum,
|
|
|
- mean,
|
|
|
- std_dev,
|
|
|
- breaks_values,
|
|
|
- masked,
|
|
|
- };
|
|
|
-
|
|
|
- // Save stats
|
|
|
- let json_path = format!("{prefix}_{contig}_stats.json");
|
|
|
- info!("Saving stats into: {json_path}");
|
|
|
- let json = serde_json::to_string_pretty(&stats)?;
|
|
|
- let mut file = File::create(json_path)?;
|
|
|
- file.write_all(json.as_bytes())?;
|
|
|
-
|
|
|
- Ok(stats)
|
|
|
- }
|
|
|
-
|
|
|
- pub fn save_contigs(
|
|
|
- &mut self,
|
|
|
- contigs: &Vec<String>,
|
|
|
- prefix: &str,
|
|
|
- breaks: Vec<u32>,
|
|
|
- ) -> anyhow::Result<()> {
|
|
|
- let mut stats = Vec::new();
|
|
|
- let mut proportions = HashMap::new();
|
|
|
- for contig in contigs {
|
|
|
- let stat = self.save_contig(contig, prefix, breaks.clone())?;
|
|
|
- let un_masked: Vec<&(String, f64)> = stat.masked.iter().filter(|(s, _)| s == "Un masked").collect();
|
|
|
- let un_masked = un_masked.first().unwrap().1;
|
|
|
- let masked = 1.0 - un_masked;
|
|
|
- let mut props: Vec<(String, f64)> = stat.breaks_values.iter().map(|(s, v)| (s.to_string(), *v * un_masked)).collect();
|
|
|
- props.push(("masked".to_string(), masked));
|
|
|
- props.iter().for_each(|(s, v)| {
|
|
|
- proportions.entry(s.to_string()).or_insert(vec![]).push(*v);
|
|
|
- });
|
|
|
- stats.push(stat);
|
|
|
- }
|
|
|
-
|
|
|
- let mut plot = Plot::new();
|
|
|
- let layout = Layout::new().bar_mode(BarMode::Stack);
|
|
|
-
|
|
|
- for (k, v) in proportions {
|
|
|
- plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(k));
|
|
|
- }
|
|
|
- println!("{:?}", contigs);
|
|
|
- plot.set_layout(layout);
|
|
|
- plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
-
|
|
|
- Ok(())
|
|
|
- }
|
|
|
-
|
|
|
- pub fn counts_annotations(
|
|
|
- &self,
|
|
|
- contig: &str,
|
|
|
- ) -> anyhow::Result<HashMap<CountAnnotation, u64>> {
|
|
|
- if let Some(d) = self.data.get(contig) {
|
|
|
- let mut counts = HashMap::new();
|
|
|
- for c in d {
|
|
|
- for a in &c.annotation {
|
|
|
- *counts.entry(a.clone()).or_insert(0) += 1;
|
|
|
- }
|
|
|
- }
|
|
|
- Ok(counts)
|
|
|
- } else {
|
|
|
- anyhow::bail!("No {contig} in counts.")
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pub fn get(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
|
|
|
- if let Some(ccounts) = self.data.get(contig) {
|
|
|
- let target_annotations: HashSet<CountAnnotation> = vec![
|
|
|
- CountAnnotation::MaskedLowMRD,
|
|
|
- CountAnnotation::MaskedQuality,
|
|
|
- ]
|
|
|
- .into_iter()
|
|
|
- .collect();
|
|
|
-
|
|
|
- Ok(ccounts
|
|
|
- .iter()
|
|
|
- .filter(|count| {
|
|
|
- !count
|
|
|
- .annotation
|
|
|
- .iter()
|
|
|
- .any(|ann| target_annotations.contains(ann))
|
|
|
- })
|
|
|
- .map(|c| c.n_reads)
|
|
|
- .collect())
|
|
|
- } else {
|
|
|
- anyhow::bail!("No {contig} in counts.")
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pub fn mrd(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
|
|
|
- if let Some(ccounts) = self.mrd.get(contig) {
|
|
|
- Ok(ccounts.iter().map(|c| c.n_reads).collect())
|
|
|
- } else {
|
|
|
- anyhow::bail!("No {contig} in counts.")
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pub fn calculate_percentiles(
|
|
|
- &self,
|
|
|
- contig: &str,
|
|
|
- percentiles: &[f64],
|
|
|
- ) -> anyhow::Result<Vec<f64>> {
|
|
|
- if let Some(ccounts) = self.data.get(contig) {
|
|
|
- let mut n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
|
|
|
-
|
|
|
- n_reads.sort_unstable();
|
|
|
-
|
|
|
- let cdf = ND::new(n_reads.clone());
|
|
|
-
|
|
|
- // println!("CDF at 13: {:?}", cdf.cdf(13));
|
|
|
- println!("Percentile at 99: {:?}", cdf.percentile(99.0));
|
|
|
- println!("above 15X: {:?}", cdf.proportion_above(15));
|
|
|
- // println!("above 15.1X: {:?}", cdf.fitted_proportion_above(&15.1));
|
|
|
- println!("under 6X: {:?}", cdf.proportion_under(6));
|
|
|
-
|
|
|
- Ok(percentiles
|
|
|
- .iter()
|
|
|
- .map(|&p| {
|
|
|
- let index = (p * (n_reads.len() - 1) as f64).round() as usize;
|
|
|
- n_reads[index] as f64
|
|
|
- })
|
|
|
- .collect())
|
|
|
- } else {
|
|
|
- anyhow::bail!("No {contig} in counts.")
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pub fn nd_reads(&self, contig: &str) -> anyhow::Result<ND> {
|
|
|
- if let Some(ccounts) = self.data.get(contig) {
|
|
|
- Ok(ND::new(ccounts.iter().map(|c| c.n_reads).collect()))
|
|
|
- } else {
|
|
|
- anyhow::bail!("No {contig} in counts")
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pub fn distribution(&self, contig: &str) -> anyhow::Result<ND> {
|
|
|
- Ok(ND::new(self.get(contig)?))
|
|
|
- }
|
|
|
-
|
|
|
- pub fn save_stats(&self) -> anyhow::Result<()> {
|
|
|
- Ok(())
|
|
|
- }
|
|
|
-
|
|
|
- pub fn save_global_proportions_graph(
|
|
|
- &self,
|
|
|
- path: &str,
|
|
|
- contigs: &Vec<String>,
|
|
|
- breaks: Vec<u32>,
|
|
|
- ) {
|
|
|
- let mut breaks_str = Vec::new();
|
|
|
- for (i, b) in breaks.iter().enumerate() {
|
|
|
- if i == 0 {
|
|
|
- breaks_str.push(format!("< {b}"));
|
|
|
- } else {
|
|
|
- let last = breaks[i - 1];
|
|
|
- breaks_str.push(format!("[{last} - {b}["))
|
|
|
- }
|
|
|
- }
|
|
|
- breaks_str.push(format!(">= {}", breaks.last().unwrap()));
|
|
|
-
|
|
|
- let mut proportions = Vec::new();
|
|
|
-
|
|
|
- for contig in contigs.iter() {
|
|
|
- let d = self.get(contig).unwrap();
|
|
|
- let nd = ND::new(d);
|
|
|
- proportions.push(nd.frequencies(&breaks));
|
|
|
- }
|
|
|
-
|
|
|
- let mut plot = Plot::new();
|
|
|
- let layout = Layout::new().bar_mode(BarMode::Stack);
|
|
|
- for (i, v) in transpose(proportions).iter().enumerate() {
|
|
|
- plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(&breaks_str[i]));
|
|
|
- }
|
|
|
- println!("{:?}", contigs);
|
|
|
- plot.set_layout(layout);
|
|
|
- plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
- }
|
|
|
-
|
|
|
- pub fn save_global_distribution_graph(&self, path: &str, contigs: &Vec<String>) {
|
|
|
- let d: Vec<u32> = contigs.iter().flat_map(|c| self.get(c).unwrap()).collect();
|
|
|
- let mut data_sorted = d.clone();
|
|
|
- data_sorted.sort_unstable();
|
|
|
-
|
|
|
- let nd = ND::new(d.clone());
|
|
|
- let mut plot = Plot::new();
|
|
|
-
|
|
|
- let bar_x: Vec<u32> = (1..=nd.percentile(99.0).unwrap()).collect();
|
|
|
- let colors: Vec<plotly::color::Rgb> = bar_x
|
|
|
- .iter()
|
|
|
- .map(|&x| {
|
|
|
- if x <= 2 {
|
|
|
- plotly::color::Rgb::new(193, 18, 31)
|
|
|
- } else if x >= 15 {
|
|
|
- plotly::color::Rgb::new(138, 201, 38)
|
|
|
- } else if x <= 6 {
|
|
|
- plotly::color::Rgb::new(243, 114, 44)
|
|
|
- } else {
|
|
|
- plotly::color::Rgb::new(255, 202, 58)
|
|
|
- }
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- let data: Vec<u32> = d.iter().filter(|x| **x >= 1).copied().collect();
|
|
|
-
|
|
|
- // frequencies
|
|
|
- let mut frequencies = HashMap::new();
|
|
|
- for &value in &data {
|
|
|
- *frequencies.entry(value).or_insert(0) += 1;
|
|
|
- }
|
|
|
-
|
|
|
- let bars = Bar::new(
|
|
|
- bar_x.clone(),
|
|
|
- bar_x
|
|
|
- .iter()
|
|
|
- .map(|x| *frequencies.get(x).unwrap_or(&0) as f64 / data.len() as f64)
|
|
|
- .collect(),
|
|
|
- )
|
|
|
- .show_legend(false)
|
|
|
- .marker(Marker::new().color_array(colors));
|
|
|
-
|
|
|
- plot.add_trace(bars);
|
|
|
-
|
|
|
- let data_x = generate_range(0.0, nd.percentile(99.0).unwrap().into(), 100);
|
|
|
- let data_y: Vec<f64> = data_x.iter().map(|x| nd.fitted_normal.pdf(x)).collect();
|
|
|
- let trace = Scatter::new(data_x.clone(), data_y).name("Gaussian");
|
|
|
- plot.add_trace(trace);
|
|
|
-
|
|
|
- // Gamma
|
|
|
- let data: Vec<f64> = d.iter().map(|x| *x as f64).collect();
|
|
|
-
|
|
|
- let sum: f64 = data.iter().sum();
|
|
|
- let mean = (&data).mean();
|
|
|
- let variance = (&data).variance();
|
|
|
- let count = d.len() as f64;
|
|
|
- let shape = mean * mean / variance;
|
|
|
- let rate = mean / variance;
|
|
|
-
|
|
|
- let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
|
|
|
- let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
|
|
|
- let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
|
|
|
- plot.add_trace(trace);
|
|
|
-
|
|
|
- // Poisson
|
|
|
- let lambda = sum / count;
|
|
|
- let poisson = statrs::distribution::Poisson::new(lambda).unwrap();
|
|
|
- let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
|
|
|
- let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
|
|
|
- plot.add_trace(trace);
|
|
|
-
|
|
|
- plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
- println!("> 15x: {:?}", nd.frequencies(&vec![1, 6, 15]));
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-pub struct ND {
|
|
|
- pub data: Vec<u32>,
|
|
|
- pub distribution: BTreeMap<u32, f64>,
|
|
|
- pub total_count: usize,
|
|
|
- pub frequency: HashMap<u32, usize>,
|
|
|
- pub fitted_normal: UvNormal,
|
|
|
-}
|
|
|
-
|
|
|
-use rstat::{fitting::MLE, normal::UvNormal, ContinuousDistribution};
|
|
|
-
|
|
|
-impl ND {
|
|
|
- fn new(mut data: Vec<u32>) -> Self {
|
|
|
- data.sort_unstable();
|
|
|
- let n = data.len();
|
|
|
- info!("n values {n}");
|
|
|
- let mut distribution = BTreeMap::new();
|
|
|
-
|
|
|
- let mut frequency = HashMap::new();
|
|
|
- for &value in &data {
|
|
|
- *frequency.entry(value).or_insert(0) += 1;
|
|
|
- }
|
|
|
-
|
|
|
- let mut cumulative_count = 0;
|
|
|
- for (&value, &count) in &frequency {
|
|
|
- cumulative_count += count;
|
|
|
- let cumulative_prob = cumulative_count as f64 / n as f64;
|
|
|
- distribution.insert(value, cumulative_prob);
|
|
|
- }
|
|
|
-
|
|
|
- // Fit normal distribution
|
|
|
- let fitted_normal = rstat::univariate::normal::Normal::fit_mle(
|
|
|
- &data
|
|
|
- .iter()
|
|
|
- .filter(|x| *x >= &1u32)
|
|
|
- .map(|x| *x as f64)
|
|
|
- .collect::<Vec<f64>>(),
|
|
|
- )
|
|
|
- .unwrap();
|
|
|
-
|
|
|
- Self {
|
|
|
- data,
|
|
|
- distribution,
|
|
|
- frequency,
|
|
|
- total_count: n,
|
|
|
- fitted_normal,
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pub fn frequency(&self, x: u32) -> usize {
|
|
|
- *self.frequency.get(&x).unwrap_or(&0)
|
|
|
- }
|
|
|
-
|
|
|
- pub fn frequencies(&self, breaks: &Vec<u32>) -> Vec<f64> {
|
|
|
- let mut last_prop_under = 0.0;
|
|
|
- let mut res = Vec::new();
|
|
|
- for brk in breaks {
|
|
|
- let v = self.proportion_under(*brk) - last_prop_under;
|
|
|
- res.push(v);
|
|
|
- last_prop_under += v;
|
|
|
- }
|
|
|
- let per99 = self.percentile(99.0).unwrap();
|
|
|
- res.push(self.proportion_under(per99) - last_prop_under);
|
|
|
- res
|
|
|
- }
|
|
|
-
|
|
|
- pub fn percentile(&self, percentile: f64) -> Option<u32> {
|
|
|
- if !(0.0..=100.0).contains(&percentile) {
|
|
|
- return None;
|
|
|
- }
|
|
|
-
|
|
|
- let index = (percentile / 100.0 * (self.total_count - 1) as f64).round() as usize;
|
|
|
- self.data.get(index).cloned()
|
|
|
- }
|
|
|
-
|
|
|
- pub fn proportion_under(&self, x: u32) -> f64 {
|
|
|
- let count = self
|
|
|
- .frequency
|
|
|
- .iter()
|
|
|
- .filter(|(&value, _)| value < x)
|
|
|
- .map(|(_, &count)| count)
|
|
|
- .sum::<usize>();
|
|
|
- count as f64 / self.total_count as f64
|
|
|
- }
|
|
|
-
|
|
|
- pub fn proportion_above(&self, x: u32) -> f64 {
|
|
|
- let count = self
|
|
|
- .frequency
|
|
|
- .iter()
|
|
|
- .filter(|(&value, _)| value > x)
|
|
|
- .map(|(_, &count)| count)
|
|
|
- .sum::<usize>();
|
|
|
- count as f64 / self.total_count as f64
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
|
|
|
- if steps == 0 {
|
|
|
- return vec![];
|
|
|
- }
|
|
|
- if steps == 1 {
|
|
|
- return vec![start];
|
|
|
- }
|
|
|
-
|
|
|
- let step_size = (end - start) / (steps - 1) as f64;
|
|
|
- (0..steps).map(|i| start + i as f64 * step_size).collect()
|
|
|
-}
|
|
|
-
|
|
|
-use rayon::prelude::*;
|
|
|
-
|
|
|
-pub fn ranges_under(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
|
|
|
- get_ranges_parallel(vec, x, tolerance, |val, threshold| val <= threshold)
|
|
|
-}
|
|
|
-
|
|
|
-pub fn ranges_over(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
|
|
|
- get_ranges_parallel(vec, x, tolerance, |val, threshold| val >= threshold)
|
|
|
-}
|
|
|
-
|
|
|
-pub fn ranges_between(
|
|
|
- vec: &[u32],
|
|
|
- lower: u32,
|
|
|
- upper: u32,
|
|
|
- tolerance: usize,
|
|
|
-) -> Vec<(usize, usize)> {
|
|
|
- get_ranges_parallel(vec, (lower, upper), tolerance, |val, (l, u)| {
|
|
|
- val >= l && val <= u
|
|
|
- })
|
|
|
-}
|
|
|
-
|
|
|
-pub fn get_ranges_parallel<T, F>(
|
|
|
- vec: &[u32],
|
|
|
- threshold: T,
|
|
|
- tolerance: usize,
|
|
|
- compare: F,
|
|
|
-) -> Vec<(usize, usize)>
|
|
|
-where
|
|
|
- F: Fn(u32, T) -> bool + Sync,
|
|
|
- T: Copy + Sync,
|
|
|
-{
|
|
|
- if vec.is_empty() {
|
|
|
- return Vec::new();
|
|
|
- }
|
|
|
-
|
|
|
- let chunk_size = (vec.len() / rayon::current_num_threads()).max(1);
|
|
|
- vec.par_chunks(chunk_size)
|
|
|
- .enumerate()
|
|
|
- .flat_map(|(chunk_index, chunk)| {
|
|
|
- let mut local_ranges = Vec::new();
|
|
|
- let mut current_range: Option<(usize, usize)> = None;
|
|
|
- let offset = chunk_index * chunk_size;
|
|
|
-
|
|
|
- for (i, &val) in chunk.iter().enumerate() {
|
|
|
- let global_index = offset + i;
|
|
|
- if compare(val, threshold) {
|
|
|
- match current_range {
|
|
|
- Some((start, end)) if global_index <= end + tolerance + 1 => {
|
|
|
- current_range = Some((start, global_index));
|
|
|
- }
|
|
|
- Some((start, end)) => {
|
|
|
- local_ranges.push((start, end));
|
|
|
- current_range = Some((global_index, global_index));
|
|
|
- }
|
|
|
- None => {
|
|
|
- current_range = Some((global_index, global_index));
|
|
|
- }
|
|
|
- }
|
|
|
- } else if let Some((start, end)) = current_range {
|
|
|
- if global_index > end + tolerance + 1 {
|
|
|
- local_ranges.push((start, end));
|
|
|
- current_range = None;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if let Some(range) = current_range {
|
|
|
- local_ranges.push(range);
|
|
|
- }
|
|
|
-
|
|
|
- local_ranges
|
|
|
- })
|
|
|
- .collect()
|
|
|
-}
|
|
|
-
|
|
|
-pub fn transpose(v: Vec<Vec<f64>>) -> Vec<Vec<f64>> {
|
|
|
- assert!(!v.is_empty());
|
|
|
- let len = v[0].len();
|
|
|
- let mut result = vec![Vec::with_capacity(v.len()); len];
|
|
|
-
|
|
|
- for row in v {
|
|
|
- for (i, val) in row.into_iter().enumerate() {
|
|
|
- result[i].push(val);
|
|
|
- }
|
|
|
- }
|
|
|
- result
|
|
|
-}
|
|
|
-
|
|
|
-#[derive(Debug, Serialize)]
|
|
|
-pub struct CountsStats {
|
|
|
- pub sum: f64,
|
|
|
- pub mean: f64,
|
|
|
- pub std_dev: f64,
|
|
|
- pub breaks_values: Vec<(String, f64)>,
|
|
|
- pub masked: Vec<(String, f64)>,
|
|
|
-}
|
|
|
-
|
|
|
-// pub fn save_barplota
|
|
|
-// data: Vec<f64>,
|
|
|
-// data_x: Vec<f64>,
|
|
|
-// data_y: Vec<f64>,
|
|
|
-// path: &str,
|
|
|
-// ) -> anyhow::Result<CountsStats> {
|
|
|
-// let mut plot = Plot::new();
|
|
|
-//
|
|
|
-// let colors: Vec<plotly::color::Rgb> = data_x
|
|
|
-// .iter()
|
|
|
-// .map(|&x| {
|
|
|
-// if x <= 2.0 {
|
|
|
-// plotly::color::Rgb::new(193, 18, 31)
|
|
|
-// } else if x >= 15.0 {
|
|
|
-// plotly::color::Rgb::new(138, 201, 38)
|
|
|
-// } else if x <= 6.0 {
|
|
|
-// plotly::color::Rgb::new(243, 114, 44)
|
|
|
-// } else {
|
|
|
-// plotly::color::Rgb::new(255, 202, 58)
|
|
|
-// }
|
|
|
-// })
|
|
|
-// .collect();
|
|
|
-//
|
|
|
-// let bars = Bar::new(data_x.clone(), data_y.clone())
|
|
|
-// .show_legend(false)
|
|
|
-// .marker(Marker::new().color_array(colors));
|
|
|
-//
|
|
|
-// plot.add_trace(bars);
|
|
|
-//
|
|
|
-// let sum: f64 = data.iter().sum();
|
|
|
-// let mean = (&data).mean();
|
|
|
-// let count = data.len() as f64;
|
|
|
-// let std_dev = (&data).std_dev();
|
|
|
-// println!("mean {mean}");
|
|
|
-//
|
|
|
-// // Normal
|
|
|
-// let normal = statrs::distribution::Normal::new(mean, std_dev)?;
|
|
|
-// let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
|
|
|
-// let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
|
|
|
-// plot.add_trace(trace);
|
|
|
-//
|
|
|
-// // // Gamma
|
|
|
-// // let shape = mean * mean / variance;
|
|
|
-// // let rate = mean / variance;
|
|
|
-// //
|
|
|
-// // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
|
|
|
-// // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
|
|
|
-// // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
|
|
|
-// // plot.add_trace(trace);
|
|
|
-//
|
|
|
-// // Poisson
|
|
|
-// let lambda = sum / count;
|
|
|
-// let poisson = statrs::distribution::Poisson::new(lambda)?;
|
|
|
-// let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
|
|
|
-// let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
|
|
|
-// plot.add_trace(trace);
|
|
|
-//
|
|
|
-// plot.use_local_plotly();
|
|
|
-// plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
-// Ok(CountsStats { sum, mean, std_dev })
|
|
|
-// }
|