|
|
@@ -1,14 +1,23 @@
|
|
|
+use anyhow::Context;
|
|
|
use log::{info, warn};
|
|
|
+use ordered_float::Float;
|
|
|
+use pandora_lib_graph::cytoband::{svg_chromosome, AdditionalRect, RectPosition};
|
|
|
+use plotly::{common::Marker, layout::BarMode, Bar, Layout, Plot, Scatter};
|
|
|
+use rand::{thread_rng, Rng};
|
|
|
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
|
|
use serde::{
|
|
|
de::{self, Visitor},
|
|
|
- Deserialize, Deserializer,
|
|
|
+ Deserialize, Deserializer, Serialize,
|
|
|
+};
|
|
|
+use statrs::{
|
|
|
+ distribution::{Continuous, Discrete},
|
|
|
+ statistics::Statistics,
|
|
|
};
|
|
|
use std::{
|
|
|
- collections::{BTreeMap, HashMap},
|
|
|
- fmt,
|
|
|
+ collections::{BTreeMap, HashMap, HashSet},
|
|
|
+ f64, fmt,
|
|
|
fs::File,
|
|
|
- io::{BufRead, BufReader},
|
|
|
+ io::{BufRead, BufReader, Write},
|
|
|
str::FromStr,
|
|
|
};
|
|
|
|
|
|
@@ -21,6 +30,7 @@ pub struct Count {
|
|
|
pub sa_outlier: bool,
|
|
|
pub frac_se: f32,
|
|
|
pub se_outlier: bool,
|
|
|
+ pub annotation: Vec<CountAnnotation>,
|
|
|
}
|
|
|
|
|
|
impl fmt::Display for Count {
|
|
|
@@ -52,6 +62,12 @@ impl fmt::Display for CountRange {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+#[derive(Debug, Clone, Hash, Eq, PartialEq)]
|
|
|
+pub enum CountAnnotation {
|
|
|
+ MaskedLowMRD,
|
|
|
+ MaskedQuality,
|
|
|
+}
|
|
|
+
|
|
|
impl<'de> Deserialize<'de> for Count {
|
|
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
|
|
where
|
|
|
@@ -92,6 +108,7 @@ impl<'de> Deserialize<'de> for Count {
|
|
|
sa_outlier: bool::from_str(parts[4]).map_err(E::custom)?,
|
|
|
frac_se: f32::from_str(parts[5]).map_err(E::custom)?,
|
|
|
se_outlier: bool::from_str(parts[6]).map_err(E::custom)?,
|
|
|
+ annotation: Vec::new(),
|
|
|
})
|
|
|
}
|
|
|
}
|
|
|
@@ -128,10 +145,35 @@ fn escape_control_chars(s: &str) -> String {
|
|
|
#[derive(Debug)]
|
|
|
pub struct Counts {
|
|
|
pub data: HashMap<String, Vec<Count>>,
|
|
|
+ pub mrd: HashMap<String, Vec<Count>>,
|
|
|
}
|
|
|
|
|
|
impl Counts {
|
|
|
- pub fn from_files(paths: Vec<&str>) -> Self {
|
|
|
+ pub fn from_files(paths: Vec<String>) -> Self {
|
|
|
+ let counts: Vec<Vec<Count>> = paths
|
|
|
+ .par_iter()
|
|
|
+ .map(|path| match read_counts_from_file(path) {
|
|
|
+ Ok(c) => c,
|
|
|
+ Err(e) => {
|
|
|
+ warn!("Couldnt load {path}: {e}");
|
|
|
+ Vec::new()
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .filter(|v| !v.is_empty())
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let mut data = HashMap::new();
|
|
|
+ for count in counts {
|
|
|
+ let contig = count.first().unwrap().position.contig.clone();
|
|
|
+ data.insert(contig, count);
|
|
|
+ }
|
|
|
+ Counts {
|
|
|
+ data,
|
|
|
+ mrd: HashMap::new(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn mrd_from_files(&mut self, paths: Vec<String>) {
|
|
|
let counts: Vec<Vec<Count>> = paths
|
|
|
.par_iter()
|
|
|
.map(|path| match read_counts_from_file(path) {
|
|
|
@@ -149,11 +191,325 @@ impl Counts {
|
|
|
let contig = count.first().unwrap().position.contig.clone();
|
|
|
data.insert(contig, count);
|
|
|
}
|
|
|
- Counts { data }
|
|
|
+ self.mrd = data;
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn mask_low_mrd(&mut self, contig: &str, min_reads: u32) -> anyhow::Result<()> {
|
|
|
+ if let (Some(mrd), Some(diag)) = (self.mrd.get(contig), self.data.get_mut(contig)) {
|
|
|
+ for (m, d) in mrd.iter().zip(diag) {
|
|
|
+ if m.n_reads < min_reads {
|
|
|
+ d.annotation.push(CountAnnotation::MaskedLowMRD);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in both mrd and diag.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn mask_low_quality(&mut self, contig: &str, max_ratio: f64) -> anyhow::Result<()> {
|
|
|
+ if let Some(diag) = self.data.get_mut(contig) {
|
|
|
+ for d in diag.iter_mut() {
|
|
|
+ if (d.n_low_mapq as f64 / (d.n_reads + d.n_low_mapq) as f64) > max_ratio {
|
|
|
+ d.annotation.push(CountAnnotation::MaskedQuality);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in both mrd and diag.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn frequencies(&self, contig: &str) -> anyhow::Result<Vec<(f64, f64)>> {
|
|
|
+ let data = self.get(contig)?;
|
|
|
+
|
|
|
+ let mut frequencies = HashMap::new();
|
|
|
+ for count in data.iter() {
|
|
|
+ *frequencies.entry(*count).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut frequencies: Vec<(u32, f64)> =
|
|
|
+ frequencies.iter().map(|(x, y)| (*x, *y as f64)).collect();
|
|
|
+ frequencies.sort_by_key(|v| v.0);
|
|
|
+ Ok(frequencies.iter().map(|(x, y)| (*x as f64, *y)).collect())
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn percentile(&self, contig: &str, percentile: f64) -> anyhow::Result<u32> {
|
|
|
+ let mut data = self.get(contig)?;
|
|
|
+ data.sort_unstable();
|
|
|
+ let total_count = data.len();
|
|
|
+ let index = |percentile: f64| -> usize {
|
|
|
+ (percentile / 100.0 * (total_count - 1) as f64).round() as usize
|
|
|
+ };
|
|
|
+
|
|
|
+ Ok(*data.get(index(percentile)).context("Error in percentile")?)
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_contig(&mut self, contig: &str, prefix: &str, breaks: Vec<u32>) -> anyhow::Result<()> {
|
|
|
+ self.mask_low_mrd(contig, 6)?;
|
|
|
+ self.mask_low_quality(contig, 0.1)?;
|
|
|
+
|
|
|
+ let data: Vec<f64> = self.get(contig)?.iter().map(|v| *v as f64).collect();
|
|
|
+ let n_final = data.len();
|
|
|
+
|
|
|
+ let frequencies = self.frequencies(contig)?;
|
|
|
+ let percentile_99 = self.percentile(contig, 99.0)?;
|
|
|
+
|
|
|
+ let mut data_x = Vec::new();
|
|
|
+ let mut data_y = Vec::new();
|
|
|
+ frequencies.iter().for_each(|(x, y)| {
|
|
|
+ if *x <= percentile_99 as f64 {
|
|
|
+ data_x.push(*x);
|
|
|
+ data_y.push(*y / n_final as f64);
|
|
|
+ }
|
|
|
+ });
|
|
|
+
|
|
|
+ // Distribution plot
|
|
|
+ let distribution_path = format!("{prefix}_{contig}_distrib.svg");
|
|
|
+ info!("Saving graph: {distribution_path}");
|
|
|
+ let mut plot = Plot::new();
|
|
|
+ let colors: Vec<plotly::color::Rgb> = data_x
|
|
|
+ .iter()
|
|
|
+ .map(|&x| {
|
|
|
+ if x < 2.0 {
|
|
|
+ plotly::color::Rgb::new(193, 18, 31)
|
|
|
+ } else if x >= 15.0 {
|
|
|
+ plotly::color::Rgb::new(138, 201, 38)
|
|
|
+ } else if x < 6.0 {
|
|
|
+ plotly::color::Rgb::new(243, 114, 44)
|
|
|
+ } else {
|
|
|
+ plotly::color::Rgb::new(255, 202, 58)
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let bars = Bar::new(data_x.clone(), data_y.clone())
|
|
|
+ .show_legend(false)
|
|
|
+ .marker(Marker::new().color_array(colors));
|
|
|
+
|
|
|
+ plot.add_trace(bars);
|
|
|
+
|
|
|
+ let sum: f64 = data.iter().sum();
|
|
|
+ let mean = (&data).mean();
|
|
|
+ let count = data.len() as f64;
|
|
|
+ let std_dev = (&data).std_dev();
|
|
|
+
|
|
|
+ // Normal
|
|
|
+ let normal = statrs::distribution::Normal::new(mean, std_dev)?;
|
|
|
+ let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ // // Gamma
|
|
|
+ // let shape = mean * mean / variance;
|
|
|
+ // let rate = mean / variance;
|
|
|
+ //
|
|
|
+ // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
|
|
|
+ // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
|
|
|
+ // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
|
|
|
+ // plot.add_trace(trace);
|
|
|
+
|
|
|
+ // Poisson
|
|
|
+ let lambda = sum / count;
|
|
|
+ let poisson = statrs::distribution::Poisson::new(lambda)?;
|
|
|
+ let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ plot.use_local_plotly();
|
|
|
+ plot.write_image(distribution_path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
+
|
|
|
+ // Fractions
|
|
|
+ let mut breaks_values = Vec::new();
|
|
|
+ for (i, b) in breaks.iter().enumerate() {
|
|
|
+ if i == 0 {
|
|
|
+ let total: f64 = frequencies
|
|
|
+ .iter()
|
|
|
+ .filter(|(x, _)| *x < *b as f64)
|
|
|
+ .map(|(_, y)| *y / count)
|
|
|
+ .sum();
|
|
|
+ breaks_values.push((format!("< {b}"), total));
|
|
|
+ } else {
|
|
|
+ let last = breaks[i - 1];
|
|
|
+ let total: f64 = frequencies
|
|
|
+ .iter()
|
|
|
+ .filter(|(x, _)| *x < *b as f64 && *x >= last as f64)
|
|
|
+ .map(|(_, y)| *y / count)
|
|
|
+ .sum();
|
|
|
+
|
|
|
+ breaks_values.push((format!("[{last} - {b}["), total));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let last = *breaks.last().unwrap();
|
|
|
+ let total: f64 = frequencies
|
|
|
+ .iter()
|
|
|
+ .filter(|(x, _)| *x >= last as f64)
|
|
|
+ .map(|(_, y)| *y / count)
|
|
|
+ .sum();
|
|
|
+ breaks_values.push((format!(">= {last}"), total));
|
|
|
+
|
|
|
+ // Chromosome
|
|
|
+ let tol = 25;
|
|
|
+ let chromosome_path = format!("{prefix}_{contig}_chromosome.svg");
|
|
|
+ info!("Saving graph: {chromosome_path}");
|
|
|
+
|
|
|
+ let target_annotations: HashSet<CountAnnotation> = vec![
|
|
|
+ CountAnnotation::MaskedLowMRD,
|
|
|
+ CountAnnotation::MaskedQuality,
|
|
|
+ ]
|
|
|
+ .into_iter()
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let d: Vec<u32> = self
|
|
|
+ .data
|
|
|
+ .get(contig)
|
|
|
+ .unwrap()
|
|
|
+ .iter()
|
|
|
+ .map(|c| {
|
|
|
+ if c.annotation
|
|
|
+ .iter()
|
|
|
+ .any(|ann| target_annotations.contains(ann))
|
|
|
+ {
|
|
|
+ 10_000u32
|
|
|
+ } else {
|
|
|
+ c.n_reads
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let hm = self.counts_annotations(contig)?;
|
|
|
+
|
|
|
+ let len = d.len();
|
|
|
+ let mut masked: Vec<(String, f64)> = hm
|
|
|
+ .iter()
|
|
|
+ .map(|(k, v)| (format!("{:?}", k), *v as f64 / len as f64))
|
|
|
+ .collect();
|
|
|
+ masked.push(("Un masked".to_string(), n_final as f64 / len as f64));
|
|
|
+
|
|
|
+ let under_6_rects: Vec<AdditionalRect> = ranges_under(&d, 5, tol)
|
|
|
+ .iter()
|
|
|
+ .filter(|(s, e)| e > s)
|
|
|
+ .map(|(start, end)| AdditionalRect {
|
|
|
+ start: *start as u32 * 1000,
|
|
|
+ end: *end as u32 * 1000,
|
|
|
+ color: String::from("red"),
|
|
|
+ position: RectPosition::Below(1),
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let over_6_rects: Vec<AdditionalRect> = ranges_between(&d, 6, 9999, tol)
|
|
|
+ .iter()
|
|
|
+ .filter(|(s, e)| e > s)
|
|
|
+ .map(|(start, end)| AdditionalRect {
|
|
|
+ start: *start as u32 * 1000,
|
|
|
+ end: *end as u32 * 1000,
|
|
|
+ color: String::from("green"),
|
|
|
+ position: RectPosition::Below(2),
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let masked_rec: Vec<AdditionalRect> = ranges_over(&d, 10000, tol)
|
|
|
+ .iter()
|
|
|
+ .filter(|(s, e)| e > s)
|
|
|
+ .map(|(start, end)| AdditionalRect {
|
|
|
+ start: *start as u32 * 1000,
|
|
|
+ end: *end as u32 * 1000,
|
|
|
+ color: String::from("grey"),
|
|
|
+ position: RectPosition::Below(0),
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let mut all = Vec::new();
|
|
|
+ all.extend(under_6_rects);
|
|
|
+ all.extend(over_6_rects);
|
|
|
+ // all.extend(over15);
|
|
|
+ all.extend(masked_rec);
|
|
|
+
|
|
|
+ svg_chromosome(
|
|
|
+ contig,
|
|
|
+ 1000,
|
|
|
+ 50,
|
|
|
+ "/data/ref/hs1/cytoBandMapped.bed",
|
|
|
+ &chromosome_path,
|
|
|
+ &all,
|
|
|
+ &Vec::new(),
|
|
|
+ )
|
|
|
+ .unwrap();
|
|
|
+
|
|
|
+ let stats = CountsStats {
|
|
|
+ sum,
|
|
|
+ mean,
|
|
|
+ std_dev,
|
|
|
+ breaks_values,
|
|
|
+ masked,
|
|
|
+ };
|
|
|
+
|
|
|
+ // Save stats
|
|
|
+ let json_path = format!("{prefix}_{contig}_stats.json");
|
|
|
+ info!("Saving stats into: {json_path}");
|
|
|
+ let json = serde_json::to_string_pretty(&stats)?;
|
|
|
+ let mut file = File::create(json_path)?;
|
|
|
+ file.write_all(json.as_bytes())?;
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_contigs(
|
|
|
+ &mut self,
|
|
|
+ contigs: &Vec<String>,
|
|
|
+ prefix: &str,
|
|
|
+ breaks: Vec<u32>,
|
|
|
+ ) -> anyhow::Result<()> {
|
|
|
+ for contig in contigs {
|
|
|
+ self.save_contig(contig, prefix, breaks.clone())?;
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn counts_annotations(
|
|
|
+ &self,
|
|
|
+ contig: &str,
|
|
|
+ ) -> anyhow::Result<HashMap<CountAnnotation, u64>> {
|
|
|
+ if let Some(d) = self.data.get(contig) {
|
|
|
+ let mut counts = HashMap::new();
|
|
|
+ for c in d {
|
|
|
+ for a in &c.annotation {
|
|
|
+ *counts.entry(a.clone()).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(counts)
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts.")
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
pub fn get(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
|
|
|
if let Some(ccounts) = self.data.get(contig) {
|
|
|
+ let target_annotations: HashSet<CountAnnotation> = vec![
|
|
|
+ CountAnnotation::MaskedLowMRD,
|
|
|
+ CountAnnotation::MaskedQuality,
|
|
|
+ ]
|
|
|
+ .into_iter()
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ Ok(ccounts
|
|
|
+ .iter()
|
|
|
+ .filter(|count| {
|
|
|
+ !count
|
|
|
+ .annotation
|
|
|
+ .iter()
|
|
|
+ .any(|ann| target_annotations.contains(ann))
|
|
|
+ })
|
|
|
+ .map(|c| c.n_reads)
|
|
|
+ .collect())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn mrd(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
|
|
|
+ if let Some(ccounts) = self.mrd.get(contig) {
|
|
|
Ok(ccounts.iter().map(|c| c.n_reads).collect())
|
|
|
} else {
|
|
|
anyhow::bail!("No {contig} in counts.")
|
|
|
@@ -172,10 +528,10 @@ impl Counts {
|
|
|
|
|
|
let cdf = ND::new(n_reads.clone());
|
|
|
|
|
|
- println!("CDF at 13: {:?}", cdf.cdf(13));
|
|
|
+ // println!("CDF at 13: {:?}", cdf.cdf(13));
|
|
|
println!("Percentile at 99: {:?}", cdf.percentile(99.0));
|
|
|
println!("above 15X: {:?}", cdf.proportion_above(15));
|
|
|
- println!("above 15.1X: {:?}", cdf.fitted_proportion_above(15.1));
|
|
|
+ // println!("above 15.1X: {:?}", cdf.fitted_proportion_above(&15.1));
|
|
|
println!("under 6X: {:?}", cdf.proportion_under(6));
|
|
|
|
|
|
Ok(percentiles
|
|
|
@@ -197,29 +553,146 @@ impl Counts {
|
|
|
anyhow::bail!("No {contig} in counts")
|
|
|
}
|
|
|
}
|
|
|
-}
|
|
|
|
|
|
-use statrs::{
|
|
|
- distribution::{ContinuousCDF, Normal},
|
|
|
- statistics::Distribution,
|
|
|
-};
|
|
|
+ pub fn distribution(&self, contig: &str) -> anyhow::Result<ND> {
|
|
|
+ Ok(ND::new(self.get(contig)?))
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_stats(&self) -> anyhow::Result<()> {
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_global_proportions_graph(
|
|
|
+ &self,
|
|
|
+ path: &str,
|
|
|
+ contigs: &Vec<String>,
|
|
|
+ breaks: Vec<u32>,
|
|
|
+ ) {
|
|
|
+ let mut breaks_str = Vec::new();
|
|
|
+ for (i, b) in breaks.iter().enumerate() {
|
|
|
+ if i == 0 {
|
|
|
+ breaks_str.push(format!("< {b}"));
|
|
|
+ } else {
|
|
|
+ let last = breaks[i - 1];
|
|
|
+ breaks_str.push(format!("[{last} - {b}["))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ breaks_str.push(format!(">= {}", breaks.last().unwrap()));
|
|
|
+
|
|
|
+ let mut proportions = Vec::new();
|
|
|
+
|
|
|
+ for contig in contigs.iter() {
|
|
|
+ let d = self.get(contig).unwrap();
|
|
|
+ let nd = ND::new(d);
|
|
|
+ proportions.push(nd.frequencies(&breaks));
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut plot = Plot::new();
|
|
|
+ let layout = Layout::new().bar_mode(BarMode::Stack);
|
|
|
+ for (i, v) in transpose(proportions).iter().enumerate() {
|
|
|
+ plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(&breaks_str[i]));
|
|
|
+ }
|
|
|
+ println!("{:?}", contigs);
|
|
|
+ plot.set_layout(layout);
|
|
|
+ plot.use_local_plotly();
|
|
|
+ plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn save_global_distribution_graph(&self, path: &str, contigs: &Vec<String>) {
|
|
|
+ let d: Vec<u32> = contigs.iter().flat_map(|c| self.get(c).unwrap()).collect();
|
|
|
+ let mut data_sorted = d.clone();
|
|
|
+ data_sorted.sort_unstable();
|
|
|
+
|
|
|
+ let nd = ND::new(d.clone());
|
|
|
+ let mut plot = Plot::new();
|
|
|
+
|
|
|
+ let bar_x: Vec<u32> = (1..=nd.percentile(99.0).unwrap()).collect();
|
|
|
+ let colors: Vec<plotly::color::Rgb> = bar_x
|
|
|
+ .iter()
|
|
|
+ .map(|&x| {
|
|
|
+ if x <= 2 {
|
|
|
+ plotly::color::Rgb::new(193, 18, 31)
|
|
|
+ } else if x >= 15 {
|
|
|
+ plotly::color::Rgb::new(138, 201, 38)
|
|
|
+ } else if x <= 6 {
|
|
|
+ plotly::color::Rgb::new(243, 114, 44)
|
|
|
+ } else {
|
|
|
+ plotly::color::Rgb::new(255, 202, 58)
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let data: Vec<u32> = d.iter().filter(|x| **x >= 1).copied().collect();
|
|
|
+
|
|
|
+ // frequencies
|
|
|
+ let mut frequencies = HashMap::new();
|
|
|
+ for &value in &data {
|
|
|
+ *frequencies.entry(value).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ let bars = Bar::new(
|
|
|
+ bar_x.clone(),
|
|
|
+ bar_x
|
|
|
+ .iter()
|
|
|
+ .map(|x| *frequencies.get(x).unwrap_or(&0) as f64 / data.len() as f64)
|
|
|
+ .collect(),
|
|
|
+ )
|
|
|
+ .show_legend(false)
|
|
|
+ .marker(Marker::new().color_array(colors));
|
|
|
+
|
|
|
+ plot.add_trace(bars);
|
|
|
+
|
|
|
+ let data_x = generate_range(0.0, nd.percentile(99.0).unwrap().into(), 100);
|
|
|
+ let data_y: Vec<f64> = data_x.iter().map(|x| nd.fitted_normal.pdf(x)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Gaussian");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ // Gamma
|
|
|
+ let data: Vec<f64> = d.iter().map(|x| *x as f64).collect();
|
|
|
+
|
|
|
+ let sum: f64 = data.iter().sum();
|
|
|
+ let mean = (&data).mean();
|
|
|
+ let variance = (&data).variance();
|
|
|
+ let count = d.len() as f64;
|
|
|
+ let shape = mean * mean / variance;
|
|
|
+ let rate = mean / variance;
|
|
|
+
|
|
|
+ let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
|
|
|
+ let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ // Poisson
|
|
|
+ let lambda = sum / count;
|
|
|
+ let poisson = statrs::distribution::Poisson::new(lambda).unwrap();
|
|
|
+ let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
|
|
|
+ let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
|
|
|
+ plot.add_trace(trace);
|
|
|
+
|
|
|
+ plot.use_local_plotly();
|
|
|
+ plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
+ println!("> 15x: {:?}", nd.frequencies(&vec![1, 6, 15]));
|
|
|
+ }
|
|
|
+}
|
|
|
|
|
|
pub struct ND {
|
|
|
pub data: Vec<u32>,
|
|
|
pub distribution: BTreeMap<u32, f64>,
|
|
|
pub total_count: usize,
|
|
|
pub frequency: HashMap<u32, usize>,
|
|
|
- pub fitted_normal: Normal,
|
|
|
+ pub fitted_normal: UvNormal,
|
|
|
}
|
|
|
|
|
|
+use rstat::{fitting::MLE, normal::UvNormal, ContinuousDistribution};
|
|
|
+
|
|
|
impl ND {
|
|
|
fn new(mut data: Vec<u32>) -> Self {
|
|
|
data.sort_unstable();
|
|
|
let n = data.len();
|
|
|
info!("n values {n}");
|
|
|
- let mut frequency = HashMap::new();
|
|
|
let mut distribution = BTreeMap::new();
|
|
|
|
|
|
+ let mut frequency = HashMap::new();
|
|
|
for &value in &data {
|
|
|
*frequency.entry(value).or_insert(0) += 1;
|
|
|
}
|
|
|
@@ -232,7 +705,14 @@ impl ND {
|
|
|
}
|
|
|
|
|
|
// Fit normal distribution
|
|
|
- let fitted_normal = Self::fit_normal(&data);
|
|
|
+ let fitted_normal = rstat::univariate::normal::Normal::fit_mle(
|
|
|
+ &data
|
|
|
+ .iter()
|
|
|
+ .filter(|x| *x >= &1u32)
|
|
|
+ .map(|x| *x as f64)
|
|
|
+ .collect::<Vec<f64>>(),
|
|
|
+ )
|
|
|
+ .unwrap();
|
|
|
|
|
|
Self {
|
|
|
data,
|
|
|
@@ -243,38 +723,23 @@ impl ND {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- fn fit_normal(data: &[u32]) -> Normal {
|
|
|
- let n = data.len() as f64;
|
|
|
-
|
|
|
- // Calculate mean and variance in a single pass
|
|
|
- let (sum, sum_sq) = data
|
|
|
- .iter()
|
|
|
- .filter(|v| **v > 1)
|
|
|
- .fold((0.0, 0.0), |(sum, sum_sq), x| {
|
|
|
- let x = *x as f64;
|
|
|
- (sum + x, sum_sq + x * x)
|
|
|
- });
|
|
|
-
|
|
|
- let mean = sum / n;
|
|
|
- let variance = (sum_sq / n) - (mean * mean);
|
|
|
- let std_dev = variance.sqrt();
|
|
|
-
|
|
|
- Normal::new(mean, std_dev).unwrap()
|
|
|
- }
|
|
|
-
|
|
|
- pub fn pdf(&self, x: f64) -> f64 {
|
|
|
- let epsilon = 1e-6; // Small value for numerical differentiation
|
|
|
- let cdf_x = self.fitted_normal.cdf(x);
|
|
|
- let cdf_x_plus_epsilon = self.fitted_normal.cdf(x + epsilon);
|
|
|
-
|
|
|
- // Approximate the derivative
|
|
|
- (cdf_x_plus_epsilon - cdf_x) / epsilon
|
|
|
- }
|
|
|
-
|
|
|
pub fn frequency(&self, x: u32) -> usize {
|
|
|
*self.frequency.get(&x).unwrap_or(&0)
|
|
|
}
|
|
|
|
|
|
+ pub fn frequencies(&self, breaks: &Vec<u32>) -> Vec<f64> {
|
|
|
+ let mut last_prop_under = 0.0;
|
|
|
+ let mut res = Vec::new();
|
|
|
+ for brk in breaks {
|
|
|
+ let v = self.proportion_under(*brk) - last_prop_under;
|
|
|
+ res.push(v);
|
|
|
+ last_prop_under += v;
|
|
|
+ }
|
|
|
+ let per99 = self.percentile(99.0).unwrap();
|
|
|
+ res.push(self.proportion_under(per99) - last_prop_under);
|
|
|
+ res
|
|
|
+ }
|
|
|
+
|
|
|
pub fn percentile(&self, percentile: f64) -> Option<u32> {
|
|
|
if !(0.0..=100.0).contains(&percentile) {
|
|
|
return None;
|
|
|
@@ -284,12 +749,6 @@ impl ND {
|
|
|
self.data.get(index).cloned()
|
|
|
}
|
|
|
|
|
|
- pub fn cdf(&self, x: u32) -> f64 {
|
|
|
- self.distribution
|
|
|
- .range(..=x)
|
|
|
- .next_back()
|
|
|
- .map_or(0.0, |(_, &prob)| prob)
|
|
|
- }
|
|
|
pub fn proportion_under(&self, x: u32) -> f64 {
|
|
|
let count = self
|
|
|
.frequency
|
|
|
@@ -309,21 +768,6 @@ impl ND {
|
|
|
.sum::<usize>();
|
|
|
count as f64 / self.total_count as f64
|
|
|
}
|
|
|
-
|
|
|
- pub fn fitted_proportion_under(&self, x: f64) -> f64 {
|
|
|
- self.fitted_normal.cdf(x)
|
|
|
- }
|
|
|
-
|
|
|
- pub fn fitted_proportion_above(&self, x: f64) -> f64 {
|
|
|
- 1.0 - self.fitted_normal.cdf(x)
|
|
|
- }
|
|
|
-
|
|
|
- pub fn get_fitted_parameters(&self) -> (f64, f64) {
|
|
|
- (
|
|
|
- self.fitted_normal.mean().unwrap(),
|
|
|
- self.fitted_normal.std_dev().unwrap(),
|
|
|
- )
|
|
|
- }
|
|
|
}
|
|
|
|
|
|
pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
|
|
|
@@ -412,3 +856,87 @@ where
|
|
|
})
|
|
|
.collect()
|
|
|
}
|
|
|
+
|
|
|
+pub fn transpose(v: Vec<Vec<f64>>) -> Vec<Vec<f64>> {
|
|
|
+ assert!(!v.is_empty());
|
|
|
+ let len = v[0].len();
|
|
|
+ let mut result = vec![Vec::with_capacity(v.len()); len];
|
|
|
+
|
|
|
+ for row in v {
|
|
|
+ for (i, val) in row.into_iter().enumerate() {
|
|
|
+ result[i].push(val);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ result
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Serialize)]
|
|
|
+pub struct CountsStats {
|
|
|
+ pub sum: f64,
|
|
|
+ pub mean: f64,
|
|
|
+ pub std_dev: f64,
|
|
|
+ pub breaks_values: Vec<(String, f64)>,
|
|
|
+ pub masked: Vec<(String, f64)>,
|
|
|
+}
|
|
|
+
|
|
|
+// pub fn save_barplota
|
|
|
+// data: Vec<f64>,
|
|
|
+// data_x: Vec<f64>,
|
|
|
+// data_y: Vec<f64>,
|
|
|
+// path: &str,
|
|
|
+// ) -> anyhow::Result<CountsStats> {
|
|
|
+// let mut plot = Plot::new();
|
|
|
+//
|
|
|
+// let colors: Vec<plotly::color::Rgb> = data_x
|
|
|
+// .iter()
|
|
|
+// .map(|&x| {
|
|
|
+// if x <= 2.0 {
|
|
|
+// plotly::color::Rgb::new(193, 18, 31)
|
|
|
+// } else if x >= 15.0 {
|
|
|
+// plotly::color::Rgb::new(138, 201, 38)
|
|
|
+// } else if x <= 6.0 {
|
|
|
+// plotly::color::Rgb::new(243, 114, 44)
|
|
|
+// } else {
|
|
|
+// plotly::color::Rgb::new(255, 202, 58)
|
|
|
+// }
|
|
|
+// })
|
|
|
+// .collect();
|
|
|
+//
|
|
|
+// let bars = Bar::new(data_x.clone(), data_y.clone())
|
|
|
+// .show_legend(false)
|
|
|
+// .marker(Marker::new().color_array(colors));
|
|
|
+//
|
|
|
+// plot.add_trace(bars);
|
|
|
+//
|
|
|
+// let sum: f64 = data.iter().sum();
|
|
|
+// let mean = (&data).mean();
|
|
|
+// let count = data.len() as f64;
|
|
|
+// let std_dev = (&data).std_dev();
|
|
|
+// println!("mean {mean}");
|
|
|
+//
|
|
|
+// // Normal
|
|
|
+// let normal = statrs::distribution::Normal::new(mean, std_dev)?;
|
|
|
+// let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
|
|
|
+// let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
|
|
|
+// plot.add_trace(trace);
|
|
|
+//
|
|
|
+// // // Gamma
|
|
|
+// // let shape = mean * mean / variance;
|
|
|
+// // let rate = mean / variance;
|
|
|
+// //
|
|
|
+// // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
|
|
|
+// // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
|
|
|
+// // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
|
|
|
+// // plot.add_trace(trace);
|
|
|
+//
|
|
|
+// // Poisson
|
|
|
+// let lambda = sum / count;
|
|
|
+// let poisson = statrs::distribution::Poisson::new(lambda)?;
|
|
|
+// let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
|
|
|
+// let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
|
|
|
+// plot.add_trace(trace);
|
|
|
+//
|
|
|
+// plot.use_local_plotly();
|
|
|
+// plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
|
|
|
+// Ok(CountsStats { sum, mean, std_dev })
|
|
|
+// }
|