| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960 |
- use anyhow::Context;
- use log::{info, warn};
- use ordered_float::Float;
- use pandora_lib_graph::cytoband::{svg_chromosome, AdditionalRect, RectPosition};
- use plotly::{color::Rgb, common::Marker, layout::BarMode, Bar, Layout, Plot, Scatter};
- use rand::{thread_rng, Rng};
- use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
- use serde::{
- de::{self, Visitor},
- Deserialize, Deserializer, Serialize,
- };
- use statrs::{
- distribution::{Continuous, Discrete},
- statistics::Statistics,
- };
- use std::{
- collections::{BTreeMap, HashMap, HashSet},
- f64, fmt,
- fs::File,
- io::{BufRead, BufReader, Write},
- str::FromStr,
- };
- #[derive(Debug, Clone)]
- pub struct Count {
- pub position: CountRange,
- pub n_reads: u32,
- pub n_low_mapq: u32,
- pub frac_sa: f32,
- pub sa_outlier: bool,
- pub frac_se: f32,
- pub se_outlier: bool,
- pub annotation: Vec<CountAnnotation>,
- }
- impl fmt::Display for Count {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(
- f,
- "{}\t{}\t{}\t{:.6}\t{}\t{:.6}\t{}",
- self.position,
- self.n_reads,
- self.n_low_mapq,
- self.frac_sa,
- self.sa_outlier,
- self.frac_se,
- self.se_outlier
- )
- }
- }
- // inclusive 0 based
- #[derive(Debug, Clone)]
- pub struct CountRange {
- pub contig: String,
- pub start: u32,
- pub end: u32,
- }
- impl fmt::Display for CountRange {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(f, "{}:{}-{}", self.contig, self.start, self.end)
- }
- }
- #[derive(Debug, Clone, Hash, Eq, PartialEq)]
- pub enum CountAnnotation {
- MaskedLowMRD,
- MaskedQuality,
- }
- impl<'de> Deserialize<'de> for Count {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: Deserializer<'de>,
- {
- struct CountVisitor;
- impl<'de> Visitor<'de> for CountVisitor {
- type Value = Count;
- fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
- formatter.write_str("a string in the format 'chr:start-end n_reads n_low_mapq frac_sa sa_outlier frac_se se_outlier'")
- }
- fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
- where
- E: de::Error,
- {
- let parts: Vec<&str> = s.split_whitespace().collect();
- if parts.len() != 7 {
- return Err(E::custom("incorrect number of fields"));
- }
- let position_parts: Vec<&str> = parts[0].split(&[':', '-'][..]).collect();
- if position_parts.len() != 3 {
- return Err(E::custom("incorrect position format"));
- }
- Ok(Count {
- position: CountRange {
- contig: position_parts[0].to_string(),
- start: u32::from_str(position_parts[1]).map_err(E::custom)?,
- end: u32::from_str(position_parts[2]).map_err(E::custom)?,
- },
- n_reads: u32::from_str(parts[1]).map_err(E::custom)?,
- n_low_mapq: u32::from_str(parts[2]).map_err(E::custom)?,
- frac_sa: f32::from_str(parts[3]).map_err(E::custom)?,
- sa_outlier: bool::from_str(parts[4]).map_err(E::custom)?,
- frac_se: f32::from_str(parts[5]).map_err(E::custom)?,
- se_outlier: bool::from_str(parts[6]).map_err(E::custom)?,
- annotation: Vec::new(),
- })
- }
- }
- deserializer.deserialize_str(CountVisitor)
- }
- }
- pub fn read_counts_from_file(filename: &str) -> anyhow::Result<Vec<Count>> {
- let file = File::open(filename)?;
- let reader = BufReader::new(file);
- let mut counts = Vec::new();
- for line in reader.lines() {
- let line = line?;
- let count: Count = serde_json::from_str(&format!("\"{}\"", escape_control_chars(&line)))?;
- counts.push(count);
- }
- Ok(counts)
- }
- fn escape_control_chars(s: &str) -> String {
- s.chars()
- .map(|c| {
- if c.is_control() {
- format!("\\u{:04x}", c as u32)
- } else {
- c.to_string()
- }
- })
- .collect()
- }
- #[derive(Debug)]
- pub struct Counts {
- pub data: HashMap<String, Vec<Count>>,
- pub mrd: HashMap<String, Vec<Count>>,
- }
- impl Counts {
- pub fn from_files(paths: Vec<String>) -> Self {
- let counts: Vec<Vec<Count>> = paths
- .par_iter()
- .map(|path| match read_counts_from_file(path) {
- Ok(c) => c,
- Err(e) => {
- warn!("Couldnt load {path}: {e}");
- Vec::new()
- }
- })
- .filter(|v| !v.is_empty())
- .collect();
- let mut data = HashMap::new();
- for count in counts {
- let contig = count.first().unwrap().position.contig.clone();
- data.insert(contig, count);
- }
- Counts {
- data,
- mrd: HashMap::new(),
- }
- }
- pub fn mrd_from_files(&mut self, paths: Vec<String>) {
- let counts: Vec<Vec<Count>> = paths
- .par_iter()
- .map(|path| match read_counts_from_file(path) {
- Ok(c) => c,
- Err(e) => {
- warn!("Couldnt load {path}: {e}");
- Vec::new()
- }
- })
- .filter(|v| !v.is_empty())
- .collect();
- let mut data = HashMap::new();
- for count in counts {
- let contig = count.first().unwrap().position.contig.clone();
- data.insert(contig, count);
- }
- self.mrd = data;
- }
- pub fn mask_low_mrd(&mut self, contig: &str, min_reads: u32) -> anyhow::Result<()> {
- if let (Some(mrd), Some(diag)) = (self.mrd.get(contig), self.data.get_mut(contig)) {
- for (m, d) in mrd.iter().zip(diag) {
- if m.n_reads < min_reads {
- d.annotation.push(CountAnnotation::MaskedLowMRD);
- }
- }
- Ok(())
- } else {
- anyhow::bail!("No {contig} in both mrd and diag.")
- }
- }
- pub fn mask_low_quality(&mut self, contig: &str, max_ratio: f64) -> anyhow::Result<()> {
- if let Some(diag) = self.data.get_mut(contig) {
- for d in diag.iter_mut() {
- if (d.n_low_mapq as f64 / (d.n_reads + d.n_low_mapq) as f64) > max_ratio {
- d.annotation.push(CountAnnotation::MaskedQuality);
- }
- }
- Ok(())
- } else {
- anyhow::bail!("No {contig} in both mrd and diag.")
- }
- }
- pub fn frequencies(&self, contig: &str) -> anyhow::Result<Vec<(f64, f64)>> {
- let data = self.get(contig)?;
- let mut frequencies = HashMap::new();
- for count in data.iter() {
- *frequencies.entry(*count).or_insert(0) += 1;
- }
- let mut frequencies: Vec<(u32, f64)> =
- frequencies.iter().map(|(x, y)| (*x, *y as f64)).collect();
- frequencies.sort_by_key(|v| v.0);
- Ok(frequencies.iter().map(|(x, y)| (*x as f64, *y)).collect())
- }
- pub fn percentile(&self, contig: &str, percentile: f64) -> anyhow::Result<u32> {
- let mut data = self.get(contig)?;
- data.sort_unstable();
- let total_count = data.len();
- let index = |percentile: f64| -> usize {
- (percentile / 100.0 * (total_count - 1) as f64).round() as usize
- };
- Ok(*data.get(index(percentile)).context("Error in percentile")?)
- }
- pub fn save_contig(
- &mut self,
- contig: &str,
- prefix: &str,
- breaks: Vec<u32>,
- ) -> anyhow::Result<CountsStats> {
- self.mask_low_mrd(contig, 6)?;
- self.mask_low_quality(contig, 0.1)?;
- let data: Vec<f64> = self.get(contig)?.iter().map(|v| *v as f64).collect();
- let n_final = data.len();
- let frequencies = self.frequencies(contig)?;
- let percentile_99 = self.percentile(contig, 99.0)?;
- let mut data_x = Vec::new();
- let mut data_y = Vec::new();
- frequencies.iter().for_each(|(x, y)| {
- if *x <= percentile_99 as f64 {
- data_x.push(*x);
- data_y.push(*y / n_final as f64);
- }
- });
- // Distribution plot
- let distribution_path = format!("{prefix}_{contig}_distrib.svg");
- info!("Saving graph: {distribution_path}");
- let mut plot = Plot::new();
- let colors: Vec<Rgb> = data_x
- .iter()
- .map(|&x| match x {
- x if x < 2.0 => Rgb::new(193, 18, 31),
- x if x < 6.0 => Rgb::new(243, 114, 44),
- x if x < 15.0 => Rgb::new(255, 202, 58),
- _ => Rgb::new(138, 201, 38),
- })
- .collect();
- let bars = Bar::new(data_x.clone(), data_y.clone())
- .show_legend(false)
- .marker(Marker::new().color_array(colors));
- plot.add_trace(bars);
- let sum: f64 = data.iter().sum();
- let mean = (&data).mean();
- let count = data.len() as f64;
- let std_dev = (&data).std_dev();
- // Normal
- let normal = statrs::distribution::Normal::new(mean, std_dev)?;
- let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
- let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
- plot.add_trace(trace);
- // // Gamma
- // let shape = mean * mean / variance;
- // let rate = mean / variance;
- //
- // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
- // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
- // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
- // plot.add_trace(trace);
- // Poisson
- let lambda = sum / count;
- let poisson = statrs::distribution::Poisson::new(lambda)?;
- let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
- let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
- plot.add_trace(trace);
- plot.write_image(distribution_path, plotly::ImageFormat::SVG, 800, 600, 1.0);
- // Fractions
- let mut breaks_values = Vec::new();
- for (i, b) in breaks.iter().enumerate() {
- if i == 0 {
- let total: f64 = frequencies
- .iter()
- .filter(|(x, _)| *x < *b as f64)
- .map(|(_, y)| *y / count)
- .sum();
- breaks_values.push((format!("< {b}"), total));
- } else {
- let last = breaks[i - 1];
- let total: f64 = frequencies
- .iter()
- .filter(|(x, _)| *x < *b as f64 && *x >= last as f64)
- .map(|(_, y)| *y / count)
- .sum();
- breaks_values.push((format!("[{last} - {b}["), total));
- }
- }
- let last = *breaks.last().unwrap();
- let total: f64 = frequencies
- .iter()
- .filter(|(x, _)| *x >= last as f64)
- .map(|(_, y)| *y / count)
- .sum();
- breaks_values.push((format!(">= {last}"), total));
- // Chromosome
- let tol = 25;
- let chromosome_path = format!("{prefix}_{contig}_chromosome.svg");
- info!("Saving graph: {chromosome_path}");
- let target_annotations: HashSet<CountAnnotation> = vec![
- CountAnnotation::MaskedLowMRD,
- CountAnnotation::MaskedQuality,
- ]
- .into_iter()
- .collect();
- let d: Vec<u32> = self
- .data
- .get(contig)
- .unwrap()
- .iter()
- .map(|c| {
- if c.annotation
- .iter()
- .any(|ann| target_annotations.contains(ann))
- {
- 10_000u32
- } else {
- c.n_reads
- }
- })
- .collect();
- let hm = self.counts_annotations(contig)?;
- let len = d.len();
- let mut masked: Vec<(String, f64)> = hm
- .iter()
- .map(|(k, v)| (format!("{:?}", k), *v as f64 / len as f64))
- .collect();
- masked.push(("Un masked".to_string(), n_final as f64 / len as f64));
- let under_6_rects: Vec<AdditionalRect> = ranges_under(&d, 5, tol)
- .iter()
- .filter(|(s, e)| e > s)
- .map(|(start, end)| AdditionalRect {
- start: *start as u32 * 1000,
- end: *end as u32 * 1000,
- color: String::from("red"),
- position: RectPosition::Below(1),
- })
- .collect();
- let over_6_rects: Vec<AdditionalRect> = ranges_between(&d, 6, 9999, tol)
- .iter()
- .filter(|(s, e)| e > s)
- .map(|(start, end)| AdditionalRect {
- start: *start as u32 * 1000,
- end: *end as u32 * 1000,
- color: String::from("green"),
- position: RectPosition::Below(2),
- })
- .collect();
- let masked_rec: Vec<AdditionalRect> = ranges_over(&d, 10000, tol)
- .iter()
- .filter(|(s, e)| e > s)
- .map(|(start, end)| AdditionalRect {
- start: *start as u32 * 1000,
- end: *end as u32 * 1000,
- color: String::from("grey"),
- position: RectPosition::Below(0),
- })
- .collect();
- let mut all = Vec::new();
- all.extend(under_6_rects);
- all.extend(over_6_rects);
- // all.extend(over15);
- all.extend(masked_rec);
- svg_chromosome(
- contig,
- 1000,
- 50,
- "/data/ref/hs1/cytoBandMapped.bed",
- &chromosome_path,
- &all,
- &Vec::new(),
- )
- .unwrap();
- let stats = CountsStats {
- sum,
- mean,
- std_dev,
- breaks_values,
- masked,
- };
- // Save stats
- let json_path = format!("{prefix}_{contig}_stats.json");
- info!("Saving stats into: {json_path}");
- let json = serde_json::to_string_pretty(&stats)?;
- let mut file = File::create(json_path)?;
- file.write_all(json.as_bytes())?;
- Ok(stats)
- }
- pub fn save_contigs(
- &mut self,
- contigs: &Vec<String>,
- prefix: &str,
- breaks: Vec<u32>,
- ) -> anyhow::Result<()> {
- let mut stats = Vec::new();
- let mut proportions = HashMap::new();
- for contig in contigs {
- let stat = self.save_contig(contig, prefix, breaks.clone())?;
- let un_masked: Vec<&(String, f64)> = stat.masked.iter().filter(|(s, _)| s == "Un masked").collect();
- let un_masked = un_masked.first().unwrap().1;
- let masked = 1.0 - un_masked;
- let mut props: Vec<(String, f64)> = stat.breaks_values.iter().map(|(s, v)| (s.to_string(), *v * un_masked)).collect();
- props.push(("masked".to_string(), masked));
- props.iter().for_each(|(s, v)| {
- proportions.entry(s.to_string()).or_insert(vec![]).push(*v);
- });
- stats.push(stat);
- }
- let mut plot = Plot::new();
- let layout = Layout::new().bar_mode(BarMode::Stack);
-
- for (k, v) in proportions {
- plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(k));
- }
- println!("{:?}", contigs);
- plot.set_layout(layout);
- plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
- Ok(())
- }
- pub fn counts_annotations(
- &self,
- contig: &str,
- ) -> anyhow::Result<HashMap<CountAnnotation, u64>> {
- if let Some(d) = self.data.get(contig) {
- let mut counts = HashMap::new();
- for c in d {
- for a in &c.annotation {
- *counts.entry(a.clone()).or_insert(0) += 1;
- }
- }
- Ok(counts)
- } else {
- anyhow::bail!("No {contig} in counts.")
- }
- }
- pub fn get(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
- if let Some(ccounts) = self.data.get(contig) {
- let target_annotations: HashSet<CountAnnotation> = vec![
- CountAnnotation::MaskedLowMRD,
- CountAnnotation::MaskedQuality,
- ]
- .into_iter()
- .collect();
- Ok(ccounts
- .iter()
- .filter(|count| {
- !count
- .annotation
- .iter()
- .any(|ann| target_annotations.contains(ann))
- })
- .map(|c| c.n_reads)
- .collect())
- } else {
- anyhow::bail!("No {contig} in counts.")
- }
- }
- pub fn mrd(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
- if let Some(ccounts) = self.mrd.get(contig) {
- Ok(ccounts.iter().map(|c| c.n_reads).collect())
- } else {
- anyhow::bail!("No {contig} in counts.")
- }
- }
- pub fn calculate_percentiles(
- &self,
- contig: &str,
- percentiles: &[f64],
- ) -> anyhow::Result<Vec<f64>> {
- if let Some(ccounts) = self.data.get(contig) {
- let mut n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
- n_reads.sort_unstable();
- let cdf = ND::new(n_reads.clone());
- // println!("CDF at 13: {:?}", cdf.cdf(13));
- println!("Percentile at 99: {:?}", cdf.percentile(99.0));
- println!("above 15X: {:?}", cdf.proportion_above(15));
- // println!("above 15.1X: {:?}", cdf.fitted_proportion_above(&15.1));
- println!("under 6X: {:?}", cdf.proportion_under(6));
- Ok(percentiles
- .iter()
- .map(|&p| {
- let index = (p * (n_reads.len() - 1) as f64).round() as usize;
- n_reads[index] as f64
- })
- .collect())
- } else {
- anyhow::bail!("No {contig} in counts.")
- }
- }
- pub fn nd_reads(&self, contig: &str) -> anyhow::Result<ND> {
- if let Some(ccounts) = self.data.get(contig) {
- Ok(ND::new(ccounts.iter().map(|c| c.n_reads).collect()))
- } else {
- anyhow::bail!("No {contig} in counts")
- }
- }
- pub fn distribution(&self, contig: &str) -> anyhow::Result<ND> {
- Ok(ND::new(self.get(contig)?))
- }
- pub fn save_stats(&self) -> anyhow::Result<()> {
- Ok(())
- }
- pub fn save_global_proportions_graph(
- &self,
- path: &str,
- contigs: &Vec<String>,
- breaks: Vec<u32>,
- ) {
- let mut breaks_str = Vec::new();
- for (i, b) in breaks.iter().enumerate() {
- if i == 0 {
- breaks_str.push(format!("< {b}"));
- } else {
- let last = breaks[i - 1];
- breaks_str.push(format!("[{last} - {b}["))
- }
- }
- breaks_str.push(format!(">= {}", breaks.last().unwrap()));
- let mut proportions = Vec::new();
- for contig in contigs.iter() {
- let d = self.get(contig).unwrap();
- let nd = ND::new(d);
- proportions.push(nd.frequencies(&breaks));
- }
- let mut plot = Plot::new();
- let layout = Layout::new().bar_mode(BarMode::Stack);
- for (i, v) in transpose(proportions).iter().enumerate() {
- plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(&breaks_str[i]));
- }
- println!("{:?}", contigs);
- plot.set_layout(layout);
- plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
- }
- pub fn save_global_distribution_graph(&self, path: &str, contigs: &Vec<String>) {
- let d: Vec<u32> = contigs.iter().flat_map(|c| self.get(c).unwrap()).collect();
- let mut data_sorted = d.clone();
- data_sorted.sort_unstable();
- let nd = ND::new(d.clone());
- let mut plot = Plot::new();
- let bar_x: Vec<u32> = (1..=nd.percentile(99.0).unwrap()).collect();
- let colors: Vec<plotly::color::Rgb> = bar_x
- .iter()
- .map(|&x| {
- if x <= 2 {
- plotly::color::Rgb::new(193, 18, 31)
- } else if x >= 15 {
- plotly::color::Rgb::new(138, 201, 38)
- } else if x <= 6 {
- plotly::color::Rgb::new(243, 114, 44)
- } else {
- plotly::color::Rgb::new(255, 202, 58)
- }
- })
- .collect();
- let data: Vec<u32> = d.iter().filter(|x| **x >= 1).copied().collect();
- // frequencies
- let mut frequencies = HashMap::new();
- for &value in &data {
- *frequencies.entry(value).or_insert(0) += 1;
- }
- let bars = Bar::new(
- bar_x.clone(),
- bar_x
- .iter()
- .map(|x| *frequencies.get(x).unwrap_or(&0) as f64 / data.len() as f64)
- .collect(),
- )
- .show_legend(false)
- .marker(Marker::new().color_array(colors));
- plot.add_trace(bars);
- let data_x = generate_range(0.0, nd.percentile(99.0).unwrap().into(), 100);
- let data_y: Vec<f64> = data_x.iter().map(|x| nd.fitted_normal.pdf(x)).collect();
- let trace = Scatter::new(data_x.clone(), data_y).name("Gaussian");
- plot.add_trace(trace);
- // Gamma
- let data: Vec<f64> = d.iter().map(|x| *x as f64).collect();
- let sum: f64 = data.iter().sum();
- let mean = (&data).mean();
- let variance = (&data).variance();
- let count = d.len() as f64;
- let shape = mean * mean / variance;
- let rate = mean / variance;
- let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
- let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
- let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
- plot.add_trace(trace);
- // Poisson
- let lambda = sum / count;
- let poisson = statrs::distribution::Poisson::new(lambda).unwrap();
- let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
- let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
- plot.add_trace(trace);
- plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
- println!("> 15x: {:?}", nd.frequencies(&vec![1, 6, 15]));
- }
- }
- pub struct ND {
- pub data: Vec<u32>,
- pub distribution: BTreeMap<u32, f64>,
- pub total_count: usize,
- pub frequency: HashMap<u32, usize>,
- pub fitted_normal: UvNormal,
- }
- use rstat::{fitting::MLE, normal::UvNormal, ContinuousDistribution};
- impl ND {
- fn new(mut data: Vec<u32>) -> Self {
- data.sort_unstable();
- let n = data.len();
- info!("n values {n}");
- let mut distribution = BTreeMap::new();
- let mut frequency = HashMap::new();
- for &value in &data {
- *frequency.entry(value).or_insert(0) += 1;
- }
- let mut cumulative_count = 0;
- for (&value, &count) in &frequency {
- cumulative_count += count;
- let cumulative_prob = cumulative_count as f64 / n as f64;
- distribution.insert(value, cumulative_prob);
- }
- // Fit normal distribution
- let fitted_normal = rstat::univariate::normal::Normal::fit_mle(
- &data
- .iter()
- .filter(|x| *x >= &1u32)
- .map(|x| *x as f64)
- .collect::<Vec<f64>>(),
- )
- .unwrap();
- Self {
- data,
- distribution,
- frequency,
- total_count: n,
- fitted_normal,
- }
- }
- pub fn frequency(&self, x: u32) -> usize {
- *self.frequency.get(&x).unwrap_or(&0)
- }
- pub fn frequencies(&self, breaks: &Vec<u32>) -> Vec<f64> {
- let mut last_prop_under = 0.0;
- let mut res = Vec::new();
- for brk in breaks {
- let v = self.proportion_under(*brk) - last_prop_under;
- res.push(v);
- last_prop_under += v;
- }
- let per99 = self.percentile(99.0).unwrap();
- res.push(self.proportion_under(per99) - last_prop_under);
- res
- }
- pub fn percentile(&self, percentile: f64) -> Option<u32> {
- if !(0.0..=100.0).contains(&percentile) {
- return None;
- }
- let index = (percentile / 100.0 * (self.total_count - 1) as f64).round() as usize;
- self.data.get(index).cloned()
- }
- pub fn proportion_under(&self, x: u32) -> f64 {
- let count = self
- .frequency
- .iter()
- .filter(|(&value, _)| value < x)
- .map(|(_, &count)| count)
- .sum::<usize>();
- count as f64 / self.total_count as f64
- }
- pub fn proportion_above(&self, x: u32) -> f64 {
- let count = self
- .frequency
- .iter()
- .filter(|(&value, _)| value > x)
- .map(|(_, &count)| count)
- .sum::<usize>();
- count as f64 / self.total_count as f64
- }
- }
- pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
- if steps == 0 {
- return vec![];
- }
- if steps == 1 {
- return vec![start];
- }
- let step_size = (end - start) / (steps - 1) as f64;
- (0..steps).map(|i| start + i as f64 * step_size).collect()
- }
- use rayon::prelude::*;
- pub fn ranges_under(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
- get_ranges_parallel(vec, x, tolerance, |val, threshold| val <= threshold)
- }
- pub fn ranges_over(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
- get_ranges_parallel(vec, x, tolerance, |val, threshold| val >= threshold)
- }
- pub fn ranges_between(
- vec: &[u32],
- lower: u32,
- upper: u32,
- tolerance: usize,
- ) -> Vec<(usize, usize)> {
- get_ranges_parallel(vec, (lower, upper), tolerance, |val, (l, u)| {
- val >= l && val <= u
- })
- }
- pub fn get_ranges_parallel<T, F>(
- vec: &[u32],
- threshold: T,
- tolerance: usize,
- compare: F,
- ) -> Vec<(usize, usize)>
- where
- F: Fn(u32, T) -> bool + Sync,
- T: Copy + Sync,
- {
- if vec.is_empty() {
- return Vec::new();
- }
- let chunk_size = (vec.len() / rayon::current_num_threads()).max(1);
- vec.par_chunks(chunk_size)
- .enumerate()
- .flat_map(|(chunk_index, chunk)| {
- let mut local_ranges = Vec::new();
- let mut current_range: Option<(usize, usize)> = None;
- let offset = chunk_index * chunk_size;
- for (i, &val) in chunk.iter().enumerate() {
- let global_index = offset + i;
- if compare(val, threshold) {
- match current_range {
- Some((start, end)) if global_index <= end + tolerance + 1 => {
- current_range = Some((start, global_index));
- }
- Some((start, end)) => {
- local_ranges.push((start, end));
- current_range = Some((global_index, global_index));
- }
- None => {
- current_range = Some((global_index, global_index));
- }
- }
- } else if let Some((start, end)) = current_range {
- if global_index > end + tolerance + 1 {
- local_ranges.push((start, end));
- current_range = None;
- }
- }
- }
- if let Some(range) = current_range {
- local_ranges.push(range);
- }
- local_ranges
- })
- .collect()
- }
- pub fn transpose(v: Vec<Vec<f64>>) -> Vec<Vec<f64>> {
- assert!(!v.is_empty());
- let len = v[0].len();
- let mut result = vec![Vec::with_capacity(v.len()); len];
- for row in v {
- for (i, val) in row.into_iter().enumerate() {
- result[i].push(val);
- }
- }
- result
- }
- #[derive(Debug, Serialize)]
- pub struct CountsStats {
- pub sum: f64,
- pub mean: f64,
- pub std_dev: f64,
- pub breaks_values: Vec<(String, f64)>,
- pub masked: Vec<(String, f64)>,
- }
- // pub fn save_barplota
- // data: Vec<f64>,
- // data_x: Vec<f64>,
- // data_y: Vec<f64>,
- // path: &str,
- // ) -> anyhow::Result<CountsStats> {
- // let mut plot = Plot::new();
- //
- // let colors: Vec<plotly::color::Rgb> = data_x
- // .iter()
- // .map(|&x| {
- // if x <= 2.0 {
- // plotly::color::Rgb::new(193, 18, 31)
- // } else if x >= 15.0 {
- // plotly::color::Rgb::new(138, 201, 38)
- // } else if x <= 6.0 {
- // plotly::color::Rgb::new(243, 114, 44)
- // } else {
- // plotly::color::Rgb::new(255, 202, 58)
- // }
- // })
- // .collect();
- //
- // let bars = Bar::new(data_x.clone(), data_y.clone())
- // .show_legend(false)
- // .marker(Marker::new().color_array(colors));
- //
- // plot.add_trace(bars);
- //
- // let sum: f64 = data.iter().sum();
- // let mean = (&data).mean();
- // let count = data.len() as f64;
- // let std_dev = (&data).std_dev();
- // println!("mean {mean}");
- //
- // // Normal
- // let normal = statrs::distribution::Normal::new(mean, std_dev)?;
- // let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
- // let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
- // plot.add_trace(trace);
- //
- // // // Gamma
- // // let shape = mean * mean / variance;
- // // let rate = mean / variance;
- // //
- // // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
- // // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
- // // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
- // // plot.add_trace(trace);
- //
- // // Poisson
- // let lambda = sum / count;
- // let poisson = statrs::distribution::Poisson::new(lambda)?;
- // let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
- // let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
- // plot.add_trace(trace);
- //
- // plot.use_local_plotly();
- // plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
- // Ok(CountsStats { sum, mean, std_dev })
- // }
|