Your Name 1 жил өмнө
parent
commit
621bcc54e3
5 өөрчлөгдсөн 1031 нэмэгдсэн , 670 устгасан
  1. 960 0
      <
  2. 12 228
      Cargo.lock
  3. 1 3
      Cargo.toml
  4. 56 336
      src/counts.rs
  5. 2 103
      src/lib.rs

+ 960 - 0
<

@@ -0,0 +1,960 @@
+use anyhow::Context;
+use log::{info, warn};
+use ordered_float::Float;
+use pandora_lib_graph::cytoband::{svg_chromosome, AdditionalRect, RectPosition};
+use plotly::{color::Rgb, common::Marker, layout::BarMode, Bar, Layout, Plot, Scatter};
+use rand::{thread_rng, Rng};
+use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
+use serde::{
+    de::{self, Visitor},
+    Deserialize, Deserializer, Serialize,
+};
+use statrs::{
+    distribution::{Continuous, Discrete},
+    statistics::Statistics,
+};
+use std::{
+    collections::{BTreeMap, HashMap, HashSet},
+    f64, fmt,
+    fs::File,
+    io::{BufRead, BufReader, Write},
+    str::FromStr,
+};
+
+/// One record of a counts file: read statistics for a single genomic interval.
+///
+/// The first seven fields are filled by the `Deserialize` impl from one
+/// whitespace-separated line; `annotation` starts empty and is filled by the
+/// masking methods of `Counts`.
+#[derive(Debug, Clone)]
+pub struct Count {
+    /// Interval (contig, start, end) this record refers to.
+    pub position: CountRange,
+    /// Read count for the interval; the value returned by `Counts::get`.
+    pub n_reads: u32,
+    /// Count of low-MAPQ reads (presumably "low mapping quality" — confirm with
+    /// the producer of the file). `Counts::mask_low_quality` compares it with
+    /// `n_reads + n_low_mapq`.
+    pub n_low_mapq: u32,
+    /// NOTE(review): meaning of "sa" is not visible in this file; the value is
+    /// only parsed and printed.
+    pub frac_sa: f32,
+    /// Outlier flag paired with `frac_sa`; only parsed and printed here.
+    pub sa_outlier: bool,
+    /// NOTE(review): meaning of "se" is not visible in this file; the value is
+    /// only parsed and printed.
+    pub frac_se: f32,
+    /// Outlier flag paired with `frac_se`; only parsed and printed here.
+    pub se_outlier: bool,
+    /// Masks attached to this record; empty right after parsing.
+    pub annotation: Vec<CountAnnotation>,
+}
+
+impl fmt::Display for Count {
+    /// Writes the seven parsed columns separated by tabs, with both fractions
+    /// printed to six decimals. The `annotation` list is not part of the output.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Count {
+            position,
+            n_reads,
+            n_low_mapq,
+            frac_sa,
+            sa_outlier,
+            frac_se,
+            se_outlier,
+            annotation: _,
+        } = self;
+
+        write!(
+            f,
+            "{}\t{}\t{}\t{:.6}\t{}\t{:.6}\t{}",
+            position, n_reads, n_low_mapq, frac_sa, sa_outlier, frac_se, se_outlier
+        )
+    }
+}
+/// A genomic interval on one contig.
+///
+/// Per the original author's note, coordinates are 0-based and both `start`
+/// and `end` are inclusive.
+#[derive(Debug, Clone)]
+pub struct CountRange {
+    /// Contig (chromosome) name.
+    pub contig: String,
+    /// First position of the interval.
+    pub start: u32,
+    /// Last position of the interval.
+    pub end: u32,
+}
+
+impl fmt::Display for CountRange {
+    /// Formats the interval as `contig:start-end`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let CountRange { contig, start, end } = self;
+        write!(f, "{}:{}-{}", contig, start, end)
+    }
+}
+
+/// Reason a `Count` record is excluded from statistics.
+///
+/// Records carrying any of these annotations are skipped by `Counts::get`.
+#[derive(Debug, Clone, Hash, Eq, PartialEq)]
+pub enum CountAnnotation {
+    /// Added by `Counts::mask_low_mrd` when the matching `mrd` record has fewer
+    /// reads than the requested minimum.
+    MaskedLowMRD,
+    /// Added by `Counts::mask_low_quality` when the share of low-MAPQ reads
+    /// exceeds the requested ratio.
+    MaskedQuality,
+}
+
+impl<'de> Deserialize<'de> for Count {
+    /// Deserializes a `Count` from a single string holding seven
+    /// whitespace-separated columns:
+    /// `contig:start-end n_reads n_low_mapq frac_sa sa_outlier frac_se se_outlier`.
+    ///
+    /// The position column is split at its *last* `:` and then at the first `-`
+    /// of the remainder, so contig names that themselves contain `:` or `-`
+    /// (which `Display for CountRange` prints verbatim) round-trip.
+    ///
+    /// # Errors
+    /// Returns a custom serde error when the number of columns is not seven,
+    /// when the position column is not `contig:start-end`, or when a numeric or
+    /// boolean column fails to parse.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        struct CountVisitor;
+
+        impl<'de> Visitor<'de> for CountVisitor {
+            type Value = Count;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str("a string in the format 'chr:start-end n_reads n_low_mapq frac_sa sa_outlier frac_se se_outlier'")
+            }
+
+            fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
+            where
+                E: de::Error,
+            {
+                let parts: Vec<&str> = s.split_whitespace().collect();
+                if parts.len() != 7 {
+                    return Err(E::custom("incorrect number of fields"));
+                }
+
+                // Split from the right: everything before the last ':' is the
+                // contig name, whatever characters it contains.
+                let (contig, range) = parts[0]
+                    .rsplit_once(':')
+                    .ok_or_else(|| E::custom("incorrect position format"))?;
+                let (start, end) = range
+                    .split_once('-')
+                    .ok_or_else(|| E::custom("incorrect position format"))?;
+
+                Ok(Count {
+                    position: CountRange {
+                        contig: contig.to_string(),
+                        start: u32::from_str(start).map_err(E::custom)?,
+                        end: u32::from_str(end).map_err(E::custom)?,
+                    },
+                    n_reads: u32::from_str(parts[1]).map_err(E::custom)?,
+                    n_low_mapq: u32::from_str(parts[2]).map_err(E::custom)?,
+                    frac_sa: f32::from_str(parts[3]).map_err(E::custom)?,
+                    sa_outlier: bool::from_str(parts[4]).map_err(E::custom)?,
+                    frac_se: f32::from_str(parts[5]).map_err(E::custom)?,
+                    se_outlier: bool::from_str(parts[6]).map_err(E::custom)?,
+                    // Annotations are never stored in the file.
+                    annotation: Vec::new(),
+                })
+            }
+        }
+
+        deserializer.deserialize_str(CountVisitor)
+    }
+}
+
+/// Reads a counts file and parses every line into a `Count`.
+///
+/// Each line is wrapped into a JSON string literal (after escaping) so that the
+/// string-based `Deserialize` impl of `Count` can be driven through
+/// `serde_json`.
+///
+/// # Errors
+/// Fails if the file cannot be opened, a line cannot be read, or a line cannot
+/// be parsed; parsing stops at the first error.
+pub fn read_counts_from_file(filename: &str) -> anyhow::Result<Vec<Count>> {
+    BufReader::new(File::open(filename)?)
+        .lines()
+        .map(|line| -> anyhow::Result<Count> {
+            let quoted = format!("\"{}\"", escape_control_chars(&line?));
+            Ok(serde_json::from_str(&quoted)?)
+        })
+        .collect()
+}
+/// Escapes `s` so it can be placed between double quotes as a JSON string.
+///
+/// Control characters are written as `\uXXXX`; `"` and `\` are
+/// backslash-escaped. Without the latter two, a line containing a quote or a
+/// backslash produced invalid (or silently altered) JSON in
+/// `read_counts_from_file`. All other characters are copied unchanged.
+fn escape_control_chars(s: &str) -> String {
+    let mut escaped = String::with_capacity(s.len());
+    for c in s.chars() {
+        match c {
+            '"' => escaped.push_str("\\\""),
+            '\\' => escaped.push_str("\\\\"),
+            c if c.is_control() => escaped.push_str(&format!("\\u{:04x}", c as u32)),
+            c => escaped.push(c),
+        }
+    }
+    escaped
+}
+
+/// Per-contig count records for two data sets.
+///
+/// Both maps are keyed by the contig name of the first record of each loaded
+/// file (see `from_files` / `mrd_from_files`).
+#[derive(Debug)]
+pub struct Counts {
+    /// Main data set (called "diag" in the error messages of this file).
+    pub data: HashMap<String, Vec<Count>>,
+    /// Second data set used by `mask_low_mrd` to mask records of `data`;
+    /// empty until `mrd_from_files` is called.
+    pub mrd: HashMap<String, Vec<Count>>,
+}
+
+impl Counts {
+    /// Loads the given count files in parallel and indexes them by contig.
+    ///
+    /// Files that cannot be read are logged with `warn!` and skipped, as are
+    /// files that yield no records. The key of each entry is the contig of the
+    /// file's first record; a later file with the same contig replaces an
+    /// earlier one. `mrd` is left empty.
+    pub fn from_files(paths: Vec<String>) -> Self {
+        let loaded: Vec<Vec<Count>> = paths
+            .par_iter()
+            .filter_map(|path| match read_counts_from_file(path) {
+                Ok(records) if !records.is_empty() => Some(records),
+                Ok(_) => None,
+                Err(e) => {
+                    warn!("Couldnt load {path}: {e}");
+                    None
+                }
+            })
+            .collect();
+
+        // Every vector is non-empty here, so indexing the first record is safe.
+        let data = loaded
+            .into_iter()
+            .map(|records| (records[0].position.contig.clone(), records))
+            .collect();
+
+        Counts {
+            data,
+            mrd: HashMap::new(),
+        }
+    }
+
+    /// Loads the given count files in parallel and stores them as the `mrd`
+    /// data set, replacing whatever was there before.
+    ///
+    /// Unreadable files are logged with `warn!` and skipped, as are files that
+    /// yield no records. Entries are keyed by the contig of each file's first
+    /// record.
+    pub fn mrd_from_files(&mut self, paths: Vec<String>) {
+        let loaded: Vec<Vec<Count>> = paths
+            .par_iter()
+            .filter_map(|path| match read_counts_from_file(path) {
+                Ok(records) if !records.is_empty() => Some(records),
+                Ok(_) => None,
+                Err(e) => {
+                    warn!("Couldnt load {path}: {e}");
+                    None
+                }
+            })
+            .collect();
+
+        // Every vector is non-empty here, so indexing the first record is safe.
+        self.mrd = loaded
+            .into_iter()
+            .map(|records| (records[0].position.contig.clone(), records))
+            .collect();
+    }
+
+    /// Marks records of `data[contig]` whose positional counterpart in
+    /// `mrd[contig]` has fewer than `min_reads` reads with
+    /// `CountAnnotation::MaskedLowMRD`.
+    ///
+    /// Records are paired by index. If the two vectors differ in length only
+    /// the common prefix is examined and a warning is logged. The annotation is
+    /// added at most once per record, so calling this method repeatedly does
+    /// not inflate the totals reported by `counts_annotations`.
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from either data set.
+    pub fn mask_low_mrd(&mut self, contig: &str, min_reads: u32) -> anyhow::Result<()> {
+        if let (Some(mrd), Some(diag)) = (self.mrd.get(contig), self.data.get_mut(contig)) {
+            if mrd.len() != diag.len() {
+                warn!(
+                    "{contig}: mrd has {} records but diag has {}; only the common prefix is masked",
+                    mrd.len(),
+                    diag.len()
+                );
+            }
+            for (m, d) in mrd.iter().zip(diag) {
+                if m.n_reads < min_reads
+                    && !d.annotation.contains(&CountAnnotation::MaskedLowMRD)
+                {
+                    d.annotation.push(CountAnnotation::MaskedLowMRD);
+                }
+            }
+            Ok(())
+        } else {
+            anyhow::bail!("No {contig} in both mrd and diag.")
+        }
+    }
+
+    /// Marks records of `data[contig]` whose low-MAPQ share
+    /// `n_low_mapq / (n_reads + n_low_mapq)` is greater than `max_ratio` with
+    /// `CountAnnotation::MaskedQuality`.
+    ///
+    /// The ratio is computed in `f64`, so the denominator cannot overflow
+    /// `u32`. A record with no reads at all yields NaN, which never compares
+    /// greater than `max_ratio`, so it is left unmasked. The annotation is
+    /// added at most once per record.
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from `data`.
+    pub fn mask_low_quality(&mut self, contig: &str, max_ratio: f64) -> anyhow::Result<()> {
+        if let Some(diag) = self.data.get_mut(contig) {
+            for d in diag.iter_mut() {
+                let low = d.n_low_mapq as f64;
+                let ratio = low / (d.n_reads as f64 + low);
+                if ratio > max_ratio && !d.annotation.contains(&CountAnnotation::MaskedQuality) {
+                    d.annotation.push(CountAnnotation::MaskedQuality);
+                }
+            }
+            Ok(())
+        } else {
+            // Only the diag data set is consulted here.
+            anyhow::bail!("No {contig} in diag.")
+        }
+    }
+
+    /// Returns, for the unmasked read counts of `contig`, every distinct value
+    /// together with the number of records having that value, sorted by value.
+    /// Both members of each pair are converted to `f64`.
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from `data`.
+    pub fn frequencies(&self, contig: &str) -> anyhow::Result<Vec<(f64, f64)>> {
+        // A BTreeMap keeps the distinct values in ascending order.
+        let mut tally = BTreeMap::new();
+        for value in self.get(contig)? {
+            *tally.entry(value).or_insert(0) += 1;
+        }
+
+        Ok(tally
+            .into_iter()
+            .map(|(value, occurrences)| (value as f64, occurrences as f64))
+            .collect())
+    }
+
+    /// Returns the value found at the given percentile (0 to 100) of the
+    /// unmasked read counts of `contig`.
+    ///
+    /// The counts are sorted and the element at index
+    /// `round(percentile / 100 * (n - 1))` is returned.
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from `data`, when `percentile` is not
+    /// within `0..=100` (NaN included), or when no unmasked record remains —
+    /// the latter previously underflowed `n - 1`.
+    pub fn percentile(&self, contig: &str, percentile: f64) -> anyhow::Result<u32> {
+        anyhow::ensure!(
+            (0.0..=100.0).contains(&percentile),
+            "Percentile {percentile} is outside 0..=100"
+        );
+
+        let mut data = self.get(contig)?;
+        anyhow::ensure!(!data.is_empty(), "No unmasked counts for {contig}");
+        data.sort_unstable();
+
+        let last = data.len() - 1;
+        let index = (percentile / 100.0 * last as f64).round() as usize;
+        data.get(index).copied().context("Error in percentile")
+    }
+
+    /// Masks, summarises and plots one contig, and returns the summary.
+    ///
+    /// Steps, in order:
+    /// 1. mask records with `mask_low_mrd(contig, 6)` and
+    ///    `mask_low_quality(contig, 0.1)`;
+    /// 2. write `{prefix}_{contig}_distrib.svg`: a bar chart of the unmasked
+    ///    read-count distribution up to its 99th percentile, overlaid with a
+    ///    Normal and a Poisson curve;
+    /// 3. compute the fraction of unmasked records falling in each interval
+    ///    delimited by `breaks`;
+    /// 4. write `{prefix}_{contig}_chromosome.svg` through `svg_chromosome`;
+    /// 5. write the returned `CountsStats` as JSON to
+    ///    `{prefix}_{contig}_stats.json`.
+    ///
+    /// # Errors
+    /// Propagates errors from the masking steps, from `get`/`frequencies`/
+    /// `percentile`, from building the Normal/Poisson distributions, and from
+    /// writing the JSON file.
+    ///
+    /// # Panics
+    /// Panics when `breaks` is empty (`breaks.last().unwrap()`) and when
+    /// `svg_chromosome` returns an error.
+    pub fn save_contig(
+        &mut self,
+        contig: &str,
+        prefix: &str,
+        breaks: Vec<u32>,
+    ) -> anyhow::Result<CountsStats> {
+        self.mask_low_mrd(contig, 6)?;
+        self.mask_low_quality(contig, 0.1)?;
+
+        // Unmasked read counts only.
+        let data: Vec<f64> = self.get(contig)?.iter().map(|v| *v as f64).collect();
+        let n_final = data.len();
+
+        let frequencies = self.frequencies(contig)?;
+        let percentile_99 = self.percentile(contig, 99.0)?;
+
+        // Keep values up to the 99th percentile and turn occurrences into
+        // fractions of the unmasked records.
+        let mut data_x = Vec::new();
+        let mut data_y = Vec::new();
+        frequencies.iter().for_each(|(x, y)| {
+            if *x <= percentile_99 as f64 {
+                data_x.push(*x);
+                data_y.push(*y / n_final as f64);
+            }
+        });
+
+        // Distribution plot
+        let distribution_path = format!("{prefix}_{contig}_distrib.svg");
+        info!("Saving graph: {distribution_path}");
+        let mut plot = Plot::new();
+        // Bar colour depends on the read count: < 2, < 6, < 15, otherwise.
+        let colors: Vec<Rgb> = data_x
+            .iter()
+            .map(|&x| match x {
+                x if x < 2.0 => Rgb::new(193, 18, 31),
+                x if x < 6.0 => Rgb::new(243, 114, 44),
+                x if x < 15.0 => Rgb::new(255, 202, 58),
+                _ => Rgb::new(138, 201, 38),
+            })
+            .collect();
+
+        let bars = Bar::new(data_x.clone(), data_y.clone())
+            .show_legend(false)
+            .marker(Marker::new().color_array(colors));
+
+        plot.add_trace(bars);
+
+        let sum: f64 = data.iter().sum();
+        let mean = (&data).mean();
+        let count = data.len() as f64;
+        let std_dev = (&data).std_dev();
+
+        // Normal
+        let normal = statrs::distribution::Normal::new(mean, std_dev)?;
+        let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
+        let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
+        plot.add_trace(trace);
+
+        // // Gamma
+        // let shape = mean * mean / variance;
+        // let rate = mean / variance;
+        //
+        // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
+        // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
+        // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
+        // plot.add_trace(trace);
+
+        // Poisson
+        // lambda is the mean of the unmasked counts (sum / count).
+        let lambda = sum / count;
+        let poisson = statrs::distribution::Poisson::new(lambda)?;
+        let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
+        let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
+        plot.add_trace(trace);
+
+        plot.write_image(distribution_path, plotly::ImageFormat::SVG, 800, 600, 1.0);
+
+        // Fractions
+        // One entry per interval: "< b0", "[b(i-1) - b(i)[", and finally ">= last".
+        let mut breaks_values = Vec::new();
+        for (i, b) in breaks.iter().enumerate() {
+            if i == 0 {
+                let total: f64 = frequencies
+                    .iter()
+                    .filter(|(x, _)| *x < *b as f64)
+                    .map(|(_, y)| *y / count)
+                    .sum();
+                breaks_values.push((format!("< {b}"), total));
+            } else {
+                let last = breaks[i - 1];
+                let total: f64 = frequencies
+                    .iter()
+                    .filter(|(x, _)| *x < *b as f64 && *x >= last as f64)
+                    .map(|(_, y)| *y / count)
+                    .sum();
+
+                breaks_values.push((format!("[{last} - {b}["), total));
+            }
+        }
+
+        let last = *breaks.last().unwrap();
+        let total: f64 = frequencies
+            .iter()
+            .filter(|(x, _)| *x >= last as f64)
+            .map(|(_, y)| *y / count)
+            .sum();
+        breaks_values.push((format!(">= {last}"), total));
+
+        // Chromosome
+        // Gap tolerance handed to the ranges_* helpers below.
+        let tol = 25;
+        let chromosome_path = format!("{prefix}_{contig}_chromosome.svg");
+        info!("Saving graph: {chromosome_path}");
+
+        let target_annotations: HashSet<CountAnnotation> = vec![
+            CountAnnotation::MaskedLowMRD,
+            CountAnnotation::MaskedQuality,
+        ]
+        .into_iter()
+        .collect();
+
+        // One value per record (masked ones included): masked records are
+        // replaced by the sentinel 10_000 so the range helpers can tell them
+        // apart.
+        // NOTE(review): an unmasked record with n_reads >= 10_000 is
+        // indistinguishable from a masked one below — confirm this is acceptable.
+        let d: Vec<u32> = self
+            .data
+            .get(contig)
+            .unwrap()
+            .iter()
+            .map(|c| {
+                if c.annotation
+                    .iter()
+                    .any(|ann| target_annotations.contains(ann))
+                {
+                    10_000u32
+                } else {
+                    c.n_reads
+                }
+            })
+            .collect();
+
+        // Fraction of all records per annotation kind, plus the unmasked share.
+        let hm = self.counts_annotations(contig)?;
+        let len = d.len();
+        let mut masked: Vec<(String, f64)> = hm
+            .iter()
+            .map(|(k, v)| (format!("{:?}", k), *v as f64 / len as f64))
+            .collect();
+        masked.push(("Un masked".to_string(), n_final as f64 / len as f64));
+
+        // Record indices are multiplied by 1000 to obtain coordinates
+        // (presumably records are 1 kb bins — TODO confirm). Ranges made of a
+        // single index (end == start) are dropped.
+        // `ranges_under` is inclusive, so this selects values <= 5.
+        let under_6_rects: Vec<AdditionalRect> = ranges_under(&d, 5, tol)
+            .iter()
+            .filter(|(s, e)| e > s)
+            .map(|(start, end)| AdditionalRect {
+                start: *start as u32 * 1000,
+                end: *end as u32 * 1000,
+                color: String::from("red"),
+                position: RectPosition::Below(1),
+            })
+            .collect();
+
+        let over_6_rects: Vec<AdditionalRect> = ranges_between(&d, 6, 9999, tol)
+            .iter()
+            .filter(|(s, e)| e > s)
+            .map(|(start, end)| AdditionalRect {
+                start: *start as u32 * 1000,
+                end: *end as u32 * 1000,
+                color: String::from("green"),
+                position: RectPosition::Below(2),
+            })
+            .collect();
+
+        // Values >= 10000, i.e. the sentinel written for masked records above.
+        let masked_rec: Vec<AdditionalRect> = ranges_over(&d, 10000, tol)
+            .iter()
+            .filter(|(s, e)| e > s)
+            .map(|(start, end)| AdditionalRect {
+                start: *start as u32 * 1000,
+                end: *end as u32 * 1000,
+                color: String::from("grey"),
+                position: RectPosition::Below(0),
+            })
+            .collect();
+
+        let mut all = Vec::new();
+        all.extend(under_6_rects);
+        all.extend(over_6_rects);
+        // all.extend(over15);
+        all.extend(masked_rec);
+
+        // NOTE(review): the cytoband file path is hard-coded.
+        svg_chromosome(
+            contig,
+            1000,
+            50,
+            "/data/ref/hs1/cytoBandMapped.bed",
+            &chromosome_path,
+            &all,
+            &Vec::new(),
+        )
+        .unwrap();
+
+        let stats = CountsStats {
+            sum,
+            mean,
+            std_dev,
+            breaks_values,
+            masked,
+        };
+
+        // Save stats
+        let json_path = format!("{prefix}_{contig}_stats.json");
+        info!("Saving stats into: {json_path}");
+        let json = serde_json::to_string_pretty(&stats)?;
+        let mut file = File::create(json_path)?;
+        file.write_all(json.as_bytes())?;
+
+        Ok(stats)
+    }
+
+    /// Runs `save_contig` on every contig and writes one stacked bar chart of
+    /// the per-contig proportions to `{prefix}_proportions.svg`.
+    ///
+    /// For each contig the interval fractions returned by `save_contig` are
+    /// scaled by the unmasked share of the contig, and the masked share is
+    /// added as a `"masked"` series. Series are plotted in the order their
+    /// labels are first seen, so the stacking order is reproducible.
+    ///
+    /// The original body passed an undefined `path` to `write_image`; the
+    /// output file name is now derived from `prefix`, following the naming
+    /// used by `save_contig`.
+    ///
+    /// # Errors
+    /// Propagates any error from `save_contig` and fails if a result lacks
+    /// the `"Un masked"` entry.
+    pub fn save_contigs(
+        &mut self,
+        contigs: &Vec<String>,
+        prefix: &str,
+        breaks: Vec<u32>,
+    ) -> anyhow::Result<()> {
+        // Labels in first-seen order; HashMap iteration order is arbitrary.
+        let mut labels: Vec<String> = Vec::new();
+        let mut proportions: HashMap<String, Vec<f64>> = HashMap::new();
+
+        for contig in contigs {
+            let stat = self.save_contig(contig, prefix, breaks.clone())?;
+            let un_masked = stat
+                .masked
+                .iter()
+                .find(|(s, _)| s == "Un masked")
+                .map(|(_, v)| *v)
+                .context("No \"Un masked\" entry in contig stats")?;
+            let masked = 1.0 - un_masked;
+
+            let props = stat
+                .breaks_values
+                .iter()
+                .map(|(s, v)| (s.clone(), *v * un_masked))
+                .chain(std::iter::once(("masked".to_string(), masked)));
+
+            for (label, value) in props {
+                if !proportions.contains_key(&label) {
+                    labels.push(label.clone());
+                }
+                proportions.entry(label).or_default().push(value);
+            }
+        }
+
+        let path = format!("{prefix}_proportions.svg");
+        info!("Saving graph: {path}");
+
+        let mut plot = Plot::new();
+        for label in labels {
+            if let Some(values) = proportions.remove(&label) {
+                plot.add_trace(Bar::new(contigs.clone(), values).name(label));
+            }
+        }
+        plot.set_layout(Layout::new().bar_mode(BarMode::Stack));
+        plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
+
+        Ok(())
+    }
+
+    /// Counts how many times each annotation occurs across all records of
+    /// `contig` in `data`.
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from `data`.
+    pub fn counts_annotations(
+        &self,
+        contig: &str,
+    ) -> anyhow::Result<HashMap<CountAnnotation, u64>> {
+        let records = match self.data.get(contig) {
+            Some(records) => records,
+            None => anyhow::bail!("No {contig} in counts."),
+        };
+
+        let mut tally = HashMap::new();
+        for ann in records.iter().flat_map(|record| record.annotation.iter()) {
+            *tally.entry(ann.clone()).or_insert(0) += 1;
+        }
+        Ok(tally)
+    }
+
+    /// Returns the read counts of `contig`, in file order, leaving out every
+    /// record annotated `MaskedLowMRD` or `MaskedQuality`.
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from `data`.
+    pub fn get(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
+        let records = match self.data.get(contig) {
+            Some(records) => records,
+            None => anyhow::bail!("No {contig} in counts."),
+        };
+
+        let unmasked = records
+            .iter()
+            .filter(|record| {
+                !record.annotation.iter().any(|ann| {
+                    matches!(
+                        ann,
+                        CountAnnotation::MaskedLowMRD | CountAnnotation::MaskedQuality
+                    )
+                })
+            })
+            .map(|record| record.n_reads)
+            .collect();
+
+        Ok(unmasked)
+    }
+
+    /// Returns the read counts of every record of `contig` in the `mrd` data
+    /// set, in file order (no masking is applied).
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from `mrd`.
+    pub fn mrd(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
+        match self.mrd.get(contig) {
+            Some(records) => {
+                let reads = records.iter().map(|record| record.n_reads).collect();
+                Ok(reads)
+            }
+            None => anyhow::bail!("No {contig} in counts."),
+        }
+    }
+
+    /// Returns, for each requested percentile, the read count found at that
+    /// rank among **all** records of `contig` (masked ones included).
+    ///
+    /// Unlike `percentile`, each entry of `percentiles` is a fraction in
+    /// `0..=1`: the returned value is the element at index
+    /// `round(p * (n - 1))` of the sorted counts. A few summary figures are
+    /// also printed to standard output.
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from `data`, when it holds no record
+    /// (previously an underflow of `n - 1`), or when a percentile lies outside
+    /// `0..=1` (previously an out-of-bounds panic).
+    pub fn calculate_percentiles(
+        &self,
+        contig: &str,
+        percentiles: &[f64],
+    ) -> anyhow::Result<Vec<f64>> {
+        let ccounts = match self.data.get(contig) {
+            Some(ccounts) => ccounts,
+            None => anyhow::bail!("No {contig} in counts."),
+        };
+
+        let mut n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
+        anyhow::ensure!(!n_reads.is_empty(), "No counts for {contig}");
+        if let Some(bad) = percentiles.iter().find(|p| !(0.0..=1.0).contains(*p)) {
+            anyhow::bail!("Percentile {bad} is outside 0..=1");
+        }
+
+        n_reads.sort_unstable();
+
+        let cdf = ND::new(n_reads.clone());
+
+        // println!("CDF at 13: {:?}", cdf.cdf(13));
+        println!("Percentile at 99: {:?}", cdf.percentile(99.0));
+        println!("above 15X: {:?}", cdf.proportion_above(15));
+        // println!("above 15.1X: {:?}", cdf.fitted_proportion_above(&15.1));
+        println!("under 6X: {:?}", cdf.proportion_under(6));
+
+        // Safe: n_reads is non-empty and every p is within 0..=1.
+        let last = n_reads.len() - 1;
+        Ok(percentiles
+            .iter()
+            .map(|&p| n_reads[(p * last as f64).round() as usize] as f64)
+            .collect())
+    }
+
+    /// Builds an `ND` from the read counts of every record of `contig`
+    /// (masked ones included).
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from `data`.
+    pub fn nd_reads(&self, contig: &str) -> anyhow::Result<ND> {
+        match self.data.get(contig) {
+            Some(records) => {
+                let reads: Vec<u32> = records.iter().map(|record| record.n_reads).collect();
+                Ok(ND::new(reads))
+            }
+            None => anyhow::bail!("No {contig} in counts"),
+        }
+    }
+
+    /// Builds an `ND` from the unmasked read counts of `contig` (see `get`).
+    ///
+    /// # Errors
+    /// Fails when `contig` is missing from `data`.
+    pub fn distribution(&self, contig: &str) -> anyhow::Result<ND> {
+        let unmasked = self.get(contig)?;
+        Ok(ND::new(unmasked))
+    }
+
+    /// Placeholder: currently does nothing and always returns `Ok(())`.
+    pub fn save_stats(&self) -> anyhow::Result<()> {
+        Ok(())
+    }
+
+    /// Writes to `path` a stacked bar chart with one bar per contig and one
+    /// series per interval delimited by `breaks`, using
+    /// `ND::frequencies` on the unmasked counts of each contig.
+    ///
+    /// The contig list is also printed to standard output.
+    ///
+    /// # Panics
+    /// Panics when `breaks` is empty, when a contig is missing from `data`,
+    /// or when `contigs` is empty (`transpose` rejects an empty matrix).
+    pub fn save_global_proportions_graph(
+        &self,
+        path: &str,
+        contigs: &Vec<String>,
+        breaks: Vec<u32>,
+    ) {
+        // Series labels: "< b0", then "[b(i-1) - b(i)[", then ">= last".
+        let mut breaks_str: Vec<String> = Vec::with_capacity(breaks.len() + 1);
+        if let Some(b) = breaks.first() {
+            breaks_str.push(format!("< {b}"));
+        }
+        for pair in breaks.windows(2) {
+            let (last, b) = (pair[0], pair[1]);
+            breaks_str.push(format!("[{last} - {b}["));
+        }
+        breaks_str.push(format!(">= {}", breaks.last().unwrap()));
+
+        // One row of proportions per contig.
+        let proportions: Vec<Vec<f64>> = contigs
+            .iter()
+            .map(|contig| ND::new(self.get(contig).unwrap()).frequencies(&breaks))
+            .collect();
+
+        let mut plot = Plot::new();
+        // After transposition there is one row per interval.
+        for (i, series) in transpose(proportions).iter().enumerate() {
+            let trace = Bar::new(contigs.clone(), series.to_vec()).name(&breaks_str[i]);
+            plot.add_trace(trace);
+        }
+        println!("{:?}", contigs);
+        plot.set_layout(Layout::new().bar_mode(BarMode::Stack));
+        plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
+    }
+
+    /// Writes to `path` the read-count distribution of the unmasked records of
+    /// all `contigs` pooled together, overlaid with Gaussian, Gamma and
+    /// Poisson curves, and prints `nd.frequencies(&[1, 6, 15])` to standard
+    /// output.
+    ///
+    /// # Panics
+    /// Panics when a contig is missing from `data`, when the 99th percentile
+    /// cannot be computed, or when the Gamma/Poisson parameters are rejected
+    /// by `statrs`.
+    pub fn save_global_distribution_graph(&self, path: &str, contigs: &Vec<String>) {
+        // Pool the unmasked counts of every requested contig.
+        let d: Vec<u32> = contigs.iter().flat_map(|c| self.get(c).unwrap()).collect();
+        // NOTE(review): `data_sorted` is never read afterwards.
+        let mut data_sorted = d.clone();
+        data_sorted.sort_unstable();
+
+        let nd = ND::new(d.clone());
+        let mut plot = Plot::new();
+
+        // One bar per integer read count from 1 up to the 99th percentile.
+        let bar_x: Vec<u32> = (1..=nd.percentile(99.0).unwrap()).collect();
+        let colors: Vec<plotly::color::Rgb> = bar_x
+            .iter()
+            .map(|&x| {
+                if x <= 2 {
+                    plotly::color::Rgb::new(193, 18, 31)
+                } else if x >= 15 {
+                    plotly::color::Rgb::new(138, 201, 38)
+                } else if x <= 6 {
+                    plotly::color::Rgb::new(243, 114, 44)
+                } else {
+                    plotly::color::Rgb::new(255, 202, 58)
+                }
+            })
+            .collect();
+
+        // Zero counts are excluded from the bar heights' denominator.
+        let data: Vec<u32> = d.iter().filter(|x| **x >= 1).copied().collect();
+
+        // frequencies
+        let mut frequencies = HashMap::new();
+        for &value in &data {
+            *frequencies.entry(value).or_insert(0) += 1;
+        }
+
+        let bars = Bar::new(
+            bar_x.clone(),
+            bar_x
+                .iter()
+                .map(|x| *frequencies.get(x).unwrap_or(&0) as f64 / data.len() as f64)
+                .collect(),
+        )
+        .show_legend(false)
+        .marker(Marker::new().color_array(colors));
+
+        plot.add_trace(bars);
+
+        // 100 evenly spaced x values from 0 to the 99th percentile for the curves.
+        let data_x = generate_range(0.0, nd.percentile(99.0).unwrap().into(), 100);
+        let data_y: Vec<f64> = data_x.iter().map(|x| nd.fitted_normal.pdf(x)).collect();
+        let trace = Scatter::new(data_x.clone(), data_y).name("Gaussian");
+        plot.add_trace(trace);
+
+        // Gamma
+        // Here zero counts are included again (all of `d`).
+        let data: Vec<f64> = d.iter().map(|x| *x as f64).collect();
+
+        let sum: f64 = data.iter().sum();
+        let mean = (&data).mean();
+        let variance = (&data).variance();
+        let count = d.len() as f64;
+        // Parameters derived from the sample mean and variance.
+        let shape = mean * mean / variance;
+        let rate = mean / variance;
+
+        let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
+        let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
+        let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
+        plot.add_trace(trace);
+
+        // Poisson
+        // lambda is the sample mean; x is truncated to an integer for the pmf.
+        let lambda = sum / count;
+        let poisson = statrs::distribution::Poisson::new(lambda).unwrap();
+        let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
+        let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
+        plot.add_trace(trace);
+
+        plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
+        println!("> 15x: {:?}", nd.frequencies(&vec![1, 6, 15]));
+    }
+}
+
+/// Empirical distribution of a set of read counts, built by `ND::new`.
+pub struct ND {
+    /// The input values, sorted ascending.
+    pub data: Vec<u32>,
+    /// Cumulative probability stored per distinct value (filled by `ND::new`).
+    pub distribution: BTreeMap<u32, f64>,
+    /// Number of values in `data`.
+    pub total_count: usize,
+    /// Number of occurrences of each distinct value.
+    pub frequency: HashMap<u32, usize>,
+    /// Normal distribution fitted by maximum likelihood on the values >= 1.
+    pub fitted_normal: UvNormal,
+}
+
+use rstat::{fitting::MLE, normal::UvNormal, ContinuousDistribution};
+
+impl ND {
+    /// Builds the empirical distribution of `data`.
+    ///
+    /// The values are sorted, their occurrences counted, and the cumulative
+    /// probability of each distinct value stored in `distribution`. The
+    /// cumulative sum is taken in ascending value order; the original code
+    /// accumulated while iterating a `HashMap`, whose order is arbitrary, so
+    /// the stored probabilities were not a valid CDF.
+    ///
+    /// A normal distribution is also fitted by maximum likelihood on the
+    /// values that are at least 1.
+    ///
+    /// # Panics
+    /// Panics if the maximum-likelihood fit fails.
+    fn new(mut data: Vec<u32>) -> Self {
+        data.sort_unstable();
+        let n = data.len();
+        info!("n values {n}");
+
+        let mut frequency: HashMap<u32, usize> = HashMap::new();
+        for &value in &data {
+            *frequency.entry(value).or_insert(0) += 1;
+        }
+
+        // Accumulate over the distinct values in ascending order.
+        let mut ordered: Vec<(u32, usize)> = frequency.iter().map(|(&v, &c)| (v, c)).collect();
+        ordered.sort_unstable_by_key(|&(value, _)| value);
+
+        let mut distribution = BTreeMap::new();
+        let mut cumulative_count = 0usize;
+        for (value, count) in ordered {
+            cumulative_count += count;
+            distribution.insert(value, cumulative_count as f64 / n as f64);
+        }
+
+        // Fit normal distribution
+        let fitted_normal = rstat::univariate::normal::Normal::fit_mle(
+            &data
+                .iter()
+                .filter(|x| *x >= &1u32)
+                .map(|x| *x as f64)
+                .collect::<Vec<f64>>(),
+        )
+        .unwrap();
+
+        Self {
+            data,
+            distribution,
+            frequency,
+            total_count: n,
+            fitted_normal,
+        }
+    }
+
+    /// Number of occurrences of `x` in the data, `0` when `x` never occurs.
+    pub fn frequency(&self, x: u32) -> usize {
+        self.frequency.get(&x).copied().unwrap_or(0)
+    }
+
+    /// Returns the proportion of values falling in each interval delimited by
+    /// `breaks`: below the first break, between consecutive breaks, and
+    /// finally between the last break and the 99th percentile.
+    ///
+    /// The result has `breaks.len() + 1` entries. Values at or above the 99th
+    /// percentile are not counted in the last entry.
+    ///
+    /// # Panics
+    /// Panics when the 99th percentile cannot be computed.
+    pub fn frequencies(&self, breaks: &Vec<u32>) -> Vec<f64> {
+        // Proportion already attributed to earlier intervals.
+        let mut covered = 0.0;
+        let mut res: Vec<f64> = breaks
+            .iter()
+            .map(|&brk| {
+                let share = self.proportion_under(brk) - covered;
+                covered += share;
+                share
+            })
+            .collect();
+
+        let per99 = self.percentile(99.0).unwrap();
+        res.push(self.proportion_under(per99) - covered);
+        res
+    }
+
+    /// Returns the value at the given percentile (0 to 100) of the sorted
+    /// data, i.e. the element at index `round(percentile / 100 * (n - 1))`.
+    ///
+    /// Returns `None` when `percentile` is outside `0..=100` or when there is
+    /// no data (the latter previously underflowed `n - 1`).
+    pub fn percentile(&self, percentile: f64) -> Option<u32> {
+        if !(0.0..=100.0).contains(&percentile) {
+            return None;
+        }
+
+        // `None` for an empty distribution instead of an underflow.
+        let last = self.total_count.checked_sub(1)?;
+        let index = (percentile / 100.0 * last as f64).round() as usize;
+        self.data.get(index).copied()
+    }
+
+    /// Proportion of values strictly smaller than `x`.
+    pub fn proportion_under(&self, x: u32) -> f64 {
+        let mut below = 0usize;
+        for (&value, &occurrences) in &self.frequency {
+            if value < x {
+                below += occurrences;
+            }
+        }
+        below as f64 / self.total_count as f64
+    }
+
+    /// Proportion of values strictly greater than `x`.
+    pub fn proportion_above(&self, x: u32) -> f64 {
+        let mut above = 0usize;
+        for (&value, &occurrences) in &self.frequency {
+            if value > x {
+                above += occurrences;
+            }
+        }
+        above as f64 / self.total_count as f64
+    }
+}
+
+/// Returns `steps` evenly spaced values from `start` to `end`, both included.
+///
+/// `steps == 0` gives an empty vector and `steps == 1` gives `[start]`.
+pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
+    match steps {
+        0 => Vec::new(),
+        1 => vec![start],
+        _ => {
+            let step_size = (end - start) / (steps - 1) as f64;
+            let mut values = Vec::with_capacity(steps);
+            for i in 0..steps {
+                values.push(start + i as f64 * step_size);
+            }
+            values
+        }
+    }
+}
+
+use rayon::prelude::*;
+
+/// Index ranges of `vec` whose values are less than **or equal to** `x`,
+/// as computed by `get_ranges_parallel` with the given gap `tolerance`.
+pub fn ranges_under(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
+    let at_or_below = |value: u32, limit: u32| value <= limit;
+    get_ranges_parallel(vec, x, tolerance, at_or_below)
+}
+
+/// Index ranges of `vec` whose values are greater than **or equal to** `x`,
+/// as computed by `get_ranges_parallel` with the given gap `tolerance`.
+pub fn ranges_over(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
+    let at_or_above = |value: u32, limit: u32| value >= limit;
+    get_ranges_parallel(vec, x, tolerance, at_or_above)
+}
+
+/// Index ranges of `vec` whose values lie within `lower..=upper` (both bounds
+/// included), as computed by `get_ranges_parallel` with the given gap
+/// `tolerance`.
+pub fn ranges_between(
+    vec: &[u32],
+    lower: u32,
+    upper: u32,
+    tolerance: usize,
+) -> Vec<(usize, usize)> {
+    let within = |value: u32, (lo, hi): (u32, u32)| (lo..=hi).contains(&value);
+    get_ranges_parallel(vec, (lower, upper), tolerance, within)
+}
+
+/// Finds the index ranges of `vec` whose values satisfy
+/// `compare(value, threshold)`.
+///
+/// Returns inclusive `(start, end)` index pairs in ascending order. Two
+/// matching indices belong to the same range when at most `tolerance`
+/// non-matching values separate them.
+///
+/// The slice is scanned in parallel chunks. Ranges found in neighbouring
+/// chunks are merged afterwards using the same tolerance rule, so the result
+/// no longer depends on the number of threads; previously a run crossing a
+/// chunk boundary was reported as two separate ranges.
+pub fn get_ranges_parallel<T, F>(
+    vec: &[u32],
+    threshold: T,
+    tolerance: usize,
+    compare: F,
+) -> Vec<(usize, usize)>
+where
+    F: Fn(u32, T) -> bool + Sync,
+    T: Copy + Sync,
+{
+    if vec.is_empty() {
+        return Vec::new();
+    }
+
+    let chunk_size = (vec.len() / rayon::current_num_threads()).max(1);
+    let per_chunk: Vec<(usize, usize)> = vec
+        .par_chunks(chunk_size)
+        .enumerate()
+        .flat_map(|(chunk_index, chunk)| {
+            let mut local_ranges = Vec::new();
+            let mut current_range: Option<(usize, usize)> = None;
+            let offset = chunk_index * chunk_size;
+
+            for (i, &val) in chunk.iter().enumerate() {
+                if !compare(val, threshold) {
+                    continue;
+                }
+                let global_index = offset + i;
+                current_range = match current_range {
+                    // Close enough to the open range: extend it.
+                    Some((start, end)) if global_index <= end + tolerance + 1 => {
+                        Some((start, global_index))
+                    }
+                    // Gap too large: close the open range and start a new one.
+                    Some(closed) => {
+                        local_ranges.push(closed);
+                        Some((global_index, global_index))
+                    }
+                    None => Some((global_index, global_index)),
+                };
+            }
+
+            if let Some(range) = current_range {
+                local_ranges.push(range);
+            }
+
+            local_ranges
+        })
+        .collect();
+
+    // Stitch together ranges that were split by a chunk boundary. Ranges from
+    // the same chunk are already further apart than the tolerance.
+    let mut merged: Vec<(usize, usize)> = Vec::with_capacity(per_chunk.len());
+    for (start, end) in per_chunk {
+        match merged.last_mut() {
+            Some(previous) if start <= previous.1 + tolerance + 1 => previous.1 = end,
+            _ => merged.push((start, end)),
+        }
+    }
+    merged
+}
+
+/// Transposes a row-major matrix: element `j` of input row `i` becomes
+/// element `i` of output row `j`.
+///
+/// The number of output rows is the length of the first input row. An empty
+/// input yields an empty output (it used to panic).
+///
+/// # Panics
+/// Panics if a row is longer than the first one.
+pub fn transpose(v: Vec<Vec<f64>>) -> Vec<Vec<f64>> {
+    let width = match v.first() {
+        Some(row) => row.len(),
+        None => return Vec::new(),
+    };
+
+    // Built with `map` rather than `vec![..; width]`: cloning an empty Vec
+    // does not keep its reserved capacity.
+    let height = v.len();
+    let mut result: Vec<Vec<f64>> = (0..width).map(|_| Vec::with_capacity(height)).collect();
+
+    for row in v {
+        for (i, val) in row.into_iter().enumerate() {
+            result[i].push(val);
+        }
+    }
+    result
+}
+
+/// Summary of one contig as produced by `Counts::save_contig` and serialized
+/// to the `*_stats.json` file.
+#[derive(Debug, Serialize)]
+pub struct CountsStats {
+    /// Sum of the unmasked read counts.
+    pub sum: f64,
+    /// Mean of the unmasked read counts.
+    pub mean: f64,
+    /// Standard deviation of the unmasked read counts.
+    pub std_dev: f64,
+    /// `(label, fraction)` per read-count interval; fractions are relative to
+    /// the number of unmasked records.
+    pub breaks_values: Vec<(String, f64)>,
+    /// `(label, fraction)` per annotation kind plus an `"Un masked"` entry;
+    /// fractions are relative to all records of the contig.
+    pub masked: Vec<(String, f64)>,
+}
+
+// pub fn save_barplota
+//     data: Vec<f64>,
+//     data_x: Vec<f64>,
+//     data_y: Vec<f64>,
+//     path: &str,
+// ) -> anyhow::Result<CountsStats> {
+//     let mut plot = Plot::new();
+//
+//     let colors: Vec<plotly::color::Rgb> = data_x
+//         .iter()
+//         .map(|&x| {
+//             if x <= 2.0 {
+//                 plotly::color::Rgb::new(193, 18, 31)
+//             } else if x >= 15.0 {
+//                 plotly::color::Rgb::new(138, 201, 38)
+//             } else if x <= 6.0 {
+//                 plotly::color::Rgb::new(243, 114, 44)
+//             } else {
+//                 plotly::color::Rgb::new(255, 202, 58)
+//             }
+//         })
+//         .collect();
+//
+//     let bars = Bar::new(data_x.clone(), data_y.clone())
+//         .show_legend(false)
+//         .marker(Marker::new().color_array(colors));
+//
+//     plot.add_trace(bars);
+//
+//     let sum: f64 = data.iter().sum();
+//     let mean = (&data).mean();
+//     let count = data.len() as f64;
+//     let std_dev = (&data).std_dev();
+//     println!("mean {mean}");
+//
+//     // Normal
+//     let normal = statrs::distribution::Normal::new(mean, std_dev)?;
+//     let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
+//     let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
+//     plot.add_trace(trace);
+//
+//     // // Gamma
+//     // let shape = mean * mean / variance;
+//     // let rate = mean / variance;
+//     //
+//     // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
+//     // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
+//     // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
+//     // plot.add_trace(trace);
+//
+//     // Poisson
+//     let lambda = sum / count;
+//     let poisson = statrs::distribution::Poisson::new(lambda)?;
+//     let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
+//     let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
+//     plot.add_trace(trace);
+//
+//     plot.use_local_plotly();
+//     plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
+//     Ok(CountsStats { sum, mean, std_dev })
+// }

+ 12 - 228
Cargo.lock

@@ -116,9 +116,9 @@ dependencies = [
 
 [[package]]
 name = "anyhow"
-version = "1.0.88"
+version = "1.0.89"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e1496f8fb1fbf272686b8d37f523dab3e4a7443300055e74cdaa449f3114356"
+checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6"
 
 [[package]]
 name = "approx"
@@ -144,32 +144,12 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
-[[package]]
-name = "atomic-polyfill"
-version = "1.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4"
-dependencies = [
- "critical-section",
-]
-
 [[package]]
 name = "autocfg"
 version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
 
-[[package]]
-name = "average"
-version = "0.15.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a237a6822e1c3c98e700b6db5b293eb341b7524dcb8d227941245702b7431dc"
-dependencies = [
- "easy-cast",
- "float-ord",
- "num-traits",
-]
-
 [[package]]
 name = "backtrace"
 version = "0.3.74"
@@ -248,16 +228,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "bstr"
-version = "1.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c"
-dependencies = [
- "memchr",
- "serde",
-]
-
 [[package]]
 name = "bumpalo"
 version = "3.16.0"
@@ -276,12 +246,6 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
-[[package]]
-name = "bytes"
-version = "1.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50"
-
 [[package]]
 name = "bzip2"
 version = "0.4.4"
@@ -305,9 +269,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.1.18"
+version = "1.1.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b62ac837cdb5cb22e10a256099b4fc502b1dfe560cb282963a974d7abd80e476"
+checksum = "07b1695e2c7e8fc85310cde85aeaab7e3097f593c91d209d3f9df76c928100f0"
 dependencies = [
  "jobserver",
  "libc",
@@ -372,12 +336,6 @@ dependencies = [
  "cc",
 ]
 
-[[package]]
-name = "cobs"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15"
-
 [[package]]
 name = "colorchoice"
 version = "1.0.2"
@@ -442,12 +400,6 @@ dependencies = [
  "cfg-if",
 ]
 
-[[package]]
-name = "critical-section"
-version = "1.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f64009896348fc5af4222e9cf7d7d82a95a256c634ebcf61c53e4ea461422242"
-
 [[package]]
 name = "crossbeam-channel"
 version = "0.5.13"
@@ -688,33 +640,12 @@ version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125"
 
-[[package]]
-name = "easy-cast"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10936778145f3bea71fd9bf61332cce28c28e96a380714f7ab34838b80733fd6"
-dependencies = [
- "libm",
-]
-
 [[package]]
 name = "either"
 version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 
-[[package]]
-name = "embedded-io"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced"
-
-[[package]]
-name = "embedded-io"
-version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d"
-
 [[package]]
 name = "encode_unicode"
 version = "0.3.6"
@@ -792,12 +723,6 @@ dependencies = [
  "miniz_oxide",
 ]
 
-[[package]]
-name = "float-ord"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ce81f49ae8a0482e4c55ea62ebbd7e5a686af544c00b9d090bba3ff9be97b3d"
-
 [[package]]
 name = "fnv"
 version = "1.0.7"
@@ -866,15 +791,6 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
-[[package]]
-name = "hash32"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67"
-dependencies = [
- "byteorder",
-]
-
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -887,20 +803,6 @@ version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
 
-[[package]]
-name = "heapless"
-version = "0.7.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f"
-dependencies = [
- "atomic-polyfill",
- "hash32",
- "rustc_version 0.4.1",
- "serde",
- "spin",
- "stable_deref_trait",
-]
-
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -956,9 +858,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
 
 [[package]]
 name = "iana-time-zone"
-version = "0.1.60"
+version = "0.1.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
+checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220"
 dependencies = [
  "android_system_properties",
  "core-foundation-sys",
@@ -1320,7 +1222,7 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec"
 dependencies = [
- "rustc_version 0.1.7",
+ "rustc_version",
 ]
 
 [[package]]
@@ -1333,40 +1235,6 @@ dependencies = [
  "minimal-lexical",
 ]
 
-[[package]]
-name = "noodles-bgzf"
-version = "0.32.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b2fba0f4a64cc897d9396d730a0c444d148daed7de31ad5904ecc673178fc9d"
-dependencies = [
- "byteorder",
- "bytes",
- "crossbeam-channel",
- "flate2",
-]
-
-[[package]]
-name = "noodles-core"
-version = "0.15.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5a8c6b020d1205abef2b0fab4463a6c5ecc3c8f4d561ca8b0d1a42323376200"
-dependencies = [
- "bstr",
-]
-
-[[package]]
-name = "noodles-fasta"
-version = "0.41.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7a1662ac3ace299515c982a322e378bbeb4c1bd90fb098d823ef0f3a6abcc00"
-dependencies = [
- "bstr",
- "bytes",
- "memchr",
- "noodles-bgzf",
- "noodles-core",
-]
-
 [[package]]
 name = "num"
 version = "0.2.1"
@@ -1552,25 +1420,7 @@ dependencies = [
 [[package]]
 name = "pandora_lib_graph"
 version = "0.1.0"
-source = "git+https://git.t0m4.fr/Thomas/pandora_lib_graph.git#87ea889f2c132fc0c162cf01fb2383a0667c4ef1"
-dependencies = [
- "pandora_lib_scan 0.1.0 (git+https://git.t0m4.fr/Thomas/pandora_lib_scan.git)",
-]
-
-[[package]]
-name = "pandora_lib_pileup"
-version = "0.1.0"
-source = "git+https://git.t0m4.fr/Thomas/pandora_lib_pileup.git#381addffe723a828effcb5aabc00ada14586ba5f"
-dependencies = [
- "anyhow",
- "average",
- "env_logger",
- "log",
- "noodles-fasta",
- "rayon",
- "rust-htslib",
- "uuid",
-]
+source = "git+https://git.t0m4.fr/Thomas/pandora_lib_graph.git#1e08bc3af3d4d899b925f05421874da1fa5025ae"
 
 [[package]]
 name = "pandora_lib_scan"
@@ -1581,16 +1431,14 @@ dependencies = [
  "csv",
  "dashmap",
  "env_logger",
- "flate2",
+ "indexmap 2.5.0",
  "indicatif",
  "indicatif-log-bridge",
  "log",
  "num-format",
  "ordered-float",
  "pandora_lib_graph",
- "pandora_lib_pileup",
  "plotly",
- "postcard",
  "rand 0.8.5",
  "rayon",
  "rstat",
@@ -1601,33 +1449,6 @@ dependencies = [
  "uuid",
 ]
 
-[[package]]
-name = "pandora_lib_scan"
-version = "0.1.0"
-source = "git+https://git.t0m4.fr/Thomas/pandora_lib_scan.git#6daa641b3b51905189137d4653bb8602421ec5f0"
-dependencies = [
- "anyhow",
- "crossbeam-channel",
- "csv",
- "dashmap",
- "env_logger",
- "flate2",
- "indicatif",
- "indicatif-log-bridge",
- "log",
- "num-format",
- "ordered-float",
- "pandora_lib_pileup",
- "plotly",
- "postcard",
- "rayon",
- "rust-htslib",
- "serde",
- "serde_json",
- "statrs",
- "uuid",
-]
-
 [[package]]
 name = "parking_lot"
 version = "0.12.3"
@@ -1730,19 +1551,6 @@ version = "1.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265"
 
-[[package]]
-name = "postcard"
-version = "1.0.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f7f0a8d620d71c457dd1d47df76bb18960378da56af4527aaa10f515eee732e"
-dependencies = [
- "cobs",
- "embedded-io 0.4.0",
- "embedded-io 0.6.1",
- "heapless",
- "serde",
-]
-
 [[package]]
 name = "powerfmt"
 version = "0.2.0"
@@ -2053,16 +1861,7 @@ version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084"
 dependencies = [
- "semver 0.1.20",
-]
-
-[[package]]
-name = "rustc_version"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
-dependencies = [
- "semver 1.0.23",
+ "semver",
 ]
 
 [[package]]
@@ -2098,12 +1897,6 @@ version = "0.1.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac"
 
-[[package]]
-name = "semver"
-version = "1.0.23"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
-
 [[package]]
 name = "serde"
 version = "1.0.210"
@@ -2238,15 +2031,6 @@ dependencies = [
  "cc",
 ]
 
-[[package]]
-name = "spin"
-version = "0.9.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
-dependencies = [
- "lock_api",
-]
-
 [[package]]
 name = "stable_deref_trait"
 version = "1.2.0"
@@ -2425,9 +2209,9 @@ checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
 
 [[package]]
 name = "unicode-normalization"
-version = "0.1.23"
+version = "0.1.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
+checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
 dependencies = [
  "tinyvec",
 ]

+ 1 - 3
Cargo.toml

@@ -13,12 +13,9 @@ num-format = "0.4.4"
 rayon = "1.10.0"
 rust-htslib = "0.47.0"
 uuid = { version = "1.10.0", features = ["v4"] }
-pandora_lib_pileup = { git = "https://git.t0m4.fr/Thomas/pandora_lib_pileup.git" }
 pandora_lib_graph = { git = "https://git.t0m4.fr/Thomas/pandora_lib_graph.git" }
 indicatif-log-bridge = "0.2.2"
 serde = { version = "1.0.*", default-features = false }
-postcard = { version = "1.0.8", features = ["alloc"] }
-flate2 = "1.0.30"
 csv = "1.3.0"
 dashmap = { version = "6.0.1", features = ["rayon"] }
 serde_json = "1.0.128"
@@ -27,4 +24,5 @@ statrs = "0.17.1"
 plotly = { version = "0.9.1", features = ["kaleido"] }
 rstat = "0.6.0"
 rand = "0.8.5"
+indexmap = "2.5.0"
 

+ 56 - 336
src/counts.rs

@@ -1,9 +1,8 @@
 use anyhow::Context;
+use indexmap::IndexMap;
 use log::{info, warn};
-use ordered_float::Float;
 use pandora_lib_graph::cytoband::{svg_chromosome, AdditionalRect, RectPosition};
 use plotly::{color::Rgb, common::Marker, layout::BarMode, Bar, Layout, Plot, Scatter};
-use rand::{thread_rng, Rng};
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use serde::{
     de::{self, Visitor},
@@ -14,7 +13,7 @@ use statrs::{
     statistics::Statistics,
 };
 use std::{
-    collections::{BTreeMap, HashMap, HashSet},
+    collections::{HashMap, HashSet},
     f64, fmt,
     fs::File,
     io::{BufRead, BufReader, Write},
@@ -130,6 +129,7 @@ pub fn read_counts_from_file(filename: &str) -> anyhow::Result<Vec<Count>> {
 
     Ok(counts)
 }
+
 fn escape_control_chars(s: &str) -> String {
     s.chars()
         .map(|c| {
@@ -250,7 +250,7 @@ impl Counts {
         contig: &str,
         prefix: &str,
         breaks: Vec<u32>,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<CountsStats> {
         self.mask_low_mrd(contig, 6)?;
         self.mask_low_quality(contig, 0.1)?;
 
@@ -316,7 +316,6 @@ impl Counts {
         let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
         plot.add_trace(trace);
 
-        plot.use_local_plotly();
         plot.write_image(distribution_path, plotly::ImageFormat::SVG, 800, 600, 1.0);
 
         // Fractions
@@ -347,7 +346,7 @@ impl Counts {
             .filter(|(x, _)| *x >= last as f64)
             .map(|(_, y)| *y / count)
             .sum();
-        breaks_values.push((format!(">= {last}"), total));
+        breaks_values.push((format!(" {last}"), total));
 
         // Chromosome
         let tol = 25;
@@ -451,7 +450,7 @@ impl Counts {
         let mut file = File::create(json_path)?;
         file.write_all(json.as_bytes())?;
 
-        Ok(())
+        Ok(stats)
     }
 
     pub fn save_contigs(
@@ -460,9 +459,58 @@ impl Counts {
         prefix: &str,
         breaks: Vec<u32>,
     ) -> anyhow::Result<()> {
+        let mut stats = Vec::new();
+        let mut proportions = IndexMap::new();
         for contig in contigs {
-            self.save_contig(contig, prefix, breaks.clone())?;
+            let stat = self.save_contig(contig, prefix, breaks.clone())?;
+            let un_masked: Vec<&(String, f64)> = stat
+                .masked
+                .iter()
+                .filter(|(s, _)| s == "Un masked")
+                .collect();
+            let un_masked = un_masked.first().unwrap().1;
+            let masked = 1.0 - un_masked;
+            let mut props: Vec<(String, f64)> = stat
+                .breaks_values
+                .iter()
+                .map(|(s, v)| (s.to_string(), *v * un_masked))
+                .collect();
+            props.push(("masked".to_string(), masked));
+            props.iter().for_each(|(s, v)| {
+                proportions.entry(s.to_string()).or_insert(vec![]).push(*v);
+            });
+            stats.push(stat);
+        }
+
+        let mut plot = Plot::new();
+        let layout = Layout::new().bar_mode(BarMode::Stack);
+        let colors = pandora_lib_graph::theme::Colors::default();
+
+        let colors = [
+            colors.get("dark_red"),
+            colors.get("red"),
+            colors.get("yellow"),
+            colors.get("green"),
+            colors.get("grey"),
+        ];
+
+        let mut contigs_rev = contigs.clone();
+        contigs_rev.reverse();
+        for (i, (k, v)) in proportions.iter().enumerate() {
+            let mut v = v.to_vec();
+            v.reverse();
+            plot.add_trace(
+                Bar::new(v, contigs_rev.clone())
+                    // Bar::new(contigs.clone(), v.to_vec())
+                    .orientation(plotly::common::Orientation::Horizontal)
+                    .name(k)
+                    .marker(Marker::new().color(colors[i].to_string())),
+            );
         }
+        plot.set_layout(layout);
+        let global_path = format!("{prefix}_global.svg");
+        plot.write_image(global_path, plotly::ImageFormat::SVG, 1000, 600, 1.0);
+
         Ok(())
     }
 
@@ -514,259 +562,6 @@ impl Counts {
             anyhow::bail!("No {contig} in counts.")
         }
     }
-
-    pub fn calculate_percentiles(
-        &self,
-        contig: &str,
-        percentiles: &[f64],
-    ) -> anyhow::Result<Vec<f64>> {
-        if let Some(ccounts) = self.data.get(contig) {
-            let mut n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
-
-            n_reads.sort_unstable();
-
-            let cdf = ND::new(n_reads.clone());
-
-            // println!("CDF at 13: {:?}", cdf.cdf(13));
-            println!("Percentile at 99: {:?}", cdf.percentile(99.0));
-            println!("above 15X: {:?}", cdf.proportion_above(15));
-            // println!("above 15.1X: {:?}", cdf.fitted_proportion_above(&15.1));
-            println!("under 6X: {:?}", cdf.proportion_under(6));
-
-            Ok(percentiles
-                .iter()
-                .map(|&p| {
-                    let index = (p * (n_reads.len() - 1) as f64).round() as usize;
-                    n_reads[index] as f64
-                })
-                .collect())
-        } else {
-            anyhow::bail!("No {contig} in counts.")
-        }
-    }
-
-    pub fn nd_reads(&self, contig: &str) -> anyhow::Result<ND> {
-        if let Some(ccounts) = self.data.get(contig) {
-            Ok(ND::new(ccounts.iter().map(|c| c.n_reads).collect()))
-        } else {
-            anyhow::bail!("No {contig} in counts")
-        }
-    }
-
-    pub fn distribution(&self, contig: &str) -> anyhow::Result<ND> {
-        Ok(ND::new(self.get(contig)?))
-    }
-
-    pub fn save_stats(&self) -> anyhow::Result<()> {
-        Ok(())
-    }
-
-    pub fn save_global_proportions_graph(
-        &self,
-        path: &str,
-        contigs: &Vec<String>,
-        breaks: Vec<u32>,
-    ) {
-        let mut breaks_str = Vec::new();
-        for (i, b) in breaks.iter().enumerate() {
-            if i == 0 {
-                breaks_str.push(format!("< {b}"));
-            } else {
-                let last = breaks[i - 1];
-                breaks_str.push(format!("[{last} - {b}["))
-            }
-        }
-        breaks_str.push(format!(">= {}", breaks.last().unwrap()));
-
-        let mut proportions = Vec::new();
-
-        for contig in contigs.iter() {
-            let d = self.get(contig).unwrap();
-            let nd = ND::new(d);
-            proportions.push(nd.frequencies(&breaks));
-        }
-
-        let mut plot = Plot::new();
-        let layout = Layout::new().bar_mode(BarMode::Stack);
-        for (i, v) in transpose(proportions).iter().enumerate() {
-            plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(&breaks_str[i]));
-        }
-        println!("{:?}", contigs);
-        plot.set_layout(layout);
-        plot.use_local_plotly();
-        plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
-    }
-
-    pub fn save_global_distribution_graph(&self, path: &str, contigs: &Vec<String>) {
-        let d: Vec<u32> = contigs.iter().flat_map(|c| self.get(c).unwrap()).collect();
-        let mut data_sorted = d.clone();
-        data_sorted.sort_unstable();
-
-        let nd = ND::new(d.clone());
-        let mut plot = Plot::new();
-
-        let bar_x: Vec<u32> = (1..=nd.percentile(99.0).unwrap()).collect();
-        let colors: Vec<plotly::color::Rgb> = bar_x
-            .iter()
-            .map(|&x| {
-                if x <= 2 {
-                    plotly::color::Rgb::new(193, 18, 31)
-                } else if x >= 15 {
-                    plotly::color::Rgb::new(138, 201, 38)
-                } else if x <= 6 {
-                    plotly::color::Rgb::new(243, 114, 44)
-                } else {
-                    plotly::color::Rgb::new(255, 202, 58)
-                }
-            })
-            .collect();
-
-        let data: Vec<u32> = d.iter().filter(|x| **x >= 1).copied().collect();
-
-        // frequencies
-        let mut frequencies = HashMap::new();
-        for &value in &data {
-            *frequencies.entry(value).or_insert(0) += 1;
-        }
-
-        let bars = Bar::new(
-            bar_x.clone(),
-            bar_x
-                .iter()
-                .map(|x| *frequencies.get(x).unwrap_or(&0) as f64 / data.len() as f64)
-                .collect(),
-        )
-        .show_legend(false)
-        .marker(Marker::new().color_array(colors));
-
-        plot.add_trace(bars);
-
-        let data_x = generate_range(0.0, nd.percentile(99.0).unwrap().into(), 100);
-        let data_y: Vec<f64> = data_x.iter().map(|x| nd.fitted_normal.pdf(x)).collect();
-        let trace = Scatter::new(data_x.clone(), data_y).name("Gaussian");
-        plot.add_trace(trace);
-
-        // Gamma
-        let data: Vec<f64> = d.iter().map(|x| *x as f64).collect();
-
-        let sum: f64 = data.iter().sum();
-        let mean = (&data).mean();
-        let variance = (&data).variance();
-        let count = d.len() as f64;
-        let shape = mean * mean / variance;
-        let rate = mean / variance;
-
-        let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
-        let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
-        let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
-        plot.add_trace(trace);
-
-        // Poisson
-        let lambda = sum / count;
-        let poisson = statrs::distribution::Poisson::new(lambda).unwrap();
-        let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
-        let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
-        plot.add_trace(trace);
-
-        plot.use_local_plotly();
-        plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
-        println!("> 15x: {:?}", nd.frequencies(&vec![1, 6, 15]));
-    }
-}
-
-pub struct ND {
-    pub data: Vec<u32>,
-    pub distribution: BTreeMap<u32, f64>,
-    pub total_count: usize,
-    pub frequency: HashMap<u32, usize>,
-    pub fitted_normal: UvNormal,
-}
-
-use rstat::{fitting::MLE, normal::UvNormal, ContinuousDistribution};
-
-impl ND {
-    fn new(mut data: Vec<u32>) -> Self {
-        data.sort_unstable();
-        let n = data.len();
-        info!("n values {n}");
-        let mut distribution = BTreeMap::new();
-
-        let mut frequency = HashMap::new();
-        for &value in &data {
-            *frequency.entry(value).or_insert(0) += 1;
-        }
-
-        let mut cumulative_count = 0;
-        for (&value, &count) in &frequency {
-            cumulative_count += count;
-            let cumulative_prob = cumulative_count as f64 / n as f64;
-            distribution.insert(value, cumulative_prob);
-        }
-
-        // Fit normal distribution
-        let fitted_normal = rstat::univariate::normal::Normal::fit_mle(
-            &data
-                .iter()
-                .filter(|x| *x >= &1u32)
-                .map(|x| *x as f64)
-                .collect::<Vec<f64>>(),
-        )
-        .unwrap();
-
-        Self {
-            data,
-            distribution,
-            frequency,
-            total_count: n,
-            fitted_normal,
-        }
-    }
-
-    pub fn frequency(&self, x: u32) -> usize {
-        *self.frequency.get(&x).unwrap_or(&0)
-    }
-
-    pub fn frequencies(&self, breaks: &Vec<u32>) -> Vec<f64> {
-        let mut last_prop_under = 0.0;
-        let mut res = Vec::new();
-        for brk in breaks {
-            let v = self.proportion_under(*brk) - last_prop_under;
-            res.push(v);
-            last_prop_under += v;
-        }
-        let per99 = self.percentile(99.0).unwrap();
-        res.push(self.proportion_under(per99) - last_prop_under);
-        res
-    }
-
-    pub fn percentile(&self, percentile: f64) -> Option<u32> {
-        if !(0.0..=100.0).contains(&percentile) {
-            return None;
-        }
-
-        let index = (percentile / 100.0 * (self.total_count - 1) as f64).round() as usize;
-        self.data.get(index).cloned()
-    }
-
-    pub fn proportion_under(&self, x: u32) -> f64 {
-        let count = self
-            .frequency
-            .iter()
-            .filter(|(&value, _)| value < x)
-            .map(|(_, &count)| count)
-            .sum::<usize>();
-        count as f64 / self.total_count as f64
-    }
-
-    pub fn proportion_above(&self, x: u32) -> f64 {
-        let count = self
-            .frequency
-            .iter()
-            .filter(|(&value, _)| value > x)
-            .map(|(_, &count)| count)
-            .sum::<usize>();
-        count as f64 / self.total_count as f64
-    }
 }
 
 pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
@@ -856,19 +651,6 @@ where
         .collect()
 }
 
-pub fn transpose(v: Vec<Vec<f64>>) -> Vec<Vec<f64>> {
-    assert!(!v.is_empty());
-    let len = v[0].len();
-    let mut result = vec![Vec::with_capacity(v.len()); len];
-
-    for row in v {
-        for (i, val) in row.into_iter().enumerate() {
-            result[i].push(val);
-        }
-    }
-    result
-}
-
 #[derive(Debug, Serialize)]
 pub struct CountsStats {
     pub sum: f64,
@@ -877,65 +659,3 @@ pub struct CountsStats {
     pub breaks_values: Vec<(String, f64)>,
     pub masked: Vec<(String, f64)>,
 }
-
-// pub fn save_barplota
-//     data: Vec<f64>,
-//     data_x: Vec<f64>,
-//     data_y: Vec<f64>,
-//     path: &str,
-// ) -> anyhow::Result<CountsStats> {
-//     let mut plot = Plot::new();
-//
-//     let colors: Vec<plotly::color::Rgb> = data_x
-//         .iter()
-//         .map(|&x| {
-//             if x <= 2.0 {
-//                 plotly::color::Rgb::new(193, 18, 31)
-//             } else if x >= 15.0 {
-//                 plotly::color::Rgb::new(138, 201, 38)
-//             } else if x <= 6.0 {
-//                 plotly::color::Rgb::new(243, 114, 44)
-//             } else {
-//                 plotly::color::Rgb::new(255, 202, 58)
-//             }
-//         })
-//         .collect();
-//
-//     let bars = Bar::new(data_x.clone(), data_y.clone())
-//         .show_legend(false)
-//         .marker(Marker::new().color_array(colors));
-//
-//     plot.add_trace(bars);
-//
-//     let sum: f64 = data.iter().sum();
-//     let mean = (&data).mean();
-//     let count = data.len() as f64;
-//     let std_dev = (&data).std_dev();
-//     println!("mean {mean}");
-//
-//     // Normal
-//     let normal = statrs::distribution::Normal::new(mean, std_dev)?;
-//     let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
-//     let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
-//     plot.add_trace(trace);
-//
-//     // // Gamma
-//     // let shape = mean * mean / variance;
-//     // let rate = mean / variance;
-//     //
-//     // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
-//     // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
-//     // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
-//     // plot.add_trace(trace);
-//
-//     // Poisson
-//     let lambda = sum / count;
-//     let poisson = statrs::distribution::Poisson::new(lambda)?;
-//     let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
-//     let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
-//     plot.add_trace(trace);
-//
-//     plot.use_local_plotly();
-//     plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
-//     Ok(CountsStats { sum, mean, std_dev })
-// }

+ 2 - 103
src/lib.rs

@@ -306,10 +306,7 @@ pub fn par_whole_scan(dict_file: &str, bam_path: &str, out_dir: &str) -> anyhow:
 #[cfg(test)]
 mod tests {
 
-    use counts::{generate_range, ranges_between, ranges_over, ranges_under,  Counts};
-    use pandora_lib_graph::cytoband::{svg_chromosome, AdditionalRect, RectPosition};
-    use plotly::{common::Marker, Bar, Plot, Scatter};
-    use rstat::{statistics::Quantiles, ContinuousDistribution, Probability};
+    use counts::Counts;
     use rust_htslib::bam::Reader;
 
     use super::*;
@@ -407,10 +404,9 @@ mod tests {
     }
 
     #[test]
-    fn diff() -> anyhow::Result<()> {
+    fn save_stats() -> anyhow::Result<()> {
         init();
         let id = "BECERRA";
-        let contig = "chr1";
         let breaks = vec![1, 6, 15];
 
         let result_dir = "/data/longreads_basic_pipe";
@@ -435,105 +431,8 @@ mod tests {
                 .map(|c| format!("{result_dir}/{id}/mrd/scan/{c}_counts.tsv"))
                 .collect(),
         );
-        // let hm = counts.counts_annotations(contig)?;
-
-        // let len = counts.data.get(contig).unwrap().len();
-        // hm.iter().for_each(|(k, v)| {
-        //     println!("{:?} {:0.2}", k, *v as f64 / len as f64);
-        // });
-        // let n_final = counts.get(contig).unwrap().len();
-        // println!("n ok: {:0.2}", n_final as f64 / len as f64);
 
         counts.save_contigs(&contigs, &format!("{save_dir}/{id}"), breaks)?;
         Ok(())
     }
-
-    #[test]
-    fn load() -> anyhow::Result<()> {
-        init();
-        info!("loading");
-        let id = "BECERRA";
-        let contig = "chr9";
-        let breaks = vec![1, 6, 15];
-
-        let mut contigs: Vec<String> = (1..=22).map(|c| format!("chr{c}")).collect();
-        contigs.push("chrX".to_string());
-        contigs.push("chrY".to_string());
-        let counts = Counts::from_files(
-            contigs
-                .clone()
-                .iter()
-                .map(|c| format!("/data/longreads_basic_pipe/{id}/diag/scan/{c}_counts.tsv"))
-                .collect(),
-        );
-        counts.save_global_proportions_graph("/data/proportions.svg", &contigs, breaks);
-        counts.save_global_distribution_graph("/data/global_distribution.svg", &contigs);
-
-        let chr1_nd_reads = counts.nd_reads(contig)?;
-        println!(
-            "Percentiles: 1% {}, 50% {}, 99% {}",
-            chr1_nd_reads.percentile(1.0).unwrap(),
-            chr1_nd_reads.percentile(50.0).unwrap(),
-            chr1_nd_reads.percentile(99.0).unwrap()
-        );
-        println!("< 6x: {:.2}%", chr1_nd_reads.proportion_under(6) * 100.0);
-        println!("> 15x: {:.2}%", chr1_nd_reads.proportion_above(15) * 100.0);
-        println!("> 15x: {:?}", chr1_nd_reads.frequencies(&vec![1, 6, 15]));
-        let d = counts.get(contig)?;
-
-        let tol = 25;
-
-        let under_6_rects: Vec<AdditionalRect> = ranges_under(&d, 6, 0)
-            .iter()
-            .filter(|(s, e)| e > s)
-            // .filter(|(s, e)| e - s > tol)
-            .map(|(start, end)| AdditionalRect {
-                start: *start as u32 * 1000,
-                end: *end as u32 * 1000,
-                color: String::from("red"),
-                position: RectPosition::Below(0),
-            })
-            .collect();
-
-        let over_6_rects: Vec<AdditionalRect> = ranges_between(&d, 6, 15, tol)
-            .iter()
-            .filter(|(s, e)| e > s)
-            // .filter(|(s, e)| e - s > tol)
-            .map(|(start, end)| AdditionalRect {
-                start: *start as u32 * 1000,
-                end: *end as u32 * 1000,
-                color: String::from("green"),
-                position: RectPosition::Below(1),
-            })
-            .collect();
-
-        let over15: Vec<AdditionalRect> = ranges_over(&d, 15, 10)
-            .iter()
-            .filter(|(s, e)| e > s)
-            .map(|(start, end)| AdditionalRect {
-                start: *start as u32 * 1000,
-                end: *end as u32 * 1000,
-                color: String::from("blue"),
-                position: RectPosition::Below(2),
-            })
-            .collect();
-
-        let mut all = Vec::new();
-        all.extend(under_6_rects);
-        all.extend(over_6_rects);
-        all.extend(over15);
-
-        svg_chromosome(
-            contig,
-            1000,
-            50,
-            "/data/ref/hs1/cytoBandMapped.bed",
-            "/data/chr1.svg",
-            &all,
-            &Vec::new(),
-        )
-        .unwrap();
-
-        Ok(())
-    }
 }