Your Name 1 سال پیش
والد
کامیت
fa341807cb
4فایلهای تغییر یافته به همراه20 افزوده شده و 1650 حذف شده
  1. 0 960
      <
  2. 18 666
      Cargo.lock
  3. 0 9
      Cargo.toml
  4. 2 15
      src/counts.rs

+ 0 - 960
<

@@ -1,960 +0,0 @@
-use anyhow::Context;
-use log::{info, warn};
-use ordered_float::Float;
-use pandora_lib_graph::cytoband::{svg_chromosome, AdditionalRect, RectPosition};
-use plotly::{color::Rgb, common::Marker, layout::BarMode, Bar, Layout, Plot, Scatter};
-use rand::{thread_rng, Rng};
-use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
-use serde::{
-    de::{self, Visitor},
-    Deserialize, Deserializer, Serialize,
-};
-use statrs::{
-    distribution::{Continuous, Discrete},
-    statistics::Statistics,
-};
-use std::{
-    collections::{BTreeMap, HashMap, HashSet},
-    f64, fmt,
-    fs::File,
-    io::{BufRead, BufReader, Write},
-    str::FromStr,
-};
-
-#[derive(Debug, Clone)]
-pub struct Count {
-    pub position: CountRange,
-    pub n_reads: u32,
-    pub n_low_mapq: u32,
-    pub frac_sa: f32,
-    pub sa_outlier: bool,
-    pub frac_se: f32,
-    pub se_outlier: bool,
-    pub annotation: Vec<CountAnnotation>,
-}
-
-impl fmt::Display for Count {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "{}\t{}\t{}\t{:.6}\t{}\t{:.6}\t{}",
-            self.position,
-            self.n_reads,
-            self.n_low_mapq,
-            self.frac_sa,
-            self.sa_outlier,
-            self.frac_se,
-            self.se_outlier
-        )
-    }
-}
-// inclusive 0 based
-#[derive(Debug, Clone)]
-pub struct CountRange {
-    pub contig: String,
-    pub start: u32,
-    pub end: u32,
-}
-
-impl fmt::Display for CountRange {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "{}:{}-{}", self.contig, self.start, self.end)
-    }
-}
-
-#[derive(Debug, Clone, Hash, Eq, PartialEq)]
-pub enum CountAnnotation {
-    MaskedLowMRD,
-    MaskedQuality,
-}
-
-impl<'de> Deserialize<'de> for Count {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: Deserializer<'de>,
-    {
-        struct CountVisitor;
-
-        impl<'de> Visitor<'de> for CountVisitor {
-            type Value = Count;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                formatter.write_str("a string in the format 'chr:start-end n_reads n_low_mapq frac_sa sa_outlier frac_se se_outlier'")
-            }
-
-            fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
-            where
-                E: de::Error,
-            {
-                let parts: Vec<&str> = s.split_whitespace().collect();
-                if parts.len() != 7 {
-                    return Err(E::custom("incorrect number of fields"));
-                }
-
-                let position_parts: Vec<&str> = parts[0].split(&[':', '-'][..]).collect();
-                if position_parts.len() != 3 {
-                    return Err(E::custom("incorrect position format"));
-                }
-
-                Ok(Count {
-                    position: CountRange {
-                        contig: position_parts[0].to_string(),
-                        start: u32::from_str(position_parts[1]).map_err(E::custom)?,
-                        end: u32::from_str(position_parts[2]).map_err(E::custom)?,
-                    },
-                    n_reads: u32::from_str(parts[1]).map_err(E::custom)?,
-                    n_low_mapq: u32::from_str(parts[2]).map_err(E::custom)?,
-                    frac_sa: f32::from_str(parts[3]).map_err(E::custom)?,
-                    sa_outlier: bool::from_str(parts[4]).map_err(E::custom)?,
-                    frac_se: f32::from_str(parts[5]).map_err(E::custom)?,
-                    se_outlier: bool::from_str(parts[6]).map_err(E::custom)?,
-                    annotation: Vec::new(),
-                })
-            }
-        }
-
-        deserializer.deserialize_str(CountVisitor)
-    }
-}
-
-pub fn read_counts_from_file(filename: &str) -> anyhow::Result<Vec<Count>> {
-    let file = File::open(filename)?;
-    let reader = BufReader::new(file);
-    let mut counts = Vec::new();
-
-    for line in reader.lines() {
-        let line = line?;
-        let count: Count = serde_json::from_str(&format!("\"{}\"", escape_control_chars(&line)))?;
-        counts.push(count);
-    }
-
-    Ok(counts)
-}
-fn escape_control_chars(s: &str) -> String {
-    s.chars()
-        .map(|c| {
-            if c.is_control() {
-                format!("\\u{:04x}", c as u32)
-            } else {
-                c.to_string()
-            }
-        })
-        .collect()
-}
-
-#[derive(Debug)]
-pub struct Counts {
-    pub data: HashMap<String, Vec<Count>>,
-    pub mrd: HashMap<String, Vec<Count>>,
-}
-
-impl Counts {
-    pub fn from_files(paths: Vec<String>) -> Self {
-        let counts: Vec<Vec<Count>> = paths
-            .par_iter()
-            .map(|path| match read_counts_from_file(path) {
-                Ok(c) => c,
-                Err(e) => {
-                    warn!("Couldnt load {path}: {e}");
-                    Vec::new()
-                }
-            })
-            .filter(|v| !v.is_empty())
-            .collect();
-
-        let mut data = HashMap::new();
-        for count in counts {
-            let contig = count.first().unwrap().position.contig.clone();
-            data.insert(contig, count);
-        }
-        Counts {
-            data,
-            mrd: HashMap::new(),
-        }
-    }
-
-    pub fn mrd_from_files(&mut self, paths: Vec<String>) {
-        let counts: Vec<Vec<Count>> = paths
-            .par_iter()
-            .map(|path| match read_counts_from_file(path) {
-                Ok(c) => c,
-                Err(e) => {
-                    warn!("Couldnt load {path}: {e}");
-                    Vec::new()
-                }
-            })
-            .filter(|v| !v.is_empty())
-            .collect();
-
-        let mut data = HashMap::new();
-        for count in counts {
-            let contig = count.first().unwrap().position.contig.clone();
-            data.insert(contig, count);
-        }
-        self.mrd = data;
-    }
-
-    pub fn mask_low_mrd(&mut self, contig: &str, min_reads: u32) -> anyhow::Result<()> {
-        if let (Some(mrd), Some(diag)) = (self.mrd.get(contig), self.data.get_mut(contig)) {
-            for (m, d) in mrd.iter().zip(diag) {
-                if m.n_reads < min_reads {
-                    d.annotation.push(CountAnnotation::MaskedLowMRD);
-                }
-            }
-            Ok(())
-        } else {
-            anyhow::bail!("No {contig} in both mrd and diag.")
-        }
-    }
-
-    pub fn mask_low_quality(&mut self, contig: &str, max_ratio: f64) -> anyhow::Result<()> {
-        if let Some(diag) = self.data.get_mut(contig) {
-            for d in diag.iter_mut() {
-                if (d.n_low_mapq as f64 / (d.n_reads + d.n_low_mapq) as f64) > max_ratio {
-                    d.annotation.push(CountAnnotation::MaskedQuality);
-                }
-            }
-            Ok(())
-        } else {
-            anyhow::bail!("No {contig} in both mrd and diag.")
-        }
-    }
-
-    pub fn frequencies(&self, contig: &str) -> anyhow::Result<Vec<(f64, f64)>> {
-        let data = self.get(contig)?;
-
-        let mut frequencies = HashMap::new();
-        for count in data.iter() {
-            *frequencies.entry(*count).or_insert(0) += 1;
-        }
-
-        let mut frequencies: Vec<(u32, f64)> =
-            frequencies.iter().map(|(x, y)| (*x, *y as f64)).collect();
-        frequencies.sort_by_key(|v| v.0);
-        Ok(frequencies.iter().map(|(x, y)| (*x as f64, *y)).collect())
-    }
-
-    pub fn percentile(&self, contig: &str, percentile: f64) -> anyhow::Result<u32> {
-        let mut data = self.get(contig)?;
-        data.sort_unstable();
-        let total_count = data.len();
-        let index = |percentile: f64| -> usize {
-            (percentile / 100.0 * (total_count - 1) as f64).round() as usize
-        };
-
-        Ok(*data.get(index(percentile)).context("Error in percentile")?)
-    }
-
-    pub fn save_contig(
-        &mut self,
-        contig: &str,
-        prefix: &str,
-        breaks: Vec<u32>,
-    ) -> anyhow::Result<CountsStats> {
-        self.mask_low_mrd(contig, 6)?;
-        self.mask_low_quality(contig, 0.1)?;
-
-        let data: Vec<f64> = self.get(contig)?.iter().map(|v| *v as f64).collect();
-        let n_final = data.len();
-
-        let frequencies = self.frequencies(contig)?;
-        let percentile_99 = self.percentile(contig, 99.0)?;
-
-        let mut data_x = Vec::new();
-        let mut data_y = Vec::new();
-        frequencies.iter().for_each(|(x, y)| {
-            if *x <= percentile_99 as f64 {
-                data_x.push(*x);
-                data_y.push(*y / n_final as f64);
-            }
-        });
-
-        // Distribution plot
-        let distribution_path = format!("{prefix}_{contig}_distrib.svg");
-        info!("Saving graph: {distribution_path}");
-        let mut plot = Plot::new();
-        let colors: Vec<Rgb> = data_x
-            .iter()
-            .map(|&x| match x {
-                x if x < 2.0 => Rgb::new(193, 18, 31),
-                x if x < 6.0 => Rgb::new(243, 114, 44),
-                x if x < 15.0 => Rgb::new(255, 202, 58),
-                _ => Rgb::new(138, 201, 38),
-            })
-            .collect();
-
-        let bars = Bar::new(data_x.clone(), data_y.clone())
-            .show_legend(false)
-            .marker(Marker::new().color_array(colors));
-
-        plot.add_trace(bars);
-
-        let sum: f64 = data.iter().sum();
-        let mean = (&data).mean();
-        let count = data.len() as f64;
-        let std_dev = (&data).std_dev();
-
-        // Normal
-        let normal = statrs::distribution::Normal::new(mean, std_dev)?;
-        let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
-        let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
-        plot.add_trace(trace);
-
-        // // Gamma
-        // let shape = mean * mean / variance;
-        // let rate = mean / variance;
-        //
-        // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
-        // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
-        // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
-        // plot.add_trace(trace);
-
-        // Poisson
-        let lambda = sum / count;
-        let poisson = statrs::distribution::Poisson::new(lambda)?;
-        let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
-        let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
-        plot.add_trace(trace);
-
-        plot.write_image(distribution_path, plotly::ImageFormat::SVG, 800, 600, 1.0);
-
-        // Fractions
-        let mut breaks_values = Vec::new();
-        for (i, b) in breaks.iter().enumerate() {
-            if i == 0 {
-                let total: f64 = frequencies
-                    .iter()
-                    .filter(|(x, _)| *x < *b as f64)
-                    .map(|(_, y)| *y / count)
-                    .sum();
-                breaks_values.push((format!("< {b}"), total));
-            } else {
-                let last = breaks[i - 1];
-                let total: f64 = frequencies
-                    .iter()
-                    .filter(|(x, _)| *x < *b as f64 && *x >= last as f64)
-                    .map(|(_, y)| *y / count)
-                    .sum();
-
-                breaks_values.push((format!("[{last} - {b}["), total));
-            }
-        }
-
-        let last = *breaks.last().unwrap();
-        let total: f64 = frequencies
-            .iter()
-            .filter(|(x, _)| *x >= last as f64)
-            .map(|(_, y)| *y / count)
-            .sum();
-        breaks_values.push((format!(">= {last}"), total));
-
-        // Chromosome
-        let tol = 25;
-        let chromosome_path = format!("{prefix}_{contig}_chromosome.svg");
-        info!("Saving graph: {chromosome_path}");
-
-        let target_annotations: HashSet<CountAnnotation> = vec![
-            CountAnnotation::MaskedLowMRD,
-            CountAnnotation::MaskedQuality,
-        ]
-        .into_iter()
-        .collect();
-
-        let d: Vec<u32> = self
-            .data
-            .get(contig)
-            .unwrap()
-            .iter()
-            .map(|c| {
-                if c.annotation
-                    .iter()
-                    .any(|ann| target_annotations.contains(ann))
-                {
-                    10_000u32
-                } else {
-                    c.n_reads
-                }
-            })
-            .collect();
-
-        let hm = self.counts_annotations(contig)?;
-        let len = d.len();
-        let mut masked: Vec<(String, f64)> = hm
-            .iter()
-            .map(|(k, v)| (format!("{:?}", k), *v as f64 / len as f64))
-            .collect();
-        masked.push(("Un masked".to_string(), n_final as f64 / len as f64));
-
-        let under_6_rects: Vec<AdditionalRect> = ranges_under(&d, 5, tol)
-            .iter()
-            .filter(|(s, e)| e > s)
-            .map(|(start, end)| AdditionalRect {
-                start: *start as u32 * 1000,
-                end: *end as u32 * 1000,
-                color: String::from("red"),
-                position: RectPosition::Below(1),
-            })
-            .collect();
-
-        let over_6_rects: Vec<AdditionalRect> = ranges_between(&d, 6, 9999, tol)
-            .iter()
-            .filter(|(s, e)| e > s)
-            .map(|(start, end)| AdditionalRect {
-                start: *start as u32 * 1000,
-                end: *end as u32 * 1000,
-                color: String::from("green"),
-                position: RectPosition::Below(2),
-            })
-            .collect();
-
-        let masked_rec: Vec<AdditionalRect> = ranges_over(&d, 10000, tol)
-            .iter()
-            .filter(|(s, e)| e > s)
-            .map(|(start, end)| AdditionalRect {
-                start: *start as u32 * 1000,
-                end: *end as u32 * 1000,
-                color: String::from("grey"),
-                position: RectPosition::Below(0),
-            })
-            .collect();
-
-        let mut all = Vec::new();
-        all.extend(under_6_rects);
-        all.extend(over_6_rects);
-        // all.extend(over15);
-        all.extend(masked_rec);
-
-        svg_chromosome(
-            contig,
-            1000,
-            50,
-            "/data/ref/hs1/cytoBandMapped.bed",
-            &chromosome_path,
-            &all,
-            &Vec::new(),
-        )
-        .unwrap();
-
-        let stats = CountsStats {
-            sum,
-            mean,
-            std_dev,
-            breaks_values,
-            masked,
-        };
-
-        // Save stats
-        let json_path = format!("{prefix}_{contig}_stats.json");
-        info!("Saving stats into: {json_path}");
-        let json = serde_json::to_string_pretty(&stats)?;
-        let mut file = File::create(json_path)?;
-        file.write_all(json.as_bytes())?;
-
-        Ok(stats)
-    }
-
-    pub fn save_contigs(
-        &mut self,
-        contigs: &Vec<String>,
-        prefix: &str,
-        breaks: Vec<u32>,
-    ) -> anyhow::Result<()> {
-        let mut stats = Vec::new();
-        let mut proportions = HashMap::new();
-        for contig in contigs {
-            let stat = self.save_contig(contig, prefix, breaks.clone())?;
-            let un_masked: Vec<&(String, f64)> = stat.masked.iter().filter(|(s, _)| s == "Un masked").collect();
-            let un_masked = un_masked.first().unwrap().1;
-            let masked = 1.0 - un_masked;
-            let mut props: Vec<(String, f64)> = stat.breaks_values.iter().map(|(s, v)| (s.to_string(), *v * un_masked)).collect();
-            props.push(("masked".to_string(), masked));
-            props.iter().for_each(|(s, v)| {
-                proportions.entry(s.to_string()).or_insert(vec![]).push(*v);
-            });
-            stats.push(stat);
-        }
-
-        let mut plot = Plot::new();
-        let layout = Layout::new().bar_mode(BarMode::Stack);
-        
-        for (k, v) in proportions {
-            plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(k));
-        }
-        println!("{:?}", contigs);
-        plot.set_layout(layout);
-        plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
-
-        Ok(())
-    }
-
-    pub fn counts_annotations(
-        &self,
-        contig: &str,
-    ) -> anyhow::Result<HashMap<CountAnnotation, u64>> {
-        if let Some(d) = self.data.get(contig) {
-            let mut counts = HashMap::new();
-            for c in d {
-                for a in &c.annotation {
-                    *counts.entry(a.clone()).or_insert(0) += 1;
-                }
-            }
-            Ok(counts)
-        } else {
-            anyhow::bail!("No {contig} in counts.")
-        }
-    }
-
-    pub fn get(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
-        if let Some(ccounts) = self.data.get(contig) {
-            let target_annotations: HashSet<CountAnnotation> = vec![
-                CountAnnotation::MaskedLowMRD,
-                CountAnnotation::MaskedQuality,
-            ]
-            .into_iter()
-            .collect();
-
-            Ok(ccounts
-                .iter()
-                .filter(|count| {
-                    !count
-                        .annotation
-                        .iter()
-                        .any(|ann| target_annotations.contains(ann))
-                })
-                .map(|c| c.n_reads)
-                .collect())
-        } else {
-            anyhow::bail!("No {contig} in counts.")
-        }
-    }
-
-    pub fn mrd(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
-        if let Some(ccounts) = self.mrd.get(contig) {
-            Ok(ccounts.iter().map(|c| c.n_reads).collect())
-        } else {
-            anyhow::bail!("No {contig} in counts.")
-        }
-    }
-
-    pub fn calculate_percentiles(
-        &self,
-        contig: &str,
-        percentiles: &[f64],
-    ) -> anyhow::Result<Vec<f64>> {
-        if let Some(ccounts) = self.data.get(contig) {
-            let mut n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
-
-            n_reads.sort_unstable();
-
-            let cdf = ND::new(n_reads.clone());
-
-            // println!("CDF at 13: {:?}", cdf.cdf(13));
-            println!("Percentile at 99: {:?}", cdf.percentile(99.0));
-            println!("above 15X: {:?}", cdf.proportion_above(15));
-            // println!("above 15.1X: {:?}", cdf.fitted_proportion_above(&15.1));
-            println!("under 6X: {:?}", cdf.proportion_under(6));
-
-            Ok(percentiles
-                .iter()
-                .map(|&p| {
-                    let index = (p * (n_reads.len() - 1) as f64).round() as usize;
-                    n_reads[index] as f64
-                })
-                .collect())
-        } else {
-            anyhow::bail!("No {contig} in counts.")
-        }
-    }
-
-    pub fn nd_reads(&self, contig: &str) -> anyhow::Result<ND> {
-        if let Some(ccounts) = self.data.get(contig) {
-            Ok(ND::new(ccounts.iter().map(|c| c.n_reads).collect()))
-        } else {
-            anyhow::bail!("No {contig} in counts")
-        }
-    }
-
-    pub fn distribution(&self, contig: &str) -> anyhow::Result<ND> {
-        Ok(ND::new(self.get(contig)?))
-    }
-
-    pub fn save_stats(&self) -> anyhow::Result<()> {
-        Ok(())
-    }
-
-    pub fn save_global_proportions_graph(
-        &self,
-        path: &str,
-        contigs: &Vec<String>,
-        breaks: Vec<u32>,
-    ) {
-        let mut breaks_str = Vec::new();
-        for (i, b) in breaks.iter().enumerate() {
-            if i == 0 {
-                breaks_str.push(format!("< {b}"));
-            } else {
-                let last = breaks[i - 1];
-                breaks_str.push(format!("[{last} - {b}["))
-            }
-        }
-        breaks_str.push(format!(">= {}", breaks.last().unwrap()));
-
-        let mut proportions = Vec::new();
-
-        for contig in contigs.iter() {
-            let d = self.get(contig).unwrap();
-            let nd = ND::new(d);
-            proportions.push(nd.frequencies(&breaks));
-        }
-
-        let mut plot = Plot::new();
-        let layout = Layout::new().bar_mode(BarMode::Stack);
-        for (i, v) in transpose(proportions).iter().enumerate() {
-            plot.add_trace(Bar::new(contigs.clone(), v.to_vec()).name(&breaks_str[i]));
-        }
-        println!("{:?}", contigs);
-        plot.set_layout(layout);
-        plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
-    }
-
-    pub fn save_global_distribution_graph(&self, path: &str, contigs: &Vec<String>) {
-        let d: Vec<u32> = contigs.iter().flat_map(|c| self.get(c).unwrap()).collect();
-        let mut data_sorted = d.clone();
-        data_sorted.sort_unstable();
-
-        let nd = ND::new(d.clone());
-        let mut plot = Plot::new();
-
-        let bar_x: Vec<u32> = (1..=nd.percentile(99.0).unwrap()).collect();
-        let colors: Vec<plotly::color::Rgb> = bar_x
-            .iter()
-            .map(|&x| {
-                if x <= 2 {
-                    plotly::color::Rgb::new(193, 18, 31)
-                } else if x >= 15 {
-                    plotly::color::Rgb::new(138, 201, 38)
-                } else if x <= 6 {
-                    plotly::color::Rgb::new(243, 114, 44)
-                } else {
-                    plotly::color::Rgb::new(255, 202, 58)
-                }
-            })
-            .collect();
-
-        let data: Vec<u32> = d.iter().filter(|x| **x >= 1).copied().collect();
-
-        // frequencies
-        let mut frequencies = HashMap::new();
-        for &value in &data {
-            *frequencies.entry(value).or_insert(0) += 1;
-        }
-
-        let bars = Bar::new(
-            bar_x.clone(),
-            bar_x
-                .iter()
-                .map(|x| *frequencies.get(x).unwrap_or(&0) as f64 / data.len() as f64)
-                .collect(),
-        )
-        .show_legend(false)
-        .marker(Marker::new().color_array(colors));
-
-        plot.add_trace(bars);
-
-        let data_x = generate_range(0.0, nd.percentile(99.0).unwrap().into(), 100);
-        let data_y: Vec<f64> = data_x.iter().map(|x| nd.fitted_normal.pdf(x)).collect();
-        let trace = Scatter::new(data_x.clone(), data_y).name("Gaussian");
-        plot.add_trace(trace);
-
-        // Gamma
-        let data: Vec<f64> = d.iter().map(|x| *x as f64).collect();
-
-        let sum: f64 = data.iter().sum();
-        let mean = (&data).mean();
-        let variance = (&data).variance();
-        let count = d.len() as f64;
-        let shape = mean * mean / variance;
-        let rate = mean / variance;
-
-        let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
-        let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
-        let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
-        plot.add_trace(trace);
-
-        // Poisson
-        let lambda = sum / count;
-        let poisson = statrs::distribution::Poisson::new(lambda).unwrap();
-        let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
-        let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
-        plot.add_trace(trace);
-
-        plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
-        println!("> 15x: {:?}", nd.frequencies(&vec![1, 6, 15]));
-    }
-}
-
-pub struct ND {
-    pub data: Vec<u32>,
-    pub distribution: BTreeMap<u32, f64>,
-    pub total_count: usize,
-    pub frequency: HashMap<u32, usize>,
-    pub fitted_normal: UvNormal,
-}
-
-use rstat::{fitting::MLE, normal::UvNormal, ContinuousDistribution};
-
-impl ND {
-    fn new(mut data: Vec<u32>) -> Self {
-        data.sort_unstable();
-        let n = data.len();
-        info!("n values {n}");
-        let mut distribution = BTreeMap::new();
-
-        let mut frequency = HashMap::new();
-        for &value in &data {
-            *frequency.entry(value).or_insert(0) += 1;
-        }
-
-        let mut cumulative_count = 0;
-        for (&value, &count) in &frequency {
-            cumulative_count += count;
-            let cumulative_prob = cumulative_count as f64 / n as f64;
-            distribution.insert(value, cumulative_prob);
-        }
-
-        // Fit normal distribution
-        let fitted_normal = rstat::univariate::normal::Normal::fit_mle(
-            &data
-                .iter()
-                .filter(|x| *x >= &1u32)
-                .map(|x| *x as f64)
-                .collect::<Vec<f64>>(),
-        )
-        .unwrap();
-
-        Self {
-            data,
-            distribution,
-            frequency,
-            total_count: n,
-            fitted_normal,
-        }
-    }
-
-    pub fn frequency(&self, x: u32) -> usize {
-        *self.frequency.get(&x).unwrap_or(&0)
-    }
-
-    pub fn frequencies(&self, breaks: &Vec<u32>) -> Vec<f64> {
-        let mut last_prop_under = 0.0;
-        let mut res = Vec::new();
-        for brk in breaks {
-            let v = self.proportion_under(*brk) - last_prop_under;
-            res.push(v);
-            last_prop_under += v;
-        }
-        let per99 = self.percentile(99.0).unwrap();
-        res.push(self.proportion_under(per99) - last_prop_under);
-        res
-    }
-
-    pub fn percentile(&self, percentile: f64) -> Option<u32> {
-        if !(0.0..=100.0).contains(&percentile) {
-            return None;
-        }
-
-        let index = (percentile / 100.0 * (self.total_count - 1) as f64).round() as usize;
-        self.data.get(index).cloned()
-    }
-
-    pub fn proportion_under(&self, x: u32) -> f64 {
-        let count = self
-            .frequency
-            .iter()
-            .filter(|(&value, _)| value < x)
-            .map(|(_, &count)| count)
-            .sum::<usize>();
-        count as f64 / self.total_count as f64
-    }
-
-    pub fn proportion_above(&self, x: u32) -> f64 {
-        let count = self
-            .frequency
-            .iter()
-            .filter(|(&value, _)| value > x)
-            .map(|(_, &count)| count)
-            .sum::<usize>();
-        count as f64 / self.total_count as f64
-    }
-}
-
-pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
-    if steps == 0 {
-        return vec![];
-    }
-    if steps == 1 {
-        return vec![start];
-    }
-
-    let step_size = (end - start) / (steps - 1) as f64;
-    (0..steps).map(|i| start + i as f64 * step_size).collect()
-}
-
-use rayon::prelude::*;
-
-pub fn ranges_under(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
-    get_ranges_parallel(vec, x, tolerance, |val, threshold| val <= threshold)
-}
-
-pub fn ranges_over(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
-    get_ranges_parallel(vec, x, tolerance, |val, threshold| val >= threshold)
-}
-
-pub fn ranges_between(
-    vec: &[u32],
-    lower: u32,
-    upper: u32,
-    tolerance: usize,
-) -> Vec<(usize, usize)> {
-    get_ranges_parallel(vec, (lower, upper), tolerance, |val, (l, u)| {
-        val >= l && val <= u
-    })
-}
-
-pub fn get_ranges_parallel<T, F>(
-    vec: &[u32],
-    threshold: T,
-    tolerance: usize,
-    compare: F,
-) -> Vec<(usize, usize)>
-where
-    F: Fn(u32, T) -> bool + Sync,
-    T: Copy + Sync,
-{
-    if vec.is_empty() {
-        return Vec::new();
-    }
-
-    let chunk_size = (vec.len() / rayon::current_num_threads()).max(1);
-    vec.par_chunks(chunk_size)
-        .enumerate()
-        .flat_map(|(chunk_index, chunk)| {
-            let mut local_ranges = Vec::new();
-            let mut current_range: Option<(usize, usize)> = None;
-            let offset = chunk_index * chunk_size;
-
-            for (i, &val) in chunk.iter().enumerate() {
-                let global_index = offset + i;
-                if compare(val, threshold) {
-                    match current_range {
-                        Some((start, end)) if global_index <= end + tolerance + 1 => {
-                            current_range = Some((start, global_index));
-                        }
-                        Some((start, end)) => {
-                            local_ranges.push((start, end));
-                            current_range = Some((global_index, global_index));
-                        }
-                        None => {
-                            current_range = Some((global_index, global_index));
-                        }
-                    }
-                } else if let Some((start, end)) = current_range {
-                    if global_index > end + tolerance + 1 {
-                        local_ranges.push((start, end));
-                        current_range = None;
-                    }
-                }
-            }
-
-            if let Some(range) = current_range {
-                local_ranges.push(range);
-            }
-
-            local_ranges
-        })
-        .collect()
-}
-
-pub fn transpose(v: Vec<Vec<f64>>) -> Vec<Vec<f64>> {
-    assert!(!v.is_empty());
-    let len = v[0].len();
-    let mut result = vec![Vec::with_capacity(v.len()); len];
-
-    for row in v {
-        for (i, val) in row.into_iter().enumerate() {
-            result[i].push(val);
-        }
-    }
-    result
-}
-
-#[derive(Debug, Serialize)]
-pub struct CountsStats {
-    pub sum: f64,
-    pub mean: f64,
-    pub std_dev: f64,
-    pub breaks_values: Vec<(String, f64)>,
-    pub masked: Vec<(String, f64)>,
-}
-
-// pub fn save_barplota
-//     data: Vec<f64>,
-//     data_x: Vec<f64>,
-//     data_y: Vec<f64>,
-//     path: &str,
-// ) -> anyhow::Result<CountsStats> {
-//     let mut plot = Plot::new();
-//
-//     let colors: Vec<plotly::color::Rgb> = data_x
-//         .iter()
-//         .map(|&x| {
-//             if x <= 2.0 {
-//                 plotly::color::Rgb::new(193, 18, 31)
-//             } else if x >= 15.0 {
-//                 plotly::color::Rgb::new(138, 201, 38)
-//             } else if x <= 6.0 {
-//                 plotly::color::Rgb::new(243, 114, 44)
-//             } else {
-//                 plotly::color::Rgb::new(255, 202, 58)
-//             }
-//         })
-//         .collect();
-//
-//     let bars = Bar::new(data_x.clone(), data_y.clone())
-//         .show_legend(false)
-//         .marker(Marker::new().color_array(colors));
-//
-//     plot.add_trace(bars);
-//
-//     let sum: f64 = data.iter().sum();
-//     let mean = (&data).mean();
-//     let count = data.len() as f64;
-//     let std_dev = (&data).std_dev();
-//     println!("mean {mean}");
-//
-//     // Normal
-//     let normal = statrs::distribution::Normal::new(mean, std_dev)?;
-//     let data_y: Vec<f64> = data_x.iter().map(|x| normal.pdf(*x)).collect();
-//     let trace = Scatter::new(data_x.clone(), data_y).name("Normal");
-//     plot.add_trace(trace);
-//
-//     // // Gamma
-//     // let shape = mean * mean / variance;
-//     // let rate = mean / variance;
-//     //
-//     // let gamma = statrs::distribution::Gamma::new(shape, rate).unwrap();
-//     // let data_y: Vec<f64> = data_x.iter().map(|x| gamma.pdf(*x)).collect();
-//     // let trace = Scatter::new(data_x.clone(), data_y).name("Gamma");
-//     // plot.add_trace(trace);
-//
-//     // Poisson
-//     let lambda = sum / count;
-//     let poisson = statrs::distribution::Poisson::new(lambda)?;
-//     let data_y = data_x.iter().map(|x| poisson.pmf(*x as u64)).collect();
-//     let trace = Scatter::new(data_x.clone(), data_y).name("Poisson");
-//     plot.add_trace(trace);
-//
-//     plot.use_local_plotly();
-//     plot.write_image(path, plotly::ImageFormat::SVG, 800, 600, 1.0);
-//     Ok(CountsStats { sum, mean, std_dev })
-// }

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 18 - 666
Cargo.lock


+ 0 - 9
Cargo.toml

@@ -5,25 +5,16 @@ edition = "2021"
 
 [dependencies]
 anyhow = "1.0.86"
-crossbeam-channel = "0.5.13"
-env_logger = "0.11.5"
-indicatif = "0.17.8"
 log = "0.4.22"
-num-format = "0.4.4"
 rayon = "1.10.0"
 rust-htslib = "0.47.0"
 uuid = { version = "1.10.0", features = ["v4"] }
 pandora_lib_graph = { git = "https://git.t0m4.fr/Thomas/pandora_lib_graph.git" }
-indicatif-log-bridge = "0.2.2"
 serde = { version = "1.0.*", default-features = false }
 csv = "1.3.0"
 dashmap = { version = "6.0.1", features = ["rayon"] }
 serde_json = "1.0.128"
-ordered-float = "4.2.2"
 statrs = "0.17.1"
-plotly = { version = "0.9.1", features = ["kaleido"] }
-rstat = "0.6.0"
 rand = "0.8.5"
 indexmap = "2.5.0"
 charming = { version = "0.3.1", features = ["ssr"] }
-

+ 2 - 15
src/counts.rs

@@ -1,8 +1,7 @@
 use anyhow::Context;
 use charming::{
-    component::{Axis, Legend, Title},
+    component::{Axis, Legend},
     datatype::DataPointItem,
-    df,
     element::{AxisType, ItemStyle, Label, LabelPosition, TextStyle, Tooltip, Trigger},
     series::{Bar, Line},
     Chart,
@@ -10,14 +9,13 @@ use charming::{
 use indexmap::IndexMap;
 use log::{info, warn};
 use pandora_lib_graph::cytoband::{svg_chromosome, AdditionalRect, RectPosition};
-// use plotly::{color::Rgb, common::Marker, layout::BarMode, Bar, Layout, Plot, Scatter};
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use serde::{
     de::{self, Visitor},
     Deserialize, Deserializer, Serialize,
 };
 use statrs::{
-    distribution::{Continuous, Discrete, Normal, Poisson},
+    distribution::{Discrete, Poisson},
     statistics::Statistics,
 };
 use std::{
@@ -314,17 +312,6 @@ impl Counts {
             )
             .series(Bar::new().data(data_y_chart));
 
-        // Normal distribution
-        // let normal = Normal::new(mean, std_dev).unwrap();
-        // let normal_y: Vec<f64> = data_x.iter().map(|&x| normal.pdf(x)).collect();
-        //
-        // let normal_line = Line::new()
-        //     .name("Normal")
-        //     .data(normal_y)
-        //     .item_style(ItemStyle::new().color("#1E88E5".to_string()));
-        //
-        // let chart = chart.series(normal_line);
-        //
         // Poisson distribution
         let lambda = sum / count;
         let poisson = Poisson::new(lambda).unwrap();

برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است