Your Name 1 year ago
parent
commit
6daa641b3b
4 changed files with 1125 additions and 40 deletions
  1. Cargo.lock  +707 −10
  2. Cargo.toml  +3 −0
  3. src/counts.rs  +355 −26
  4. src/lib.rs  +60 −4

File diff not shown because of its large size
+ 707 - 10
Cargo.lock


+ 3 - 0
Cargo.toml

@@ -22,4 +22,7 @@ flate2 = "1.0.30"
 csv = "1.3.0"
 dashmap = { version = "6.0.1", features = ["rayon"] }
 serde_json = "1.0.128"
+ordered-float = "4.2.2"
+statrs = "0.17.1"
+plotly = { version = "0.9.1", features = ["kaleido"] }
 

+ 355 - 26
src/counts.rs

@@ -1,11 +1,11 @@
-use log::warn;
+use log::{info, warn};
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use serde::{
     de::{self, Visitor},
     Deserialize, Deserializer,
 };
 use std::{
-    collections::HashMap,
+    collections::{BTreeMap, HashMap},
     fmt,
     fs::File,
     io::{BufRead, BufReader},
@@ -139,7 +139,7 @@ impl Counts {
                 Err(e) => {
                     warn!("Couldnt load {path}: {e}");
                     Vec::new()
-                },
+                }
             })
             .filter(|v| !v.is_empty())
             .collect();
@@ -152,28 +152,64 @@ impl Counts {
         Counts { data }
     }
 
-    pub fn cumulative_coverage(&self, contig: &str/* , median_len: f64 */) {
+    pub fn get(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
         if let Some(ccounts) = self.data.get(contig) {
+            Ok(ccounts.iter().map(|c| c.n_reads).collect())
+        } else {
+            anyhow::bail!("No {contig} in counts.")
+        }
+    }
+
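+    /// Percentile values of `n_reads` for `contig`. Percentiles are given as
+    /// fractions in `[0, 1]` (e.g. `0.99` for the 99th percentile) and are read
+    /// off the sorted counts by nearest-rank index; expects at least one count.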
+    pub fn calculate_percentiles(
+        &self,
+        contig: &str,
+        percentiles: &[f64],
+    ) -> anyhow::Result<Vec<f64>> {
+        if let Some(ccounts) = self.data.get(contig) {
+            let mut n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
+
+            n_reads.sort_unstable();
+
+            let cdf = ND::new(n_reads.clone());
+
+            println!("CDF at 13: {:?}", cdf.cdf(13));
+            println!("Percentile at 99: {:?}", cdf.percentile(99.0));
+            println!("above 15X: {:?}", cdf.proportion_above(15));
+            println!("above 15.1X: {:?}", cdf.fitted_proportion_above(15.1));
+            println!("under 6X: {:?}", cdf.proportion_under(6));
+
+            Ok(percentiles
+                .iter()
+                .map(|&p| {
+                    let index = (p * (n_reads.len() - 1) as f64).round() as usize;
+                    n_reads[index] as f64
+                })
+                .collect())
+        } else {
+            anyhow::bail!("No {contig} in counts.")
+        }
+    }
+
+    pub fn cumulative_coverage(
+        &self,
+        contig: &str, /* , median_len: f64 */
+    ) -> anyhow::Result<Vec<(u32, f64)>> {
+        if let Some(ccounts) = self.data.get(contig) {
+            if ccounts.is_empty() {
+                anyhow::bail!("No counts for {contig}")
+            }
             let mut ccounts = ccounts.clone();
             ccounts.sort_by_key(|c| c.n_reads);
             let n_counts = ccounts.len();
-            let min = ccounts.first().unwrap();
-            println!("{min}");
-            let max = ccounts.last().unwrap();
-            println!("{max}");
-            // let start = max.position.start;
-            // let end = max.position.end;
-            // let len = end - start + 1;
-            // let cov = max.n_reads as f64 * median_len / len as f64;
-            // println!("{cov}");
-            let index_50 = (0.5 * (n_counts - 1) as f64).round() as usize;
-
-            let median = ccounts.get(index_50).unwrap();
-            println!("50% {}", median.n_reads);
-
-            let index_95 =  (0.99 * (n_counts - 1) as f64).round() as usize;
+            let per = |x: f64| (x * (n_counts - 1) as f64).round() as usize;
+
+            let median = ccounts.get(per(0.5)).unwrap();
+
+            let index_95 = per(0.99);
             let v = ccounts.get(index_95).unwrap();
             println!("99% {}", v.n_reads);
+            println!("99% {:?}", self.calculate_percentiles(contig, &[0.99])?);
 
             let cov99 = ccounts.get(index_95).unwrap().n_reads;
             // let mut last_start = 0;
@@ -183,18 +219,311 @@ impl Counts {
                 let frac = n_sup / n_counts as f64;
                 all_frac.push((i, frac));
             }
-            println!("{:#?}", all_frac);
 
-            let mut all_frac = vec![(0, 0.0)];
-            for i in 1..=cov99 {
-                let n_sup = ccounts.iter().filter(|c| c.n_reads < i).count() as f64;
-                let frac = n_sup / n_counts as f64;
-                all_frac.push((i, frac));
+            Ok(all_frac)
+        } else {
+            anyhow::bail!("No {contig} in counts")
+        }
+    }
+
+    pub fn nd_reads(&self, contig: &str) -> anyhow::Result<ND> {
+        if let Some(ccounts) = self.data.get(contig) {
+            Ok(ND::new(ccounts.iter().map(|c| c.n_reads).collect()))
+        } else {
+            anyhow::bail!("No {contig} in counts")
+        }
+    }
+}
+
+use statrs::{
+    distribution::{ContinuousCDF, Normal},
+    statistics::{Distribution, Statistics},
+};
+
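+/// Empirical distribution of read counts for one contig: the sorted raw counts,
+/// an empirical CDF (count -> cumulative probability), per-count frequencies,
+/// and a normal distribution fitted to the counts.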
+pub struct ND {
+    pub data: Vec<u32>,
+    pub distribution: BTreeMap<u32, f64>,
+    pub total_count: usize,
+    pub frequency: HashMap<u32, usize>,
+    pub fitted_normal: Normal,
+}
+
+impl ND {
+    fn new(mut data: Vec<u32>) -> Self {
+        data.sort_unstable();
+        let n = data.len();
+        info!("n values {n}");
+        let mut frequency = HashMap::new();
+        let mut distribution = BTreeMap::new();
+
+        for &value in &data {
+            *frequency.entry(value).or_insert(0) += 1;
+        }
+
+        // Build the empirical CDF from the sorted data; iterating the HashMap here
+        // would visit counts in arbitrary order and corrupt the cumulative sum
+        let mut cumulative_count = 0;
+        for &value in &data {
+            cumulative_count += 1;
+            let cumulative_prob = cumulative_count as f64 / n as f64;
+            distribution.insert(value, cumulative_prob);
+        }
+
+        // Fit normal distribution
+        let fitted_normal = Self::fit_normal(&data);
+        // let fitted_normal = Self::fit_normal_with_outlier_removal(self, 2);
+
+        // let mean = data.iter().map(|&x| x as f64).sum::<f64>() / n as f64;
+        // let variance = data.iter().map(|&x| (x as f64 - mean).powi(2)).sum::<f64>() / n as f64;
+        // let std_dev = variance.sqrt();
+        //
+        // let fitted_normal = Normal::new(mean, std_dev).unwrap();
+
+        Self {
+            data,
+            distribution,
+            frequency,
+            total_count: n,
+            fitted_normal,
+        }
+    }
+
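+    /// Fit a normal distribution to the counts; values <= 1 (empty and
+    /// single-read bins) are excluded from the fit.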
+    fn fit_normal(data: &[u32]) -> Normal {
+        // Mean and variance in a single pass over values > 1; n counts only the
+        // values that survive the filter so the moments are consistent with it
+        let (n, sum, sum_sq) = data
+            .iter()
+            .filter(|v| **v > 1)
+            .fold((0.0f64, 0.0f64, 0.0f64), |(n, sum, sum_sq), x| {
+                let x = *x as f64;
+                (n + 1.0, sum + x, sum_sq + x * x)
+            });
+
+        let mean = sum / n;
+        let variance = (sum_sq / n) - (mean * mean);
+        let std_dev = variance.sqrt();
+
+        Normal::new(mean, std_dev).unwrap()
+    }
+
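+    /// Alternative fit that iteratively drops the highest-z-score values before
+    /// fitting; currently unused (the call from `new` is commented out above).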
+    fn fit_normal_with_outlier_removal(
+        data: Vec<u32>,
+        max_iterations: usize,
+        z_score_threshold: f64,
+        removal_percentage: f64,
+    ) -> Normal {
+        let mut current_data = data.clone();
+
+        for _ in 0..max_iterations {
+            // Calculate mean and standard deviation
+            let mean = current_data.iter().map(|&x| x as f64).mean();
+            let std_dev = current_data.iter().map(|&x| x as f64).std_dev();
+
+            // Identify outliers
+            let z_scores: Vec<f64> = current_data
+                .iter()
+                .map(|&x| (x as f64 - mean).abs() / std_dev)
+                .collect();
+
+            let outliers: Vec<bool> = z_scores.iter().map(|&z| z > z_score_threshold).collect();
+
+            // If no outliers, break
+            if !outliers.iter().any(|&x| x) {
+                break;
             }
-            println!("{:#?}", all_frac);
 
+            // Remove a percentage of the worst outliers
+            let num_to_remove = (current_data.len() as f64 * removal_percentage).round() as usize;
+            let mut indexed_z_scores: Vec<(usize, f64)> =
+                z_scores.into_iter().enumerate().collect();
+            indexed_z_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
+
+            let indices_to_remove: Vec<usize> = indexed_z_scores
+                .iter()
+                .take(num_to_remove)
+                .map(|&(index, _)| index)
+                .collect();
 
+            current_data = current_data
+                .into_iter()
+                .enumerate()
+                .filter(|(i, _)| !indices_to_remove.contains(i))
+                .map(|(_, v)| v)
+                .collect();
 
+            println!(
+                "Iteration complete: {} points remaining",
+                current_data.len()
+            );
         }
+
+        // Update the CDF with the cleaned data
+        // self.data = current_data;
+        // self.update_distribution();
+
+        // Return a normal distribution fitted to the outlier-trimmed data (zeros excluded)
+        let data_wo_zero: Vec<f64> = current_data
+            .iter()
+            .filter(|v| **v > 0)
+            .map(|v| *v as f64)
+            .collect();
+        let mean = data_wo_zero.clone().mean();
+        let std_dev = data_wo_zero.std_dev();
+        Normal::new(mean, std_dev).unwrap()
+    }
+
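+    /// Density of the fitted normal at `x`, approximated by a forward finite
+    /// difference of its CDF.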
+    pub fn pdf(&self, x: f64) -> f64 {
+        let epsilon = 1e-6; // Small value for numerical differentiation
+        let cdf_x = self.fitted_normal.cdf(x);
+        let cdf_x_plus_epsilon = self.fitted_normal.cdf(x + epsilon);
+
+        // Approximate the derivative
+        (cdf_x_plus_epsilon - cdf_x) / epsilon
+    }
+
+    pub fn frequency(&self, x: u32) -> usize {
+        *self.frequency.get(&x).unwrap_or(&0)
+    }
+
+    pub fn percentile(&self, percentile: f64) -> Option<u32> {
+        if !(0.0..=100.0).contains(&percentile) {
+            return None;
+        }
+
+        let index = (percentile / 100.0 * (self.total_count - 1) as f64).round() as usize;
+        self.data.get(index).cloned()
+    }
+
+    pub fn cdf(&self, x: u32) -> f64 {
+        self.distribution
+            .range(..=x)
+            .next_back()
+            .map_or(0.0, |(_, &prob)| prob)
+    }
+    pub fn proportion_under(&self, x: u32) -> f64 {
+        let count = self
+            .frequency
+            .iter()
+            .filter(|(&value, _)| value < x)
+            .map(|(_, &count)| count)
+            .sum::<usize>();
+        count as f64 / self.total_count as f64
+    }
+
+    pub fn proportion_above(&self, x: u32) -> f64 {
+        let count = self
+            .frequency
+            .iter()
+            .filter(|(&value, _)| value > x)
+            .map(|(_, &count)| count)
+            .sum::<usize>();
+        count as f64 / self.total_count as f64
+    }
+    // pub fn proportion_under(&self, x: u32) -> f64 {
+    //     self.distribution
+    //         .range(..x)
+    //         .next_back()
+    //         .map_or(0.0, |(_, &prob)| prob)
+    // }
+    //
+    // pub fn proportion_above(&self, x: u32) -> f64 {
+    //     1.0 - self.cdf(x)
+    // }
+
+    pub fn fitted_proportion_under(&self, x: f64) -> f64 {
+        self.fitted_normal.cdf(x)
+    }
+
+    pub fn fitted_proportion_above(&self, x: f64) -> f64 {
+        1.0 - self.fitted_normal.cdf(x)
+    }
+
+    pub fn get_fitted_parameters(&self) -> (f64, f64) {
+        (
+            self.fitted_normal.mean().unwrap(),
+            self.fitted_normal.std_dev().unwrap(),
+        )
     }
 }
+
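+/// `steps` evenly spaced values from `start` to `end` inclusive.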
+pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
+    if steps == 0 {
+        return vec![];
+    }
+    if steps == 1 {
+        return vec![start];
+    }
+
+    let step_size = (end - start) / (steps - 1) as f64;
+    (0..steps).map(|i| start + i as f64 * step_size).collect()
+}
+
+use rayon::prelude::*;
+
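+// Contiguous index ranges where the value is at/below, at/above, or between the
+// given threshold(s); gaps of up to `tolerance` positions are bridged.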
+pub fn ranges_under(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
+    get_ranges_parallel(vec, x, tolerance, |val, threshold| val <= threshold)
+}
+
+pub fn ranges_over(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
+    get_ranges_parallel(vec, x, tolerance, |val, threshold| val >= threshold)
+}
+
+pub fn ranges_between(
+    vec: &[u32],
+    lower: u32,
+    upper: u32,
+    tolerance: usize,
+) -> Vec<(usize, usize)> {
+    get_ranges_parallel(vec, (lower, upper), tolerance, |val, (l, u)| {
+        val >= l && val <= u
+    })
+}
+
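+/// Scan `vec` in parallel chunks and collect `(start, end)` index ranges where
+/// `compare(value, threshold)` holds, bridging gaps of up to `tolerance` positions.
+/// Ranges are built per chunk, so a run that straddles a chunk boundary is
+/// reported as separate ranges rather than merged.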
+pub fn get_ranges_parallel<T, F>(
+    vec: &[u32],
+    threshold: T,
+    tolerance: usize,
+    compare: F,
+) -> Vec<(usize, usize)>
+where
+    F: Fn(u32, T) -> bool + Sync,
+    T: Copy + Sync,
+{
+    if vec.is_empty() {
+        return Vec::new();
+    }
+
+    let chunk_size = (vec.len() / rayon::current_num_threads()).max(1);
+    vec.par_chunks(chunk_size)
+        .enumerate()
+        .flat_map(|(chunk_index, chunk)| {
+            let mut local_ranges = Vec::new();
+            let mut current_range: Option<(usize, usize)> = None;
+            let offset = chunk_index * chunk_size;
+
+            for (i, &val) in chunk.iter().enumerate() {
+                let global_index = offset + i;
+                if compare(val, threshold) {
+                    match current_range {
+                        Some((start, end)) if global_index <= end + tolerance + 1 => {
+                            current_range = Some((start, global_index));
+                        }
+                        Some((start, end)) => {
+                            local_ranges.push((start, end));
+                            current_range = Some((global_index, global_index));
+                        }
+                        None => {
+                            current_range = Some((global_index, global_index));
+                        }
+                    }
+                } else if let Some((start, end)) = current_range {
+                    if global_index > end + tolerance + 1 {
+                        local_ranges.push((start, end));
+                        current_range = None;
+                    }
+                }
+            }
+
+            if let Some(range) = current_range {
+                local_ranges.push(range);
+            }
+
+            local_ranges
+        })
+        .collect()
+}

+ 60 - 4
src/lib.rs

@@ -16,8 +16,8 @@ use std::{
 
 pub mod bam;
 pub mod bin;
-pub mod dict_reader;
 pub mod counts;
+pub mod dict_reader;
 
 #[derive(Debug)]
 pub struct Config {
@@ -343,7 +343,8 @@ pub fn par_whole_scan(dict_file: &str, bam_path: &str, out_dir: &str) -> anyhow:
 #[cfg(test)]
 mod tests {
 
-    use counts::{read_counts_from_file, Counts};
+    use counts::{ranges_over, ranges_under, Counts};
+    use plotly::{color::NamedColor, common::Marker, Bar, Plot, Scatter};
     use rust_htslib::bam::Reader;
 
     use super::*;
@@ -447,11 +448,66 @@ mod tests {
     fn load() -> anyhow::Result<()> {
         init();
         info!("loading");
-        let median_len = 4460.0;
         let count_file = "/data/longreads_basic_pipe/ROBIN/diag/scan/chr1_counts.tsv";
         let counts = Counts::from_files(vec![count_file]);
 
-        counts.cumulative_coverage("chr1");
+        let chr1_nd_reads = counts.nd_reads("chr1")?;
+        println!(
+            "Percentiles: 1% {}, 50% {}, 99% {}",
+            chr1_nd_reads.percentile(1.0).unwrap(),
+            chr1_nd_reads.percentile(50.0).unwrap(),
+            chr1_nd_reads.percentile(99.0).unwrap()
+        );
+        println!("< 6x: {:.2}%", chr1_nd_reads.proportion_under(6) * 100.0);
+        println!("> 15x: {:.2}%", chr1_nd_reads.proportion_above(15) * 100.0);
+        let d = counts.get("chr1")?;
+        
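+        // Contiguous regions with coverage >= 6 / <= 6, bridging gaps of up to 10 bins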
+        let all_ranges = ranges_over(&d, 6, 10);
+        println!("Ranges over 6x: {}", all_ranges.len());
+        let all_ranges = ranges_under(&d, 6, 10);
+        println!("Ranges under 6x: {}", all_ranges.len());
+
+        let mut plot = Plot::new();
+
+        let bar_x: Vec<u32> = (0..=chr1_nd_reads.percentile(99.0).unwrap()).collect();
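+        // Color each coverage value by band: <= 2x red, 3-6x orange, 7-14x yellow, >= 15x green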
+        let colors: Vec<plotly::color::Rgb> = bar_x
+            .iter()
+            .map(|&x| {
+                if x <= 2 {
+                    plotly::color::Rgb::new(193, 18, 31)
+                } else if x >= 15 {
+                    plotly::color::Rgb::new(138, 201, 38)
+                } else if x <= 6 {
+                    plotly::color::Rgb::new(243, 114, 44)
+                } else {
+                    plotly::color::Rgb::new(255, 202, 58)
+                }
+            })
+            .collect();
+
+        let bars = Bar::new(
+            bar_x.clone(),
+            bar_x
+                .iter()
+                .map(|x| chr1_nd_reads.frequency(*x) as f64 / chr1_nd_reads.data.len() as f64)
+                .collect(),
+        )
+        .show_legend(false)
+        .marker(Marker::new().color_array(colors));
+
+        plot.add_trace(bars);
+
+        // let trace = Scatter::new(
+        //     data.iter().map(|(x, _)| *x).collect::<Vec<f64>>(),
+        //     data.iter().map(|(_, y)| *y).collect::<Vec<f64>>(),
+        // );
+        // plot.add_trace(trace);
+        //
+        plot.use_local_plotly();
+
+        plot.write_image("/data/test2.svg", plotly::ImageFormat::SVG, 800, 600, 1.0);
+
+        // plot.write_html("out.html");
         Ok(())
     }
 

Some files were not shown because of the large number of changes