Your Name 1 year ago
parent
commit
6daa641b3b
4 changed files with 1125 additions and 40 deletions
  1. Cargo.lock  +707 −10
  2. Cargo.toml  +3 −0
  3. src/counts.rs  +355 −26
  4. src/lib.rs  +60 −4

File diff not shown because of its large size
+ 707 - 10
Cargo.lock


+ 3 - 0
Cargo.toml

@@ -22,4 +22,7 @@ flate2 = "1.0.30"
 csv = "1.3.0"
 dashmap = { version = "6.0.1", features = ["rayon"] }
 serde_json = "1.0.128"
+ordered-float = "4.2.2"
+statrs = "0.17.1"
+plotly = { version = "0.9.1", features = ["kaleido"] }
 

+ 355 - 26
src/counts.rs

@@ -1,11 +1,11 @@
-use log::warn;
+use log::{info, warn};
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use serde::{
     de::{self, Visitor},
     Deserialize, Deserializer,
 };
 use std::{
-    collections::HashMap,
+    collections::{BTreeMap, HashMap},
     fmt,
     fs::File,
     io::{BufRead, BufReader},
@@ -139,7 +139,7 @@ impl Counts {
                 Err(e) => {
                     warn!("Couldnt load {path}: {e}");
                     Vec::new()
-                },
+                }
             })
             .filter(|v| !v.is_empty())
             .collect();
@@ -152,28 +152,64 @@ impl Counts {
         Counts { data }
     }
 
-    pub fn cumulative_coverage(&self, contig: &str/* , median_len: f64 */) {
+    pub fn get(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
         if let Some(ccounts) = self.data.get(contig) {
+            Ok(ccounts.iter().map(|c| c.n_reads).collect())
+        } else {
+            anyhow::bail!("No {contig} in counts.")
+        }
+    }
+
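+    /// Percentile values of `n_reads` for `contig`. Percentiles are given as
+    /// fractions in `[0, 1]` (e.g. `0.99` for the 99th percentile) and are read
+    /// off the sorted counts by nearest-rank index; expects at least one count.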
+    pub fn calculate_percentiles(
+        &self,
+        contig: &str,
+        percentiles: &[f64],
+    ) -> anyhow::Result<Vec<f64>> {
+        if let Some(ccounts) = self.data.get(contig) {
+            let mut n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
+
+            n_reads.sort_unstable();
+
+            let cdf = ND::new(n_reads.clone());
+
+            println!("CDF at 13: {:?}", cdf.cdf(13));
+            println!("Percentile at 99: {:?}", cdf.percentile(99.0));
+            println!("above 15X: {:?}", cdf.proportion_above(15));
+            println!("above 15.1X: {:?}", cdf.fitted_proportion_above(15.1));
+            println!("under 6X: {:?}", cdf.proportion_under(6));
+
+            Ok(percentiles
+                .iter()
+                .map(|&p| {
+                    let index = (p * (n_reads.len() - 1) as f64).round() as usize;
+                    n_reads[index] as f64
+                })
+                .collect())
+        } else {
+            anyhow::bail!("No {contig} in counts.")
+        }
+    }
+
+    pub fn cumulative_coverage(
+        &self,
+        contig: &str, /* , median_len: f64 */
+    ) -> anyhow::Result<Vec<(u32, f64)>> {
+        if let Some(ccounts) = self.data.get(contig) {
+            if ccounts.is_empty() {
+                anyhow::bail!("No counts for {contig}")
+            }
             let mut ccounts = ccounts.clone();
             ccounts.sort_by_key(|c| c.n_reads);
             let n_counts = ccounts.len();
-            let min = ccounts.first().unwrap();
-            println!("{min}");
-            let max = ccounts.last().unwrap();
-            println!("{max}");
-            // let start = max.position.start;
-            // let end = max.position.end;
-            // let len = end - start + 1;
-            // let cov = max.n_reads as f64 * median_len / len as f64;
-            // println!("{cov}");
-            let index_50 = (0.5 * (n_counts - 1) as f64).round() as usize;
-
-            let median = ccounts.get(index_50).unwrap();
-            println!("50% {}", median.n_reads);
-
-            let index_95 =  (0.99 * (n_counts - 1) as f64).round() as usize;
+            let per = |x: f64| (x * (n_counts - 1) as f64).round() as usize;
+
+            let median = ccounts.get(per(0.5)).unwrap();
+
+            let index_95 = per(0.99);
             let v = ccounts.get(index_95).unwrap();
             println!("99% {}", v.n_reads);
+            println!("99% {:?}", self.calculate_percentiles(contig, &[0.99])?);
 
             let cov99 = ccounts.get(index_95).unwrap().n_reads;
             // let mut last_start = 0;
@@ -183,18 +219,311 @@ impl Counts {
                 let frac = n_sup / n_counts as f64;
                 all_frac.push((i, frac));
             }
-            println!("{:#?}", all_frac);
 
-            let mut all_frac = vec![(0, 0.0)];
-            for i in 1..=cov99 {
-                let n_sup = ccounts.iter().filter(|c| c.n_reads < i).count() as f64;
-                let frac = n_sup / n_counts as f64;
-                all_frac.push((i, frac));
+            Ok(all_frac)
+        } else {
+            anyhow::bail!("No {contig} in counts")
+        }
+    }
+
+    pub fn nd_reads(&self, contig: &str) -> anyhow::Result<ND> {
+        if let Some(ccounts) = self.data.get(contig) {
+            Ok(ND::new(ccounts.iter().map(|c| c.n_reads).collect()))
+        } else {
+            anyhow::bail!("No {contig} in counts")
+        }
+    }
+}
+
+use statrs::{
+    distribution::{ContinuousCDF, Normal},
+    statistics::{Distribution, Statistics},
+};
+
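+/// Empirical distribution of read counts for one contig: the sorted raw counts,
+/// an empirical CDF (count -> cumulative probability), per-count frequencies,
+/// and a normal distribution fitted to the counts.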
+pub struct ND {
+    pub data: Vec<u32>,
+    pub distribution: BTreeMap<u32, f64>,
+    pub total_count: usize,
+    pub frequency: HashMap<u32, usize>,
+    pub fitted_normal: Normal,
+}
+
+impl ND {
+    fn new(mut data: Vec<u32>) -> Self {
+        data.sort_unstable();
+        let n = data.len();
+        info!("n values {n}");
+        let mut frequency = HashMap::new();
+        let mut distribution = BTreeMap::new();
+
+        for &value in &data {
+            *frequency.entry(value).or_insert(0) += 1;
+        }
+
+        // Build the empirical CDF from the sorted data; iterating the HashMap here
+        // would visit counts in arbitrary order and corrupt the cumulative sum
+        let mut cumulative_count = 0;
+        for &value in &data {
+            cumulative_count += 1;
+            let cumulative_prob = cumulative_count as f64 / n as f64;
+            distribution.insert(value, cumulative_prob);
+        }
+
+        // Fit normal distribution
+        let fitted_normal = Self::fit_normal(&data);
+        // let fitted_normal = Self::fit_normal_with_outlier_removal(self, 2);
+
+        // let mean = data.iter().map(|&x| x as f64).sum::<f64>() / n as f64;
+        // let variance = data.iter().map(|&x| (x as f64 - mean).powi(2)).sum::<f64>() / n as f64;
+        // let std_dev = variance.sqrt();
+        //
+        // let fitted_normal = Normal::new(mean, std_dev).unwrap();
+
+        Self {
+            data,
+            distribution,
+            frequency,
+            total_count: n,
+            fitted_normal,
+        }
+    }
+
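+    /// Fit a normal distribution to the counts; values <= 1 (empty and
+    /// single-read bins) are excluded from the fit.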
+    fn fit_normal(data: &[u32]) -> Normal {
+        // Mean and variance in a single pass over values > 1; n counts only the
+        // values that survive the filter so the moments are consistent with it
+        let (n, sum, sum_sq) = data
+            .iter()
+            .filter(|v| **v > 1)
+            .fold((0.0f64, 0.0f64, 0.0f64), |(n, sum, sum_sq), x| {
+                let x = *x as f64;
+                (n + 1.0, sum + x, sum_sq + x * x)
+            });
+
+        let mean = sum / n;
+        let variance = (sum_sq / n) - (mean * mean);
+        let std_dev = variance.sqrt();
+
+        Normal::new(mean, std_dev).unwrap()
+    }
+
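+    /// Alternative fit that iteratively drops the highest-z-score values before
+    /// fitting; currently unused (the call from `new` is commented out above).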
+    fn fit_normal_with_outlier_removal(
+        data: Vec<u32>,
+        max_iterations: usize,
+        z_score_threshold: f64,
+        removal_percentage: f64,
+    ) -> Normal {
+        let mut current_data = data.clone();
+
+        for _ in 0..max_iterations {
+            // Calculate mean and standard deviation
+            let mean = current_data.iter().map(|&x| x as f64).mean();
+            let std_dev = current_data.iter().map(|&x| x as f64).std_dev();
+
+            // Identify outliers
+            let z_scores: Vec<f64> = current_data
+                .iter()
+                .map(|&x| (x as f64 - mean).abs() / std_dev)
+                .collect();
+
+            let outliers: Vec<bool> = z_scores.iter().map(|&z| z > z_score_threshold).collect();
+
+            // If no outliers, break
+            if !outliers.iter().any(|&x| x) {
+                break;
             }
-            println!("{:#?}", all_frac);
 
+            // Remove a percentage of the worst outliers
+            let num_to_remove = (current_data.len() as f64 * removal_percentage).round() as usize;
+            let mut indexed_z_scores: Vec<(usize, f64)> =
+                z_scores.into_iter().enumerate().collect();
+            indexed_z_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
+
+            let indices_to_remove: Vec<usize> = indexed_z_scores
+                .iter()
+                .take(num_to_remove)
+                .map(|&(index, _)| index)
+                .collect();
 
+            current_data = current_data
+                .into_iter()
+                .enumerate()
+                .filter(|(i, _)| !indices_to_remove.contains(i))
+                .map(|(_, v)| v)
+                .collect();
 
+            println!(
+                "Iteration complete: {} points remaining",
+                current_data.len()
+            );
         }
+
+        // Update the CDF with the cleaned data
+        // self.data = current_data;
+        // self.update_distribution();
+
+        // Return a normal distribution fitted to the outlier-trimmed data (zeros excluded)
+        let data_wo_zero: Vec<f64> = current_data
+            .iter()
+            .filter(|v| **v > 0)
+            .map(|v| *v as f64)
+            .collect();
+        let mean = data_wo_zero.clone().mean();
+        let std_dev = data_wo_zero.std_dev();
+        Normal::new(mean, std_dev).unwrap()
+    }
+
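+    /// Density of the fitted normal at `x`, approximated by a forward finite
+    /// difference of its CDF.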
+    pub fn pdf(&self, x: f64) -> f64 {
+        let epsilon = 1e-6; // Small value for numerical differentiation
+        let cdf_x = self.fitted_normal.cdf(x);
+        let cdf_x_plus_epsilon = self.fitted_normal.cdf(x + epsilon);
+
+        // Approximate the derivative
+        (cdf_x_plus_epsilon - cdf_x) / epsilon
+    }
+
+    pub fn frequency(&self, x: u32) -> usize {
+        *self.frequency.get(&x).unwrap_or(&0)
+    }
+
+    pub fn percentile(&self, percentile: f64) -> Option<u32> {
+        if !(0.0..=100.0).contains(&percentile) {
+            return None;
+        }
+
+        let index = (percentile / 100.0 * (self.total_count - 1) as f64).round() as usize;
+        self.data.get(index).cloned()
+    }
+
+    pub fn cdf(&self, x: u32) -> f64 {
+        self.distribution
+            .range(..=x)
+            .next_back()
+            .map_or(0.0, |(_, &prob)| prob)
+    }
+    pub fn proportion_under(&self, x: u32) -> f64 {
+        let count = self
+            .frequency
+            .iter()
+            .filter(|(&value, _)| value < x)
+            .map(|(_, &count)| count)
+            .sum::<usize>();
+        count as f64 / self.total_count as f64
+    }
+
+    pub fn proportion_above(&self, x: u32) -> f64 {
+        let count = self
+            .frequency
+            .iter()
+            .filter(|(&value, _)| value > x)
+            .map(|(_, &count)| count)
+            .sum::<usize>();
+        count as f64 / self.total_count as f64
+    }
+    // pub fn proportion_under(&self, x: u32) -> f64 {
+    //     self.distribution
+    //         .range(..x)
+    //         .next_back()
+    //         .map_or(0.0, |(_, &prob)| prob)
+    // }
+    //
+    // pub fn proportion_above(&self, x: u32) -> f64 {
+    //     1.0 - self.cdf(x)
+    // }
+
+    pub fn fitted_proportion_under(&self, x: f64) -> f64 {
+        self.fitted_normal.cdf(x)
+    }
+
+    pub fn fitted_proportion_above(&self, x: f64) -> f64 {
+        1.0 - self.fitted_normal.cdf(x)
+    }
+
+    pub fn get_fitted_parameters(&self) -> (f64, f64) {
+        (
+            self.fitted_normal.mean().unwrap(),
+            self.fitted_normal.std_dev().unwrap(),
+        )
     }
 }
+
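+/// `steps` evenly spaced values from `start` to `end` inclusive.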
+pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
+    if steps == 0 {
+        return vec![];
+    }
+    if steps == 1 {
+        return vec![start];
+    }
+
+    let step_size = (end - start) / (steps - 1) as f64;
+    (0..steps).map(|i| start + i as f64 * step_size).collect()
+}
+
+use rayon::prelude::*;
+
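+// Contiguous index ranges where the value is at/below, at/above, or between the
+// given threshold(s); gaps of up to `tolerance` positions are bridged.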
+pub fn ranges_under(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
+    get_ranges_parallel(vec, x, tolerance, |val, threshold| val <= threshold)
+}
+
+pub fn ranges_over(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
+    get_ranges_parallel(vec, x, tolerance, |val, threshold| val >= threshold)
+}
+
+pub fn ranges_between(
+    vec: &[u32],
+    lower: u32,
+    upper: u32,
+    tolerance: usize,
+) -> Vec<(usize, usize)> {
+    get_ranges_parallel(vec, (lower, upper), tolerance, |val, (l, u)| {
+        val >= l && val <= u
+    })
+}
+
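+/// Scan `vec` in parallel chunks and collect `(start, end)` index ranges where
+/// `compare(value, threshold)` holds, bridging gaps of up to `tolerance` positions.
+/// Ranges are built per chunk, so a run that straddles a chunk boundary is
+/// reported as separate ranges rather than merged.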
+pub fn get_ranges_parallel<T, F>(
+    vec: &[u32],
+    threshold: T,
+    tolerance: usize,
+    compare: F,
+) -> Vec<(usize, usize)>
+where
+    F: Fn(u32, T) -> bool + Sync,
+    T: Copy + Sync,
+{
+    if vec.is_empty() {
+        return Vec::new();
+    }
+
+    let chunk_size = (vec.len() / rayon::current_num_threads()).max(1);
+    vec.par_chunks(chunk_size)
+        .enumerate()
+        .flat_map(|(chunk_index, chunk)| {
+            let mut local_ranges = Vec::new();
+            let mut current_range: Option<(usize, usize)> = None;
+            let offset = chunk_index * chunk_size;
+
+            for (i, &val) in chunk.iter().enumerate() {
+                let global_index = offset + i;
+                if compare(val, threshold) {
+                    match current_range {
+                        Some((start, end)) if global_index <= end + tolerance + 1 => {
+                            current_range = Some((start, global_index));
+                        }
+                        Some((start, end)) => {
+                            local_ranges.push((start, end));
+                            current_range = Some((global_index, global_index));
+                        }
+                        None => {
+                            current_range = Some((global_index, global_index));
+                        }
+                    }
+                } else if let Some((start, end)) = current_range {
+                    if global_index > end + tolerance + 1 {
+                        local_ranges.push((start, end));
+                        current_range = None;
+                    }
+                }
+            }
+
+            if let Some(range) = current_range {
+                local_ranges.push(range);
+            }
+
+            local_ranges
+        })
+        .collect()
+}

+ 60 - 4
src/lib.rs

@@ -16,8 +16,8 @@ use std::{
 
 pub mod bam;
 pub mod bin;
-pub mod dict_reader;
 pub mod counts;
+pub mod dict_reader;
 
 #[derive(Debug)]
 pub struct Config {
@@ -343,7 +343,8 @@ pub fn par_whole_scan(dict_file: &str, bam_path: &str, out_dir: &str) -> anyhow:
 #[cfg(test)]
 mod tests {
 
-    use counts::{read_counts_from_file, Counts};
+    use counts::{ranges_over, ranges_under, Counts};
+    use plotly::{color::NamedColor, common::Marker, Bar, Plot, Scatter};
     use rust_htslib::bam::Reader;
 
     use super::*;
@@ -447,11 +448,66 @@ mod tests {
     fn load() -> anyhow::Result<()> {
         init();
         info!("loading");
-        let median_len = 4460.0;
         let count_file = "/data/longreads_basic_pipe/ROBIN/diag/scan/chr1_counts.tsv";
         let counts = Counts::from_files(vec![count_file]);
 
-        counts.cumulative_coverage("chr1");
+        let chr1_nd_reads = counts.nd_reads("chr1")?;
+        println!(
+            "Percentiles: 1% {}, 50% {}, 99% {}",
+            chr1_nd_reads.percentile(1.0).unwrap(),
+            chr1_nd_reads.percentile(50.0).unwrap(),
+            chr1_nd_reads.percentile(99.0).unwrap()
+        );
+        println!("< 6x: {:.2}%", chr1_nd_reads.proportion_under(6) * 100.0);
+        println!("> 15x: {:.2}%", chr1_nd_reads.proportion_above(15) * 100.0);
+        let d = counts.get("chr1")?;
+        
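+        // Contiguous regions with coverage >= 6 / <= 6, bridging gaps of up to 10 bins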
+        let all_ranges = ranges_over(&d, 6, 10);
+        println!("Ranges over 6x: {}", all_ranges.len());
+        let all_ranges = ranges_under(&d, 6, 10);
+        println!("Ranges under 6x: {}", all_ranges.len());
+
+        let mut plot = Plot::new();
+
+        let bar_x: Vec<u32> = (0..=chr1_nd_reads.percentile(99.0).unwrap()).collect();
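+        // Color each coverage value by band: <= 2x red, 3-6x orange, 7-14x yellow, >= 15x green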
+        let colors: Vec<plotly::color::Rgb> = bar_x
+            .iter()
+            .map(|&x| {
+                if x <= 2 {
+                    plotly::color::Rgb::new(193, 18, 31)
+                } else if x >= 15 {
+                    plotly::color::Rgb::new(138, 201, 38)
+                } else if x <= 6 {
+                    plotly::color::Rgb::new(243, 114, 44)
+                } else {
+                    plotly::color::Rgb::new(255, 202, 58)
+                }
+            })
+            .collect();
+
+        let bars = Bar::new(
+            bar_x.clone(),
+            bar_x
+                .iter()
+                .map(|x| chr1_nd_reads.frequency(*x) as f64 / chr1_nd_reads.data.len() as f64)
+                .collect(),
+        )
+        .show_legend(false)
+        .marker(Marker::new().color_array(colors));
+
+        plot.add_trace(bars);
+
+        // let trace = Scatter::new(
+        //     data.iter().map(|(x, _)| *x).collect::<Vec<f64>>(),
+        //     data.iter().map(|(_, y)| *y).collect::<Vec<f64>>(),
+        // );
+        // plot.add_trace(trace);
+        //
+        plot.use_local_plotly();
+
+        plot.write_image("/data/test2.svg", plotly::ImageFormat::SVG, 800, 600, 1.0);
+
+        // plot.write_html("out.html");
         Ok(())
     }
 

Some files were not shown because of the large number of changes