|
|
@@ -1,11 +1,11 @@
|
|
|
-use log::warn;
|
|
|
+use log::{info, warn};
|
|
|
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
|
|
use serde::{
|
|
|
de::{self, Visitor},
|
|
|
Deserialize, Deserializer,
|
|
|
};
|
|
|
use std::{
|
|
|
- collections::HashMap,
|
|
|
+ collections::{BTreeMap, HashMap},
|
|
|
fmt,
|
|
|
fs::File,
|
|
|
io::{BufRead, BufReader},
|
|
|
@@ -139,7 +139,7 @@ impl Counts {
|
|
|
Err(e) => {
|
|
|
warn!("Couldnt load {path}: {e}");
|
|
|
Vec::new()
|
|
|
- },
|
|
|
+ }
|
|
|
})
|
|
|
.filter(|v| !v.is_empty())
|
|
|
.collect();
|
|
|
@@ -152,28 +152,64 @@ impl Counts {
|
|
|
Counts { data }
|
|
|
}
|
|
|
|
|
|
- pub fn cumulative_coverage(&self, contig: &str/* , median_len: f64 */) {
|
|
|
+ pub fn get(&self, contig: &str) -> anyhow::Result<Vec<u32>> {
|
|
|
if let Some(ccounts) = self.data.get(contig) {
|
|
|
+ Ok(ccounts.iter().map(|c| c.n_reads).collect())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn calculate_percentiles(
|
|
|
+ &self,
|
|
|
+ contig: &str,
|
|
|
+ percentiles: &[f64],
|
|
|
+ ) -> anyhow::Result<Vec<f64>> {
|
|
|
+ if let Some(ccounts) = self.data.get(contig) {
|
|
|
+ let mut n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
|
|
|
+
|
|
|
+ n_reads.sort_unstable();
|
|
|
+
|
|
|
+ let cdf = ND::new(n_reads.clone());
|
|
|
+
|
|
|
+ println!("CDF at 13: {:?}", cdf.cdf(13));
|
|
|
+ println!("Percentile at 99: {:?}", cdf.percentile(99.0));
|
|
|
+ println!("above 15X: {:?}", cdf.proportion_above(15));
|
|
|
+ println!("above 15.1X: {:?}", cdf.fitted_proportion_above(15.1));
|
|
|
+ println!("under 6X: {:?}", cdf.proportion_under(6));
|
|
|
+
|
|
|
+ Ok(percentiles
|
|
|
+ .iter()
|
|
|
+ .map(|&p| {
|
|
|
+ let index = (p * (n_reads.len() - 1) as f64).round() as usize;
|
|
|
+ n_reads[index] as f64
|
|
|
+ })
|
|
|
+ .collect())
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts.")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn cumulative_coverage(
|
|
|
+ &self,
|
|
|
+ contig: &str, /* , median_len: f64 */
|
|
|
+ ) -> anyhow::Result<Vec<(u32, f64)>> {
|
|
|
+ if let Some(ccounts) = self.data.get(contig) {
|
|
|
+ let n_reads: Vec<u32> = ccounts.iter().map(|c| c.n_reads).collect();
|
|
|
+ if ccounts.is_empty() {
|
|
|
+ anyhow::bail!("No counts for {contig}")
|
|
|
+ }
|
|
|
let mut ccounts = ccounts.clone();
|
|
|
ccounts.sort_by_key(|c| c.n_reads);
|
|
|
let n_counts = ccounts.len();
|
|
|
- let min = ccounts.first().unwrap();
|
|
|
- println!("{min}");
|
|
|
- let max = ccounts.last().unwrap();
|
|
|
- println!("{max}");
|
|
|
- // let start = max.position.start;
|
|
|
- // let end = max.position.end;
|
|
|
- // let len = end - start + 1;
|
|
|
- // let cov = max.n_reads as f64 * median_len / len as f64;
|
|
|
- // println!("{cov}");
|
|
|
- let index_50 = (0.5 * (n_counts - 1) as f64).round() as usize;
|
|
|
-
|
|
|
- let median = ccounts.get(index_50).unwrap();
|
|
|
- println!("50% {}", median.n_reads);
|
|
|
-
|
|
|
- let index_95 = (0.99 * (n_counts - 1) as f64).round() as usize;
|
|
|
+ let per = |x: f64| (x * (n_counts - 1) as f64).round() as usize;
|
|
|
+
|
|
|
+ let median = ccounts.get(per(0.5)).unwrap();
|
|
|
+
|
|
|
+ let index_95 = (0.99 * (n_counts - 1) as f64).round() as usize;
|
|
|
let v = ccounts.get(index_95).unwrap();
|
|
|
println!("99% {}", v.n_reads);
|
|
|
+ println!("99% {:?}", self.calculate_percentiles(contig, &[0.99])?);
|
|
|
|
|
|
let cov99 = ccounts.get(index_95).unwrap().n_reads;
|
|
|
// let mut last_start = 0;
|
|
|
@@ -183,18 +219,311 @@ impl Counts {
|
|
|
let frac = n_sup / n_counts as f64;
|
|
|
all_frac.push((i, frac));
|
|
|
}
|
|
|
- println!("{:#?}", all_frac);
|
|
|
|
|
|
- let mut all_frac = vec![(0, 0.0)];
|
|
|
- for i in 1..=cov99 {
|
|
|
- let n_sup = ccounts.iter().filter(|c| c.n_reads < i).count() as f64;
|
|
|
- let frac = n_sup / n_counts as f64;
|
|
|
- all_frac.push((i, frac));
|
|
|
+ Ok(all_frac)
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn nd_reads(&self, contig: &str) -> anyhow::Result<ND> {
|
|
|
+ if let Some(ccounts) = self.data.get(contig) {
|
|
|
+ Ok(ND::new(ccounts.iter().map(|c| c.n_reads).collect()))
|
|
|
+ } else {
|
|
|
+ anyhow::bail!("No {contig} in counts")
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+use statrs::{
|
|
|
+ distribution::{ContinuousCDF, Normal},
|
|
|
+ statistics::{Distribution, Statistics},
|
|
|
+};
|
|
|
+
|
|
|
+pub struct ND {
|
|
|
+ pub data: Vec<u32>,
|
|
|
+ pub distribution: BTreeMap<u32, f64>,
|
|
|
+ pub total_count: usize,
|
|
|
+ pub frequency: HashMap<u32, usize>,
|
|
|
+ pub fitted_normal: Normal,
|
|
|
+}
|
|
|
+
|
|
|
+impl ND {
|
|
|
+ fn new(mut data: Vec<u32>) -> Self {
|
|
|
+ data.sort_unstable();
|
|
|
+ let n = data.len();
|
|
|
+ info!("n values {n}");
|
|
|
+ let mut frequency = HashMap::new();
|
|
|
+ let mut distribution = BTreeMap::new();
|
|
|
+
|
|
|
+ for &value in &data {
|
|
|
+ *frequency.entry(value).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut cumulative_count = 0;
|
|
|
+ for (&value, &count) in &frequency {
|
|
|
+ cumulative_count += count;
|
|
|
+ let cumulative_prob = cumulative_count as f64 / n as f64;
|
|
|
+ distribution.insert(value, cumulative_prob);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Fit normal distribution
|
|
|
+ let fitted_normal = Self::fit_normal(&data);
|
|
|
+ // let fitted_normal = Self::fit_normal_with_outlier_removal(self, 2);
|
|
|
+
|
|
|
+ // let mean = data.iter().map(|&x| x as f64).sum::<f64>() / n as f64;
|
|
|
+ // let variance = data.iter().map(|&x| (x as f64 - mean).powi(2)).sum::<f64>() / n as f64;
|
|
|
+ // let std_dev = variance.sqrt();
|
|
|
+ //
|
|
|
+ // let fitted_normal = Normal::new(mean, std_dev).unwrap();
|
|
|
+
|
|
|
+ Self {
|
|
|
+ data,
|
|
|
+ distribution,
|
|
|
+ frequency,
|
|
|
+ total_count: n,
|
|
|
+ fitted_normal,
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ fn fit_normal(data: &[u32]) -> Normal {
|
|
|
+ let n = data.len() as f64;
|
|
|
+
|
|
|
+ // Calculate mean and variance in a single pass
|
|
|
+ let (sum, sum_sq) = data
|
|
|
+ .iter()
|
|
|
+ .filter(|v| **v > 1)
|
|
|
+ .fold((0.0, 0.0), |(sum, sum_sq), x| {
|
|
|
+ let x = *x as f64;
|
|
|
+ (sum + x, sum_sq + x * x)
|
|
|
+ });
|
|
|
+
|
|
|
+ let mean = sum / n;
|
|
|
+ let variance = (sum_sq / n) - (mean * mean);
|
|
|
+ let std_dev = variance.sqrt();
|
|
|
+
|
|
|
+ Normal::new(mean, std_dev).unwrap()
|
|
|
+ }
|
|
|
+
|
|
|
+ fn fit_normal_with_outlier_removal(
|
|
|
+ data: Vec<u32>,
|
|
|
+ max_iterations: usize,
|
|
|
+ z_score_threshold: f64,
|
|
|
+ removal_percentage: f64,
|
|
|
+ ) -> Normal {
|
|
|
+ let mut current_data = data.clone();
|
|
|
+
|
|
|
+ for _ in 0..max_iterations {
|
|
|
+ // Calculate mean and standard deviation
|
|
|
+ let mean = current_data.iter().map(|&x| x as f64).mean();
|
|
|
+ let std_dev = current_data.iter().map(|&x| x as f64).std_dev();
|
|
|
+
|
|
|
+ // Identify outliers
|
|
|
+ let z_scores: Vec<f64> = current_data
|
|
|
+ .iter()
|
|
|
+ .map(|&x| (x as f64 - mean).abs() / std_dev)
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let outliers: Vec<bool> = z_scores.iter().map(|&z| z > z_score_threshold).collect();
|
|
|
+
|
|
|
+ // If no outliers, break
|
|
|
+ if !outliers.iter().any(|&x| x) {
|
|
|
+ break;
|
|
|
}
|
|
|
- println!("{:#?}", all_frac);
|
|
|
|
|
|
+ // Remove a percentage of the worst outliers
|
|
|
+ let num_to_remove = (current_data.len() as f64 * removal_percentage).round() as usize;
|
|
|
+ let mut indexed_z_scores: Vec<(usize, f64)> =
|
|
|
+ z_scores.into_iter().enumerate().collect();
|
|
|
+ indexed_z_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
|
|
+
|
|
|
+ let indices_to_remove: Vec<usize> = indexed_z_scores
|
|
|
+ .iter()
|
|
|
+ .take(num_to_remove)
|
|
|
+ .map(|&(index, _)| index)
|
|
|
+ .collect();
|
|
|
|
|
|
+ current_data = current_data
|
|
|
+ .into_iter()
|
|
|
+ .enumerate()
|
|
|
+ .filter(|(i, _)| !indices_to_remove.contains(i))
|
|
|
+ .map(|(_, v)| v)
|
|
|
+ .collect();
|
|
|
|
|
|
+ println!(
|
|
|
+ "Iteration complete: {} points remaining",
|
|
|
+ current_data.len()
|
|
|
+ );
|
|
|
}
|
|
|
+
|
|
|
+ // Update the CDF with the cleaned data
|
|
|
+ // self.data = current_data;
|
|
|
+ // self.update_distribution();
|
|
|
+
|
|
|
+ // Return the fitted normal distribution
|
|
|
+ let data_wo_zero: Vec<f64> = data.iter().filter(|v| **v > 0).map(|v| *v as f64).collect();
|
|
|
+ let mean = data_wo_zero.clone().mean();
|
|
|
+ let std_dev = data_wo_zero.std_dev();
|
|
|
+ Normal::new(mean, std_dev).unwrap()
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn pdf(&self, x: f64) -> f64 {
|
|
|
+ let epsilon = 1e-6; // Small value for numerical differentiation
|
|
|
+ let cdf_x = self.fitted_normal.cdf(x);
|
|
|
+ let cdf_x_plus_epsilon = self.fitted_normal.cdf(x + epsilon);
|
|
|
+
|
|
|
+ // Approximate the derivative
|
|
|
+ (cdf_x_plus_epsilon - cdf_x) / epsilon
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn frequency(&self, x: u32) -> usize {
|
|
|
+ *self.frequency.get(&x).unwrap_or(&0)
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn percentile(&self, percentile: f64) -> Option<u32> {
|
|
|
+ if !(0.0..=100.0).contains(&percentile) {
|
|
|
+ return None;
|
|
|
+ }
|
|
|
+
|
|
|
+ let index = (percentile / 100.0 * (self.total_count - 1) as f64).round() as usize;
|
|
|
+ self.data.get(index).cloned()
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn cdf(&self, x: u32) -> f64 {
|
|
|
+ self.distribution
|
|
|
+ .range(..=x)
|
|
|
+ .next_back()
|
|
|
+ .map_or(0.0, |(_, &prob)| prob)
|
|
|
+ }
|
|
|
+ pub fn proportion_under(&self, x: u32) -> f64 {
|
|
|
+ let count = self
|
|
|
+ .frequency
|
|
|
+ .iter()
|
|
|
+ .filter(|(&value, _)| value < x)
|
|
|
+ .map(|(_, &count)| count)
|
|
|
+ .sum::<usize>();
|
|
|
+ count as f64 / self.total_count as f64
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn proportion_above(&self, x: u32) -> f64 {
|
|
|
+ let count = self
|
|
|
+ .frequency
|
|
|
+ .iter()
|
|
|
+ .filter(|(&value, _)| value > x)
|
|
|
+ .map(|(_, &count)| count)
|
|
|
+ .sum::<usize>();
|
|
|
+ count as f64 / self.total_count as f64
|
|
|
+ }
|
|
|
+ // pub fn proportion_under(&self, x: u32) -> f64 {
|
|
|
+ // self.distribution
|
|
|
+ // .range(..x)
|
|
|
+ // .next_back()
|
|
|
+ // .map_or(0.0, |(_, &prob)| prob)
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ // pub fn proportion_above(&self, x: u32) -> f64 {
|
|
|
+ // 1.0 - self.cdf(x)
|
|
|
+ // }
|
|
|
+
|
|
|
+ pub fn fitted_proportion_under(&self, x: f64) -> f64 {
|
|
|
+ self.fitted_normal.cdf(x)
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn fitted_proportion_above(&self, x: f64) -> f64 {
|
|
|
+ 1.0 - self.fitted_normal.cdf(x)
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn get_fitted_parameters(&self) -> (f64, f64) {
|
|
|
+ (
|
|
|
+ self.fitted_normal.mean().unwrap(),
|
|
|
+ self.fitted_normal.std_dev().unwrap(),
|
|
|
+ )
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+pub fn generate_range(start: f64, end: f64, steps: usize) -> Vec<f64> {
|
|
|
+ if steps == 0 {
|
|
|
+ return vec![];
|
|
|
+ }
|
|
|
+ if steps == 1 {
|
|
|
+ return vec![start];
|
|
|
+ }
|
|
|
+
|
|
|
+ let step_size = (end - start) / (steps - 1) as f64;
|
|
|
+ (0..steps).map(|i| start + i as f64 * step_size).collect()
|
|
|
+}
|
|
|
+
|
|
|
+use rayon::prelude::*;
|
|
|
+
|
|
|
+pub fn ranges_under(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
|
|
|
+ get_ranges_parallel(vec, x, tolerance, |val, threshold| val <= threshold)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn ranges_over(vec: &[u32], x: u32, tolerance: usize) -> Vec<(usize, usize)> {
|
|
|
+ get_ranges_parallel(vec, x, tolerance, |val, threshold| val >= threshold)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn ranges_between(
|
|
|
+ vec: &[u32],
|
|
|
+ lower: u32,
|
|
|
+ upper: u32,
|
|
|
+ tolerance: usize,
|
|
|
+) -> Vec<(usize, usize)> {
|
|
|
+ get_ranges_parallel(vec, (lower, upper), tolerance, |val, (l, u)| {
|
|
|
+ val >= l && val <= u
|
|
|
+ })
|
|
|
+}
|
|
|
+
|
|
|
+pub fn get_ranges_parallel<T, F>(
|
|
|
+ vec: &[u32],
|
|
|
+ threshold: T,
|
|
|
+ tolerance: usize,
|
|
|
+ compare: F,
|
|
|
+) -> Vec<(usize, usize)>
|
|
|
+where
|
|
|
+ F: Fn(u32, T) -> bool + Sync,
|
|
|
+ T: Copy + Sync,
|
|
|
+{
|
|
|
+ if vec.is_empty() {
|
|
|
+ return Vec::new();
|
|
|
+ }
|
|
|
+
|
|
|
+ let chunk_size = (vec.len() / rayon::current_num_threads()).max(1);
|
|
|
+ vec.par_chunks(chunk_size)
|
|
|
+ .enumerate()
|
|
|
+ .flat_map(|(chunk_index, chunk)| {
|
|
|
+ let mut local_ranges = Vec::new();
|
|
|
+ let mut current_range: Option<(usize, usize)> = None;
|
|
|
+ let offset = chunk_index * chunk_size;
|
|
|
+
|
|
|
+ for (i, &val) in chunk.iter().enumerate() {
|
|
|
+ let global_index = offset + i;
|
|
|
+ if compare(val, threshold) {
|
|
|
+ match current_range {
|
|
|
+ Some((start, end)) if global_index <= end + tolerance + 1 => {
|
|
|
+ current_range = Some((start, global_index));
|
|
|
+ }
|
|
|
+ Some((start, end)) => {
|
|
|
+ local_ranges.push((start, end));
|
|
|
+ current_range = Some((global_index, global_index));
|
|
|
+ }
|
|
|
+ None => {
|
|
|
+ current_range = Some((global_index, global_index));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if let Some((start, end)) = current_range {
|
|
|
+ if global_index > end + tolerance + 1 {
|
|
|
+ local_ranges.push((start, end));
|
|
|
+ current_range = None;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if let Some(range) = current_range {
|
|
|
+ local_ranges.push(range);
|
|
|
+ }
|
|
|
+
|
|
|
+ local_ranges
|
|
|
+ })
|
|
|
+ .collect()
|
|
|
+}
|