|
|
@@ -0,0 +1,200 @@
|
|
|
+use log::warn;
|
|
|
+use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
|
|
+use serde::{
|
|
|
+ de::{self, Visitor},
|
|
|
+ Deserialize, Deserializer,
|
|
|
+};
|
|
|
+use std::{
|
|
|
+ collections::HashMap,
|
|
|
+ fmt,
|
|
|
+ fs::File,
|
|
|
+ io::{BufRead, BufReader},
|
|
|
+ str::FromStr,
|
|
|
+};
|
|
|
+
|
|
|
+#[derive(Debug, Clone)]
|
|
|
+pub struct Count {
|
|
|
+ pub position: CountRange,
|
|
|
+ pub n_reads: u32,
|
|
|
+ pub n_low_mapq: u32,
|
|
|
+ pub frac_sa: f32,
|
|
|
+ pub sa_outlier: bool,
|
|
|
+ pub frac_se: f32,
|
|
|
+ pub se_outlier: bool,
|
|
|
+}
|
|
|
+
|
|
|
+impl fmt::Display for Count {
|
|
|
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
+ write!(
|
|
|
+ f,
|
|
|
+ "{}\t{}\t{}\t{:.6}\t{}\t{:.6}\t{}",
|
|
|
+ self.position,
|
|
|
+ self.n_reads,
|
|
|
+ self.n_low_mapq,
|
|
|
+ self.frac_sa,
|
|
|
+ self.sa_outlier,
|
|
|
+ self.frac_se,
|
|
|
+ self.se_outlier
|
|
|
+ )
|
|
|
+ }
|
|
|
+}
|
|
|
+// inclusive 0 based
|
|
|
+#[derive(Debug, Clone)]
|
|
|
+pub struct CountRange {
|
|
|
+ pub contig: String,
|
|
|
+ pub start: u32,
|
|
|
+ pub end: u32,
|
|
|
+}
|
|
|
+
|
|
|
+impl fmt::Display for CountRange {
|
|
|
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
+ write!(f, "{}:{}-{}", self.contig, self.start, self.end)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl<'de> Deserialize<'de> for Count {
|
|
|
+ fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
|
|
+ where
|
|
|
+ D: Deserializer<'de>,
|
|
|
+ {
|
|
|
+ struct CountVisitor;
|
|
|
+
|
|
|
+ impl<'de> Visitor<'de> for CountVisitor {
|
|
|
+ type Value = Count;
|
|
|
+
|
|
|
+ fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
|
+ formatter.write_str("a string in the format 'chr:start-end n_reads n_low_mapq frac_sa sa_outlier frac_se se_outlier'")
|
|
|
+ }
|
|
|
+
|
|
|
+ fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
|
|
|
+ where
|
|
|
+ E: de::Error,
|
|
|
+ {
|
|
|
+ let parts: Vec<&str> = s.split_whitespace().collect();
|
|
|
+ if parts.len() != 7 {
|
|
|
+ return Err(E::custom("incorrect number of fields"));
|
|
|
+ }
|
|
|
+
|
|
|
+ let position_parts: Vec<&str> = parts[0].split(&[':', '-'][..]).collect();
|
|
|
+ if position_parts.len() != 3 {
|
|
|
+ return Err(E::custom("incorrect position format"));
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(Count {
|
|
|
+ position: CountRange {
|
|
|
+ contig: position_parts[0].to_string(),
|
|
|
+ start: u32::from_str(position_parts[1]).map_err(E::custom)?,
|
|
|
+ end: u32::from_str(position_parts[2]).map_err(E::custom)?,
|
|
|
+ },
|
|
|
+ n_reads: u32::from_str(parts[1]).map_err(E::custom)?,
|
|
|
+ n_low_mapq: u32::from_str(parts[2]).map_err(E::custom)?,
|
|
|
+ frac_sa: f32::from_str(parts[3]).map_err(E::custom)?,
|
|
|
+ sa_outlier: bool::from_str(parts[4]).map_err(E::custom)?,
|
|
|
+ frac_se: f32::from_str(parts[5]).map_err(E::custom)?,
|
|
|
+ se_outlier: bool::from_str(parts[6]).map_err(E::custom)?,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ deserializer.deserialize_str(CountVisitor)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+pub fn read_counts_from_file(filename: &str) -> anyhow::Result<Vec<Count>> {
|
|
|
+ let file = File::open(filename)?;
|
|
|
+ let reader = BufReader::new(file);
|
|
|
+ let mut counts = Vec::new();
|
|
|
+
|
|
|
+ for line in reader.lines() {
|
|
|
+ let line = line?;
|
|
|
+ let count: Count = serde_json::from_str(&format!("\"{}\"", escape_control_chars(&line)))?;
|
|
|
+ counts.push(count);
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(counts)
|
|
|
+}
|
|
|
+fn escape_control_chars(s: &str) -> String {
|
|
|
+ s.chars()
|
|
|
+ .map(|c| {
|
|
|
+ if c.is_control() {
|
|
|
+ format!("\\u{:04x}", c as u32)
|
|
|
+ } else {
|
|
|
+ c.to_string()
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect()
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug)]
|
|
|
+pub struct Counts {
|
|
|
+ pub data: HashMap<String, Vec<Count>>,
|
|
|
+}
|
|
|
+
|
|
|
+impl Counts {
|
|
|
+ pub fn from_files(paths: Vec<&str>) -> Self {
|
|
|
+ let counts: Vec<Vec<Count>> = paths
|
|
|
+ .par_iter()
|
|
|
+ .map(|path| match read_counts_from_file(path) {
|
|
|
+ Ok(c) => c,
|
|
|
+ Err(e) => {
|
|
|
+ warn!("Couldnt load {path}: {e}");
|
|
|
+ Vec::new()
|
|
|
+ },
|
|
|
+ })
|
|
|
+ .filter(|v| !v.is_empty())
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let mut data = HashMap::new();
|
|
|
+ for count in counts {
|
|
|
+ let contig = count.first().unwrap().position.contig.clone();
|
|
|
+ data.insert(contig, count);
|
|
|
+ }
|
|
|
+ Counts { data }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn cumulative_coverage(&self, contig: &str/* , median_len: f64 */) {
|
|
|
+ if let Some(ccounts) = self.data.get(contig) {
|
|
|
+ let mut ccounts = ccounts.clone();
|
|
|
+ ccounts.sort_by_key(|c| c.n_reads);
|
|
|
+ let n_counts = ccounts.len();
|
|
|
+ let min = ccounts.first().unwrap();
|
|
|
+ println!("{min}");
|
|
|
+ let max = ccounts.last().unwrap();
|
|
|
+ println!("{max}");
|
|
|
+ // let start = max.position.start;
|
|
|
+ // let end = max.position.end;
|
|
|
+ // let len = end - start + 1;
|
|
|
+ // let cov = max.n_reads as f64 * median_len / len as f64;
|
|
|
+ // println!("{cov}");
|
|
|
+ let index_50 = (0.5 * (n_counts - 1) as f64).round() as usize;
|
|
|
+
|
|
|
+ let median = ccounts.get(index_50).unwrap();
|
|
|
+ println!("50% {}", median.n_reads);
|
|
|
+
|
|
|
+ let index_95 = (0.99 * (n_counts - 1) as f64).round() as usize;
|
|
|
+ let v = ccounts.get(index_95).unwrap();
|
|
|
+ println!("99% {}", v.n_reads);
|
|
|
+
|
|
|
+ let cov99 = ccounts.get(index_95).unwrap().n_reads;
|
|
|
+ // let mut last_start = 0;
|
|
|
+ let mut all_frac = vec![(0, 1.0)];
|
|
|
+ for i in 1..=cov99 {
|
|
|
+ let n_sup = ccounts.iter().filter(|c| c.n_reads >= i).count() as f64;
|
|
|
+ let frac = n_sup / n_counts as f64;
|
|
|
+ all_frac.push((i, frac));
|
|
|
+ }
|
|
|
+ println!("{:#?}", all_frac);
|
|
|
+
|
|
|
+ let mut all_frac = vec![(0, 0.0)];
|
|
|
+ for i in 1..=cov99 {
|
|
|
+ let n_sup = ccounts.iter().filter(|c| c.n_reads < i).count() as f64;
|
|
|
+ let frac = n_sup / n_counts as f64;
|
|
|
+ all_frac.push((i, frac));
|
|
|
+ }
|
|
|
+ println!("{:#?}", all_frac);
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|