Your Name 1 年之前
父節點
當前提交
176f0bdcff
共有 4 個文件被更改,包括 228 次插入0 次删除
  1. 13 0
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 200 0
      src/counts.rs
  4. 14 0
      src/lib.rs

+ 13 - 0
Cargo.lock

@@ -860,6 +860,7 @@ dependencies = [
  "rayon",
  "rust-htslib",
  "serde",
+ "serde_json",
  "uuid",
 ]
 
@@ -1085,6 +1086,18 @@ dependencies = [
  "syn 2.0.75",
 ]
 
+[[package]]
+name = "serde_json"
+version = "1.0.128"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"

+ 1 - 0
Cargo.toml

@@ -21,4 +21,5 @@ postcard = { version = "1.0.8", features = ["alloc"] }
 flate2 = "1.0.30"
 csv = "1.3.0"
 dashmap = { version = "6.0.1", features = ["rayon"] }
+serde_json = "1.0.128"
 

+ 200 - 0
src/counts.rs

@@ -0,0 +1,200 @@
+use log::warn;
+use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
+use serde::{
+    de::{self, Visitor},
+    Deserialize, Deserializer,
+};
+use std::{
+    collections::HashMap,
+    fmt,
+    fs::File,
+    io::{BufRead, BufReader},
+    str::FromStr,
+};
+
+#[derive(Debug, Clone)]
+pub struct Count {
+    pub position: CountRange,
+    pub n_reads: u32,
+    pub n_low_mapq: u32,
+    pub frac_sa: f32,
+    pub sa_outlier: bool,
+    pub frac_se: f32,
+    pub se_outlier: bool,
+}
+
+impl fmt::Display for Count {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{}\t{}\t{}\t{:.6}\t{}\t{:.6}\t{}",
+            self.position,
+            self.n_reads,
+            self.n_low_mapq,
+            self.frac_sa,
+            self.sa_outlier,
+            self.frac_se,
+            self.se_outlier
+        )
+    }
+}
+// inclusive 0 based
+#[derive(Debug, Clone)]
+pub struct CountRange {
+    pub contig: String,
+    pub start: u32,
+    pub end: u32,
+}
+
+impl fmt::Display for CountRange {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}:{}-{}", self.contig, self.start, self.end)
+    }
+}
+
+impl<'de> Deserialize<'de> for Count {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        struct CountVisitor;
+
+        impl<'de> Visitor<'de> for CountVisitor {
+            type Value = Count;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str("a string in the format 'chr:start-end n_reads n_low_mapq frac_sa sa_outlier frac_se se_outlier'")
+            }
+
+            fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
+            where
+                E: de::Error,
+            {
+                let parts: Vec<&str> = s.split_whitespace().collect();
+                if parts.len() != 7 {
+                    return Err(E::custom("incorrect number of fields"));
+                }
+
+                let position_parts: Vec<&str> = parts[0].split(&[':', '-'][..]).collect();
+                if position_parts.len() != 3 {
+                    return Err(E::custom("incorrect position format"));
+                }
+
+                Ok(Count {
+                    position: CountRange {
+                        contig: position_parts[0].to_string(),
+                        start: u32::from_str(position_parts[1]).map_err(E::custom)?,
+                        end: u32::from_str(position_parts[2]).map_err(E::custom)?,
+                    },
+                    n_reads: u32::from_str(parts[1]).map_err(E::custom)?,
+                    n_low_mapq: u32::from_str(parts[2]).map_err(E::custom)?,
+                    frac_sa: f32::from_str(parts[3]).map_err(E::custom)?,
+                    sa_outlier: bool::from_str(parts[4]).map_err(E::custom)?,
+                    frac_se: f32::from_str(parts[5]).map_err(E::custom)?,
+                    se_outlier: bool::from_str(parts[6]).map_err(E::custom)?,
+                })
+            }
+        }
+
+        deserializer.deserialize_str(CountVisitor)
+    }
+}
+
+pub fn read_counts_from_file(filename: &str) -> anyhow::Result<Vec<Count>> {
+    let file = File::open(filename)?;
+    let reader = BufReader::new(file);
+    let mut counts = Vec::new();
+
+    for line in reader.lines() {
+        let line = line?;
+        let count: Count = serde_json::from_str(&format!("\"{}\"", escape_control_chars(&line)))?;
+        counts.push(count);
+    }
+
+    Ok(counts)
+}
+fn escape_control_chars(s: &str) -> String {
+    s.chars()
+        .map(|c| {
+            if c.is_control() {
+                format!("\\u{:04x}", c as u32)
+            } else {
+                c.to_string()
+            }
+        })
+        .collect()
+}
+
+#[derive(Debug)]
+pub struct Counts {
+    pub data: HashMap<String, Vec<Count>>,
+}
+
+impl Counts {
+    pub fn from_files(paths: Vec<&str>) -> Self {
+        let counts: Vec<Vec<Count>> = paths
+            .par_iter()
+            .map(|path| match read_counts_from_file(path) {
+                Ok(c) => c,
+                Err(e) => {
+                    warn!("Couldnt load {path}: {e}");
+                    Vec::new()
+                },
+            })
+            .filter(|v| !v.is_empty())
+            .collect();
+
+        let mut data = HashMap::new();
+        for count in counts {
+            let contig = count.first().unwrap().position.contig.clone();
+            data.insert(contig, count);
+        }
+        Counts { data }
+    }
+
+    pub fn cumulative_coverage(&self, contig: &str/* , median_len: f64 */) {
+        if let Some(ccounts) = self.data.get(contig) {
+            let mut ccounts = ccounts.clone();
+            ccounts.sort_by_key(|c| c.n_reads);
+            let n_counts = ccounts.len();
+            let min = ccounts.first().unwrap();
+            println!("{min}");
+            let max = ccounts.last().unwrap();
+            println!("{max}");
+            // let start = max.position.start;
+            // let end = max.position.end;
+            // let len = end - start + 1;
+            // let cov = max.n_reads as f64 * median_len / len as f64;
+            // println!("{cov}");
+            let index_50 = (0.5 * (n_counts - 1) as f64).round() as usize;
+
+            let median = ccounts.get(index_50).unwrap();
+            println!("50% {}", median.n_reads);
+
+            let index_95 =  (0.99 * (n_counts - 1) as f64).round() as usize;
+            let v = ccounts.get(index_95).unwrap();
+            println!("99% {}", v.n_reads);
+
+            let cov99 = ccounts.get(index_95).unwrap().n_reads;
+            // let mut last_start = 0;
+            let mut all_frac = vec![(0, 1.0)];
+            for i in 1..=cov99 {
+                let n_sup = ccounts.iter().filter(|c| c.n_reads >= i).count() as f64;
+                let frac = n_sup / n_counts as f64;
+                all_frac.push((i, frac));
+            }
+            println!("{:#?}", all_frac);
+
+            let mut all_frac = vec![(0, 0.0)];
+            for i in 1..=cov99 {
+                let n_sup = ccounts.iter().filter(|c| c.n_reads < i).count() as f64;
+                let frac = n_sup / n_counts as f64;
+                all_frac.push((i, frac));
+            }
+            println!("{:#?}", all_frac);
+
+
+
+        }
+    }
+}

+ 14 - 0
src/lib.rs

@@ -17,6 +17,7 @@ use std::{
 pub mod bam;
 pub mod bin;
 pub mod dict_reader;
+pub mod counts;
 
 #[derive(Debug)]
 pub struct Config {
@@ -342,6 +343,7 @@ pub fn par_whole_scan(dict_file: &str, bam_path: &str, out_dir: &str) -> anyhow:
 #[cfg(test)]
 mod tests {
 
+    use counts::{read_counts_from_file, Counts};
     use rust_htslib::bam::Reader;
 
     use super::*;
@@ -441,6 +443,18 @@ mod tests {
         assert!(records_ids.contains(&"56bbfd4a-0d1d-4d97-b307-7626be446ce8".to_string()));
     }
 
+    #[test]
+    fn load() -> anyhow::Result<()> {
+        init();
+        info!("loading");
+        let median_len = 4460.0;
+        let count_file = "/data/longreads_basic_pipe/ROBIN/diag/scan/chr1_counts.tsv";
+        let counts = Counts::from_files(vec![count_file]);
+
+        counts.cumulative_coverage("chr1");
+        Ok(())
+    }
+
     // #[test]
     // fn phasing() -> anyhow::Result<()> {
     //     let id = "SALICETTO";