Thomas 1 rok pred
rodič
commit
81a8fd7b6a
2 zmenil súbory, kde vykonal 173 pridanie a 3 odobranie
  1. 29 0
      src/counts.rs
  2. 144 3
      src/lib.rs

+ 29 - 0
src/counts.rs

@@ -158,6 +158,7 @@ impl Counts {
     pub fn from_files(paths: Vec<String>) -> Self {
         let counts: Vec<Vec<Count>> = paths
             .par_iter()
+            .inspect(|i| info!("Reading {i}"))
             .map(|path| match read_counts_from_file(path) {
                 Ok(c) => c,
                 Err(e) => {
@@ -182,6 +183,7 @@ impl Counts {
     pub fn mrd_from_files(&mut self, paths: Vec<String>) {
         let counts: Vec<Vec<Count>> = paths
             .par_iter()
+            .inspect(|i| info!("Reading {i}"))
             .map(|path| match read_counts_from_file(path) {
                 Ok(c) => c,
                 Err(e) => {
@@ -713,3 +715,30 @@ pub struct CountsStats {
     pub breaks_values: Vec<(String, f64)>,
     pub masked: Vec<(String, f64)>,
 }
+
+pub fn merge_ranges(ranges: Vec<CountRange>) -> Vec<CountRange> {
+    if ranges.is_empty() {
+        return vec![];
+    }
+
+    let mut merged: Vec<CountRange> = Vec::new();
+    let mut current_range = ranges[0].clone(); // Clone the first range to make it mutable
+
+    for range in ranges.iter().skip(1) {
+        // Check if the current range overlaps or is contiguous with the next range
+        if current_range.end >= range.start - 1 {
+            // Merge the ranges by updating the end
+            current_range.end = current_range.end.max(range.end);
+        } else {
+            // Push the current range to merged and update to the new range
+            merged.push(current_range.clone());
+            current_range = range.clone(); // Clone the new range to make it mutable
+        }
+    }
+
+    // Push the last processed range
+    merged.push(current_range);
+    
+    merged
+}
+

+ 144 - 3
src/lib.rs

@@ -306,7 +306,9 @@ pub fn par_whole_scan(dict_file: &str, bam_path: &str, out_dir: &str) -> anyhow:
 #[cfg(test)]
 mod tests {
 
-    use counts::Counts;
+    use std::collections::HashMap;
+
+    use counts::{merge_ranges, CountAnnotation, CountRange, Counts};
     use rust_htslib::bam::Reader;
 
     use super::*;
@@ -406,8 +408,8 @@ mod tests {
     #[test]
     fn save_stats() -> anyhow::Result<()> {
         init();
-        let id = "DAHAN";
-        let breaks = vec![1, 6, 15];
+        let id = "LEVASSEUR";
+        let breaks = vec![1, 4, 15];
 
         let result_dir = "/data/longreads_basic_pipe";
         let save_dir = format!("{result_dir}/{id}/diag/report/data/scan");
@@ -435,4 +437,143 @@ mod tests {
         counts.save_contigs(&contigs, &format!("{save_dir}/{id}"), breaks)?;
         Ok(())
     }
+
+    #[test]
+    fn save_json() -> anyhow::Result<()> {
+        init();
+        let id = "MORIN";
+        let contig = "chr1";
+
+        let result_dir = "/data/longreads_basic_pipe";
+        let save_dir = format!("{result_dir}/{id}/diag/report/data/scan");
+        info!("Files will be saved into {save_dir}");
+        fs::create_dir_all(&save_dir)?;
+
+        let mut contigs: Vec<String> = (1..=22).map(|c| format!("chr{c}")).collect();
+        contigs.push("chrX".to_string());
+        contigs.push("chrY".to_string());
+        let mut counts = Counts::from_files(
+            contigs
+                .clone()
+                .iter()
+                .map(|c| format!("{result_dir}/{id}/diag/scan/{c}_counts.tsv"))
+                .collect(),
+        );
+        counts.mrd_from_files(
+            contigs
+                .clone()
+                .iter()
+                .map(|c| format!("{result_dir}/{id}/mrd/scan/{c}_counts.tsv"))
+                .collect(),
+        );
+
+        // counts.mask_low_mrd(contig, 6)?;
+        counts.mask_low_quality(contig, 0.1)?;
+
+        let mask_annotations: HashSet<CountAnnotation> = vec![
+            CountAnnotation::MaskedLowMRD,
+            CountAnnotation::MaskedQuality,
+        ]
+        .into_iter()
+        .collect();
+
+        let contig_counts = counts.data.get(contig).unwrap();
+
+        let mut n_masked = 0;
+        let mut n = 0;
+        let mut n_by_depth = HashMap::new();
+        for count in contig_counts {
+            if count
+                .annotation
+                .iter()
+                .any(|a| mask_annotations.contains(a))
+            {
+                n_masked += 1;
+            } else {
+                *n_by_depth.entry(count.n_reads).or_insert(0) += 1;
+            }
+            n += 1;
+        }
+        let mut d_n: Vec<_> = n_by_depth.iter().collect();
+        d_n.sort_unstable_by_key(|(d, _)| **d);
+        println!("{:#?}", d_n);
+        let n_unmasked = d_n.iter().map(|(_, v)| **v).sum::<i32>() as f64;
+
+        // let proportions: Vec<(u32, f64)> = d_n.iter().map(|(d,n)| (**d, **n as f64 / n_unmasked)).collect();
+        // let u  = serde_json::from_value(proportions)?;
+        // println!("{:?}", proportions);
+
+        // let n = contig_counts.len();
+        // let n_masked = contig_counts
+        //     .iter()
+        //     .filter(|c| c.annotation.iter().any(|a| mask_annotations.contains(a)))
+        //     .count();
+
+        info!(
+            "{contig} masked: {n_masked}/{n} ({:.0}%)",
+            n_masked as f64 * 100.0 / n as f64
+        );
+
+        // let n_unmasked = (n - n_masked) as f64;
+        // let mut frequencies = HashMap::new();
+        // let contig_values = counts.get(&contig);
+        // contig_value.iter()
+        //     .map(|n| {
+        //
+        //     })
+        //     .for_each(|n| {
+        //
+        // });
+
+        Ok(())
+    }
+
+    #[test]
+    fn save_mask() -> anyhow::Result<()> {
+        init();
+
+        let id = "ADJAGBA";
+        let result_dir = "/data/longreads_basic_pipe";
+        let save_path = format!("{result_dir}/{id}/diag/mask.bed");
+
+        let mut contigs: Vec<String> = (1..=22).map(|c| format!("chr{c}")).collect();
+        contigs.push("chrX".to_string());
+        contigs.push("chrY".to_string());
+        let mut counts = Counts::from_files(
+            contigs
+                .clone()
+                .iter()
+                .map(|c| format!("{result_dir}/{id}/diag/scan/{c}_counts.tsv"))
+                .collect(),
+        );
+        counts.mrd_from_files(
+            contigs
+                .clone()
+                .iter()
+                .map(|c| format!("{result_dir}/{id}/mrd/scan/{c}_counts.tsv"))
+                .collect(),
+        );
+
+        let mut writer = BufWriter::new(File::create(save_path)?) ;
+        for c in contigs.iter() {
+            println!("Contig {c}");
+            counts.mask_low_mrd(c, 6)?;
+            counts.mask_low_quality(c, 0.3)?;
+            let count = counts.data.get(c).unwrap();
+            let ranges: Vec<CountRange> = count
+                .iter()
+                .filter(|a| {
+                    a.annotation
+                        .iter()
+                        .any(|e| {
+                            matches!(e, CountAnnotation::MaskedLowMRD)
+                                | matches!(e, CountAnnotation::MaskedQuality)
+                        })
+                })
+                .map(|e|  e.position.clone()).collect();
+            let bed: Vec<String> = merge_ranges(ranges).iter().map(|e| format!("{}\t{}\t{}", e.contig, e.start, e.end)).collect();
+            writer.write_fmt(format_args!("{}\n", bed.join("\n")))?;
+        }
+        Ok(())
+    }
 }