Thomas 1 month ago
parent
commit
b3231d15e3
1 changed files with 42 additions and 23 deletions
  1. 42 23
      src/main.rs

+ 42 - 23
src/main.rs

@@ -11,7 +11,7 @@ use clap::Parser;
 use log::info;
 use statrs::distribution::{ChiSquared, ContinuousCDF};
 use std::{
-    collections::HashMap,
+    collections::{HashMap, HashSet},
     fmt,
     fs::File,
     io::{BufReader, Write},
@@ -48,6 +48,7 @@ struct Exon {
     depth_end: u32,
     depth_after_end: u32,
     mean_depth: u32,
+    n_reads: u32,
     snps: Vec<Snp>,
 }
 
@@ -57,12 +58,12 @@ impl fmt::Display for Exon {
             .snps
             .iter()
             .filter_map(|s| s.get_p().ok())
-            .filter(|p| p <= &0.01)
+            .filter(|p| p <= &0.001)
             .count();
 
         writeln!(
             f,
-            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
+            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
             self.contig,
             self.start,
             self.end,
@@ -72,6 +73,7 @@ impl fmt::Display for Exon {
             self.depth_start,
             self.depth_end,
             self.depth_after_end,
+            self.n_reads,
             self.id
         )
     }
@@ -250,6 +252,7 @@ fn main() -> anyhow::Result<()> {
 
             let mut len = 0;
             let mut depths = 0;
+            let mut reads = HashSet::new();
 
             let snps_pos: Vec<u32> = exon.snps.iter().map(|s| s.pos).collect();
             for p in bam_reader.pileup() {
@@ -259,6 +262,7 @@ fn main() -> anyhow::Result<()> {
                 let mut depth = 0;
                 let mut bases = Vec::new();
                 let populate_bases = snps_pos.contains(&pos);
+                let mut reads_ids = Vec::new();
                 for a in pile.alignments() {
                     if a.is_refskip() || a.is_del() {
                         continue;
@@ -270,11 +274,17 @@ fn main() -> anyhow::Result<()> {
                         (
                             bio_types::strand::ReqStrand::Reverse,
                             noodles_gff::record::Strand::Forward,
-                        ) => depth += 1,
+                        ) => {
+                            depth += 1;
+                            reads_ids.push(String::from_utf8(r.qname().to_vec()).unwrap());
+                        },
                         (
                             bio_types::strand::ReqStrand::Forward,
                             noodles_gff::record::Strand::Reverse,
-                        ) => depth += 1,
+                        ) => {
+                            depth += 1;
+                            reads_ids.push(String::from_utf8(r.qname().to_vec()).unwrap());
+                        },
                         _ => (),
                     }
 
@@ -316,6 +326,7 @@ fn main() -> anyhow::Result<()> {
                 if pos >= bam_start && pos <= bam_end {
                     len += 1;
                     depths += depth;
+                    reads_ids.drain(..).for_each(|r| { reads.insert(r); });
                 }
 
                 if populate_bases {
@@ -327,6 +338,7 @@ fn main() -> anyhow::Result<()> {
             }
 
             exon.mean_depth = (depths as f64 / len as f64).round() as u32;
+            exon.n_reads = reads.len() as u32;
             contig_exons
                 .entry(exon.contig.clone())
                 .or_default()
@@ -345,7 +357,7 @@ fn main() -> anyhow::Result<()> {
         .map(|s| String::from_utf8(s.to_vec()).unwrap())
         .collect();
 
-    let mut genes: HashMap<String, (String, u32, Vec<u32>)> = HashMap::new();
+    let mut genes: HashMap<String, (String, u32, Vec<( u32, u32 )>)> = HashMap::new();
     let mut writer = File::create(out_exons).map(bgzf::MultithreadedWriter::new)?;
     contigs.iter().for_each(|c| {
         if let Some(exons) = contig_exons.get(c) {
@@ -357,45 +369,52 @@ fn main() -> anyhow::Result<()> {
                 writer.write_all(e.to_string().as_bytes()).unwrap();
                 genes
                     .entry(e.id.clone())
-                    .or_insert((e.id.clone(), e.start, Vec::new()))
+                    .or_insert_with(|| (e.contig.clone(), e.start, Vec::new()))
                     .2
-                    .push(e.mean_depth);
+                    .push(( e.mean_depth, e.n_reads ));
                 writer.flush().unwrap();
             });
         }
     });
 
     writer.finish()?;
+    let ngs = genes.len();
+    println!("{ngs}");
     let genes: Vec<_> = genes
-        .values()
-        .map(|(c, p, v)| {
-            let sum: u32 = v.iter().copied().sum();
-            (c.to_string(), *p, sum as f64 / v.len() as f64)
+        .iter()
+        .map(|(id, (c, p, v))| {
+            let sum: u32 = v.iter().map(|(v, _)| *v).sum();
+            let sum_reads: u32 = v.iter().map(|(_, v)| *v).sum();
+            (c.to_string(), *p, sum as f64 / v.len() as f64, sum_reads, id.to_string())
         })
         .collect();
+    let ngs = genes.len();
+    println!("{ngs}");
 
     let mut writer = File::create(out_genes).map(bgzf::MultithreadedWriter::new)?;
     contigs.iter().for_each(|c| {
-        let mut g: Vec<&(String, u32, f64)> =
-            genes.iter().filter(|(contig, _, _)| c == contig).collect();
+        let mut g: Vec<&(String, u32, f64, u32, String)> =
+            genes.iter().filter(|(contig, _, _, _, _)| *c == *contig).collect();
 
-        info!("Sorting {c}");
+        info!("Sorting {}", g.len());
         g.par_sort_by(|a, b| a.1.cmp(&b.1));
         info!("Writing {c}");
         g.iter().for_each(|e| {
-            writer
-                .write_all(
-                    [e.0.to_string(), e.1.to_string(), e.2.to_string()]
-                        .join("\t")
-                        .as_bytes(),
-                )
-                .unwrap();
+            let s = [
+                e.0.to_string(),
+                e.1.to_string(),
+                e.2.to_string(),
+                e.3.to_string(),
+                e.4.to_string(),
+                "\n".to_string(),
+            ]
+            .join("\t");
+            writer.write_all(s.as_bytes()).unwrap();
             writer.flush().unwrap();
         });
     });
     writer.finish()?;
 
-
     info!("Process done in {:#?}", now.elapsed());
     Ok(())
 }