|
|
@@ -11,7 +11,7 @@ use clap::Parser;
|
|
|
use log::info;
|
|
|
use statrs::distribution::{ChiSquared, ContinuousCDF};
|
|
|
use std::{
|
|
|
- collections::HashMap,
|
|
|
+ collections::{HashMap, HashSet},
|
|
|
fmt,
|
|
|
fs::File,
|
|
|
io::{BufReader, Write},
|
|
|
@@ -48,6 +48,7 @@ struct Exon {
|
|
|
depth_end: u32,
|
|
|
depth_after_end: u32,
|
|
|
mean_depth: u32,
|
|
|
+ n_reads: u32,
|
|
|
snps: Vec<Snp>,
|
|
|
}
|
|
|
|
|
|
@@ -57,12 +58,12 @@ impl fmt::Display for Exon {
|
|
|
.snps
|
|
|
.iter()
|
|
|
.filter_map(|s| s.get_p().ok())
|
|
|
- .filter(|p| p <= &0.01)
|
|
|
+ .filter(|p| p <= &0.001)
|
|
|
.count();
|
|
|
|
|
|
writeln!(
|
|
|
f,
|
|
|
- "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
|
|
|
+ "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
|
|
|
self.contig,
|
|
|
self.start,
|
|
|
self.end,
|
|
|
@@ -72,6 +73,7 @@ impl fmt::Display for Exon {
|
|
|
self.depth_start,
|
|
|
self.depth_end,
|
|
|
self.depth_after_end,
|
|
|
+ self.n_reads,
|
|
|
self.id
|
|
|
)
|
|
|
}
|
|
|
@@ -250,6 +252,7 @@ fn main() -> anyhow::Result<()> {
|
|
|
|
|
|
let mut len = 0;
|
|
|
let mut depths = 0;
|
|
|
+ let mut reads = HashSet::new();
|
|
|
|
|
|
let snps_pos: Vec<u32> = exon.snps.iter().map(|s| s.pos).collect();
|
|
|
for p in bam_reader.pileup() {
|
|
|
@@ -259,6 +262,7 @@ fn main() -> anyhow::Result<()> {
|
|
|
let mut depth = 0;
|
|
|
let mut bases = Vec::new();
|
|
|
let populate_bases = snps_pos.contains(&pos);
|
|
|
+ let mut reads_ids = Vec::new();
|
|
|
for a in pile.alignments() {
|
|
|
if a.is_refskip() || a.is_del() {
|
|
|
continue;
|
|
|
@@ -270,11 +274,17 @@ fn main() -> anyhow::Result<()> {
|
|
|
(
|
|
|
bio_types::strand::ReqStrand::Reverse,
|
|
|
noodles_gff::record::Strand::Forward,
|
|
|
- ) => depth += 1,
|
|
|
+ ) => {
|
|
|
+ depth += 1;
|
|
|
+ reads_ids.push(String::from_utf8(r.qname().to_vec()).unwrap());
|
|
|
+ },
|
|
|
(
|
|
|
bio_types::strand::ReqStrand::Forward,
|
|
|
noodles_gff::record::Strand::Reverse,
|
|
|
- ) => depth += 1,
|
|
|
+ ) => {
|
|
|
+ depth += 1;
|
|
|
+ reads_ids.push(String::from_utf8(r.qname().to_vec()).unwrap());
|
|
|
+ },
|
|
|
_ => (),
|
|
|
}
|
|
|
|
|
|
@@ -316,6 +326,7 @@ fn main() -> anyhow::Result<()> {
|
|
|
if pos >= bam_start && pos <= bam_end {
|
|
|
len += 1;
|
|
|
depths += depth;
|
|
|
+ reads_ids.drain(..).for_each(|r| { reads.insert(r); });
|
|
|
}
|
|
|
|
|
|
if populate_bases {
|
|
|
@@ -327,6 +338,7 @@ fn main() -> anyhow::Result<()> {
|
|
|
}
|
|
|
|
|
|
exon.mean_depth = (depths as f64 / len as f64).round() as u32;
|
|
|
+ exon.n_reads = reads.len() as u32;
|
|
|
contig_exons
|
|
|
.entry(exon.contig.clone())
|
|
|
.or_default()
|
|
|
@@ -345,7 +357,7 @@ fn main() -> anyhow::Result<()> {
|
|
|
.map(|s| String::from_utf8(s.to_vec()).unwrap())
|
|
|
.collect();
|
|
|
|
|
|
- let mut genes: HashMap<String, (String, u32, Vec<u32>)> = HashMap::new();
|
|
|
+ let mut genes: HashMap<String, (String, u32, Vec<( u32, u32 )>)> = HashMap::new();
|
|
|
let mut writer = File::create(out_exons).map(bgzf::MultithreadedWriter::new)?;
|
|
|
contigs.iter().for_each(|c| {
|
|
|
if let Some(exons) = contig_exons.get(c) {
|
|
|
@@ -357,45 +369,52 @@ fn main() -> anyhow::Result<()> {
|
|
|
writer.write_all(e.to_string().as_bytes()).unwrap();
|
|
|
genes
|
|
|
.entry(e.id.clone())
|
|
|
- .or_insert((e.id.clone(), e.start, Vec::new()))
|
|
|
+ .or_insert_with(|| (e.contig.clone(), e.start, Vec::new()))
|
|
|
.2
|
|
|
- .push(e.mean_depth);
|
|
|
+ .push(( e.mean_depth, e.n_reads ));
|
|
|
writer.flush().unwrap();
|
|
|
});
|
|
|
}
|
|
|
});
|
|
|
|
|
|
writer.finish()?;
|
|
|
+ let ngs = genes.len();
|
|
|
+ println!("{ngs}");
|
|
|
let genes: Vec<_> = genes
|
|
|
- .values()
|
|
|
- .map(|(c, p, v)| {
|
|
|
- let sum: u32 = v.iter().copied().sum();
|
|
|
- (c.to_string(), *p, sum as f64 / v.len() as f64)
|
|
|
+ .iter()
|
|
|
+ .map(|(id, (c, p, v))| {
|
|
|
+ let sum: u32 = v.iter().map(|(v, _)| *v).sum();
|
|
|
+ let sum_reads: u32 = v.iter().map(|(_, v)| *v).sum();
|
|
|
+ (c.to_string(), *p, sum as f64 / v.len() as f64, sum_reads, id.to_string())
|
|
|
})
|
|
|
.collect();
|
|
|
+ let ngs = genes.len();
|
|
|
+ println!("{ngs}");
|
|
|
|
|
|
let mut writer = File::create(out_genes).map(bgzf::MultithreadedWriter::new)?;
|
|
|
contigs.iter().for_each(|c| {
|
|
|
- let mut g: Vec<&(String, u32, f64)> =
|
|
|
- genes.iter().filter(|(contig, _, _)| c == contig).collect();
|
|
|
+ let mut g: Vec<&(String, u32, f64, u32, String)> =
|
|
|
+ genes.iter().filter(|(contig, _, _, _, _)| *c == *contig).collect();
|
|
|
|
|
|
- info!("Sorting {c}");
|
|
|
+ info!("Sorting {}", g.len());
|
|
|
g.par_sort_by(|a, b| a.1.cmp(&b.1));
|
|
|
info!("Writing {c}");
|
|
|
g.iter().for_each(|e| {
|
|
|
- writer
|
|
|
- .write_all(
|
|
|
- [e.0.to_string(), e.1.to_string(), e.2.to_string()]
|
|
|
- .join("\t")
|
|
|
- .as_bytes(),
|
|
|
- )
|
|
|
- .unwrap();
|
|
|
+ let s = [
|
|
|
+ e.0.to_string(),
|
|
|
+ e.1.to_string(),
|
|
|
+ e.2.to_string(),
|
|
|
+ e.3.to_string(),
|
|
|
+ e.4.to_string(),
|
|
|
+ "\n".to_string(),
|
|
|
+ ]
|
|
|
+ .join("\t");
|
|
|
+ writer.write_all(s.as_bytes()).unwrap();
|
|
|
writer.flush().unwrap();
|
|
|
});
|
|
|
});
|
|
|
writer.finish()?;
|
|
|
|
|
|
-
|
|
|
info!("Process done in {:#?}", now.elapsed());
|
|
|
Ok(())
|
|
|
}
|