|
|
@@ -1,4 +1,8 @@
|
|
|
-use anyhow::{Context, Ok, Result};
|
|
|
+use std::collections::HashMap;
|
|
|
+
|
|
|
+use anyhow::{anyhow, Context, Ok, Result};
|
|
|
+use log::info;
|
|
|
+use rayon::prelude::*;
|
|
|
use rust_htslib::{
|
|
|
bam,
|
|
|
bam::{ext::BamRecordExtensions, record, Read, Record},
|
|
|
@@ -278,8 +282,6 @@ pub fn range_depths(
|
|
|
let rstart = pileup.pos() as i32;
|
|
|
|
|
|
if rstart >= start && rstart < stop {
|
|
|
- // depths.push(pileup.depth());
|
|
|
-
|
|
|
let v = depths
|
|
|
.get_mut((rstart - start) as usize)
|
|
|
.context(format!("Errrr {}", rstart - start))?;
|
|
|
@@ -371,7 +373,7 @@ pub fn scan_sa(
|
|
|
start: i32,
|
|
|
stop: i32,
|
|
|
mapq: u8,
|
|
|
-) -> Result<(Vec<Vec<Record>>, Vec<Vec<Record>>)> {
|
|
|
+) -> Result<Vec<(Vec<Record>, Vec<Record>)>> {
|
|
|
bam.fetch((chr, start - 1, stop))?;
|
|
|
let length = stop - start;
|
|
|
let mut results_start: Vec<Vec<Record>> = Vec::new();
|
|
|
@@ -386,7 +388,6 @@ pub fn scan_sa(
|
|
|
|
|
|
if rstart >= start && rstart < stop {
|
|
|
if record.mapq() >= mapq && record.aux(b"SA").is_ok() {
|
|
|
-
|
|
|
let index = rstart - start;
|
|
|
let u = results_start.get_mut(index as usize).unwrap();
|
|
|
u.push(record.clone());
|
|
|
@@ -401,7 +402,13 @@ pub fn scan_sa(
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- Ok((results_start, results_end))
|
|
|
+
|
|
|
+ let res: Vec<(Vec<Record>, Vec<Record>)> = results_start
|
|
|
+ .iter()
|
|
|
+ .zip(results_end.iter())
|
|
|
+ .map(|(s, e)| (s.to_owned(), e.to_owned()))
|
|
|
+ .collect();
|
|
|
+ Ok(res)
|
|
|
}
|
|
|
|
|
|
pub fn records_at_base(
|
|
|
@@ -485,12 +492,250 @@ pub fn qnames_at_base(
|
|
|
Ok(bases)
|
|
|
}
|
|
|
|
|
|
+fn mean(data: &[f64]) -> Option<f64> {
|
|
|
+ let sum = data.iter().sum::<f64>() as f64;
|
|
|
+ let count = data.len() as f64;
|
|
|
+
|
|
|
+ match count {
|
|
|
+ positive if positive > 0.0 => Some(sum / count),
|
|
|
+ _ => None,
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+fn std_deviation(data: &[f64]) -> Option<f64> {
|
|
|
+ match (mean(data), data.len() as f64) {
|
|
|
+ (Some(data_mean), count) if count > 0.0 => {
|
|
|
+ let variance = data
|
|
|
+ .iter()
|
|
|
+ .map(|value| {
|
|
|
+ let diff = data_mean - (*value as f64);
|
|
|
+ diff * diff
|
|
|
+ })
|
|
|
+ .sum::<f64>()
|
|
|
+ / count;
|
|
|
+
|
|
|
+ Some(variance.sqrt())
|
|
|
+ }
|
|
|
+ _ => None,
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+fn get_se_diag_mrd(
|
|
|
+ diag_bam_path: &str,
|
|
|
+ mrd_bam_path: &str,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+) -> Vec<Vec<Record>> {
|
|
|
+ let min_reads = 3;
|
|
|
+ let bin_size = 1_000;
|
|
|
+ let n_bins = stop - start;
|
|
|
+
|
|
|
+ let mut se_raw = vec![(0, 0)];
|
|
|
+ se_raw.resize(n_bins as usize, (0, 0));
|
|
|
+
|
|
|
+ let get_pos = |i: usize| -> i32 { start + (i as i32 * bin_size as i32) };
|
|
|
+ let se_raw: Vec<(i32, i32)> = se_raw
|
|
|
+ .par_chunks(bin_size)
|
|
|
+ .enumerate()
|
|
|
+ .flat_map(|(i, _)| {
|
|
|
+ let mut diag_bam = rust_htslib::bam::IndexedReader::from_path(diag_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", diag_bam_path))
|
|
|
+ .unwrap();
|
|
|
+ let mut mrd_bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", mrd_bam_path))
|
|
|
+ .unwrap();
|
|
|
+
|
|
|
+ let s = get_pos(i);
|
|
|
+ let e = s + bin_size as i32;
|
|
|
+ let d = get_start_end_qual(&mut diag_bam, chr, s, e, mapq).unwrap();
|
|
|
+ let m = get_start_end_qual(&mut mrd_bam, chr, s, e, mapq).unwrap();
|
|
|
+
|
|
|
+ d.iter()
|
|
|
+ .zip(m.iter())
|
|
|
+ .map(|(d, m)| (*d, *m))
|
|
|
+ .collect::<Vec<(i32, i32)>>()
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ info!("raw {}", se_raw.len());
|
|
|
+
|
|
|
+ let (diag_sum, mrd_sum) = se_raw
|
|
|
+ .iter()
|
|
|
+ .fold((0i32, 0i32), |acc, (d, m)| (acc.0 + d, acc.1 + m));
|
|
|
+
|
|
|
+ let diag = se_raw.iter().map(|(d, _)| *d as f64).collect::<Vec<f64>>();
|
|
|
+ let diag_mean = mean(&diag).unwrap();
|
|
|
+ let diag_std = std_deviation(&diag).unwrap();
|
|
|
+
|
|
|
+ let mrd = se_raw.iter().map(|(_, m)| *m as f64).collect::<Vec<f64>>();
|
|
|
+ let mrd_mean = mean(&mrd).unwrap();
|
|
|
+ let mrd_std = std_deviation(&mrd).unwrap();
|
|
|
+
|
|
|
+ info!(
|
|
|
+ "N/nt diag {} mrd {}",
|
|
|
+ diag_sum as f64 / (stop - start) as f64,
|
|
|
+ mrd_sum as f64 / (stop - start) as f64
|
|
|
+ );
|
|
|
+ info!("Mean diag {diag_mean} mrd {mrd_mean}");
|
|
|
+ info!("std dev diag {diag_std} mrd {mrd_std}");
|
|
|
+
|
|
|
+ let ratio_se: Vec<f64> = diag
|
|
|
+ .iter()
|
|
|
+ .zip(mrd.iter())
|
|
|
+ .map(|(d, m)| (((d / diag_mean) + 1.0) / ((m / mrd_mean) + 1.0)) - 1.0)
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let r_stddev = std_deviation(&ratio_se).unwrap();
|
|
|
+ info!("ratio mean {} std dev {r_stddev}", mean(&ratio_se).unwrap());
|
|
|
+
|
|
|
+ let all: Vec<_> = se_raw
|
|
|
+ .into_iter()
|
|
|
+ .enumerate()
|
|
|
+ .zip(ratio_se.iter())
|
|
|
+ .filter(|((_i, (d, m)), _r)| *d > 3 && *m == 0)
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ info!("{} locations to assemble", all.len());
|
|
|
+
|
|
|
+ // getting priamry reads from bam at given positions
|
|
|
+ let se_data: Vec<(Vec<Record>, Vec<Record>)> = all
|
|
|
+ .par_chunks(100)
|
|
|
+ .flat_map(|chunks| {
|
|
|
+ // Loading bam reader.
|
|
|
+ let mut diag_bam = rust_htslib::bam::IndexedReader::from_path(diag_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", diag_bam_path))
|
|
|
+ .unwrap();
|
|
|
+ chunks
|
|
|
+ .to_vec()
|
|
|
+ .iter()
|
|
|
+ .map(|((i, (_d, _m)), _r)| {
|
|
|
+ let pos = *i as i32 + start;
|
|
|
+ let (mut s, mut e) =
|
|
|
+ get_start_end_qual_rec(&mut diag_bam, chr, pos, pos + 1, mapq).unwrap();
|
|
|
+ let s = s.pop().unwrap();
|
|
|
+ let s = swap_by_primary(diag_bam_path, s);
|
|
|
+ let e = e.pop().unwrap();
|
|
|
+ let e = swap_by_primary(diag_bam_path, e);
|
|
|
+ // info!("{chr}:{pos}\t{r}\t{d} (s:{} e:{:?})\t{m}", s.len(), e.len());
|
|
|
+ // println!("e {e:?}");
|
|
|
+ // println!("s {s:?}");
|
|
|
+
|
|
|
+ (s, e)
|
|
|
+ })
|
|
|
+ .collect::<Vec<(Vec<Record>, Vec<Record>)>>()
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ // filtering by number of reads
|
|
|
+ // let mut groups: HashMap<String, Vec<Record>> = HashMap::new();
|
|
|
+ let mut res = Vec::new();
|
|
|
+ for (s, e) in se_data {
|
|
|
+ if s.len() >= min_reads {
|
|
|
+ res.push(s.clone());
|
|
|
+ // for r in s {
|
|
|
+ // let k = format!("{}-{}", r.tid(), r.pos());
|
|
|
+ // if let Some(vr) = groups.get_mut(&k) {
|
|
|
+ // vr.push(r);
|
|
|
+ // vr.dedup();
|
|
|
+ // } else {
|
|
|
+ // groups.insert(k, vec![r]);
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ }
|
|
|
+ if e.len() >= min_reads {
|
|
|
+ res.push(e.clone());
|
|
|
+ // for r in e {
|
|
|
+ // let k = format!("{}-{}", r.tid(), r.pos());
|
|
|
+ // if let Some(vr) = groups.get_mut(&k) {
|
|
|
+ // vr.push(r);
|
|
|
+ // vr.dedup();
|
|
|
+ // } else {
|
|
|
+ // groups.insert(k, vec![r]);
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ res
|
|
|
+}
|
|
|
+
|
|
|
+pub fn sa_ratio(
|
|
|
+ diag_bam_path: &str,
|
|
|
+ mrd_bam_path: &str,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+ bin_size: usize,
|
|
|
+) -> Vec<(f32, f32)> {
|
|
|
+ let n_bins = stop - start;
|
|
|
+
|
|
|
+ let get_pos = |i: usize| -> i32 { start + (i as i32 * bin_size as i32) };
|
|
|
+ let mut se_raw = vec![(0, 0)];
|
|
|
+ se_raw.resize(n_bins as usize, (0, 0));
|
|
|
+
|
|
|
+ se_raw
|
|
|
+ .par_chunks(bin_size)
|
|
|
+ .enumerate()
|
|
|
+ .flat_map(|(i, _)| {
|
|
|
+ let mut diag_bam = rust_htslib::bam::IndexedReader::from_path(diag_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", diag_bam_path))
|
|
|
+ .unwrap();
|
|
|
+ let mut mrd_bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", mrd_bam_path))
|
|
|
+ .unwrap();
|
|
|
+
|
|
|
+ let s = get_pos(i);
|
|
|
+ let e = s + bin_size as i32;
|
|
|
+ let d = scan_sa(&mut diag_bam, chr, s, e, mapq).unwrap();
|
|
|
+ let m = scan_sa(&mut mrd_bam, chr, s, e, mapq).unwrap();
|
|
|
+
|
|
|
+ d.iter()
|
|
|
+ .zip(m.iter())
|
|
|
+ .map(|(d, m)| {
|
|
|
+ let start_diff: f32 = d.0.len() as f32 / m.0.len() as f32;
|
|
|
+ let end_diff: f32 = d.1.len() as f32 / m.1.len() as f32;
|
|
|
+ (start_diff, end_diff)
|
|
|
+ })
|
|
|
+ .collect::<Vec<(f32, f32)>>()
|
|
|
+ })
|
|
|
+ .collect()
|
|
|
+}
|
|
|
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
+ use env_logger::Env;
|
|
|
+ use log::info;
|
|
|
+
|
|
|
// Note this useful idiom: importing names from outer (for mod tests) scope.
|
|
|
use super::*;
|
|
|
|
|
|
+ fn init() {
|
|
|
+ let _ = env_logger::Builder::from_env(Env::default().default_filter_or("info"))
|
|
|
+ .is_test(true)
|
|
|
+ .try_init();
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn se_diff() {
|
|
|
+ init();
|
|
|
+ let case = "MERY";
|
|
|
+ let chr = "chr7";
|
|
|
+ let start = 144_103_157;
|
|
|
+ let stop = 144_183_195;
|
|
|
+ let mapq = 50;
|
|
|
+ let r = get_se_diag_mrd(
|
|
|
+ &format!("/data/longreads_basic_pipe/{case}/diag/{case}_diag_hs1.bam"),
|
|
|
+ &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_mrd_hs1.bam"),
|
|
|
+ chr,
|
|
|
+ start,
|
|
|
+ stop,
|
|
|
+ mapq,
|
|
|
+ );
|
|
|
+ println!("{r:?}");
|
|
|
+ }
|
|
|
+
|
|
|
#[test]
|
|
|
fn test_se() {
|
|
|
let bam_path = "/data/longreads_basic_pipe/CAMARA/diag/CAMARA_diag_hs1.bam";
|
|
|
@@ -512,22 +757,22 @@ mod tests {
|
|
|
}
|
|
|
|
|
|
#[test]
|
|
|
- fn test_sa() {
|
|
|
- let bam_path = "/data/longreads_basic_pipe/CAMARA/diag/CAMARA_diag_hs1.bam";
|
|
|
- let chr = "chr9";
|
|
|
- let start = 21839796;
|
|
|
+ fn sa_r() {
|
|
|
+ init();
|
|
|
+ let case = "MERY";
|
|
|
+ let chr = "chr7";
|
|
|
+ let start = 144_153_157;
|
|
|
+ let stop = 144_153_195;
|
|
|
let mapq = 50;
|
|
|
-
|
|
|
- let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path).unwrap();
|
|
|
-
|
|
|
- let a = get_start_end_qual(&mut bam, chr, start, start + 1, mapq).unwrap().pop().unwrap();
|
|
|
-
|
|
|
- let (_, e) = get_start_end_qual_rec(&mut bam, chr, start, start + 1, mapq).unwrap();
|
|
|
- let res_records = swap_by_primary(bam_path, e.get(0).unwrap().clone());
|
|
|
-
|
|
|
- let (_, e) = scan_sa(&mut bam, chr, start, start + 1, mapq).unwrap();
|
|
|
- let res_records_sa = swap_by_primary(bam_path, e.get(0).unwrap().clone());
|
|
|
- assert_eq!(a as usize, res_records.len());
|
|
|
- assert_eq!(a as usize, res_records_sa.len());
|
|
|
+ let r = sa_ratio(
|
|
|
+ &format!("/data/longreads_basic_pipe/{case}/diag/{case}_diag_hs1.bam"),
|
|
|
+ &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_diag_hs1.bam"),
|
|
|
+ chr,
|
|
|
+ start,
|
|
|
+ stop,
|
|
|
+ mapq,
|
|
|
+ 1_000
|
|
|
+ );
|
|
|
+ println!("{r:?}");
|
|
|
}
|
|
|
}
|