|
|
@@ -0,0 +1,778 @@
|
|
|
+use std::collections::HashMap;
|
|
|
+
|
|
|
+use anyhow::{anyhow, Context, Ok, Result};
|
|
|
+use log::info;
|
|
|
+use rayon::prelude::*;
|
|
|
+use rust_htslib::{
|
|
|
+ bam,
|
|
|
+ bam::{ext::BamRecordExtensions, record, Read, Record},
|
|
|
+};
|
|
|
+
|
|
|
+pub fn get_hts_nt_pileup(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ with_next_ins: bool,
|
|
|
+) -> Result<Vec<u8>> {
|
|
|
+ let stop = start + 1;
|
|
|
+ let mut bases = Vec::new();
|
|
|
+ bam.fetch((chr, start, stop))?;
|
|
|
+ let mut bam_pileup = Vec::new();
|
|
|
+ for p in bam.pileup() {
|
|
|
+ let pileup = p.context(format!(
|
|
|
+ "Can't pilup bam at position {}:{}-{}",
|
|
|
+ chr, start, stop
|
|
|
+ ))?;
|
|
|
+ let position = pileup.pos() as i32;
|
|
|
+ if position == start {
|
|
|
+ for alignment in pileup.alignments() {
|
|
|
+ match alignment.indel() {
|
|
|
+ bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
|
|
|
+ bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
|
|
|
+ _ => {
|
|
|
+ let record = alignment.record();
|
|
|
+ if record.seq_len() > 0 {
|
|
|
+ if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)? {
|
|
|
+ bases.push(b);
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if alignment.is_del() {
|
|
|
+ bases.push(b'D');
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(bases)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn hts_base_at(
|
|
|
+ record: &rust_htslib::bam::record::Record,
|
|
|
+ at_pos: u32,
|
|
|
+ with_next_ins: bool,
|
|
|
+) -> Result<Option<u8>> {
|
|
|
+ use rust_htslib::bam::record::Cigar;
|
|
|
+
|
|
|
+ let cigar = record.cigar();
|
|
|
+ let seq = record.seq();
|
|
|
+ let pos = cigar.pos() as u32;
|
|
|
+
|
|
|
+ let mut read_i = 0u32;
|
|
|
+ let at_pos = at_pos - 1;
|
|
|
+ let mut ref_pos = pos;
|
|
|
+ if ref_pos > at_pos {
|
|
|
+ return Ok(None);
|
|
|
+ }
|
|
|
+
|
|
|
+ for (id, op) in cigar.iter().enumerate() {
|
|
|
+ let (add_read, add_ref) = match *op {
|
|
|
+ Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
|
|
|
+ Cigar::Ins(len) => (len, 0),
|
|
|
+ Cigar::Del(len) => (0, len),
|
|
|
+ Cigar::RefSkip(len) => (0, len),
|
|
|
+ Cigar::SoftClip(len) => (len, 0),
|
|
|
+ Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
|
|
|
+ };
|
|
|
+ // If at the end of the op len and next op is Ins return I
|
|
|
+ if with_next_ins && ref_pos + add_read == at_pos + 1 {
|
|
|
+ if let Some(Cigar::Ins(_)) = cigar.get(id + 1) {
|
|
|
+ return Ok(Some(b'I'));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if ref_pos + add_ref > at_pos {
|
|
|
+ // Handle deletions directly
|
|
|
+ if let Cigar::Del(_) = *op {
|
|
|
+ return Ok(Some(b'D'));
|
|
|
+ } else if let Cigar::RefSkip(_) = op {
|
|
|
+ return Ok(None);
|
|
|
+ } else {
|
|
|
+ let diff = at_pos - ref_pos;
|
|
|
+ let p = read_i + diff;
|
|
|
+ return Ok(Some(seq[p as usize]));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ read_i += add_read;
|
|
|
+ ref_pos += add_ref;
|
|
|
+ }
|
|
|
+ Ok(None)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn get_n_start(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+) -> Result<usize> {
|
|
|
+ bam.fetch((chr, start, stop))?;
|
|
|
+
|
|
|
+ // let mut start_positions = Vec::new();
|
|
|
+ let mut n_start = 0;
|
|
|
+ for read in bam.records() {
|
|
|
+ let record = read.context(format!("eRR"))?;
|
|
|
+ let rstart = record.pos() as i32;
|
|
|
+ if rstart >= start && rstart < stop {
|
|
|
+ if record.mapq() > 40 {
|
|
|
+ n_start += 1;
|
|
|
+ }
|
|
|
+ // start_positions.push(rstart);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(n_start)
|
|
|
+ // Ok(start_positions.len())
|
|
|
+}
|
|
|
+
|
|
|
+pub fn get_n_start_end_qual(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+) -> Result<usize> {
|
|
|
+ bam.fetch((chr, start, stop))?;
|
|
|
+
|
|
|
+ // let mut start_positions = Vec::new();
|
|
|
+ let mut n_start = 0;
|
|
|
+ for read in bam.records() {
|
|
|
+ let record = read.context(format!("eRR"))?;
|
|
|
+ let rstart = record.pos() as i32;
|
|
|
+ let rend = record.reference_end() as i32;
|
|
|
+ if rstart >= start && rstart < stop || rend >= start && rend < stop {
|
|
|
+ if record.mapq() >= mapq {
|
|
|
+ n_start += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(n_start)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn get_start_end_qual(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+) -> Result<Vec<i32>> {
|
|
|
+ bam.fetch((chr, start - 1, stop))?;
|
|
|
+ let length = stop - start;
|
|
|
+ let mut results = Vec::new();
|
|
|
+ results.resize(length as usize, 0);
|
|
|
+
|
|
|
+ for read in bam.records() {
|
|
|
+ let record = read.context(format!("Error while parsing record"))?;
|
|
|
+ let rstart = record.pos() as i32;
|
|
|
+ // let rstart = record.reference_start() as i32;
|
|
|
+ let rend = record.reference_end() as i32;
|
|
|
+
|
|
|
+ if rstart >= start && rstart < stop {
|
|
|
+ // println!("start {rstart}");
|
|
|
+ if record.mapq() >= mapq {
|
|
|
+ let index = rstart - start;
|
|
|
+ *results.get_mut(index as usize).unwrap() += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if rend >= start && rend < stop {
|
|
|
+ // println!("{rend}");
|
|
|
+ if record.mapq() >= mapq {
|
|
|
+ let index = rend - start;
|
|
|
+ *results.get_mut(index as usize).unwrap() += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(results)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn get_start_end_qual_rec(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+) -> Result<(Vec<Vec<Record>>, Vec<Vec<Record>>)> {
|
|
|
+ bam.fetch((chr, start - 1, stop))?;
|
|
|
+ let length = stop - start;
|
|
|
+ let mut results_start: Vec<Vec<Record>> = Vec::new();
|
|
|
+ results_start.resize(length as usize, vec![]);
|
|
|
+ let mut results_end: Vec<Vec<Record>> = Vec::new();
|
|
|
+ results_end.resize(length as usize, vec![]);
|
|
|
+
|
|
|
+ for read in bam.records() {
|
|
|
+ let record = read.context(format!("Error while parsing record"))?;
|
|
|
+ let rstart = record.pos() as i32;
|
|
|
+ let rend = record.reference_end() as i32;
|
|
|
+
|
|
|
+ if rstart >= start && rstart < stop {
|
|
|
+ if record.mapq() >= mapq {
|
|
|
+ let index = rstart - start;
|
|
|
+ let u = results_start.get_mut(index as usize).unwrap();
|
|
|
+ u.push(record.clone());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if rend >= start && rend < stop {
|
|
|
+ if record.mapq() >= mapq {
|
|
|
+ let index = rend - start;
|
|
|
+ let u = results_end.get_mut(index as usize).unwrap();
|
|
|
+ u.push(record.clone());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok((results_start, results_end))
|
|
|
+}
|
|
|
+
|
|
|
+pub fn primary_record(bam_path: &str, record: &Record) -> Option<Record> {
|
|
|
+ let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path).unwrap();
|
|
|
+ if record.is_supplementary() {
|
|
|
+ let qname = record.qname();
|
|
|
+ let mut positions: Vec<(&str, i32)> = Vec::new();
|
|
|
+ match record.aux(b"SA") {
|
|
|
+ std::result::Result::Ok(v) => match v {
|
|
|
+ record::Aux::String(v) => {
|
|
|
+ positions = v
|
|
|
+ .split(";")
|
|
|
+ .filter(|s| s.len() != 0)
|
|
|
+ .map(|s| {
|
|
|
+ let s = s.split(",").take(2).collect::<Vec<&str>>();
|
|
|
+ let chr = *s.get(0).unwrap();
|
|
|
+ let position: i32 = s.get(1).unwrap().parse().unwrap();
|
|
|
+ (chr, position)
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+ }
|
|
|
+ _ => (),
|
|
|
+ },
|
|
|
+ Err(_) => (),
|
|
|
+ };
|
|
|
+ for (chr, start) in positions {
|
|
|
+ bam.fetch((chr, start, start + 1)).unwrap();
|
|
|
+ let records = bam.records();
|
|
|
+ for record in records {
|
|
|
+ let record = record.unwrap();
|
|
|
+ // String::from_utf8(record.qname().to_vec()).unwrap().as_str()
|
|
|
+ if qname == record.qname() {
|
|
|
+ if !record.is_supplementary() {
|
|
|
+ return Some(record.clone());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ None
|
|
|
+}
|
|
|
+
|
|
|
+pub fn range_depths(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+) -> Result<Vec<u32>> {
|
|
|
+ bam.fetch((chr, start, stop))?;
|
|
|
+
|
|
|
+ let mut depths = vec![0];
|
|
|
+ depths.resize((stop - start) as usize, 0);
|
|
|
+ for p in bam.pileup() {
|
|
|
+ let pileup = p.context(format!("eRR"))?;
|
|
|
+ let rstart = pileup.pos() as i32;
|
|
|
+
|
|
|
+ if rstart >= start && rstart < stop {
|
|
|
+ let v = depths
|
|
|
+ .get_mut((rstart - start) as usize)
|
|
|
+ .context(format!("Errrr {}", rstart - start))?;
|
|
|
+ *v = pileup.depth();
|
|
|
+ } else if rstart >= stop {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(depths)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn range_depths_qual(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+) -> Result<Vec<usize>> {
|
|
|
+ bam.fetch((chr, start, stop))?;
|
|
|
+
|
|
|
+ let mut depths = vec![0];
|
|
|
+ depths.resize((stop - start) as usize, 0);
|
|
|
+ for p in bam.pileup() {
|
|
|
+ let pileup = p.context(format!("eRR"))?;
|
|
|
+ let rstart = pileup.pos() as i32;
|
|
|
+
|
|
|
+ if rstart >= start && rstart < stop {
|
|
|
+ let v = depths
|
|
|
+ .get_mut((rstart - start) as usize)
|
|
|
+ .context(format!("Errrr {}", rstart - start))?;
|
|
|
+ *v = pileup
|
|
|
+ .alignments()
|
|
|
+ .filter(|e| e.record().mapq() >= mapq)
|
|
|
+ .count();
|
|
|
+ } else if rstart >= stop {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(depths)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn range_len_qual(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+) -> Result<Vec<usize>> {
|
|
|
+ bam.fetch((chr, start, stop))?;
|
|
|
+
|
|
|
+ let mut depths = vec![0];
|
|
|
+ depths.resize((stop - start) as usize, 0);
|
|
|
+ for p in bam.pileup() {
|
|
|
+ let pileup = p.context(format!("eRR"))?;
|
|
|
+ let rstart = pileup.pos() as i32;
|
|
|
+
|
|
|
+ if rstart >= start && rstart < stop {
|
|
|
+ let v = depths
|
|
|
+ .get_mut((rstart - start) as usize)
|
|
|
+ .context(format!("Errrr {}", rstart - start))?;
|
|
|
+ *v = pileup
|
|
|
+ .alignments()
|
|
|
+ .filter(|e| e.record().mapq() >= mapq)
|
|
|
+ .map(|e| e.record().seq_len())
|
|
|
+ .sum();
|
|
|
+ } else if rstart >= stop {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(depths)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn swap_by_primary(bam_path: &str, records: Vec<Record>) -> Vec<Record> {
|
|
|
+ let mut res_records = Vec::new();
|
|
|
+ for record in records {
|
|
|
+ if let Some(record) = primary_record(bam_path, &record) {
|
|
|
+ res_records.push(record);
|
|
|
+ } else {
|
|
|
+ res_records.push(record);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ res_records
|
|
|
+}
|
|
|
+
|
|
|
+pub fn scan_sa(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+) -> Result<Vec<(Vec<Record>, Vec<Record>)>> {
|
|
|
+ bam.fetch((chr, start - 1, stop))?;
|
|
|
+ let length = stop - start;
|
|
|
+ let mut results_start: Vec<Vec<Record>> = Vec::new();
|
|
|
+ results_start.resize(length as usize, vec![]);
|
|
|
+ let mut results_end: Vec<Vec<Record>> = Vec::new();
|
|
|
+ results_end.resize(length as usize, vec![]);
|
|
|
+
|
|
|
+ for read in bam.records() {
|
|
|
+ let record = read.context(format!("Error while parsing record"))?;
|
|
|
+ let rstart = record.pos() as i32;
|
|
|
+ let rend = record.reference_end() as i32;
|
|
|
+
|
|
|
+ if rstart >= start && rstart < stop {
|
|
|
+ if record.mapq() >= mapq && record.aux(b"SA").is_ok() {
|
|
|
+ let index = rstart - start;
|
|
|
+ let u = results_start.get_mut(index as usize).unwrap();
|
|
|
+ u.push(record.clone());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if rend >= start && rend < stop && record.aux(b"SA").is_ok() {
|
|
|
+ if record.mapq() >= mapq {
|
|
|
+ let index = rend - start;
|
|
|
+ let u = results_end.get_mut(index as usize).unwrap();
|
|
|
+ u.push(record.clone());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let res: Vec<(Vec<Record>, Vec<Record>)> = results_start
|
|
|
+ .iter()
|
|
|
+ .zip(results_end.iter())
|
|
|
+ .map(|(s, e)| (s.to_owned(), e.to_owned()))
|
|
|
+ .collect();
|
|
|
+ Ok(res)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn records_at_base(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ with_next_ins: bool,
|
|
|
+) -> Result<Vec<(Record, u8)>> {
|
|
|
+ let stop = start + 1;
|
|
|
+ let mut bases = Vec::new();
|
|
|
+ bam.fetch((chr, start, stop))?;
|
|
|
+ let mut bam_pileup = Vec::new();
|
|
|
+ for p in bam.pileup() {
|
|
|
+ let pileup = p.context(format!(
|
|
|
+ "Can't pilup bam at position {}:{}-{}",
|
|
|
+ chr, start, stop
|
|
|
+ ))?;
|
|
|
+ let position = pileup.pos() as i32;
|
|
|
+ if position == start {
|
|
|
+ for alignment in pileup.alignments() {
|
|
|
+ match alignment.indel() {
|
|
|
+ bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
|
|
|
+ bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
|
|
|
+ _ => {
|
|
|
+ let record = alignment.record();
|
|
|
+ if record.seq_len() > 0 {
|
|
|
+ if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)? {
|
|
|
+ bases.push((record.clone(), b));
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if alignment.is_del() {
|
|
|
+ bases.push((record.clone(), b'D'));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(bases)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn qnames_at_base(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ with_next_ins: bool,
|
|
|
+) -> Result<Vec<(String, u8)>> {
|
|
|
+ let stop = start + 1;
|
|
|
+ let mut bases = Vec::new();
|
|
|
+ bam.fetch((chr, start, stop))?;
|
|
|
+ let mut bam_pileup = Vec::new();
|
|
|
+ for p in bam.pileup() {
|
|
|
+ let pileup = p.context(format!(
|
|
|
+ "Can't pilup bam at position {}:{}-{}",
|
|
|
+ chr, start, stop
|
|
|
+ ))?;
|
|
|
+ let position = pileup.pos() as i32;
|
|
|
+ if position == start {
|
|
|
+ for alignment in pileup.alignments() {
|
|
|
+ match alignment.indel() {
|
|
|
+ bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
|
|
|
+ bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
|
|
|
+ _ => {
|
|
|
+ let record = alignment.record();
|
|
|
+ let qname = String::from_utf8(record.qname().to_vec())?;
|
|
|
+ if record.seq_len() > 0 {
|
|
|
+ if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)? {
|
|
|
+ bases.push((qname, b));
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if alignment.is_del() {
|
|
|
+ bases.push((qname, b'D'));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(bases)
|
|
|
+}
|
|
|
+
|
|
|
+fn mean(data: &[f64]) -> Option<f64> {
|
|
|
+ let sum = data.iter().sum::<f64>() as f64;
|
|
|
+ let count = data.len() as f64;
|
|
|
+
|
|
|
+ match count {
|
|
|
+ positive if positive > 0.0 => Some(sum / count),
|
|
|
+ _ => None,
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+fn std_deviation(data: &[f64]) -> Option<f64> {
|
|
|
+ match (mean(data), data.len() as f64) {
|
|
|
+ (Some(data_mean), count) if count > 0.0 => {
|
|
|
+ let variance = data
|
|
|
+ .iter()
|
|
|
+ .map(|value| {
|
|
|
+ let diff = data_mean - (*value as f64);
|
|
|
+ diff * diff
|
|
|
+ })
|
|
|
+ .sum::<f64>()
|
|
|
+ / count;
|
|
|
+
|
|
|
+ Some(variance.sqrt())
|
|
|
+ }
|
|
|
+ _ => None,
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+pub fn get_se_diag_mrd(
|
|
|
+ diag_bam_path: &str,
|
|
|
+ mrd_bam_path: &str,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+) -> Vec<Vec<Record>> {
|
|
|
+ let min_reads = 3;
|
|
|
+ let bin_size = 1_000;
|
|
|
+ let n_bins = stop - start;
|
|
|
+
|
|
|
+ let mut se_raw = vec![(0, 0)];
|
|
|
+ se_raw.resize(n_bins as usize, (0, 0));
|
|
|
+
|
|
|
+ let get_pos = |i: usize| -> i32 { start + (i as i32 * bin_size as i32) };
|
|
|
+ let se_raw: Vec<(i32, i32)> = se_raw
|
|
|
+ .par_chunks(bin_size)
|
|
|
+ .enumerate()
|
|
|
+ .flat_map(|(i, _)| {
|
|
|
+ let mut diag_bam = rust_htslib::bam::IndexedReader::from_path(diag_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", diag_bam_path))
|
|
|
+ .unwrap();
|
|
|
+ let mut mrd_bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", mrd_bam_path))
|
|
|
+ .unwrap();
|
|
|
+
|
|
|
+ let s = get_pos(i);
|
|
|
+ let e = s + bin_size as i32;
|
|
|
+ let d = get_start_end_qual(&mut diag_bam, chr, s, e, mapq).unwrap();
|
|
|
+ let m = get_start_end_qual(&mut mrd_bam, chr, s, e, mapq).unwrap();
|
|
|
+
|
|
|
+ d.iter()
|
|
|
+ .zip(m.iter())
|
|
|
+ .map(|(d, m)| (*d, *m))
|
|
|
+ .collect::<Vec<(i32, i32)>>()
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ info!("raw {}", se_raw.len());
|
|
|
+
|
|
|
+ let (diag_sum, mrd_sum) = se_raw
|
|
|
+ .iter()
|
|
|
+ .fold((0i32, 0i32), |acc, (d, m)| (acc.0 + d, acc.1 + m));
|
|
|
+
|
|
|
+ let diag = se_raw.iter().map(|(d, _)| *d as f64).collect::<Vec<f64>>();
|
|
|
+ let diag_mean = mean(&diag).unwrap();
|
|
|
+ let diag_std = std_deviation(&diag).unwrap();
|
|
|
+
|
|
|
+ let mrd = se_raw.iter().map(|(_, m)| *m as f64).collect::<Vec<f64>>();
|
|
|
+ let mrd_mean = mean(&mrd).unwrap();
|
|
|
+ let mrd_std = std_deviation(&mrd).unwrap();
|
|
|
+
|
|
|
+ info!(
|
|
|
+ "N/nt diag {} mrd {}",
|
|
|
+ diag_sum as f64 / (stop - start) as f64,
|
|
|
+ mrd_sum as f64 / (stop - start) as f64
|
|
|
+ );
|
|
|
+ info!("Mean diag {diag_mean} mrd {mrd_mean}");
|
|
|
+ info!("std dev diag {diag_std} mrd {mrd_std}");
|
|
|
+
|
|
|
+ let ratio_se: Vec<f64> = diag
|
|
|
+ .iter()
|
|
|
+ .zip(mrd.iter())
|
|
|
+ .map(|(d, m)| (((d / diag_mean) + 1.0) / ((m / mrd_mean) + 1.0)) - 1.0)
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let r_stddev = std_deviation(&ratio_se).unwrap();
|
|
|
+ info!("ratio mean {} std dev {r_stddev}", mean(&ratio_se).unwrap());
|
|
|
+
|
|
|
+ let all: Vec<_> = se_raw
|
|
|
+ .into_iter()
|
|
|
+ .enumerate()
|
|
|
+ .zip(ratio_se.iter())
|
|
|
+ .filter(|((_i, (d, m)), _r)| *d > 3 && *m == 0)
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ info!("{} locations to assemble", all.len());
|
|
|
+
|
|
|
+ // getting priamry reads from bam at given positions
|
|
|
+ let se_data: Vec<(Vec<Record>, Vec<Record>)> = all
|
|
|
+ .par_chunks(100)
|
|
|
+ .flat_map(|chunks| {
|
|
|
+ // Loading bam reader.
|
|
|
+ let mut diag_bam = rust_htslib::bam::IndexedReader::from_path(diag_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", diag_bam_path))
|
|
|
+ .unwrap();
|
|
|
+ chunks
|
|
|
+ .to_vec()
|
|
|
+ .iter()
|
|
|
+ .map(|((i, (_d, _m)), _r)| {
|
|
|
+ let pos = *i as i32 + start;
|
|
|
+ let (mut s, mut e) =
|
|
|
+ get_start_end_qual_rec(&mut diag_bam, chr, pos, pos + 1, mapq).unwrap();
|
|
|
+ let s = s.pop().unwrap();
|
|
|
+ let s = swap_by_primary(diag_bam_path, s);
|
|
|
+ let e = e.pop().unwrap();
|
|
|
+ let e = swap_by_primary(diag_bam_path, e);
|
|
|
+ // info!("{chr}:{pos}\t{r}\t{d} (s:{} e:{:?})\t{m}", s.len(), e.len());
|
|
|
+ // println!("e {e:?}");
|
|
|
+ // println!("s {s:?}");
|
|
|
+
|
|
|
+ (s, e)
|
|
|
+ })
|
|
|
+ .collect::<Vec<(Vec<Record>, Vec<Record>)>>()
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ // filtering by number of reads
|
|
|
+ // let mut groups: HashMap<String, Vec<Record>> = HashMap::new();
|
|
|
+ let mut res = Vec::new();
|
|
|
+ for (s, e) in se_data {
|
|
|
+ if s.len() >= min_reads {
|
|
|
+ res.push(s.clone());
|
|
|
+ // for r in s {
|
|
|
+ // let k = format!("{}-{}", r.tid(), r.pos());
|
|
|
+ // if let Some(vr) = groups.get_mut(&k) {
|
|
|
+ // vr.push(r);
|
|
|
+ // vr.dedup();
|
|
|
+ // } else {
|
|
|
+ // groups.insert(k, vec![r]);
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ }
|
|
|
+ if e.len() >= min_reads {
|
|
|
+ res.push(e.clone());
|
|
|
+ // for r in e {
|
|
|
+ // let k = format!("{}-{}", r.tid(), r.pos());
|
|
|
+ // if let Some(vr) = groups.get_mut(&k) {
|
|
|
+ // vr.push(r);
|
|
|
+ // vr.dedup();
|
|
|
+ // } else {
|
|
|
+ // groups.insert(k, vec![r]);
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ res
|
|
|
+}
|
|
|
+
|
|
|
+pub fn sa_ratio(
|
|
|
+ diag_bam_path: &str,
|
|
|
+ mrd_bam_path: &str,
|
|
|
+ chr: &str,
|
|
|
+ start: i32,
|
|
|
+ stop: i32,
|
|
|
+ mapq: u8,
|
|
|
+ bin_size: usize,
|
|
|
+) -> Vec<(f32, f32)> {
|
|
|
+ let n_bins = stop - start;
|
|
|
+
|
|
|
+ let get_pos = |i: usize| -> i32 { start + (i as i32 * bin_size as i32) };
|
|
|
+ let mut se_raw = vec![(0, 0)];
|
|
|
+ se_raw.resize(n_bins as usize, (0, 0));
|
|
|
+
|
|
|
+ se_raw
|
|
|
+ .par_chunks(bin_size)
|
|
|
+ .enumerate()
|
|
|
+ .flat_map(|(i, _)| {
|
|
|
+ let mut diag_bam = rust_htslib::bam::IndexedReader::from_path(diag_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", diag_bam_path))
|
|
|
+ .unwrap();
|
|
|
+ let mut mrd_bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam_path)
|
|
|
+ .context(anyhow!("Reading {}", mrd_bam_path))
|
|
|
+ .unwrap();
|
|
|
+
|
|
|
+ let s = get_pos(i);
|
|
|
+ let e = s + bin_size as i32;
|
|
|
+ let d = scan_sa(&mut diag_bam, chr, s, e, mapq).unwrap();
|
|
|
+ let m = scan_sa(&mut mrd_bam, chr, s, e, mapq).unwrap();
|
|
|
+
|
|
|
+ d.iter()
|
|
|
+ .zip(m.iter())
|
|
|
+ .map(|(d, m)| {
|
|
|
+ let start_diff: f32 = d.0.len() as f32 / m.0.len() as f32;
|
|
|
+ let end_diff: f32 = d.1.len() as f32 / m.1.len() as f32;
|
|
|
+ (start_diff, end_diff)
|
|
|
+ })
|
|
|
+ .collect::<Vec<(f32, f32)>>()
|
|
|
+ })
|
|
|
+ .collect()
|
|
|
+}
|
|
|
+
|
|
|
+#[cfg(test)]
|
|
|
+mod tests {
|
|
|
+ use env_logger::Env;
|
|
|
+ use log::info;
|
|
|
+
|
|
|
+ // Note this useful idiom: importing names from outer (for mod tests) scope.
|
|
|
+ use super::*;
|
|
|
+
|
|
|
+ fn init() {
|
|
|
+ let _ = env_logger::Builder::from_env(Env::default().default_filter_or("info"))
|
|
|
+ .is_test(true)
|
|
|
+ .try_init();
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn se_diff() {
|
|
|
+ init();
|
|
|
+ let case = "MERY";
|
|
|
+ let chr = "chr7";
|
|
|
+ let start = 144_103_157;
|
|
|
+ let stop = 144_183_195;
|
|
|
+ let mapq = 50;
|
|
|
+ let r = get_se_diag_mrd(
|
|
|
+ &format!("/data/longreads_basic_pipe/{case}/diag/{case}_diag_hs1.bam"),
|
|
|
+ &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_mrd_hs1.bam"),
|
|
|
+ chr,
|
|
|
+ start,
|
|
|
+ stop,
|
|
|
+ mapq,
|
|
|
+ );
|
|
|
+ println!("{r:?}");
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn test_se() {
|
|
|
+ let bam_path = "/data/longreads_basic_pipe/CAMARA/diag/CAMARA_diag_hs1.bam";
|
|
|
+ let chr = "chr9";
|
|
|
+ let start = 21839796;
|
|
|
+ let mapq = 50;
|
|
|
+
|
|
|
+ let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path).unwrap();
|
|
|
+
|
|
|
+ // let a = get_start_end_qual(&mut bam, chr, start, start + 1, mapq).unwrap();
|
|
|
+ // println!("{a:?}");
|
|
|
+
|
|
|
+ let res = get_start_end_qual_rec(&mut bam, chr, start, start + 1, mapq).unwrap();
|
|
|
+ let (_, e) = res;
|
|
|
+ let res_records = swap_by_primary(bam_path, e.get(0).unwrap().clone());
|
|
|
+ println!("{res_records:?}");
|
|
|
+ println!("{}", res_records.len());
|
|
|
+ assert!(res_records.len() == 12);
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn sa_r() {
|
|
|
+ init();
|
|
|
+ let case = "MERY";
|
|
|
+ let chr = "chr7";
|
|
|
+ let start = 144_153_157;
|
|
|
+ let stop = 144_153_195;
|
|
|
+ let mapq = 50;
|
|
|
+ let r = sa_ratio(
|
|
|
+ &format!("/data/longreads_basic_pipe/{case}/diag/{case}_diag_hs1.bam"),
|
|
|
+ &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_diag_hs1.bam"),
|
|
|
+ chr,
|
|
|
+ start,
|
|
|
+ stop,
|
|
|
+ mapq,
|
|
|
+ 1_000
|
|
|
+ );
|
|
|
+ println!("{r:?}");
|
|
|
+ }
|
|
|
+}
|