@@ -1,12 +1,9 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, fs::File, io::Write};
 
 use anyhow::{anyhow, Context, Ok, Result};
 use log::info;
 use rayon::prelude::*;
-use rust_htslib::{
-    bam,
-    bam::{ext::BamRecordExtensions, record, Read, Record},
-};
+use rust_htslib::bam::{self, ext::BamRecordExtensions, record, Header, Read, Record, Writer};
 
 pub fn get_hts_nt_pileup(
     bam: &mut rust_htslib::bam::IndexedReader,
@@ -162,7 +159,7 @@ pub fn get_start_end_qual(
     results.resize(length as usize, 0);
 
     for read in bam.records() {
-        let record = read.context(format!("Error while parsing record"))?;
+        let record = read.context("Error while parsing record")?;
         let rstart = record.pos() as i32;
         // let rstart = record.reference_start() as i32;
         let rend = record.reference_end() as i32;
@@ -276,7 +273,7 @@ pub fn range_depths(
     let mut depths = vec![0];
     depths.resize((stop - start) as usize, 0);
     for p in bam.pileup() {
-        let pileup = p.context(format!("eRR"))?;
+        let pileup = p.context("eRR")?;
         let rstart = pileup.pos() as i32;
 
         if rstart >= start && rstart < stop {
@@ -304,7 +301,7 @@ pub fn range_depths_qual(
     let mut depths = vec![0];
     depths.resize((stop - start) as usize, 0);
     for p in bam.pileup() {
-        let pileup = p.context(format!("eRR"))?;
+        let pileup = p.context("eRR")?;
         let rstart = pileup.pos() as i32;
 
         if rstart >= start && rstart < stop {
@@ -334,7 +331,7 @@ pub fn range_len_qual(
     let mut depths = vec![0];
    depths.resize((stop - start) as usize, 0);
     for p in bam.pileup() {
-        let pileup = p.context(format!("eRR"))?;
+        let pileup = p.context("eRR")?;
         let rstart = pileup.pos() as i32;
 
         if rstart >= start && rstart < stop {
@@ -380,24 +377,20 @@ pub fn scan_sa(
     results_end.resize(length as usize, vec![]);
 
     for read in bam.records() {
-        let record = read.context(format!("Error while parsing record"))?;
+        let record = read.context("Error while parsing record".to_string())?;
         let rstart = record.pos() as i32;
         let rend = record.reference_end() as i32;
 
-        if rstart >= start && rstart < stop {
-            if record.mapq() >= mapq && record.aux(b"SA").is_ok() {
-                let index = rstart - start;
-                let u = results_start.get_mut(index as usize).unwrap();
-                u.push(record.clone());
-            }
+        if rstart >= start && rstart < stop && record.mapq() >= mapq && record.aux(b"SA").is_ok() {
+            let index = rstart - start;
+            let u = results_start.get_mut(index as usize).unwrap();
+            u.push(record.clone());
         }
 
-        if rend >= start && rend < stop && record.aux(b"SA").is_ok() {
-            if record.mapq() >= mapq {
-                let index = rend - start;
-                let u = results_end.get_mut(index as usize).unwrap();
-                u.push(record.clone());
-            }
+        if rend >= start && rend < stop && record.aux(b"SA").is_ok() && record.mapq() >= mapq {
+            let index = rend - start;
+            let u = results_end.get_mut(index as usize).unwrap();
+            u.push(record.clone());
         }
     }
 
@@ -436,10 +429,8 @@ pub fn records_at_base(
                 if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)? {
                     bases.push((record.clone(), b));
                 }
-            } else {
-                if alignment.is_del() {
-                    bases.push((record.clone(), b'D'));
-                }
+            } else if alignment.is_del() {
+                bases.push((record.clone(), b'D'));
             }
         }
     }
@@ -477,10 +468,8 @@ pub fn qnames_at_base(
                 if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)? {
                     bases.push((qname, b));
                 }
-            } else {
-                if alignment.is_del() {
-                    bases.push((qname, b'D'));
-                }
+            } else if alignment.is_del() {
+                bases.push((qname, b'D'));
             }
         }
     }
@@ -491,7 +480,7 @@ pub fn qnames_at_base(
 }
 
 fn mean(data: &[f64]) -> Option<f64> {
-    let sum = data.iter().sum::<f64>() as f64;
+    let sum = data.iter().sum::<f64>();
     let count = data.len() as f64;
 
     match count {
@@ -506,7 +495,7 @@ fn std_deviation(data: &[f64]) -> Option<f64> {
             let variance = data
                 .iter()
                 .map(|value| {
-                    let diff = data_mean - (*value as f64);
+                    let diff = data_mean - *value;
                     diff * diff
                 })
                 .sum::<f64>()
@@ -526,7 +515,7 @@ pub fn get_se_diag_mrd(
     stop: i32,
     mapq: u8,
 ) -> Vec<Vec<Record>> {
-    let min_reads = 3;
+    // let min_reads = 3;
     let bin_size = 1_000;
     let n_bins = stop - start;
 
@@ -595,9 +584,7 @@ pub fn get_se_diag_mrd(
         .filter(|((_i, (d, m)), _r)| *d > 3 && *m == 0)
         .collect();
 
-    info!("{} locations to assemble", all.len());
-
-    // getting priamry reads from bam at given positions
+    // getting primary reads from bam at given positions
     let se_data: Vec<(Vec<Record>, Vec<Record>)> = all
         .par_chunks(100)
         .flat_map(|chunks| {
@@ -626,36 +613,11 @@ pub fn get_se_diag_mrd(
         })
         .collect();
 
-    // filtering by number of reads
-    // let mut groups: HashMap<String, Vec<Record>> = HashMap::new();
-    let mut res = Vec::new();
-    for (s, e) in se_data {
-        if s.len() >= min_reads {
-            res.push(s.clone());
-            // for r in s {
-            //     let k = format!("{}-{}", r.tid(), r.pos());
-            //     if let Some(vr) = groups.get_mut(&k) {
-            //         vr.push(r);
-            //         vr.dedup();
-            //     } else {
-            //         groups.insert(k, vec![r]);
-            //     }
-            // }
-        }
-        if e.len() >= min_reads {
-            res.push(e.clone());
-            // for r in e {
-            //     let k = format!("{}-{}", r.tid(), r.pos());
-            //     if let Some(vr) = groups.get_mut(&k) {
-            //         vr.push(r);
-            //         vr.dedup();
-            //     } else {
-            //         groups.insert(k, vec![r]);
-            //     }
-            // }
-        }
-    }
-    res
+    se_data
+        .into_iter()
+        .flat_map(|(s, e)| vec![s.clone(), e.clone()])
+        .filter(|e| !e.is_empty())
+        .collect()
 }
 
 pub fn sa_ratio(
@@ -701,26 +663,55 @@ pub fn sa_ratio(
         .collect()
 }
 
-pub fn bam_compo(file_path: &str, sample_size: usize) -> Result<Vec<(String, f64 )>> {
+pub fn bam_compo(file_path: &str, sample_size: usize) -> Result<Vec<(String, f64)>> {
     let mut bam = bam::Reader::from_path(file_path)?;
 
     let mut rgs: HashMap<String, u64> = HashMap::new();
-    for result in bam.records().filter_map(Result::ok).take(sample_size)
-    {
-        if let std::result::Result::Ok(t) = result.aux(b"RG") {
-            if let record::Aux::String(s) = t {
-                *rgs.entry(s.to_string()).or_default() += 1;
-            }
+    for result in bam.records().filter_map(Result::ok).take(sample_size) {
+        if let record::Aux::String(s) = result.aux(b"RG")? {
+            *rgs.entry(s.to_string()).or_default() += 1;
         }
     }
 
-    Ok(rgs.into_iter().map(|(rg, n)| (rg.to_string(), n as f64 * 100.0 / sample_size as f64)).collect())
+    Ok(rgs
+        .into_iter()
+        .map(|(rg, n)| (rg.to_string(), n as f64 * 100.0 / sample_size as f64))
+        .collect())
+}
+
+pub fn write_bam(records: &Vec<Record>, header: &Header, path: &str) -> Result<()> {
+    info!("{} records to write", records.len());
+    let mut writer = Writer::from_path(path, header, bam::Format::Bam).unwrap();
+    for record in records {
+        writer.write(record)?;
+    }
+    Ok(())
+}
+
+pub fn write_fasta(path: &str, records: &Vec<Record>) -> anyhow::Result<()> {
+    let mut file = File::create(path)?;
+    let mut writer = noodles_fasta::io::Writer::new(Vec::new());
+
+    for record in records {
+        let qname = String::from_utf8(record.qname().to_vec())?;
+        let qseq = record.seq().as_bytes();
+        let qqual = String::from_utf8(record.qual().to_vec())?;
+        assert_eq!(qseq.len(), qqual.len());
+        let fastq_record = noodles_fasta::Record::new(
+            noodles_fasta::record::Definition::new(qname.clone(), None),
+            qseq.into(),
+        );
+        writer.write_record(&fastq_record)?;
+    }
+    file.write_all(writer.get_ref())?;
+    Ok(())
 }
 
 #[cfg(test)]
 mod tests {
     use env_logger::Env;
     use log::info;
+    use uuid::Uuid;
 
     // Note this useful idiom: importing names from outer (for mod tests) scope.
     use super::*;
@@ -764,7 +755,7 @@ mod tests {
 
         let res = get_start_end_qual_rec(&mut bam, chr, start, start + 1, mapq).unwrap();
         let (_, e) = res;
-        let res_records = swap_by_primary(bam_path, e.get(0).unwrap().clone());
+        let res_records = swap_by_primary(bam_path, e.first().unwrap().clone());
         println!("{res_records:?}");
         println!("{}", res_records.len());
         assert!(res_records.len() == 12);
@@ -780,7 +771,7 @@ mod tests {
         let mapq = 50;
         let r = sa_ratio(
             &format!("/data/longreads_basic_pipe/{case}/diag/{case}_diag_hs1.bam"),
-            &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_diag_hs1.bam"),
+            &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_mrd_hs1.bam"),
             chr,
             start,
             stop,
@@ -792,7 +783,42 @@ mod tests {
 
     #[test]
     fn rg() {
-        let rg = bam_compo("/data/longreads_basic_pipe/KENNOUCHE/mrd/KENNOUCHE_mrd_hs1.bam", 20000);
+        let rg = bam_compo(
+            "/data/longreads_basic_pipe/KENNOUCHE/mrd/KENNOUCHE_mrd_hs1.bam",
+            20000,
+        );
         println!("{rg:#?}");
     }
+
+    #[test]
+    fn se_scan() -> Result<()> {
+        init();
+        // chr9:21,830,434_22,010,143
+        let id = "BECERRA";
+        let chr = "chr9";
+        let start = 21_820_000;
+        let stop = 23_000_000;
+
+        let result_dir = "/data/longreads_basic_pipe";
+        let diag_bam_path = format!("{result_dir}/{id}/diag/{id}_diag_hs1.bam");
+        let mrd_bam_path = format!("{result_dir}/{id}/mrd/{id}_mrd_hs1.bam");
+        let records = get_se_diag_mrd(&diag_bam_path, &mrd_bam_path, chr, start, stop, 50);
+
+        let reader = rust_htslib::bam::Reader::from_path(diag_bam_path)?;
+        let header = reader.header().clone();
+        let h = rust_htslib::bam::header::Header::from_template(&header);
+
+        let result_dir = format!("{result_dir}/{id}/diag/asm_contigs");
+        if !std::path::PathBuf::from(&result_dir).exists() {
+            std::fs::create_dir(&result_dir)?;
+        }
+        for e in records.iter() {
+            let uuid = Uuid::new_v4();
+            write_fasta(&format!("{result_dir}/{uuid}.fasta"), e)?;
+            write_bam(e, &h, &format!("{result_dir}/{uuid}.bam"))
+                .context("Error while writing bam")?;
+        }
+        info!("{} group of records", records.len());
+        Ok(())
+    }
 }