|
|
@@ -0,0 +1,449 @@
|
|
|
+use anyhow::{anyhow, Ok, Result};
|
|
|
+use crossbeam_channel::{unbounded, Receiver, Sender};
|
|
|
+use indicatif::MultiProgress;
|
|
|
+use pandora_lib_variants::{utils::new_pg_speed, variants::Variant};
|
|
|
+use log::{info, warn};
|
|
|
+use num_format::{CustomFormat, Grouping, ToFormattedString};
|
|
|
+use rayon::prelude::*;
|
|
|
+use rust_htslib::bam::IndexedReader;
|
|
|
+use std::{
|
|
|
+ cmp::Ordering,
|
|
|
+ collections::{HashSet, VecDeque},
|
|
|
+ thread::spawn,
|
|
|
+};
|
|
|
+use std::{fs::OpenOptions, io::Write};
|
|
|
+
|
|
|
/// One allele of a heterozygous variant at a single genomic position,
/// together with the query names of the reads supporting that allele.
#[derive(Debug, Clone)]
pub struct HeteroVar {
    /// Contig / chromosome name the variant sits on.
    pub chr: String,
    /// Position on `chr` (coordinate convention follows the `pileup`
    /// helpers — assumed 0-based like htslib; TODO confirm).
    pub position: i32,
    /// The allele base this record represents, as a raw byte (e.g. `b'A'`).
    pub base: u8,
    /// Fraction of reads at `position` that carry `base` (variant allele
    /// frequency): `reads.len() / depth`.
    pub vaf: f64,
    /// Query names of the reads supporting `base` at `position`.
    pub reads: HashSet<String>,
}
|
|
|
+
|
|
|
+impl HeteroVar {
|
|
|
+ pub fn new(
|
|
|
+ bam: &mut IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ position: i32,
|
|
|
+ reference: u8,
|
|
|
+ alternative: u8,
|
|
|
+ ) -> Result<(HeteroVar, HeteroVar)> {
|
|
|
+ let rec_base = if let std::result::Result::Ok(rb) =
|
|
|
+ pileup::qnames_at_base(bam, chr, position, false)
|
|
|
+ {
|
|
|
+ rb
|
|
|
+ } else {
|
|
|
+ return Err(anyhow!("Error while reading BAM file."));
|
|
|
+ };
|
|
|
+ let depth = rec_base.len() as i32;
|
|
|
+ if depth == 0 {
|
|
|
+ return Err(anyhow!("No records"));
|
|
|
+ }
|
|
|
+ let mut a = (reference, Vec::new());
|
|
|
+ let mut b = (alternative, Vec::new());
|
|
|
+ for (record, base) in rec_base.into_iter() {
|
|
|
+ if base == reference {
|
|
|
+ a.1.push(record.clone());
|
|
|
+ } else if base == alternative {
|
|
|
+ b.1.push(record.clone());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let res: Vec<HeteroVar> = vec![a, b]
|
|
|
+ .into_iter()
|
|
|
+ .enumerate()
|
|
|
+ .filter(|(_, (_, rec))| rec.len() > 0)
|
|
|
+ .map(|(_, (base, records))| {
|
|
|
+ let mut records_ids: HashSet<String> = HashSet::new();
|
|
|
+ records_ids.extend(records.into_iter());
|
|
|
+ HeteroVar {
|
|
|
+ chr: chr.to_string(),
|
|
|
+ position,
|
|
|
+ base,
|
|
|
+ vaf: records_ids.len() as f64 / depth as f64,
|
|
|
+ reads: records_ids,
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ if res.len() == 2 {
|
|
|
+ let a = res.get(0).unwrap().to_owned();
|
|
|
+ let b = res.get(1).unwrap().to_owned();
|
|
|
+ return Ok((a, b));
|
|
|
+ } else {
|
|
|
+ return Err(anyhow!("Not enough reads"));
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+impl PartialEq for HeteroVar {
|
|
|
+ fn eq(&self, other: &Self) -> bool {
|
|
|
+ self.reads == other.reads
|
|
|
+ && self.chr == other.chr
|
|
|
+ && self.position == other.position
|
|
|
+ && self.base == other.base
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
/// A phased group of heterozygous variants: variants whose supporting
/// reads overlap enough to be placed on the same haplotype.
#[derive(Debug, Clone)]
pub struct Phase {
    /// The variants in this phase (kept sorted by position by the merge
    /// methods).
    pub data: Vec<HeteroVar>,
    /// Union of the read names supporting every variant in `data`.
    pub reads: HashSet<String>,
}
|
|
|
+
|
|
|
+impl Phase {
|
|
|
+ pub fn new(hetero_var: &HeteroVar) -> Self {
|
|
|
+ Self {
|
|
|
+ data: vec![hetero_var.clone()],
|
|
|
+ reads: hetero_var.reads.clone(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+ pub fn try_merge(&mut self, hetero_var: &HeteroVar, min_records: usize) -> bool {
|
|
|
+ if hetero_var
|
|
|
+ .reads
|
|
|
+ .par_iter()
|
|
|
+ .filter(|r| self.reads.contains(*r))
|
|
|
+ .count()
|
|
|
+ >= min_records
|
|
|
+ {
|
|
|
+ self.reads.extend(hetero_var.reads.clone().into_iter());
|
|
|
+ self.data.push(hetero_var.clone());
|
|
|
+ self.data.sort_by(|a, b| a.position.cmp(&b.position));
|
|
|
+ true
|
|
|
+ } else {
|
|
|
+ false
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn try_merge_phase(&mut self, phase: &Phase, min_records: usize) -> bool {
|
|
|
+ if phase
|
|
|
+ .reads
|
|
|
+ .par_iter()
|
|
|
+ .filter(|r| self.reads.contains(*r))
|
|
|
+ .count()
|
|
|
+ >= min_records
|
|
|
+ {
|
|
|
+ self.reads.extend(phase.reads.clone().into_iter());
|
|
|
+ self.data.extend(phase.data.clone().into_iter());
|
|
|
+ self.data.sort_by(|a, b| a.position.cmp(&b.position));
|
|
|
+ true
|
|
|
+ } else {
|
|
|
+ false
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn range(&self, bam: &mut IndexedReader) -> Result<(i64, i64, i64, i64)> {
|
|
|
+ let mut phase = self.clone();
|
|
|
+ phase.sort_dedup();
|
|
|
+ let data = &self.data;
|
|
|
+
|
|
|
+ let left = data.first().unwrap();
|
|
|
+ let right = data.last().unwrap();
|
|
|
+
|
|
|
+ let left_records = pileup::records_at_base(bam, &left.chr, left.position, true)?;
|
|
|
+ let right_records = pileup::records_at_base(bam, &right.chr, right.position, true)?;
|
|
|
+
|
|
|
+ let left_starts: Vec<_> = left_records
|
|
|
+ .iter()
|
|
|
+ .filter(|(_, b)| *b == left.base)
|
|
|
+ .map(|(r, _)| r.pos())
|
|
|
+ .collect();
|
|
|
+ let right_ends: Vec<_> = right_records
|
|
|
+ .iter()
|
|
|
+ .filter(|(_, b)| *b == right.base)
|
|
|
+ .map(|(r, _)| r.pos() + r.seq_len() as i64)
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let min = left_starts.iter().min();
|
|
|
+ let min_cov = left_starts.iter().max();
|
|
|
+ let max_cov = right_ends.iter().min();
|
|
|
+ let max = right_ends.iter().max();
|
|
|
+
|
|
|
+ if min.is_some() && min_cov.is_some() && max.is_some() && max_cov.is_some() {
|
|
|
+ Ok((
|
|
|
+ *min.unwrap(),
|
|
|
+ *min_cov.unwrap(),
|
|
|
+ *max_cov.unwrap(),
|
|
|
+ *max.unwrap(),
|
|
|
+ ))
|
|
|
+ } else {
|
|
|
+ Err(anyhow!("problem"))
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn mean_vaf(&self) -> f64 {
|
|
|
+ let vafs: Vec<_> = self.data.iter().map(|s| s.vaf).collect();
|
|
|
+ vafs.iter().sum::<f64>() / vafs.len() as f64
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn sort_dedup(&mut self) {
|
|
|
+ self.data.sort_by(|a, b| a.position.cmp(&b.position));
|
|
|
+ self.data
|
|
|
+ .dedup_by(|a, b| a.position == b.position && a.chr == b.chr && a.base == b.base);
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn bed_string(&self, bam: &mut IndexedReader) -> Result<String> {
|
|
|
+ let first = self.data.first().unwrap();
|
|
|
+ let (min, min_cov, max_cov, max) = self.range(bam)?;
|
|
|
+ Ok(vec![
|
|
|
+ first.chr.to_string(),
|
|
|
+ min_cov.to_string(),
|
|
|
+ max_cov.to_string(),
|
|
|
+ self.mean_vaf().to_string(),
|
|
|
+ ]
|
|
|
+ .join("\t"))
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
/// Orders phases by the genomic position of their first variant.
///
/// NOTE(review): this ordering is keyed on `data[0].position`, while
/// `PartialEq` below compares the `reads` sets — so `a == b` does not
/// imply `a.cmp(&b) == Ordering::Equal`, which technically violates the
/// `Ord` contract. Confirm this is intentional before relying on
/// ordering-based containers (e.g. `BTreeSet`) with `Phase`.
impl Ord for Phase {
    /// Panics if either phase has an empty `data` vector.
    fn cmp(&self, other: &Self) -> Ordering {
        let s = self.data.first().unwrap();
        let other = other.data.first().unwrap();
        s.position.cmp(&other.position)
    }
}
|
|
|
+
|
|
|
impl PartialOrd for Phase {
    /// Delegates to `Ord::cmp` (total order by first-variant position).
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
|
|
|
+
|
|
|
impl PartialEq for Phase {
    /// Two phases are considered equal when they are supported by exactly
    /// the same set of reads; `data` is deliberately not compared.
    fn eq(&self, other: &Self) -> bool {
        self.reads == other.reads
        // self.data.iter().filter(|s| other.data.contains(s)).count() == other.data.len()
    }
}
|
|
|
+
|
|
|
// Marker impl: `eq` above (read-set comparison) is reflexive, so `Eq` holds.
impl Eq for Phase {}
|
|
|
+
|
|
|
+pub fn merge_phases(
|
|
|
+ phases: Vec<Phase>,
|
|
|
+ min_records: usize,
|
|
|
+ takes_n: usize,
|
|
|
+ multi: &MultiProgress,
|
|
|
+) -> Result<Vec<Phase>> {
|
|
|
+ let pg = multi.add(new_pg_speed(phases.len() as u64));
|
|
|
+ pg.set_message(format!("Phasing by {takes_n}"));
|
|
|
+
|
|
|
+ let mut merged_phases = Vec::new();
|
|
|
+ let mut round_merges = 0;
|
|
|
+
|
|
|
+ let mut phases = VecDeque::from_iter(phases.into_iter());
|
|
|
+ while let Some(phase) = phases.pop_front() {
|
|
|
+ let (s, r) = spawn_phase(&phase, min_records);
|
|
|
+ phases
|
|
|
+ .par_iter()
|
|
|
+ .take(takes_n)
|
|
|
+ .enumerate()
|
|
|
+ .for_each(|(i, p)| s.send(MessagesIn::Phase((i, p.clone()))).unwrap());
|
|
|
+
|
|
|
+ s.send(MessagesIn::Return)?;
|
|
|
+
|
|
|
+ let mut merged_ids = Vec::new();
|
|
|
+ loop {
|
|
|
+ match r.recv() {
|
|
|
+ std::result::Result::Ok(msg) => match msg {
|
|
|
+ MessagesOut::Phase(p) => {
|
|
|
+ merged_phases.push(p);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ MessagesOut::Merged((i, b)) => {
|
|
|
+ if b {
|
|
|
+ round_merges += 1;
|
|
|
+ merged_ids.push(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ },
|
|
|
+ Err(e) => warn!("{e}"),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let n_merged = merged_ids.len();
|
|
|
+ if n_merged > 0 {
|
|
|
+ info!("Merged {}", merged_ids.len());
|
|
|
+ }
|
|
|
+
|
|
|
+ // Remove merged from phases.
|
|
|
+ pg.set_length(pg.length().unwrap() as u64 - merged_ids.len() as u64);
|
|
|
+ phases = phases
|
|
|
+ .iter()
|
|
|
+ .enumerate()
|
|
|
+ .filter(|(i, _)| !merged_ids.contains(i))
|
|
|
+ .map(|(_, p)| p.to_owned())
|
|
|
+ .collect();
|
|
|
+ merged_ids.clear();
|
|
|
+
|
|
|
+ if phases.len() == 0 && round_merges > 0 {
|
|
|
+ phases.extend(merged_phases.clone().into_iter());
|
|
|
+ merged_phases.clear();
|
|
|
+ warn!("Round merges {round_merges}");
|
|
|
+ round_merges = 0;
|
|
|
+ pg.reset();
|
|
|
+ pg.set_length(phases.len() as u64);
|
|
|
+ }
|
|
|
+ pg.inc(1);
|
|
|
+ }
|
|
|
+ pg.finish();
|
|
|
+ multi.remove(&pg);
|
|
|
+
|
|
|
+ let phases = merged_phases.clone();
|
|
|
+
|
|
|
+ Ok(phases)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn variants_phasing(
|
|
|
+ variants: Vec<Variant>,
|
|
|
+ bam_path: &str,
|
|
|
+ min_records: usize,
|
|
|
+ multi: &MultiProgress,
|
|
|
+) -> Vec<Phase> {
|
|
|
+ info!("{} variants to analyse.", variants.len());
|
|
|
+ let pg = multi.add(new_pg_speed(variants.len() as u64));
|
|
|
+ pg.set_message("Parsing reads at SNPs positions");
|
|
|
+ let mut phases = variants
|
|
|
+ .par_chunks(2_000)
|
|
|
+ .flat_map(|chunks| {
|
|
|
+ let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path).unwrap();
|
|
|
+ let mut errors = Vec::new();
|
|
|
+ let mut phases: Vec<Phase> = Vec::new();
|
|
|
+ for v in chunks {
|
|
|
+ pg.inc(1);
|
|
|
+ let mut reference = v.reference.to_string().as_bytes().to_vec();
|
|
|
+ let mut altenative = v.alternative.to_string().as_bytes().to_vec();
|
|
|
+ if reference.len() != 1 && altenative.len() != 1 {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ match HeteroVar::new(
|
|
|
+ &mut bam,
|
|
|
+ &v.contig.to_string(),
|
|
|
+ v.position as i32,
|
|
|
+ reference.pop().unwrap(),
|
|
|
+ altenative.pop().unwrap(),
|
|
|
+ ) {
|
|
|
+ std::result::Result::Ok((a, b)) => {
|
|
|
+ phases.push(Phase::new(&a));
|
|
|
+ phases.push(Phase::new(&b));
|
|
|
+ }
|
|
|
+ Err(e) => errors.push(e),
|
|
|
+ }
|
|
|
+ }
|
|
|
+ phases
|
|
|
+ })
|
|
|
+ .collect::<Vec<Phase>>();
|
|
|
+
|
|
|
+ pg.finish();
|
|
|
+ multi.remove(&pg);
|
|
|
+ phases.sort();
|
|
|
+ phases.dedup();
|
|
|
+
|
|
|
+ let mut phases: Vec<Phase> = phases
|
|
|
+ .par_chunks(phases.len() / 75)
|
|
|
+ .flat_map(|chunks| merge_phases(chunks.to_vec(), min_records, 5, &multi).unwrap())
|
|
|
+ .collect();
|
|
|
+ phases.sort();
|
|
|
+ let phases: Vec<Phase> = phases
|
|
|
+ .par_chunks(phases.len() / 50)
|
|
|
+ .flat_map(|chunks| merge_phases(chunks.to_vec(), min_records, 100, &multi).unwrap())
|
|
|
+ .collect();
|
|
|
+ let phases: Vec<Phase> = phases
|
|
|
+ .par_chunks(phases.len() / 25)
|
|
|
+ .flat_map(|chunks| merge_phases(chunks.to_vec(), min_records, 1000, &multi).unwrap())
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ phases
|
|
|
+}
|
|
|
+
|
|
|
+pub fn write_phases_bed(
|
|
|
+ phases: &Vec<Phase>,
|
|
|
+ min_var: usize,
|
|
|
+ bam_path: &str,
|
|
|
+ contig: &str,
|
|
|
+ file: &str,
|
|
|
+) -> Result<()> {
|
|
|
+ let format = CustomFormat::builder()
|
|
|
+ .grouping(Grouping::Standard)
|
|
|
+ .minus_sign("-")
|
|
|
+ .separator(",")
|
|
|
+ .build()
|
|
|
+ .unwrap();
|
|
|
+
|
|
|
+ let mut ranges: Vec<_> = phases
|
|
|
+ .par_chunks(100)
|
|
|
+ .flat_map(|chunks| {
|
|
|
+ let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path).unwrap();
|
|
|
+ chunks
|
|
|
+ .to_vec()
|
|
|
+ .iter()
|
|
|
+ .map(|p| (p.range(&mut bam), p.mean_vaf(), p.data.len()))
|
|
|
+ .collect::<Vec<_>>()
|
|
|
+ })
|
|
|
+ .filter(|(r, _, _)| r.is_ok())
|
|
|
+ .map(|(r, v, n)| (r.unwrap(), v, n))
|
|
|
+ .map(|((min, min_cov, max_cov, max), v, n)| (min..=max, min_cov..=max_cov, v, n))
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ ranges.sort_by(|(a, _, _, _), (b, _, _, _)| a.start().cmp(&b.start()));
|
|
|
+ let mut w = OpenOptions::new()
|
|
|
+ .read(true)
|
|
|
+ .write(true)
|
|
|
+ .create(true)
|
|
|
+ .append(true)
|
|
|
+ .open(file)?;
|
|
|
+
|
|
|
+ for (_i, (range, _cov, vaf, n)) in ranges.iter().enumerate() {
|
|
|
+ if *n < min_var {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ let line = vec![
|
|
|
+ contig.to_string(),
|
|
|
+ range.start().to_string(),
|
|
|
+ range.end().to_string(),
|
|
|
+ format!(
|
|
|
+ "{} {} {}",
|
|
|
+ (range.end() - range.start()).to_formatted_string(&format),
|
|
|
+ n,
|
|
|
+ vaf
|
|
|
+ ),
|
|
|
+ ]
|
|
|
+ .join("\t");
|
|
|
+ writeln!(&mut w, "{line}")?;
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+}
|
|
|
+
|
|
|
/// Messages sent *to* the worker thread created by `spawn_phase`.
pub enum MessagesIn {
    /// A candidate phase, tagged with its index in the caller's queue,
    /// for the worker to try to merge.
    Phase((usize, Phase)),
    /// Ask the worker to send back its accumulated phase and stop.
    Return,
}
/// Messages sent *back* from the worker thread.
pub enum MessagesOut {
    /// The worker's final accumulated phase (reply to `MessagesIn::Return`).
    Phase(Phase),
    /// Whether the candidate at the given queue index was merged.
    Merged((usize, bool)),
}
|
|
|
+
|
|
|
+pub fn spawn_phase(
|
|
|
+ phase: &Phase,
|
|
|
+ min_records: usize,
|
|
|
+) -> (Sender<MessagesIn>, Receiver<MessagesOut>) {
|
|
|
+ let (s, r) = unbounded::<MessagesIn>();
|
|
|
+ let (ss, rr) = unbounded::<MessagesOut>();
|
|
|
+ let mut phase = phase.clone();
|
|
|
+ spawn(move || loop {
|
|
|
+ match r.recv() {
|
|
|
+ std::result::Result::Ok(msg) => match msg {
|
|
|
+ MessagesIn::Phase((i, p)) => {
|
|
|
+ let res = phase.try_merge_phase(&p, min_records);
|
|
|
+ ss.send(MessagesOut::Merged((i, res))).unwrap();
|
|
|
+ }
|
|
|
+ MessagesIn::Return => {
|
|
|
+ ss.send(MessagesOut::Phase(phase)).unwrap();
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ },
|
|
|
+ Err(e) => panic!("Err {e}"),
|
|
|
+ };
|
|
|
+ });
|
|
|
+ (s, rr)
|
|
|
+}
|