|
|
@@ -106,15 +106,19 @@ impl PartialEq for HeteroVar {
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
pub struct Phase {
|
|
|
+ pub id: Option<String>,
|
|
|
pub data: Vec<HeteroVar>,
|
|
|
pub reads: HashSet<String>,
|
|
|
+ pub range: Option<(u64, u64, u64, u64)>,
|
|
|
}
|
|
|
|
|
|
impl Phase {
|
|
|
pub fn new(hetero_var: &HeteroVar) -> Self {
|
|
|
Self {
|
|
|
+ id: None,
|
|
|
data: vec![hetero_var.clone()],
|
|
|
reads: hetero_var.reads.clone(),
|
|
|
+ range: None,
|
|
|
}
|
|
|
}
|
|
|
pub fn try_merge(&mut self, hetero_var: &HeteroVar, min_records: usize) -> bool {
|
|
|
@@ -151,17 +155,15 @@ impl Phase {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- pub fn range(
|
|
|
- &self,
|
|
|
+ pub fn make_range(
|
|
|
+ &mut self,
|
|
|
bam_a: &mut IndexedReader,
|
|
|
bam_b: &mut IndexedReader,
|
|
|
- ) -> Result<(i64, i64, i64, i64)> {
|
|
|
- let mut phase = self.clone();
|
|
|
- phase.sort_dedup();
|
|
|
- let data = &self.data;
|
|
|
+ ) -> Result<()> {
|
|
|
+ self.sort_dedup();
|
|
|
|
|
|
- let left = data.first().unwrap();
|
|
|
- let right = data.last().unwrap();
|
|
|
+ let left = self.data.first().unwrap();
|
|
|
+ let right = self.data.last().unwrap();
|
|
|
|
|
|
let mut left_records =
|
|
|
pandora_lib_pileup::records_at_base(bam_a, &left.chr, left.position, false)?;
|
|
|
@@ -184,12 +186,12 @@ impl Phase {
|
|
|
let left_starts: Vec<_> = left_records
|
|
|
.iter()
|
|
|
.filter(|(_, b)| *b == left.base)
|
|
|
- .map(|(r, _)| r.pos())
|
|
|
+ .map(|(r, _)| r.pos() as u64)
|
|
|
.collect();
|
|
|
let right_ends: Vec<_> = right_records
|
|
|
.iter()
|
|
|
.filter(|(_, b)| *b == right.base)
|
|
|
- .map(|(r, _)| r.pos() + r.seq_len() as i64)
|
|
|
+ .map(|(r, _)| (r.pos() as usize + r.seq_len()) as u64)
|
|
|
.collect();
|
|
|
|
|
|
let min = left_starts.iter().min();
|
|
|
@@ -205,7 +207,8 @@ impl Phase {
|
|
|
}
|
|
|
|
|
|
if let (Some(min), Some(min_cov), Some(max_cov), Some(max)) = (min, min_cov, max_cov, max) {
|
|
|
- Ok((*min, *min_cov, *max_cov, *max))
|
|
|
+ self.range = Some((*min, *min_cov, *max_cov, *max));
|
|
|
+ Ok(())
|
|
|
} else {
|
|
|
Err(anyhow!(format!("can't find range {:#?}", self)))
|
|
|
}
|
|
|
@@ -222,38 +225,35 @@ impl Phase {
|
|
|
.dedup_by(|a, b| a.position == b.position && a.chr == b.chr && a.base == b.base);
|
|
|
}
|
|
|
|
|
|
- pub fn bed_string(
|
|
|
- &self,
|
|
|
- bam_a: &mut IndexedReader,
|
|
|
- bam_b: &mut IndexedReader,
|
|
|
- ) -> Result<String> {
|
|
|
- let first = self.data.first().unwrap();
|
|
|
- let (_min, min_cov, max_cov, _max) = self.range(bam_a, bam_b)?;
|
|
|
- Ok([
|
|
|
- first.chr.to_string(),
|
|
|
- min_cov.to_string(),
|
|
|
- max_cov.to_string(),
|
|
|
- self.mean_vaf().to_string(),
|
|
|
- ]
|
|
|
- .join("\t"))
|
|
|
+ pub fn bed_string(&self) -> Option<String> {
|
|
|
+ if let (Some((min, _min_cov, _max_cov, max)), Some(id)) = (self.range, self.id()) {
|
|
|
+ Some([
|
|
|
+ self.data.first().unwrap().chr.to_string(),
|
|
|
+ min.to_string(),
|
|
|
+ max.to_string(),
|
|
|
+ format!("{} {:.2}", id, self.mean_vaf()),
|
|
|
+ ]
|
|
|
+ .join("\t"))
|
|
|
+ } else {
|
|
|
+ None
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
- pub fn id(
|
|
|
- &mut self,
|
|
|
- bam_a: &mut IndexedReader,
|
|
|
- bam_b: &mut IndexedReader,
|
|
|
- ) -> anyhow::Result<String> {
|
|
|
- self.sort_dedup();
|
|
|
- let synteny = String::from_utf8(self.data.iter().map(|var| var.base).collect())?;
|
|
|
- let (min, _, _, max) = self.range(bam_a, bam_b)?;
|
|
|
- let f = self.data.first().unwrap().chr.clone();
|
|
|
- let l = self.data.last().unwrap().chr.clone();
|
|
|
- let b = if l != f {
|
|
|
- format!("{l}:")
|
|
|
+ pub fn id(&self) -> Option<String> {
|
|
|
+ let synteny = String::from_utf8(self.data.iter().map(|var| var.base).collect()).unwrap();
|
|
|
+ if let Some((min, _, _, max)) = self.range {
|
|
|
+ let f = self.data.first().unwrap().chr.clone();
|
|
|
+ let l = self.data.last().unwrap().chr.clone();
|
|
|
+ let b = if l != f {
|
|
|
+ format!("{l}:")
|
|
|
+ } else {
|
|
|
+ "".to_string()
|
|
|
+ };
|
|
|
+ Some(format!("{}:{min}-{}{max}[{synteny}]", f, b))
|
|
|
} else {
|
|
|
- "".to_string()
|
|
|
- };
|
|
|
- Ok(format!("{}:{min}-{}{max}[{synteny}]", f, b))
|
|
|
+ None
|
|
|
+ }
|
|
|
+
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -412,46 +412,32 @@ pub fn variants_phasing(
|
|
|
.par_chunks(phases.len() / 50)
|
|
|
.flat_map(|chunks| merge_phases(chunks.to_vec(), min_records, 100, multi).unwrap())
|
|
|
.collect();
|
|
|
- let phases: Vec<Phase> = phases
|
|
|
+ let mut phases: Vec<Phase> = phases
|
|
|
.par_chunks(phases.len() / 25)
|
|
|
.flat_map(|chunks| merge_phases(chunks.to_vec(), min_records, 1000, multi).unwrap())
|
|
|
.collect();
|
|
|
|
|
|
+ let chunk_size = phases.len() / 25;
|
|
|
+ phases.par_chunks_mut(chunk_size).for_each(|chunk| {
|
|
|
+ let mut bam_a = rust_htslib::bam::IndexedReader::from_path(bam_path_a).unwrap();
|
|
|
+ let mut bam_b = rust_htslib::bam::IndexedReader::from_path(bam_path_b).unwrap();
|
|
|
+ chunk.iter_mut().for_each(|p| {
|
|
|
+ if let Err(e) = p.make_range(&mut bam_a, &mut bam_b) {
|
|
|
+ warn!("{e}");
|
|
|
+ }
|
|
|
+ });
|
|
|
+ });
|
|
|
+
|
|
|
+ phases.par_sort_by(|a, b| a.range.unwrap().0.cmp(&b.range.unwrap().0));
|
|
|
+
|
|
|
phases
|
|
|
}
|
|
|
|
|
|
pub fn write_phases_bed(
|
|
|
- phases: &Vec<Phase>,
|
|
|
- min_var: usize,
|
|
|
- bam_path_a: &str,
|
|
|
- bam_path_b: &str,
|
|
|
- contig: &str,
|
|
|
+ phases: &[Phase], // should be already sorted and with ranges
|
|
|
file: &str,
|
|
|
) -> Result<()> {
|
|
|
- let format = CustomFormat::builder()
|
|
|
- .grouping(Grouping::Standard)
|
|
|
- .minus_sign("-")
|
|
|
- .separator(",")
|
|
|
- .build()
|
|
|
- .unwrap();
|
|
|
-
|
|
|
- let mut ranges: Vec<_> = phases
|
|
|
- .par_chunks(100)
|
|
|
- .flat_map(|chunks| {
|
|
|
- let mut bam_a = rust_htslib::bam::IndexedReader::from_path(bam_path_a).unwrap();
|
|
|
- let mut bam_b = rust_htslib::bam::IndexedReader::from_path(bam_path_b).unwrap();
|
|
|
- chunks
|
|
|
- .to_vec()
|
|
|
- .iter()
|
|
|
- .map(|p| (p.range(&mut bam_a, &mut bam_b), p.mean_vaf(), p.data.len()))
|
|
|
- .collect::<Vec<_>>()
|
|
|
- })
|
|
|
- .filter(|(r, _, _)| r.is_ok())
|
|
|
- .map(|(r, v, n)| (r.unwrap(), v, n))
|
|
|
- .map(|((min, min_cov, max_cov, max), v, n)| (min..=max, min_cov..=max_cov, v, n))
|
|
|
- .collect();
|
|
|
-
|
|
|
- ranges.sort_by(|(a, _, _, _), (b, _, _, _)| a.start().cmp(b.start()));
|
|
|
+
|
|
|
let mut w = OpenOptions::new()
|
|
|
.read(true)
|
|
|
// .write(true)
|
|
|
@@ -459,23 +445,10 @@ pub fn write_phases_bed(
|
|
|
.append(true)
|
|
|
.open(file)?;
|
|
|
|
|
|
- for (range, _cov, vaf, n) in ranges.iter() {
|
|
|
- if *n < min_var {
|
|
|
- continue;
|
|
|
+ for phase in phases.iter() {
|
|
|
+ if let Some(s) = phase.bed_string() {
|
|
|
+ writeln!(&mut w, "{s}")?;
|
|
|
}
|
|
|
- let line = [
|
|
|
- contig.to_string(),
|
|
|
- range.start().to_string(),
|
|
|
- range.end().to_string(),
|
|
|
- format!(
|
|
|
- "{} {} {}",
|
|
|
- (range.end() - range.start()).to_formatted_string(&format),
|
|
|
- n,
|
|
|
- vaf
|
|
|
- ),
|
|
|
- ]
|
|
|
- .join("\t");
|
|
|
- writeln!(&mut w, "{line}")?;
|
|
|
}
|
|
|
Ok(())
|
|
|
}
|
|
|
@@ -559,10 +532,7 @@ impl PhaserConfig {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-pub fn phase(
|
|
|
- config: PhaserConfig,
|
|
|
- progress: MultiProgress,
|
|
|
-) -> anyhow::Result<()> {
|
|
|
+pub fn phase(config: PhaserConfig, progress: MultiProgress) -> anyhow::Result<()> {
|
|
|
let format = CustomFormat::builder()
|
|
|
.grouping(Grouping::Standard)
|
|
|
.minus_sign("-")
|
|
|
@@ -627,6 +597,7 @@ pub fn phase(
|
|
|
&phases,
|
|
|
&format!("{phases_dir}/{id}_{contig}_phases.postcard.gz"),
|
|
|
)?;
|
|
|
+ write_phases_bed(&phases, &format!("{phases_dir}/{id}_{contig}.bed"))?;
|
|
|
}
|
|
|
}
|
|
|
}
|