Thomas 1 jaar geleden
bovenliggende
commit
a52db1b2c4
2 gewijzigde bestanden met toevoegingen van 64 en 99 verwijderingen
  1. 3 9
      src/lib.rs
  2. 61 90
      src/phase.rs

+ 3 - 9
src/lib.rs

@@ -281,7 +281,6 @@ pub fn par_whole_scan(contig: &str, bam_path: &str, out_dir: &str) -> anyhow::Re
 mod tests {
     use indicatif::MultiProgress;
     use indicatif_log_bridge::LogWrapper;
-    use rust_htslib::bam::IndexedReader;
 
     use crate::phase::{load_phases, phase, PhaserConfig};
 
@@ -332,14 +331,9 @@ mod tests {
         let p = load_phases(&phase_path)?;
         info!("{} phases", p.len());
 
-        let bam_path_a = &format!("/data/longreads_basic_pipe/{id}/diag/{id}_diag_hs1.bam");
-        let bam_path_b = &format!("/data/longreads_basic_pipe/{id}/mrd/{id}_mrd_hs1.bam");
-        let mut bam_a = IndexedReader::from_path(bam_path_a)?;
-        let mut bam_b = IndexedReader::from_path(bam_path_b)?;
-
-        for mut phase in p {
-            if phase.data.len() > 1 {
-                info!("{}\t{}", phase.id(&mut bam_a, &mut bam_b)?, phase.mean_vaf());
+        for phase in p {
+            if let Some(phase_id) = &phase.id {
+                info!("{}\t{}", phase_id, phase.mean_vaf());
             }
         }
         Ok(())

+ 61 - 90
src/phase.rs

@@ -106,15 +106,19 @@ impl PartialEq for HeteroVar {
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Phase {
+    pub id: Option<String>,
     pub data: Vec<HeteroVar>,
     pub reads: HashSet<String>,
+    pub range: Option<(u64, u64, u64, u64)>,
 }
 
 impl Phase {
     pub fn new(hetero_var: &HeteroVar) -> Self {
         Self {
+            id: None,
             data: vec![hetero_var.clone()],
             reads: hetero_var.reads.clone(),
+            range: None,
         }
     }
     pub fn try_merge(&mut self, hetero_var: &HeteroVar, min_records: usize) -> bool {
@@ -151,17 +155,15 @@ impl Phase {
         }
     }
 
-    pub fn range(
-        &self,
+    pub fn make_range(
+        &mut self,
         bam_a: &mut IndexedReader,
         bam_b: &mut IndexedReader,
-    ) -> Result<(i64, i64, i64, i64)> {
-        let mut phase = self.clone();
-        phase.sort_dedup();
-        let data = &self.data;
+    ) -> Result<()> {
+        self.sort_dedup();
 
-        let left = data.first().unwrap();
-        let right = data.last().unwrap();
+        let left = self.data.first().unwrap();
+        let right = self.data.last().unwrap();
 
         let mut left_records =
             pandora_lib_pileup::records_at_base(bam_a, &left.chr, left.position, false)?;
@@ -184,12 +186,12 @@ impl Phase {
         let left_starts: Vec<_> = left_records
             .iter()
             .filter(|(_, b)| *b == left.base)
-            .map(|(r, _)| r.pos())
+            .map(|(r, _)| r.pos() as u64)
             .collect();
         let right_ends: Vec<_> = right_records
             .iter()
             .filter(|(_, b)| *b == right.base)
-            .map(|(r, _)| r.pos() + r.seq_len() as i64)
+            .map(|(r, _)| (r.pos() as usize + r.seq_len()) as u64)
             .collect();
 
         let min = left_starts.iter().min();
@@ -205,7 +207,8 @@ impl Phase {
         }
 
         if let (Some(min), Some(min_cov), Some(max_cov), Some(max)) = (min, min_cov, max_cov, max) {
-            Ok((*min, *min_cov, *max_cov, *max))
+            self.range = Some((*min, *min_cov, *max_cov, *max));
+            Ok(())
         } else {
             Err(anyhow!(format!("can't find range {:#?}", self)))
         }
@@ -222,38 +225,35 @@ impl Phase {
             .dedup_by(|a, b| a.position == b.position && a.chr == b.chr && a.base == b.base);
     }
 
-    pub fn bed_string(
-        &self,
-        bam_a: &mut IndexedReader,
-        bam_b: &mut IndexedReader,
-    ) -> Result<String> {
-        let first = self.data.first().unwrap();
-        let (_min, min_cov, max_cov, _max) = self.range(bam_a, bam_b)?;
-        Ok([
-            first.chr.to_string(),
-            min_cov.to_string(),
-            max_cov.to_string(),
-            self.mean_vaf().to_string(),
-        ]
-        .join("\t"))
+    pub fn bed_string(&self) -> Option<String> {
+        if let (Some((min, _min_cov, _max_cov, max)), Some(id)) = (self.range, self.id()) {
+            Some([
+                self.data.first().unwrap().chr.to_string(),
+                min.to_string(),
+                max.to_string(),
+                format!("{} {:.2}", id, self.mean_vaf()),
+            ]
+            .join("\t"))
+        } else {
+            None
+        }
     }
 
-    pub fn id(
-        &mut self,
-        bam_a: &mut IndexedReader,
-        bam_b: &mut IndexedReader,
-    ) -> anyhow::Result<String> {
-        self.sort_dedup();
-        let synteny = String::from_utf8(self.data.iter().map(|var| var.base).collect())?;
-        let (min, _, _, max) = self.range(bam_a, bam_b)?;
-        let f = self.data.first().unwrap().chr.clone();
-        let l = self.data.last().unwrap().chr.clone();
-        let b = if l != f {
-            format!("{l}:")
+    pub fn id(&self) -> Option<String> {
+        let synteny = String::from_utf8(self.data.iter().map(|var| var.base).collect()).unwrap();
+        if let Some((min, _, _, max)) = self.range {
+            let f = self.data.first().unwrap().chr.clone();
+            let l = self.data.last().unwrap().chr.clone();
+            let b = if l != f {
+                format!("{l}:")
+            } else {
+                "".to_string()
+            };
+            Some(format!("{}:{min}-{}{max}[{synteny}]", f, b))
         } else {
-            "".to_string()
-        };
-        Ok(format!("{}:{min}-{}{max}[{synteny}]", f, b))
+            None
+        }
+        
     }
 }
 
@@ -412,46 +412,32 @@ pub fn variants_phasing(
         .par_chunks(phases.len() / 50)
         .flat_map(|chunks| merge_phases(chunks.to_vec(), min_records, 100, multi).unwrap())
         .collect();
-    let phases: Vec<Phase> = phases
+    let mut phases: Vec<Phase> = phases
         .par_chunks(phases.len() / 25)
         .flat_map(|chunks| merge_phases(chunks.to_vec(), min_records, 1000, multi).unwrap())
         .collect();
 
+    let chunk_size = phases.len() / 25;
+    phases.par_chunks_mut(chunk_size).for_each(|chunk| {
+        let mut bam_a = rust_htslib::bam::IndexedReader::from_path(bam_path_a).unwrap();
+        let mut bam_b = rust_htslib::bam::IndexedReader::from_path(bam_path_b).unwrap();
+        chunk.iter_mut().for_each(|p| {
+            if let Err(e) = p.make_range(&mut bam_a, &mut bam_b) {
+                warn!("{e}");
+            }
+        });
+    });
+
+    phases.par_sort_by(|a, b| a.range.unwrap().0.cmp(&b.range.unwrap().0));
+
     phases
 }
 
 pub fn write_phases_bed(
-    phases: &Vec<Phase>,
-    min_var: usize,
-    bam_path_a: &str,
-    bam_path_b: &str,
-    contig: &str,
+    phases: &[Phase], // should be already sorted and with ranges
     file: &str,
 ) -> Result<()> {
-    let format = CustomFormat::builder()
-        .grouping(Grouping::Standard)
-        .minus_sign("-")
-        .separator(",")
-        .build()
-        .unwrap();
-
-    let mut ranges: Vec<_> = phases
-        .par_chunks(100)
-        .flat_map(|chunks| {
-            let mut bam_a = rust_htslib::bam::IndexedReader::from_path(bam_path_a).unwrap();
-            let mut bam_b = rust_htslib::bam::IndexedReader::from_path(bam_path_b).unwrap();
-            chunks
-                .to_vec()
-                .iter()
-                .map(|p| (p.range(&mut bam_a, &mut bam_b), p.mean_vaf(), p.data.len()))
-                .collect::<Vec<_>>()
-        })
-        .filter(|(r, _, _)| r.is_ok())
-        .map(|(r, v, n)| (r.unwrap(), v, n))
-        .map(|((min, min_cov, max_cov, max), v, n)| (min..=max, min_cov..=max_cov, v, n))
-        .collect();
-
-    ranges.sort_by(|(a, _, _, _), (b, _, _, _)| a.start().cmp(b.start()));
+       
     let mut w = OpenOptions::new()
         .read(true)
         // .write(true)
@@ -459,23 +445,10 @@ pub fn write_phases_bed(
         .append(true)
         .open(file)?;
 
-    for (range, _cov, vaf, n) in ranges.iter() {
-        if *n < min_var {
-            continue;
+    for phase in phases.iter() {
+        if let Some(s) = phase.bed_string() {
+            writeln!(&mut w, "{s}")?;
         }
-        let line = [
-            contig.to_string(),
-            range.start().to_string(),
-            range.end().to_string(),
-            format!(
-                "{} {} {}",
-                (range.end() - range.start()).to_formatted_string(&format),
-                n,
-                vaf
-            ),
-        ]
-        .join("\t");
-        writeln!(&mut w, "{line}")?;
     }
     Ok(())
 }
@@ -559,10 +532,7 @@ impl PhaserConfig {
     }
 }
 
-pub fn phase(
-    config: PhaserConfig,
-    progress: MultiProgress,
-) -> anyhow::Result<()> {
+pub fn phase(config: PhaserConfig, progress: MultiProgress) -> anyhow::Result<()> {
     let format = CustomFormat::builder()
         .grouping(Grouping::Standard)
         .minus_sign("-")
@@ -627,6 +597,7 @@ pub fn phase(
                     &phases,
                     &format!("{phases_dir}/{id}_{contig}_phases.postcard.gz"),
                 )?;
+                write_phases_bed(&phases, &format!("{phases_dir}/{id}_{contig}.bed"))?;
             }
         }
     }