Thomas 1 tahun lalu
induk
melakukan
65cf78f893
2 mengubah file dengan 121 tambahan dan 94 penghapusan
  1. 3 75
      src/lib.rs
  2. 118 19
      src/phase.rs

+ 3 - 75
src/lib.rs

@@ -281,11 +281,9 @@ pub fn par_whole_scan(contig: &str, bam_path: &str, out_dir: &str) -> anyhow::Re
 mod tests {
     use indicatif::MultiProgress;
     use indicatif_log_bridge::LogWrapper;
-    use num_format::{CustomFormat, Grouping, ToFormattedString};
-    use pandora_lib_variants::{in_out::dict_reader::read_dict, variants::{AlterationCategory, Variant}};
     use rust_htslib::bam::IndexedReader;
 
-    use crate::phase::{load_phases, save_phases, variants_phasing};
+    use crate::phase::{load_phases, phase, PhaserConfig};
 
     use super::*;
 
@@ -314,84 +312,14 @@ mod tests {
         let id = "SALICETTO";
         let min_records = 2;
 
-        let format = CustomFormat::builder()
-            .grouping(Grouping::Standard)
-            .minus_sign("-")
-            .separator("_")
-            .build()
-            .unwrap();
-
         let logger =
             env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
                 .build();
         let multi = MultiProgress::new();
         LogWrapper::new(multi.clone(), logger).try_init().unwrap();
 
-        let bam_path_a = &format!("/data/longreads_basic_pipe/{id}/diag/{id}_diag_hs1.bam");
-        let bam_path_b = &format!("/data/longreads_basic_pipe/{id}/mrd/{id}_mrd_hs1.bam");
-        let somatic_path =
-            &format!("/data/longreads_basic_pipe/{id}/diag/{id}_constit.bytes.gz");
-        let phases_dir = format!("/data/longreads_basic_pipe/{id}/diag/phases");
-        fs::create_dir_all(&phases_dir)?;
-
-        let variants =
-            pandora_lib_variants::variants::Variants::new_from_bytes(id, somatic_path, multi.clone())?;
-
-        info!(
-            "Variants loaded {}.",
-            variants.len().to_formatted_string(&format)
-        );
-
-        let mut variants: Vec<Variant> = variants
-            .data
-            .into_par_iter()
-            .filter(|v| {
-                let mut v = v.clone();
-                v.vaf() > 0.4 && v.vaf() < 0.6
-            })
-            .filter(|v| {
-                matches!(v.alt_cat(), AlterationCategory::Snv)
-            })
-            .collect();
-
-        let mut contigs = HashSet::new();
-        variants.iter().for_each(|v| {
-            contigs.insert(v.contig.to_string());
-        });
-
-        variants.par_sort_by(|a, b| a.position.cmp(&b.position));
-
-        let dict = read_dict("/data/ref/hs1/chm13v2.0.dict")?;
-        for (contig, _) in dict {
-            if !contigs.contains(&contig) {
-                continue;
-            }
-            // if contig != *"chr9" {
-            //     continue;
-            // }
-
-            let v: Vec<_> = variants
-                .clone()
-                .into_par_iter()
-                .filter(|v| v.contig == contig)
-                .collect();
-            if variants.len() > 1 {
-                info!("{contig}: {} variants to phase", v.len());
-                let phases = variants_phasing(v, bam_path_a, bam_path_b, min_records, &multi);
-                if !phases.is_empty() {
-                    save_phases(&phases, &format!("{phases_dir}/{id}_{contig}_phases.postcard.gz"))?;
-                }
-                // info!("{} phases", phases.len());
-                // let f = phases.first().unwrap();
-                // println!("{f:#?}");
-                // let f = phases.last().unwrap();
-                // println!("{f:#?}");
-               // write_phases_bed(&phases, min_var, bam_path, &contig, bed)?;
-            }
-        }
-
-        // TODO: assign somatic to constit phase
-        Ok(())
+        let config = PhaserConfig::new(id, "/data/longreads_basic_pipe", min_records);
+        phase(config, multi)
     }
 
     #[test]

+ 118 - 19
src/phase.rs

@@ -4,14 +4,18 @@ use flate2::{read::GzDecoder, write::GzEncoder, Compression};
 use indicatif::MultiProgress;
 use log::{info, warn};
 use num_format::{CustomFormat, Grouping, ToFormattedString};
-use pandora_lib_variants::{utils::new_pg_speed, variants::Variant};
+use pandora_lib_variants::{
+    in_out::dict_reader::read_dict,
+    utils::new_pg_speed,
+    variants::{AlterationCategory, Variant},
+};
 use rayon::prelude::*;
 use rust_htslib::bam::IndexedReader;
 use serde::{Deserialize, Serialize};
 use std::{
     cmp::Ordering,
     collections::{HashSet, VecDeque},
-    fs::File,
+    fs::{self, File},
     io::Read,
     thread::spawn,
 };
@@ -218,7 +222,11 @@ impl Phase {
             .dedup_by(|a, b| a.position == b.position && a.chr == b.chr && a.base == b.base);
     }
 
-    pub fn bed_string(&self, bam_a: &mut IndexedReader, bam_b: &mut IndexedReader) -> Result<String> {
+    pub fn bed_string(
+        &self,
+        bam_a: &mut IndexedReader,
+        bam_b: &mut IndexedReader,
+    ) -> Result<String> {
         let first = self.data.first().unwrap();
         let (_min, min_cov, max_cov, _max) = self.range(bam_a, bam_b)?;
         Ok([
@@ -230,7 +238,11 @@ impl Phase {
         .join("\t"))
     }
 
-    pub fn id(&mut self, bam_a: &mut IndexedReader, bam_b: &mut IndexedReader) -> anyhow::Result<String> {
+    pub fn id(
+        &mut self,
+        bam_a: &mut IndexedReader,
+        bam_b: &mut IndexedReader,
+    ) -> anyhow::Result<String> {
         self.sort_dedup();
         let synteny = String::from_utf8(self.data.iter().map(|var| var.base).collect())?;
         let (min, _, _, max) = self.range(bam_a, bam_b)?;
@@ -502,21 +514,6 @@ pub fn spawn_phase(
     (s, rr)
 }
 
-// pub fn save_phases(phases: &Vec<Phase>, filename: &str) -> anyhow::Result<()> {
-//     let bytes = postcard::to_allocvec(phases)?;
-//     let mut file = File::create(filename)?;
-//     file.write_all(&bytes)?;
-//     Ok(())
-// }
-//
-// pub fn load_phases(filename: &str) -> anyhow::Result<Vec<Phase>> {
-//     let mut file = File::open(filename)?;
-//     let mut bytes = Vec::new();
-//     file.read_to_end(&mut bytes)?;
-//     let phases: Vec<Phase> = postcard::from_bytes(&bytes)?;
-//     Ok(phases)
-// }
-
 pub fn save_phases(phases: &Vec<Phase>, filename: &str) -> anyhow::Result<()> {
     let bytes = postcard::to_allocvec(phases).expect("Serialization failed");
     let file = File::create(filename)?;
@@ -534,3 +531,105 @@ pub fn load_phases(filename: &str) -> anyhow::Result<Vec<Phase>> {
     let phases: Vec<Phase> = postcard::from_bytes(&bytes).expect("Deserialization failed");
     Ok(phases)
 }
+
+#[derive(Debug)]
+pub struct PhaserConfig {
+    pub id: String,
+    pub bam_path_a: String,
+    pub bam_path_b: String,
+    pub const_variants: String,
+    pub phases_dir: String,
+    pub min_records: usize,
+}
+
+impl PhaserConfig {
+    pub fn new(id: &str, data_dir: &str, min_records: usize) -> Self {
+        let bam_path_a = format!("{data_dir}/{id}/diag/{id}_diag_hs1.bam");
+        let bam_path_b = format!("{data_dir}/{id}/mrd/{id}_mrd_hs1.bam");
+        let const_variants = format!("{data_dir}/{id}/diag/{id}_constit.bytes.gz");
+        let phases_dir = format!("{data_dir}/{id}/diag/phases");
+        Self {
+            id: id.to_string(),
+            bam_path_a,
+            bam_path_b,
+            const_variants,
+            phases_dir,
+            min_records,
+        }
+    }
+}
+
+pub fn phase(
+    config: PhaserConfig,
+    progress: MultiProgress,
+) -> anyhow::Result<()> {
+    let format = CustomFormat::builder()
+        .grouping(Grouping::Standard)
+        .minus_sign("-")
+        .separator("_")
+        .build()
+        .unwrap();
+
+    let PhaserConfig {
+        id,
+        bam_path_a,
+        bam_path_b,
+        const_variants,
+        phases_dir,
+        min_records,
+    } = config;
+
+    fs::create_dir_all(&phases_dir)?;
+
+    let variants = pandora_lib_variants::variants::Variants::new_from_bytes(
+        &id,
+        &const_variants,
+        progress.clone(),
+    )?;
+
+    info!(
+        "{} variants loaded",
+        variants.len().to_formatted_string(&format)
+    );
+
+    let mut variants: Vec<Variant> = variants
+        .data
+        .into_par_iter()
+        .filter(|v| {
+            let mut v = v.clone();
+            v.vaf() > 0.4 && v.vaf() < 0.6
+        })
+        .filter(|v| matches!(v.alt_cat(), AlterationCategory::Snv))
+        .collect();
+
+    let mut contigs = HashSet::new();
+    variants.iter().for_each(|v| {
+        contigs.insert(v.contig.to_string());
+    });
+
+    variants.par_sort_by(|a, b| a.position.cmp(&b.position));
+
+    let dict = read_dict("/data/ref/hs1/chm13v2.0.dict")?;
+    for (contig, _) in dict {
+        if !contigs.contains(&contig) {
+            continue;
+        }
+        let v: Vec<_> = variants
+            .clone()
+            .into_par_iter()
+            .filter(|v| v.contig == contig)
+            .collect();
+        if variants.len() > 1 {
+            info!("{contig}: {} variants to phase", v.len());
+            let phases = variants_phasing(v, &bam_path_a, &bam_path_b, min_records, &progress);
+            if !phases.is_empty() {
+                save_phases(
+                    &phases,
+                    &format!("{phases_dir}/{id}_{contig}_phases.postcard.gz"),
+                )?;
+            }
+        }
+    }
+
+    Ok(())
+}