|
|
@@ -4,14 +4,18 @@ use flate2::{read::GzDecoder, write::GzEncoder, Compression};
|
|
|
use indicatif::MultiProgress;
|
|
|
use log::{info, warn};
|
|
|
use num_format::{CustomFormat, Grouping, ToFormattedString};
|
|
|
-use pandora_lib_variants::{utils::new_pg_speed, variants::Variant};
|
|
|
+use pandora_lib_variants::{
|
|
|
+ in_out::dict_reader::read_dict,
|
|
|
+ utils::new_pg_speed,
|
|
|
+ variants::{AlterationCategory, Variant},
|
|
|
+};
|
|
|
use rayon::prelude::*;
|
|
|
use rust_htslib::bam::IndexedReader;
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
use std::{
|
|
|
cmp::Ordering,
|
|
|
collections::{HashSet, VecDeque},
|
|
|
- fs::File,
|
|
|
+ fs::{self, File},
|
|
|
io::Read,
|
|
|
thread::spawn,
|
|
|
};
|
|
|
@@ -218,7 +222,11 @@ impl Phase {
|
|
|
.dedup_by(|a, b| a.position == b.position && a.chr == b.chr && a.base == b.base);
|
|
|
}
|
|
|
|
|
|
- pub fn bed_string(&self, bam_a: &mut IndexedReader, bam_b: &mut IndexedReader) -> Result<String> {
|
|
|
+ pub fn bed_string(
|
|
|
+ &self,
|
|
|
+ bam_a: &mut IndexedReader,
|
|
|
+ bam_b: &mut IndexedReader,
|
|
|
+ ) -> Result<String> {
|
|
|
let first = self.data.first().unwrap();
|
|
|
let (_min, min_cov, max_cov, _max) = self.range(bam_a, bam_b)?;
|
|
|
Ok([
|
|
|
@@ -230,7 +238,11 @@ impl Phase {
|
|
|
.join("\t"))
|
|
|
}
|
|
|
|
|
|
- pub fn id(&mut self, bam_a: &mut IndexedReader, bam_b: &mut IndexedReader) -> anyhow::Result<String> {
|
|
|
+ pub fn id(
|
|
|
+ &mut self,
|
|
|
+ bam_a: &mut IndexedReader,
|
|
|
+ bam_b: &mut IndexedReader,
|
|
|
+ ) -> anyhow::Result<String> {
|
|
|
self.sort_dedup();
|
|
|
let synteny = String::from_utf8(self.data.iter().map(|var| var.base).collect())?;
|
|
|
let (min, _, _, max) = self.range(bam_a, bam_b)?;
|
|
|
@@ -502,21 +514,6 @@ pub fn spawn_phase(
|
|
|
(s, rr)
|
|
|
}
|
|
|
|
|
|
-// pub fn save_phases(phases: &Vec<Phase>, filename: &str) -> anyhow::Result<()> {
|
|
|
-// let bytes = postcard::to_allocvec(phases)?;
|
|
|
-// let mut file = File::create(filename)?;
|
|
|
-// file.write_all(&bytes)?;
|
|
|
-// Ok(())
|
|
|
-// }
|
|
|
-//
|
|
|
-// pub fn load_phases(filename: &str) -> anyhow::Result<Vec<Phase>> {
|
|
|
-// let mut file = File::open(filename)?;
|
|
|
-// let mut bytes = Vec::new();
|
|
|
-// file.read_to_end(&mut bytes)?;
|
|
|
-// let phases: Vec<Phase> = postcard::from_bytes(&bytes)?;
|
|
|
-// Ok(phases)
|
|
|
-// }
|
|
|
-
|
|
|
pub fn save_phases(phases: &Vec<Phase>, filename: &str) -> anyhow::Result<()> {
|
|
|
let bytes = postcard::to_allocvec(phases).expect("Serialization failed");
|
|
|
let file = File::create(filename)?;
|
|
|
@@ -534,3 +531,105 @@ pub fn load_phases(filename: &str) -> anyhow::Result<Vec<Phase>> {
|
|
|
let phases: Vec<Phase> = postcard::from_bytes(&bytes).expect("Deserialization failed");
|
|
|
Ok(phases)
|
|
|
}
|
|
|
+
|
|
|
+#[derive(Debug)]
|
|
|
+pub struct PhaserConfig {
|
|
|
+ pub id: String,
|
|
|
+ pub bam_path_a: String,
|
|
|
+ pub bam_path_b: String,
|
|
|
+ pub const_variants: String,
|
|
|
+ pub phases_dir: String,
|
|
|
+ pub min_records: usize,
|
|
|
+}
|
|
|
+
|
|
|
+impl PhaserConfig {
|
|
|
+ pub fn new(id: &str, data_dir: &str, min_records: usize) -> Self {
|
|
|
+ let bam_path_a = format!("{data_dir}/{id}/diag/{id}_diag_hs1.bam");
|
|
|
+ let bam_path_b = format!("{data_dir}/{id}/mrd/{id}_mrd_hs1.bam");
|
|
|
+ let const_variants = format!("{data_dir}/{id}/diag/{id}_constit.bytes.gz");
|
|
|
+ let phases_dir = format!("{data_dir}/{id}/diag/phases");
|
|
|
+ Self {
|
|
|
+ id: id.to_string(),
|
|
|
+ bam_path_a,
|
|
|
+ bam_path_b,
|
|
|
+ const_variants,
|
|
|
+ phases_dir,
|
|
|
+ min_records,
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+pub fn phase(
|
|
|
+ config: PhaserConfig,
|
|
|
+ progress: MultiProgress,
|
|
|
+) -> anyhow::Result<()> {
|
|
|
+ let format = CustomFormat::builder()
|
|
|
+ .grouping(Grouping::Standard)
|
|
|
+ .minus_sign("-")
|
|
|
+ .separator("_")
|
|
|
+ .build()
|
|
|
+ .unwrap();
|
|
|
+
|
|
|
+ let PhaserConfig {
|
|
|
+ id,
|
|
|
+ bam_path_a,
|
|
|
+ bam_path_b,
|
|
|
+ const_variants,
|
|
|
+ phases_dir,
|
|
|
+ min_records,
|
|
|
+ } = config;
|
|
|
+
|
|
|
+ fs::create_dir_all(&phases_dir)?;
|
|
|
+
|
|
|
+ let variants = pandora_lib_variants::variants::Variants::new_from_bytes(
|
|
|
+ &id,
|
|
|
+ &const_variants,
|
|
|
+ progress.clone(),
|
|
|
+ )?;
|
|
|
+
|
|
|
+ info!(
|
|
|
+ "{} variants loaded",
|
|
|
+ variants.len().to_formatted_string(&format)
|
|
|
+ );
|
|
|
+
|
|
|
+ let mut variants: Vec<Variant> = variants
|
|
|
+ .data
|
|
|
+ .into_par_iter()
|
|
|
+ .filter(|v| {
|
|
|
+ let mut v = v.clone();
|
|
|
+ v.vaf() > 0.4 && v.vaf() < 0.6
|
|
|
+ })
|
|
|
+ .filter(|v| matches!(v.alt_cat(), AlterationCategory::Snv))
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let mut contigs = HashSet::new();
|
|
|
+ variants.iter().for_each(|v| {
|
|
|
+ contigs.insert(v.contig.to_string());
|
|
|
+ });
|
|
|
+
|
|
|
+ variants.par_sort_by(|a, b| a.position.cmp(&b.position));
|
|
|
+
|
|
|
+ let dict = read_dict("/data/ref/hs1/chm13v2.0.dict")?;
|
|
|
+ for (contig, _) in dict {
|
|
|
+ if !contigs.contains(&contig) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ let v: Vec<_> = variants
|
|
|
+ .clone()
|
|
|
+ .into_par_iter()
|
|
|
+ .filter(|v| v.contig == contig)
|
|
|
+ .collect();
|
|
|
+ if variants.len() > 1 {
|
|
|
+ info!("{contig}: {} variants to phase", v.len());
|
|
|
+ let phases = variants_phasing(v, &bam_path_a, &bam_path_b, min_records, &progress);
|
|
|
+ if !phases.is_empty() {
|
|
|
+ save_phases(
|
|
|
+ &phases,
|
|
|
+ &format!("{phases_dir}/{id}_{contig}_phases.postcard.gz"),
|
|
|
+ )?;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+}
|