| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235 |
- use std::{
- fs::{self, File, Metadata}, io::Read, path::PathBuf, str::FromStr
- };
- use anyhow::{anyhow, Context};
- use glob::glob;
- use hashbrown::HashMap;
- use log::warn;
- use pandora_lib_bindings::{
- progs::cramino::{Cramino, CraminoRes},
- utils::RunBin,
- };
- use rayon::prelude::*;
- use serde::{Deserialize, Serialize};
- #[derive(Debug, Clone, Deserialize, Serialize)]
- pub struct Bam {
- pub id: String,
- pub time_point: String,
- pub reference_genome: String,
- pub bam_type: BamType,
- pub path: PathBuf,
- #[serde(with = "metadata_serde")]
- pub file_metadata: Metadata,
- // #[serde(skip)]
- // pub file_metadata: Metadata,
- pub cramino: Option<CraminoRes>,
- pub composition: Vec<(String, f64)>,
- }
- #[derive(Debug, PartialEq, Clone, Deserialize, Serialize)]
- pub enum BamType {
- WGS,
- Panel(String),
- ChIP(String),
- }
- impl Bam {
- pub fn new(path: PathBuf) -> anyhow::Result<Self> {
- let stem = path
- .clone()
- .file_stem()
- .context("Can't parse stem from {path}")?
- .to_string_lossy()
- .to_string();
- let stem: Vec<&str> = stem.split('_').collect();
- if stem.len() > 4 || stem.len() < 3 {
- return Err(anyhow!("Error in bam name: {}", path.display()));
- }
- let id = stem[0].to_string();
- let time_point = stem[1].to_string();
- let reference_genome = stem
- .last()
- .context("Can't get last from stem {stem}")?
- .to_string();
- let bam_type = if stem.len() == 4 {
- match stem[2] {
- "oncoT" => BamType::Panel("oncoT".to_string()),
- "H3K27ac" => BamType::ChIP("H3K27ac".to_string()),
- "H3K4me3" => BamType::ChIP("H3K4me3".to_string()),
- _ => return Err(anyhow!("Error in bam name: {}", path.display())),
- }
- } else {
- BamType::WGS
- };
- let tp_dir = path
- .parent()
- .context("Can't parse parent from: {bam_path}")?;
- let cramino_path = format!(
- "{}/{id}_{time_point}_hs1_cramino.txt",
- tp_dir.to_string_lossy()
- );
- let file_metadata = fs::metadata(&path)?;
- let cramino = if bam_type == BamType::WGS {
- if !PathBuf::from_str(&cramino_path)?.exists() {
- return Err(anyhow!("Cramino file missing {cramino_path}"));
- }
- let mut cramino = Cramino::default().with_result_path(&cramino_path);
- cramino
- .parse_results()
- .context(format!("Error while parsing cramino for {cramino_path}"))?;
- if let Some(cramino) = cramino.results {
- Some(cramino)
- } else {
- return Err(anyhow!("Cramino results parsing failed"));
- }
- } else {
- None
- };
- let composition =
- pandora_lib_pileup::bam_compo(path.to_string_lossy().as_ref(), 20000).context(
- format!("Error while reading BAM composition for {}", path.display()),
- )?;
- Ok(Self {
- path,
- // file_metadata,
- cramino,
- id: id.to_string(),
- time_point: time_point.to_string(),
- bam_type,
- reference_genome,
- composition,
- })
- }
- pub fn load_json(path: &str) -> anyhow::Result<Self> {
- let f = File::open(path)?;
- let s: Self = serde_json::from_reader(f)?;
- Ok(s)
- }
- pub fn save_json(path: &str) -> anyhow::Result<()> {
- }
- }
- #[derive(Debug)]
- pub struct BamCollection {
- pub bams: Vec<Bam>,
- }
- impl BamCollection {
- pub fn new(result_dir: &str) -> Self {
- load_bam_collection(result_dir)
- }
- pub fn by_acquisition_id(&self) -> HashMap<String, Vec<&Bam>> {
- let mut acq: HashMap<String, Vec<&Bam>> = HashMap::new();
- for bam in self.bams.iter() {
- for (acq_id, _) in bam.composition.iter() {
- if let Some(entry) = acq.get_mut(acq_id) {
- entry.push(bam);
- } else {
- acq.insert(acq_id.to_string(), vec![bam]);
- }
- }
- }
- acq
- }
- pub fn get(&self, id: &str, time_point: &str) -> Vec<&Bam> {
- self.bams
- .iter()
- .filter(|b| b.id == id && b.time_point == time_point)
- .collect()
- }
- pub fn by_id_completed(&self, min_diag_cov: f32, min_mrd_cov: f32) -> Vec<Bam> {
- self.bams
- .iter()
- .filter(|b| matches!(b.bam_type, BamType::WGS))
- .filter(|b| match &b.cramino {
- Some(cramino) => match b.time_point.as_str() {
- "diag" => cramino.mean_length >= min_diag_cov as f64,
- "mrd" => cramino.mean_length >= min_mrd_cov as f64,
- _ => false
- },
- _ => false,
- })
- .cloned()
- .collect()
- }
- }
- pub fn load_bam_collection(result_dir: &str) -> BamCollection {
- let pattern = format!("{}/*/*/*.bam", result_dir);
- let bams = glob(&pattern)
- .expect("Failed to read glob pattern")
- .par_bridge()
- .filter_map(|entry| {
- match entry {
- Ok(path) => match Bam::new(path) {
- Ok(bam) => return Some(bam),
- Err(e) => warn!("{e}"),
- },
- Err(e) => warn!("Error: {:?}", e),
- }
- None
- })
- .collect();
- BamCollection { bams }
- }
- mod metadata_serde {
- use super::*;
- use serde::{Serializer, Deserializer};
- #[derive(Serialize, Deserialize)]
- struct SerializableMetadata {
- len: u64,
- modified: u64,
- created: u64,
- }
- pub fn serialize<S>(metadata: &Metadata, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: Serializer,
- {
- let serializable = SerializableMetadata {
- len: metadata.len(),
- modified: metadata.modified()
- .unwrap_or(UNIX_EPOCH)
- .duration_since(UNIX_EPOCH)
- .unwrap_or_default()
- .as_secs(),
- created: metadata.created()
- .unwrap_or(UNIX_EPOCH)
- .duration_since(UNIX_EPOCH)
- .unwrap_or_default()
- .as_secs(),
- };
- serializable.serialize(serializer)
- }
- pub fn deserialize<'de, D>(deserializer: D) -> Result<Metadata, D::Error>
- where
- D: Deserializer<'de>,
- {
- let serializable = SerializableMetadata::deserialize(deserializer)?;
- let file = tempfile::tempfile().map_err(serde::de::Error::custom)?;
- let metadata = file.metadata().map_err(serde::de::Error::custom)?;
- Ok(metadata)
- }
- }
|