use anyhow::Context; use chrono::{DateTime, Utc}; use csi::binning_index::ReferenceSequence; use glob::glob; use log::warn; use std::{collections::HashMap, fs::Metadata, os::unix::fs::MetadataExt, path::PathBuf}; use noodles_csi as csi; use num_format::{Locale, ToFormattedString}; use crate::commands::bcftools::{bcftools_index, BcftoolsConfig}; #[derive(Debug, Clone)] pub struct Vcf { pub id: String, pub caller: String, pub time: String, pub path: PathBuf, pub file_metadata: Metadata, pub n_variants: u64, } impl Vcf { pub fn new(path: PathBuf) -> anyhow::Result { let stem = path .file_stem() .context("Can't parse stem")? .to_string_lossy() .to_string(); let stem_splt: Vec<&str> = stem.split('_').collect(); let id = stem_splt[0].to_string(); let time = stem_splt[1].to_string(); let caller = stem_splt[2..stem_splt.len() - 1].join("_"); if !PathBuf::from(format!("{}.csi", path.display())).exists() { bcftools_index(&path.display().to_string(), &BcftoolsConfig::default())?; } let n_variants = n_variants(path.to_str().context("Can't convert path to str")?)?; let file_metadata = path .metadata() .context(format!("Can't access metadata for {}", path.display()))?; Ok(Self { id, caller, time, path, file_metadata, n_variants, }) } pub fn modified(&self) -> anyhow::Result> { Ok(self.file_metadata.modified().unwrap().into()) } pub fn size(&self) -> u64 { self.file_metadata.size() } pub fn tsv(&self) -> anyhow::Result { Ok([ self.id.clone(), self.time.clone(), self.caller.clone(), self.n_variants.to_string(), self.modified()?.to_string(), self.size().to_string(), self.path.display().to_string(), ] .join("\t")) } pub fn println(&self) -> anyhow::Result<()> { let formated_n_variants = self.n_variants.to_formatted_string(&Locale::en); let formated_modified = self.modified()?.naive_local().to_string(); let formated_size = format!("{:#}", byte_unit::Byte::from_u64(self.size())); println!( "{}", [ self.id.to_string(), self.time.to_string(), self.caller.to_string(), formated_n_variants, formated_modified, formated_size, self.path.display().to_string() ] .join("\t") ); Ok(()) } } #[derive(Debug)] pub struct VcfCollection { pub vcfs: Vec, } impl VcfCollection { pub fn new(result_dir: &str) -> Self { let mut vcfs = Vec::new(); let pattern = format!("{}/*/*/*/*_PASSED.vcf.gz", result_dir); for entry in glob(&pattern).expect("Failed to read glob pattern") { match entry { Ok(path) => match Vcf::new(path) { Ok(vcf) => vcfs.push(vcf), Err(e) => warn!("{e}"), }, Err(e) => warn!("Error: {:?}", e), } } VcfCollection { vcfs } } pub fn sort_by_id(&mut self) { self.vcfs.sort_by_key(|v| v.id.clone()); } pub fn group_by_id(&self) -> Vec<(String, Vec)> { let mut vcf_by_ids: HashMap> = HashMap::new(); self.vcfs.iter().for_each(|v| { vcf_by_ids.entry(v.id.clone()).or_default().push(v.clone()); }); vcf_by_ids.into_iter().collect() } } pub fn n_variants(path: &str) -> anyhow::Result { let csi_src = format!("{path}.csi"); let index = csi::read(csi_src).context(format!("can't read index of {path}"))?; let mut n = 0; for reference_sequence in index.reference_sequences() { if let Some(metadata) = reference_sequence.metadata() { n += metadata.mapped_record_count() } } Ok(n) }