use std::{ fs::File, io::BufReader, str::FromStr, }; use anyhow::Context; use pandora_lib_variants::variants::Variant; use crate::{callers::Caller, io::tsv::TsvLine, variant::vcf_variant::VariantType}; /// A single row from a VCF file (tab-separated, no header row). #[derive(Debug, Eq, PartialEq, Clone)] pub struct VCFRow { pub chr: String, pub pos: u32, pub id: String, pub reference: String, pub alt: String, pub qual: String, pub filter: String, pub info: String, pub format: String, pub value: String, } impl FromStr for VCFRow { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { let f: Vec<&str> = s.split('\t').collect(); let get = |i: usize, name: &str| -> anyhow::Result<&str> { f.get(i).copied().ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})")) }; Ok(Self { chr: get(0, "chr")?.to_string(), pos: get(1, "pos")?.parse().context("bad pos")?, id: get(2, "id")?.to_string(), reference: get(3, "reference")?.to_string(), alt: get(4, "alt")?.to_string(), qual: get(5, "qual")?.to_string(), filter: get(6, "filter")?.to_string(), info: get(7, "info")?.to_string(), format: get(8, "format")?.to_string(), value: get(9, "value")?.to_string(), }) } } pub fn read_vcf( path: &str, caller: &Caller, variant_type: &VariantType, ) -> anyhow::Result> { let mut reader = BufReader::new(get_reader(path)?); let mut line = TsvLine::new(); let mut all = Vec::new(); // should be replaced with bcftools while line.read(&mut reader)? { if line.as_str().starts_with('#') || line.as_str().is_empty() { continue; } let record: VCFRow = line.as_str().parse()?; // Normalize into multirows if record.alt.contains(",") { let alts = record.alt.split(',').collect::>(); let n = alts.len(); let vals = record.value.split(':').collect::>(); let ads = vals.get(3).unwrap().split(',').collect::>(); let vafs = vals.get(4).unwrap().split(',').collect::>(); let pls = vals.get(5).unwrap().split(',').collect::>(); for i in 0..n { let cp = &pls[(i * 3)..(i * 3 + 3)]; let nval = format!( "{}:{}:{}:{}:{}:{}", vals[0], vals[1], vals[2], [ads[0], ads[i + 1]].join(","), vafs[i], cp.join(",") ); let mut rec = record.clone(); rec.value = nval.clone(); rec.alt = alts[i].to_string(); all.push(rec); } } else { all.push(record); } } let base_n = "N".to_string(); let res: Vec = all .par_iter_mut() .map(|row| { // for Sniffles normalize insertion/deletion position (after the pos) if caller == &VCFSource::Sniffles && row.reference == base_n && row.alt.len() > 1 { row.pos -= 1; } let mut v = Variant::from_vcfrow(row, caller.clone(), variant_type.clone()).unwrap(); v.get_depth(); v.get_n_alt(); v }) .filter(|v| { for cd in v.callers_data.iter() { if cd.should_filter() { return false; } } true }) .collect(); Ok(res) } pub fn get_reader(path: &str) -> anyhow::Result> { let file_type = *path.split(".").collect::>().last()?; assert!(file_type == "gz" || file_type == "vcf"); let raw_reader: Box = Box::new(File::open(path)?); match file_type { "gz" => { let reader = Box::new(BGZFReader::new(raw_reader)?); Ok(Box::new(BufReader::new(reader))) } "vcf" => { Ok(Box::new(BufReader::new(raw_reader))) } t => { panic!("unknown file type: {}", t) } } }