use std::{fs::File, io::BufReader}; use csv::ReaderBuilder; use pandora_lib_variants::variants::Variant; use crate::{callers::Caller, variant::variant::VariantType}; #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)] pub struct VCFRow { pub chr: String, pub pos: u32, pub id: String, pub reference: String, pub alt: String, pub qual: String, pub filter: String, pub info: String, pub format: String, pub value: String, } pub fn read_vcf( path: &str, caller: &Caller, variant_type: &VariantType, ) -> anyhow::Result> { let mut reader = ReaderBuilder::new() .delimiter(b'\t') .comment(Some(b'#')) .has_headers(false) .flexible(true) .from_reader(get_reader(path)?); let iter = reader.deserialize(); let mut all = Vec::new(); // should be replaced with bcftools for result in iter { let record: VCFRow = result?; // Normalize into multirows if record.alt.contains(",") { let alts = record.alt.split(',').collect::>(); let n = alts.len(); let vals = record.value.split(':').collect::>(); let ads = vals.get(3).unwrap().split(',').collect::>(); let vafs = vals.get(4).unwrap().split(',').collect::>(); let pls = vals.get(5).unwrap().split(',').collect::>(); for i in 0..n { let cp = &pls[(i * 3)..(i * 3 + 3)]; let nval = format!( "{}:{}:{}:{}:{}:{}", vals[0], vals[1], vals[2], [ads[0], ads[i + 1]].join(","), vafs[i], cp.join(",") ); let mut rec = record.clone(); rec.value = nval.clone(); rec.alt = alts[i].to_string(); all.push(rec); } } else { all.push(record); } } let base_n = "N".to_string(); let res: Vec = all .par_iter_mut() .map(|row| { // for Sniffles normalize insertion/deletion position (after the pos) if caller == &VCFSource::Sniffles && row.reference == base_n && row.alt.len() > 1 { row.pos -= 1; } let mut v = Variant::from_vcfrow(row, caller.clone(), variant_type.clone()).unwrap(); v.get_depth(); v.get_n_alt(); v }) .filter(|v| { for cd in v.callers_data.iter() { if cd.should_filter() { return false; } } true }) .collect(); Ok(res) } pub fn get_reader(path: &str) -> anyhow::Result> { let file_type = *path.split(".").collect::>().last()?; assert!(file_type == "gz" || file_type == "vcf"); let raw_reader: Box = Box::new(File::open(path)?); match file_type { "gz" => { let reader = Box::new(BGZFReader::new(raw_reader)?); Ok(Box::new(BufReader::new(reader))) } "vcf" => { Ok(Box::new(BufReader::new(raw_reader))) } t => { panic!("unknown file type: {}", t) } } }