| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
- use std::{fs::File, io::BufReader};
- use csv::ReaderBuilder;
- use pandora_lib_variants::variants::Variant;
- use crate::{callers::Caller, variant::variant::VariantType};
- #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
- pub struct VCFRow {
- pub chr: String,
- pub pos: u32,
- pub id: String,
- pub reference: String,
- pub alt: String,
- pub qual: String,
- pub filter: String,
- pub info: String,
- pub format: String,
- pub value: String,
- }
- pub fn read_vcf(
- path: &str,
- caller: &Caller,
- variant_type: &VariantType,
- ) -> anyhow::Result<Vec<Variant>> {
- let mut reader = ReaderBuilder::new()
- .delimiter(b'\t')
- .comment(Some(b'#'))
- .has_headers(false)
- .flexible(true)
- .from_reader(get_reader(path)?);
- let iter = reader.deserialize();
- let mut all = Vec::new();
- // should be replaced with bcftools
- for result in iter {
- let record: VCFRow = result?;
- // Normalize into multirows
- if record.alt.contains(",") {
- let alts = record.alt.split(',').collect::<Vec<&str>>();
- let n = alts.len();
- let vals = record.value.split(':').collect::<Vec<&str>>();
- let ads = vals.get(3).unwrap().split(',').collect::<Vec<&str>>();
- let vafs = vals.get(4).unwrap().split(',').collect::<Vec<&str>>();
- let pls = vals.get(5).unwrap().split(',').collect::<Vec<&str>>();
- for i in 0..n {
- let cp = &pls[(i * 3)..(i * 3 + 3)];
- let nval = format!(
- "{}:{}:{}:{}:{}:{}",
- vals[0],
- vals[1],
- vals[2],
- [ads[0], ads[i + 1]].join(","),
- vafs[i],
- cp.join(",")
- );
- let mut rec = record.clone();
- rec.value = nval.clone();
- rec.alt = alts[i].to_string();
- all.push(rec);
- }
- } else {
- all.push(record);
- }
- }
- let base_n = "N".to_string();
- let res: Vec<Variant> = all
- .par_iter_mut()
- .map(|row| {
- // for Sniffles normalize insertion/deletion position (after the pos)
- if caller == &VCFSource::Sniffles && row.reference == base_n && row.alt.len() > 1 {
- row.pos -= 1;
- }
- let mut v = Variant::from_vcfrow(row, caller.clone(), variant_type.clone()).unwrap();
- v.get_depth();
- v.get_n_alt();
- v
- })
- .filter(|v| {
- for cd in v.callers_data.iter() {
- if cd.should_filter() {
- return false;
- }
- }
- true
- })
- .collect();
- Ok(res)
- }
- pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
- let file_type = *path.split(".").collect::<Vec<&str>>().last()?;
- assert!(file_type == "gz" || file_type == "vcf");
- let raw_reader: Box<dyn std::io::Read> = Box::new(File::open(path)?);
- match file_type {
- "gz" => {
- let reader = Box::new(BGZFReader::new(raw_reader)?);
- Ok(Box::new(BufReader::new(reader)))
- }
- "vcf" => {
- Ok(Box::new(BufReader::new(raw_reader)))
- }
- t => {
- panic!("unknown file type: {}", t)
- }
- }
- }
|