| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143 |
- use std::{
- fs::File,
- io::BufReader,
- str::FromStr,
- };
- use anyhow::Context;
- use pandora_lib_variants::variants::Variant;
- use crate::{callers::Caller, io::tsv::TsvLine, variant::vcf_variant::VariantType};
/// One data line of a VCF file, kept as (mostly) unparsed text.
///
/// The line is expected to be tab-separated and must not be a header line.
/// Fields map onto the first ten columns of the line, in order; only `pos`
/// is parsed into a number, every other column is stored verbatim.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct VCFRow {
    /// Column 0: chromosome / contig name.
    pub chr: String,
    /// Column 1: position on `chr`, parsed as an unsigned integer.
    pub pos: u32,
    /// Column 2: record identifier.
    pub id: String,
    /// Column 3: reference allele.
    pub reference: String,
    /// Column 4: alternate allele(s) as written in the file.
    pub alt: String,
    /// Column 5: quality, kept as text.
    pub qual: String,
    /// Column 6: filter status.
    pub filter: String,
    /// Column 7: raw INFO string.
    pub info: String,
    /// Column 8: raw FORMAT string.
    pub format: String,
    /// Column 9: values of the first sample column, laid out as described by `format`.
    pub value: String,
}
- impl FromStr for VCFRow {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let f: Vec<&str> = s.split('\t').collect();
- let get = |i: usize, name: &str| -> anyhow::Result<&str> {
- f.get(i).copied().ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})"))
- };
- Ok(Self {
- chr: get(0, "chr")?.to_string(),
- pos: get(1, "pos")?.parse().context("bad pos")?,
- id: get(2, "id")?.to_string(),
- reference: get(3, "reference")?.to_string(),
- alt: get(4, "alt")?.to_string(),
- qual: get(5, "qual")?.to_string(),
- filter: get(6, "filter")?.to_string(),
- info: get(7, "info")?.to_string(),
- format: get(8, "format")?.to_string(),
- value: get(9, "value")?.to_string(),
- })
- }
- }
- pub fn read_vcf(
- path: &str,
- caller: &Caller,
- variant_type: &VariantType,
- ) -> anyhow::Result<Vec<Variant>> {
- let mut reader = BufReader::new(get_reader(path)?);
- let mut line = TsvLine::new();
- let mut all = Vec::new();
- // should be replaced with bcftools
- while line.read(&mut reader)? {
- if line.as_str().starts_with('#') || line.as_str().is_empty() {
- continue;
- }
- let record: VCFRow = line.as_str().parse()?;
- // Normalize into multirows
- if record.alt.contains(",") {
- let alts = record.alt.split(',').collect::<Vec<&str>>();
- let n = alts.len();
- let vals = record.value.split(':').collect::<Vec<&str>>();
- let ads = vals.get(3).unwrap().split(',').collect::<Vec<&str>>();
- let vafs = vals.get(4).unwrap().split(',').collect::<Vec<&str>>();
- let pls = vals.get(5).unwrap().split(',').collect::<Vec<&str>>();
- for i in 0..n {
- let cp = &pls[(i * 3)..(i * 3 + 3)];
- let nval = format!(
- "{}:{}:{}:{}:{}:{}",
- vals[0],
- vals[1],
- vals[2],
- [ads[0], ads[i + 1]].join(","),
- vafs[i],
- cp.join(",")
- );
- let mut rec = record.clone();
- rec.value = nval.clone();
- rec.alt = alts[i].to_string();
- all.push(rec);
- }
- } else {
- all.push(record);
- }
- }
- let base_n = "N".to_string();
- let res: Vec<Variant> = all
- .par_iter_mut()
- .map(|row| {
- // for Sniffles normalize insertion/deletion position (after the pos)
- if caller == &VCFSource::Sniffles && row.reference == base_n && row.alt.len() > 1 {
- row.pos -= 1;
- }
- let mut v = Variant::from_vcfrow(row, caller.clone(), variant_type.clone()).unwrap();
- v.get_depth();
- v.get_n_alt();
- v
- })
- .filter(|v| {
- for cd in v.callers_data.iter() {
- if cd.should_filter() {
- return false;
- }
- }
- true
- })
- .collect();
- Ok(res)
- }
- pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
- let file_type = *path.split(".").collect::<Vec<&str>>().last()?;
- assert!(file_type == "gz" || file_type == "vcf");
- let raw_reader: Box<dyn std::io::Read> = Box::new(File::open(path)?);
- match file_type {
- "gz" => {
- let reader = Box::new(BGZFReader::new(raw_reader)?);
- Ok(Box::new(BufReader::new(reader)))
- }
- "vcf" => {
- Ok(Box::new(BufReader::new(raw_reader)))
- }
- t => {
- panic!("unknown file type: {}", t)
- }
- }
- }
|