vcf_reader.rs 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. use std::{
  2. fs::File,
  3. io::BufReader,
  4. str::FromStr,
  5. };
  6. use anyhow::Context;
  7. use pandora_lib_variants::variants::Variant;
  8. use crate::{callers::Caller, io::tsv::TsvLine, variant::vcf_variant::VariantType};
  9. /// A single row from a VCF file (tab-separated, no header row).
  10. #[derive(Debug, Eq, PartialEq, Clone)]
  11. pub struct VCFRow {
  12. pub chr: String,
  13. pub pos: u32,
  14. pub id: String,
  15. pub reference: String,
  16. pub alt: String,
  17. pub qual: String,
  18. pub filter: String,
  19. pub info: String,
  20. pub format: String,
  21. pub value: String,
  22. }
  23. impl FromStr for VCFRow {
  24. type Err = anyhow::Error;
  25. fn from_str(s: &str) -> anyhow::Result<Self> {
  26. let f: Vec<&str> = s.split('\t').collect();
  27. let get = |i: usize, name: &str| -> anyhow::Result<&str> {
  28. f.get(i).copied().ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})"))
  29. };
  30. Ok(Self {
  31. chr: get(0, "chr")?.to_string(),
  32. pos: get(1, "pos")?.parse().context("bad pos")?,
  33. id: get(2, "id")?.to_string(),
  34. reference: get(3, "reference")?.to_string(),
  35. alt: get(4, "alt")?.to_string(),
  36. qual: get(5, "qual")?.to_string(),
  37. filter: get(6, "filter")?.to_string(),
  38. info: get(7, "info")?.to_string(),
  39. format: get(8, "format")?.to_string(),
  40. value: get(9, "value")?.to_string(),
  41. })
  42. }
  43. }
  44. pub fn read_vcf(
  45. path: &str,
  46. caller: &Caller,
  47. variant_type: &VariantType,
  48. ) -> anyhow::Result<Vec<Variant>> {
  49. let mut reader = BufReader::new(get_reader(path)?);
  50. let mut line = TsvLine::new();
  51. let mut all = Vec::new();
  52. // should be replaced with bcftools
  53. while line.read(&mut reader)? {
  54. if line.as_str().starts_with('#') || line.as_str().is_empty() {
  55. continue;
  56. }
  57. let record: VCFRow = line.as_str().parse()?;
  58. // Normalize into multirows
  59. if record.alt.contains(",") {
  60. let alts = record.alt.split(',').collect::<Vec<&str>>();
  61. let n = alts.len();
  62. let vals = record.value.split(':').collect::<Vec<&str>>();
  63. let ads = vals.get(3).unwrap().split(',').collect::<Vec<&str>>();
  64. let vafs = vals.get(4).unwrap().split(',').collect::<Vec<&str>>();
  65. let pls = vals.get(5).unwrap().split(',').collect::<Vec<&str>>();
  66. for i in 0..n {
  67. let cp = &pls[(i * 3)..(i * 3 + 3)];
  68. let nval = format!(
  69. "{}:{}:{}:{}:{}:{}",
  70. vals[0],
  71. vals[1],
  72. vals[2],
  73. [ads[0], ads[i + 1]].join(","),
  74. vafs[i],
  75. cp.join(",")
  76. );
  77. let mut rec = record.clone();
  78. rec.value = nval.clone();
  79. rec.alt = alts[i].to_string();
  80. all.push(rec);
  81. }
  82. } else {
  83. all.push(record);
  84. }
  85. }
  86. let base_n = "N".to_string();
  87. let res: Vec<Variant> = all
  88. .par_iter_mut()
  89. .map(|row| {
  90. // for Sniffles normalize insertion/deletion position (after the pos)
  91. if caller == &VCFSource::Sniffles && row.reference == base_n && row.alt.len() > 1 {
  92. row.pos -= 1;
  93. }
  94. let mut v = Variant::from_vcfrow(row, caller.clone(), variant_type.clone()).unwrap();
  95. v.get_depth();
  96. v.get_n_alt();
  97. v
  98. })
  99. .filter(|v| {
  100. for cd in v.callers_data.iter() {
  101. if cd.should_filter() {
  102. return false;
  103. }
  104. }
  105. true
  106. })
  107. .collect();
  108. Ok(res)
  109. }
  110. pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
  111. let file_type = *path.split(".").collect::<Vec<&str>>().last()?;
  112. assert!(file_type == "gz" || file_type == "vcf");
  113. let raw_reader: Box<dyn std::io::Read> = Box::new(File::open(path)?);
  114. match file_type {
  115. "gz" => {
  116. let reader = Box::new(BGZFReader::new(raw_reader)?);
  117. Ok(Box::new(BufReader::new(reader)))
  118. }
  119. "vcf" => {
  120. Ok(Box::new(BufReader::new(raw_reader)))
  121. }
  122. t => {
  123. panic!("unknown file type: {}", t)
  124. }
  125. }
  126. }