vcf_reader.rs 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. use std::{fs::File, io::BufReader};
  2. use csv::ReaderBuilder;
  3. use pandora_lib_variants::variants::Variant;
  4. use crate::{callers::Caller, variant::variant::VariantType};
  5. #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
  6. pub struct VCFRow {
  7. pub chr: String,
  8. pub pos: u32,
  9. pub id: String,
  10. pub reference: String,
  11. pub alt: String,
  12. pub qual: String,
  13. pub filter: String,
  14. pub info: String,
  15. pub format: String,
  16. pub value: String,
  17. }
  18. pub fn read_vcf(
  19. path: &str,
  20. caller: &Caller,
  21. variant_type: &VariantType,
  22. ) -> anyhow::Result<Vec<Variant>> {
  23. let mut reader = ReaderBuilder::new()
  24. .delimiter(b'\t')
  25. .comment(Some(b'#'))
  26. .has_headers(false)
  27. .flexible(true)
  28. .from_reader(get_reader(path)?);
  29. let iter = reader.deserialize();
  30. let mut all = Vec::new();
  31. // should be replaced with bcftools
  32. for result in iter {
  33. let record: VCFRow = result?;
  34. // Normalize into multirows
  35. if record.alt.contains(",") {
  36. let alts = record.alt.split(',').collect::<Vec<&str>>();
  37. let n = alts.len();
  38. let vals = record.value.split(':').collect::<Vec<&str>>();
  39. let ads = vals.get(3).unwrap().split(',').collect::<Vec<&str>>();
  40. let vafs = vals.get(4).unwrap().split(',').collect::<Vec<&str>>();
  41. let pls = vals.get(5).unwrap().split(',').collect::<Vec<&str>>();
  42. for i in 0..n {
  43. let cp = &pls[(i * 3)..(i * 3 + 3)];
  44. let nval = format!(
  45. "{}:{}:{}:{}:{}:{}",
  46. vals[0],
  47. vals[1],
  48. vals[2],
  49. [ads[0], ads[i + 1]].join(","),
  50. vafs[i],
  51. cp.join(",")
  52. );
  53. let mut rec = record.clone();
  54. rec.value = nval.clone();
  55. rec.alt = alts[i].to_string();
  56. all.push(rec);
  57. }
  58. } else {
  59. all.push(record);
  60. }
  61. }
  62. let base_n = "N".to_string();
  63. let res: Vec<Variant> = all
  64. .par_iter_mut()
  65. .map(|row| {
  66. // for Sniffles normalize insertion/deletion position (after the pos)
  67. if caller == &VCFSource::Sniffles && row.reference == base_n && row.alt.len() > 1 {
  68. row.pos -= 1;
  69. }
  70. let mut v = Variant::from_vcfrow(row, caller.clone(), variant_type.clone()).unwrap();
  71. v.get_depth();
  72. v.get_n_alt();
  73. v
  74. })
  75. .filter(|v| {
  76. for cd in v.callers_data.iter() {
  77. if cd.should_filter() {
  78. return false;
  79. }
  80. }
  81. true
  82. })
  83. .collect();
  84. Ok(res)
  85. }
  86. pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
  87. let file_type = *path.split(".").collect::<Vec<&str>>().last()?;
  88. assert!(file_type == "gz" || file_type == "vcf");
  89. let raw_reader: Box<dyn std::io::Read> = Box::new(File::open(path)?);
  90. match file_type {
  91. "gz" => {
  92. let reader = Box::new(BGZFReader::new(raw_reader)?);
  93. Ok(Box::new(BufReader::new(reader)))
  94. }
  95. "vcf" => {
  96. Ok(Box::new(BufReader::new(raw_reader)))
  97. }
  98. t => {
  99. panic!("unknown file type: {}", t)
  100. }
  101. }
  102. }