dict.rs 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. use std::{fs::File, io::BufReader};
  2. use anyhow::Context;
  3. use log::debug;
  4. use crate::io::tsv::TsvLine;
  5. /// Read a sequence dictionary (`.dict`) file and return `(name, length)` pairs.
  6. ///
  7. /// Only `@SQ` lines are processed; other lines are skipped. Each `@SQ` line
  8. /// must contain `SN:<name>` and `LN:<length>` tab-separated fields.
  9. ///
  10. /// # Errors
  11. ///
  12. /// Returns an error if the file cannot be opened, an `@SQ` line is missing
  13. /// `SN:` or `LN:`, or `LN:` cannot be parsed as `u32`.
  14. pub fn read_dict(path: &str) -> anyhow::Result<Vec<(String, u32)>> {
  15. debug!("Parsing {path}.");
  16. let mut reader =
  17. BufReader::new(File::open(path).with_context(|| format!("cannot open dict: {path}"))?);
  18. let mut line = TsvLine::new();
  19. let mut line_no = 0usize;
  20. let mut res = Vec::new();
  21. while line
  22. .read(&mut reader)
  23. .with_context(|| format!("failed reading dict line after {path}:{line_no}"))?
  24. {
  25. line_no += 1;
  26. let fields = line.split_fields();
  27. if fields.first().copied() != Some("@SQ") {
  28. continue;
  29. }
  30. let sn = fields
  31. .iter()
  32. .find_map(|f| f.strip_prefix("SN:"))
  33. .with_context(|| format!("Missing SN: in @SQ line at {path}:{line_no}"))?
  34. .to_string();
  35. let ln: u32 = fields
  36. .iter()
  37. .find_map(|f| f.strip_prefix("LN:"))
  38. .with_context(|| format!("Missing LN: in @SQ line at {path}:{line_no}"))?
  39. .parse()
  40. .with_context(|| format!("Invalid LN: value at {path}:{line_no}"))?;
  41. res.push((sn, ln));
  42. }
  43. Ok(res)
  44. }