liftover.rs 3.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. //! UCSC chain file parser and coordinate liftover machine builder.
  2. use crate::io::readers::get_reader;
  3. use anyhow::Context;
  4. use chainfile::liftover::machine::Builder;
  5. use std::io::{BufRead, BufReader};
  6. use std::path::Path;
  7. /// Builds a liftover machine from a chain file.
  8. ///
  9. /// Parses a UCSC chain format file (`.chain` or `.chain.gz`) and constructs a
  10. /// [`chainfile::liftover::Machine`] for coordinate conversion between genome assemblies.
  11. ///
  12. /// # Chain File Format
  13. ///
  14. /// Chain files describe pairwise alignments between genome assemblies with:
  15. /// - **Header lines**: `chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id`
  16. /// - **Alignment blocks**: `size dt dq` (tab or space-delimited)
  17. /// - **Final block**: `size` (single value marking chain end)
  18. /// - **Blank lines**: Separate chains
  19. ///
  20. /// # Normalization
  21. ///
  22. /// This function normalizes space-delimited alignment blocks to tab-delimited format,
  23. /// as required by the `chainfile` crate. This handles chain files from sources like
  24. /// the T2T Consortium that may use space delimiters instead of tabs.
  25. ///
  26. /// # Arguments
  27. ///
  28. /// * `chain_path` - Path to chain file (`.chain` or `.chain.gz`)
  29. ///
  30. /// # Returns
  31. ///
  32. /// A configured [`chainfile::liftover::Machine`] ready for coordinate conversion.
  33. ///
  34. /// # Errors
  35. ///
  36. /// Returns an error if:
  37. /// - File cannot be read or decompressed
  38. /// - Chain file has invalid format (malformed headers, alignment blocks)
  39. /// - Parser encounters non-UTF8 data
  40. ///
  41. /// # Examples
  42. ///
  43. /// ```no_run
  44. /// use pandora_lib_promethion::liftover::build_machine_from_chain;
  45. ///
  46. /// let machine = build_machine_from_chain("hg38ToHs1.chain.gz")?;
  47. ///
  48. /// // Liftover coordinates from hg38 to hs1
  49. /// if let Some(result) = machine.liftover("chr1", 1000000) {
  50. /// println!("Lifted to: {} at {}", result.chromosome(), result.start());
  51. /// }
  52. /// # Ok::<(), anyhow::Error>(())
  53. /// ```
  54. ///
  55. /// # Performance
  56. ///
  57. /// Chain files are loaded entirely into memory and normalized before parsing.
  58. /// For large genome-wide chain files (~100MB compressed), expect:
  59. /// - Memory: ~500MB-1GB during construction
  60. /// - Time: 2-5 seconds on modern systems
  61. pub fn build_machine_from_chain<P: AsRef<Path>>(
  62. chain_path: P,
  63. ) -> anyhow::Result<chainfile::liftover::Machine> {
  64. let path = chain_path.as_ref();
  65. let p = path.to_str().context("path to string")?;
  66. // BufReader is needed to satisfy BufRead for .lines(); get_reader returns
  67. // Box<dyn Read> not Box<dyn BufRead>.
  68. let reader = BufReader::new(get_reader(p)?);
  69. // Normalize space-delimited alignment blocks to tab-delimited format.
  70. // Collect through Result so I/O errors are propagated rather than silently
  71. // truncating the chain file (which would corrupt all downstream liftover results).
  72. let normalized: Vec<u8> = reader
  73. .lines()
  74. .enumerate()
  75. .map(|(i, result)| {
  76. let line_no = i + 1;
  77. let line =
  78. result.with_context(|| format!("I/O error reading chain file {p}:{line_no}"))?;
  79. let out = if line.starts_with("chain") || line.is_empty() {
  80. line
  81. } else {
  82. line.split_whitespace().collect::<Vec<_>>().join("\t")
  83. };
  84. Ok(out)
  85. })
  86. .collect::<anyhow::Result<Vec<_>>>()?
  87. .join("\n")
  88. .into_bytes();
  89. let chain_reader = chainfile::Reader::new(&normalized[..]);
  90. Builder
  91. .try_build_from(chain_reader)
  92. .context("build liftover machine from chain")
  93. }