| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- //! UCSC chain file parser and coordinate liftover machine builder.
- use crate::io::readers::get_reader;
- use anyhow::Context;
- use chainfile::liftover::machine::Builder;
- use std::io::{BufRead, BufReader};
- use std::path::Path;
- /// Builds a liftover machine from a chain file.
- ///
- /// Parses a UCSC chain format file (`.chain` or `.chain.gz`) and constructs a
- /// [`chainfile::liftover::Machine`] for coordinate conversion between genome assemblies.
- ///
- /// # Chain File Format
- ///
- /// Chain files describe pairwise alignments between genome assemblies with:
- /// - **Header lines**: `chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id`
- /// - **Alignment blocks**: `size dt dq` (tab or space-delimited)
- /// - **Final block**: `size` (single value marking chain end)
- /// - **Blank lines**: Separate chains
- ///
- /// # Normalization
- ///
- /// This function normalizes space-delimited alignment blocks to tab-delimited format,
- /// as required by the `chainfile` crate. This handles chain files from sources like
- /// the T2T Consortium that may use space delimiters instead of tabs.
- ///
- /// # Arguments
- ///
- /// * `chain_path` - Path to chain file (`.chain` or `.chain.gz`)
- ///
- /// # Returns
- ///
- /// A configured [`chainfile::liftover::Machine`] ready for coordinate conversion.
- ///
- /// # Errors
- ///
- /// Returns an error if:
- /// - File cannot be read or decompressed
- /// - Chain file has invalid format (malformed headers, alignment blocks)
- /// - Parser encounters non-UTF8 data
- ///
- /// # Examples
- ///
- /// ```no_run
- /// use pandora_lib_promethion::liftover::build_machine_from_chain;
- ///
- /// let machine = build_machine_from_chain("hg38ToHs1.chain.gz")?;
- ///
- /// // Liftover coordinates from hg38 to hs1
- /// if let Some(result) = machine.liftover("chr1", 1000000) {
- /// println!("Lifted to: {} at {}", result.chromosome(), result.start());
- /// }
- /// # Ok::<(), anyhow::Error>(())
- /// ```
- ///
- /// # Performance
- ///
- /// Chain files are loaded entirely into memory and normalized before parsing.
- /// For large genome-wide chain files (~100MB compressed), expect:
- /// - Memory: ~500MB-1GB during construction
- /// - Time: 2-5 seconds on modern systems
- pub fn build_machine_from_chain<P: AsRef<Path>>(
- chain_path: P,
- ) -> anyhow::Result<chainfile::liftover::Machine> {
- let path = chain_path.as_ref();
- let p = path.to_str().context("path to string")?;
- // BufReader is needed to satisfy BufRead for .lines(); get_reader returns
- // Box<dyn Read> not Box<dyn BufRead>.
- let reader = BufReader::new(get_reader(p)?);
- // Normalize space-delimited alignment blocks to tab-delimited format.
- // Collect through Result so I/O errors are propagated rather than silently
- // truncating the chain file (which would corrupt all downstream liftover results).
- let normalized: Vec<u8> = reader
- .lines()
- .enumerate()
- .map(|(i, result)| {
- let line_no = i + 1;
- let line =
- result.with_context(|| format!("I/O error reading chain file {p}:{line_no}"))?;
- let out = if line.starts_with("chain") || line.is_empty() {
- line
- } else {
- line.split_whitespace().collect::<Vec<_>>().join("\t")
- };
- Ok(out)
- })
- .collect::<anyhow::Result<Vec<_>>>()?
- .join("\n")
- .into_bytes();
- let chain_reader = chainfile::Reader::new(&normalized[..]);
- Builder
- .try_build_from(chain_reader)
- .context("build liftover machine from chain")
- }
|