//! UCSC chain file parser and coordinate liftover machine builder. use crate::io::readers::get_reader; use anyhow::Context; use chainfile::liftover::machine::Builder; use std::io::{BufRead, BufReader}; use std::path::Path; /// Builds a liftover machine from a chain file. /// /// Parses a UCSC chain format file (`.chain` or `.chain.gz`) and constructs a /// [`chainfile::liftover::Machine`] for coordinate conversion between genome assemblies. /// /// # Chain File Format /// /// Chain files describe pairwise alignments between genome assemblies with: /// - **Header lines**: `chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id` /// - **Alignment blocks**: `size dt dq` (tab or space-delimited) /// - **Final block**: `size` (single value marking chain end) /// - **Blank lines**: Separate chains /// /// # Normalization /// /// This function normalizes space-delimited alignment blocks to tab-delimited format, /// as required by the `chainfile` crate. This handles chain files from sources like /// the T2T Consortium that may use space delimiters instead of tabs. /// /// # Arguments /// /// * `chain_path` - Path to chain file (`.chain` or `.chain.gz`) /// /// # Returns /// /// A configured [`chainfile::liftover::Machine`] ready for coordinate conversion. /// /// # Errors /// /// Returns an error if: /// - File cannot be read or decompressed /// - Chain file has invalid format (malformed headers, alignment blocks) /// - Parser encounters non-UTF8 data /// /// # Examples /// /// ```no_run /// use pandora_lib_promethion::liftover::build_machine_from_chain; /// /// let machine = build_machine_from_chain("hg38ToHs1.chain.gz")?; /// /// // Liftover coordinates from hg38 to hs1 /// if let Some(result) = machine.liftover("chr1", 1000000) { /// println!("Lifted to: {} at {}", result.chromosome(), result.start()); /// } /// # Ok::<(), anyhow::Error>(()) /// ``` /// /// # Performance /// /// Chain files are loaded entirely into memory and normalized before parsing. /// For large genome-wide chain files (~100MB compressed), expect: /// - Memory: ~500MB-1GB during construction /// - Time: 2-5 seconds on modern systems pub fn build_machine_from_chain>( chain_path: P, ) -> anyhow::Result { let path = chain_path.as_ref(); let p = path.to_str().context("path to string")?; // BufReader is needed to satisfy BufRead for .lines(); get_reader returns // Box not Box. let reader = BufReader::new(get_reader(p)?); // Normalize space-delimited alignment blocks to tab-delimited format. // Collect through Result so I/O errors are propagated rather than silently // truncating the chain file (which would corrupt all downstream liftover results). let normalized: Vec = reader .lines() .enumerate() .map(|(i, result)| { let line_no = i + 1; let line = result.with_context(|| format!("I/O error reading chain file {p}:{line_no}"))?; let out = if line.starts_with("chain") || line.is_empty() { line } else { line.split_whitespace().collect::>().join("\t") }; Ok(out) }) .collect::>>()? .join("\n") .into_bytes(); let chain_reader = chainfile::Reader::new(&normalized[..]); Builder .try_build_from(chain_reader) .context("build liftover machine from chain") }