Thomas
/
pandora_lib_promethion


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
							//! UCSC chain file parser and coordinate liftover machine builder.

use crate::io::readers::get_reader;
use anyhow::Context;
use chainfile::liftover::machine::Builder;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// Builds a liftover machine from a chain file.
///
/// Parses a UCSC chain format file (`.chain` or `.chain.gz`) and constructs a
/// [`chainfile::liftover::Machine`] for coordinate conversion between genome assemblies.
///
/// # Chain File Format
///
/// Chain files describe pairwise alignments between genome assemblies with:
/// - **Header lines**: `chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id`
/// - **Alignment blocks**: `size dt dq` (tab or space-delimited)
/// - **Final block**: `size` (single value marking chain end)
/// - **Blank lines**: Separate chains
///
/// # Normalization
///
/// This function normalizes space-delimited alignment blocks to tab-delimited format,
/// as required by the `chainfile` crate. This handles chain files from sources like
/// the T2T Consortium that may use space delimiters instead of tabs.
///
/// # Arguments
///
/// * `chain_path` - Path to chain file (`.chain` or `.chain.gz`)
///
/// # Returns
///
/// A configured [`chainfile::liftover::Machine`] ready for coordinate conversion.
///
/// # Errors
///
/// Returns an error if:
/// - File cannot be read or decompressed
/// - Chain file has invalid format (malformed headers, alignment blocks)
/// - Parser encounters non-UTF8 data
///
/// # Examples
///
/// ```no_run
/// use pandora_lib_promethion::liftover::build_machine_from_chain;
///
/// let machine = build_machine_from_chain("hg38ToHs1.chain.gz")?;
///
/// // Liftover coordinates from hg38 to hs1
/// if let Some(result) = machine.liftover("chr1", 1000000) {
///     println!("Lifted to: {} at {}", result.chromosome(), result.start());
/// }
/// # Ok::<(), anyhow::Error>(())
/// ```
///
/// # Performance
///
/// Chain files are loaded entirely into memory and normalized before parsing.
/// For large genome-wide chain files (~100MB compressed), expect:
/// - Memory: ~500MB-1GB during construction
/// - Time: 2-5 seconds on modern systems
pub fn build_machine_from_chain<P: AsRef<Path>>(
    chain_path: P,
) -> anyhow::Result<chainfile::liftover::Machine> {
    let path = chain_path.as_ref();
    let p = path.to_str().context("path to string")?;

    // BufReader is needed to satisfy BufRead for .lines(); get_reader returns
    // Box<dyn Read> not Box<dyn BufRead>.
    let reader = BufReader::new(get_reader(p)?);

    // Normalize space-delimited alignment blocks to tab-delimited format.
    // Collect through Result so I/O errors are propagated rather than silently
    // truncating the chain file (which would corrupt all downstream liftover results).
    let normalized: Vec<u8> = reader
        .lines()
        .enumerate()
        .map(|(i, result)| {
            let line_no = i + 1;
            let line =
                result.with_context(|| format!("I/O error reading chain file {p}:{line_no}"))?;
            let out = if line.starts_with("chain") || line.is_empty() {
                line
            } else {
                line.split_whitespace().collect::<Vec<_>>().join("\t")
            };
            Ok(out)
        })
        .collect::<anyhow::Result<Vec<_>>>()?
        .join("\n")
        .into_bytes();

    let chain_reader = chainfile::Reader::new(&normalized[..]);
    Builder
        .try_build_from(chain_reader)
        .context("build liftover machine from chain")
}