|
|
@@ -1,86 +1,90 @@
|
|
|
-use std::{io::Read, path::Path};
|
|
|
-
|
|
|
+use crate::io::readers::get_reader;
|
|
|
use anyhow::Context;
|
|
|
use chainfile::liftover::machine::Builder;
|
|
|
+use std::io::{BufRead, BufReader};
|
|
|
+use std::path::Path;
|
|
|
|
|
|
-use crate::io::readers::get_reader;
|
|
|
-
|
|
|
-/// Build a chainfile liftover machine from a .chain (optionally gz).
|
|
|
-// pub fn build_machine_from_chain<P: AsRef<Path>>(
|
|
|
-// chain_path: P,
|
|
|
-// ) -> anyhow::Result<chainfile::liftover::Machine> {
|
|
|
-// let path = chain_path.as_ref();
|
|
|
-// let p = path.to_str().with_context(|| format!("Failed to parse: {}", path.display()))?;
|
|
|
-//
|
|
|
-// let mut reader = get_reader(p)?;
|
|
|
-//
|
|
|
-// let mut buf = Vec::new();
|
|
|
-// reader.read_to_end(&mut buf)?;
|
|
|
-//
|
|
|
-// let chain_reader = chainfile::Reader::new(&buf[..]);
|
|
|
-// let machine = Builder::default()
|
|
|
-// .try_build_from(chain_reader)
|
|
|
-// .context("build liftover machine from chain")?;
|
|
|
-//
|
|
|
-// Ok(machine)
|
|
|
-// }
|
|
|
-
|
|
|
+/// Builds a liftover machine from a chain file.
|
|
|
+///
|
|
|
+/// Parses a UCSC chain format file (`.chain` or `.chain.gz`) and constructs a
|
|
|
+/// [`chainfile::liftover::Machine`] for coordinate conversion between genome assemblies.
|
|
|
+///
|
|
|
+/// # Chain File Format
|
|
|
+///
|
|
|
+/// Chain files describe pairwise alignments between genome assemblies with:
|
|
|
+/// - **Header lines**: `chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id`
|
|
|
+/// - **Alignment blocks**: `size dt dq` (tab or space-delimited)
|
|
|
+/// - **Final block**: `size` (single value marking chain end)
|
|
|
+/// - **Blank lines**: Separate chains
|
|
|
+///
|
|
|
+/// # Normalization
|
|
|
+///
|
|
|
+/// This function normalizes space-delimited alignment blocks to tab-delimited format,
|
|
|
+/// as required by the `chainfile` crate. This handles chain files from sources like
|
|
|
+/// the T2T Consortium that may use space delimiters instead of tabs.
|
|
|
+///
|
|
|
+/// # Arguments
|
|
|
+///
|
|
|
+/// * `chain_path` - Path to chain file (`.chain` or `.chain.gz`)
|
|
|
+///
|
|
|
+/// # Returns
|
|
|
+///
|
|
|
+/// A configured [`chainfile::liftover::Machine`] ready for coordinate conversion.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if:
|
|
|
+/// - File cannot be read or decompressed
|
|
|
+/// - Chain file has invalid format (malformed headers, alignment blocks)
|
|
|
+/// - Parser encounters non-UTF8 data
|
|
|
+///
|
|
|
+/// # Examples
|
|
|
+///
|
|
|
+/// ```no_run
|
|
|
+/// use pandora_lib_promethion::liftover::build_machine_from_chain;
|
|
|
+///
|
|
|
+/// let machine = build_machine_from_chain("hg38ToHs1.chain.gz")?;
|
|
|
+///
|
|
|
+/// // Liftover coordinates from hg38 to hs1
|
|
|
+/// if let Some(result) = machine.liftover("chr1", 1000000) {
|
|
|
+/// println!("Lifted to: {} at {}", result.chromosome(), result.start());
|
|
|
+/// }
|
|
|
+/// # Ok::<(), anyhow::Error>(())
|
|
|
+/// ```
|
|
|
+///
|
|
|
+/// # Performance
|
|
|
+///
|
|
|
+/// Chain files are loaded entirely into memory and normalized before parsing.
|
|
|
+/// For large genome-wide chain files (~100MB compressed), expect:
|
|
|
+/// - Memory: ~500MB-1GB during construction
|
|
|
+/// - Time: 2-5 seconds on modern systems
|
|
|
pub fn build_machine_from_chain<P: AsRef<Path>>(
|
|
|
chain_path: P,
|
|
|
) -> anyhow::Result<chainfile::liftover::Machine> {
|
|
|
let path = chain_path.as_ref();
|
|
|
let p = path.to_str().context("path to string")?;
|
|
|
-
|
|
|
+
|
|
|
let reader = get_reader(p)?;
|
|
|
let buf_reader = BufReader::new(reader);
|
|
|
-
|
|
|
- // Sanitize: ensure tabs, no trailing whitespace
|
|
|
- let sanitized: Vec<u8> = buf_reader
|
|
|
+
|
|
|
+ // Normalize space-delimited alignment blocks to tab-delimited format
|
|
|
+ let normalized: Vec<u8> = buf_reader
|
|
|
.lines()
|
|
|
- .filter_map(|l| l.ok())
|
|
|
+ .map_while(Result::ok)
|
|
|
.map(|line| {
|
|
|
if line.starts_with("chain") || line.is_empty() {
|
|
|
- line
|
|
|
+ line // Keep header/empty lines as-is
|
|
|
} else {
|
|
|
- // Convert spaces to tabs for alignment blocks
|
|
|
- line.split_whitespace()
|
|
|
- .collect::<Vec<_>>()
|
|
|
- .join("\t")
|
|
|
+ // Convert alignment block spaces to tabs
|
|
|
+ line.split_whitespace().collect::<Vec<_>>().join("\t")
|
|
|
}
|
|
|
})
|
|
|
.collect::<Vec<_>>()
|
|
|
.join("\n")
|
|
|
.into_bytes();
|
|
|
-
|
|
|
- let chain_reader = chainfile::Reader::new(&sanitized[..]);
|
|
|
- Builder::default()
|
|
|
+
|
|
|
+ let chain_reader = chainfile::Reader::new(&normalized[..]);
|
|
|
+ Builder
|
|
|
.try_build_from(chain_reader)
|
|
|
.context("build liftover machine from chain")
|
|
|
}
|
|
|
-
|
|
|
-use std::io::{BufRead, BufReader};
|
|
|
-
|
|
|
-pub fn debug_chain_file<P: AsRef<Path>>(chain_path: P) -> anyhow::Result<()> {
|
|
|
- let reader = get_reader(chain_path.as_ref().to_str().unwrap())?;
|
|
|
- let buf_reader = BufReader::new(reader);
|
|
|
-
|
|
|
- for (idx, line) in buf_reader.lines().enumerate() {
|
|
|
- let line = line?;
|
|
|
-
|
|
|
- // Check the problematic line area
|
|
|
- if line.starts_with("246") {
|
|
|
- eprintln!("Line {}: {:?}", idx + 1, line);
|
|
|
- eprintln!("Bytes: {:?}", line.as_bytes());
|
|
|
- eprintln!("Fields: {:?}", line.split('\t').collect::<Vec<_>>());
|
|
|
- }
|
|
|
-
|
|
|
- // Validate alignment blocks
|
|
|
- if !line.starts_with("chain") && !line.is_empty() {
|
|
|
- let fields: Vec<&str> = line.split('\t').collect();
|
|
|
- if fields.len() != 1 && fields.len() != 3 {
|
|
|
- eprintln!("Invalid line {}: {} fields: {:?}", idx + 1, fields.len(), line);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- Ok(())
|
|
|
-}
|