Thomas il y a 1 mois
Parent
commit
73775d23a0
2 fichiers modifiés avec 80 ajouts et 68 suppressions
  1. 68 64
      src/io/liftover.rs
  2. 12 4
      src/variant/variant_collection.rs

+ 68 - 64
src/io/liftover.rs

@@ -1,86 +1,90 @@
-use std::{io::Read, path::Path};
-
+use crate::io::readers::get_reader;
 use anyhow::Context;
 use chainfile::liftover::machine::Builder;
+use std::io::{BufRead, BufReader};
+use std::path::Path;
 
-use crate::io::readers::get_reader;
-
-/// Build a chainfile liftover machine from a .chain (optionally gz).
-// pub fn build_machine_from_chain<P: AsRef<Path>>(
-//     chain_path: P,
-// ) -> anyhow::Result<chainfile::liftover::Machine> {
-//     let path = chain_path.as_ref();
-//     let p = path.to_str().with_context(|| format!("Failed to parse: {}", path.display()))?;
-//
-//     let mut reader = get_reader(p)?;
-//
-//     let mut buf = Vec::new();
-//     reader.read_to_end(&mut buf)?;
-//
-//     let chain_reader = chainfile::Reader::new(&buf[..]);
-//     let machine = Builder::default()
-//         .try_build_from(chain_reader)
-//         .context("build liftover machine from chain")?;
-//
-//     Ok(machine)
-// }
-
+/// Builds a liftover machine from a chain file.
+///
+/// Parses a UCSC chain format file (`.chain` or `.chain.gz`) and constructs a
+/// [`chainfile::liftover::Machine`] for coordinate conversion between genome assemblies.
+///
+/// # Chain File Format
+///
+/// Chain files describe pairwise alignments between genome assemblies with:
+/// - **Header lines**: `chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id`
+/// - **Alignment blocks**: `size dt dq` (tab or space-delimited)
+/// - **Final block**: `size` (single value marking chain end)
+/// - **Blank lines**: Separate chains
+///
+/// # Normalization
+///
+/// This function normalizes space-delimited alignment blocks to tab-delimited format,
+/// as required by the `chainfile` crate. This handles chain files from sources like
+/// the T2T Consortium that may use space delimiters instead of tabs.
+///
+/// # Arguments
+///
+/// * `chain_path` - Path to chain file (`.chain` or `.chain.gz`)
+///
+/// # Returns
+///
+/// A configured [`chainfile::liftover::Machine`] ready for coordinate conversion.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - File cannot be read or decompressed
+/// - Chain file has invalid format (malformed headers, alignment blocks)
+/// - Parser encounters non-UTF8 data
+///
+/// # Examples
+///
+/// ```no_run
+/// use pandora_lib_promethion::liftover::build_machine_from_chain;
+///
+/// let machine = build_machine_from_chain("hg38ToHs1.chain.gz")?;
+///
+/// // Liftover coordinates from hg38 to hs1
+/// if let Some(result) = machine.liftover("chr1", 1000000) {
+///     println!("Lifted to: {} at {}", result.chromosome(), result.start());
+/// }
+/// # Ok::<(), anyhow::Error>(())
+/// ```
+///
+/// # Performance
+///
+/// Chain files are loaded entirely into memory and normalized before parsing.
+/// For large genome-wide chain files (~100MB compressed), expect:
+/// - Memory: ~500MB-1GB during construction
+/// - Time: 2-5 seconds on modern systems
 pub fn build_machine_from_chain<P: AsRef<Path>>(
     chain_path: P,
 ) -> anyhow::Result<chainfile::liftover::Machine> {
     let path = chain_path.as_ref();
     let p = path.to_str().context("path to string")?;
-    
+
     let reader = get_reader(p)?;
     let buf_reader = BufReader::new(reader);
-    
-    // Sanitize: ensure tabs, no trailing whitespace
-    let sanitized: Vec<u8> = buf_reader
+
+    // Normalize space-delimited alignment blocks to tab-delimited format
+    let normalized: Vec<u8> = buf_reader
         .lines()
-        .filter_map(|l| l.ok())
+        .map_while(Result::ok)
         .map(|line| {
             if line.starts_with("chain") || line.is_empty() {
-                line
+                line // Keep header/empty lines as-is
             } else {
-                // Convert spaces to tabs for alignment blocks
-                line.split_whitespace()
-                    .collect::<Vec<_>>()
-                    .join("\t")
+                // Convert alignment block spaces to tabs
+                line.split_whitespace().collect::<Vec<_>>().join("\t")
             }
         })
         .collect::<Vec<_>>()
         .join("\n")
         .into_bytes();
-    
-    let chain_reader = chainfile::Reader::new(&sanitized[..]);
-    Builder::default()
+
+    let chain_reader = chainfile::Reader::new(&normalized[..]);
+    Builder
         .try_build_from(chain_reader)
         .context("build liftover machine from chain")
 }
-
-use std::io::{BufRead, BufReader};
-
-pub fn debug_chain_file<P: AsRef<Path>>(chain_path: P) -> anyhow::Result<()> {
-    let reader = get_reader(chain_path.as_ref().to_str().unwrap())?;
-    let buf_reader = BufReader::new(reader);
-    
-    for (idx, line) in buf_reader.lines().enumerate() {
-        let line = line?;
-        
-        // Check the problematic line area
-        if line.starts_with("246") {
-            eprintln!("Line {}: {:?}", idx + 1, line);
-            eprintln!("Bytes: {:?}", line.as_bytes());
-            eprintln!("Fields: {:?}", line.split('\t').collect::<Vec<_>>());
-        }
-        
-        // Validate alignment blocks
-        if !line.starts_with("chain") && !line.is_empty() {
-            let fields: Vec<&str> = line.split('\t').collect();
-            if fields.len() != 1 && fields.len() != 3 {
-                eprintln!("Invalid line {}: {} fields: {:?}", idx + 1, fields.len(), line);
-            }
-        }
-    }
-    Ok(())
-}

+ 12 - 4
src/variant/variant_collection.rs

@@ -2059,7 +2059,10 @@ fn process_vep_chunk(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{annotation::{self, Caller}, callers::clairs::ClairS, commands::Command, helpers::test_init, io::liftover::debug_chain_file, pipes::Initialize, variant::vcf_variant::Variants};
+    use crate::{
+        annotation::Caller, callers::clairs::ClairS, helpers::test_init, pipes::Initialize,
+        variant::vcf_variant::Variants,
+    };
 
     #[test]
     fn annotate_constit() -> anyhow::Result<()> {
@@ -2086,15 +2089,20 @@ mod tests {
     #[test]
     fn liftover_hg38() -> anyhow::Result<()> {
         test_init();
-        
+
         let config = Config::default();
         let annotations = Annotations::default();
         let mut var = ClairS::initialize("CHAHA", &config)?.variants(&annotations)?;
         var.variants.truncate(10);
 
-        var.save_into_lifted("./out_hg38.vcf", "./unmapped", "/home/t_steimle/ref/hs1/chm13v2-grch38.chain", "/home/t_steimle/ref/hg38/hg38.fa", 10)?;
+        var.save_into_lifted(
+            "./out_hg38.vcf",
+            "./unmapped",
+            "/home/t_steimle/ref/hs1/chm13v2-grch38.chain",
+            "/home/t_steimle/ref/hg38/hg38.fa",
+            10,
+        )?;
 
         Ok(())
     }
-
 }