Thomas 1 개월 전
부모
커밋
5133ac3efd
16개의 변경된 파일: 421개의 추가 및 21개의 삭제
  1. 73 0
      Cargo.lock
  2. 2 0
      Cargo.toml
  3. 12 0
      out_hg38.vcf
  4. 3 0
      src/annotation/alpha_genome.rs
  5. 1 0
      src/annotation/mod.rs
  6. 0 1
      src/callers/clairs.rs
  7. 1 0
      src/callers/savana.rs
  8. 15 0
      src/helpers.rs
  9. 17 3
      src/io/fasta.rs
  10. 86 0
      src/io/liftover.rs
  11. 1 0
      src/io/mod.rs
  12. 2 2
      src/io/readers.rs
  13. 32 0
      src/positions.rs
  14. 95 4
      src/variant/variant_collection.rs
  15. 79 11
      src/variant/vcf_variant.rs
  16. 2 0
      unmapped

+ 73 - 0
Cargo.lock

@@ -607,6 +607,19 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
 
+[[package]]
+name = "chainfile"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18bc8cdfd99379e863c3b37340240675e42879e06d9d520b531a78868c95f1c8"
+dependencies = [
+ "flate2",
+ "nonempty",
+ "omics",
+ "rust-lapper",
+ "tracing-log",
+]
+
 [[package]]
 name = "chrono"
 version = "0.4.42"
@@ -1654,6 +1667,12 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "nonempty"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "995defdca0a589acfdd1bd2e8e3b896b4d4f7675a31fd14c32611440c7f608e6"
+
 [[package]]
 name = "noodles-bgzf"
 version = "0.45.0"
@@ -1781,6 +1800,31 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
 
+[[package]]
+name = "omics"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d5390efc1c2cac670094a84678fa8f07c38fa6eeb5626cdeceea4f72262209fa"
+dependencies = [
+ "omics-coordinate",
+]
+
+[[package]]
+name = "omics-coordinate"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90a4c7dc44847a1db7bc9982c716a707d599abdc1ad0f9fbb9eb85df3b77f384"
+dependencies = [
+ "omics-core",
+ "thiserror 2.0.17",
+]
+
+[[package]]
+name = "omics-core"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84201d70b8c3a9a0d1e020a6baf61af11deff7535097c4ab2f31eb74f6ee17e5"
+
 [[package]]
 name = "once_cell"
 version = "1.21.3"
@@ -1906,6 +1950,7 @@ dependencies = [
  "bitcode",
  "blake3",
  "byte-unit",
+ "chainfile",
  "chrono",
  "csv",
  "dashmap",
@@ -1928,6 +1973,7 @@ dependencies = [
  "noodles-gff",
  "noodles-tabix",
  "num-format",
+ "omics",
  "ordered-float",
  "pandora_lib_assembler",
  "petgraph 0.8.3",
@@ -2369,6 +2415,15 @@ dependencies = [
  "url",
 ]
 
+[[package]]
+name = "rust-lapper"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2274b9cc4f205bc0945b7be3e4fc1a102b0b7119ba6482faaedb9c4f76dde5d1"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "rust_decimal"
 version = "1.39.0"
@@ -2919,6 +2974,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
 dependencies = [
  "once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
+dependencies = [
+ "log",
+ "once_cell",
+ "tracing-core",
 ]
 
 [[package]]
@@ -2980,6 +3047,12 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "valuable"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
+
 [[package]]
 name = "vcpkg"
 version = "0.2.15"

+ 2 - 0
Cargo.toml

@@ -52,6 +52,8 @@ hostname = "0.4.2"
 noodles-tabix = "0.59.0"
 noodles-bgzf = "0.45.0"
 triple_accel = "0.4.0"
+chainfile = "0.3.0"
+omics = "0.2.0"
 
 [profile.dev]
 opt-level = 0

+ 12 - 0
out_hg38.vcf

@@ -0,0 +1,12 @@
+##fileformat=VCFv4.2
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE
+chr1	898547	.	T	C	10.577	PASS	FAU=0;FCU=11;FGU=0;FTU=11;RAU=0;RCU=14;RGU=0;RTU=6	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:10:42:0.5952:0,25:0:3:0,0:0:25:0:17:0:0:0:3
+chr1	1000112	.	G	G	10.423	PASS	FAU=0;FCU=0;FGU=11;FTU=9;RAU=0;RCU=0;RGU=16;RTU=10	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:10:46:0.587:0,27:0:3:0,0:0:0:27:19:0:0:0:3
+chr1	1002308	.	T	T	12.693	PASS	H;FAU=0;FCU=13;FGU=0;FTU=10;RAU=0;RCU=8;RGU=0;RTU=15	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:12:46:0.5435:0,25:0:4:0,0:0:21:0:25:0:4:0:0
+chr1	1002736	.	T	T	11.459	PASS	H;FAU=0;FCU=0;FGU=15;FTU=9;RAU=0;RCU=0;RGU=9;RTU=14	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:11:47:0.4894:0,23:0:4:0,0:0:0:24:23:0:0:4:0
+chr1	1007229	.	T	C	8.222	PASS	H;FAU=0;FCU=10;FGU=0;FTU=10;RAU=0;RCU=12;RGU=0;RTU=5	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:8:37:0.5946:0,22:0.3333:3:0,1:0:22:0:15:0:1:0:2
+chr1	1036754	.	C	C	8.888	PASS	H;FAU=0;FCU=12;FGU=0;FTU=11;RAU=0;RCU=15;RGU=0;RTU=14	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:8:52:0.5192:0,27:0.25:4:0,1:0:27:0:25:0:1:0:3
+chr1	1048681	.	C	T	15.842	PASS	H;FAU=0;FCU=6;FGU=0;FTU=15;RAU=0;RCU=7;RGU=0;RTU=13	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:15:41:0.6829:0,28:0:7:0,0:0:13:0:28:0:7:0:0
+chr1	1048722	.	G	A	16.495	PASS	H;FAU=14;FCU=0;FGU=7;FTU=0;RAU=13;RCU=0;RGU=8;RTU=0	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:16:42:0.6429:0,27:0:7:0,0:27:0:15:0:0:0:7:0
+chr1	1048822	.	A	G	13.393	PASS	H;FAU=6;FCU=0;FGU=14;FTU=0;RAU=8;RCU=0;RGU=12;RTU=0	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:13:40:0.65:0,26:0:6:0,0:14:0:26:0:6:0:0:0
+chr1	1048922	.	T	T	13.211	PASS	H;FAU=0;FCU=7;FGU=0;FTU=14;RAU=0;RCU=8;RGU=0;RTU=11	GT:GQ:DP:AF:AD:NAF:NDP:NAD:AU:CU:GU:TU:NAU:NCU:NGU:NTU	0/1:13:40:0.625:0,25:0:6:0,0:0:15:0:25:0:6:0:0

+ 3 - 0
src/annotation/alpha_genome.rs

@@ -0,0 +1,3 @@
+
+
+

+ 1 - 0
src/annotation/mod.rs

@@ -3,6 +3,7 @@ pub mod echtvar;
 pub mod gnomad;
 pub mod ncbi;
 pub mod vep;
+pub mod alpha_genome;
 
 use std::{
     collections::{HashMap, HashSet},

+ 0 - 1
src/callers/clairs.rs

@@ -986,7 +986,6 @@ mod tests {
         test_init();
         let config = Config::default();
 
-        merge_haplotaged_tmp_bam(&config, "DUMCO")?;
         Ok(())
     }
 }

+ 1 - 0
src/callers/savana.rs

@@ -215,6 +215,7 @@ impl SlurmRunner for Savana {
         .to_args()
     }
 }
+
 impl SbatchRunner for Savana {
     fn slurm_params(&self) -> SlurmParams {
         SlurmParams {

+ 15 - 0
src/helpers.rs

@@ -1169,6 +1169,21 @@ pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
     }
 }
 
/// Reverse-complement a DNA sequence.
///
/// Input is upper-cased first, so case is not preserved. Supports the full
/// IUPAC nucleotide alphabet (A/C/G/T plus ambiguity codes R, Y, S, W, K, M,
/// B, D, H, V and N); any other byte maps to `N`.
pub fn revcomp(s: &str) -> String {
    /// Complement of a single (upper-case) IUPAC nucleotide code.
    fn comp(b: u8) -> u8 {
        match b {
            b'A' => b'T',
            b'C' => b'G',
            b'G' => b'C',
            b'T' => b'A',
            // IUPAC ambiguity codes: complement swaps the base sets.
            b'R' => b'Y', // A/G -> T/C
            b'Y' => b'R', // C/T -> G/A
            b'S' => b'S', // G/C -> C/G
            b'W' => b'W', // A/T -> T/A
            b'K' => b'M', // G/T -> C/A
            b'M' => b'K', // A/C -> T/G
            b'B' => b'V', // not A -> not T
            b'V' => b'B',
            b'D' => b'H', // not C -> not G
            b'H' => b'D',
            b'N' => b'N',
            // Unknown bytes degrade to N rather than panicking.
            _ => b'N',
        }
    }
    let v: Vec<u8> = s
        .as_bytes()
        .iter()
        .rev()
        .map(|&b| comp(b.to_ascii_uppercase()))
        .collect();
    // Safe: `comp` only ever returns ASCII bytes.
    String::from_utf8(v).expect("revcomp produces ASCII only")
}
+
 #[cfg(test)]
 mod tests {
     use log::info;

+ 17 - 3
src/io/fasta.rs

@@ -1,6 +1,20 @@
-use std::fs::File;
-
-use noodles_fasta::io::IndexedReader;
+use std::{fs::File, path::Path};
+
+use noodles_fasta::io::{BufReader as NoodlesBufReader, IndexedReader};
+
+/// Open FASTA with its .fai index.
+///
+/// Note: noodles API differs slightly by version; adjust this function if needed.
+/// Commonly you either:
+/// - use a Builder that reads `{path}.fai`, or
+/// - create an IndexedReader from a reader + index.
+///
+/// This version assumes there is a `indexed_reader::Builder`.
+pub fn open_indexed_fasta(reference: &Path) -> anyhow::Result<IndexedReader<NoodlesBufReader<File>>> {
+    noodles_fasta::io::indexed_reader::Builder::default()
+        .build_from_path(reference)
+        .map_err(|e| anyhow::anyhow!("Failed to open indexed FASTA {}: {e}", reference.display()))
+}
 
 // 0-based position in input
 pub fn sequence_at(

+ 86 - 0
src/io/liftover.rs

@@ -0,0 +1,86 @@
+use std::{io::Read, path::Path};
+
+use anyhow::Context;
+use chainfile::liftover::machine::Builder;
+
+use crate::io::readers::get_reader;
+
+/// Build a chainfile liftover machine from a .chain (optionally gz).
+// pub fn build_machine_from_chain<P: AsRef<Path>>(
+//     chain_path: P,
+// ) -> anyhow::Result<chainfile::liftover::Machine> {
+//     let path = chain_path.as_ref();
+//     let p = path.to_str().with_context(|| format!("Failed to parse: {}", path.display()))?;
+//
+//     let mut reader = get_reader(p)?;
+//
+//     let mut buf = Vec::new();
+//     reader.read_to_end(&mut buf)?;
+//
+//     let chain_reader = chainfile::Reader::new(&buf[..]);
+//     let machine = Builder::default()
+//         .try_build_from(chain_reader)
+//         .context("build liftover machine from chain")?;
+//
+//     Ok(machine)
+// }
+
+pub fn build_machine_from_chain<P: AsRef<Path>>(
+    chain_path: P,
+) -> anyhow::Result<chainfile::liftover::Machine> {
+    let path = chain_path.as_ref();
+    let p = path.to_str().context("path to string")?;
+    
+    let reader = get_reader(p)?;
+    let buf_reader = BufReader::new(reader);
+    
+    // Sanitize: ensure tabs, no trailing whitespace
+    let sanitized: Vec<u8> = buf_reader
+        .lines()
+        .filter_map(|l| l.ok())
+        .map(|line| {
+            if line.starts_with("chain") || line.is_empty() {
+                line
+            } else {
+                // Convert spaces to tabs for alignment blocks
+                line.split_whitespace()
+                    .collect::<Vec<_>>()
+                    .join("\t")
+            }
+        })
+        .collect::<Vec<_>>()
+        .join("\n")
+        .into_bytes();
+    
+    let chain_reader = chainfile::Reader::new(&sanitized[..]);
+    Builder::default()
+        .try_build_from(chain_reader)
+        .context("build liftover machine from chain")
+}
+
+use std::io::{BufRead, BufReader};
+
+pub fn debug_chain_file<P: AsRef<Path>>(chain_path: P) -> anyhow::Result<()> {
+    let reader = get_reader(chain_path.as_ref().to_str().unwrap())?;
+    let buf_reader = BufReader::new(reader);
+    
+    for (idx, line) in buf_reader.lines().enumerate() {
+        let line = line?;
+        
+        // Check the problematic line area
+        if line.starts_with("246") {
+            eprintln!("Line {}: {:?}", idx + 1, line);
+            eprintln!("Bytes: {:?}", line.as_bytes());
+            eprintln!("Fields: {:?}", line.split('\t').collect::<Vec<_>>());
+        }
+        
+        // Validate alignment blocks
+        if !line.starts_with("chain") && !line.is_empty() {
+            let fields: Vec<&str> = line.split('\t').collect();
+            if fields.len() != 1 && fields.len() != 3 {
+                eprintln!("Invalid line {}: {} fields: {:?}", idx + 1, fields.len(), line);
+            }
+        }
+    }
+    Ok(())
+}

+ 1 - 0
src/io/mod.rs

@@ -11,3 +11,4 @@ pub mod writers;
 pub mod straglr;
 pub mod tsv;
 pub mod modkit;
+pub mod liftover;

+ 2 - 2
src/io/readers.rs

@@ -15,7 +15,7 @@ pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
         .collect::<Vec<&str>>()
         .last()
         .context(format!("Can't parse {path}"))?;
-    assert!(file_type == "gz" || file_type == "vcf" || file_type == "bed" || file_type == "tsv" || file_type == "json");
+    assert!(file_type == "gz" || file_type == "vcf" || file_type == "bed" || file_type == "tsv" || file_type == "json" || file_type == "chain");
 
     let raw_reader: Box<dyn std::io::Read> = Box::new(File::open(path)?);
 
@@ -24,7 +24,7 @@ pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
             let reader = Box::new(BGZFReader::new(raw_reader)?);
             Ok(Box::new(BufReader::new(reader)))
         }
-        "vcf" | "bed" | "tsv" | "json" => Ok(Box::new(BufReader::new(raw_reader))),
+        "vcf" | "bed" | "tsv" | "json" | "chain" => Ok(Box::new(BufReader::new(raw_reader))),
         t => {
             panic!("unknown file type: {}", t)
         }

+ 32 - 0
src/positions.rs

@@ -7,6 +7,7 @@ use std::{
 
 use anyhow::Context;
 use bitcode::{Decode, Encode};
+use chainfile::liftover::Machine;
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 
@@ -73,6 +74,37 @@ impl GenomePosition {
     pub fn contig(&self) -> String {
         num_to_contig(self.contig)
     }
+
+    /// Liftover this position using a UCSC chainfile Machine.
+    ///
+    /// - Coordinates are 0-based (interbase) end-to-end
+    /// - Returns `None` if the position is unmappable
+    pub fn liftover(&self, machine: &Machine) -> anyhow::Result<Option<GenomePosition>> {
+        let chrom = self.contig(); // e.g. "chr1"
+
+        let start = self.position as u64;
+        let end = start + 1;
+
+        // UCSC / omics interval: chr:+:start-end (interbase)
+        let interval_str = format!("{chrom}:+:{start}-{end}");
+        let interval: omics::coordinate::Interval<omics::coordinate::system::Interbase> = interval_str
+            .parse()
+            .with_context(|| format!("Invalid interval: {interval_str}"))?;
+
+        let lifted = match machine.liftover(interval) {
+            Some(v) if !v.is_empty() => v[0].clone(),
+            _ => return Ok(None),
+        };
+
+        let target = lifted.query();
+
+        let new_contig = contig_to_num(target.contig());
+
+        Ok(Some(GenomePosition {
+            contig: new_contig,
+            position: target.start().position().get() as u32, // still 0-based
+        }))
+    }
 }
 
 /// Implements the `Display` trait for `GenomePosition`.

+ 95 - 4
src/variant/variant_collection.rs

@@ -1,8 +1,8 @@
 use std::{
     collections::{HashMap, HashSet},
     fs::{self, File},
-    io::{Read, Write},
-    path::PathBuf,
+    io::{BufWriter, Read, Write},
+    path::{Path, PathBuf},
     sync::Arc,
 };
 
@@ -38,7 +38,13 @@ use crate::{
         app_storage_dir, detect_repetition, estimate_shannon_entropy, mean, Hash128, Repeat,
         TempFileGuard,
     },
-    io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header, writers::get_gz_writer},
+    io::{
+        fasta::{open_indexed_fasta, sequence_at},
+        liftover::build_machine_from_chain,
+        readers::get_reader,
+        vcf::vcf_header,
+        writers::get_gz_writer,
+    },
     positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
     run,
 };
@@ -620,6 +626,76 @@ impl VariantCollection {
             self.variants.len()
         );
     }
+
+    /// Liftover the whole collection and write a new VCF.
+    ///
+    /// - Lifts in parallel by `chunk_size`.
+    /// - Each rayon worker opens its own FASTA IndexedReader (thread-safe).
+    /// - Writes results in original input order.
+    pub fn save_into_lifted<P: AsRef<Path>>(
+        &self,
+        out_vcf_path: P,
+        unmapped_vcf_path: P,
+        chain_path: P,
+        target_fasta_path: P,
+        chunk_size: usize,
+    ) -> anyhow::Result<()> {
+        let out_vcf_path: PathBuf = out_vcf_path.as_ref().to_path_buf();
+        let unmapped_vcf_path: PathBuf = unmapped_vcf_path.as_ref().to_path_buf();
+        let chain_path: PathBuf = chain_path.as_ref().to_path_buf();
+        let target_fasta_path: Arc<PathBuf> = Arc::new(target_fasta_path.as_ref().to_path_buf());
+
+        let machine = Arc::new(build_machine_from_chain(chain_path.as_path())?);
+
+        let n = self.variants.len();
+        let mut out: Vec<Option<VcfVariant>> = vec![None; n];
+
+        out.par_chunks_mut(chunk_size).enumerate().try_for_each(
+            |(chunk_idx, out_chunk)| -> anyhow::Result<()> {
+                let mut fasta = open_indexed_fasta(target_fasta_path.as_path())?;
+
+                let start = chunk_idx * chunk_size;
+                let end = (start + out_chunk.len()).min(n);
+
+                for (i, slot) in (start..end).zip(out_chunk.iter_mut()) {
+                    *slot = self.variants[i].liftover_with_fasta(&machine, &mut fasta)?;
+                }
+                Ok(())
+            },
+        )?;
+
+        // Write outputs (serial)
+        let mut w_ok = BufWriter::new(File::create(&out_vcf_path)?);
+        let mut w_un = BufWriter::new(File::create(&unmapped_vcf_path)?);
+
+        // --- header ---
+        let header_lines = [
+            "##fileformat=VCFv4.2",
+            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE",
+        ];
+        // Replace this with however your `Vcf` stores header lines.
+        for line in header_lines {
+            writeln!(w_ok, "{line}")?;
+            writeln!(w_un, "{line}")?;
+        }
+
+        // --- records ---
+        for (i, mapped) in out.into_iter().enumerate() {
+            match mapped {
+                Some(v) => {
+                    writeln!(w_ok, "{}", v.into_vcf_row())?;
+                }
+                None => {
+                    // write original record to unmapped
+                    writeln!(w_un, "{}", self.variants[i].into_vcf_row())?;
+                }
+            }
+        }
+
+        w_ok.flush()?;
+        w_un.flush()?;
+        Ok(())
+    }
 }
 
 /// Represents a consolidated genomic variant with associated information and annotations.
@@ -1983,7 +2059,7 @@ fn process_vep_chunk(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{annotation::Caller, helpers::test_init};
+    use crate::{annotation::{self, Caller}, callers::clairs::ClairS, commands::Command, helpers::test_init, io::liftover::debug_chain_file, pipes::Initialize, variant::vcf_variant::Variants};
 
     #[test]
     fn annotate_constit() -> anyhow::Result<()> {
@@ -2006,4 +2082,19 @@ mod tests {
         println!("{annotations:#?}");
         Ok(())
     }
+
+    #[test]
+    fn liftover_hg38() -> anyhow::Result<()> {
+        test_init();
+        
+        let config = Config::default();
+        let annotations = Annotations::default();
+        let mut var = ClairS::initialize("CHAHA", &config)?.variants(&annotations)?;
+        var.variants.truncate(10);
+
+        var.save_into_lifted("./out_hg38.vcf", "./unmapped", "/home/t_steimle/ref/hs1/chm13v2-grch38.chain", "/home/t_steimle/ref/hg38/hg38.fa", 10)?;
+
+        Ok(())
+    }
+
 }

+ 79 - 11
src/variant/vcf_variant.rs

@@ -107,24 +107,16 @@
 //! ```
 
 use crate::{
-    annotation::Annotations,
-    helpers::{estimate_shannon_entropy, mean, Hash128},
-    pipes::ShouldRun,
-    positions::{GenomePosition, GetGenomePosition, VcfPosition},
-    runners::Run,
-    variant::variant_collection::VariantCollection,
+    annotation::Annotations, helpers::{Hash128, estimate_shannon_entropy, mean, revcomp}, io::fasta::sequence_range, pipes::ShouldRun, positions::{GenomePosition, GetGenomePosition, VcfPosition, contig_to_num}, runners::Run, variant::variant_collection::VariantCollection
 };
 use anyhow::{anyhow, Context};
 use bitcode::{Decode, Encode};
+use chainfile::liftover::Machine;
 use log::{error, info};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use std::{
-    cmp::Ordering,
-    collections::{BTreeSet, HashSet},
-    fmt,
-    hash::Hash,
-    str::FromStr,
+    cmp::Ordering, collections::{BTreeSet, HashSet}, fmt, fs::File, hash::Hash, str::FromStr
 };
 
 /// Represents a variant in the Variant Call Format (VCF).
@@ -276,6 +268,82 @@ impl VcfVariant {
         columns.join("\t")
     }
 
    /// Liftover this variant's coordinates and rewrite REF (and possibly ALT)
    /// using the target-assembly FASTA.
    ///
    /// - The lifted interval is defined by the REF allele length (interbase
    ///   `[pos, pos + len(REF))`), so indels lift as a span, not a point.
    /// - REF is re-fetched from the target FASTA (the chain alone does not
    ///   carry sequence), and both REF and ALT are reverse-complemented when
    ///   the mapping lands on the minus strand.
    ///
    /// Returns `Ok(None)` for unmappable positions or an empty REF.
    ///
    /// NOTE(review): the rewritten REF can end up equal to ALT when the target
    /// base already carries the alternate allele — the committed out_hg38.vcf
    /// shows such REF==ALT records. Consider dropping or re-normalizing them.
    pub fn liftover_with_fasta(
        &self,
        machine: &Machine,
        fasta: &mut noodles_fasta::io::IndexedReader<noodles_fasta::io::BufReader<File>>,
    ) -> anyhow::Result<Option<Self>> {
        let src_contig = self.position.contig(); // "chr1" etc.
        let src_start0 = self.position.position as u64;

        let ref_seq = self.reference.to_string();
        let ref_len = ref_seq.len();
        // An empty REF cannot define an interval; treat as unmappable.
        if ref_len == 0 {
            return Ok(None);
        }

        // interbase interval [start, start+len)
        let start = src_start0;
        let end = src_start0 + ref_len as u64;

        let interval_str = format!("{src_contig}:+:{start}-{end}");
        let interval: omics::coordinate::interval::interbase::Interval = interval_str
            .parse()
            .with_context(|| format!("Invalid interval: {interval_str}"))?;

        let pairs = match machine.liftover(interval) {
            Some(v) if !v.is_empty() => v,
            _ => return Ok(None),
        };

        // First mapping is used. NOTE(review): presumed best-scoring — confirm
        // whether `chainfile` orders results by score. Also, a REF spanning a
        // chain gap may come back as several shorter pieces; that case is not
        // detected here — TODO confirm lifted span length equals ref_len.
        let target = pairs[0].query();

        let tgt_contig_name = target.contig();
        let tgt_contig = contig_to_num(tgt_contig_name);

        // NOTE(review): assumes `target.start()` is the leftmost 0-based
        // coordinate even for minus-strand mappings — verify in omics docs.
        let tgt_start0 = target.start().position().get() as usize; // 0-based
        let tgt_end0_inclusive = tgt_start0 + ref_len - 1;

        // Strand: adjust accessor to your omics version if needed.
        let on_reverse = target.strand().to_string() == "-";

        // Fetch REF from target assembly
        let mut new_ref = sequence_range(fasta, tgt_contig_name, tgt_start0, tgt_end0_inclusive)
            .with_context(|| format!("FASTA query failed at {tgt_contig_name}:{tgt_start0}-{tgt_end0_inclusive}"))?;

        if on_reverse {
            new_ref = revcomp(&new_ref);
        }

        // Rewrite alleles
        let mut out = self.clone();
        out.position = GenomePosition {
            contig: tgt_contig,
            // NOTE(review): `as u32` truncates silently if the coordinate
            // ever exceeds u32::MAX — fine for human contigs, fragile beyond.
            position: tgt_start0 as u32,
        };
        out.reference = new_ref
            .parse()
            .context("Failed to parse rewritten REF")?;

        if on_reverse {
            // Only revcomp ALT if it is a plain sequence allele.
            // If you have symbolic alleles (<DEL>, etc.), guard here.
            let alt = out.alternative.to_string();
            let alt_rc = revcomp(&alt);
            out.alternative = alt_rc
                .parse()
                .context("Failed to parse rewritten ALT")?;
        }

        Ok(Some(out))
    }
+
     /// Returns the hash of the variant.
     pub fn hash(&self) -> Hash128 {
         self.hash

+ 2 - 0
unmapped

@@ -0,0 +1,2 @@
+##fileformat=VCFv4.2
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE