|
|
@@ -1,8 +1,8 @@
|
|
|
use std::{
|
|
|
collections::{HashMap, HashSet},
|
|
|
fs::{self, File},
|
|
|
- io::{Read, Write},
|
|
|
- path::PathBuf,
|
|
|
+ io::{BufWriter, Read, Write},
|
|
|
+ path::{Path, PathBuf},
|
|
|
sync::Arc,
|
|
|
};
|
|
|
|
|
|
@@ -38,7 +38,13 @@ use crate::{
|
|
|
app_storage_dir, detect_repetition, estimate_shannon_entropy, mean, Hash128, Repeat,
|
|
|
TempFileGuard,
|
|
|
},
|
|
|
- io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header, writers::get_gz_writer},
|
|
|
+ io::{
|
|
|
+ fasta::{open_indexed_fasta, sequence_at},
|
|
|
+ liftover::build_machine_from_chain,
|
|
|
+ readers::get_reader,
|
|
|
+ vcf::vcf_header,
|
|
|
+ writers::get_gz_writer,
|
|
|
+ },
|
|
|
positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
|
|
|
run,
|
|
|
};
|
|
|
@@ -620,6 +626,76 @@ impl VariantCollection {
|
|
|
self.variants.len()
|
|
|
);
|
|
|
}
|
|
|
+
|
|
|
+ /// Liftover the whole collection and write a new VCF.
|
|
|
+ ///
|
|
|
+ /// - Lifts in parallel by `chunk_size`.
|
|
|
+ /// - Each rayon worker opens its own FASTA IndexedReader (thread-safe).
|
|
|
+ /// - Writes results in original input order.
|
|
|
+ pub fn save_into_lifted<P: AsRef<Path>>(
|
|
|
+ &self,
|
|
|
+ out_vcf_path: P,
|
|
|
+ unmapped_vcf_path: P,
|
|
|
+ chain_path: P,
|
|
|
+ target_fasta_path: P,
|
|
|
+ chunk_size: usize,
|
|
|
+ ) -> anyhow::Result<()> {
|
|
|
+ let out_vcf_path: PathBuf = out_vcf_path.as_ref().to_path_buf();
|
|
|
+ let unmapped_vcf_path: PathBuf = unmapped_vcf_path.as_ref().to_path_buf();
|
|
|
+ let chain_path: PathBuf = chain_path.as_ref().to_path_buf();
|
|
|
+ let target_fasta_path: Arc<PathBuf> = Arc::new(target_fasta_path.as_ref().to_path_buf());
|
|
|
+
|
|
|
+ let machine = Arc::new(build_machine_from_chain(chain_path.as_path())?);
|
|
|
+
|
|
|
+ let n = self.variants.len();
|
|
|
+ let mut out: Vec<Option<VcfVariant>> = vec![None; n];
|
|
|
+
|
|
|
+ out.par_chunks_mut(chunk_size).enumerate().try_for_each(
|
|
|
+ |(chunk_idx, out_chunk)| -> anyhow::Result<()> {
|
|
|
+ let mut fasta = open_indexed_fasta(target_fasta_path.as_path())?;
|
|
|
+
|
|
|
+ let start = chunk_idx * chunk_size;
|
|
|
+ let end = (start + out_chunk.len()).min(n);
|
|
|
+
|
|
|
+ for (i, slot) in (start..end).zip(out_chunk.iter_mut()) {
|
|
|
+ *slot = self.variants[i].liftover_with_fasta(&machine, &mut fasta)?;
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ },
|
|
|
+ )?;
|
|
|
+
|
|
|
+ // Write outputs (serial)
|
|
|
+ let mut w_ok = BufWriter::new(File::create(&out_vcf_path)?);
|
|
|
+ let mut w_un = BufWriter::new(File::create(&unmapped_vcf_path)?);
|
|
|
+
|
|
|
+ // --- header ---
|
|
|
+ let header_lines = [
|
|
|
+ "##fileformat=VCFv4.2",
|
|
|
+ "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE",
|
|
|
+ ];
|
|
|
+ // Replace this with however your `Vcf` stores header lines.
|
|
|
+ for line in header_lines {
|
|
|
+ writeln!(w_ok, "{line}")?;
|
|
|
+ writeln!(w_un, "{line}")?;
|
|
|
+ }
|
|
|
+
|
|
|
+ // --- records ---
|
|
|
+ for (i, mapped) in out.into_iter().enumerate() {
|
|
|
+ match mapped {
|
|
|
+ Some(v) => {
|
|
|
+ writeln!(w_ok, "{}", v.into_vcf_row())?;
|
|
|
+ }
|
|
|
+ None => {
|
|
|
+ // write original record to unmapped
|
|
|
+ writeln!(w_un, "{}", self.variants[i].into_vcf_row())?;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ w_ok.flush()?;
|
|
|
+ w_un.flush()?;
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/// Represents a consolidated genomic variant with associated information and annotations.
|
|
|
@@ -1983,7 +2059,7 @@ fn process_vep_chunk(
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
use super::*;
|
|
|
- use crate::{annotation::Caller, helpers::test_init};
|
|
|
+ use crate::{annotation::{self, Caller}, callers::clairs::ClairS, commands::Command, helpers::test_init, io::liftover::debug_chain_file, pipes::Initialize, variant::vcf_variant::Variants};
|
|
|
|
|
|
#[test]
|
|
|
fn annotate_constit() -> anyhow::Result<()> {
|
|
|
@@ -2006,4 +2082,19 @@ mod tests {
|
|
|
println!("{annotations:#?}");
|
|
|
Ok(())
|
|
|
}
|
|
|
+
|
|
|
#[test]
fn liftover_hg38() -> anyhow::Result<()> {
    test_init();

    // Inputs and outputs of the run. The chain and FASTA are absolute paths on the
    // author's machine, so this test only works where those files exist.
    let lifted_vcf = "./out_hg38.vcf";
    let unmapped_vcf = "./unmapped";
    let chain_file = "/home/t_steimle/ref/hs1/chm13v2-grch38.chain";
    let hg38_fasta = "/home/t_steimle/ref/hg38/hg38.fa";
    // Number of variants kept; also used as the chunk size, giving a single chunk.
    let n_variants: usize = 10;

    let config = Config::default();
    let annotations = Annotations::default();

    // Load the ClairS calls for sample "CHAHA" and keep only the first few.
    let clairs = ClairS::initialize("CHAHA", &config)?;
    let mut var = clairs.variants(&annotations)?;
    var.variants.truncate(n_variants);

    var.save_into_lifted(lifted_vcf, unmapped_vcf, chain_file, hg38_fasta, n_variants)?;

    Ok(())
}
|
|
|
+
|
|
|
}
|