فهرست منبع

New BGZF writer: remove the bgzip crate because it caused errors when writing large bgzip files

Thomas 1 روز پیش
والد
کامیت
15bce449b8
5 فایلهای تغییر یافته به همراه 61 افزوده شده و 70 حذف شده
  1. 0 13
      Cargo.lock
  2. 1 1
      Cargo.toml
  3. 45 29
      src/io/readers.rs
  4. 4 6
      src/io/vcf.rs
  5. 11 21
      src/variant/variant_collection.rs

+ 0 - 13
Cargo.lock

@@ -358,18 +358,6 @@ version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
-[[package]]
-name = "bgzip"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b64fd8980fb64af5951bc05de7772b598150a6f7eac42ec17f73e8489915f99b"
-dependencies = [
- "flate2",
- "log",
- "rayon",
- "thiserror 1.0.69",
-]
-
 [[package]]
 name = "bindgen"
 version = "0.69.5"
@@ -1945,7 +1933,6 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "arrow",
- "bgzip",
  "bitcode",
  "blake3",
  "byte-unit",

+ 1 - 1
Cargo.toml

@@ -25,7 +25,7 @@ indicatif = "0.17.8"
 rust-htslib = "0.51.0"
 arrow = { git = "https://github.com/apache/arrow-rs" }
 # arrow = "54.2.1"
-bgzip = "0.3.1"
+# bgzip = "0.3.1"
 dashmap = { version = "6.1.0", features = ["rayon", "serde"] }
 noodles-fasta = "0.58.0"
 noodles-core = "0.18.0"

+ 45 - 29
src/io/readers.rs

@@ -5,39 +5,46 @@ use std::{
 };
 
 use anyhow::Context;
-use bgzip::{BGZFReader, BGZFWriter, Compression};
 use log::debug;
+use noodles_bgzf as bgzf;
 
-pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn std::io::Read>> {
+use crate::io::writers::{finalize_bgzf_file, get_gz_writer};
+
+pub type BGZFReader<R> = bgzf::io::Reader<R>;
+pub type BGZFWriter<W> = bgzf::io::Writer<W>;
+
+pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn Read>> {
     debug!("Reading: {path}");
-    let file_type = *path
-        .split(".")
-        .collect::<Vec<&str>>()
-        .last()
-        .context(format!("Can't parse {path}"))?;
-    assert!(file_type == "gz" || file_type == "vcf" || file_type == "bed" || file_type == "tsv" || file_type == "json" || file_type == "chain");
 
-    let raw_reader: Box<dyn std::io::Read> = Box::new(File::open(path)?);
+    let file_type = Path::new(path)
+        .extension()
+        .and_then(|s| s.to_str())
+        .with_context(|| format!("can't parse extension from {path}"))?;
+
+    anyhow::ensure!(
+        matches!(file_type, "gz" | "vcf" | "bed" | "tsv" | "json" | "chain"),
+        "unknown file type: {file_type}"
+    );
+
+    let raw_reader = File::open(path).with_context(|| format!("failed to open {path}"))?;
 
     match file_type {
         "gz" => {
-            let reader = Box::new(BGZFReader::new(raw_reader)?);
+            let reader = BGZFReader::new(raw_reader);
             Ok(Box::new(BufReader::new(reader)))
         }
         "vcf" | "bed" | "tsv" | "json" | "chain" => Ok(Box::new(BufReader::new(raw_reader))),
-        t => {
-            panic!("unknown file type: {}", t)
-        }
+        _ => unreachable!(),
     }
 }
 
 pub fn get_gz_reader(path: &str) -> anyhow::Result<BGZFReader<File>> {
     debug!("Reading: {path}");
-    let file_type = *path
-        .split(".")
-        .collect::<Vec<&str>>()
-        .last()
-        .context("Can't parse {path}.")?;
+
+    let file_type = Path::new(path)
+        .extension()
+        .and_then(|s| s.to_str())
+        .with_context(|| format!("can't parse extension from {path}"))?;
 
     let path = if file_type != "gz" {
         compress_to_bgzip(path)?
@@ -45,33 +52,42 @@ pub fn get_gz_reader(path: &str) -> anyhow::Result<BGZFReader<File>> {
         path.to_string()
     };
 
-    let reader = File::open(path)?;
-    Ok(BGZFReader::new(reader)?)
+    let file = File::open(&path).with_context(|| format!("failed to open BGZF file: {path}"))?;
+
+    Ok(BGZFReader::new(file))
 }
 
 pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
-    let output_path = format!("{}.gz", input_path);
+    let output_path = format!("{input_path}.gz");
 
     if Path::new(&output_path).exists() {
         return Ok(output_path);
     }
 
     debug!("Compressing {input_path}");
-    let input_file = File::open(input_path)?;
+
+    let input_file = File::open(input_path)
+        .with_context(|| format!("failed to open input file: {input_path}"))?;
     let mut reader = BufReader::new(input_file);
 
-    let output_file = File::create(&output_path)?;
-    let mut writer = BGZFWriter::new(output_file, Compression::default());
+    let mut writer = get_gz_writer(&output_path, false)?;
 
-    let mut buffer = [0; 8192];
+    let mut buffer = [0u8; 8192];
     loop {
-        let bytes_read = reader.read(&mut buffer)?;
-        if bytes_read == 0 {
+        let n = reader
+            .read(&mut buffer)
+            .with_context(|| format!("failed reading input file: {input_path}"))?;
+
+        if n == 0 {
             break;
         }
-        writer.write_all(&buffer[..bytes_read])?;
+
+        writer
+            .write_all(&buffer[..n])
+            .with_context(|| format!("failed writing BGZF file: {output_path}"))?;
     }
 
-    writer.close()?;
+    finalize_bgzf_file(writer, &output_path)?;
+
     Ok(output_path)
 }

+ 4 - 6
src/io/vcf.rs

@@ -4,10 +4,9 @@ use std::{
 };
 
 use anyhow::Context;
-use bgzip::{write::BGZFMultiThreadWriter, Compression};
 use log::{info, warn};
 
-use crate::variant::vcf_variant::VcfVariant;
+use crate::{io::writers::{finalize_bgzf_file, get_gz_writer}, variant::vcf_variant::VcfVariant};
 
 use super::{dict::read_dict, readers::get_reader};
 
@@ -32,8 +31,8 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
 
 pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
     info!("Writing: {path}");
-    let file = File::create(path)?;
-    let mut writer = BGZFMultiThreadWriter::new(file, Compression::default());
+    let mut writer = get_gz_writer(path, true)?;
+    // write!(writer, b"##fileformat=VCFv4.2\n")
     writer.write_all(b"##fileformat=VCFv4.2\n")?;
     writer.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
 
@@ -44,8 +43,7 @@ pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
         ))?;
     }
 
-    writer.close()?;
-    Ok(())
+    finalize_bgzf_file(writer, path)
 }
 
 #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]

+ 11 - 21
src/variant/variant_collection.rs

@@ -7,7 +7,7 @@ use std::{
 };
 
 use anyhow::Context;
-use bgzip::{BGZFReader, BGZFWriter};
+// use bgzip::{BGZFReader, BGZFWriter};
 use bitcode::{Decode, Encode};
 use csv::ReaderBuilder;
 use dashmap::DashMap;
@@ -22,30 +22,24 @@ use super::vcf_variant::{
 };
 use crate::{
     annotation::{
-        cosmic::Cosmic,
-        echtvar::{parse_echtvar_val, run_echtvar},
-        gnomad::GnomAD,
-        parse_trinuc,
-        vep::{get_best_vep, VepJob, VepLine, VEP},
-        Annotation, Annotations,
+        Annotation, Annotations, cosmic::Cosmic, echtvar::{parse_echtvar_val, run_echtvar}, gnomad::GnomAD, parse_trinuc, vep::{VEP, VepJob, VepLine, get_best_vep}
     },
     collection::{
-        bam::{counts_at, PileBase},
+        bam::{PileBase, counts_at},
         vcf::Vcf,
     },
     config::Config,
     helpers::{
-        app_storage_dir, detect_repetition, estimate_shannon_entropy, mean, Hash128, Repeat,
-        TempFileGuard,
+        Hash128, Repeat, TempFileGuard, app_storage_dir, detect_repetition, estimate_shannon_entropy, mean
     },
     io::{
         fasta::{open_indexed_fasta, sequence_at},
         liftover::build_machine_from_chain,
-        readers::get_reader,
+        readers::{get_gz_reader, get_reader},
         vcf::vcf_header,
-        writers::get_gz_writer,
+        writers::{finalize_bgzf_file, get_gz_writer},
     },
-    positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
+    positions::{GenomePosition, GenomeRange, GetGenomePosition, overlaps_par},
     run,
 };
 
@@ -1418,15 +1412,13 @@ impl Variants {
     /// This method uses BGZF compression, which is compatible with standard gzip decompression.
     /// The resulting file can be read using standard gzip-aware tools.
     pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
-        let file = File::create(filename)
-            .with_context(|| format!("Failed to create file: {}", filename))?;
-        let mut writer = BGZFWriter::new(file, bgzip::Compression::default());
+        let mut writer = get_gz_writer(filename, true)
+            .with_context(|| format!("Failed to open writer to file: {}", filename))?;
 
         serde_json::to_writer(&mut writer, self)
             .with_context(|| format!("Failed to serialize JSON to file: {}", filename))?;
 
-        writer
-            .close()
+        finalize_bgzf_file(writer, filename)
             .with_context(|| format!("Failed to close BGZF writer for file: {}", filename))?;
 
         debug!("Successfully saved variants to {}", filename);
@@ -1467,9 +1459,7 @@ impl Variants {
     /// This method expects the input file to be in BGZF-compressed JSON format,
     /// typically created by the `save_to_json` method of this struct.
     pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
-        let file =
-            File::open(filename).with_context(|| format!("Failed to open file: {}", filename))?;
-        let mut reader = BGZFReader::new(file)
+        let mut reader = get_gz_reader(filename)
             .with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;
 
         let variants: Self = serde_json::from_reader(&mut reader)