Browse Source

fix atomic BGZF write in readers.rs and writers.rs using TempFileGuard

The previous atomic-write attempt (readers.rs commit f257296) had a runtime
bug: the temp path format!("{output}.gz.tmp") does not end in .gz, causing
get_gz_writer to bail immediately with "file should end with .gz". cargo build
cannot catch this — it is a runtime check.

Fix both compress_to_bgzip (readers.rs) and convert_bgz (writers.rs) to use
TempFileGuard.tmp_path(".gz", same_dir) which:
  - Generates a UUID-named temp file ending in .gz (passes get_gz_writer check)
  - Places the temp file in the same directory as the output, ensuring
    fs::rename is an atomic metadata swap (same filesystem)
  - Automatically removes the temp file on failure or panic via Drop,
    so a crashed write never leaves an orphan temp file behind
  - disarm() on success prevents cleanup of the already-renamed file

Also fix convert_bgz to check force before writing (not after), so an existing
output with force=false is rejected before any I/O begins.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 tháng trước cách đây
mục cha
commit
c3934a3c28
2 tập tin đã thay đổi với 97 bổ sung59 xóa
  1. 18 13
      src/io/readers.rs
  2. 79 46
      src/io/writers.rs

+ 18 - 13
src/io/readers.rs

@@ -14,7 +14,7 @@ use anyhow::Context;
 use log::debug;
 use noodles_bgzf as bgzf;
 
-use crate::io::writers::{finalize_bgzf_file, get_gz_writer};
+use crate::{helpers::TempFileGuard, io::writers::{finalize_bgzf_file, get_gz_writer}};
 
 /// Type alias for a BGZF reader wrapping any `Read` source.
 pub type BGZFReader<R> = bgzf::io::Reader<R>;
@@ -100,10 +100,10 @@ pub fn get_gz_reader(path: &str) -> anyhow::Result<BGZFReader<File>> {
 /// Compress a plain-text file to BGZF, returning the path of the `.gz` output.
 ///
 /// If `<input_path>.gz` already exists it is returned immediately (cached).
-/// Otherwise the file is written atomically: data is first written to a
-/// temporary `.gz.tmp` file, then renamed to `.gz` on success. This prevents
-/// a partially-written `.gz` from being treated as valid on a subsequent call
-/// after a failed write.
+/// Otherwise the file is written atomically: a UUID-named temp file is created
+/// in the same directory as the output (same filesystem, guaranteeing an atomic
+/// rename), then renamed to `<input_path>.gz` on success. A [`TempFileGuard`]
+/// ensures the temp file is removed automatically if the function fails or panics.
 ///
 /// # Arguments
 ///
@@ -126,30 +126,35 @@ pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
 
     debug!("Compressing {input_path} → {output_path}");
 
-    let tmp_path = format!("{output_path}.tmp");
+    let tmp_dir = Path::new(input_path)
+        .parent()
+        .unwrap_or(Path::new("."));
+
+    let mut guard = TempFileGuard::new();
+    let tmp_path = guard.tmp_path(".gz", tmp_dir);
+    let tmp_str = tmp_path.to_string_lossy();
 
     let input_file = File::open(input_path)
         .with_context(|| format!("failed to open input file: {input_path}"))?;
     let mut reader = BufReader::new(input_file);
-    let mut writer = get_gz_writer(&tmp_path, true)?;
+    let mut writer = get_gz_writer(&tmp_str, false)?;
 
     let mut buffer = [0u8; 8192];
     loop {
         let n = reader
             .read(&mut buffer)
             .with_context(|| format!("failed reading {input_path}"))?;
-        if n == 0 {
-            break;
-        }
+        if n == 0 { break; }
         writer
             .write_all(&buffer[..n])
-            .with_context(|| format!("failed writing {tmp_path}"))?;
+            .with_context(|| format!("failed writing {tmp_str}"))?;
     }
 
-    finalize_bgzf_file(writer, &tmp_path)?;
+    finalize_bgzf_file(writer, &tmp_str)?;
 
     fs::rename(&tmp_path, &output_path)
-        .with_context(|| format!("failed to rename {tmp_path} → {output_path}"))?;
+        .with_context(|| format!("failed to rename {tmp_str} → {output_path}"))?;
 
+    guard.disarm();
     Ok(output_path)
 }

+ 79 - 46
src/io/writers.rs

@@ -1,3 +1,8 @@
+//! BGZF and plain-text file writer helpers.
+//!
+//! All BGZF writers produce files compatible with `bgzip`/HTSlib.
+//! Use [`finalize_bgzf_file`] to flush and sync every BGZF writer created here.
+
 use std::{
     fs::{self, File, OpenOptions},
     io::{BufWriter, Write},
@@ -5,42 +10,32 @@ use std::{
 };
 
 use anyhow::Context;
-// use bgzip::{BGZFWriter, Compression};
 use log::info;
-
-use crate::io::readers::get_reader;
-
-// pub fn get_gz_writer(path: &str, force: bool) -> anyhow::Result<BGZFWriter<File>> {
-//     if !path.ends_with(".gz") {
-//         anyhow::bail!("The file should end with gz");
-//     }
-//
-//     if force && Path::new(path).exists() {
-//         fs::remove_file(path).with_context(|| anyhow::anyhow!("Failed to remove file: {path}"))?;
-//     }
-//
-//     let file = OpenOptions::new()
-//         .write(true) // Open the file for writing
-//         .create_new(true)
-//         .truncate(true)
-//         .open(path)
-//         .with_context(|| anyhow::anyhow!("failed to open the file: {path}"))?;
-//
-//     info!("Writing into {path}");
-//     Ok(BGZFWriter::new(file, Compression::default()))
-// }
-
 use noodles_bgzf as bgzf;
 
+use crate::{helpers::TempFileGuard, io::readers::get_reader};
+
+/// Create a new BGZF writer for `path` (must end with `.gz`).
+///
+/// If `force` is `true` and the file already exists it is removed first.
+/// Otherwise an existing file causes an error (`create_new` semantics).
+///
+/// Call [`finalize_bgzf_file`] on the returned writer to flush, sync, and
+/// write the BGZF EOF block.
+///
+/// # Errors
+///
+/// Returns an error if the path does not end in `.gz`, removal fails when
+/// `force` is set, or the file cannot be created.
 pub fn get_gz_writer(
     path: &str,
     force: bool,
-) -> anyhow::Result<bgzf::io::Writer<BufWriter<std::fs::File>>> {
+) -> anyhow::Result<bgzf::io::Writer<BufWriter<File>>> {
     if !path.ends_with(".gz") {
         anyhow::bail!("file should end with .gz: {path}");
     }
 
-    if force && std::path::Path::new(path).exists() {
+    if force && Path::new(path).exists() {
         fs::remove_file(path).with_context(|| format!("failed to remove file: {path}"))?;
     }
 
@@ -53,47 +48,85 @@ pub fn get_gz_writer(
     Ok(bgzf::io::Writer::new(BufWriter::new(file)))
 }
 
+/// Create a new plain-text buffered writer for `path`.
+///
+/// Fails if the file already exists (`create_new` semantics).
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be created.
 pub fn get_writer(path: &str) -> anyhow::Result<BufWriter<File>> {
     let file = OpenOptions::new()
-        .write(true) // Open the file for writing
+        .write(true)
         .create_new(true)
-        .truncate(true)
         .open(path)
-        .with_context(|| anyhow::anyhow!("failed to open the file: {path}"))?;
+        .with_context(|| format!("failed to create file: {path}"))?;
 
     info!("Writing into {path}");
     Ok(BufWriter::new(file))
 }
 
+/// Compress a plain-text file to BGZF, writing atomically via a UUID temp file.
+///
+/// The output is `<input>.gz`. If the output already exists and `force` is
+/// `false`, an error is returned before any writing begins. If `force` is
+/// `true`, the existing file is replaced.
+///
+/// A UUID-named temp file is created in the same directory as the output
+/// (same filesystem, guaranteeing an atomic rename). A [`TempFileGuard`] ensures
+/// the temp file is removed automatically on failure or panic.
+///
+/// # Arguments
+///
+/// * `input` - Path to the input file (plain text or BGZF as accepted by [`get_reader`])
+/// * `force` - Overwrite the output `.gz` if it already exists
+///
+/// # Errors
+///
+/// Returns an error if the output exists and `force` is `false`, the input
+/// cannot be read, the output cannot be written, or the atomic rename fails.
 pub fn convert_bgz(input: impl AsRef<Path>, force: bool) -> anyhow::Result<()> {
     let input = input.as_ref();
+    let output_path = format!("{}.gz", input.display());
 
-    let mut reader = get_reader(&input.to_string_lossy())?;
-    let mut writer = get_gz_writer(&format!("{}.gz", input.display()), force)?;
+    if !force && Path::new(&output_path).exists() {
+        anyhow::bail!("output already exists (use force=true to overwrite): {output_path}");
+    }
 
-    std::io::copy(&mut reader, &mut writer)?;
+    let tmp_dir = input.parent().unwrap_or(Path::new("."));
+    let mut guard = TempFileGuard::new();
+    let tmp_path = guard.tmp_path(".gz", tmp_dir);
+    let tmp_str = tmp_path.to_string_lossy();
 
-    writer
-        .try_finish()
-        .with_context(|| format!("failed finishing BGZF writer: {}", input.display()))?;
+    let mut reader = get_reader(&input.to_string_lossy())?;
+    let mut writer = get_gz_writer(&tmp_str, false)?;
 
-    let mut inner = writer
-        .finish()
-        .with_context(|| format!("failed returning inner BGZF writer: {}", input.display()))?;
+    std::io::copy(&mut reader, &mut writer)
+        .with_context(|| format!("failed copying {} → {tmp_str}", input.display()))?;
 
-    inner
-        .flush()
-        .with_context(|| format!("failed flushing inner writer: {}", input.display()))?;
+    finalize_bgzf_file(writer, &tmp_str)?;
 
-    inner
-        .into_inner()
-        .with_context(|| format!("failed unwrapping BufWriter: {}", input.display()))?
-        .sync_all()
-        .with_context(|| format!("failed syncing file: {}", input.display()))?;
+    if force && Path::new(&output_path).exists() {
+        fs::remove_file(&output_path)
+            .with_context(|| format!("failed to remove existing file: {output_path}"))?;
+    }
+
+    fs::rename(&tmp_path, &output_path)
+        .with_context(|| format!("failed to rename {tmp_str} → {output_path}"))?;
 
+    guard.disarm();
     Ok(())
 }
 
+/// Flush, sync, and write the BGZF EOF block for a writer created by [`get_gz_writer`].
+///
+/// Must be called after all data has been written; omitting it leaves the file
+/// without a valid BGZF terminator.
+///
+/// # Errors
+///
+/// Returns an error if flushing, unwrapping the inner `BufWriter`, or syncing
+/// the underlying file fails.
 pub fn finalize_bgzf_file(
     writer: bgzf::io::Writer<BufWriter<File>>,
     path: &str,