浏览代码

audit and fix src/io/readers.rs

- Fix compress_to_bgzip: write to .gz.tmp then atomically rename to .gz;
  previously a failed write left a partial .gz on disk which was served
  as valid on all subsequent calls (existence check bypassed re-compression)
- Remove dead BGZFWriter type alias (only referenced in commented-out code)
- Add //! module header documenting the BGZF-not-standard-gzip assumption
- Full rustdoc on all three functions including the side-effect warning on
  get_gz_reader (it creates files on disk as a by-product of being called)
- Simplify get_reader match arm (unreachable _ arm collapsed into plain _)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 月之前
父节点
当前提交
f2572964e4
共有 1 个文件被更改,包括 79 次插入17 次删除
  1. 79 17
      src/io/readers.rs

+ 79 - 17
src/io/readers.rs

@@ -1,5 +1,11 @@
+//! File reader helpers for plain-text and BGZF-compressed genomic files.
+//!
+//! **Important:** `.gz` files are assumed to be **BGZF** (block gzip), not standard gzip.
+//! BGZF is the format produced by `bgzip` and used by BAM, VCF, BED.gz, etc.
+//! Plain `gzip` output will not decompress correctly here.
+
 use std::{
-    fs::File,
+    fs::{self, File},
     io::{BufReader, Read, Write},
     path::Path,
 };
@@ -10,9 +16,29 @@ use noodles_bgzf as bgzf;
 
 use crate::io::writers::{finalize_bgzf_file, get_gz_writer};
 
+/// Type alias for a BGZF reader wrapping any `Read` source.
 pub type BGZFReader<R> = bgzf::io::Reader<R>;
-pub type BGZFWriter<W> = bgzf::io::Writer<W>;
 
+/// Open a plain-text or BGZF-compressed file for reading.
+///
+/// The file format is detected from the extension. Supported extensions:
+/// `gz` (BGZF), `vcf`, `bed`, `tsv`, `json`, `chain`.
+///
+/// `.gz` files are opened with [`BGZFReader`] — they must be BGZF-encoded,
+/// not plain gzip.
+///
+/// # Arguments
+///
+/// * `path` - Path to the file
+///
+/// # Returns
+///
+/// A boxed `Read` impl, buffered internally.
+///
+/// # Errors
+///
+/// Returns an error if the extension is missing, unrecognised, or the file
+/// cannot be opened.
 pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn Read>> {
     debug!("Reading: {path}");
 
@@ -23,21 +49,35 @@ pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn Read>> {
 
     anyhow::ensure!(
         matches!(file_type, "gz" | "vcf" | "bed" | "tsv" | "json" | "chain"),
-        "unknown file type: {file_type}"
+        "unsupported file extension: {file_type}"
     );
 
     let raw_reader = File::open(path).with_context(|| format!("failed to open {path}"))?;
 
     match file_type {
-        "gz" => {
-            let reader = BGZFReader::new(raw_reader);
-            Ok(Box::new(BufReader::new(reader)))
-        }
-        "vcf" | "bed" | "tsv" | "json" | "chain" => Ok(Box::new(BufReader::new(raw_reader))),
-        _ => unreachable!(),
+        "gz" => Ok(Box::new(BufReader::new(BGZFReader::new(raw_reader)))),
+        _ => Ok(Box::new(BufReader::new(raw_reader))),
     }
 }
 
+/// Open a file as a BGZF reader, compressing it first if it is not already `.gz`.
+///
+/// If `path` does not end in `.gz`, [`compress_to_bgzip`] is called to produce
+/// `<path>.gz` on disk before opening it. This is a **side-effecting** operation:
+/// the compressed file is created and cached for subsequent calls.
+///
+/// # Arguments
+///
+/// * `path` - Path to the input file (plain text or BGZF)
+///
+/// # Returns
+///
+/// A [`BGZFReader`] over the (possibly newly created) `.gz` file.
+///
+/// # Errors
+///
+/// Returns an error if the extension cannot be determined, compression fails,
+/// or the resulting `.gz` file cannot be opened.
 pub fn get_gz_reader(path: &str) -> anyhow::Result<BGZFReader<File>> {
     debug!("Reading: {path}");
 
@@ -57,6 +97,26 @@ pub fn get_gz_reader(path: &str) -> anyhow::Result<BGZFReader<File>> {
     Ok(BGZFReader::new(file))
 }
 
+/// Compress a plain-text file to BGZF, returning the path of the `.gz` output.
+///
+/// If `<input_path>.gz` already exists it is returned immediately (cached).
+/// Otherwise the file is written atomically: data is first written to a
+/// temporary `.gz.tmp` file, then renamed to `.gz` on success. This prevents
+/// a partially-written `.gz` from being treated as valid on a subsequent call
+/// after a failed write.
+///
+/// # Arguments
+///
+/// * `input_path` - Path to the plain-text input file
+///
+/// # Returns
+///
+/// The path of the BGZF-compressed output file (`<input_path>.gz`).
+///
+/// # Errors
+///
+/// Returns an error if the input cannot be read, the output cannot be written,
+/// or the atomic rename fails.
 pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
     let output_path = format!("{input_path}.gz");
 
@@ -64,30 +124,32 @@ pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
         return Ok(output_path);
     }
 
-    debug!("Compressing {input_path}");
+    debug!("Compressing {input_path} → {output_path}");
+
+    let tmp_path = format!("{output_path}.tmp");
 
     let input_file = File::open(input_path)
         .with_context(|| format!("failed to open input file: {input_path}"))?;
     let mut reader = BufReader::new(input_file);
-
-    let mut writer = get_gz_writer(&output_path, false)?;
+    let mut writer = get_gz_writer(&tmp_path, true)?;
 
     let mut buffer = [0u8; 8192];
     loop {
         let n = reader
             .read(&mut buffer)
-            .with_context(|| format!("failed reading input file: {input_path}"))?;
-
+            .with_context(|| format!("failed reading {input_path}"))?;
         if n == 0 {
             break;
         }
-
         writer
             .write_all(&buffer[..n])
-            .with_context(|| format!("failed writing BGZF file: {output_path}"))?;
+            .with_context(|| format!("failed writing {tmp_path}"))?;
     }
 
-    finalize_bgzf_file(writer, &output_path)?;
+    finalize_bgzf_file(writer, &tmp_path)?;
+
+    fs::rename(&tmp_path, &output_path)
+        .with_context(|| format!("failed to rename {tmp_path} → {output_path}"))?;
 
     Ok(output_path)
 }