|
|
@@ -1,5 +1,11 @@
|
|
|
+//! File reader helpers for plain-text and BGZF-compressed genomic files.
|
|
|
+//!
|
|
|
+//! **Important:** `.gz` files are assumed to be **BGZF** (block gzip), not standard gzip.
|
|
|
+//! BGZF is the format produced by `bgzip` and used by BAM, VCF, BED.gz, etc.
|
|
|
+//! Plain `gzip` output will not decompress correctly here.
|
|
|
+
|
|
|
use std::{
|
|
|
- fs::File,
|
|
|
+ fs::{self, File},
|
|
|
io::{BufReader, Read, Write},
|
|
|
path::Path,
|
|
|
};
|
|
|
@@ -10,9 +16,29 @@ use noodles_bgzf as bgzf;
|
|
|
|
|
|
use crate::io::writers::{finalize_bgzf_file, get_gz_writer};
|
|
|
|
|
|
+/// Type alias for a BGZF reader wrapping any `Read` source.
|
|
|
pub type BGZFReader<R> = bgzf::io::Reader<R>;
|
|
|
-pub type BGZFWriter<W> = bgzf::io::Writer<W>;
|
|
|
|
|
|
+/// Open a plain-text or BGZF-compressed file for reading.
|
|
|
+///
|
|
|
+/// The file format is detected from the extension. Supported extensions:
|
|
|
+/// `gz` (BGZF), `vcf`, `bed`, `tsv`, `json`, `chain`.
|
|
|
+///
|
|
|
+/// `.gz` files are opened with [`BGZFReader`] — they must be BGZF-encoded,
|
|
|
+/// not plain gzip.
|
|
|
+///
|
|
|
+/// # Arguments
|
|
|
+///
|
|
|
+/// * `path` - Path to the file
|
|
|
+///
|
|
|
+/// # Returns
|
|
|
+///
|
|
|
+/// A boxed `Read` impl, buffered internally.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if the extension is missing, unrecognised, or the file
|
|
|
+/// cannot be opened.
|
|
|
pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn Read>> {
|
|
|
debug!("Reading: {path}");
|
|
|
|
|
|
@@ -23,21 +49,35 @@ pub fn get_reader(path: &str) -> anyhow::Result<Box<dyn Read>> {
|
|
|
|
|
|
anyhow::ensure!(
|
|
|
matches!(file_type, "gz" | "vcf" | "bed" | "tsv" | "json" | "chain"),
|
|
|
- "unknown file type: {file_type}"
|
|
|
+ "unsupported file extension: {file_type}"
|
|
|
);
|
|
|
|
|
|
let raw_reader = File::open(path).with_context(|| format!("failed to open {path}"))?;
|
|
|
|
|
|
match file_type {
|
|
|
- "gz" => {
|
|
|
- let reader = BGZFReader::new(raw_reader);
|
|
|
- Ok(Box::new(BufReader::new(reader)))
|
|
|
- }
|
|
|
- "vcf" | "bed" | "tsv" | "json" | "chain" => Ok(Box::new(BufReader::new(raw_reader))),
|
|
|
- _ => unreachable!(),
|
|
|
+ "gz" => Ok(Box::new(BufReader::new(BGZFReader::new(raw_reader)))),
|
|
|
+ _ => Ok(Box::new(BufReader::new(raw_reader))),
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/// Open a file as a BGZF reader, compressing it first if it is not already `.gz`.
|
|
|
+///
|
|
|
+/// If `path` does not end in `.gz`, [`compress_to_bgzip`] is called to produce
|
|
|
+/// `<path>.gz` on disk before opening it. This is a **side-effecting** operation:
|
|
|
+/// the compressed file is created and cached for subsequent calls.
|
|
|
+///
|
|
|
+/// # Arguments
|
|
|
+///
|
|
|
+/// * `path` - Path to the input file (plain text or BGZF)
|
|
|
+///
|
|
|
+/// # Returns
|
|
|
+///
|
|
|
+/// A [`BGZFReader`] over the (possibly newly created) `.gz` file.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if the extension cannot be determined, compression fails,
|
|
|
+/// or the resulting `.gz` file cannot be opened.
|
|
|
pub fn get_gz_reader(path: &str) -> anyhow::Result<BGZFReader<File>> {
|
|
|
debug!("Reading: {path}");
|
|
|
|
|
|
@@ -57,6 +97,26 @@ pub fn get_gz_reader(path: &str) -> anyhow::Result<BGZFReader<File>> {
|
|
|
Ok(BGZFReader::new(file))
|
|
|
}
|
|
|
|
|
|
+/// Compress a plain-text file to BGZF, returning the path of the `.gz` output.
|
|
|
+///
|
|
|
+/// If `<input_path>.gz` already exists it is returned immediately (cached).
|
|
|
+/// Otherwise the file is written atomically: data is first written to a
|
|
|
+/// temporary `.gz.tmp` file, then renamed to `.gz` on success. This prevents
|
|
|
+/// a partially-written `.gz` from being treated as valid on a subsequent call
|
|
|
+/// after a failed write.
|
|
|
+///
|
|
|
+/// # Arguments
|
|
|
+///
|
|
|
+/// * `input_path` - Path to the plain-text input file
|
|
|
+///
|
|
|
+/// # Returns
|
|
|
+///
|
|
|
+/// The path of the BGZF-compressed output file (`<input_path>.gz`).
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if the input cannot be read, the output cannot be written,
|
|
|
+/// or the atomic rename fails.
|
|
|
pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
|
|
|
let output_path = format!("{input_path}.gz");
|
|
|
|
|
|
@@ -64,30 +124,32 @@ pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
|
|
|
return Ok(output_path);
|
|
|
}
|
|
|
|
|
|
- debug!("Compressing {input_path}");
|
|
|
+ debug!("Compressing {input_path} → {output_path}");
|
|
|
+
|
|
|
+ let tmp_path = format!("{output_path}.tmp");
|
|
|
|
|
|
let input_file = File::open(input_path)
|
|
|
.with_context(|| format!("failed to open input file: {input_path}"))?;
|
|
|
let mut reader = BufReader::new(input_file);
|
|
|
-
|
|
|
- let mut writer = get_gz_writer(&output_path, false)?;
|
|
|
+ let mut writer = get_gz_writer(&tmp_path, true)?;
|
|
|
|
|
|
let mut buffer = [0u8; 8192];
|
|
|
loop {
|
|
|
let n = reader
|
|
|
.read(&mut buffer)
|
|
|
- .with_context(|| format!("failed reading input file: {input_path}"))?;
|
|
|
-
|
|
|
+ .with_context(|| format!("failed reading {input_path}"))?;
|
|
|
if n == 0 {
|
|
|
break;
|
|
|
}
|
|
|
-
|
|
|
writer
|
|
|
.write_all(&buffer[..n])
|
|
|
- .with_context(|| format!("failed writing BGZF file: {output_path}"))?;
|
|
|
+ .with_context(|| format!("failed writing {tmp_path}"))?;
|
|
|
}
|
|
|
|
|
|
- finalize_bgzf_file(writer, &output_path)?;
|
|
|
+ finalize_bgzf_file(writer, &tmp_path)?;
|
|
|
+
|
|
|
+ fs::rename(&tmp_path, &output_path)
|
|
|
+ .with_context(|| format!("failed to rename {tmp_path} → {output_path}"))?;
|
|
|
|
|
|
Ok(output_path)
|
|
|
}
|