|
|
@@ -1,17 +1,22 @@
|
|
|
//! BGZF and plain-text file writer helpers.
|
|
|
//!
|
|
|
//! All BGZF writers produce files compatible with `bgzip`/HTSlib.
|
|
|
-//! Use [`finalize_bgzf_file`] to flush and sync every BGZF writer created here.
|
|
|
+//! Use [`finalize_bgzf_file`] to flush and sync a raw BGZF writer, or use
|
|
|
+//! [`BgzTabixWriter`] to produce a BGZF file with an accompanying Tabix index
|
|
|
+//! in a single atomic pass.
|
|
|
|
|
|
use std::{
|
|
|
fs::{self, File, OpenOptions},
|
|
|
io::{BufWriter, Write},
|
|
|
- path::Path,
|
|
|
+ path::{Path, PathBuf},
|
|
|
};
|
|
|
|
|
|
use anyhow::Context;
|
|
|
use log::info;
|
|
|
use noodles_bgzf as bgzf;
|
|
|
+use noodles_core::Position;
|
|
|
+use noodles_csi::binning_index::index::{reference_sequence::bin::Chunk, Header};
|
|
|
+use noodles_tabix as tabix;
|
|
|
|
|
|
use crate::{helpers::TempFileGuard, io::readers::get_reader};
|
|
|
|
|
|
@@ -148,6 +153,252 @@ pub fn finalize_bgzf_file(
|
|
|
Ok(())
|
|
|
}
|
|
|
|
|
|
+/// Index flavour that a [`BgzTabixWriter`] emits next to its BGZF output.
+///
+/// # Chromosome size limits
+///
+/// * **TBI** has a fixed binning layout (min\_shift=14, depth=5) and can only
+///   address chromosomes up to **2²⁹ bp (~512 Mbp)**. Every human chromosome
+///   fits (chr1 ≈ 249 Mbp), as do most common model organisms.
+/// * **CSI** lets the binning depth vary, so it has no such ceiling and is the
+///   format needed for large genomes (some plants, fish). bcftools and modern
+///   HTSlib tools read it; not all legacy software does.
+///
+/// # noodles API difference
+///
+/// The two formats are not a drop-in swap at the noodles level:
+/// `tabix::index::Indexer` identifies the reference by *name* in `add_record`,
+/// whereas `csi::binning_index::Indexer` identifies it by integer *ID*, which
+/// would require a separate internal name→ID mapping. CSI is therefore
+/// declared here but marked as not yet implemented.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum IndexFormat {
+    /// Tabix index (`.tbi`). Chromosomes up to 512 Mbp. Maximum tool compatibility.
+    Tbi,
+    /// CSI index (`.csi`). No chromosome size limit. Not yet implemented.
+    Csi,
+}
|
|
|
+
|
|
|
+/// A combined BGZF writer and Tabix indexer for a single output file.
+///
+/// Every data line written through this type is BGZF-compressed and, in the
+/// same call, registered with a Tabix indexer. The compressed data is staged
+/// in a uniquely named temp file next to the destination (owned by a
+/// [`TempFileGuard`]) and is only moved to the output path by
+/// [`finish`](BgzTabixWriter::finish), which also emits the `.tbi` index. An
+/// aborted run therefore never leaves a half-written `.gz` at the destination.
+///
+/// # Coordinate systems
+///
+/// Tabix uses **1-based** positions internally. BED files use **0-based half-open**
+/// `[start, end)`. Use [`write_record`](BgzTabixWriter::write_record) with
+/// pre-converted 1-based [`Position`] values for any format, or the convenience
+/// method [`write_bed_record`](BgzTabixWriter::write_bed_record), which performs
+/// the BED → Tabix conversion automatically:
+///
+/// ```text
+/// tabix_start = bed_start + 1   (0-based inclusive → 1-based inclusive)
+/// tabix_end   = bed_end         (0-based exclusive = 1-based inclusive numerically)
+/// ```
+///
+/// # Example
+///
+/// ```no_run
+/// use pandora_lib_promethion::io::writers::{BgzTabixWriter, IndexFormat};
+///
+/// let mut w = BgzTabixWriter::new("out.bed.gz", IndexFormat::Tbi, false)?;
+/// w.write_header(b"# comment\n")?;
+/// w.write_bed_record(b"chr1\t0\t100\tgene\n", "chr1", 0, 100)?;
+/// w.finish()?;
+/// # anyhow::Ok(())
+/// ```
+pub struct BgzTabixWriter {
+    /// BGZF stream writing into the staged temp file.
+    writer: bgzf::io::Writer<BufWriter<File>>,
+    /// Collects one index entry per record passed to `write_record`.
+    indexer: tabix::index::Indexer,
+    /// Index flavour requested at construction time.
+    format: IndexFormat,
+    /// Location of the staged temp file that receives the compressed data.
+    tmp_path: PathBuf,
+    /// Final destination of the `.gz` file.
+    output_path: String,
+    /// Owns the temp file until `finish` disarms it.
+    guard: TempFileGuard,
+}
|
|
|
+
|
|
|
+impl BgzTabixWriter {
|
|
|
+    /// Create a new [`BgzTabixWriter`] targeting `output_path`.
+    ///
+    /// `output_path` is expected to name a `.gz` file; the suffix is not
+    /// validated here. Data is staged in a temp file obtained from a
+    /// [`TempFileGuard`] in the destination's own directory, so that the rename
+    /// performed by [`finish`](Self::finish) stays on a single filesystem.
+    ///
+    /// # Arguments
+    ///
+    /// * `output_path` - Destination `.gz` path
+    /// * `format` - Index format to produce; see [`IndexFormat`]
+    /// * `force` - If `false`, return an error if the output already exists
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `format` is [`IndexFormat::Csi`] (not yet implemented),
+    /// if the output exists and `force` is `false`, or if the temp file cannot be created.
+    pub fn new(output_path: &str, format: IndexFormat, force: bool) -> anyhow::Result<Self> {
+        if format == IndexFormat::Csi {
+            anyhow::bail!(
+                "CSI index format is not yet implemented; use IndexFormat::Tbi. \
+                 CSI requires a different noodles API path (integer reference IDs \
+                 instead of string names in add_record)."
+            );
+        }
+
+        let destination = Path::new(output_path);
+        if !force && destination.exists() {
+            anyhow::bail!("output already exists (use force=true to overwrite): {output_path}");
+        }
+
+        // `Path::parent` returns `Some("")` (not `None`) for a bare file name such
+        // as "out.bed.gz", so an empty parent must be mapped to "." explicitly;
+        // otherwise an empty directory path would be handed to the guard.
+        let tmp_dir = match destination.parent() {
+            Some(dir) if !dir.as_os_str().is_empty() => dir,
+            _ => Path::new("."),
+        };
+        let mut guard = TempFileGuard::new();
+        let tmp_path = guard.tmp_path(".gz", tmp_dir);
+
+        let writer = get_gz_writer(&tmp_path.to_string_lossy(), false)?;
+
+        // NOTE(review): the index header decides which columns readers treat as
+        // sequence/start/end. Confirm that `Header::default()` describes the
+        // layout of the files written here (e.g. BED) — TODO verify against noodles.
+        let mut indexer = tabix::index::Indexer::default();
+        indexer.set_header(Header::default());
+
+        Ok(Self {
+            writer,
+            indexer,
+            format,
+            tmp_path,
+            output_path: output_path.to_string(),
+            guard,
+        })
+    }
|
|
|
+
|
|
|
+    /// Append a non-record line to the BGZF output, bypassing the index.
+    ///
+    /// Intended for lines starting with `#`, `track`, `browser`, or any other
+    /// metadata: the bytes go into the compressed stream, but the Tabix indexer
+    /// is never told about them.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the write fails.
+    pub fn write_header(&mut self, line: &[u8]) -> anyhow::Result<()> {
+        self.writer
+            .write_all(line)
+            .context("failed writing header line")
+    }
|
|
|
+
|
|
|
+    /// Append one data line to the BGZF output and add it to the Tabix index.
+    ///
+    /// Coordinates must already be **1-based** [`Position`] values, which is what
+    /// Tabix expects. BED callers (0-based half-open) should go through
+    /// [`write_bed_record`](Self::write_bed_record) instead.
+    ///
+    /// # Arguments
+    ///
+    /// * `line`  - Raw bytes of the line (including the trailing newline)
+    /// * `rname` - Reference/contig name matching the file's sequence names
+    /// * `start` - 1-based inclusive start position
+    /// * `end`   - 1-based inclusive end position
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the write or index insertion fails.
+    pub fn write_record(
+        &mut self,
+        line: &[u8],
+        rname: &str,
+        start: Position,
+        end: Position,
+    ) -> anyhow::Result<()> {
+        // The BGZF virtual positions sampled before and after the write delimit
+        // the chunk that holds this record.
+        let before = self.writer.virtual_position();
+        self.writer
+            .write_all(line)
+            .context("failed writing record")?;
+        let after = self.writer.virtual_position();
+        let span = Chunk::new(before, after);
+
+        self.indexer
+            .add_record(rname, start, end, span)
+            .with_context(|| format!("tabix: failed to add record for {rname}"))?;
+        Ok(())
+    }
|
|
|
+
|
|
|
+    /// Write a BED data line, converting 0-based coordinates to 1-based Tabix positions.
+    ///
+    /// BED coordinates are **0-based half-open `[start0, end0)`**. The conversion is:
+    /// - `tabix_start = start0 + 1`  (0-based inclusive → 1-based inclusive)
+    /// - `tabix_end   = end0`        (0-based exclusive = 1-based inclusive numerically)
+    ///
+    /// # Arguments
+    ///
+    /// * `line`   - Raw bytes of the BED line (including the trailing newline)
+    /// * `rname`  - Contig name from column 1
+    /// * `start0` - BED column 2: 0-based inclusive start
+    /// * `end0`   - BED column 3: 0-based exclusive end
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if coordinate conversion overflows, if `end0` is `0`
+    /// (no valid 1-based end exists), or if the write or index insertion fails.
+    pub fn write_bed_record(
+        &mut self,
+        line: &[u8],
+        rname: &str,
+        start0: u32,
+        end0: u32,
+    ) -> anyhow::Result<()> {
+        // Checked arithmetic: where `usize` is 32 bits wide, `u32::MAX + 1` would
+        // overflow, so report that through the documented error path instead.
+        let start1 = usize::try_from(start0)
+            .ok()
+            .and_then(|s| s.checked_add(1))
+            .context("BED: start coordinate overflow to tabix Position")?;
+        let start = Position::try_from(start1)
+            .context("BED: start coordinate overflow to tabix Position")?;
+
+        let end1 = usize::try_from(end0)
+            .context("BED: end coordinate overflow to tabix Position")?;
+        let end = Position::try_from(end1)
+            .context("BED: end must be >= 1 for tabix indexing")?;
+
+        // NOTE(review): a zero-length (`start0 == end0`) or inverted BED interval
+        // reaches the indexer with `start > end`; confirm whether such records
+        // should be rejected or widened before indexing.
+        self.write_record(line, rname, start, end)
+    }
|
|
|
+
|
|
|
+    /// Finalise the BGZF file, write the index, and move both into place.
+    ///
+    /// Steps, in order:
+    /// 1. the BGZF stream is finalised in the temp file;
+    /// 2. the index is written to a staging file beside the temp file;
+    /// 3. the temp data file is renamed onto `output_path`, replacing any
+    ///    existing file in a single rename;
+    /// 4. the staged index is renamed to `<output_path>.tbi`.
+    ///
+    /// The index extension matches the format: `.tbi` for [`IndexFormat::Tbi`],
+    /// `.csi` for [`IndexFormat::Csi`] (not yet implemented).
+    ///
+    /// The [`TempFileGuard`] is disarmed once the data file has been renamed. If
+    /// an earlier step fails, the guard is dropped while still armed and the
+    /// staged index is removed on a best-effort basis.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if BGZF finalisation, index writing, or either rename fails.
+    pub fn finish(self) -> anyhow::Result<()> {
+        let Self { writer, indexer, format, tmp_path, output_path, mut guard } = self;
+
+        let index_ext = match format {
+            IndexFormat::Tbi => "tbi",
+            // Unreachable: new() rejects Csi before any writing begins.
+            IndexFormat::Csi => unreachable!("CSI format rejected in BgzTabixWriter::new"),
+        };
+
+        let tmp_str = tmp_path.to_string_lossy().to_string();
+        finalize_bgzf_file(writer, &tmp_str)?;
+
+        let index = indexer.build();
+        let index_path = format!("{output_path}.{index_ext}");
+
+        // Stage the index beside the temp data file (same directory as the
+        // destination) so a failed write never leaves a partial index at the
+        // final path.
+        let mut staged = tmp_path.clone().into_os_string();
+        staged.push(format!(".{index_ext}"));
+        let staged_index = PathBuf::from(staged);
+
+        tabix::fs::write(&staged_index, &index)
+            .map_err(|err| {
+                // Best-effort cleanup; the write error is the one worth reporting.
+                let _ = fs::remove_file(&staged_index);
+                err
+            })
+            .with_context(|| format!("failed writing tabix index: {index_path}"))?;
+
+        // `fs::rename` replaces an existing destination itself, so there is no
+        // window in which the output is missing and no data loss if it fails.
+        fs::rename(&tmp_path, &output_path)
+            .map_err(|err| {
+                let _ = fs::remove_file(&staged_index);
+                err
+            })
+            .with_context(|| format!("failed to rename {tmp_str} → {output_path}"))?;
+
+        // The temp data file no longer exists under its old name.
+        guard.disarm();
+
+        fs::rename(&staged_index, &index_path)
+            .map_err(|err| {
+                let _ = fs::remove_file(&staged_index);
+                err
+            })
+            .with_context(|| format!("failed to move tabix index into place: {index_path}"))?;
+
+        Ok(())
+    }
|
|
|
+}
|
|
|
+
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
use super::*;
|