Просмотр исходного кода

add BgzTabixWriter with IndexFormat enum and refactor convert_bgz_with_tabix

Introduce BgzTabixWriter in writers.rs — a combined BGZF writer and Tabix
indexer with atomic output via TempFileGuard UUID temp files:
  - write_header: writes to BGZF only, not indexed (for # / track / browser)
  - write_record: takes 1-based noodles Position (generic for VCF/GFF/etc.)
  - write_bed_record: takes 0-based u32 BED coords, converts internally:
      tabix_start = bed_start + 1  (0-based inclusive -> 1-based inclusive)
      tabix_end   = bed_end        (0-based exclusive = 1-based inclusive)
  - finish: finalises BGZF, writes .tbi, atomically renames tmp -> output

Add IndexFormat enum (Tbi / Csi) as a format selection parameter:
  - Tbi: fully implemented, max chromosome 512 Mbp, universal compatibility
  - Csi: rejected in new() with an explanatory error; requires a separate
    internal path because noodles_csi::add_record uses integer reference IDs
    instead of string names, unlike tabix::index::Indexer

Refactor convert_bgz_with_tabix in bed.rs to use BgzTabixWriter: the function
now contains only BED column parsing; all BGZF/Tabix mechanics removed. Drop
the now-unused noodles_core / noodles_csi / noodles_tabix imports from bed.rs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 месяц назад
Родитель
Сommit
d2b910af0b
2 измененных файлов с 282 добавлено и 69 удалено
  1. 29 67
      src/io/bed.rs
  2. 253 2
      src/io/writers.rs

+ 29 - 67
src/io/bed.rs

@@ -13,7 +13,7 @@
 //! | [`annotate_with_bed`] | Annotate a [`Variants`] collection from a BED region set |
 //! | [`bedrow_overlaps_par`] | Parallel overlap query: which BED rows hit a set of query ranges |
 //! | [`GenesBedIndex`] | Per-contig indexed structure for fast gene-name lookup |
-//! | [`convert_bgz_with_tabix`] | Compress a BED file to BGZF and build a Tabix index in one pass |
+//! | [`convert_bgz_with_tabix`] | Compress a BED file to BGZF + Tabix index via [`BgzTabixWriter`] |
 //! | [`parse_centromere_intervals`] | Extract centromeric intervals from a UCSC cytoband file |
 
 use std::{
@@ -24,14 +24,10 @@ use anyhow::Context;
 use log::warn;
 use rayon::prelude::*;
 
-use crate::{annotation::Annotation, io::writers::{finalize_bgzf_file, get_gz_writer}, positions::{GenomePosition, GenomeRange, GetGenomeRange, extract_contig_indices, find_contig_indices, overlaps_par}, variant::variant_collection::Variants};
+use crate::{annotation::Annotation, io::writers::{BgzTabixWriter, IndexFormat}, positions::{GenomePosition, GenomeRange, GetGenomeRange, extract_contig_indices, find_contig_indices, overlaps_par}, variant::variant_collection::Variants};
 
 use super::readers::get_reader;
 
-use noodles_core::Position;
-use noodles_csi::binning_index::index::reference_sequence::bin::Chunk;
-use noodles_csi::binning_index::index::Header;
-use noodles_tabix as tabix;
 use std::io::Write;
 
 /// One parsed row from a BED file (up to BED6).
@@ -298,93 +294,59 @@ impl GenesBedIndex {
 
 /// Compress a BED file to BGZF and build a Tabix index (`.tbi`) in a single pass.
 ///
-/// The output files are `<input>.gz` and `<input>.gz.tbi`. The input must be
-/// tab-delimited, contig-grouped, and coordinate-sorted (Tabix requirement).
+/// Delegates all BGZF and Tabix mechanics to [`BgzTabixWriter`]. BED-specific work
+/// here is limited to column parsing and coordinate conversion.
+///
+/// Output files are `<input>.gz` and `<input>.gz.tbi`. Both are written atomically.
+/// The input must be tab-delimited, contig-grouped, and coordinate-sorted.
 ///
 /// # Coordinate conversion
 ///
-/// BED uses 0-based half-open `[start, end)`. Tabix uses 1-based positions.
-/// The conversion is:
-/// - `tabix_start = bed_start + 1`  (0-based → 1-based)
-/// - `tabix_end   = bed_end`        (0-based exclusive = 1-based inclusive numerically)
+/// BED coordinates are **0-based half-open `[start, end)`**. [`BgzTabixWriter::write_bed_record`]
+/// converts them to 1-based Tabix positions automatically:
+/// - `tabix_start = bed_start + 1`
+/// - `tabix_end   = bed_end`  (0-based exclusive = 1-based inclusive numerically)
 ///
 /// # Arguments
 ///
-/// * `input` - Path to the input BED file
-/// * `force` - Overwrite the output `.gz` file if it already exists
+/// * `input` - Path to the input BED file (plain text or BGZF)
+/// * `force` - Overwrite existing output files if present
 ///
 /// # Errors
 ///
-/// Returns an error if the input cannot be read, the output cannot be written,
-/// or any data line is malformed (missing columns, non-integer coordinates).
+/// Returns an error if the input cannot be read, any data line is malformed, or
+/// the output cannot be written.
 pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::Result<()> {
     let input = input.as_ref();
     let out_bgz = format!("{}.gz", input.display());
-    let out_tbi = format!("{}.tbi", out_bgz);
 
     let reader = get_reader(&input.to_string_lossy())?;
     let mut reader = BufReader::new(reader);
-
-    let mut writer = get_gz_writer(&out_bgz, force)?;
-
-    // Build a tabix index while we write.
-    let mut indexer = tabix::index::Indexer::default();
-    indexer.set_header(Header::default());
+    let mut writer = BgzTabixWriter::new(&out_bgz, IndexFormat::Tbi, force)?;
 
     let mut line = String::new();
     loop {
         line.clear();
-        let n = reader.read_line(&mut line)?;
-        if n == 0 {
+        if reader.read_line(&mut line)? == 0 {
             break;
         }
 
-        let chunk_start = writer.virtual_position();
-        writer.write_all(line.as_bytes())?;
-        let chunk_end = writer.virtual_position();
-
-        if !line.starts_with('#') && !line.trim().is_empty() {
-            let mut it = line.split('\t');
-            let rname = it
-                .next()
-                .context("BED: missing contig")?;
-
-            let start0: u32 = it
-                .next()
-                .context("BED: missing start")?
-                .parse()
-                .context("BED: invalid start")?;
-
-            let end0: u32 = it
-                .next()
-                .context("BED: missing end")?
-                .trim()
-                .parse()
-                .context("BED: invalid end")?;
-
-            let start1 = start0
-                .checked_add(1)
-                .context("BED: start overflow")?;
-
-            let start = Position::try_from(start1 as usize)
-                .context("BED: start must be >= 1")?;
-            let end = Position::try_from(end0 as usize)
-                .context("BED: end must be >= 1 for tabix indexing")?;
-
-            let chunk = Chunk::new(chunk_start, chunk_end);
-
-            indexer
-                .add_record(rname, start, end, chunk)
-                .with_context(|| format!("tabix: failed to add record for {rname}:{start0}-{end0}"))?;
+        if line.starts_with('#') || line.trim().is_empty() {
+            writer.write_header(line.as_bytes())?;
+            continue;
         }
-    }
 
-    finalize_bgzf_file(writer, &out_bgz)?;
+        let mut it = line.split('\t');
+        let rname = it.next().context("BED: missing contig")?;
+        let start0: u32 = it.next().context("BED: missing start")?
+            .parse().context("BED: invalid start")?;
+        let end0: u32 = it.next().context("BED: missing end")?
+            .trim().parse().context("BED: invalid end")?;
 
-    let index = indexer.build();
-    tabix::fs::write(&out_tbi, &index)?;
+        writer.write_bed_record(line.as_bytes(), rname, start0, end0)?;
+    }
 
-    Ok(())
+    writer.finish()
 }
 
 /// Parse centromeric intervals from a UCSC cytoband BED file.

+ 253 - 2
src/io/writers.rs

@@ -1,17 +1,22 @@
 //! BGZF and plain-text file writer helpers.
 //!
 //! All BGZF writers produce files compatible with `bgzip`/HTSlib.
-//! Use [`finalize_bgzf_file`] to flush and sync every BGZF writer created here.
+//! Use [`finalize_bgzf_file`] to flush and sync a raw BGZF writer, or use
+//! [`BgzTabixWriter`] to produce a BGZF file with an accompanying Tabix index
+//! in a single atomic pass.
 
 use std::{
     fs::{self, File, OpenOptions},
     io::{BufWriter, Write},
-    path::Path,
+    path::{Path, PathBuf},
 };
 
 use anyhow::Context;
 use log::info;
 use noodles_bgzf as bgzf;
+use noodles_core::Position;
+use noodles_csi::binning_index::index::{reference_sequence::bin::Chunk, Header};
+use noodles_tabix as tabix;
 
 use crate::{helpers::TempFileGuard, io::readers::get_reader};
 
@@ -148,6 +153,252 @@ pub fn finalize_bgzf_file(
     Ok(())
 }
 
+/// Output index format for [`BgzTabixWriter`].
+///
+/// # Chromosome size limits
+///
+/// TBI uses a fixed binning scheme (min\_shift=14, depth=5) that supports chromosomes
+/// up to **2²⁹ bp (~512 Mbp)**. This covers all human chromosomes (chr1 ≈ 249 Mbp)
+/// and most common model organisms.
+///
+/// CSI uses variable-depth binning with no size limit and is required for large genomes
+/// (some plants, fish). It is supported by bcftools and modern HTSlib tools but not
+/// by all legacy software.
+///
+/// # noodles API difference
+///
+/// Swapping formats is not trivial at the noodles level: `tabix::index::Indexer`
+/// takes string reference names in `add_record`, while `csi::binning_index::Indexer`
+/// takes integer reference IDs, requiring a separate internal name→ID mapping.
+/// CSI support is therefore marked as not yet implemented.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum IndexFormat {
+    /// Tabix index (`.tbi`). Max chromosome 512 Mbp. Maximum tool compatibility.
+    Tbi,
+    /// CSI index (`.csi`). No chromosome size limit. Not yet implemented.
+    Csi,
+}
+
+/// A combined BGZF writer and Tabix indexer for a single output file.
+///
+/// Writes data lines to a BGZF-compressed file while simultaneously building
+/// a Tabix index. On [`finish`](BgzTabixWriter::finish), both the `.gz` and
+/// `.tbi` files are written atomically using a UUID temp file and [`TempFileGuard`],
+/// so a failed write never leaves a corrupt output behind.
+///
+/// # Coordinate systems
+///
+/// Tabix uses **1-based** positions internally. BED files use **0-based half-open**
+/// `[start, end)`. Use [`write_record`](BgzTabixWriter::write_record) with
+/// pre-converted 1-based [`Position`] values for any format, or the convenience
+/// method [`write_bed_record`](BgzTabixWriter::write_bed_record) which performs
+/// the BED → Tabix conversion automatically:
+///
+/// ```text
+/// tabix_start = bed_start + 1        (0-based inclusive → 1-based inclusive)
+/// tabix_end   = bed_end              (0-based exclusive = 1-based inclusive numerically)
+/// ```
+///
+/// # Example
+///
+/// ```no_run
+/// use pandora_lib_promethion::io::writers::BgzTabixWriter;
+///
+/// let mut w = BgzTabixWriter::new("out.bed.gz", IndexFormat::Tbi, false)?;
+/// w.write_header(b"# comment\n")?;
+/// w.write_bed_record(b"chr1\t0\t100\tgene\n", "chr1", 0, 100)?;
+/// w.finish()?;
+/// # anyhow::Ok(())
+/// ```
+pub struct BgzTabixWriter {
+    writer: bgzf::io::Writer<BufWriter<File>>,
+    indexer: tabix::index::Indexer,
+    format: IndexFormat,
+    tmp_path: PathBuf,
+    output_path: String,
+    guard: TempFileGuard,
+}
+
+impl BgzTabixWriter {
+    /// Create a new [`BgzTabixWriter`] targeting `output_path` (must end in `.gz`).
+    ///
+    /// A UUID temp file is created in the same directory as `output_path` to
+    /// guarantee that the final rename is atomic (same filesystem).
+    ///
+    /// # Arguments
+    ///
+    /// * `output_path` - Destination `.gz` path
+    /// * `format` - Index format to produce; see [`IndexFormat`]
+    /// * `force` - If `false`, return an error if the output already exists
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `format` is [`IndexFormat::Csi`] (not yet implemented),
+    /// if the output exists and `force` is `false`, or if the temp file cannot be created.
+    pub fn new(output_path: &str, format: IndexFormat, force: bool) -> anyhow::Result<Self> {
+        if format == IndexFormat::Csi {
+            anyhow::bail!(
+                "CSI index format is not yet implemented; use IndexFormat::Tbi. \
+                 CSI requires a different noodles API path (integer reference IDs \
+                 instead of string names in add_record)."
+            );
+        }
+
+        if !force && Path::new(output_path).exists() {
+            anyhow::bail!("output already exists (use force=true to overwrite): {output_path}");
+        }
+
+        let tmp_dir = Path::new(output_path).parent().unwrap_or(Path::new("."));
+        let mut guard = TempFileGuard::new();
+        let tmp_path = guard.tmp_path(".gz", tmp_dir);
+
+        let writer = get_gz_writer(&tmp_path.to_string_lossy(), false)?;
+
+        let mut indexer = tabix::index::Indexer::default();
+        indexer.set_header(Header::default());
+
+        Ok(Self {
+            writer,
+            indexer,
+            format,
+            tmp_path,
+            output_path: output_path.to_string(),
+            guard,
+        })
+    }
+
+    /// Write a header or comment line to the BGZF output without indexing it.
+    ///
+    /// Use this for lines starting with `#`, `track`, `browser`, or any other
+    /// metadata that should not appear in the Tabix index.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the write fails.
+    pub fn write_header(&mut self, line: &[u8]) -> anyhow::Result<()> {
+        self.writer.write_all(line).context("failed writing header line")?;
+        Ok(())
+    }
+
+    /// Write a data line and register it in the Tabix index.
+    ///
+    /// `start` and `end` must be **1-based** [`Position`] values as expected by
+    /// Tabix. For BED files (0-based half-open), use [`write_bed_record`](Self::write_bed_record)
+    /// instead.
+    ///
+    /// # Arguments
+    ///
+    /// * `line` - Raw bytes of the line (including the trailing newline)
+    /// * `rname` - Reference/contig name matching the file's sequence names
+    /// * `start` - 1-based inclusive start position
+    /// * `end` - 1-based inclusive end position
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the write or index insertion fails.
+    pub fn write_record(
+        &mut self,
+        line: &[u8],
+        rname: &str,
+        start: Position,
+        end: Position,
+    ) -> anyhow::Result<()> {
+        let chunk_start = self.writer.virtual_position();
+        self.writer.write_all(line).context("failed writing record")?;
+        let chunk_end = self.writer.virtual_position();
+
+        self.indexer
+            .add_record(rname, start, end, Chunk::new(chunk_start, chunk_end))
+            .with_context(|| format!("tabix: failed to add record for {rname}"))?;
+
+        Ok(())
+    }
+
+    /// Write a BED data line, converting 0-based coordinates to 1-based Tabix positions.
+    ///
+    /// BED coordinates are **0-based half-open `[start0, end0)`**. The conversion is:
+    /// - `tabix_start = start0 + 1`  (0-based inclusive → 1-based inclusive)
+    /// - `tabix_end   = end0`        (0-based exclusive = 1-based inclusive numerically)
+    ///
+    /// # Arguments
+    ///
+    /// * `line` - Raw bytes of the BED line (including the trailing newline)
+    /// * `rname` - Contig name from column 1
+    /// * `start0` - BED column 2: 0-based inclusive start
+    /// * `end0` - BED column 3: 0-based exclusive end
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if coordinate conversion overflows or the write fails.
+    pub fn write_bed_record(
+        &mut self,
+        line: &[u8],
+        rname: &str,
+        start0: u32,
+        end0: u32,
+    ) -> anyhow::Result<()> {
+        let start = Position::try_from(start0 as usize + 1)
+            .context("BED: start coordinate overflow to tabix Position")?;
+        let end = Position::try_from(end0 as usize)
+            .context("BED: end must be >= 1 for tabix indexing")?;
+        self.write_record(line, rname, start, end)
+    }
+
+    /// Finalise the BGZF file, write the `.tbi` index, and atomically rename
+    /// the temp file to the output path.
+    ///
+    /// The [`TempFileGuard`] is disarmed on success; on failure it removes the
+    /// temp file automatically.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if BGZF finalisation, index writing, or the rename fails.
+    /// Finalise the BGZF file, write the index, and atomically rename the temp
+    /// file to the output path.
+    ///
+    /// The index extension matches the format: `.tbi` for [`IndexFormat::Tbi`],
+    /// `.csi` for [`IndexFormat::Csi`] (not yet implemented).
+    ///
+    /// The [`TempFileGuard`] is disarmed on success; on failure it removes the
+    /// temp file automatically.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if BGZF finalisation, index writing, or the rename fails.
+    pub fn finish(self) -> anyhow::Result<()> {
+        let Self { writer, indexer, format, tmp_path, output_path, mut guard } = self;
+
+        let tmp_str = tmp_path.to_string_lossy().to_string();
+
+        finalize_bgzf_file(writer, &tmp_str)?;
+
+        let index = indexer.build();
+
+        match format {
+            IndexFormat::Tbi => {
+                let tbi_path = format!("{output_path}.tbi");
+                tabix::fs::write(&tbi_path, &index)
+                    .with_context(|| format!("failed writing tabix index: {tbi_path}"))?;
+            }
+            IndexFormat::Csi => {
+                // Unreachable: new() rejects Csi before any writing begins.
+                unreachable!("CSI format rejected in BgzTabixWriter::new");
+            }
+        }
+
+        if Path::new(&output_path).exists() {
+            fs::remove_file(&output_path)
+                .with_context(|| format!("failed to remove existing output: {output_path}"))?;
+        }
+
+        fs::rename(&tmp_path, &output_path)
+            .with_context(|| format!("failed to rename {tmp_str} → {output_path}"))?;
+
+        guard.disarm();
+        Ok(())
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;