2 luni în urmă · d2b910af0b
--- a/src/io/bed.rs
+++ b/src/io/bed.rs
@@ -13,7 +13,7 @@
 
				 //! | [`annotate_with_bed`] | Annotate a [`Variants`] collection from a BED region set |
			
 
				 //! | [`bedrow_overlaps_par`] | Parallel overlap query: which BED rows hit a set of query ranges |
			
 
				 //! | [`GenesBedIndex`] | Per-contig indexed structure for fast gene-name lookup |
			
 
				-//! | [`convert_bgz_with_tabix`] | Compress a BED file to BGZF and build a Tabix index in one pass |
			
 
				+//! | [`convert_bgz_with_tabix`] | Compress a BED file to BGZF + Tabix index via [`BgzTabixWriter`] |
			
 
				 //! | [`parse_centromere_intervals`] | Extract centromeric intervals from a UCSC cytoband file |
			
 
				 
			
 
				 use std::{
			
@@ -24,14 +24,10 @@ use anyhow::Context;
 
				 use log::warn;
			
 
				 use rayon::prelude::*;
			
 
				 
			
 
				-use crate::{annotation::Annotation, io::writers::{finalize_bgzf_file, get_gz_writer}, positions::{GenomePosition, GenomeRange, GetGenomeRange, extract_contig_indices, find_contig_indices, overlaps_par}, variant::variant_collection::Variants};
			
 
				+use crate::{annotation::Annotation, io::writers::{BgzTabixWriter, IndexFormat}, positions::{GenomePosition, GenomeRange, GetGenomeRange, extract_contig_indices, find_contig_indices, overlaps_par}, variant::variant_collection::Variants};
			
 
				 
			
 
				 use super::readers::get_reader;
			
 
				 
			
 
				-use noodles_core::Position;
			
 
				-use noodles_csi::binning_index::index::reference_sequence::bin::Chunk;
			
 
				-use noodles_csi::binning_index::index::Header;
			
 
				-use noodles_tabix as tabix;
			
 
				 use std::io::Write;
			
 
				 
			
 
				 /// One parsed row from a BED file (up to BED6).
			
@@ -298,93 +294,59 @@ impl GenesBedIndex {
 
				 
			
 
				 /// Compress a BED file to BGZF and build a Tabix index (`.tbi`) in a single pass.
			
 
				 ///
			
 
				-/// The output files are `<input>.gz` and `<input>.gz.tbi`. The input must be
			
 
				-/// tab-delimited, contig-grouped, and coordinate-sorted (Tabix requirement).
			
 
				+/// Delegates all BGZF and Tabix mechanics to [`BgzTabixWriter`]. BED-specific work
			
 
				+/// here is limited to column parsing and coordinate conversion.
			
 
				+///
			
 
				+/// Output files are `<input>.gz` and `<input>.gz.tbi`. Both are written atomically.
			
 
				+/// The input must be tab-delimited, contig-grouped, and coordinate-sorted.
			
 
				 ///
			
 
				 /// # Coordinate conversion
			
 
				 ///
			
 
				-/// BED uses 0-based half-open `[start, end)`. Tabix uses 1-based positions.
			
 
				-/// The conversion is:
			
 
				-/// - `tabix_start = bed_start + 1`  (0-based → 1-based)
			
 
				-/// - `tabix_end   = bed_end`        (0-based exclusive = 1-based inclusive numerically)
			
 
				+/// BED coordinates are **0-based half-open `[start, end)`**. [`BgzTabixWriter::write_bed_record`]
			
 
				+/// converts them to 1-based Tabix positions automatically:
			
 
				+/// - `tabix_start = bed_start + 1`
			
 
				+/// - `tabix_end   = bed_end`  (0-based exclusive = 1-based inclusive numerically)
			
 
				 ///
			
 
				 /// # Arguments
			
 
				 ///
			
 
				-/// * `input` - Path to the input BED file
			
 
				-/// * `force` - Overwrite the output `.gz` file if it already exists
			
 
				+/// * `input` - Path to the input BED file (plain text or BGZF)
			
 
				+/// * `force` - Overwrite existing output files if present
			
 
				 ///
			
 
				 /// # Errors
			
 
				 ///
			
 
				-/// Returns an error if the input cannot be read, the output cannot be written,
			
 
				-/// or any data line is malformed (missing columns, non-integer coordinates).
			
 
				+/// Returns an error if the input cannot be read, any data line is malformed, or
			
 
				+/// the output cannot be written.
			
 
				 pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::Result<()> {
			
 
				     let input = input.as_ref();
			
 
				     let out_bgz = format!("{}.gz", input.display());
			
 
				-    let out_tbi = format!("{}.tbi", out_bgz);
			
 
				 
			
 
				     let reader = get_reader(&input.to_string_lossy())?;
			
 
				     let mut reader = BufReader::new(reader);
			
 
				-
			
 
				-    let mut writer = get_gz_writer(&out_bgz, force)?;
			
 
				-
			
 
				-    // Build a tabix index while we write.
			
 
				-    let mut indexer = tabix::index::Indexer::default();
			
 
				-    indexer.set_header(Header::default());
			
 
				+    let mut writer = BgzTabixWriter::new(&out_bgz, IndexFormat::Tbi, force)?;
			
 
				 
			
 
				     let mut line = String::new();
			
 
				     loop {
			
 
				         line.clear();
			
 
				-        let n = reader.read_line(&mut line)?;
			
 
				-        if n == 0 {
			
 
				+        if reader.read_line(&mut line)? == 0 {
			
 
				             break;
			
 
				         }
			
 
				 
			
 
				-        let chunk_start = writer.virtual_position();
			
 
				-        writer.write_all(line.as_bytes())?;
			
 
				-        let chunk_end = writer.virtual_position();
			
 
				-
			
 
				-        if !line.starts_with('#') && !line.trim().is_empty() {
			
 
				-            let mut it = line.split('\t');
			
 
				-            let rname = it
			
 
				-                .next()
			
 
				-                .context("BED: missing contig")?;
			
 
				-
			
 
				-            let start0: u32 = it
			
 
				-                .next()
			
 
				-                .context("BED: missing start")?
			
 
				-                .parse()
			
 
				-                .context("BED: invalid start")?;
			
 
				-
			
 
				-            let end0: u32 = it
			
 
				-                .next()
			
 
				-                .context("BED: missing end")?
			
 
				-                .trim()
			
 
				-                .parse()
			
 
				-                .context("BED: invalid end")?;
			
 
				-
			
 
				-            let start1 = start0
			
 
				-                .checked_add(1)
			
 
				-                .context("BED: start overflow")?;
			
 
				-
			
 
				-            let start = Position::try_from(start1 as usize)
			
 
				-                .context("BED: start must be >= 1")?;
			
 
				-            let end = Position::try_from(end0 as usize)
			
 
				-                .context("BED: end must be >= 1 for tabix indexing")?;
			
 
				-
			
 
				-            let chunk = Chunk::new(chunk_start, chunk_end);
			
 
				-
			
 
				-            indexer
			
 
				-                .add_record(rname, start, end, chunk)
			
 
				-                .with_context(|| format!("tabix: failed to add record for {rname}:{start0}-{end0}"))?;
			
 
				+        if line.starts_with('#') || line.trim().is_empty() {
			
 
				+            writer.write_header(line.as_bytes())?;
			
 
				+            continue;
			
 
				         }
			
 
				-    }
			
 
				 
			
 
				-    finalize_bgzf_file(writer, &out_bgz)?;
			
 
				+        let mut it = line.split('\t');
			
 
				+        let rname = it.next().context("BED: missing contig")?;
			
 
				+        let start0: u32 = it.next().context("BED: missing start")?
			
 
				+            .parse().context("BED: invalid start")?;
			
 
				+        let end0: u32 = it.next().context("BED: missing end")?
			
 
				+            .trim().parse().context("BED: invalid end")?;
			
 
				 
			
 
				-    let index = indexer.build();
			
 
				-    tabix::fs::write(&out_tbi, &index)?;
			
 
				+        writer.write_bed_record(line.as_bytes(), rname, start0, end0)?;
			
 
				+    }
			
 
				 
			
 
				-    Ok(())
			
 
				+    writer.finish()
			
 
				 }
			
 
				 
			
 
				 /// Parse centromeric intervals from a UCSC cytoband BED file.
			
--- a/src/io/writers.rs
+++ b/src/io/writers.rs
@@ -1,17 +1,22 @@
 
				 //! BGZF and plain-text file writer helpers.
			
 
				 //!
			
 
				 //! All BGZF writers produce files compatible with `bgzip`/HTSlib.
			
 
				-//! Use [`finalize_bgzf_file`] to flush and sync every BGZF writer created here.
			
 
				+//! Use [`finalize_bgzf_file`] to flush and sync a raw BGZF writer, or use
			
 
				+//! [`BgzTabixWriter`] to produce a BGZF file with an accompanying Tabix index
			
 
				+//! in a single atomic pass.
			
 
				 
			
 
				 use std::{
			
 
				     fs::{self, File, OpenOptions},
			
 
				     io::{BufWriter, Write},
			
 
				-    path::Path,
			
 
				+    path::{Path, PathBuf},
			
 
				 };
			
 
				 
			
 
				 use anyhow::Context;
			
 
				 use log::info;
			
 
				 use noodles_bgzf as bgzf;
			
 
				+use noodles_core::Position;
			
 
				+use noodles_csi::binning_index::index::{reference_sequence::bin::Chunk, Header};
			
 
				+use noodles_tabix as tabix;
			
 
				 
			
 
				 use crate::{helpers::TempFileGuard, io::readers::get_reader};
			
 
				 
			
@@ -148,6 +153,252 @@ pub fn finalize_bgzf_file(
 
				     Ok(())
			
 
				 }
			
 
				 
			
 
				+/// Output index format for [`BgzTabixWriter`].
			
 
				+///
			
 
				+/// # Chromosome size limits
			
 
				+///
			
 
				+/// TBI uses a fixed binning scheme (min\_shift=14, depth=5) that supports chromosomes
			
 
				+/// up to **2²⁹ bp (~512 Mbp)**. This covers all human chromosomes (chr1 ≈ 249 Mbp)
			
 
				+/// and most common model organisms.
			
 
				+///
			
 
				+/// CSI uses variable-depth binning with no size limit and is required for large genomes
			
 
				+/// (some plants, fish). It is supported by bcftools and modern HTSlib tools but not
			
 
				+/// by all legacy software.
			
 
				+///
			
 
				+/// # noodles API difference
			
 
				+///
			
 
				+/// Swapping formats is not trivial at the noodles level: `tabix::index::Indexer`
			
 
				+/// takes string reference names in `add_record`, while `csi::binning_index::Indexer`
			
 
				+/// takes integer reference IDs, requiring a separate internal name→ID mapping.
			
 
				+/// CSI support is therefore marked as not yet implemented.
			
 
				+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
			
 
				+pub enum IndexFormat {
			
 
				+    /// Tabix index (`.tbi`). Max chromosome 512 Mbp. Maximum tool compatibility.
			
 
				+    Tbi,
			
 
				+    /// CSI index (`.csi`). No chromosome size limit. Not yet implemented.
			
 
				+    Csi,
			
 
				+}
			
 
				+
			
 
				+/// A combined BGZF writer and Tabix indexer for a single output file.
			
 
				+///
			
 
				+/// Writes data lines to a BGZF-compressed file while simultaneously building
			
 
				+/// a Tabix index. On [`finish`](BgzTabixWriter::finish), both the `.gz` and
			
 
				+/// `.tbi` files are written atomically using a UUID temp file and [`TempFileGuard`],
			
 
				+/// so a failed write never leaves a corrupt output behind.
			
 
				+///
			
 
				+/// # Coordinate systems
			
 
				+///
			
 
				+/// Tabix uses **1-based** positions internally. BED files use **0-based half-open**
			
 
				+/// `[start, end)`. Use [`write_record`](BgzTabixWriter::write_record) with
			
 
				+/// pre-converted 1-based [`Position`] values for any format, or the convenience
			
 
				+/// method [`write_bed_record`](BgzTabixWriter::write_bed_record) which performs
			
 
				+/// the BED → Tabix conversion automatically:
			
 
				+///
			
 
				+/// ```text
			
 
				+/// tabix_start = bed_start + 1        (0-based inclusive → 1-based inclusive)
			
 
				+/// tabix_end   = bed_end              (0-based exclusive = 1-based inclusive numerically)
			
 
				+/// ```
			
 
				+///
			
 
				+/// # Example
			
 
				+///
			
 
				+/// ```no_run
			
 
				+/// use pandora_lib_promethion::io::writers::BgzTabixWriter;
			
 
				+///
			
 
				+/// let mut w = BgzTabixWriter::new("out.bed.gz", IndexFormat::Tbi, false)?;
			
 
				+/// w.write_header(b"# comment\n")?;
			
 
				+/// w.write_bed_record(b"chr1\t0\t100\tgene\n", "chr1", 0, 100)?;
			
 
				+/// w.finish()?;
			
 
				+/// # anyhow::Ok(())
			
 
				+/// ```
			
 
				+pub struct BgzTabixWriter {
			
 
				+    writer: bgzf::io::Writer<BufWriter<File>>,
			
 
				+    indexer: tabix::index::Indexer,
			
 
				+    format: IndexFormat,
			
 
				+    tmp_path: PathBuf,
			
 
				+    output_path: String,
			
 
				+    guard: TempFileGuard,
			
 
				+}
			
 
				+
			
 
				+impl BgzTabixWriter {
			
 
				+    /// Create a new [`BgzTabixWriter`] targeting `output_path` (must end in `.gz`).
			
 
				+    ///
			
 
				+    /// A UUID temp file is created in the same directory as `output_path` to
			
 
				+    /// guarantee that the final rename is atomic (same filesystem).
			
 
				+    ///
			
 
				+    /// # Arguments
			
 
				+    ///
			
 
				+    /// * `output_path` - Destination `.gz` path
			
 
				+    /// * `format` - Index format to produce; see [`IndexFormat`]
			
 
				+    /// * `force` - If `false`, return an error if the output already exists
			
 
				+    ///
			
 
				+    /// # Errors
			
 
				+    ///
			
 
				+    /// Returns an error if `format` is [`IndexFormat::Csi`] (not yet implemented),
			
 
				+    /// if the output exists and `force` is `false`, or if the temp file cannot be created.
			
 
				+    pub fn new(output_path: &str, format: IndexFormat, force: bool) -> anyhow::Result<Self> {
			
 
				+        if format == IndexFormat::Csi {
			
 
				+            anyhow::bail!(
			
 
				+                "CSI index format is not yet implemented; use IndexFormat::Tbi. \
			
 
				+                 CSI requires a different noodles API path (integer reference IDs \
			
 
				+                 instead of string names in add_record)."
			
 
				+            );
			
 
				+        }
			
 
				+
			
 
				+        if !force && Path::new(output_path).exists() {
			
 
				+            anyhow::bail!("output already exists (use force=true to overwrite): {output_path}");
			
 
				+        }
			
 
				+
			
 
				+        let tmp_dir = Path::new(output_path).parent().unwrap_or(Path::new("."));
			
 
				+        let mut guard = TempFileGuard::new();
			
 
				+        let tmp_path = guard.tmp_path(".gz", tmp_dir);
			
 
				+
			
 
				+        let writer = get_gz_writer(&tmp_path.to_string_lossy(), false)?;
			
 
				+
			
 
				+        let mut indexer = tabix::index::Indexer::default();
			
 
				+        indexer.set_header(Header::default());
			
 
				+
			
 
				+        Ok(Self {
			
 
				+            writer,
			
 
				+            indexer,
			
 
				+            format,
			
 
				+            tmp_path,
			
 
				+            output_path: output_path.to_string(),
			
 
				+            guard,
			
 
				+        })
			
 
				+    }
			
 
				+
			
 
				+    /// Write a header or comment line to the BGZF output without indexing it.
			
 
				+    ///
			
 
				+    /// Use this for lines starting with `#`, `track`, `browser`, or any other
			
 
				+    /// metadata that should not appear in the Tabix index.
			
 
				+    ///
			
 
				+    /// # Errors
			
 
				+    ///
			
 
				+    /// Returns an error if the write fails.
			
 
				+    pub fn write_header(&mut self, line: &[u8]) -> anyhow::Result<()> {
			
 
				+        self.writer.write_all(line).context("failed writing header line")?;
			
 
				+        Ok(())
			
 
				+    }
			
 
				+
			
 
				+    /// Write a data line and register it in the Tabix index.
			
 
				+    ///
			
 
				+    /// `start` and `end` must be **1-based** [`Position`] values as expected by
			
 
				+    /// Tabix. For BED files (0-based half-open), use [`write_bed_record`](Self::write_bed_record)
			
 
				+    /// instead.
			
 
				+    ///
			
 
				+    /// # Arguments
			
 
				+    ///
			
 
				+    /// * `line` - Raw bytes of the line (including the trailing newline)
			
 
				+    /// * `rname` - Reference/contig name matching the file's sequence names
			
 
				+    /// * `start` - 1-based inclusive start position
			
 
				+    /// * `end` - 1-based inclusive end position
			
 
				+    ///
			
 
				+    /// # Errors
			
 
				+    ///
			
 
				+    /// Returns an error if the write or index insertion fails.
			
 
				+    pub fn write_record(
			
 
				+        &mut self,
			
 
				+        line: &[u8],
			
 
				+        rname: &str,
			
 
				+        start: Position,
			
 
				+        end: Position,
			
 
				+    ) -> anyhow::Result<()> {
			
 
				+        let chunk_start = self.writer.virtual_position();
			
 
				+        self.writer.write_all(line).context("failed writing record")?;
			
 
				+        let chunk_end = self.writer.virtual_position();
			
 
				+
			
 
				+        self.indexer
			
 
				+            .add_record(rname, start, end, Chunk::new(chunk_start, chunk_end))
			
 
				+            .with_context(|| format!("tabix: failed to add record for {rname}"))?;
			
 
				+
			
 
				+        Ok(())
			
 
				+    }
			
 
				+
			
 
				+    /// Write a BED data line, converting 0-based coordinates to 1-based Tabix positions.
			
 
				+    ///
			
 
				+    /// BED coordinates are **0-based half-open `[start0, end0)`**. The conversion is:
			
 
				+    /// - `tabix_start = start0 + 1`  (0-based inclusive → 1-based inclusive)
			
 
				+    /// - `tabix_end   = end0`        (0-based exclusive = 1-based inclusive numerically)
			
 
				+    ///
			
 
				+    /// # Arguments
			
 
				+    ///
			
 
				+    /// * `line` - Raw bytes of the BED line (including the trailing newline)
			
 
				+    /// * `rname` - Contig name from column 1
			
 
				+    /// * `start0` - BED column 2: 0-based inclusive start
			
 
				+    /// * `end0` - BED column 3: 0-based exclusive end
			
 
				+    ///
			
 
				+    /// # Errors
			
 
				+    ///
			
 
				+    /// Returns an error if coordinate conversion overflows or the write fails.
			
 
				+    pub fn write_bed_record(
			
 
				+        &mut self,
			
 
				+        line: &[u8],
			
 
				+        rname: &str,
			
 
				+        start0: u32,
			
 
				+        end0: u32,
			
 
				+    ) -> anyhow::Result<()> {
			
 
				+        let start = Position::try_from(start0 as usize + 1)
			
 
				+            .context("BED: start coordinate overflow to tabix Position")?;
			
 
				+        let end = Position::try_from(end0 as usize)
			
 
				+            .context("BED: end must be >= 1 for tabix indexing")?;
			
 
				+        self.write_record(line, rname, start, end)
			
 
				+    }
			
 
				+
			
 
				+    /// Finalise the BGZF file, write the `.tbi` index, and atomically rename
			
 
				+    /// the temp file to the output path.
			
 
				+    ///
			
 
				+    /// The [`TempFileGuard`] is disarmed on success; on failure it removes the
			
 
				+    /// temp file automatically.
			
 
				+    ///
			
 
				+    /// # Errors
			
 
				+    ///
			
 
				+    /// Returns an error if BGZF finalisation, index writing, or the rename fails.
			
 
				+    /// Finalise the BGZF file, write the index, and atomically rename the temp
			
 
				+    /// file to the output path.
			
 
				+    ///
			
 
				+    /// The index extension matches the format: `.tbi` for [`IndexFormat::Tbi`],
			
 
				+    /// `.csi` for [`IndexFormat::Csi`] (not yet implemented).
			
 
				+    ///
			
 
				+    /// The [`TempFileGuard`] is disarmed on success; on failure it removes the
			
 
				+    /// temp file automatically.
			
 
				+    ///
			
 
				+    /// # Errors
			
 
				+    ///
			
 
				+    /// Returns an error if BGZF finalisation, index writing, or the rename fails.
			
 
				+    pub fn finish(self) -> anyhow::Result<()> {
			
 
				+        let Self { writer, indexer, format, tmp_path, output_path, mut guard } = self;
			
 
				+
			
 
				+        let tmp_str = tmp_path.to_string_lossy().to_string();
			
 
				+
			
 
				+        finalize_bgzf_file(writer, &tmp_str)?;
			
 
				+
			
 
				+        let index = indexer.build();
			
 
				+
			
 
				+        match format {
			
 
				+            IndexFormat::Tbi => {
			
 
				+                let tbi_path = format!("{output_path}.tbi");
			
 
				+                tabix::fs::write(&tbi_path, &index)
			
 
				+                    .with_context(|| format!("failed writing tabix index: {tbi_path}"))?;
			
 
				+            }
			
 
				+            IndexFormat::Csi => {
			
 
				+                // Unreachable: new() rejects Csi before any writing begins.
			
 
				+                unreachable!("CSI format rejected in BgzTabixWriter::new");
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        if Path::new(&output_path).exists() {
			
 
				+            fs::remove_file(&output_path)
			
 
				+                .with_context(|| format!("failed to remove existing output: {output_path}"))?;
			
 
				+        }
			
 
				+
			
 
				+        fs::rename(&tmp_path, &output_path)
			
 
				+            .with_context(|| format!("failed to rename {tmp_str} → {output_path}"))?;
			
 
				+
			
 
				+        guard.disarm();
			
 
				+        Ok(())
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				 #[cfg(test)]
			
 
				 mod tests {
			
 
				     use super::*;