Browse Source

enforce tabix sort order in BgzTabixWriter::write_record

Tabix requires input sorted by (contig, start). BgzTabixWriter now tracks
the previous (rname, start) and returns an error on the first violation:
  - start decreases within the same contig
  - a contig reappears after another contig has begun (interleaving)

Enforcement lives in write_record so it applies to all callers automatically
(write_bed_record delegates to write_record). On error, the TempFileGuard owned
by the writer cleans up the partial temp file when the writer is dropped, with
no intervention from the caller.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 tháng trước cách đây
mục cha
commit
15224fb822
1 tập tin đã thay đổi với 41 bổ sung và 2 xóa
  1. 41 2
      src/io/writers.rs

+ 41 - 2
src/io/writers.rs

@@ -6,6 +6,7 @@
 //! in a single atomic pass.
 
 use std::{
+    collections::HashSet,
     fs::{self, File, OpenOptions},
     io::{BufWriter, Write},
     path::{Path, PathBuf},
@@ -217,6 +218,10 @@ pub struct BgzTabixWriter {
     tmp_path: PathBuf,
     output_path: String,
     guard: TempFileGuard,
+    // sort-order tracking
+    prev_rname: String,
+    prev_start: Option<Position>,
+    finished_rnames: HashSet<String>,
 }
 
 impl BgzTabixWriter {
@@ -264,6 +269,9 @@ impl BgzTabixWriter {
             tmp_path,
             output_path: output_path.to_string(),
             guard,
+            prev_rname: String::new(),
+            prev_start: None,
+            finished_rnames: HashSet::new(),
         })
     }
 
@@ -286,6 +294,11 @@ impl BgzTabixWriter {
     /// Tabix. For BED files (0-based half-open), use [`write_bed_record`](Self::write_bed_record)
     /// instead.
     ///
+    /// Enforces tabix sort order: within a contig records must be in ascending
+    /// start order; a contig may not reappear after another contig has begun.
+    /// Violations return an error immediately — the [`TempFileGuard`] ensures
+    /// the partial temp file is cleaned up automatically.
+    ///
     /// # Arguments
     ///
     /// * `line` - Raw bytes of the line (including the trailing newline)
@@ -295,7 +308,7 @@ impl BgzTabixWriter {
     ///
     /// # Errors
     ///
-    /// Returns an error if the write or index insertion fails.
+    /// Returns an error if the sort order is violated or if the write fails.
     pub fn write_record(
         &mut self,
         line: &[u8],
@@ -303,6 +316,32 @@ impl BgzTabixWriter {
         start: Position,
         end: Position,
     ) -> anyhow::Result<()> {
+        if rname == self.prev_rname {
+            if let Some(prev) = self.prev_start {
+                if start < prev {
+                    anyhow::bail!(
+                        "BED/tabix input is not sorted: {rname} position {} comes after {} \
+                         (tabix requires records sorted by contig then start position)",
+                        usize::from(start),
+                        usize::from(prev),
+                    );
+                }
+            }
+        } else {
+            if self.finished_rnames.contains(rname) {
+                anyhow::bail!(
+                    "BED/tabix input is not sorted: contig {rname} appears non-contiguously \
+                     (tabix requires all records for the same contig to be consecutive)"
+                );
+            }
+            if !self.prev_rname.is_empty() {
+                self.finished_rnames.insert(self.prev_rname.clone());
+            }
+            self.prev_rname = rname.to_string();
+            self.prev_start = None;
+        }
+        self.prev_start = Some(start);
+
         let chunk_start = self.writer.virtual_position();
         self.writer.write_all(line).context("failed writing record")?;
         let chunk_end = self.writer.virtual_position();
@@ -366,7 +405,7 @@ impl BgzTabixWriter {
     ///
     /// Returns an error if BGZF finalisation, index writing, or the rename fails.
     pub fn finish(self) -> anyhow::Result<()> {
-        let Self { writer, indexer, format, tmp_path, output_path, mut guard } = self;
+        let Self { writer, indexer, format, tmp_path, output_path, mut guard, .. } = self;
 
         let tmp_str = tmp_path.to_string_lossy().to_string();