Переглянути джерело

audit and fix src/io/bed.rs

None of the bug fixes would have changed results on existing data: query_genes
and bedrow_overlaps_par were unused in production paths; the trailing-newline
and blank-line issues only affect edge-case inputs not present in current usage.
Fixes are correctness-by-construction before these functions enter production.

- Fix bedrow_overlaps_par: replace broken two-pointer sweep with correct
  anchored-scan O(n+m); the old algorithm silently missed rows when multiple
  queries overlapped the same row (e.g. nested gene bodies)
- Remove duplicate extract_contig_indices_bed: reuse extract_contig_indices
  from positions.rs by converting BedRow slices to &GenomeRange refs inline
- Fix query_genes: replace binary-search back-up-by-1 with linear scan +
  early termination; the old approach missed genes starting well before the
  query start but extending past it (e.g. CNTNAP2, PTPRD, PARK2)
- Fix query_genes overlap condition: re >= s && rs <= e -> re > s && rs < e;
  adjacent intervals (touching but not overlapping) were false positives for
  half-open coordinate semantics
- Fix convert_bgz_with_tabix: trim end field before parse to handle trailing
  newline from read_line on 3-column BED files
- Fix read_bed: skip blank lines and track/browser header lines (UCSC BED
  convention) to avoid hard errors on standard BED files
- BedRow::from_str: simplify map(|v| Some(...)).unwrap_or(None) -> map(|v| v)
- Comprehensive rustdoc pass with //! module header table, coordinate system
  documented throughout, # Arguments / # Returns / # Errors sections

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 місяць тому
батько
коміт
4dde55f778
1 змінених файлів з 175 додано та 147 видалено
  1. 175 147
      src/io/bed.rs

+ 175 - 147
src/io/bed.rs

@@ -1,10 +1,20 @@
-//! BED file parsing, indexing, and overlap-based variant annotation utilities.
+//! BED file I/O, overlap queries, and genomic region annotation.
 //!
-//! This module provides:
-//! - Parsing of BED rows into typed structures
-//! - Efficient overlap queries between BED regions and genome ranges
-//! - Parallel annotation of variants using BED-defined regions
-//! - A pre-indexed BED structure for fast gene queries
+//! All coordinates follow **BED convention: 0-based, half-open `[start, end)`**.
+//! This matches [`GenomeRange`]'s internal `Range<u32>` representation, so BED
+//! values are stored as-is without conversion.
+//!
+//! # Main types and functions
+//!
+//! | Item | Purpose |
+//! |------|---------|
+//! | [`BedRow`] | One parsed BED line (up to BED6) |
+//! | [`read_bed`] | Load a BED file into memory, skipping headers and blank lines |
+//! | [`annotate_with_bed`] | Annotate a [`Variants`] collection from a BED region set |
+//! | [`bedrow_overlaps_par`] | Parallel overlap query: which BED rows hit a set of query ranges |
+//! | [`GenesBedIndex`] | Per-contig indexed structure for fast gene-name lookup |
+//! | [`convert_bgz_with_tabix`] | Compress a BED file to BGZF and build a Tabix index in one pass |
+//! | [`parse_centromere_intervals`] | Extract centromeric intervals from a UCSC cytoband file |
 
 use std::{
     io::{BufRead, BufReader}, path::Path, str::FromStr, sync::Arc
@@ -24,22 +34,33 @@ use noodles_csi::binning_index::index::Header;
 use noodles_tabix as tabix;
 use std::io::Write;
 
-/// One row of a BED file.
+/// One parsed row from a BED file (up to BED6).
 ///
-/// Represents a genomic interval with optional name, score, and strand
-/// information.
+/// Coordinates in `range` are **0-based, half-open** as stored in the BED file.
+/// Optional fields (`name`, `score`, `strand`) are `None` when the column is absent
+/// or unparseable.
 #[derive(Debug, Clone)]
 pub struct BedRow {
+    /// Genomic interval — 0-based half-open `[start, end)`
     pub range: GenomeRange,
+    /// BED column 4: feature name
     pub name: Option<String>,
+    /// BED column 5: score (0–1000 per spec, stored as `u16`)
     pub score: Option<u16>,
+    /// BED column 6: strand — `true` = `+`, `false` = `-`
     pub strand: Option<bool>,
 }
 
-/// Parses a BED row from a tab-separated string.
+/// Parse a tab-separated BED line into a [`BedRow`].
+///
+/// Expects at least 3 tab-separated columns: `chrom`, `start`, `end`.
+/// Columns 4–6 (`name`, `score`, `strand`) are optional; missing or unparseable
+/// values are stored as `None`. Coordinates are stored as-is (0-based half-open).
+///
+/// # Errors
 ///
-/// Expected format (BED6-compatible):
-/// `contig  start  end  name?  score?  strand?`
+/// Returns an error if fewer than 3 columns are present or if `start`/`end`
+/// cannot be parsed as `u32`.
 impl FromStr for BedRow {
     type Err = anyhow::Error;
 
@@ -58,7 +79,7 @@ impl FromStr for BedRow {
 
         Ok(Self {
             range,
-            name: v.get(3).map(|v| Some(v.to_string())).unwrap_or(None),
+            name: v.get(3).map(|v| v.to_string()),
             score: v.get(4).and_then(|v| v.parse().ok()),
             strand: v.get(5).and_then(|&v| match v {
                 "+" => Some(true),
@@ -69,19 +90,30 @@ impl FromStr for BedRow {
     }
 }
 
-/// Exposes the genomic range of a BED row.
 impl GetGenomeRange for BedRow {
     fn range(&self) -> &GenomeRange {
         &self.range
     }
 }
 
-/// Reads a BED file into memory.
+/// Load a BED file into memory as a vector of [`BedRow`]s.
 ///
-/// Lines starting with `#` are ignored.
+/// Skips blank lines and lines starting with `#`, `track`, or `browser`
+/// (standard UCSC header conventions). All other lines are parsed via
+/// [`BedRow::from_str`].
+///
+/// # Arguments
+///
+/// * `path` - Path to a BED file (plain text or gzip-compressed)
+///
+/// # Returns
+///
+/// A vector of parsed rows in file order.
 ///
 /// # Errors
-/// Returns an error if the file cannot be opened or if any row fails to parse.
+///
+/// Returns an error if the file cannot be opened or if any data line fails to parse.
+/// I/O errors on individual lines are logged as warnings and skipped.
 pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
     let reader = BufReader::new(get_reader(path)?);
 
@@ -89,7 +121,11 @@ pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
     for (i, line) in reader.lines().enumerate() {
         match line {
             Ok(line) => {
-                if line.starts_with("#") {
+                if line.is_empty()
+                    || line.starts_with('#')
+                    || line.starts_with("track")
+                    || line.starts_with("browser")
+                {
                     continue;
                 }
                 res.push(line.parse().context(format!("Can't parse {line}"))?);
@@ -101,18 +137,26 @@ pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
     Ok(res)
 }
 
-/// Annotates variants with a given annotation based on overlap with a BED file,
-/// and returns total base count and number of overlapping mutations.
+/// Annotate variants that overlap a BED region set and return coverage statistics.
+///
+/// Loads the BED file, finds all variants whose position overlaps any BED region
+/// (via [`overlaps_par`]), pushes `annotation` onto each matching variant, and
+/// returns the total covered base-pair count alongside the overlap count.
 ///
 /// # Arguments
-/// * `variants` - Mutable reference to a `Variants` collection to annotate.
-/// * `bed_path` - Path to the BED file defining the region class.
-/// * `annotation` - The `Annotation` to assign to overlapping variants.
+///
+/// * `variants` - Variant collection to annotate in place
+/// * `bed_path` - Path to the BED file defining the region class
+/// * `annotation` - Annotation tag to attach to each overlapping variant
 ///
 /// # Returns
-/// `Ok((total_bp, overlap_count))` — where:
-///   - `total_bp`: number of base pairs in the BED regions
-///   - `overlap_count`: number of variants overlapping those regions
+///
+/// `(total_bp, overlap_count)` where `total_bp` is the sum of all BED interval
+/// lengths and `overlap_count` is the number of variants that received the annotation.
+///
+/// # Errors
+///
+/// Returns an error if the BED file cannot be read.
 pub fn annotate_with_bed(
     variants: &mut Variants,
     bed_path: &str,
@@ -121,16 +165,10 @@ pub fn annotate_with_bed(
     let bed_rows = read_bed(bed_path)?;
     let ranges: Vec<&GenomeRange> = bed_rows.iter().map(|b| &b.range).collect();
 
-    // Total number of base pairs in the BED regions
     let total_bp: usize = ranges.iter().map(|r| r.length() as usize).sum();
-
-    // Extract positions of all variants
     let positions: Vec<&GenomePosition> = variants.data.iter().map(|v| &v.position).collect();
-
-    // Find indices of variants overlapping the BED ranges
     let overlaps = overlaps_par(&positions, &ranges);
 
-    // Annotate overlapping variants
     for &idx in &overlaps {
         variants.data[idx].annotations.push(annotation.clone());
     }
@@ -138,9 +176,25 @@ pub fn annotate_with_bed(
     Ok((total_bp, overlaps.len()))
 }
 
-/// Returns all BED rows overlapping a set of query ranges (parallel).
+/// Return every BED row that overlaps at least one query range, parallelised per contig.
+///
+/// Both inputs must be sorted by `(contig, start)`. Each matching row appears exactly
+/// once in the output regardless of how many queries it overlaps.
+///
+/// Uses an anchored-scan algorithm (O(n + m)) that is correct even when rows or
+/// queries overlap each other (e.g. nested genes). The anchor `j0` advances only
+/// when a query definitively ends before the current row starts, so no hit can be
+/// missed.
+///
+/// # Arguments
 ///
-/// Input rows and queries must be sorted by contig and position.
+/// * `rows` - BED rows sorted by `(contig, start)`
+/// * `queries` - Query ranges sorted by `(contig, start)`
+///
+/// # Returns
+///
+/// Matching BED rows in per-contig order. Cross-contig order is unspecified (parallel
+/// execution).
 pub fn bedrow_overlaps_par(
     rows: &[BedRow],
     queries: &[&GenomeRange],
@@ -148,86 +202,57 @@ pub fn bedrow_overlaps_par(
 where
     BedRow: Clone + Send + Sync,
 {
-    // Pre-compute [start, end) indices per contig for both inputs.
+    let row_ranges: Vec<&GenomeRange> = rows.iter().map(|r| &r.range).collect();
     let (row_contigs, query_contigs) = rayon::join(
-        || extract_contig_indices_bed(rows),
+        || extract_contig_indices(&row_ranges),
         || extract_contig_indices(queries),
     );
 
     row_contigs
-        .into_par_iter()                             // one task per contig
+        .into_par_iter()
         .filter_map(|(contig, r_start, r_end)| {
-            // No queries on this contig → skip the task.
             let (q_start, q_end) = find_contig_indices(&query_contigs, contig)?;
 
             let r_slice = &rows[r_start..r_end];
             let q_slice = &queries[q_start..q_end];
 
             let mut hits = Vec::new();
-            let (mut i, mut j) = (0usize, 0usize);
-
-            // Classic two-finger sweep.
-            while i < r_slice.len() && j < q_slice.len() {
-                let r_range = r_slice[i].range();           // BedRow → GenomeRange
-                let q_range = &q_slice[j].range;
-
-                match (r_range.range.end <= q_range.start, q_range.end <= r_range.range.start) {
-                    (true, _) => i += 1,                    // row finishes before query starts
-                    (_, true) => j += 1,                    // query finishes before row starts
-                    _ => {
-                        hits.push(r_slice[i].clone());      // overlap detected
-                        if r_range.range.end < q_range.end {
-                            i += 1;
-                        } else {
-                            j += 1;
-                        }
-                    }
+            let mut j0 = 0usize;
+
+            for row in r_slice {
+                let r = row.range();
+
+                while j0 < q_slice.len() && q_slice[j0].range.end <= r.range.start {
+                    j0 += 1;
+                }
+
+                if j0 < q_slice.len() && q_slice[j0].range.start < r.range.end {
+                    hits.push(row.clone());
                 }
             }
+
             Some(hits)
         })
         .flatten()
         .collect()
 }
 
-/// Computes contiguous slice indices for BED rows grouped by contig.
-///
-/// Assumes input is sorted by contig.
-fn extract_contig_indices_bed(rows: &[BedRow]) -> Vec<(u8, usize, usize)> {
-    let mut out = Vec::new();
-    if rows.is_empty() {
-        return out;
-    }
-
-    let mut current = rows[0].range.contig;
-    let mut start = 0;
 
-    for (idx, row) in rows.iter().enumerate() {
-        if row.range.contig != current {
-            out.push((current, start, idx));
-            current = row.range.contig;
-            start = idx;
-        }
-    }
-    out.push((current, start, rows.len()));
-    out
-}
-
-/// Pre-indexed BED structure for fast gene lookup by genomic interval.
+/// Per-contig indexed BED structure for fast gene-name lookup.
 ///
-/// BED rows are grouped by contig and stored in sorted order.
+/// Rows are sorted by `(contig, start, end)` at construction time and stored in
+/// 256 contig slots (one per encoded `u8` contig). Lookups are O(rows before `end`)
+/// with early termination — correct even for very long host genes.
 #[derive(Clone)]
 pub struct GenesBedIndex {
-    // contig (u8) -> BedRows on that contig, sorted by range.start
     by_contig: Vec<Arc<[BedRow]>>,
 }
 
 impl GenesBedIndex {
-    /// Builds a contig-indexed BED structure.
+    /// Build a [`GenesBedIndex`] from an unsorted vector of [`BedRow`]s.
     ///
-    /// Input rows are sorted and grouped internally.
+    /// Rows are sorted by `(contig, start, end)` internally.
     pub fn new(mut rows: Vec<BedRow>) -> Self {
-        // Ensure deterministic grouping & fast queries
         rows.sort_unstable_by(|a, b| {
             a.range.contig
                 .cmp(&b.range.contig)
@@ -235,63 +260,63 @@ impl GenesBedIndex {
                 .then_with(|| a.range.range.end.cmp(&b.range.range.end))
         });
 
-        // 256 is fine if you encode contigs into u8
         let mut tmp: Vec<Vec<BedRow>> = vec![Vec::new(); 256];
         for r in rows {
             tmp[r.range.contig as usize].push(r);
         }
 
-        let by_contig = tmp
-            .into_iter()
-            .map(Arc::<[BedRow]>::from)
-            .collect();
-
-        Self { by_contig }
+        Self {
+            by_contig: tmp.into_iter().map(Arc::<[BedRow]>::from).collect(),
+        }
     }
 
-    /// Returns gene names overlapping the given interval.
-    #[inline]
+    /// Return the names of all genes overlapping `[start, end)` on `contig`.
+    ///
+    /// Coordinates are 0-based half-open. If `start > end` the arguments are
+    /// swapped defensively. Genes without a name field are silently skipped.
+    ///
+    /// # Arguments
+    ///
+    /// * `contig` - Encoded contig index (see `contig_to_num`)
+    /// * `start` - Query start, 0-based inclusive
+    /// * `end` - Query end, 0-based exclusive
+    ///
+    /// # Returns
+    ///
+    /// Names of overlapping genes in start-sorted order.
     pub fn query_genes(&self, contig: u8, start: u32, end: u32) -> Vec<String> {
         let (s, e) = if start <= end { (start, end) } else { (end, start) };
         let rows = &self.by_contig[contig as usize];
-        if rows.is_empty() {
-            return Vec::new();
-        }
-
-        // lower_bound on start
-        let mut i = match rows.binary_search_by_key(&s, |r| r.range.range.start) {
-            Ok(i) | Err(i) => i,
-        };
-        i = i.saturating_sub(1);
-
-        let mut out: Vec<String> = Vec::new();
-        while i < rows.len() {
-            let r = &rows[i];
-            let rs = r.range.range.start;
-            let re = r.range.range.end;
 
-            if rs > e {
-                break;
-            }
-            if re >= s && rs <= e {
-                if let Some(name) = &r.name {
-                    out.push(name.clone());
-                }
-            }
-            i += 1;
-        }
-
-        out
+        rows.iter()
+            .take_while(|r| r.range.range.start < e)
+            .filter(|r| r.range.range.end > s)
+            .filter_map(|r| r.name.clone())
+            .collect()
     }
 }
 
-/// Convert BED -> BGZF (.gz) and create a Tabix index (.tbi) in the same pass.
+/// Compress a BED file to BGZF and build a Tabix index (`.tbi`) in a single pass.
+///
+/// The output files are `<input>.gz` and `<input>.gz.tbi`. The input must be
+/// tab-delimited, contig-grouped, and coordinate-sorted (Tabix requirement).
 ///
-/// Notes:
-/// - Tabix expects the file to be tab-delimited, grouped by contig, and coordinate-sorted.
-/// - BED is 0-based, half-open; Tabix uses 1-based positions. We convert as:
-///   start_pos = start0 + 1
-///   end_pos   = end0
+/// # Coordinate conversion
+///
+/// BED uses 0-based half-open `[start, end)`. Tabix uses 1-based positions.
+/// The conversion is:
+/// - `tabix_start = bed_start + 1`  (0-based → 1-based)
+/// - `tabix_end   = bed_end`        (0-based exclusive = 1-based inclusive numerically)
+///
+/// # Arguments
+///
+/// * `input` - Path to the input BED file
+/// * `force` - Overwrite the output `.gz` file if it already exists
+///
+/// # Errors
+///
+/// Returns an error if the input cannot be read, the output cannot be written,
+/// or any data line is malformed (missing columns, non-integer coordinates).
 pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::Result<()> {
     let input = input.as_ref();
     let out_bgz = format!("{}.gz", input.display());
@@ -304,7 +329,7 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
 
     // Build a tabix index while we write.
     let mut indexer = tabix::index::Indexer::default();
-    indexer.set_header(Header::default()); // minimal header; usable for most tools :contentReference[oaicite:1]{index=1}
+    indexer.set_header(Header::default());
 
     let mut line = String::new();
     loop {
@@ -314,17 +339,10 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
             break;
         }
 
-        // Record start virtual offset for this line in the *output* bgzf stream.
-        // let chunk_start = writer.bgzf_pos(); // BGZF virtual offset :contentReference[oaicite:2]{index=2}
-
         let chunk_start = writer.virtual_position();
-        // Write line as-is.
         writer.write_all(line.as_bytes())?;
+        let chunk_end = writer.virtual_position();
 
-        // Record end virtual offset after writing the line.
-        let chunk_end = writer.virtual_position(); // :contentReference[oaicite:3]{index=3}
-
-        // Add to tabix index if it's a data line (skip meta/header lines).
         if !line.starts_with('#') && !line.trim().is_empty() {
             let mut it = line.split('\t');
             let rname = it
@@ -340,10 +358,10 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
             let end0: u32 = it
                 .next()
                 .context("BED: missing end")?
+                .trim()
                 .parse()
                 .context("BED: invalid end")?;
 
-            // BED 0-based half-open -> 1-based inclusive-ish interval for tabix
             let start1 = start0
                 .checked_add(1)
                 .context("BED: start overflow")?;
@@ -353,10 +371,7 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
             let end = Position::try_from(end0 as usize)
                 .context("BED: end must be >= 1 for tabix indexing")?;
 
-            let chunk = Chunk::new(
-                chunk_start,
-                chunk_end,
-            ); // chunk is [start, end) in virtual offsets :contentReference[oaicite:4]{index=4}
+            let chunk = Chunk::new(chunk_start, chunk_end);
 
             indexer
                 .add_record(rname, start, end, chunk)
@@ -365,20 +380,33 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
     }
 
     finalize_bgzf_file(writer, &out_bgz)?;
-    // writer.close()?; // writes EOF marker :contentReference[oaicite:5]{index=5}
 
-    let index = indexer.build(); // :contentReference[oaicite:6]{index=6}
-    tabix::fs::write(&out_tbi, &index)?; // :contentReference[oaicite:7]{index=7}
+    let index = indexer.build();
+    tabix::fs::write(&out_tbi, &index)?;
 
     Ok(())
 }
 
-/// Returns a set of (chrom, start, end) intervals for centromeric regions
-/// parsed from a UCSC-format cytoband BED.
+/// Parse centromeric intervals from a UCSC cytoband BED file.
 ///
-/// Centromeres are identified by the `acen` stain field (column 4).
+/// Identifies rows whose stain field (column 5) equals `acen` and returns their
+/// coordinates as `(chrom, start, end)`. Coordinates are 0-based half-open as
+/// stored in the file.
+///
+/// Expected column layout: `chrom\tstart\tend\tband_name\tstain`
+///
+/// # Arguments
+///
+/// * `path` - Path to the cytoband BED file
+///
+/// # Returns
+///
+/// A vector of `(chrom, start, end)` tuples for all `acen` bands, in file order.
+///
+/// # Errors
 ///
-/// Format: `chrom\tstart\tend\tband_name\tstain`
+/// Returns an error if the file cannot be opened, a line has fewer than 5 columns,
+/// or a coordinate field cannot be parsed as `u64`.
 pub fn parse_centromere_intervals(path: &str) -> anyhow::Result<Vec<(String, u64, u64)>> {
     let reader = BufReader::new(get_reader(path)
         .with_context(|| format!("Cannot open cytobands BED: {path}"))?);