Browse Source

GenesBedIndex comment + rm sort dedup

Thomas 2 tuần trước
mục cha
commit
fc120e0876
1 tập tin đã thay đổi với 36 bổ sung và 4 xóa
  1. 36 4
      src/io/bed.rs

+ 36 - 4
src/io/bed.rs

@@ -1,3 +1,11 @@
+//! BED file parsing, indexing, and overlap-based variant annotation utilities.
+//!
+//! This module provides:
+//! - Parsing of BED rows into typed structures
+//! - Efficient overlap queries between BED regions and genome ranges
+//! - Parallel annotation of variants using BED-defined regions
+//! - A pre-indexed BED structure for fast gene queries
+
 use std::{
     io::{BufRead, BufReader},
     str::FromStr, sync::Arc,
@@ -11,6 +19,10 @@ use crate::{annotation::Annotation, positions::{extract_contig_indices, find_con
 
 use super::readers::get_reader;
 
+/// One row of a BED file.
+///
+/// Represents a genomic interval with optional name, score, and strand
+/// information.
 #[derive(Debug, Clone)]
 pub struct BedRow {
     pub range: GenomeRange,
@@ -19,6 +31,10 @@ pub struct BedRow {
     pub strand: Option<bool>,
 }
 
+/// Parses a BED row from a tab-separated string.
+///
+/// Expected format (BED6-compatible):
+/// `contig  start  end  name?  score?  strand?`
 impl FromStr for BedRow {
     type Err = anyhow::Error;
 
@@ -48,12 +64,19 @@ impl FromStr for BedRow {
     }
 }
 
+/// Exposes the genomic range of a BED row by borrowing its `range` field.
 impl GetGenomeRange for BedRow {
     fn range(&self) -> &GenomeRange {
         &self.range
     }
 }
 
+/// Reads a BED file into memory.
+///
+/// Lines starting with `#` are ignored.
+///
+/// # Errors
+/// Returns an error if the file cannot be opened or if any row fails to parse.
 pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
     let reader = BufReader::new(get_reader(path)?);
 
@@ -110,6 +133,9 @@ pub fn annotate_with_bed(
     Ok((total_bp, overlaps.len()))
 }
 
+/// Returns all BED rows overlapping a set of query ranges (parallel).
+///
+/// Input rows and queries must be sorted by contig and position.
 pub fn bedrow_overlaps_par(
     rows: &[BedRow],
     queries: &[&GenomeRange],
@@ -159,8 +185,9 @@ where
         .collect()
 }
 
-/// Returns a vector of  
-/// `(contig, slice_start, slice_end)` for each contig present.
+/// Computes contiguous slice indices for BED rows grouped by contig.
+///
+/// Assumes input is sorted by contig.
 fn extract_contig_indices_bed(rows: &[BedRow]) -> Vec<(u8, usize, usize)> {
     let mut out = Vec::new();
     if rows.is_empty() {
@@ -181,6 +208,9 @@ fn extract_contig_indices_bed(rows: &[BedRow]) -> Vec<(u8, usize, usize)> {
     out
 }
 
+/// Pre-indexed BED structure for fast gene lookup by genomic interval.
+///
+/// BED rows are grouped by contig and stored in sorted order.
 #[derive(Clone)]
 pub struct GenesBedIndex {
     // contig (u8) -> BedRows on that contig, sorted by range.start
@@ -188,6 +218,9 @@ pub struct GenesBedIndex {
 }
 
 impl GenesBedIndex {
+    /// Builds a contig-indexed BED structure.
+    ///
+    /// Input rows are sorted and grouped internally.
     pub fn new(mut rows: Vec<BedRow>) -> Self {
         // Ensure deterministic grouping & fast queries
         rows.sort_unstable_by(|a, b| {
@@ -211,6 +244,7 @@ impl GenesBedIndex {
         Self { by_contig }
     }
 
+    /// Returns gene names overlapping the given interval.
     #[inline]
     pub fn query_genes(&self, contig: u8, start: u32, end: u32) -> Vec<String> {
         let (s, e) = if start <= end { (start, end) } else { (end, start) };
@@ -242,8 +276,6 @@ impl GenesBedIndex {
             i += 1;
         }
 
-        out.sort_unstable();
-        out.dedup();
         out
     }
 }