|
|
@@ -1,3 +1,11 @@
|
|
|
+//! BED file parsing, indexing, and overlap-based variant annotation utilities.
|
|
|
+//!
|
|
|
+//! This module provides:
|
|
|
+//! - Parsing of BED rows into typed structures
|
|
|
+//! - Efficient overlap queries between BED regions and genome ranges
|
|
|
+//! - Parallel annotation of variants using BED-defined regions
|
|
|
+//! - A pre-indexed BED structure for fast gene queries
|
|
|
+
|
|
|
use std::{
|
|
|
io::{BufRead, BufReader},
|
|
|
str::FromStr, sync::Arc,
|
|
|
@@ -11,6 +19,10 @@ use crate::{annotation::Annotation, positions::{extract_contig_indices, find_con
|
|
|
|
|
|
use super::readers::get_reader;
|
|
|
|
|
|
+/// One row of a BED file.
|
|
|
+///
|
|
|
+/// Represents a genomic interval with optional name, score, and strand
|
|
|
+/// information.
|
|
|
#[derive(Debug, Clone)]
|
|
|
pub struct BedRow {
|
|
|
pub range: GenomeRange,
|
|
|
@@ -19,6 +31,10 @@ pub struct BedRow {
|
|
|
pub strand: Option<bool>,
|
|
|
}
|
|
|
|
|
|
+/// Parses a BED row from a tab-separated string.
|
|
|
+///
|
|
|
+/// Expected format (BED6-compatible):
|
|
|
+/// `contig start end name? score? strand?`
|
|
|
impl FromStr for BedRow {
|
|
|
type Err = anyhow::Error;
|
|
|
|
|
|
@@ -48,12 +64,19 @@ impl FromStr for BedRow {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/// Implements `GetGenomeRange` for `BedRow` by returning a reference to the row's `range` field.
|
|
|
impl GetGenomeRange for BedRow {
|
|
|
fn range(&self) -> &GenomeRange {
|
|
|
&self.range
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/// Reads a BED file into memory.
|
|
|
+///
|
|
|
+/// Lines starting with `#` are ignored.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+/// Returns an error if the file cannot be opened or if any row fails to parse.
|
|
|
pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
|
|
|
let reader = BufReader::new(get_reader(path)?);
|
|
|
|
|
|
@@ -110,6 +133,9 @@ pub fn annotate_with_bed(
|
|
|
Ok((total_bp, overlaps.len()))
|
|
|
}
|
|
|
|
|
|
+/// Returns all BED rows overlapping a set of query ranges (parallel).
|
|
|
+///
|
|
|
+/// Input rows and queries must be sorted by contig and position.
|
|
|
pub fn bedrow_overlaps_par(
|
|
|
rows: &[BedRow],
|
|
|
queries: &[&GenomeRange],
|
|
|
@@ -159,8 +185,9 @@ where
|
|
|
.collect()
|
|
|
}
|
|
|
|
|
|
-/// Returns a vector of
|
|
|
-/// `(contig, slice_start, slice_end)` for each contig present.
|
|
|
+/// Computes a `(contig, slice_start, slice_end)` entry for each contig's contiguous run of BED rows.
|
|
|
+///
|
|
|
+/// Assumes input is sorted by contig.
|
|
|
fn extract_contig_indices_bed(rows: &[BedRow]) -> Vec<(u8, usize, usize)> {
|
|
|
let mut out = Vec::new();
|
|
|
if rows.is_empty() {
|
|
|
@@ -181,6 +208,9 @@ fn extract_contig_indices_bed(rows: &[BedRow]) -> Vec<(u8, usize, usize)> {
|
|
|
out
|
|
|
}
|
|
|
|
|
|
+/// Pre-indexed BED structure for fast gene lookup by genomic interval.
|
|
|
+///
|
|
|
+/// BED rows are grouped by contig and, within each contig, sorted by `range.start`.
|
|
|
#[derive(Clone)]
|
|
|
pub struct GenesBedIndex {
|
|
|
// contig (u8) -> BedRows on that contig, sorted by range.start
|
|
|
@@ -188,6 +218,9 @@ pub struct GenesBedIndex {
|
|
|
}
|
|
|
|
|
|
impl GenesBedIndex {
|
|
|
+ /// Builds a contig-indexed BED structure.
|
|
|
+ ///
|
|
|
+ /// Input rows are sorted and grouped internally.
|
|
|
pub fn new(mut rows: Vec<BedRow>) -> Self {
|
|
|
// Ensure deterministic grouping & fast queries
|
|
|
rows.sort_unstable_by(|a, b| {
|
|
|
@@ -211,6 +244,7 @@ impl GenesBedIndex {
|
|
|
Self { by_contig }
|
|
|
}
|
|
|
|
|
|
+ /// Returns gene names overlapping the given interval; `start` and `end` may be given in either order.
|
|
|
#[inline]
|
|
|
pub fn query_genes(&self, contig: u8, start: u32, end: u32) -> Vec<String> {
|
|
|
let (s, e) = if start <= end { (start, end) } else { (end, start) };
|
|
|
@@ -242,8 +276,6 @@ impl GenesBedIndex {
|
|
|
i += 1;
|
|
|
}
|
|
|
|
|
|
- out.sort_unstable();
|
|
|
- out.dedup();
|
|
|
out
|
|
|
}
|
|
|
}
|