|
@@ -1,10 +1,20 @@
|
|
|
-//! BED file parsing, indexing, and overlap-based variant annotation utilities.
|
|
|
|
|
|
|
+//! BED file I/O, overlap queries, and genomic region annotation.
|
|
|
//!
|
|
//!
|
|
|
-//! This module provides:
|
|
|
|
|
-//! - Parsing of BED rows into typed structures
|
|
|
|
|
-//! - Efficient overlap queries between BED regions and genome ranges
|
|
|
|
|
-//! - Parallel annotation of variants using BED-defined regions
|
|
|
|
|
-//! - A pre-indexed BED structure for fast gene queries
|
|
|
|
|
|
|
+//! All coordinates follow **BED convention: 0-based, half-open `[start, end)`**.
|
|
|
|
|
+//! This matches [`GenomeRange`]'s internal `Range<u32>` representation, so BED
|
|
|
|
|
+//! values are stored as-is without conversion.
|
|
|
|
|
+//!
|
|
|
|
|
+//! # Main types and functions
|
|
|
|
|
+//!
|
|
|
|
|
+//! | Item | Purpose |
|
|
|
|
|
+//! |------|---------|
|
|
|
|
|
+//! | [`BedRow`] | One parsed BED line (up to BED6) |
|
|
|
|
|
+//! | [`read_bed`] | Load a BED file into memory, skipping headers and blank lines |
|
|
|
|
|
+//! | [`annotate_with_bed`] | Annotate a [`Variants`] collection from a BED region set |
|
|
|
|
|
+//! | [`bedrow_overlaps_par`] | Parallel overlap query: which BED rows hit a set of query ranges |
|
|
|
|
|
+//! | [`GenesBedIndex`] | Per-contig indexed structure for fast gene-name lookup |
|
|
|
|
|
+//! | [`convert_bgz_with_tabix`] | Compress a BED file to BGZF and build a Tabix index in one pass |
|
|
|
|
|
+//! | [`parse_centromere_intervals`] | Extract centromeric intervals from a UCSC cytoband file |
|
|
|
|
|
|
|
|
use std::{
|
|
use std::{
|
|
|
io::{BufRead, BufReader}, path::Path, str::FromStr, sync::Arc
|
|
io::{BufRead, BufReader}, path::Path, str::FromStr, sync::Arc
|
|
@@ -24,22 +34,33 @@ use noodles_csi::binning_index::index::Header;
|
|
|
use noodles_tabix as tabix;
|
|
use noodles_tabix as tabix;
|
|
|
use std::io::Write;
|
|
use std::io::Write;
|
|
|
|
|
|
|
|
-/// One row of a BED file.
|
|
|
|
|
|
|
+/// One parsed row from a BED file (up to BED6).
|
|
|
///
|
|
///
|
|
|
-/// Represents a genomic interval with optional name, score, and strand
|
|
|
|
|
-/// information.
|
|
|
|
|
|
|
+/// Coordinates in `range` are **0-based, half-open** as stored in the BED file.
|
|
|
|
|
+/// Optional fields (`name`, `score`, `strand`) are `None` when the column is absent
|
|
|
|
|
+/// or unparseable.
|
|
|
#[derive(Debug, Clone)]
|
|
#[derive(Debug, Clone)]
|
|
|
pub struct BedRow {
|
|
pub struct BedRow {
|
|
|
|
|
+ /// Genomic interval — 0-based half-open `[start, end)`
|
|
|
pub range: GenomeRange,
|
|
pub range: GenomeRange,
|
|
|
|
|
+ /// BED column 4: feature name
|
|
|
pub name: Option<String>,
|
|
pub name: Option<String>,
|
|
|
|
|
+ /// BED column 5: score (0–1000 per spec, stored as `u16`)
|
|
|
pub score: Option<u16>,
|
|
pub score: Option<u16>,
|
|
|
|
|
+ /// BED column 6: strand — `true` = `+`, `false` = `-`
|
|
|
pub strand: Option<bool>,
|
|
pub strand: Option<bool>,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Parses a BED row from a tab-separated string.
|
|
|
|
|
|
|
+/// Parse a tab-separated BED line into a [`BedRow`].
|
|
|
|
|
+///
|
|
|
|
|
+/// Expects at least 3 tab-separated columns: `chrom`, `start`, `end`.
|
|
|
|
|
+/// Columns 4–6 (`name`, `score`, `strand`) are optional; missing or unparseable
|
|
|
|
|
+/// values are stored as `None`. Coordinates are stored as-is (0-based half-open).
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
///
|
|
///
|
|
|
-/// Expected format (BED6-compatible):
|
|
|
|
|
-/// `contig start end name? score? strand?`
|
|
|
|
|
|
|
+/// Returns an error if fewer than 3 columns are present or if `start`/`end`
|
|
|
|
|
+/// cannot be parsed as `u32`.
|
|
|
impl FromStr for BedRow {
|
|
impl FromStr for BedRow {
|
|
|
type Err = anyhow::Error;
|
|
type Err = anyhow::Error;
|
|
|
|
|
|
|
@@ -58,7 +79,7 @@ impl FromStr for BedRow {
|
|
|
|
|
|
|
|
Ok(Self {
|
|
Ok(Self {
|
|
|
range,
|
|
range,
|
|
|
- name: v.get(3).map(|v| Some(v.to_string())).unwrap_or(None),
|
|
|
|
|
|
|
+ name: v.get(3).map(|v| v.to_string()),
|
|
|
score: v.get(4).and_then(|v| v.parse().ok()),
|
|
score: v.get(4).and_then(|v| v.parse().ok()),
|
|
|
strand: v.get(5).and_then(|&v| match v {
|
|
strand: v.get(5).and_then(|&v| match v {
|
|
|
"+" => Some(true),
|
|
"+" => Some(true),
|
|
@@ -69,19 +90,30 @@ impl FromStr for BedRow {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Exposes the genomic range of a BED row.
|
|
|
|
|
impl GetGenomeRange for BedRow {
|
|
impl GetGenomeRange for BedRow {
|
|
|
fn range(&self) -> &GenomeRange {
|
|
fn range(&self) -> &GenomeRange {
|
|
|
&self.range
|
|
&self.range
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Reads a BED file into memory.
|
|
|
|
|
|
|
+/// Load a BED file into memory as a vector of [`BedRow`]s.
|
|
|
///
|
|
///
|
|
|
-/// Lines starting with `#` are ignored.
|
|
|
|
|
|
|
+/// Skips blank lines and lines starting with `#`, `track`, or `browser`
|
|
|
|
|
+/// (standard UCSC header conventions). All other lines are parsed via
|
|
|
|
|
+/// [`BedRow::from_str`].
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `path` - Path to a BED file (plain text or gzip-compressed)
|
|
|
|
|
+///
|
|
|
|
|
+/// # Returns
|
|
|
|
|
+///
|
|
|
|
|
+/// A vector of parsed rows in file order.
|
|
|
///
|
|
///
|
|
|
/// # Errors
|
|
/// # Errors
|
|
|
-/// Returns an error if the file cannot be opened or if any row fails to parse.
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the file cannot be opened or if any data line fails to parse.
|
|
|
|
|
+/// I/O errors on individual lines are logged as warnings and skipped.
|
|
|
pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
|
|
pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
|
|
|
let reader = BufReader::new(get_reader(path)?);
|
|
let reader = BufReader::new(get_reader(path)?);
|
|
|
|
|
|
|
@@ -89,7 +121,11 @@ pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
|
|
|
for (i, line) in reader.lines().enumerate() {
|
|
for (i, line) in reader.lines().enumerate() {
|
|
|
match line {
|
|
match line {
|
|
|
Ok(line) => {
|
|
Ok(line) => {
|
|
|
- if line.starts_with("#") {
|
|
|
|
|
|
|
+ if line.is_empty()
|
|
|
|
|
+ || line.starts_with('#')
|
|
|
|
|
+ || line.starts_with("track")
|
|
|
|
|
+ || line.starts_with("browser")
|
|
|
|
|
+ {
|
|
|
continue;
|
|
continue;
|
|
|
}
|
|
}
|
|
|
res.push(line.parse().context(format!("Can't parse {line}"))?);
|
|
res.push(line.parse().context(format!("Can't parse {line}"))?);
|
|
@@ -101,18 +137,26 @@ pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
|
|
|
Ok(res)
|
|
Ok(res)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Annotates variants with a given annotation based on overlap with a BED file,
|
|
|
|
|
-/// and returns total base count and number of overlapping mutations.
|
|
|
|
|
|
|
+/// Annotate variants that overlap a BED region set and return coverage statistics.
|
|
|
|
|
+///
|
|
|
|
|
+/// Loads the BED file, finds all variants whose position overlaps any BED region
|
|
|
|
|
+/// (via [`overlaps_par`]), pushes `annotation` onto each matching variant, and
|
|
|
|
|
+/// returns the total covered base-pair count alongside the overlap count.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
-/// * `variants` - Mutable reference to a `Variants` collection to annotate.
|
|
|
|
|
-/// * `bed_path` - Path to the BED file defining the region class.
|
|
|
|
|
-/// * `annotation` - The `Annotation` to assign to overlapping variants.
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// * `variants` - Variant collection to annotate in place
|
|
|
|
|
+/// * `bed_path` - Path to the BED file defining the region class
|
|
|
|
|
+/// * `annotation` - Annotation tag to attach to each overlapping variant
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
-/// `Ok((total_bp, overlap_count))` — where:
|
|
|
|
|
-/// - `total_bp`: number of base pairs in the BED regions
|
|
|
|
|
-/// - `overlap_count`: number of variants overlapping those regions
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// `(total_bp, overlap_count)` where `total_bp` is the sum of all BED interval
|
|
|
|
|
+/// lengths and `overlap_count` is the number of variants that received the annotation.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the BED file cannot be read.
|
|
|
pub fn annotate_with_bed(
|
|
pub fn annotate_with_bed(
|
|
|
variants: &mut Variants,
|
|
variants: &mut Variants,
|
|
|
bed_path: &str,
|
|
bed_path: &str,
|
|
@@ -121,16 +165,10 @@ pub fn annotate_with_bed(
|
|
|
let bed_rows = read_bed(bed_path)?;
|
|
let bed_rows = read_bed(bed_path)?;
|
|
|
let ranges: Vec<&GenomeRange> = bed_rows.iter().map(|b| &b.range).collect();
|
|
let ranges: Vec<&GenomeRange> = bed_rows.iter().map(|b| &b.range).collect();
|
|
|
|
|
|
|
|
- // Total number of base pairs in the BED regions
|
|
|
|
|
let total_bp: usize = ranges.iter().map(|r| r.length() as usize).sum();
|
|
let total_bp: usize = ranges.iter().map(|r| r.length() as usize).sum();
|
|
|
-
|
|
|
|
|
- // Extract positions of all variants
|
|
|
|
|
let positions: Vec<&GenomePosition> = variants.data.iter().map(|v| &v.position).collect();
|
|
let positions: Vec<&GenomePosition> = variants.data.iter().map(|v| &v.position).collect();
|
|
|
-
|
|
|
|
|
- // Find indices of variants overlapping the BED ranges
|
|
|
|
|
let overlaps = overlaps_par(&positions, &ranges);
|
|
let overlaps = overlaps_par(&positions, &ranges);
|
|
|
|
|
|
|
|
- // Annotate overlapping variants
|
|
|
|
|
for &idx in &overlaps {
|
|
for &idx in &overlaps {
|
|
|
variants.data[idx].annotations.push(annotation.clone());
|
|
variants.data[idx].annotations.push(annotation.clone());
|
|
|
}
|
|
}
|
|
@@ -138,9 +176,25 @@ pub fn annotate_with_bed(
|
|
|
Ok((total_bp, overlaps.len()))
|
|
Ok((total_bp, overlaps.len()))
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Returns all BED rows overlapping a set of query ranges (parallel).
|
|
|
|
|
|
|
+/// Return every BED row that overlaps at least one query range, parallelised per contig.
|
|
|
|
|
+///
|
|
|
|
|
+/// Both inputs must be sorted by `(contig, start)`. Each matching row appears exactly
|
|
|
|
|
+/// once in the output regardless of how many queries it overlaps.
|
|
|
|
|
+///
|
|
|
|
|
+/// Uses an anchored-scan algorithm (O(n + m)) that is correct even when rows or
|
|
|
|
|
+/// queries overlap each other (e.g. nested genes). The anchor `j0` advances only
|
|
|
|
|
+/// when a query definitively ends before the current row starts, so no hit can be
|
|
|
|
|
+/// missed.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
///
|
|
///
|
|
|
-/// Input rows and queries must be sorted by contig and position.
|
|
|
|
|
|
|
+/// * `rows` - BED rows sorted by `(contig, start)`
|
|
|
|
|
+/// * `queries` - Query ranges sorted by `(contig, start)`
|
|
|
|
|
+///
|
|
|
|
|
+/// # Returns
|
|
|
|
|
+///
|
|
|
|
|
+/// Matching BED rows in per-contig order. Cross-contig order is unspecified (parallel
|
|
|
|
|
+/// execution).
|
|
|
pub fn bedrow_overlaps_par(
|
|
pub fn bedrow_overlaps_par(
|
|
|
rows: &[BedRow],
|
|
rows: &[BedRow],
|
|
|
queries: &[&GenomeRange],
|
|
queries: &[&GenomeRange],
|
|
@@ -148,86 +202,57 @@ pub fn bedrow_overlaps_par(
|
|
|
where
|
|
where
|
|
|
BedRow: Clone + Send + Sync,
|
|
BedRow: Clone + Send + Sync,
|
|
|
{
|
|
{
|
|
|
- // Pre-compute [start, end) indices per contig for both inputs.
|
|
|
|
|
|
|
+ let row_ranges: Vec<&GenomeRange> = rows.iter().map(|r| &r.range).collect();
|
|
|
let (row_contigs, query_contigs) = rayon::join(
|
|
let (row_contigs, query_contigs) = rayon::join(
|
|
|
- || extract_contig_indices_bed(rows),
|
|
|
|
|
|
|
+ || extract_contig_indices(&row_ranges),
|
|
|
|| extract_contig_indices(queries),
|
|
|| extract_contig_indices(queries),
|
|
|
);
|
|
);
|
|
|
|
|
|
|
|
row_contigs
|
|
row_contigs
|
|
|
- .into_par_iter() // one task per contig
|
|
|
|
|
|
|
+ .into_par_iter()
|
|
|
.filter_map(|(contig, r_start, r_end)| {
|
|
.filter_map(|(contig, r_start, r_end)| {
|
|
|
- // No queries on this contig → skip the task.
|
|
|
|
|
let (q_start, q_end) = find_contig_indices(&query_contigs, contig)?;
|
|
let (q_start, q_end) = find_contig_indices(&query_contigs, contig)?;
|
|
|
|
|
|
|
|
let r_slice = &rows[r_start..r_end];
|
|
let r_slice = &rows[r_start..r_end];
|
|
|
let q_slice = &queries[q_start..q_end];
|
|
let q_slice = &queries[q_start..q_end];
|
|
|
|
|
|
|
|
let mut hits = Vec::new();
|
|
let mut hits = Vec::new();
|
|
|
- let (mut i, mut j) = (0usize, 0usize);
|
|
|
|
|
-
|
|
|
|
|
- // Classic two-finger sweep.
|
|
|
|
|
- while i < r_slice.len() && j < q_slice.len() {
|
|
|
|
|
- let r_range = r_slice[i].range(); // BedRow → GenomeRange
|
|
|
|
|
- let q_range = &q_slice[j].range;
|
|
|
|
|
-
|
|
|
|
|
- match (r_range.range.end <= q_range.start, q_range.end <= r_range.range.start) {
|
|
|
|
|
- (true, _) => i += 1, // row finishes before query starts
|
|
|
|
|
- (_, true) => j += 1, // query finishes before row starts
|
|
|
|
|
- _ => {
|
|
|
|
|
- hits.push(r_slice[i].clone()); // overlap detected
|
|
|
|
|
- if r_range.range.end < q_range.end {
|
|
|
|
|
- i += 1;
|
|
|
|
|
- } else {
|
|
|
|
|
- j += 1;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ let mut j0 = 0usize;
|
|
|
|
|
+
|
|
|
|
|
+ for row in r_slice {
|
|
|
|
|
+ let r = row.range();
|
|
|
|
|
+
|
|
|
|
|
+ while j0 < q_slice.len() && q_slice[j0].range.end <= r.range.start {
|
|
|
|
|
+ j0 += 1;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if j0 < q_slice.len() && q_slice[j0].range.start < r.range.end {
|
|
|
|
|
+ hits.push(row.clone());
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
Some(hits)
|
|
Some(hits)
|
|
|
})
|
|
})
|
|
|
.flatten()
|
|
.flatten()
|
|
|
.collect()
|
|
.collect()
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Computes contiguous slice indices for BED rows grouped by contig.
|
|
|
|
|
-///
|
|
|
|
|
-/// Assumes input is sorted by contig.
|
|
|
|
|
-fn extract_contig_indices_bed(rows: &[BedRow]) -> Vec<(u8, usize, usize)> {
|
|
|
|
|
- let mut out = Vec::new();
|
|
|
|
|
- if rows.is_empty() {
|
|
|
|
|
- return out;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- let mut current = rows[0].range.contig;
|
|
|
|
|
- let mut start = 0;
|
|
|
|
|
|
|
|
|
|
- for (idx, row) in rows.iter().enumerate() {
|
|
|
|
|
- if row.range.contig != current {
|
|
|
|
|
- out.push((current, start, idx));
|
|
|
|
|
- current = row.range.contig;
|
|
|
|
|
- start = idx;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- out.push((current, start, rows.len()));
|
|
|
|
|
- out
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-/// Pre-indexed BED structure for fast gene lookup by genomic interval.
|
|
|
|
|
|
|
+/// Per-contig indexed BED structure for fast gene-name lookup.
|
|
|
///
|
|
///
|
|
|
-/// BED rows are grouped by contig and stored in sorted order.
|
|
|
|
|
|
|
+/// Rows are sorted by `(contig, start, end)` at construction time and stored in
|
|
|
|
|
+/// 256 contig slots (one per encoded `u8` contig). Lookups are O(rows before `end`)
|
|
|
|
|
+/// with early termination — correct even for very long host genes.
|
|
|
#[derive(Clone)]
|
|
#[derive(Clone)]
|
|
|
pub struct GenesBedIndex {
|
|
pub struct GenesBedIndex {
|
|
|
- // contig (u8) -> BedRows on that contig, sorted by range.start
|
|
|
|
|
by_contig: Vec<Arc<[BedRow]>>,
|
|
by_contig: Vec<Arc<[BedRow]>>,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
impl GenesBedIndex {
|
|
impl GenesBedIndex {
|
|
|
- /// Builds a contig-indexed BED structure.
|
|
|
|
|
|
|
+ /// Build a [`GenesBedIndex`] from an unsorted vector of [`BedRow`]s.
|
|
|
///
|
|
///
|
|
|
- /// Input rows are sorted and grouped internally.
|
|
|
|
|
|
|
+ /// Rows are sorted by `(contig, start, end)` internally.
|
|
|
pub fn new(mut rows: Vec<BedRow>) -> Self {
|
|
pub fn new(mut rows: Vec<BedRow>) -> Self {
|
|
|
- // Ensure deterministic grouping & fast queries
|
|
|
|
|
rows.sort_unstable_by(|a, b| {
|
|
rows.sort_unstable_by(|a, b| {
|
|
|
a.range.contig
|
|
a.range.contig
|
|
|
.cmp(&b.range.contig)
|
|
.cmp(&b.range.contig)
|
|
@@ -235,63 +260,63 @@ impl GenesBedIndex {
|
|
|
.then_with(|| a.range.range.end.cmp(&b.range.range.end))
|
|
.then_with(|| a.range.range.end.cmp(&b.range.range.end))
|
|
|
});
|
|
});
|
|
|
|
|
|
|
|
- // 256 is fine if you encode contigs into u8
|
|
|
|
|
let mut tmp: Vec<Vec<BedRow>> = vec![Vec::new(); 256];
|
|
let mut tmp: Vec<Vec<BedRow>> = vec![Vec::new(); 256];
|
|
|
for r in rows {
|
|
for r in rows {
|
|
|
tmp[r.range.contig as usize].push(r);
|
|
tmp[r.range.contig as usize].push(r);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- let by_contig = tmp
|
|
|
|
|
- .into_iter()
|
|
|
|
|
- .map(Arc::<[BedRow]>::from)
|
|
|
|
|
- .collect();
|
|
|
|
|
-
|
|
|
|
|
- Self { by_contig }
|
|
|
|
|
|
|
+ Self {
|
|
|
|
|
+ by_contig: tmp.into_iter().map(Arc::<[BedRow]>::from).collect(),
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /// Returns gene names overlapping the given interval.
|
|
|
|
|
- #[inline]
|
|
|
|
|
|
|
+ /// Return the names of all genes overlapping `[start, end)` on `contig`.
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// Coordinates are 0-based half-open. If `start > end` the arguments are
|
|
|
|
|
+ /// swapped defensively. Genes without a name field are silently skipped.
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// # Arguments
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// * `contig` - Encoded contig index (see `contig_to_num`)
|
|
|
|
|
+ /// * `start` - Query start, 0-based inclusive
|
|
|
|
|
+ /// * `end` - Query end, 0-based exclusive
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// # Returns
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// Names of overlapping genes in start-sorted order.
|
|
|
pub fn query_genes(&self, contig: u8, start: u32, end: u32) -> Vec<String> {
|
|
pub fn query_genes(&self, contig: u8, start: u32, end: u32) -> Vec<String> {
|
|
|
let (s, e) = if start <= end { (start, end) } else { (end, start) };
|
|
let (s, e) = if start <= end { (start, end) } else { (end, start) };
|
|
|
let rows = &self.by_contig[contig as usize];
|
|
let rows = &self.by_contig[contig as usize];
|
|
|
- if rows.is_empty() {
|
|
|
|
|
- return Vec::new();
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // lower_bound on start
|
|
|
|
|
- let mut i = match rows.binary_search_by_key(&s, |r| r.range.range.start) {
|
|
|
|
|
- Ok(i) | Err(i) => i,
|
|
|
|
|
- };
|
|
|
|
|
- i = i.saturating_sub(1);
|
|
|
|
|
-
|
|
|
|
|
- let mut out: Vec<String> = Vec::new();
|
|
|
|
|
- while i < rows.len() {
|
|
|
|
|
- let r = &rows[i];
|
|
|
|
|
- let rs = r.range.range.start;
|
|
|
|
|
- let re = r.range.range.end;
|
|
|
|
|
|
|
|
|
|
- if rs > e {
|
|
|
|
|
- break;
|
|
|
|
|
- }
|
|
|
|
|
- if re >= s && rs <= e {
|
|
|
|
|
- if let Some(name) = &r.name {
|
|
|
|
|
- out.push(name.clone());
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- i += 1;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- out
|
|
|
|
|
|
|
+ rows.iter()
|
|
|
|
|
+ .take_while(|r| r.range.range.start < e)
|
|
|
|
|
+ .filter(|r| r.range.range.end > s)
|
|
|
|
|
+ .filter_map(|r| r.name.clone())
|
|
|
|
|
+ .collect()
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Convert BED -> BGZF (.gz) and create a Tabix index (.tbi) in the same pass.
|
|
|
|
|
|
|
+/// Compress a BED file to BGZF and build a Tabix index (`.tbi`) in a single pass.
|
|
|
|
|
+///
|
|
|
|
|
+/// The output files are `<input>.gz` and `<input>.gz.tbi`. The input must be
|
|
|
|
|
+/// tab-delimited, contig-grouped, and coordinate-sorted (Tabix requirement).
|
|
|
///
|
|
///
|
|
|
-/// Notes:
|
|
|
|
|
-/// - Tabix expects the file to be tab-delimited, grouped by contig, and coordinate-sorted.
|
|
|
|
|
-/// - BED is 0-based, half-open; Tabix uses 1-based positions. We convert as:
|
|
|
|
|
-/// start_pos = start0 + 1
|
|
|
|
|
-/// end_pos = end0
|
|
|
|
|
|
|
+/// # Coordinate conversion
|
|
|
|
|
+///
|
|
|
|
|
+/// BED uses 0-based half-open `[start, end)`. Tabix uses 1-based positions.
|
|
|
|
|
+/// The conversion is:
|
|
|
|
|
+/// - `tabix_start = bed_start + 1` (0-based → 1-based)
|
|
|
|
|
+/// - `tabix_end = bed_end` (0-based exclusive = 1-based inclusive numerically)
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `input` - Path to the input BED file
|
|
|
|
|
+/// * `force` - Overwrite the output `.gz` file if it already exists
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the input cannot be read, the output cannot be written,
|
|
|
|
|
+/// or any data line is malformed (missing columns, non-integer coordinates).
|
|
|
pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::Result<()> {
|
|
pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::Result<()> {
|
|
|
let input = input.as_ref();
|
|
let input = input.as_ref();
|
|
|
let out_bgz = format!("{}.gz", input.display());
|
|
let out_bgz = format!("{}.gz", input.display());
|
|
@@ -304,7 +329,7 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
|
|
|
|
|
|
|
|
// Build a tabix index while we write.
|
|
// Build a tabix index while we write.
|
|
|
let mut indexer = tabix::index::Indexer::default();
|
|
let mut indexer = tabix::index::Indexer::default();
|
|
|
- indexer.set_header(Header::default()); // minimal header; usable for most tools :contentReference[oaicite:1]{index=1}
|
|
|
|
|
|
|
+ indexer.set_header(Header::default());
|
|
|
|
|
|
|
|
let mut line = String::new();
|
|
let mut line = String::new();
|
|
|
loop {
|
|
loop {
|
|
@@ -314,17 +339,10 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
|
|
|
break;
|
|
break;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // Record start virtual offset for this line in the *output* bgzf stream.
|
|
|
|
|
- // let chunk_start = writer.bgzf_pos(); // BGZF virtual offset :contentReference[oaicite:2]{index=2}
|
|
|
|
|
-
|
|
|
|
|
let chunk_start = writer.virtual_position();
|
|
let chunk_start = writer.virtual_position();
|
|
|
- // Write line as-is.
|
|
|
|
|
writer.write_all(line.as_bytes())?;
|
|
writer.write_all(line.as_bytes())?;
|
|
|
|
|
+ let chunk_end = writer.virtual_position();
|
|
|
|
|
|
|
|
- // Record end virtual offset after writing the line.
|
|
|
|
|
- let chunk_end = writer.virtual_position(); // :contentReference[oaicite:3]{index=3}
|
|
|
|
|
-
|
|
|
|
|
- // Add to tabix index if it's a data line (skip meta/header lines).
|
|
|
|
|
if !line.starts_with('#') && !line.trim().is_empty() {
|
|
if !line.starts_with('#') && !line.trim().is_empty() {
|
|
|
let mut it = line.split('\t');
|
|
let mut it = line.split('\t');
|
|
|
let rname = it
|
|
let rname = it
|
|
@@ -340,10 +358,10 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
|
|
|
let end0: u32 = it
|
|
let end0: u32 = it
|
|
|
.next()
|
|
.next()
|
|
|
.context("BED: missing end")?
|
|
.context("BED: missing end")?
|
|
|
|
|
+ .trim()
|
|
|
.parse()
|
|
.parse()
|
|
|
.context("BED: invalid end")?;
|
|
.context("BED: invalid end")?;
|
|
|
|
|
|
|
|
- // BED 0-based half-open -> 1-based inclusive-ish interval for tabix
|
|
|
|
|
let start1 = start0
|
|
let start1 = start0
|
|
|
.checked_add(1)
|
|
.checked_add(1)
|
|
|
.context("BED: start overflow")?;
|
|
.context("BED: start overflow")?;
|
|
@@ -353,10 +371,7 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
|
|
|
let end = Position::try_from(end0 as usize)
|
|
let end = Position::try_from(end0 as usize)
|
|
|
.context("BED: end must be >= 1 for tabix indexing")?;
|
|
.context("BED: end must be >= 1 for tabix indexing")?;
|
|
|
|
|
|
|
|
- let chunk = Chunk::new(
|
|
|
|
|
- chunk_start,
|
|
|
|
|
- chunk_end,
|
|
|
|
|
- ); // chunk is [start, end) in virtual offsets :contentReference[oaicite:4]{index=4}
|
|
|
|
|
|
|
+ let chunk = Chunk::new(chunk_start, chunk_end);
|
|
|
|
|
|
|
|
indexer
|
|
indexer
|
|
|
.add_record(rname, start, end, chunk)
|
|
.add_record(rname, start, end, chunk)
|
|
@@ -365,20 +380,33 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
finalize_bgzf_file(writer, &out_bgz)?;
|
|
finalize_bgzf_file(writer, &out_bgz)?;
|
|
|
- // writer.close()?; // writes EOF marker :contentReference[oaicite:5]{index=5}
|
|
|
|
|
|
|
|
|
|
- let index = indexer.build(); // :contentReference[oaicite:6]{index=6}
|
|
|
|
|
- tabix::fs::write(&out_tbi, &index)?; // :contentReference[oaicite:7]{index=7}
|
|
|
|
|
|
|
+ let index = indexer.build();
|
|
|
|
|
+ tabix::fs::write(&out_tbi, &index)?;
|
|
|
|
|
|
|
|
Ok(())
|
|
Ok(())
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Returns a set of (chrom, start, end) intervals for centromeric regions
|
|
|
|
|
-/// parsed from a UCSC-format cytoband BED.
|
|
|
|
|
|
|
+/// Parse centromeric intervals from a UCSC cytoband BED file.
|
|
|
///
|
|
///
|
|
|
-/// Centromeres are identified by the `acen` stain field (column 4).
|
|
|
|
|
|
|
+/// Identifies rows whose stain field (column 5) equals `acen` and returns their
|
|
|
|
|
+/// coordinates as `(chrom, start, end)`. Coordinates are 0-based half-open as
|
|
|
|
|
+/// stored in the file.
|
|
|
|
|
+///
|
|
|
|
|
+/// Expected column layout: `chrom\tstart\tend\tband_name\tstain`
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `path` - Path to the cytoband BED file
|
|
|
|
|
+///
|
|
|
|
|
+/// # Returns
|
|
|
|
|
+///
|
|
|
|
|
+/// A vector of `(chrom, start, end)` tuples for all `acen` bands, in file order.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
///
|
|
///
|
|
|
-/// Format: `chrom\tstart\tend\tband_name\tstain`
|
|
|
|
|
|
|
+/// Returns an error if the file cannot be opened, a line has fewer than 5 columns,
|
|
|
|
|
+/// or a coordinate field cannot be parsed as `u64`.
|
|
|
pub fn parse_centromere_intervals(path: &str) -> anyhow::Result<Vec<(String, u64, u64)>> {
|
|
pub fn parse_centromere_intervals(path: &str) -> anyhow::Result<Vec<(String, u64, u64)>> {
|
|
|
let reader = BufReader::new(get_reader(path)
|
|
let reader = BufReader::new(get_reader(path)
|
|
|
.with_context(|| format!("Cannot open cytobands BED: {path}"))?);
|
|
.with_context(|| format!("Cannot open cytobands BED: {path}"))?);
|