|
|
@@ -5,12 +5,13 @@ use std::{
|
|
|
|
|
|
use anyhow::Context;
|
|
|
use log::warn;
|
|
|
+use rayon::prelude::*;
|
|
|
|
|
|
-use crate::{annotation::Annotation, positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomeRange}, variant::variant_collection::Variants};
|
|
|
+use crate::{annotation::Annotation, positions::{extract_contig_indices, find_contig_indices, overlaps_par, GenomePosition, GenomeRange, GetGenomeRange}, variant::variant_collection::Variants};
|
|
|
|
|
|
use super::readers::get_reader;
|
|
|
|
|
|
-#[derive(Debug)]
|
|
|
+#[derive(Debug, Clone)]
|
|
|
pub struct BedRow {
|
|
|
pub range: GenomeRange,
|
|
|
pub name: Option<String>,
|
|
|
@@ -108,3 +109,76 @@ pub fn annotate_with_bed(
|
|
|
|
|
|
Ok((total_bp, overlaps.len()))
|
|
|
}
|
|
|
+
|
|
|
+/// Collects clones of the BED rows that overlap any of the given query ranges.
+///
+/// Both inputs are first split into per-contig index spans (the two
+/// extractions run concurrently through `rayon::join`). Each row span is then
+/// processed as its own parallel task: the matching query span is looked up
+/// with `find_contig_indices`, and the two sub-slices are walked with a
+/// forward-only two-pointer sweep.
+///
+/// Notes on behaviour:
+/// * A range whose `end` equals the other range's `start` is treated as
+///   non-overlapping.
+/// * A row is cloned once for every query it is matched against during the
+///   sweep, so a row spanning several queries can appear more than once in
+///   the result.
+/// * NOTE(review): the sweep never moves backwards, so it presumably expects
+///   rows and queries to be grouped by contig and ordered by position within
+///   each contig — confirm against callers.
+pub fn bedrow_overlaps_par(
+    rows: &[BedRow],
+    queries: &[&GenomeRange],
+) -> Vec<BedRow>
+where
+    BedRow: Clone + Send + Sync,
+{
+    // Build the (contig, start, end) index tables for both inputs side by side.
+    let (row_spans, query_spans) = rayon::join(
+        || extract_contig_indices_bed(rows),
+        || extract_contig_indices(queries),
+    );
+
+    row_spans
+        .into_par_iter() // each contig span becomes an independent task
+        .filter_map(|(contig, row_lo, row_hi)| {
+            // Nothing to compare against when the lookup finds no query span.
+            let (query_lo, query_hi) = find_contig_indices(&query_spans, contig)?;
+
+            let contig_rows = &rows[row_lo..row_hi];
+            let contig_queries = &queries[query_lo..query_hi];
+
+            let mut matched = Vec::new();
+            let mut row_idx = 0usize;
+            let mut query_idx = 0usize;
+
+            // Two-pointer walk: always advance whichever side ends first.
+            while row_idx < contig_rows.len() && query_idx < contig_queries.len() {
+                let row_range = contig_rows[row_idx].range();
+                let query_range = &contig_queries[query_idx].range;
+
+                if row_range.range.end <= query_range.start {
+                    // Row lies entirely before the query.
+                    row_idx += 1;
+                } else if query_range.end <= row_range.range.start {
+                    // Query lies entirely before the row.
+                    query_idx += 1;
+                } else {
+                    // The two ranges intersect: record the row.
+                    matched.push(contig_rows[row_idx].clone());
+                    if row_range.range.end < query_range.end {
+                        row_idx += 1;
+                    } else {
+                        query_idx += 1;
+                    }
+                }
+            }
+            Some(matched)
+        })
+        .flatten()
+        .collect()
+}
|
|
|
+
|
|
|
+/// `BedRow` counterpart of `extract_contig_indices`.
+///
+/// Scans `rows` once and emits one `(contig, slice_start, slice_end)` triple
+/// per run of consecutive rows sharing the same contig, where
+/// `rows[slice_start..slice_end]` is that run. An empty input yields an empty
+/// vector. If rows of one contig are not contiguous in the input, that contig
+/// is reported once per run.
+fn extract_contig_indices_bed(rows: &[BedRow]) -> Vec<(u8, usize, usize)> {
+    let mut spans = Vec::new();
+    let mut run_start = 0usize;
+
+    // A run ends wherever two neighbouring rows disagree on the contig.
+    for (left_idx, pair) in rows.windows(2).enumerate() {
+        let boundary = left_idx + 1;
+        if pair[1].range.contig != pair[0].range.contig {
+            spans.push((pair[0].range.contig, run_start, boundary));
+            run_start = boundary;
+        }
+    }
+
+    // Close the trailing run; absent only when `rows` is empty.
+    if let Some(last_row) = rows.last() {
+        spans.push((last_row.range.contig, run_start, rows.len()));
+    }
+    spans
+}
|