Bläddra i källkod

bedrow overlaps par

Thomas 7 månader sedan
förälder
incheckning
a4df568492
4 ändrade filer med 125 tillägg och 5 borttagningar
  1. 76 2
      src/io/bed.rs
  2. 9 1
      src/lib.rs
  3. 3 0
      src/pipes/somatic.rs
  4. 37 2
      src/positions.rs

+ 76 - 2
src/io/bed.rs

@@ -5,12 +5,13 @@ use std::{
 
 use anyhow::Context;
 use log::warn;
+use rayon::prelude::*;
 
-use crate::{annotation::Annotation, positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomeRange}, variant::variant_collection::Variants};
+use crate::{annotation::Annotation, positions::{extract_contig_indices, find_contig_indices, overlaps_par, GenomePosition, GenomeRange, GetGenomeRange}, variant::variant_collection::Variants};
 
 use super::readers::get_reader;
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct BedRow {
     pub range: GenomeRange,
     pub name: Option<String>,
@@ -108,3 +109,76 @@ pub fn annotate_with_bed(
 
     Ok((total_bp, overlaps.len()))
 }
+
+pub fn bedrow_overlaps_par(
+    rows: &[BedRow],
+    queries: &[&GenomeRange],
+) -> Vec<BedRow>
+where
+    BedRow: Clone + Send + Sync,
+{
+    // Pre-compute [start, end) indices per contig for both inputs.
+    let (row_contigs, query_contigs) = rayon::join(
+        || extract_contig_indices_bed(rows),
+        || extract_contig_indices(queries),
+    );
+
+    row_contigs
+        .into_par_iter()                             // one task per contig
+        .filter_map(|(contig, r_start, r_end)| {
+            // No queries on this contig → skip the task.
+            let (q_start, q_end) = find_contig_indices(&query_contigs, contig)?;
+
+            let r_slice = &rows[r_start..r_end];
+            let q_slice = &queries[q_start..q_end];
+
+            let mut hits = Vec::new();
+            let (mut i, mut j) = (0usize, 0usize);
+
+            // Classic two-finger sweep.
+            while i < r_slice.len() && j < q_slice.len() {
+                let r_range = r_slice[i].range();           // BedRow → GenomeRange
+                let q_range = &q_slice[j].range;
+
+                match (r_range.range.end <= q_range.start, q_range.end <= r_range.range.start) {
+                    (true, _) => i += 1,                    // row finishes before query starts
+                    (_, true) => j += 1,                    // query finishes before row starts
+                    _ => {
+                        hits.push(r_slice[i].clone());      // overlap detected
+                        if r_range.range.end < q_range.end {
+                            i += 1;
+                        } else {
+                            j += 1;
+                        }
+                    }
+                }
+            }
+            Some(hits)
+        })
+        .flatten()
+        .collect()
+}
+
+/// Same idea as `extract_contig_indices` but for a `BedRow` slice.
+///
+/// Returns a vector of  
+/// `(contig, slice_start, slice_end)` for each contig present.
+fn extract_contig_indices_bed(rows: &[BedRow]) -> Vec<(u8, usize, usize)> {
+    let mut out = Vec::new();
+    if rows.is_empty() {
+        return out;
+    }
+
+    let mut current = rows[0].range.contig;
+    let mut start = 0;
+
+    for (idx, row) in rows.iter().enumerate() {
+        if row.range.contig != current {
+            out.push((current, start, idx));
+            current = row.range.contig;
+            start = idx;
+        }
+    }
+    out.push((current, start, rows.len()));
+    out
+}

+ 9 - 1
src/lib.rs

@@ -178,7 +178,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::SavanaCN}, collection::{bam::{self, nt_pileup_new}, flowcells::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig, ShouldRun}, commands::dorado::Dorado, helpers::find_files, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges}, scan::scan::somatic_scan, variant::{variant::{AlterationCategory, BNDDesc, BNDGraph, GroupByThreshold, ToBNDGraph}, variant_collection::{group_variants_by_bnd_desc, group_variants_by_bnd_rc, Variant}, variants_stats::{self, somatic_depth_quality_ranges, VariantsStats}}};
+    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::SavanaCN}, collection::{bam::{self, nt_pileup_new}, flowcells::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig, ShouldRun}, commands::dorado::Dorado, helpers::find_files, io::{bed::bedrow_overlaps_par, dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges}, scan::scan::somatic_scan, variant::{variant::{AlterationCategory, BNDDesc, BNDGraph, GroupByThreshold, ToBNDGraph}, variant_collection::{group_variants_by_bnd_desc, group_variants_by_bnd_rc, Variant}, variants_stats::{self, somatic_depth_quality_ranges, VariantsStats}}};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -636,6 +636,14 @@ mod tests {
         println!("{:?}", del);
         println!("{:?} {:?}", del.len(), variant.formats.n_alt_depth());
 
+        let path = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_Genes.bed";
+        let r = read_bed(path)?;
+        
+        let deleted_genes = bedrow_overlaps_par(&r, &vec![&GenomeRange { contig: variant.position.contig, range: del.start..del.end }]).into_iter().filter_map(|e| {
+            e.name
+        }).collect::<Vec<String>>().join(", ");
+        println!("{deleted_genes}");
+
 
         Ok(())
     }

+ 3 - 0
src/pipes/somatic.rs

@@ -479,6 +479,9 @@ impl Run for SomaticPipe {
             .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?;
 
         info!("Final unique variants: {}", variants.data.len());
+
+        // Ensure sorted (should already be sorted) and save
+        variants.sort();
         variants.save_to_json(&result_json)?;
         variants.save_to_file(&result_bit)?;
         variants.write_vcf(&result_vcf, &config.dict_file, config.somatic_pipe_force)?;

+ 37 - 2
src/positions.rs

@@ -696,6 +696,41 @@ pub fn overlaps_par(positions: &[&GenomePosition], ranges: &[&GenomeRange]) -> V
 //         .collect()
 // }
 
+/// Returns every region that **overlaps** between two sorted
+/// slices of genome ranges, distributing the work across
+/// threads with **Rayon**.
+///
+/// Both inputs **must**
+/// - be sorted by `(contig, start)`
+/// - contain non-overlapping, half-open ranges (`start < end`)
+/// - use the same coordinate system.
+///
+/// # Parameters
+/// * `a` – Slice of references to `GenomeRange`.
+/// * `b` – Slice of references to `GenomeRange`.
+///
+/// # Returns
+/// A `Vec<GenomeRange>` containing one entry for each pairwise
+/// intersection.  
+/// The output is *not* guaranteed to be sorted.
+///
+/// # Complexity
+/// `O(|a| + |b|)` total work; memory ≤ the number of produced
+/// intersections.  Work is split per contig and executed in
+/// parallel.
+///
+/// # Panics
+/// Never panics if the pre-conditions above are met.
+///
+/// # Example
+/// ```
+/// # use pandora_lib_promethion::{GenomeRange, range_intersection_par};
+/// let a_range = GenomeRange { contig: 1, range: 100..200 };
+/// let b_range = GenomeRange { contig: 1, range: 150..250 };
+///
+/// let out = range_intersection_par(&[&a_range], &[&b_range]);
+/// assert_eq!(out, vec![GenomeRange { contig: 1, range: 150..200 }]);
+/// ```
 pub fn range_intersection_par(a: &[&GenomeRange], b: &[&GenomeRange]) -> Vec<GenomeRange> {
     let (a_contigs, b_contigs) = rayon::join(
         || extract_contig_indices(a),
@@ -745,7 +780,7 @@ pub fn range_intersection_par(a: &[&GenomeRange], b: &[&GenomeRange]) -> Vec<Gen
 }
 
 // Helper to create (contig, start index, end index) tuples
-fn extract_contig_indices(ranges: &[&GenomeRange]) -> Vec<(u8, usize, usize)> {
+pub fn extract_contig_indices(ranges: &[&GenomeRange]) -> Vec<(u8, usize, usize)> {
     let mut indices = Vec::new();
     let mut current_contig = None;
     let mut start_idx = 0;
@@ -768,7 +803,7 @@ fn extract_contig_indices(ranges: &[&GenomeRange]) -> Vec<(u8, usize, usize)> {
 }
 
 // Binary search to find contig indices in precomputed list
-fn find_contig_indices(contigs: &[(u8, usize, usize)], target: u8) -> Option<(usize, usize)> {
+pub fn find_contig_indices(contigs: &[(u8, usize, usize)], target: u8) -> Option<(usize, usize)> {
     contigs.binary_search_by(|(c, _, _)| c.cmp(&target))
         .ok()
         .map(|idx| (contigs[idx].1, contigs[idx].2))