Thomas пре 6 дана
родитељ
комит
fc28b7c1a7
3 измењених фајлова са 68 додато и 9 уклоњено
  1. 1 1
      src/callers/mod.rs
  2. 4 4
      src/io/bam.rs
  3. 63 4
      src/scan/scan.rs

+ 1 - 1
src/callers/mod.rs

@@ -201,7 +201,7 @@ pub mod straglr;
 /// - DeepVariant: 2-4 hours (per sample, chunked)
 /// - ClairS: 4-6 hours (chunked)
 /// - Severus: 1-2 hours
-/// - Savana: 2-3 hours
+/// - Savana: 6 hours
 /// - NanomonSV: 1-2 hours
 /// - DeepSomatic: 3-5 hours (chunked)
 ///

+ 4 - 4
src/io/bam.rs

@@ -335,14 +335,14 @@ pub enum SegmentOrder {
 /// ```text
 /// Case: A on '+' strand, B on '-' strand
 ///
-///              a                      c
-///              |                      |
-///              v                      v
+///              a                        b
+///              |                        |
+///              v                        v
 /// Reference:---[=====A=================>]---------
 ///           --------[<=============B======]-------
 ///                   ^                     ^
 ///                   |                     |
-///                   b                     d
+///                   c                     d
 ///
 /// Alignment A (+ strand):  a=aln_a_start, b=aln_a_end
 /// Alignment B (- strand):  c=aln_b_start, d=aln_b_end

+ 63 - 4
src/scan/scan.rs

@@ -429,7 +429,47 @@ fn read_range_inclusive(rec: &Record) -> Option<(u32, u32)> {
     }
 }
 
-fn scan_contig(
+/// Scans a single contig by streaming all alignments in that contig once, accumulating per-bin
+/// statistics, and returning the corresponding [`BinCount`] rows.
+///
+/// ## Design
+/// This function implements the “bitwise-identical output” mode while improving I/O locality:
+/// - **One indexed `fetch()`** for the whole contig, then sequential iteration over records.
+/// - Bins are defined using the same enumeration logic as the legacy implementation
+///   (via [`build_bin_defs_grid`]), producing **identical bin start/end ranges**.
+/// - A `grid_to_def` lookup safely maps a regular `bin_size` grid index to an existing bin definition,
+///   allowing fast assignment without assuming every grid bin exists.
+///
+/// ## Semantics (bitwise-identical mode)
+/// For each bin:
+/// - `depths`, `low_qualities`, `fb_invs` are incremented **per alignment** for every overlapping base.
+/// - `n_reads` and the ratio metrics are computed using a **qname-deduplicated representative record**
+///   per bin, where the representative is the **last** alignment encountered for that qname within
+///   the bin (“last wins”), matching the previous `HashMap` overwrite behavior.
+/// - `ratio_sa` is derived from the representative’s SA tag presence, while `ratio_start` and `ratio_end`
+///   reflect the maximum pileup of representative starts/ends at a single coordinate within the bin.
+///
+/// ## Parameters
+/// - `bam_path`: path to the BAM file (must have an index for `IndexedReader::fetch`).
+/// - `contig`: contig name as present in the BAM header.
+/// - `contig_len`: contig length (bp) from the BAM header dictionary.
+/// - `bin_size`: bin width (bp).
+/// - `chunk_n_bin`: legacy chunk size parameter, used **only** to reproduce historical bin range enumeration.
+/// - `min_mapq`: minimum MAPQ threshold used to increment `low_qualities` and evaluate `fb_invs`.
+///
+/// ## Returns
+/// A vector of [`BinCount`] rows for the contig, with `start/end` matching the legacy implementation.
+/// `outlier` is always `None` here; outliers are filled later by [`fill_outliers`].
+///
+/// ## Errors
+/// Returns an error if:
+/// - the BAM cannot be opened via `IndexedReader`
+/// - the contig cannot be fetched
+///
+/// ## Notes
+/// - Some contigs may legitimately produce zero bins if the legacy enumeration did so (e.g. very short contigs).
+/// - This function does not perform sorting; callers typically sort by `start` before writing.
+pub fn scan_contig(
     bam_path: &str,
     contig: &str,
     contig_len: u32,
@@ -481,13 +521,28 @@ fn scan_contig(
         }
 
         let is_lowq = rec.mapq() < min_mapq;
-        let has_fbinv = fb_inv_from_record(rec, &header, min_mapq, Some(150), Some(200)).is_some();
         let has_sa = matches!(rec.aux(b"SA"), Ok(rust_htslib::bam::record::Aux::String(_)));
 
         // Representative fields (same ones used by old count_reads_sa_start_end on reads_store.values())
         let aln_start = rec.reference_start() as u32;
         let aln_end = rec.reference_end() as u32;
 
+        let fb_positions: [Option<u32>; 2] =
+            if let Some(inv) = fb_inv_from_record(rec, &header, min_mapq, Some(150), Some(200)) {
+                let a_end_minus_1 = (inv.aln_a_end > 0
+                    && (inv.aln_a_end - 1) < contig_len as i64)
+                    .then_some((inv.aln_a_end - 1) as u32);
+
+                let b_start = (inv.aln_b_start >= 0
+                    && inv.aln_b_start < contig_len as i64)
+                    .then_some(inv.aln_b_start as u32);
+
+                [a_end_minus_1, b_start]
+            } else {
+                [None, None]
+            };
+
+
         // "last wins" representative per qname (same as HashMap overwrite behavior)
         let qname = rec.qname().to_vec();
 
@@ -512,8 +567,12 @@ fn scan_contig(
                 if is_lowq {
                     b.lowq[i] += 1;
                 }
-                if has_fbinv {
-                    b.fb_invs[i] += 1;
+                
+            }
+
+            for pos in fb_positions.into_iter().flatten() {
+                if pos >= def.start && pos <= def.end {
+                    b.fb_invs[(pos - def.start) as usize] += 1;
                 }
             }