|
|
@@ -429,7 +429,47 @@ fn read_range_inclusive(rec: &Record) -> Option<(u32, u32)> {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-fn scan_contig(
|
|
|
+/// Scans a single contig by streaming all alignments in that contig once, accumulating per-bin
|
|
|
+/// statistics, and returning the corresponding [`BinCount`] rows.
|
|
|
+///
|
|
|
+/// ## Design
|
|
|
+/// This function implements the “bitwise-identical output” mode while improving I/O locality:
|
|
|
+/// - **One indexed `fetch()`** for the whole contig, then sequential iteration over records.
|
|
|
+/// - Bins are defined using the same enumeration logic as the legacy implementation
|
|
|
+/// (via [`build_bin_defs_grid`]), producing **identical bin start/end ranges**.
|
|
|
+/// - A `grid_to_def` lookup safely maps a regular `bin_size` grid index to an existing bin definition,
|
|
|
+/// allowing fast assignment without assuming every grid bin exists.
|
|
|
+///
|
|
|
+/// ## Semantics (bitwise-identical mode)
|
|
|
+/// For each bin:
|
|
|
+/// - `depths`, `low_qualities`, `fb_invs` are incremented **per alignment** for every overlapping base.
|
|
|
+/// - `n_reads` and the ratio metrics are computed using a **qname-deduplicated representative record**
|
|
|
+/// per bin, where the representative is the **last** alignment encountered for that qname within
|
|
|
+/// the bin (“last wins”), matching the previous `HashMap` overwrite behavior.
|
|
|
+/// - `ratio_sa` is derived from the representative’s SA tag presence, while `ratio_start` and `ratio_end`
|
|
|
+/// reflect the maximum pileup of representative starts/ends at a single coordinate within the bin.
|
|
|
+///
|
|
|
+/// ## Parameters
|
|
|
+/// - `bam_path`: path to the BAM file (must have an index for `IndexedReader::fetch`).
|
|
|
+/// - `contig`: contig name as present in the BAM header.
|
|
|
+/// - `contig_len`: contig length (bp) from the BAM header dictionary.
|
|
|
+/// - `bin_size`: bin width (bp).
|
|
|
+/// - `chunk_n_bin`: legacy chunk size parameter, used **only** to reproduce historical bin range enumeration.
|
|
|
+/// - `min_mapq`: minimum MAPQ threshold used to increment `low_qualities` and evaluate `fb_invs`.
|
|
|
+///
|
|
|
+/// ## Returns
|
|
|
+/// A vector of [`BinCount`] rows for the contig, with `start/end` matching the legacy implementation.
|
|
|
+/// `outlier` is always `None` here; outliers are filled later by [`fill_outliers`].
|
|
|
+///
|
|
|
+/// ## Errors
|
|
|
+/// Returns an error if:
|
|
|
+/// - the BAM cannot be opened via `IndexedReader`
|
|
|
+/// - the contig cannot be fetched
|
|
|
+///
|
|
|
+/// ## Notes
|
|
|
+/// - Some contigs may legitimately produce zero bins if the legacy enumeration did so (e.g. very short contigs).
|
|
|
+/// - This function does not perform sorting; callers typically sort by `start` before writing.
|
|
|
+pub fn scan_contig(
|
|
|
bam_path: &str,
|
|
|
contig: &str,
|
|
|
contig_len: u32,
|
|
|
@@ -481,13 +521,28 @@ fn scan_contig(
|
|
|
}
|
|
|
|
|
|
let is_lowq = rec.mapq() < min_mapq;
|
|
|
- let has_fbinv = fb_inv_from_record(rec, &header, min_mapq, Some(150), Some(200)).is_some();
|
|
|
let has_sa = matches!(rec.aux(b"SA"), Ok(rust_htslib::bam::record::Aux::String(_)));
|
|
|
|
|
|
// Representative fields (same ones used by old count_reads_sa_start_end on reads_store.values())
|
|
|
let aln_start = rec.reference_start() as u32;
|
|
|
let aln_end = rec.reference_end() as u32;
|
|
|
|
|
|
+ let fb_positions: [Option<u32>; 2] =
|
|
|
+ if let Some(inv) = fb_inv_from_record(rec, &header, min_mapq, Some(150), Some(200)) {
|
|
|
+ let a_end_minus_1 = (inv.aln_a_end > 0
|
|
|
+ && (inv.aln_a_end - 1) < contig_len as i64)
|
|
|
+ .then_some((inv.aln_a_end - 1) as u32);
|
|
|
+
|
|
|
+ let b_start = (inv.aln_b_start >= 0
|
|
|
+ && inv.aln_b_start < contig_len as i64)
|
|
|
+ .then_some(inv.aln_b_start as u32);
|
|
|
+
|
|
|
+ [a_end_minus_1, b_start]
|
|
|
+ } else {
|
|
|
+ [None, None]
|
|
|
+ };
|
|
|
+
|
|
|
+
|
|
|
// "last wins" representative per qname (same as HashMap overwrite behavior)
|
|
|
let qname = rec.qname().to_vec();
|
|
|
|
|
|
@@ -512,8 +567,12 @@ fn scan_contig(
|
|
|
if is_lowq {
|
|
|
b.lowq[i] += 1;
|
|
|
}
|
|
|
- if has_fbinv {
|
|
|
- b.fb_invs[i] += 1;
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ for pos in fb_positions.into_iter().flatten() {
|
|
|
+ if pos >= def.start && pos <= def.end {
|
|
|
+ b.fb_invs[(pos - def.start) as usize] += 1;
|
|
|
}
|
|
|
}
|
|
|
|