2 månader sedan · bd93786500
--- a/memo_fb_inv.md
+++ b/memo_fb_inv.md
@@ -0,0 +1,194 @@
 
															+# Fold-Back Inversion — Geometry, Math, and Implementation Notes
														
 
															+
														
 
															+## Reference
														
 
															+
														
 
															+Python reference implementation:
														
 
															+https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py
														
 
															+
														
 
															+Rust implementation: `src/io/bam.rs` — `fb_inv_from_record()`, `FbInv` struct, `breakpoints()`.
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## What Is a Fold-Back Inversion?
														
 
															+
														
 
															+A fold-back inversion (fb-inv) occurs when a sequencing read aligns to the reference
														
 
															+twice on the **same chromosome** with **opposite strands**. The read goes forward along
														
 
															+the reference, then "folds back" and runs in reverse over the same (or overlapping) region.
														
 
															+
														
 
															+This creates a signal that looks like an **inverted duplication**: the read covers a
														
 
															+stretch of reference sequence twice — once in the forward direction, once in reverse.
														
 
															+This is a known artefact in ONT long-read data at sites of DNA damage or structural
														
 
															+variation (double-strand breaks, inverted repeats).
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## Geometry
														
 
															+
														
 
															+### General case (fold with offset `S`)
														
 
															+
														
 
															+```
														
 
															+Reference position:
														
 
															+    a        d         b         c
														
 
															+    |        |         |         |
														
 
															+    v        v         v         v
														
 
															+
														
 
															+    [=============A===============>]          A: primary, + strand
														
 
															+             [<=================B====]        B: supplementary, − strand
														
 
															+
														
 
															+    |←— A length —————————————————→|
														
 
															+             |←——— B length ————————→|
														
 
															+             |←overlap→|
														
 
															+    |←— a_d ——→|                             outer offset  = |a − d|
														
 
															+                         |←— b_c ——→|        junction gap  = |b − c|
														
 
															+```
														
 
															+
														
 
															+### Canonical perfect inverted duplication (`S = 0`)
														
 
															+
														
 
															+When B covers exactly the same region as A:
														
 
															+
														
 
															+```
														
 
															+    a                             b
														
 
															+    |                             |
														
 
															+    v                             v
														
 
															+    [=============A===============>]
														
 
															+    [<==============B==============]
														
 
															+    ^                             ^
														
 
															+    d                             c
														
 
															+
														
 
															+    b_c = |b − c| = 0    (no junction gap)
														
 
															+    a_d = |a − d| = 0    (perfectly symmetric)
														
 
															+```
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## Breakpoint Convention
														
 
															+
														
 
															+Breakpoints `(a, b)` and `(c, d)` are defined in **read order** (5′→3′ along the read),
														
 
															+not in reference order. This is critical:
														
 
															+
														
 
															+| Strand | 5′ terminus (read-first) | 3′ terminus (read-last) |
														
 
															+|--------|--------------------------|--------------------------|
														
 
															+| `+`    | leftmost ref pos = `start`  | rightmost ref pos = `end` |
														
 
															+| `−`    | rightmost ref pos = `end`   | leftmost ref pos = `start` |
														
 
															+
														
 
															+So `breakpoints(start, end, strand)` always returns `(5′_coord, 3′_coord)`:
														
 
															+
														
 
															+```
														
 
															++ strand: (start, end)   →  a = start,  b = end
														
 
															+− strand: (end,   start) →  c = end,    d = start
														
 
															+```
														
 
															+
														
 
															+Alignments A and B are ordered by their position **on the read** (A comes first). The
														
 
															+canonical fold-back case is A on `+` strand (forward part of read), B on `−` strand
														
 
															+(reverse part of read).
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## Derived Metrics
														
 
															+
														
 
															+```
														
 
															+aln_a_len = |a − b|   alignment A length on reference
														
 
															+aln_b_len = |c − d|   alignment B length on reference
														
 
															+b_c       = |b − c|   read-order junction gap  (3′ end of A → 5′ end of B)
														
 
															+a_d       = |a − d|   read-order outer span    (5′ end of A → 3′ end of B)
														
 
															+```
														
 
															+
														
 
															+### Interpretation with offset `S`
														
 
															+
														
 
															+For an inverted duplication shifted by `S` bp:
														
 
															+
														
 
															+```
														
 
															+A: [X,     X+L)  on +
														
 
															+B: [X+S,   X+L+S) on −
														
 
															+
														
 
															+a = X,     b = X+L
														
 
															+c = X+L+S, d = X+S
														
 
															+
														
 
															+b_c = |(X+L) − (X+L+S)| = S
														
 
															+a_d = |X − (X+S)|        = S
														
 
															+```
														
 
															+
														
 
															+Both metrics equal `S`, the size of the non-duplicated tail on each side.
														
 
															+The **duplicated region** is `[d, b] = [X+S, X+L)`, of length `L − S`.
														
 
															+
														
 
															+For `S = 0` (perfect duplication): `b_c = 0`, `a_d = 0`, duplicated region = all of A.
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## Reference-Space Overlap / Gap (Filtering)
														
 
															+
														
 
															+Before computing breakpoints, a separate metric `interval_gap` is used to pre-filter:
														
 
															+
														
 
															+```
														
 
															+interval_gap = max(A_start, B_start) − min(A_end, B_end)
														
 
															+```
														
 
															+
														
 
															+- Negative → the two alignments **overlap** on the reference by `|gap|` bp (typical fb-inv)
														
 
															+- Zero → adjacent
														
 
															+- Positive → separated by a gap
														
 
															+
														
 
															+Note: `interval_gap` is a reference-space measure. `b_c`/`a_d` are read-space measures.
														
 
															+They capture complementary aspects of the fold geometry.
														
 
															+
														
 
															+### Recommended thresholds for artefact detection
														
 
															+
														
 
															+```
														
 
															+max_overlap = Some(150)   allow up to 150 bp overlap
														
 
															+max_gap     = Some(0)     require overlap — no gap allowed
														
 
															+```
														
 
															+
														
 
															+**Why `max_gap = Some(0)` for artefacts:** in Nanopore sequencing, a fold-back artefact
														
 
															+arises when the single-stranded DNA molecule forms a **hairpin** and re-translocates
														
 
															+back through the pore in the reverse direction. Because the same physical DNA strand is
														
 
															+read twice — once forward, once in reverse — the two alignments **must overlap** on the
														
 
															+reference. A positive reference-space gap between the alignments is inconsistent with
														
 
															+this mechanism and instead points to a true SV junction with unaligned breakpoint
														
 
															+sequence. Allowing a gap (e.g. `max_gap = Some(200)`) risks capturing genuine structural
														
 
															+variants and inflating the artefact count.
														
 
															+
														
 
															+The reference Python implementation requires strict overlap (`overlap > 0 bp`) for
														
 
															+exactly this reason.
														
 
															+
														
 
															+### Artefact vs. true SV classification
														
 
															+
														
 
															+Events passing the overlap filter are further classified by `a_d`:
														
 
															+
														
 
															+```
														
 
															+a_d < 150 bp  →  likely sequencing artefact (short hairpin / palindrome)
														
 
															+a_d ≥ 150 bp  →  candidate true structural variant
														
 
															+```
														
 
															+
														
 
															+This threshold comes directly from the reference Python implementation.
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## Diagram Label Summary
														
 
															+
														
 
															+```
														
 
															+Reference:---[=====A=================>]---------
														
 
															+          --------[<=============B======]-------
														
 
															+             ^    ^              ^    ^
														
 
															+             a    d              b    c
														
 
															+
														
 
															+a = 5′ of A (leftmost of A,  read-first)    = aln_a_start
														
 
															+b = 3′ of A (rightmost of A, read-last)     = aln_a_end
														
 
															+c = 5′ of B (rightmost of B, read-first)    = aln_b_end
														
 
															+d = 3′ of B (leftmost of B,  read-last)     = aln_b_start
														
 
															+```
														
 
															+
														
 
															+The region `[d, b]` (where d < b) is the **inverted duplication** — covered forward by A
														
 
															+and backward by B.
														
 
															+
														
 
															+---
														
 
															+
														
 
															+## Filtering Logic Summary
														
 
															+
														
 
															+```
														
 
															+1. Record must be primary, mapped, MAPQ ≥ min_mapq
														
 
															+2. Exactly one SA tag entry, same chromosome, MAPQ ≥ min_mapq
														
 
															+3. Opposite strands (one +, one −)
														
 
															+4. |interval_gap| ≤ max_overlap  (if overlap)
														
 
															+5. interval_gap   ≤ max_gap      (if gap)
														
 
															+```
														
 
															+
														
 
															+All five conditions must hold for a record to be classified as a fold-back inversion.
														
--- a/src/de_novo/de_novo_pipe.rs
+++ b/src/de_novo/de_novo_pipe.rs
@@ -280,14 +280,11 @@ pub fn run_local_assembly_iterative(
 
															                 })
														
 
															             })
														
 
															             .filter_map(|supp_rec| {
														
 
															-                // Resolve primary from original BAM using SA tag
														
 
															-                let resolved = primary_record(&mut bam_reader, supp_rec);
														
 
															+                let resolved = primary_record(&mut bam_reader, supp_rec)
														
 
															+                    .map_err(|e| log::warn!("primary_record failed: {e}"))
														
 
															+                    .ok()?;
														
 
															                 let qname = resolved.qname().to_vec();
														
 
															-                if !seen.contains(&qname) {
														
 
															-                    Some(resolved)
														
 
															-                } else {
														
 
															-                    None
														
 
															-                }
														
 
															+                if !seen.contains(&qname) { Some(resolved) } else { None }
														
 
															             })
														
 
															             .collect();
														
--- a/src/io/bam.rs
+++ b/src/io/bam.rs
@@ -11,113 +11,125 @@ use rust_htslib::bam::{
 
															 use crate::{commands::samtools::SamtoolsReheader, config::Config, runners::Run};
														
 
															-/// Parses an SA tag string and extracts chromosome and position information.
														
 
															+/// A single entry from a BAM SA (supplementary alignment) tag.
														
 
															+///
														
 
															+/// SA tag format per the SAM spec: `rname,pos,strand,CIGAR,mapQ,NM` (semicolon-separated).
														
 
															+/// `pos` is stored as 0-based after conversion from the 1-based SAM value.
														
 
															+pub struct SaEntry<'a> {
														
 
															+    /// Reference/contig name
														
 
															+    pub chr: &'a str,
														
 
															+    /// Reference start position, 0-based
														
 
															+    pub pos: i32,
														
 
															+    /// Strand: `'+'` or `'-'`
														
 
															+    pub strand: char,
														
 
															+    /// CIGAR string (unparsed)
														
 
															+    pub cigar: &'a str,
														
 
															+    /// Mapping quality
														
 
															+    pub mapq: u8,
														
 
															+    /// Edit distance (NM tag value); `0` if absent or unparseable
														
 
															+    pub nm: u32,
														
 
															+}
														
 
															+
														
 
															+/// Parse all entries from a BAM SA tag value into [`SaEntry`] structs.
														
 
															+///
														
 
															+/// `pos` is converted from 1-based (SAM spec) to 0-based. Entries with fewer than
														
 
															+/// six comma-separated fields or unparseable numeric fields are silently skipped.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															 ///
														
 
															-/// * `sa` - The SA tag string to parse
														
 
															+/// * `sa` - Raw SA tag value (the string after `SA:Z:` in a SAM record)
														
 
															 ///
														
 
															 /// # Returns
														
 
															 ///
														
 
															-/// A vector of tuples containing chromosome names and positions
														
 
															-pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
														
 
															+/// A vector of parsed entries. Empty if the tag contains no valid entries.
														
 
															+pub fn parse_sa_entries(sa: &str) -> Vec<SaEntry<'_>> {
														
 
															     sa.split(';')
														
 
															         .filter(|s| !s.is_empty())
														
 
															         .filter_map(|s| {
														
 
															-            let parts: Vec<&str> = s.split(',').take(2).collect();
														
 
															-            if parts.len() < 2 {
														
 
															+            let f: Vec<&str> = s.splitn(6, ',').collect();
														
 
															+            if f.len() < 6 {
														
 
															                 return None;
														
 
															             }
														
 
															-
														
 
															-            let chr = parts[0];
														
 
															-            match parts[1].parse::<i32>() {
														
 
															-                Ok(position) => Some((chr, position)),
														
 
															-                Err(_) => None,
														
 
															-            }
														
 
															+            Some(SaEntry {
														
 
															+                chr: f[0],
														
 
															+                pos: f[1].parse::<i32>().ok()? - 1,
														
 
															+                strand: f[2].chars().next()?,
														
 
															+                cigar: f[3],
														
 
															+                mapq: f[4].parse().ok()?,
														
 
															+                nm: f[5].parse().unwrap_or(0),
														
 
															+            })
														
 
															         })
														
 
															         .collect()
														
 
															 }
														
 
															-/// Resolves the primary record for supplementary alignments using BAM SA tag
														
 
															+/// Convenience wrapper over [`parse_sa_entries`] returning only `(chr, pos)` pairs.
														
 
															+///
														
 
															+/// `pos` is 0-based. Useful when only fetch coordinates are needed.
														
 
															+pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
														
 
															+    parse_sa_entries(sa).into_iter().map(|e| (e.chr, e.pos)).collect()
														
 
															+}
														
 
															+
														
 
															+/// Resolve a supplementary alignment to its primary record via the SA tag.
														
 
															 ///
														
 
															-/// For supplementary alignments, searches linked primary records using genomic positions
														
 
															-/// listed in the SA tag. Returns the first non-supplementary record with matching query name.
														
 
															+/// For supplementary alignments, each position listed in the SA tag is fetched and
														
 
															+/// scanned for a non-supplementary record with the same query name. The first match
														
 
															+/// is returned. If the record is not supplementary, or no primary is found across all
														
 
															+/// SA positions, the original record is returned unchanged.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															-/// * `bam` - Mutable reference to indexed BAM reader for random access
														
 
															-/// * `record` - Input record to evaluate (typically a supplementary alignment)
														
 
															+///
														
 
															+/// * `bam` - Indexed BAM reader used for random-access fetches
														
 
															+/// * `record` - The record to resolve (typically a supplementary alignment)
														
 
															 ///
														
 
															 /// # Returns
														
 
															-/// - Original record if it's not supplementary
														
 
															-/// - First matching primary record found via SA tag positions
														
 
															-/// - Original record if no primary matches found
														
 
															 ///
														
 
															-/// # Panics
														
 
															-/// - If SA tag format is invalid (missing fields or non-integer positions)
														
 
															-/// - If BAM fetch operation fails
														
 
															+/// The primary alignment record, or the input record if resolution is not applicable
														
 
															+/// or unsuccessful.
														
 
															 ///
														
 
															-/// # Example
														
 
															-/// ```
														
 
															-/// use rust_htslib::{bam::{IndexedReader, Read}};
														
 
															+/// # Errors
														
 
															 ///
														
 
															-/// let mut bam = IndexedReader::from_path("input.bam").unwrap();
														
 
															-/// let record = bam.records().next().unwrap().unwrap();
														
 
															-/// let primary = primary_record(&mut bam, record);
														
 
															-/// ```
														
 
															-pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
														
 
															-    // Return immediately if not a supplementary alignment
														
 
															+/// Returns an error if a BAM fetch operation fails or if a fetched record is malformed.
														
 
															+pub fn primary_record(bam: &mut IndexedReader, record: Record) -> anyhow::Result<Record> {
														
 
															     if !record.is_supplementary() {
														
 
															-        return record;
														
 
															+        return Ok(record);
														
 
															     }
														
 
															     let qname = record.qname();
														
 
															-    // Process SA tag if present
														
 
															     if let Ok(Aux::String(sa)) = record.aux(b"SA") {
														
 
															-        // Search potential primary alignments at each SA position
														
 
															         for (chr, start) in parse_sa_tag(sa) {
														
 
															             bam.fetch((chr, start, start + 1))
														
 
															-                .expect("BAM fetch failed");
														
 
															+                .with_context(|| format!("BAM fetch failed at {chr}:{start}"))?;
														
 
															             for result in bam.records() {
														
 
															-                let candidate = result.expect("Invalid BAM record");
														
 
															-                if candidate.qname() == qname && !candidate.is_supplementary() {
														
 
															-                    return candidate.clone();
														
 
															+                let candidate = result.context("Invalid BAM record")?;
														
 
															+                if candidate.qname() == qname && !candidate.is_supplementary() && !candidate.is_secondary() {
														
 
															+                    return Ok(candidate);
														
 
															                 }
														
 
															             }
														
 
															         }
														
 
															     }
														
 
															-    // Fallback to original record if no primary found
														
 
															-    record
														
 
															+    Ok(record)
														
 
															 }
														
 
															-/// Creates optimized position bins for fetching records.
														
 
															+/// Group 0-based positions by chromosome into bins spanning at most 1000 bp, sorted by bin size descending.
														
 
															 ///
														
 
															-/// Groups positions by chromosome and creates bins of ±1000 bp.
														
 
															-/// Sorts bins by size (descending) to prioritize regions with more alignment hits.
														
 
															-///
														
 
															-/// # Arguments
														
 
															-///
														
 
															-/// * `positions` - Vector of chromosome/position tuples
														
 
															-///
														
 
															-/// # Returns
														
 
															-///
														
 
															-/// A vector of position bins, sorted by bin size
														
 
															+/// Positions within 1000 bp of the first position in the current bin are merged into
														
 
															+/// that bin. Larger bins are sorted first so that high-density regions are fetched
														
 
															+/// before sparse ones, enabling earlier early-exit in callers.
														
 
															 fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
														
 
															-    // Sort positions by chromosome and position
														
 
															     let mut sorted_positions = positions.to_vec();
														
 
															     sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
														
 
															     sorted_positions.dedup();
														
 
															-    // Group by chromosome and create bins of ±1000 bp
														
 
															     let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
														
 
															     for (chr, pos) in sorted_positions {
														
 
															         let bins = grouped.entry(chr).or_default();
														
 
															         if let Some(last_bin) = bins.last_mut() {
														
 
															-            if last_bin.is_empty() || (pos - last_bin[0].1).abs() <= 1000 {
														
 
															+            if last_bin.is_empty() || pos - last_bin[0].1 <= 1000 {
														
 
															                 last_bin.push((chr, pos));
														
 
															             } else {
														
 
															                 bins.push(vec![(chr, pos)]);
														
@@ -127,44 +139,33 @@ fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i
 
															         }
														
 
															     }
														
 
															-    // Flatten and sort by bin size (descending)
														
 
															     let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values().flatten().cloned().collect();
														
 
															-
														
 
															-    // Sort bins by size (descending) to prioritize regions with more hits
														
 
															     flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
														
 
															-
														
 
															     flattened
														
 
															 }
														
 
															-/// Retrieves primary alignment records based on a set of input records.
														
 
															+/// Resolve supplementary alignments in a record set to their primary records.
														
 
															 ///
														
 
															-/// This function processes a collection of BAM records and retrieves their primary alignments.
														
 
															-/// When supplementary alignments are found (with SA tags), it fetches the corresponding
														
 
															-/// primary alignments from the provided BAM file.
														
 
															+/// Primary and secondary records are separated in a first pass. For supplementary
														
 
															+/// records with an SA tag, the SA positions are binned by chromosome and fetched in
														
 
															+/// bulk. Fetching stops as soon as all target query names have been found (early exit).
														
 
															+/// Records that cannot be resolved are silently dropped with a warning.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															 ///
														
 
															-/// * `bam` - A mutable reference to an IndexedReader for the BAM file
														
 
															-/// * `records` - A vector of input records to process
														
 
															+/// * `bam` - Indexed BAM reader used for random-access fetches
														
 
															+/// * `records` - Mixed set of primary, secondary, and supplementary records
														
 
															 ///
														
 
															 /// # Returns
														
 
															 ///
														
 
															-/// A vector of records containing both:
														
 
															-/// - Original primary alignments from the input
														
 
															-/// - Primary alignments fetched for any supplementary records in the input
														
 
															-///
														
 
															-/// # Examples
														
 
															-///
														
 
															-/// ```
														
 
															-/// use rust_htslib::bam::{IndexedReader, Record};
														
 
															-/// let mut bam = IndexedReader::from_path("sample.bam").unwrap();
														
 
															-/// let records = vec![/* some records */];
														
 
															-/// let primary_alignments = primary_records(&mut bam, records);
														
 
															-/// ```
														
 
															+/// A vector containing original non-supplementary records from the input plus any
														
 
															+/// primaries fetched for supplementary inputs. Secondary records in the input are
														
 
															+/// passed through unchanged; secondary records encountered during SA-position fetching
														
 
															+/// are skipped. Order is not guaranteed.
														
 
															 pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
														
 
															     let mut res = Vec::with_capacity(records.len());
														
 
															     let mut all_positions = Vec::new();
														
 
															-    let mut all_qnames_to_fetch = Vec::new();
														
 
															+    let mut all_qnames_to_fetch = HashSet::new();
														
 
															     // First pass: collect primary records and positions to fetch
														
 
															     for record in records.iter() {
														
@@ -176,7 +177,7 @@ pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Rec
 
															                 all_positions.extend(positions);
														
 
															                 match String::from_utf8(qname.to_vec()) {
														
 
															-                    Ok(qname_str) => all_qnames_to_fetch.push(qname_str),
														
 
															+                    Ok(qname_str) => { all_qnames_to_fetch.insert(qname_str); }
														
 
															                     Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
														
 
															                 }
														
 
															             }
														
@@ -250,6 +251,22 @@ pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Rec
 
															     res
														
 
															 }
														
 
															+/// Extract contig names and lengths from a BAM header.
														
 
															+///
														
 
															+/// Reads all `@SQ` lines and returns a map of sequence name (`SN`) to sequence
														
 
															+/// length (`LN`).
														
 
															+///
														
 
															+/// # Arguments
														
 
															+///
														
 
															+/// * `header` - BAM header to parse
														
 
															+///
														
 
															+/// # Returns
														
 
															+///
														
 
															+/// A map from contig name to length in base pairs.
														
 
															+///
														
 
															+/// # Errors
														
 
															+///
														
 
															+/// Returns an error if the `LN` field of any `@SQ` line cannot be parsed as `u64`.
														
 
															 pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
														
 
															     let mut genome = HashMap::new();
														
 
															     for (key, records) in header.to_hashmap() {
														
@@ -259,7 +276,7 @@ pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<Has
 
															                     record["SN"].to_string(),
														
 
															                     record["LN"]
														
 
															                         .parse::<u64>()
														
 
															-                        .expect("Failed parsing length of chromosomes"),
														
 
															+                        .with_context(|| format!("Failed to parse LN for contig {}", record["SN"]))?,
														
 
															                 );
														
 
															             }
														
 
															         }
														
@@ -267,8 +284,22 @@ pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<Has
 
															     Ok(genome)
														
 
															 }
														
 
															-/// Split the genome into N balanced region lists (`ctg:start-end` strings).
														
 
															-/// Each element of the outer Vec is the list of `-r` regions for one batch.
														
 
															+/// Partition a genome into `n` balanced batches of samtools-style region strings.
														
 
															+///
														
 
															+/// Contigs are sorted deterministically and sliced into segments so that each batch
														
 
															+/// covers approximately `total_bp / n` bases. A contig may be split across batches.
														
 
															+/// Region strings use 1-based inclusive coordinates (`ctg:start-end`) as expected
														
 
															+/// by samtools `-r`.
														
 
															+///
														
 
															+/// # Arguments
														
 
															+///
														
 
															+/// * `genome` - Map of contig name to length in bp, typically from [`get_genome_sizes`]
														
 
															+/// * `n` - Number of batches to produce
														
 
															+///
														
 
															+/// # Returns
														
 
															+///
														
 
															+/// A vector of `n` batches, each containing the region strings assigned to that batch.
														
 
															+/// Returns an empty vector if `n` is zero or `genome` is empty.
														
 
															 pub fn split_genome_into_n_regions(genome: &HashMap<String, u64>, n: usize) -> Vec<Vec<String>> {
														
 
															     if n == 0 || genome.is_empty() {
														
 
															         return Vec::new();
														
@@ -324,24 +355,39 @@ pub enum SegmentOrder {
 
															 }
														
 
															 /// Fold-back inversion event detected from a primary/supplementary alignment pair.
														
 
															-/// Thanks to: https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py
														
 
															 ///
														
 
															-/// A fold-back inversion occurs when a read has two alignments on the same chromosome
														
 
															-/// with opposite strands that overlap on the reference. This pattern suggests the
														
 
															-/// presence of an inverted duplication or fold-back structure.
														
 
															+/// A fold-back inversion occurs when a single read produces two alignments on the same
														
 
															+/// chromosome with opposite strand orientations. The read travels forward (alignment A),
														
 
															+/// then folds back and runs in reverse (alignment B), covering the same reference region
														
 
															+/// twice — producing a signature identical to an **inverted duplication**.
														
 
															+///
														
 
															+/// SA tag parsing is handled by [`parse_sa_entries`] / [`SaEntry`], which perform the
														
 
															+/// 1-based → 0-based coordinate conversion required by the SAM specification.
														
 
															 ///
														
 
															 /// # Coordinate System
														
 
															+///
														
 
															 /// - All coordinates are 0-based, half-open `[start, end)`
														
 
															-/// - Alignments A and B are ordered by their position on the read (A first)
														
 
															-/// - Breakpoint distances (`b_c`, `a_d`) use strand-aware coordinates
														
 
															+/// - Alignments A and B are ordered by their position on the read (A comes first)
														
 
															 ///
														
 
															 /// # Breakpoints
														
 
															-/// Breakpoints (a, b, c, d) are strand-aware positions representing alignment termini:
														
 
															-/// - For `+` strand: `a = start` (5'), `b = end` (3')
														
 
															-/// - For `-` strand: `c = start` (3'), `d = end` (5')
														
 
															+///
														
 
															+/// `(a, b, c, d)` are alignment termini in **read order** (5′→3′ along the read).
														
 
															+/// `b` is the **inner** (junction-proximal) breakpoint of A; `c` is the **inner**
														
 
															+/// breakpoint of B. `a` and `d` are the **outer** (junction-distal) endpoints.
														
 
															+///
														
 
															+/// For each strand, the 5′ end of the read maps to a different reference position:
														
 
															+///
														
 
															+/// | Strand | 5′ terminus | 3′ terminus |
														
 
															+/// |--------|-------------|-------------|
														
 
															+/// | `+`    | `start` (leftmost ref pos)  | `end` (rightmost ref pos)  |
														
 
															+/// | `−`    | `end`   (rightmost ref pos) | `start` (leftmost ref pos) |
														
 
															+///
														
 
															+/// When A comes first on the read: A's 3′ terminus (`b`) and B's 5′ terminus (`c`)
														
 
															+/// are the **inner** (junction-proximal) breakpoints. A's 5′ (`a`) and B's 3′ (`d`)
														
 
															+/// are the **outer** endpoints.
														
 
															 ///
														
 
															 /// ```text
														
 
															-/// Case: A on '+' strand, B on '-' strand
														
 
															+/// Case: A on '+' strand, B on '−' strand
														
 
															 ///
														
 
															 ///              a                        b
														
 
															 ///              |                        |
														
@@ -350,17 +396,39 @@ pub enum SegmentOrder {
 
															 ///           --------[<=============B======]-------
														
 
															 ///                   ^                     ^
														
 
															 ///                   |                     |
														
 
															-///                   c                     d
														
 
															+///                   d                     c
														
 
															 ///
														
 
															-/// Alignment A (+ strand):  a=aln_a_start, b=aln_a_end
														
 
															-/// Alignment B (- strand):  c=aln_b_start, d=aln_b_end
														
 
															+/// Alignment A (+ strand):  a = aln_a_start  (5' of A, outer)
														
 
															+///                          b = aln_a_end    (3' of A, inner — junction-proximal)
														
 
															+/// Alignment B (− strand):  c = aln_b_end    (5' of B in read order, inner — junction-proximal)
														
 
															+///                          d = aln_b_start  (3' of B in read order, outer)
														
 
															 ///
														
 
															 /// Distances:
														
 
															-///   aln_a_len = |a - b| = alignment A length
														
 
															-///   aln_b_len = |c - d| = alignment B length
														
 
															-///   b_c       = |b - c| = inner breakpoint distance (3' ends)
														
 
															-///   a_d       = |a - d| = outer breakpoint distance (5' ends)
														
 
															+///   aln_a_len = |a − b| = alignment A length on reference
														
 
															+///   aln_b_len = |c − d| = alignment B length on reference
														
 
															+///   b_c       = |b − c| = inner breakpoint distance (junction gap between A and B)
														
 
															+///   a_d       = |a − d| = outer span of the fold-back structure
														
 
															 /// ```
														
 
															+///
														
 
															+/// The region `[d, b]` (where `d < b` in the canonical case) is covered by both
														
 
															+/// alignments and constitutes the **inverted duplication**. For a perfect duplication
														
 
															+/// `b_c = 0` and `a_d = 0`. Events with `a_d < 150 bp` are typically classified as
														
 
															+/// **sequencing artefacts** (short hairpin/palindrome) rather than true structural variants.
														
 
															+///
														
 
															+/// # Divergence from reference implementation
														
 
															+///
														
 
															+/// The reference Python script requires the two alignments to **overlap** on the reference
														
 
															+/// (`overlap > 0 bp`). This implementation is more permissive: it also accepts a small
														
 
															+/// reference-space **gap** between the alignments (controlled by `max_gap` in
														
 
															+/// [`fb_inv_from_record`]). Set `max_gap = Some(0)` to match the strict Python behaviour,
														
 
															+/// which requires overlap because the artefact mechanism — ssDNA forming a hairpin and
														
 
															+/// re-translocating back through the nanopore — guarantees both alignments cover the
														
 
															+/// same physical DNA strand.
														
 
															+///
														
 
															+/// # References
														
 
															+///
														
 
															+/// Geometry, breakpoint labelling, and the `a_d < 150 bp` artefact threshold are adapted from:
														
 
															+/// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
														
 
															 #[derive(Debug)]
														
 
															 pub struct FbInv {
														
 
															     /// Read name (QNAME from BAM)
														
@@ -383,9 +451,9 @@ pub struct FbInv {
 
															     pub aln_a_len: i64,
														
 
															     /// Alignment B length on reference (bp)
														
 
															     pub aln_b_len: i64,
														
 
															-    /// Distance between breakpoints B and C (strand-aware)
														
 
															+    /// `|b - c|`: read-order junction gap — distance between the 3' end of A and the 5' end of B
														
 
															     pub b_c: i64,
														
 
															-    /// Distance between breakpoints A and D (strand-aware)
														
 
															+    /// `|a - d|`: read-order outer span — distance between the 5' end of A and the 3' end of B
														
 
															     pub a_d: i64,
														
 
															     /// Which alignment appears first on the read
														
 
															     pub first: SegmentOrder,
														
@@ -399,21 +467,18 @@ pub struct FbInv {
 
															     pub sa_qend: u32,
														
 
															 }
														
 
															-/// Parse a SAM CIGAR string into a vector of CIGAR operations.
														
 
															+/// Parse a SAM CIGAR string into a vector of [`Cigar`] operations.
														
 
															 ///
														
 
															-/// Supports all standard CIGAR operations: M, I, D, N, S, H, P, =, X
														
 
															+/// All standard CIGAR operations are supported: `M`, `I`, `D`, `N`, `S`, `H`, `P`, `=`, `X`.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															-/// * `s` - CIGAR string (e.g., "10S100M5I50M20S")
														
 
															+///
														
 
															+/// * `s` - CIGAR string, e.g. `"10S100M5I50M20S"`
														
 
															 ///
														
 
															 /// # Returns
														
 
															-/// `Some(Vec<Cigar>)` if valid, `None` if malformed or empty
														
 
															 ///
														
 
															-/// # Examples
														
 
															-/// ```
														
 
															-/// let ops = parse_cigar_str("10S100M").unwrap();
														
 
															-/// assert_eq!(ops.len(), 2);
														
 
															-/// ```
														
 
															+/// `Some(ops)` on success, `None` if the string is empty, contains an unknown operation
														
 
															+/// character, has an operator without a preceding length, or ends with trailing digits.
														
 
															 pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
														
 
															     if s.is_empty() {
														
 
															         return None;
														
@@ -465,17 +530,22 @@ pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
 
															     Some(ops)
														
 
															 }
														
 
															-/// Compute query coordinates (start, end) on the read from a CIGAR.
														
 
															+/// Compute `(query_begin, query_end)` on the read from CIGAR operations.
														
 
															 ///
														
 
															-/// For plus strand, the first CIGAR operation corresponds to the read start.
														
 
															-/// For minus strand, the last CIGAR operation corresponds to the read start.
														
 
															+/// For `+` strand the first CIGAR op is at the read start; for `-` strand the last op
														
 
															+/// is at the read start (CIGAR is encoded in reference order regardless of strand).
														
 
															+/// Hard-clipped bases are excluded; soft-clip length sets `query_begin`.
														
 
															+/// Only query-consuming operations (`M`, `I`, `=`, `X`) contribute to the alignment length.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															-/// * `ops` - CIGAR operations slice
														
 
															-/// * `strand` - '+' or '-'
														
 
															+///
														
 
															+/// * `ops` - Slice of CIGAR operations
														
 
															+/// * `strand` - `'+'` or `'-'`
														
 
															 ///
														
 
															 /// # Returns
														
 
															-/// Tuple of (query_begin, query_end) in read coordinates (excluding hard clips)
														
 
															+///
														
 
															+/// `(query_begin, query_end)` as read-coordinate offsets (0-based, hard clips excluded).
														
 
															+/// Returns `(0, 0)` for an empty slice.
														
 
															 fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
														
 
															     if ops.is_empty() {
														
 
															         return (0, 0);
														
@@ -505,15 +575,18 @@ fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
 
															     (query_beg, query_beg + aln_len)
														
 
															 }
														
 
															-/// Compute alignment length on the reference from CIGAR operations.
														
 
															+/// Compute the reference-consuming length of a CIGAR.
														
 
															 ///
														
 
															-/// Counts bases consumed by M, D, N, =, X operations.
														
 
															+/// Sums lengths of operations that advance the reference coordinate:
														
 
															+/// `M`, `D`, `N`, `=`, `X`. Insertions, soft clips, hard clips, and pads are excluded.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															-/// * `ops` - CIGAR operations slice
														
 
															+///
														
 
															+/// * `ops` - Slice of CIGAR operations
														
 
															 ///
														
 
															 /// # Returns
														
 
															-/// Reference-consuming length in base pairs
														
 
															+///
														
 
															+/// Reference-consuming length in base pairs. Returns `0` for an empty slice.
														
 
															 fn alignment_ref_length(ops: &[Cigar]) -> u32 {
														
 
															     ops.iter()
														
 
															         .filter_map(|op| match op {
														
@@ -527,32 +600,40 @@ fn alignment_ref_length(ops: &[Cigar]) -> u32 {
 
															         .sum()
														
 
															 }
														
 
															-/// Compute gap between two intervals. Negative value means overlap.
														
 
															+/// Compute the gap in base pairs between two half-open intervals.
														
 
															+///
														
 
															+/// Both intervals are `[start, end)`. A negative return value indicates that the
														
 
															+/// intervals overlap by that many bases; zero means they are adjacent; positive means
														
 
															+/// they are separated.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															+///
														
 
															 /// * `a_start`, `a_end` - First interval `[a_start, a_end)`
														
 
															 /// * `b_start`, `b_end` - Second interval `[b_start, b_end)`
														
 
															 ///
														
 
															 /// # Returns
														
 
															-/// Gap in bp (negative if overlapping, 0 if adjacent, positive if separated)
														
 
															+///
														
 
															+/// `max(starts) - min(ends)`: negative for overlap, 0 for adjacency, positive for gap.
														
 
															 #[inline]
														
 
															 fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 {
														
 
															     a_start.max(b_start) - a_end.min(b_end)
														
 
															 }
														
 
															-/// Compute strand-aware breakpoint coordinates.
														
 
															+/// Return `(5'_terminus, 3'_terminus)` in read order for a single alignment.
														
 
															 ///
														
 
															-/// For fold-back inversions, breakpoints depend on strand orientation:
														
 
															-/// - Plus strand: breakpoints are (start, end) - 5' to 3' direction
														
 
															-/// - Minus strand: breakpoints are (end, start) - reversed direction
														
 
															+/// For `+` strand the 5' end maps to the leftmost reference position (`start`) and the
														
 
															+/// 3' end to the rightmost (`end`). For `-` strand the read runs right-to-left on the
														
 
															+/// reference, so 5' = `end` and 3' = `start`.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															-/// * `start` - Alignment start (0-based)
														
 
															-/// * `end` - Alignment end (exclusive)
														
 
															-/// * `strand` - '+' or '-'
														
 
															+///
														
 
															+/// * `start` - Alignment start, 0-based inclusive
														
 
															+/// * `end` - Alignment end, 0-based exclusive
														
 
															+/// * `strand` - `'+'` or `'-'`
														
 
															 ///
														
 
															 /// # Returns
														
 
															-/// Tuple of (breakpoint_a, breakpoint_b) in strand-aware order
														
 
															+///
														
 
															+/// `(5'_coord, 3'_coord)` where both are 0-based reference positions.
														
 
															 #[inline]
														
 
															 fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
														
 
															     if strand == '+' {
														
@@ -562,35 +643,46 @@ fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
 
															     }
														
 
															 }
														
 
															-/// Extract a fold-back inversion event from a BAM record.
														
 
															+/// Attempt to extract a fold-back inversion event from a primary BAM record.
														
 
															 ///
														
 
															-/// Analyzes a primary alignment and its supplementary alignment (from SA tag)
														
 
															-/// to detect fold-back inversion patterns characterized by:
														
 
															-/// - Same chromosome
														
 
															-/// - Opposite strands
														
 
															-/// - Reference coordinates within proximity threshold
														
 
															-/// - Exactly one supplementary alignment
														
 
															+/// Inspects the record's SA tag using [`parse_sa_entries`] and checks whether the single
														
 
															+/// supplementary alignment forms a fold-back inversion with the primary: same chromosome,
														
 
															+/// opposite strand, and reference coordinates within the specified proximity thresholds.
														
 
															+/// Returns a populated [`FbInv`] with all breakpoint metrics on success.
														
 
															+///
														
 
															+/// Alignments A and B in the returned struct are ordered by their position on the read
														
 
															+/// (A comes first). All coordinates are 0-based half-open; the 1-based SA tag `pos` is
														
 
															+/// converted automatically by [`SaEntry`].
														
 
															+///
														
 
															+/// Recommended thresholds for **artefact detection**: `max_overlap = Some(150)`,
														
 
															+/// `max_gap = Some(0)`. Requiring overlap (`max_gap = Some(0)`) matches the reference
														
 
															+/// Python implementation and avoids capturing true SV junctions that happen to have
														
 
															+/// opposite-strand supplementary alignments with a reference-space gap.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															-/// * `record` - BAM record (must be primary alignment)
														
 
															-/// * `header` - BAM header for reference name lookup
														
 
															-/// * `min_mapq` - Minimum mapping quality threshold for both alignments
														
 
															-/// * `max_overlap` - Maximum allowed overlap in bp (None = no limit)
														
 
															-/// * `max_gap` - Maximum allowed gap in bp (None = no limit, 0 = require overlap or adjacent)
														
 
															-///   default should be [-150, 200) Some(150), Some(200)
														
 
															+///
														
 
															+/// * `record` - BAM record to evaluate; must be a primary alignment
														
 
															+/// * `header` - BAM header used to resolve the reference name from `tid`
														
 
															+/// * `min_mapq` - Minimum mapping quality required for both the primary and the supplementary
														
 
															+/// * `max_overlap` - Maximum allowed reference-space overlap in bp (`None` = no limit)
														
 
															+/// * `max_gap` - Maximum allowed reference-space gap in bp (`None` = no limit)
														
 
															 ///
														
 
															 /// # Returns
														
 
															-/// `Some(FbInv)` if a valid fold-back pattern is detected, `None` otherwise
														
 
															 ///
														
 
															-/// # Filtering
														
 
															-/// Returns `None` if any of these conditions are met:
														
 
															+/// `Some(FbInv)` if all criteria are met, `None` otherwise.
														
 
															+///
														
 
															+/// Returns `None` when any of the following conditions apply:
														
 
															 /// - Record is unmapped, secondary, or supplementary
														
 
															-/// - MAPQ below threshold (primary or supplementary)
														
 
															-/// - No SA tag or multiple supplementary alignments
														
 
															-/// - Supplementary on different chromosome
														
 
															-/// - Same strand orientation
														
 
															-/// - Overlap exceeds `max_overlap` (if set)
														
 
															-/// - Gap exceeds `max_gap` (if set)
														
 
															+/// - Primary or supplementary MAPQ is below `min_mapq`
														
 
															+/// - SA tag is absent or contains more than one entry
														
 
															+/// - Supplementary aligns to a different chromosome
														
 
															+/// - Both alignments are on the same strand
														
 
															+/// - Reference-space overlap exceeds `max_overlap`
														
 
															+/// - Reference-space gap exceeds `max_gap`
														
 
															+///
														
 
															+/// # References
														
 
															+///
														
 
															+/// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
														
 
															 pub fn fb_inv_from_record(
														
 
															     record: &bam::Record,
														
 
															     header: &bam::HeaderView,
														
@@ -607,32 +699,19 @@ pub fn fb_inv_from_record(
 
															         return None;
														
 
															     }
														
 
															-    // SA tag required
														
 
															+    // SA tag required; exactly one supplementary alignment expected
														
 
															     let sa_str = match record.aux(b"SA") {
														
 
															         Ok(Aux::String(s)) => s,
														
 
															         _ => return None,
														
 
															     };
														
 
															-    let sa_entries: Vec<&str> = sa_str.split(';').filter(|s| !s.is_empty()).collect();
														
 
															-
														
 
															-    // Require exactly one SA (like the Python code)
														
 
															+    let sa_entries = parse_sa_entries(sa_str);
														
 
															     if sa_entries.len() != 1 {
														
 
															         return None;
														
 
															     }
														
 
															-    let sa = sa_entries[0];
														
 
															-
														
 
															-    // SA format: chr,pos,strand,cigar,mapq,nm
														
 
															-    let fields: Vec<&str> = sa.split(',').collect();
														
 
															-    if fields.len() < 6 {
														
 
															-        return None;
														
 
															-    }
														
 
															-    let sa_rname = fields[0];
														
 
															-    let sa_pos_1based: i64 = fields[1].parse().ok()?; // SAM is 1-based
														
 
															-    let sa_strand_char: char = fields[2].chars().next()?;
														
 
															-    let sa_cigar_str = fields[3];
														
 
															-    let sa_mapq: u8 = fields[4].parse().ok()?;
														
 
															+    let sa = &sa_entries[0];
														
 
															-    if sa_mapq < min_mapq {
														
 
															+    if sa.mapq < min_mapq {
														
 
															         return None;
														
 
															     }
														
@@ -642,13 +721,13 @@ pub fn fb_inv_from_record(
 
															         return None;
														
 
															     }
														
 
															     let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?;
														
 
															-    if sa_rname != primary_rname {
														
 
															+    if sa.chr != primary_rname {
														
 
															         return None;
														
 
															     }
														
 
															     // Strand check: need opposite strands for fold-back inversion
														
 
															     let primary_strand_char = if record.is_reverse() { '-' } else { '+' };
														
 
															-    if primary_strand_char == sa_strand_char {
														
 
															+    if primary_strand_char == sa.strand {
														
 
															         return None;
														
 
															     }
														
@@ -656,17 +735,14 @@ pub fn fb_inv_from_record(
 
															     let primary_start = record.pos();
														
 
															     let primary_end = record.cigar().end_pos();
														
 
															-    // Parse SA CIGAR
														
 
															-    let sa_cigar_ops = parse_cigar_str(sa_cigar_str)?;
														
 
															+    // Parse SA CIGAR; pos is already 0-based from SaEntry
														
 
															+    let sa_cigar_ops = parse_cigar_str(sa.cigar)?;
														
 
															     let sa_ref_len = alignment_ref_length(&sa_cigar_ops);
														
 
															-
														
 
															-    // Validation: SA must have non-zero reference length
														
 
															     if sa_ref_len == 0 {
														
 
															         return None;
														
 
															     }
														
 
															-    // SA ref coords: convert POS from 1-based to 0-based
														
 
															-    let sa_start = sa_pos_1based - 1;
														
 
															+    let sa_start = sa.pos as i64;
														
 
															     let sa_end = sa_start + sa_ref_len as i64;
														
 
															     // Check proximity between alignments
														
@@ -692,7 +768,7 @@ pub fn fb_inv_from_record(
 
															     // Query coords for primary and SA
														
 
															     let primary_cigar: Vec<Cigar> = record.cigar().iter().cloned().collect();
														
 
															     let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char);
														
 
															-    let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa_strand_char);
														
 
															+    let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa.strand);
														
 
															     // Decide which segment is first on the read
														
 
															     let first = if primary_qbeg <= sa_qbeg {
														
@@ -710,13 +786,13 @@ pub fn fb_inv_from_record(
 
															                 primary_strand_char,
														
 
															                 sa_start,
														
 
															                 sa_end,
														
 
															-                sa_strand_char,
														
 
															+                sa.strand,
														
 
															             )
														
 
															         } else {
														
 
															             (
														
 
															                 sa_start,
														
 
															                 sa_end,
														
 
															-                sa_strand_char,
														
 
															+                sa.strand,
														
 
															                 primary_start,
														
 
															                 primary_end,
														
 
															                 primary_strand_char,
														
@@ -758,40 +834,27 @@ pub fn fb_inv_from_record(
 
															     })
														
 
															 }
														
 
															-// pub fn read_sm_tag(bam_path: &str) -> anyhow::Result<String> {
														
 
															-//     let reader = bam::Reader::from_path(bam_path)
														
 
															-//         .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
														
 
															-//     let header = bam::Header::from_template(reader.header());
														
 
															-//     let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
														
 
															-//
														
 
															-//     for line in header_text.lines() {
														
 
															-//         if line.starts_with("@RG") {
														
 
															-//             for field in line.split('\t') {
														
 
															-//                 if let Some(sm) = field.strip_prefix("SM:") {
														
 
															-//                     return Ok(sm.to_string());
														
 
															-//                 }
														
 
															-//             }
														
 
															-//         }
														
 
															-//     }
														
 
															-//     anyhow::bail!("No SM tag found in @RG header of {bam_path}")
														
 
															-// }
														
 
															-
														
 
															-// fn ensure_bam_sm_tag(id: &str, config: &Config) -> anyhow::Result<()> {
														
 
															-//     for bam in [config.normal_bam(id), config.tumoral_bam(id)] {
														
 
															-//         if read_sm_tag(&bam).is_err() {
														
 
															-//             let sample = Path::new(&bam)
														
 
															-//                 .file_stem()
														
 
															-//                 .unwrap_or_default()
														
 
															-//                 .to_string_lossy()
														
 
															-//                 .to_string();
														
 
															-//             info!("Injecting missing @RG SM:{sample} into {bam}");
														
 
															-//             SamtoolsReheader::from_config(config, &bam, &sample).run()?;
														
 
															-//             SamtoolsIndex::from_config(config, &bam).run()?;
														
 
															-//         }
														
 
															-//     }
														
 
															-//     Ok(())
														
 
															-// }
														
 
															-
														
 
															+/// Read the `SM` tag from a BAM's `@RG` lines, injecting it if absent or inconsistent.
														
 
															+///
														
 
															+/// Inspects every `@RG` line in the header. If all lines carry an identical `SM` value,
														
 
															+/// that value is returned. If any line is missing `SM`, or if multiple distinct `SM`
														
 
															+/// values are present, `samtools reheader` is invoked to set `SM` uniformly to
														
 
															+/// `fallback_sample`. The file modification time is preserved after reheadering.
														
 
															+///
														
 
															+/// # Arguments
														
 
															+///
														
 
															+/// * `bam_path` - Path to the BAM file (must be writable if reheadering is needed)
														
 
															+/// * `fallback_sample` - Sample name to inject when the header is missing or ambiguous
														
 
															+/// * `config` - Pipeline configuration used to invoke [`SamtoolsReheader`]
														
 
															+///
														
 
															+/// # Returns
														
 
															+///
														
 
															+/// The `SM` value present in (or injected into) the header.
														
 
															+///
														
 
															+/// # Errors
														
 
															+///
														
 
															+/// Returns an error if the BAM cannot be opened, if reheadering fails, or if the
														
 
															+/// header contains no `@RG` lines at all.
														
 
															 pub fn read_sm_tag_or_inject(
														
 
															     bam_path: &str,
														
 
															     fallback_sample: &str,
														
@@ -800,7 +863,7 @@ pub fn read_sm_tag_or_inject(
 
															     let reader = bam::Reader::from_path(bam_path)
														
 
															         .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
														
 
															     let header = bam::Header::from_template(reader.header());
														
 
															-    let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
														
 
															+    let header_text = String::from_utf8_lossy(&header.to_bytes()).into_owned();
														
 
															     let mut sm_values: Vec<String> = Vec::new();
														
 
															     let mut all_have_sm = true;
														
@@ -817,10 +880,8 @@ pub fn read_sm_tag_or_inject(
 
															         }
														
 
															     }
														
 
															-    // Determine whether reheadering is needed:
														
 
															-    // 1. Any @RG missing SM
														
 
															-    // 2. SM values are not all identical
														
 
															-    // 3. No @RG lines at all
														
 
															+    // Reheader if any @RG lacks SM or if multiple distinct SM values exist.
														
 
															+    // No @RG lines at all falls through to the error at the end.
														
 
															     let unique_sm: std::collections::HashSet<&str> = sm_values.iter().map(|s| s.as_str()).collect();
														
 
															     let needs_reheader = !all_have_sm || unique_sm.len() > 1;
														
@@ -856,29 +917,31 @@ pub fn read_sm_tag_or_inject(
 
															         .context(format!("No @RG lines found in {bam_path}"))
														
 
															 }
														
 
															-/// Fetch primary records from a BAM file, with optional region and qname filtering.
														
 
															+/// Fetch primary alignment records from an indexed BAM, with optional region and name filtering.
														
 
															+///
														
 
															+/// Supplementary alignments within the fetch region are resolved to their primaries via
														
 
															+/// [`primary_records`]. Duplicates introduced by that resolution step are removed by
														
 
															+/// query name. Secondary alignments are always excluded.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															-/// * `bam_path`     - Path to indexed BAM file
														
 
															-/// * `region`       - Optional region to fetch (chr, start, end). None = whole file.
														
 
															-/// * `qname_filter` - Optional qname filter. None = no filter.
														
 
															 ///
														
 
															-/// # Notes
														
 
															-/// - Supplementary alignments are resolved to their primary via SA tag using `primary_records`
														
 
															-/// - White/blacklist are mutually exclusive — passing both is a logic error and will panic in debug
														
 
															+/// * `bam_path` - Path to an indexed BAM file (`.bai` index must exist)
														
 
															+/// * `region` - If `Some((chr, start, end))`, restrict the fetch to that 0-based half-open
														
 
															+///   interval; if `None`, the whole file is scanned
														
 
															+/// * `qname_filter` - Optional [`QnameFilter`] to retain or exclude specific read names
														
 
															+///
														
 
															+/// # Returns
														
 
															+///
														
 
															+/// A deduplicated vector of primary records, optionally filtered by query name.
														
 
															+///
														
 
															+/// # Errors
														
 
															+///
														
 
															+/// Returns an error if the BAM cannot be opened or if the fetch operation fails.
														
 
															 pub fn fetch_primary_records(
														
 
															     bam_path: &Path,
														
 
															     region: Option<(&str, i64, i64)>,
														
 
															     qname_filter: Option<QnameFilter>,
														
 
															 ) -> anyhow::Result<Vec<bam::Record>> {
														
 
															-    debug_assert!(
														
 
															-        !matches!(
														
 
															-            &qname_filter,
														
 
															-            Some(QnameFilter::Whitelist(_)) | Some(QnameFilter::Blacklist(_))
														
 
															-        ) || qname_filter.is_some(),
														
 
															-        "Cannot pass both whitelist and blacklist"
														
 
															-    );
														
 
															-
														
 
															     let mut reader = bam::IndexedReader::from_path(bam_path)
														
 
															         .with_context(|| format!("Cannot open BAM: {}", bam_path.display()))?;
														
@@ -924,66 +987,31 @@ pub fn fetch_primary_records(
 
															     Ok(records)
														
 
															 }
														
 
															+/// Query-name filter for [`fetch_primary_records`].
														
 
															 pub enum QnameFilter {
														
 
															+    /// Retain only records whose query name is in the set.
														
 
															     Whitelist(HashSet<Vec<u8>>),
														
 
															+    /// Exclude records whose query name is in the set.
														
 
															     Blacklist(HashSet<Vec<u8>>),
														
 
															 }
														
 
															-/// Convert an input BAM/CRAM file into a BED file containing one interval per
														
 
															-/// aligned record.
														
 
															+/// Convert a BAM/CRAM file to a BED4 file with one interval per aligned record.
														
 
															 ///
														
 
															-/// Each output line corresponds to a single aligned read and has the form:
														
 
															-///
														
 
															-/// ```text
														
 
															-/// chrom    start    end    read_name
														
 
															-/// ```
														
 
															-///
														
 
															-/// # Behavior
														
 
															-///
														
 
															-/// - Unmapped records are skipped.
														
 
															-/// - Chromosome names are taken from the BAM header.
														
 
															-/// - Coordinates follow BED conventions:
														
 
															-///   - `start` is **0-based inclusive**
														
 
															-///   - `end` is **0-based exclusive**
														
 
															-/// - The interval `[start, end)` is computed using:
														
 
															-///   - [`bam::Record::pos`] for the start
														
 
															-///   - [`bam::Record::reference_end`] for the end
														
 
															-/// - The end position accounts for all CIGAR operations consuming reference
														
 
															-///   (`M`, `D`, `N`, `=`, `X`).
														
 
															+/// Each output line has the form `chrom\tstart\tend\tread_name`. Coordinates are
														
 
															+/// 0-based half-open (`start` from [`Record::pos`], `end` from [`Record::reference_end`]),
														
 
															+/// consistent with BED convention. Unmapped records and records with a negative `tid`
														
 
															+/// are skipped. Secondary and supplementary alignments are **not** filtered — add
														
 
															+/// upstream filtering if stricter selection is needed.
														
 
															 ///
														
 
															 /// # Arguments
														
 
															 ///
														
 
															-/// * `input_bam` - Path to input BAM or CRAM file
														
 
															-/// * `output_bed` - Path to output BED file
														
 
															+/// * `input_bam` - Path to the input BAM or CRAM file
														
 
															+/// * `output_bed` - Path to the output BED file (created or overwritten)
														
 
															 ///
														
 
															 /// # Errors
														
 
															 ///
														
 
															-/// Returns an error if:
														
 
															-/// - The input file cannot be opened or read
														
 
															-/// - The output file cannot be created or written
														
 
															-/// - UTF-8 conversion of reference names or read names fails
														
 
															-///
														
 
															-/// # Example
														
 
															-///
														
 
															-/// ```no_run
														
 
															-/// use std::path::Path;
														
 
															-///
														
 
															-/// fn main() -> anyhow::Result<()> {
														
 
															-///     bam_to_aligned_bed("input.bam", "output.bed")?;
														
 
															-///     Ok(())
														
 
															-/// }
														
 
															-/// ```
														
 
															-///
														
 
															-/// # Notes
														
 
															-///
														
 
															-/// - Secondary, supplementary, and duplicate reads are **not filtered**.
														
 
															-///   Add additional flags if stricter filtering is required.
														
 
															-/// - For paired-end reads, each alignment record is emitted independently.
														
 
															-///
														
 
															-/// # See also
														
 
															-///
														
 
															-/// - [`rust_htslib::bam::Reader`]
														
 
															-/// - [`rust_htslib::bam::Record`]
														
 
															+/// Returns an error if the input cannot be read, the output cannot be written, or
														
 
															+/// if UTF-8 conversion of a reference name or query name fails.
														
 
															 pub fn bam_to_aligned_bed<P: AsRef<Path>>(
														
 
															     input_bam: P,
														
 
															     output_bed: P,
														
--- a/src/scan/bin.rs
+++ b/src/scan/bin.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;
 
															 use anyhow::Context;
														
 
															 use csv::ByteRecord;
														
 
															-use log::error;
														
 
															+use log::{error, warn};
														
 
															 use rust_htslib::bam::{HeaderView, IndexedReader, Read, Record, ext::BamRecordExtensions, record::Aux};
														
 
															 use crate::io::{bam::{fb_inv_from_record, primary_record, primary_records}, tsv::{parse_csv_u32_into, parse_u32}};
														
@@ -149,7 +149,11 @@ impl Bin {
 
															         self.reads_store
														
 
															             .values()
														
 
															             .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
														
 
															-            .map(|record| primary_record(bam_reader, record.clone()))
														
 
															+            .filter_map(|record| {
														
 
															+                primary_record(bam_reader, record.clone())
														
 
															+                    .map_err(|e| warn!("primary_record failed: {e}"))
														
 
															+                    .ok()
														
 
															+            })
														
 
															             .collect()
														
 
															     }
														
@@ -301,7 +305,11 @@ impl Bin {
 
															         self.reads_store
														
 
															             .values()
														
 
															             .filter(|record| record.reference_start() as u32 == pos)
														
 
															-            .map(|record| primary_record(bam_reader, record.clone()))
														
 
															+            .filter_map(|record| {
														
 
															+                primary_record(bam_reader, record.clone())
														
 
															+                    .map_err(|e| warn!("primary_record failed: {e}"))
														
 
															+                    .ok()
														
 
															+            })
														
 
															             .collect()
														
 
															     }
														
@@ -318,7 +326,11 @@ impl Bin {
 
															         self.reads_store
														
 
															             .values()
														
 
															             .filter(|record| record.reference_end() as u32 == pos)
														
 
															-            .map(|record| primary_record(bam_reader, record.clone()))
														
 
															+            .filter_map(|record| {
														
 
															+                primary_record(bam_reader, record.clone())
														
 
															+                    .map_err(|e| warn!("primary_record failed: {e}"))
														
 
															+                    .ok()
														
 
															+            })
														
 
															             .collect()
														
 
															     }