2 tháng trước cách đây · bd93786500
--- a/memo_fb_inv.md
+++ b/memo_fb_inv.md
@@ -0,0 +1,194 @@
 
				+# Fold-Back Inversion — Geometry, Math, and Implementation Notes
			
 
				+
			
 
				+## Reference
			
 
				+
			
 
				+Python reference implementation:
			
 
				+https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py
			
 
				+
			
 
				+Rust implementation: `src/io/bam.rs` — `fb_inv_from_record()`, `FbInv` struct, `breakpoints()`.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## What Is a Fold-Back Inversion?
			
 
				+
			
 
				+A fold-back inversion (fb-inv) occurs when a sequencing read aligns to the reference
			
 
				+twice on the **same chromosome** with **opposite strands**. The read goes forward along
			
 
				+the reference, then "folds back" and runs in reverse over the same (or overlapping) region.
			
 
				+
			
 
				+This creates a signal that looks like an **inverted duplication**: the read covers a
			
 
				+stretch of reference sequence twice — once in the forward direction, once in reverse.
			
 
				+This is a known artefact in ONT long-read data at sites of DNA damage or structural
			
 
				+variation (double-strand breaks, inverted repeats).
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Geometry
			
 
				+
			
 
				+### General case (fold with offset `S`)
			
 
				+
			
 
				+```
			
 
				+Reference position:
			
 
				+    a        d         b         c
			
 
				+    |        |         |         |
			
 
				+    v        v         v         v
			
 
				+
			
 
				+    [=============A===============>]          A: primary, + strand
			
 
				+             [<=================B====]        B: supplementary, − strand
			
 
				+
			
 
				+    |←— A length —————————————————→|
			
 
				+             |←——— B length ————————→|
			
 
				+             |←overlap→|
			
 
				+    |←— a_d ——→|                             outer offset  = |a − d|
			
 
				+                         |←— b_c ——→|        junction gap  = |b − c|
			
 
				+```
			
 
				+
			
 
				+### Canonical perfect inverted duplication (`S = 0`)
			
 
				+
			
 
				+When B covers exactly the same region as A:
			
 
				+
			
 
				+```
			
 
				+    a                             b
			
 
				+    |                             |
			
 
				+    v                             v
			
 
				+    [=============A===============>]
			
 
				+    [<==============B==============]
			
 
				+    ^                             ^
			
 
				+    d                             c
			
 
				+
			
 
				+    b_c = |b − c| = 0    (no junction gap)
			
 
				+    a_d = |a − d| = 0    (perfectly symmetric)
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Breakpoint Convention
			
 
				+
			
 
				+Breakpoints `(a, b)` and `(c, d)` are defined in **read order** (5′→3′ along the read),
			
 
				+not in reference order. This is critical:
			
 
				+
			
 
				+| Strand | 5′ terminus (read-first) | 3′ terminus (read-last) |
			
 
				+|--------|--------------------------|--------------------------|
			
 
				+| `+`    | leftmost ref pos = `start`  | rightmost ref pos = `end` |
			
 
				+| `−`    | rightmost ref pos = `end`   | leftmost ref pos = `start` |
			
 
				+
			
 
				+So `breakpoints(start, end, strand)` always returns `(5′_coord, 3′_coord)`:
			
 
				+
			
 
				+```
			
 
				++ strand: (start, end)   →  a = start,  b = end
			
 
				+− strand: (end,   start) →  c = end,    d = start
			
 
				+```
			
 
				+
			
 
				+Alignments A and B are ordered by their position **on the read** (A comes first). The
			
 
				+canonical fold-back case is A on `+` strand (forward part of read), B on `−` strand
			
 
				+(reverse part of read).
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Derived Metrics
			
 
				+
			
 
				+```
			
 
				+aln_a_len = |a − b|   alignment A length on reference
			
 
				+aln_b_len = |c − d|   alignment B length on reference
			
 
				+b_c       = |b − c|   read-order junction gap  (3′ end of A → 5′ end of B)
			
 
				+a_d       = |a − d|   read-order outer span    (5′ end of A → 3′ end of B)
			
 
				+```
			
 
				+
			
 
				+### Interpretation with offset `S`
			
 
				+
			
 
				+For an inverted duplication shifted by `S` bp:
			
 
				+
			
 
				+```
			
 
				+A: [X,     X+L)  on +
			
 
				+B: [X+S,   X+L+S) on −
			
 
				+
			
 
				+a = X,     b = X+L
			
 
				+c = X+L+S, d = X+S
			
 
				+
			
 
				+b_c = |(X+L) − (X+L+S)| = S
			
 
				+a_d = |X − (X+S)|        = S
			
 
				+```
			
 
				+
			
 
				+Both metrics equal `S`, the size of the non-duplicated tail on each side.
			
 
				+The **duplicated region** is `[d, b) = [X+S, X+L)`, of length `L − S` (for `0 ≤ S < L`).
			
 
				+
			
 
				+For `S = 0` (perfect duplication): `b_c = 0`, `a_d = 0`, duplicated region = all of A.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Reference-Space Overlap / Gap (Filtering)
			
 
				+
			
 
				+Before computing breakpoints, a separate metric `interval_gap` is used to pre-filter:
			
 
				+
			
 
				+```
			
 
				+interval_gap = max(A_start, B_start) − min(A_end, B_end)
			
 
				+```
			
 
				+
			
 
				+- Negative → the two alignments **overlap** on the reference by `|gap|` bp (typical fb-inv)
			
 
				+- Zero → adjacent
			
 
				+- Positive → separated by a gap
			
 
				+
			
 
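				+Note: `interval_gap` is a strand-agnostic measure of the two reference intervals. `b_c`/`a_d` are also distances in reference coordinates, but between termini selected by read order (strand-aware).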
			
 
				+They capture complementary aspects of the fold geometry.
			
 
				+
			
 
				+### Recommended thresholds for artefact detection
			
 
				+
			
 
				+```
			
 
				+max_overlap = Some(150)   allow up to 150 bp overlap
			
 
				+max_gap     = Some(0)     require overlap — no gap allowed
			
 
				+```
			
 
				+
			
 
				+**Why `max_gap = Some(0)` for artefacts:** in Nanopore sequencing, a fold-back artefact
			
 
				+arises when the single-stranded DNA molecule forms a **hairpin** and re-translocates
			
 
				+back through the pore in the reverse direction. Because the same physical DNA strand is
			
 
				+read twice — once forward, once in reverse — the two alignments **must overlap** on the
			
 
				+reference. A positive reference-space gap between the alignments is inconsistent with
			
 
				+this mechanism and instead points to a true SV junction with unaligned breakpoint
			
 
				+sequence. Allowing a gap (e.g. `max_gap = Some(200)`) risks capturing genuine structural
			
 
				+variants and inflating the artefact count.
			
 
				+
			
 
				+The reference Python implementation requires strict overlap (`overlap > 0 bp`) for
			
 
				+exactly this reason.
			
 
				+
			
 
				+### Artefact vs. true SV classification
			
 
				+
			
 
				+Events passing the overlap filter are further classified by `a_d`:
			
 
				+
			
 
				+```
			
 
				+a_d < 150 bp  →  likely sequencing artefact (short hairpin / palindrome)
			
 
				+a_d ≥ 150 bp  →  candidate true structural variant
			
 
				+```
			
 
				+
			
 
				+This threshold comes directly from the reference Python implementation.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Diagram Label Summary
			
 
				+
			
 
				+```
			
 
				+Reference:---[=====A=================>]---------
			
 
				+          --------[<=============B======]-------
			
 
				+             ^    ^              ^    ^
			
 
				+             a    d              b    c
			
 
				+
			
 
				+a = 5′ of A (leftmost of A,  read-first)    = aln_a_start
			
 
				+b = 3′ of A (rightmost of A, read-last)     = aln_a_end
			
 
				+c = 5′ of B (rightmost of B, read-first)    = aln_b_end
			
 
				+d = 3′ of B (leftmost of B,  read-last)     = aln_b_start
			
 
				+```
			
 
				+
			
 
				+The region `[d, b]` (where d < b) is the **inverted duplication** — covered forward by A
			
 
				+and backward by B.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Filtering Logic Summary
			
 
				+
			
 
				+```
			
 
				+1. Record must be primary, mapped, MAPQ ≥ min_mapq
			
 
				+2. Exactly one SA tag entry, same chromosome, MAPQ ≥ min_mapq
			
 
				+3. Opposite strands (one +, one −)
			
 
				+4. |interval_gap| ≤ max_overlap  (if overlap)
			
 
				+5. interval_gap   ≤ max_gap      (if gap)
			
 
				+```
			
 
				+
			
 
				+Conditions 1–3 must always hold, together with whichever of conditions 4 and 5 applies to the sign of `interval_gap`, for a record to be classified as a fold-back inversion.
			
--- a/src/de_novo/de_novo_pipe.rs
+++ b/src/de_novo/de_novo_pipe.rs
@@ -280,14 +280,11 @@ pub fn run_local_assembly_iterative(
 
				                 })
			
 
				             })
			
 
				             .filter_map(|supp_rec| {
			
 
				-                // Resolve primary from original BAM using SA tag
			
 
				-                let resolved = primary_record(&mut bam_reader, supp_rec);
			
 
				+                let resolved = primary_record(&mut bam_reader, supp_rec)
			
 
				+                    .map_err(|e| log::warn!("primary_record failed: {e}"))
			
 
				+                    .ok()?;
			
 
				                 let qname = resolved.qname().to_vec();
			
 
				-                if !seen.contains(&qname) {
			
 
				-                    Some(resolved)
			
 
				-                } else {
			
 
				-                    None
			
 
				-                }
			
 
				+                if !seen.contains(&qname) { Some(resolved) } else { None }
			
 
				             })
			
 
				             .collect();
			
 
				 
			
--- a/src/io/bam.rs
+++ b/src/io/bam.rs
@@ -11,113 +11,125 @@ use rust_htslib::bam::{
 
				 
			
 
				 use crate::{commands::samtools::SamtoolsReheader, config::Config, runners::Run};
			
 
				 
			
 
				-/// Parses an SA tag string and extracts chromosome and position information.
			
 
				+/// A single entry from a BAM SA (supplementary alignment) tag.
			
 
				+///
			
 
				+/// SA tag format per the SAM spec: `rname,pos,strand,CIGAR,mapQ,NM` (semicolon-separated).
			
 
				+/// `pos` is stored as 0-based after conversion from the 1-based SAM value.
			
 
				+pub struct SaEntry<'a> {
			
 
				+    /// Reference/contig name
			
 
				+    pub chr: &'a str,
			
 
				+    /// Reference start position, 0-based
			
 
				+    pub pos: i32,
			
 
				+    /// Strand: `'+'` or `'-'`
			
 
				+    pub strand: char,
			
 
				+    /// CIGAR string (unparsed)
			
 
				+    pub cigar: &'a str,
			
 
				+    /// Mapping quality
			
 
				+    pub mapq: u8,
			
 
				+    /// Edit distance (NM tag value); `0` if absent or unparseable
			
 
				+    pub nm: u32,
			
 
				+}
			
 
				+
			
 
				+/// Parse all entries from a BAM SA tag value into [`SaEntry`] structs.
			
 
				+///
			
 
				+/// `pos` is converted from 1-based (SAM spec) to 0-based. Entries with fewer than
			
 
				+/// six comma-separated fields, an empty strand, or an unparseable `pos`/`mapQ` are silently skipped; an unparseable `NM` is stored as `0`.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				 ///
			
 
				-/// * `sa` - The SA tag string to parse
			
 
				+/// * `sa` - Raw SA tag value (the string after `SA:Z:` in a SAM record)
			
 
				 ///
			
 
				 /// # Returns
			
 
				 ///
			
 
				-/// A vector of tuples containing chromosome names and positions
			
 
				-pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
			
 
				+/// A vector of parsed entries. Empty if the tag contains no valid entries.
			
 
				+pub fn parse_sa_entries(sa: &str) -> Vec<SaEntry<'_>> {
			
 
				     sa.split(';')
			
 
				         .filter(|s| !s.is_empty())
			
 
				         .filter_map(|s| {
			
 
				-            let parts: Vec<&str> = s.split(',').take(2).collect();
			
 
				-            if parts.len() < 2 {
			
 
				+            let f: Vec<&str> = s.splitn(6, ',').collect();
			
 
				+            if f.len() < 6 {
			
 
				                 return None;
			
 
				             }
			
 
				-
			
 
				-            let chr = parts[0];
			
 
				-            match parts[1].parse::<i32>() {
			
 
				-                Ok(position) => Some((chr, position)),
			
 
				-                Err(_) => None,
			
 
				-            }
			
 
				+            Some(SaEntry {
			
 
				+                chr: f[0],
			
 
				+                pos: f[1].parse::<i32>().ok()? - 1,
			
 
				+                strand: f[2].chars().next()?,
			
 
				+                cigar: f[3],
			
 
				+                mapq: f[4].parse().ok()?,
			
 
				+                nm: f[5].parse().unwrap_or(0),
			
 
				+            })
			
 
				         })
			
 
				         .collect()
			
 
				 }
			
 
				 
			
 
				-/// Resolves the primary record for supplementary alignments using BAM SA tag
			
 
				+/// Convenience wrapper over [`parse_sa_entries`] returning only `(chr, pos)` pairs.
			
 
				+///
			
 
				+/// `pos` is 0-based. Useful when only fetch coordinates are needed.
			
 
				+pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
			
 
				+    parse_sa_entries(sa).into_iter().map(|e| (e.chr, e.pos)).collect()
			
 
				+}
			
 
				+
			
 
				+/// Resolve a supplementary alignment to its primary record via the SA tag.
			
 
				 ///
			
 
				-/// For supplementary alignments, searches linked primary records using genomic positions
			
 
				-/// listed in the SA tag. Returns the first non-supplementary record with matching query name.
			
 
				+/// For supplementary alignments, each position listed in the SA tag is fetched and
			
 
				+/// scanned for a non-supplementary record with the same query name. The first match
			
 
				+/// is returned. If the record is not supplementary, or no primary is found across all
			
 
				+/// SA positions, the original record is returned unchanged.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				-/// * `bam` - Mutable reference to indexed BAM reader for random access
			
 
				-/// * `record` - Input record to evaluate (typically a supplementary alignment)
			
 
				+///
			
 
				+/// * `bam` - Indexed BAM reader used for random-access fetches
			
 
				+/// * `record` - The record to resolve (typically a supplementary alignment)
			
 
				 ///
			
 
				 /// # Returns
			
 
				-/// - Original record if it's not supplementary
			
 
				-/// - First matching primary record found via SA tag positions
			
 
				-/// - Original record if no primary matches found
			
 
				 ///
			
 
				-/// # Panics
			
 
				-/// - If SA tag format is invalid (missing fields or non-integer positions)
			
 
				-/// - If BAM fetch operation fails
			
 
				+/// The primary alignment record, or the input record if resolution is not applicable
			
 
				+/// or unsuccessful.
			
 
				 ///
			
 
				-/// # Example
			
 
				-/// ```
			
 
				-/// use rust_htslib::{bam::{IndexedReader, Read}};
			
 
				+/// # Errors
			
 
				 ///
			
 
				-/// let mut bam = IndexedReader::from_path("input.bam").unwrap();
			
 
				-/// let record = bam.records().next().unwrap().unwrap();
			
 
				-/// let primary = primary_record(&mut bam, record);
			
 
				-/// ```
			
 
				-pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
			
 
				-    // Return immediately if not a supplementary alignment
			
 
				+/// Returns an error if a BAM fetch operation fails or if a fetched record is malformed.
			
 
				+pub fn primary_record(bam: &mut IndexedReader, record: Record) -> anyhow::Result<Record> {
			
 
				     if !record.is_supplementary() {
			
 
				-        return record;
			
 
				+        return Ok(record);
			
 
				     }
			
 
				 
			
 
				     let qname = record.qname();
			
 
				 
			
 
				-    // Process SA tag if present
			
 
				     if let Ok(Aux::String(sa)) = record.aux(b"SA") {
			
 
				-        // Search potential primary alignments at each SA position
			
 
				         for (chr, start) in parse_sa_tag(sa) {
			
 
				             bam.fetch((chr, start, start + 1))
			
 
				-                .expect("BAM fetch failed");
			
 
				+                .with_context(|| format!("BAM fetch failed at {chr}:{start}"))?;
			
 
				 
			
 
				             for result in bam.records() {
			
 
				-                let candidate = result.expect("Invalid BAM record");
			
 
				-                if candidate.qname() == qname && !candidate.is_supplementary() {
			
 
				-                    return candidate.clone();
			
 
				+                let candidate = result.context("Invalid BAM record")?;
			
 
				+                if candidate.qname() == qname && !candidate.is_supplementary() && !candidate.is_secondary() {
			
 
				+                    return Ok(candidate);
			
 
				                 }
			
 
				             }
			
 
				         }
			
 
				     }
			
 
				 
			
 
				-    // Fallback to original record if no primary found
			
 
				-    record
			
 
				+    Ok(record)
			
 
				 }
			
 
				 
			
 
				-/// Creates optimized position bins for fetching records.
			
 
				+/// Group 0-based positions by chromosome into bins spanning at most 1000 bp, sorted by bin size descending.
			
 
				 ///
			
 
				-/// Groups positions by chromosome and creates bins of ±1000 bp.
			
 
				-/// Sorts bins by size (descending) to prioritize regions with more alignment hits.
			
 
				-///
			
 
				-/// # Arguments
			
 
				-///
			
 
				-/// * `positions` - Vector of chromosome/position tuples
			
 
				-///
			
 
				-/// # Returns
			
 
				-///
			
 
				-/// A vector of position bins, sorted by bin size
			
 
				+/// Positions within 1000 bp of the first position in the current bin are merged into
			
 
				+/// that bin. Larger bins are sorted first so that high-density regions are fetched
			
 
				+/// before sparse ones, letting callers reach their early exit sooner.
			
 
				 fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
			
 
				-    // Sort positions by chromosome and position
			
 
				     let mut sorted_positions = positions.to_vec();
			
 
				     sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
			
 
				     sorted_positions.dedup();
			
 
				 
			
 
				-    // Group by chromosome and create bins of ±1000 bp
			
 
				     let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
			
 
				 
			
 
				     for (chr, pos) in sorted_positions {
			
 
				         let bins = grouped.entry(chr).or_default();
			
 
				 
			
 
				         if let Some(last_bin) = bins.last_mut() {
			
 
				-            if last_bin.is_empty() || (pos - last_bin[0].1).abs() <= 1000 {
			
 
				+            if last_bin.is_empty() || pos - last_bin[0].1 <= 1000 {
			
 
				                 last_bin.push((chr, pos));
			
 
				             } else {
			
 
				                 bins.push(vec![(chr, pos)]);
			
@@ -127,44 +139,33 @@ fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i
 
				         }
			
 
				     }
			
 
				 
			
 
				-    // Flatten and sort by bin size (descending)
			
 
				     let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values().flatten().cloned().collect();
			
 
				-
			
 
				-    // Sort bins by size (descending) to prioritize regions with more hits
			
 
				     flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
			
 
				-
			
 
				     flattened
			
 
				 }
			
 
				 
			
 
				-/// Retrieves primary alignment records based on a set of input records.
			
 
				+/// Resolve supplementary alignments in a record set to their primary records.
			
 
				 ///
			
 
				-/// This function processes a collection of BAM records and retrieves their primary alignments.
			
 
				-/// When supplementary alignments are found (with SA tags), it fetches the corresponding
			
 
				-/// primary alignments from the provided BAM file.
			
 
				+/// Primary and secondary records are separated in a first pass. For supplementary
			
 
				+/// records with an SA tag, the SA positions are binned by chromosome and fetched in
			
 
				+/// bulk. Fetching stops as soon as all target query names have been found (early exit).
			
 
				+/// Records that cannot be resolved are dropped with a warning.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				 ///
			
 
				-/// * `bam` - A mutable reference to an IndexedReader for the BAM file
			
 
				-/// * `records` - A vector of input records to process
			
 
				+/// * `bam` - Indexed BAM reader used for random-access fetches
			
 
				+/// * `records` - Mixed set of primary, secondary, and supplementary records
			
 
				 ///
			
 
				 /// # Returns
			
 
				 ///
			
 
				-/// A vector of records containing both:
			
 
				-/// - Original primary alignments from the input
			
 
				-/// - Primary alignments fetched for any supplementary records in the input
			
 
				-///
			
 
				-/// # Examples
			
 
				-///
			
 
				-/// ```
			
 
				-/// use rust_htslib::bam::{IndexedReader, Record};
			
 
				-/// let mut bam = IndexedReader::from_path("sample.bam").unwrap();
			
 
				-/// let records = vec![/* some records */];
			
 
				-/// let primary_alignments = primary_records(&mut bam, records);
			
 
				-/// ```
			
 
				+/// A vector containing original non-supplementary records from the input plus any
			
 
				+/// primaries fetched for supplementary inputs. Secondary records in the input are
			
 
				+/// passed through unchanged; secondary records encountered during SA-position fetching
			
 
				+/// are skipped. Order is not guaranteed.
			
 
				 pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
			
 
				     let mut res = Vec::with_capacity(records.len());
			
 
				     let mut all_positions = Vec::new();
			
 
				-    let mut all_qnames_to_fetch = Vec::new();
			
 
				+    let mut all_qnames_to_fetch = HashSet::new();
			
 
				 
			
 
				     // First pass: collect primary records and positions to fetch
			
 
				     for record in records.iter() {
			
@@ -176,7 +177,7 @@ pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Rec
 
				                 all_positions.extend(positions);
			
 
				 
			
 
				                 match String::from_utf8(qname.to_vec()) {
			
 
				-                    Ok(qname_str) => all_qnames_to_fetch.push(qname_str),
			
 
				+                    Ok(qname_str) => { all_qnames_to_fetch.insert(qname_str); }
			
 
				                     Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
			
 
				                 }
			
 
				             }
			
@@ -250,6 +251,22 @@ pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Rec
 
				     res
			
 
				 }
			
 
				 
			
 
				+/// Extract contig names and lengths from a BAM header.
			
 
				+///
			
 
				+/// Reads all `@SQ` lines and returns a map of sequence name (`SN`) to sequence
			
 
				+/// length (`LN`).
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `header` - BAM header to parse
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// A map from contig name to length in base pairs.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the `LN` field of any `@SQ` line cannot be parsed as `u64`.
			
 
				 pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
			
 
				     let mut genome = HashMap::new();
			
 
				     for (key, records) in header.to_hashmap() {
			
@@ -259,7 +276,7 @@ pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<Has
 
				                     record["SN"].to_string(),
			
 
				                     record["LN"]
			
 
				                         .parse::<u64>()
			
 
				-                        .expect("Failed parsing length of chromosomes"),
			
 
				+                        .with_context(|| format!("Failed to parse LN for contig {}", record["SN"]))?,
			
 
				                 );
			
 
				             }
			
 
				         }
			
@@ -267,8 +284,22 @@ pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<Has
 
				     Ok(genome)
			
 
				 }
			
 
				 
			
 
				-/// Split the genome into N balanced region lists (`ctg:start-end` strings).
			
 
				-/// Each element of the outer Vec is the list of `-r` regions for one batch.
			
 
				+/// Partition a genome into `n` balanced batches of samtools-style region strings.
			
 
				+///
			
 
				+/// Contigs are sorted deterministically and sliced into segments so that each batch
			
 
				+/// covers approximately `total_bp / n` bases. A contig may be split across batches.
			
 
				+/// Region strings use 1-based inclusive coordinates (`ctg:start-end`) as expected
			
 
				+/// by samtools `-r`.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `genome` - Map of contig name to length in bp, typically from [`get_genome_sizes`]
			
 
				+/// * `n` - Number of batches to produce
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// A vector of `n` batches, each containing the region strings assigned to that batch.
			
 
				+/// Returns an empty vector if `n` is zero or `genome` is empty.
			
 
				 pub fn split_genome_into_n_regions(genome: &HashMap<String, u64>, n: usize) -> Vec<Vec<String>> {
			
 
				     if n == 0 || genome.is_empty() {
			
 
				         return Vec::new();
			
@@ -324,24 +355,39 @@ pub enum SegmentOrder {
 
				 }
			
 
				 
			
 
				 /// Fold-back inversion event detected from a primary/supplementary alignment pair.
			
 
				-/// Thanks to: https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py
			
 
				 ///
			
 
				-/// A fold-back inversion occurs when a read has two alignments on the same chromosome
			
 
				-/// with opposite strands that overlap on the reference. This pattern suggests the
			
 
				-/// presence of an inverted duplication or fold-back structure.
			
 
				+/// A fold-back inversion occurs when a single read produces two alignments on the same
			
 
				+/// chromosome with opposite strand orientations. The read travels forward (alignment A),
			
 
				+/// then folds back and runs in reverse (alignment B), covering the same reference region
			
 
				+/// twice — producing a signature identical to an **inverted duplication**.
			
 
				+///
			
 
				+/// SA tag parsing is handled by [`parse_sa_entries`] / [`SaEntry`], which perform the
			
 
				+/// 1-based → 0-based coordinate conversion required by the SAM specification.
			
 
				 ///
			
 
				 /// # Coordinate System
			
 
				+///
			
 
				 /// - All coordinates are 0-based, half-open `[start, end)`
			
 
				-/// - Alignments A and B are ordered by their position on the read (A first)
			
 
				-/// - Breakpoint distances (`b_c`, `a_d`) use strand-aware coordinates
			
 
				+/// - Alignments A and B are ordered by their position on the read (A comes first)
			
 
				 ///
			
 
				 /// # Breakpoints
			
 
				-/// Breakpoints (a, b, c, d) are strand-aware positions representing alignment termini:
			
 
				-/// - For `+` strand: `a = start` (5'), `b = end` (3')
			
 
				-/// - For `-` strand: `c = start` (3'), `d = end` (5')
			
 
				+///
			
 
				+/// `(a, b, c, d)` are alignment termini in **read order** (5′→3′ along the read).
			
 
				+/// `b` is the **inner** (junction-proximal) breakpoint of A; `c` is the **inner**
			
 
				+/// breakpoint of B. `a` and `d` are the **outer** (junction-distal) endpoints.
			
 
				+///
			
 
				+/// For each strand, the 5′ end of the read maps to a different reference position:
			
 
				+///
			
 
				+/// | Strand | 5′ terminus | 3′ terminus |
			
 
				+/// |--------|-------------|-------------|
			
 
				+/// | `+`    | `start` (leftmost ref pos)  | `end` (rightmost ref pos)  |
			
 
				+/// | `−`    | `end`   (rightmost ref pos) | `start` (leftmost ref pos) |
			
 
				+///
			
 
				+/// When A comes first on the read: A's 3′ terminus (`b`) and B's 5′ terminus (`c`)
			
 
				+/// are the **inner** (junction-proximal) breakpoints. A's 5′ (`a`) and B's 3′ (`d`)
			
 
				+/// are the **outer** endpoints.
			
 
				 ///
			
 
				 /// ```text
			
 
				-/// Case: A on '+' strand, B on '-' strand
			
 
				+/// Case: A on '+' strand, B on '−' strand
			
 
				 ///
			
 
				 ///              a                        b
			
 
				 ///              |                        |
			
@@ -350,17 +396,39 @@ pub enum SegmentOrder {
 
				 ///           --------[<=============B======]-------
			
 
				 ///                   ^                     ^
			
 
				 ///                   |                     |
			
 
				-///                   c                     d
			
 
				+///                   d                     c
			
 
				 ///
			
 
				-/// Alignment A (+ strand):  a=aln_a_start, b=aln_a_end
			
 
				-/// Alignment B (- strand):  c=aln_b_start, d=aln_b_end
			
 
				+/// Alignment A (+ strand):  a = aln_a_start  (5' of A, outer)
			
 
				+///                          b = aln_a_end    (3' of A, inner — junction-proximal)
			
 
				+/// Alignment B (− strand):  c = aln_b_end    (5' of B in read order, inner — junction-proximal)
			
 
				+///                          d = aln_b_start  (3' of B in read order, outer)
			
 
				 ///
			
 
				 /// Distances:
			
 
				-///   aln_a_len = |a - b| = alignment A length
			
 
				-///   aln_b_len = |c - d| = alignment B length
			
 
				-///   b_c       = |b - c| = inner breakpoint distance (3' ends)
			
 
				-///   a_d       = |a - d| = outer breakpoint distance (5' ends)
			
 
				+///   aln_a_len = |a − b| = alignment A length on reference
			
 
				+///   aln_b_len = |c − d| = alignment B length on reference
			
 
				+///   b_c       = |b − c| = inner breakpoint distance (junction gap between A and B)
			
 
				+///   a_d       = |a − d| = outer span of the fold-back structure
			
 
				 /// ```
			
 
				+///
			
 
				+/// The region `[d, b]` (where `d < b` in the canonical case) is covered by both
			
 
				+/// alignments and constitutes the **inverted duplication**. For a perfect duplication
			
 
				+/// `b_c = 0` and `a_d = 0`. Events with `a_d < 150 bp` are typically classified as
			
 
				+/// **sequencing artefacts** (short hairpin/palindrome) rather than true structural variants.
			
 
				+///
			
 
				+/// # Divergence from reference implementation
			
 
				+///
			
 
				+/// The reference Python script requires the two alignments to **overlap** on the reference
			
 
				+/// (`overlap > 0 bp`). This implementation is more permissive: it also accepts a small
			
 
				+/// reference-space **gap** between the alignments (controlled by `max_gap` in
			
 
				+/// [`fb_inv_from_record`]). Set `max_gap = Some(0)` to match the strict Python behaviour,
			
 
				+/// which requires overlap because the artefact mechanism — ssDNA forming a hairpin and
			
 
				+/// re-translocating back through the nanopore — guarantees both alignments cover the
			
 
				+/// same physical DNA strand.
			
 
				+///
			
 
				+/// # References
			
 
				+///
			
 
				+/// Geometry, breakpoint labelling, and the `a_d < 150 bp` artefact threshold are adapted from:
			
 
				+/// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
			
 
				 #[derive(Debug)]
			
 
				 pub struct FbInv {
			
 
				     /// Read name (QNAME from BAM)
			
@@ -383,9 +451,9 @@ pub struct FbInv {
 
				     pub aln_a_len: i64,
			
 
				     /// Alignment B length on reference (bp)
			
 
				     pub aln_b_len: i64,
			
 
				-    /// Distance between breakpoints B and C (strand-aware)
			
 
				+    /// `|b - c|`: read-order junction gap — distance between the 3' end of A and the 5' end of B
			
 
				     pub b_c: i64,
			
 
				-    /// Distance between breakpoints A and D (strand-aware)
			
 
				+    /// `|a - d|`: read-order outer span — distance between the 5' end of A and the 3' end of B
			
 
				     pub a_d: i64,
			
 
				     /// Which alignment appears first on the read
			
 
				     pub first: SegmentOrder,
			
@@ -399,21 +467,18 @@ pub struct FbInv {
 
				     pub sa_qend: u32,
			
 
				 }
			
 
				 
			
 
				-/// Parse a SAM CIGAR string into a vector of CIGAR operations.
			
 
				+/// Parse a SAM CIGAR string into a vector of [`Cigar`] operations.
			
 
				 ///
			
 
				-/// Supports all standard CIGAR operations: M, I, D, N, S, H, P, =, X
			
 
				+/// All standard CIGAR operations are supported: `M`, `I`, `D`, `N`, `S`, `H`, `P`, `=`, `X`.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				-/// * `s` - CIGAR string (e.g., "10S100M5I50M20S")
			
 
				+///
			
 
				+/// * `s` - CIGAR string, e.g. `"10S100M5I50M20S"`
			
 
				 ///
			
 
				 /// # Returns
			
 
				-/// `Some(Vec<Cigar>)` if valid, `None` if malformed or empty
			
 
				 ///
			
 
				-/// # Examples
			
 
				-/// ```
			
 
				-/// let ops = parse_cigar_str("10S100M").unwrap();
			
 
				-/// assert_eq!(ops.len(), 2);
			
 
				-/// ```
			
 
				+/// `Some(ops)` on success, `None` if the string is empty, contains an unknown operation
			
 
				+/// character, has an operator without a preceding length, or ends with trailing digits.
			
 
				 pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
			
 
				     if s.is_empty() {
			
 
				         return None;
			
@@ -465,17 +530,22 @@ pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
 
				     Some(ops)
			
 
				 }
			
 
				 
			
 
				-/// Compute query coordinates (start, end) on the read from a CIGAR.
			
 
				+/// Compute `(query_begin, query_end)` on the read from CIGAR operations.
			
 
				 ///
			
 
				-/// For plus strand, the first CIGAR operation corresponds to the read start.
			
 
				-/// For minus strand, the last CIGAR operation corresponds to the read start.
			
 
				+/// For `+` strand the first CIGAR op is at the read start; for `-` strand the last op
			
 
				+/// is at the read start (CIGAR is encoded in reference order regardless of strand).
			
 
				+/// Hard-clipped bases are excluded; soft-clip length sets `query_begin`.
			
 
				+/// Only query-consuming operations (`M`, `I`, `=`, `X`) contribute to the alignment length.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				-/// * `ops` - CIGAR operations slice
			
 
				-/// * `strand` - '+' or '-'
			
 
				+///
			
 
				+/// * `ops` - Slice of CIGAR operations
			
 
				+/// * `strand` - `'+'` or `'-'`
			
 
				 ///
			
 
				 /// # Returns
			
 
				-/// Tuple of (query_begin, query_end) in read coordinates (excluding hard clips)
			
 
				+///
			
 
				+/// `(query_begin, query_end)` as read-coordinate offsets (0-based, hard clips excluded).
			
 
				+/// Returns `(0, 0)` for an empty slice.
			
 
				 fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
			
 
				     if ops.is_empty() {
			
 
				         return (0, 0);
			
@@ -505,15 +575,18 @@ fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
 
				     (query_beg, query_beg + aln_len)
			
 
				 }
			
 
				 
			
 
				-/// Compute alignment length on the reference from CIGAR operations.
			
 
				+/// Compute the reference-consuming length of a CIGAR.
			
 
				 ///
			
 
				-/// Counts bases consumed by M, D, N, =, X operations.
			
 
				+/// Sums lengths of operations that advance the reference coordinate:
			
 
				+/// `M`, `D`, `N`, `=`, `X`. Insertions, soft clips, hard clips, and pads are excluded.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				-/// * `ops` - CIGAR operations slice
			
 
				+///
			
 
				+/// * `ops` - Slice of CIGAR operations
			
 
				 ///
			
 
				 /// # Returns
			
 
				-/// Reference-consuming length in base pairs
			
 
				+///
			
 
				+/// Reference-consuming length in base pairs. Returns `0` for an empty slice.
			
 
				 fn alignment_ref_length(ops: &[Cigar]) -> u32 {
			
 
				     ops.iter()
			
 
				         .filter_map(|op| match op {
			
@@ -527,32 +600,40 @@ fn alignment_ref_length(ops: &[Cigar]) -> u32 {
 
				         .sum()
			
 
				 }
			
 
				 
			
 
				-/// Compute gap between two intervals. Negative value means overlap.
			
 
				+/// Compute the gap in base pairs between two half-open intervals.
			
 
				+///
			
 
				+/// Both intervals are `[start, end)`. A negative return value indicates that the
			
 
				+/// intervals overlap by its magnitude in bases; zero means they are adjacent; positive means
			
 
				+/// they are separated.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				+///
			
 
				 /// * `a_start`, `a_end` - First interval `[a_start, a_end)`
			
 
				 /// * `b_start`, `b_end` - Second interval `[b_start, b_end)`
			
 
				 ///
			
 
				 /// # Returns
			
 
				-/// Gap in bp (negative if overlapping, 0 if adjacent, positive if separated)
			
 
				+///
			
 
				+/// `max(starts) - min(ends)`: negative for overlap, 0 for adjacency, positive for gap.
			
 
				 #[inline]
			
 
				 fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 {
			
 
				     a_start.max(b_start) - a_end.min(b_end)
			
 
				 }
			
 
				 
			
 
				-/// Compute strand-aware breakpoint coordinates.
			
 
				+/// Return `(5'_terminus, 3'_terminus)` in read order for a single alignment.
			
 
				 ///
			
 
				-/// For fold-back inversions, breakpoints depend on strand orientation:
			
 
				-/// - Plus strand: breakpoints are (start, end) - 5' to 3' direction
			
 
				-/// - Minus strand: breakpoints are (end, start) - reversed direction
			
 
				+/// For `+` strand the 5' end maps to the leftmost reference position (`start`) and the
			
 
				+/// 3' end to the rightmost (`end`). For `-` strand the read runs right-to-left on the
			
 
				+/// reference, so 5' = `end` and 3' = `start`.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				-/// * `start` - Alignment start (0-based)
			
 
				-/// * `end` - Alignment end (exclusive)
			
 
				-/// * `strand` - '+' or '-'
			
 
				+///
			
 
				+/// * `start` - Alignment start, 0-based inclusive
			
 
				+/// * `end` - Alignment end, 0-based exclusive
			
 
				+/// * `strand` - `'+'` or `'-'`
			
 
				 ///
			
 
				 /// # Returns
			
 
				-/// Tuple of (breakpoint_a, breakpoint_b) in strand-aware order
			
 
				+///
			
 
				+/// `(5'_coord, 3'_coord)` where both are 0-based reference positions.
			
 
				 #[inline]
			
 
				 fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
			
 
				     if strand == '+' {
			
@@ -562,35 +643,46 @@ fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
 
				     }
			
 
				 }
			
 
				 
			
 
				-/// Extract a fold-back inversion event from a BAM record.
			
 
				+/// Attempt to extract a fold-back inversion event from a primary BAM record.
			
 
				 ///
			
 
				-/// Analyzes a primary alignment and its supplementary alignment (from SA tag)
			
 
				-/// to detect fold-back inversion patterns characterized by:
			
 
				-/// - Same chromosome
			
 
				-/// - Opposite strands
			
 
				-/// - Reference coordinates within proximity threshold
			
 
				-/// - Exactly one supplementary alignment
			
 
				+/// Inspects the record's SA tag using [`parse_sa_entries`] and checks whether the single
			
 
				+/// supplementary alignment forms a fold-back inversion with the primary: same chromosome,
			
 
				+/// opposite strand, and reference coordinates within the specified proximity thresholds.
			
 
				+/// Returns a populated [`FbInv`] with all breakpoint metrics on success.
			
 
				+///
			
 
				+/// Alignments A and B in the returned struct are ordered by their position on the read
			
 
				+/// (A comes first). All coordinates are 0-based half-open; the 1-based SA tag `pos` is
			
 
				+/// converted automatically by [`SaEntry`].
			
 
				+///
			
 
				+/// Recommended thresholds for **artefact detection**: `max_overlap = Some(150)`,
			
 
				+/// `max_gap = Some(0)`. Rejecting any gap (`max_gap = Some(0)`; exact adjacency still passes) approximates the reference
			
 
				+/// Python implementation and avoids capturing true SV junctions that happen to have
			
 
				+/// opposite-strand supplementary alignments with a reference-space gap.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				-/// * `record` - BAM record (must be primary alignment)
			
 
				-/// * `header` - BAM header for reference name lookup
			
 
				-/// * `min_mapq` - Minimum mapping quality threshold for both alignments
			
 
				-/// * `max_overlap` - Maximum allowed overlap in bp (None = no limit)
			
 
				-/// * `max_gap` - Maximum allowed gap in bp (None = no limit, 0 = require overlap or adjacent)
			
 
				-///   default should be [-150, 200) Some(150), Some(200)
			
 
				+///
			
 
				+/// * `record` - BAM record to evaluate; must be a primary alignment
			
 
				+/// * `header` - BAM header used to resolve the reference name from `tid`
			
 
				+/// * `min_mapq` - Minimum mapping quality required for both the primary and the supplementary
			
 
				+/// * `max_overlap` - Maximum allowed reference-space overlap in bp (`None` = no limit)
			
 
				+/// * `max_gap` - Maximum allowed reference-space gap in bp (`None` = no limit)
			
 
				 ///
			
 
				 /// # Returns
			
 
				-/// `Some(FbInv)` if a valid fold-back pattern is detected, `None` otherwise
			
 
				 ///
			
 
				-/// # Filtering
			
 
				-/// Returns `None` if any of these conditions are met:
			
 
				+/// `Some(FbInv)` if all criteria are met, `None` otherwise.
			
 
				+///
			
 
				+/// Returns `None` when any of the following conditions apply:
			
 
				 /// - Record is unmapped, secondary, or supplementary
			
 
				-/// - MAPQ below threshold (primary or supplementary)
			
 
				-/// - No SA tag or multiple supplementary alignments
			
 
				-/// - Supplementary on different chromosome
			
 
				-/// - Same strand orientation
			
 
				-/// - Overlap exceeds `max_overlap` (if set)
			
 
				-/// - Gap exceeds `max_gap` (if set)
			
 
				+/// - Primary or supplementary MAPQ is below `min_mapq`
			
 
				+/// - SA tag is absent or contains more than one entry
			
 
				+/// - Supplementary aligns to a different chromosome
			
 
				+/// - Both alignments are on the same strand
			
 
				+/// - Reference-space overlap exceeds `max_overlap`
			
 
				+/// - Reference-space gap exceeds `max_gap`
			
 
				+///
			
 
				+/// # References
			
 
				+///
			
 
				+/// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
			
 
				 pub fn fb_inv_from_record(
			
 
				     record: &bam::Record,
			
 
				     header: &bam::HeaderView,
			
@@ -607,32 +699,19 @@ pub fn fb_inv_from_record(
 
				         return None;
			
 
				     }
			
 
				 
			
 
				-    // SA tag required
			
 
				+    // SA tag required; exactly one supplementary alignment expected
			
 
				     let sa_str = match record.aux(b"SA") {
			
 
				         Ok(Aux::String(s)) => s,
			
 
				         _ => return None,
			
 
				     };
			
 
				 
			
 
				-    let sa_entries: Vec<&str> = sa_str.split(';').filter(|s| !s.is_empty()).collect();
			
 
				-
			
 
				-    // Require exactly one SA (like the Python code)
			
 
				+    let sa_entries = parse_sa_entries(sa_str);
			
 
				     if sa_entries.len() != 1 {
			
 
				         return None;
			
 
				     }
			
 
				-    let sa = sa_entries[0];
			
 
				-
			
 
				-    // SA format: chr,pos,strand,cigar,mapq,nm
			
 
				-    let fields: Vec<&str> = sa.split(',').collect();
			
 
				-    if fields.len() < 6 {
			
 
				-        return None;
			
 
				-    }
			
 
				-    let sa_rname = fields[0];
			
 
				-    let sa_pos_1based: i64 = fields[1].parse().ok()?; // SAM is 1-based
			
 
				-    let sa_strand_char: char = fields[2].chars().next()?;
			
 
				-    let sa_cigar_str = fields[3];
			
 
				-    let sa_mapq: u8 = fields[4].parse().ok()?;
			
 
				+    let sa = &sa_entries[0];
			
 
				 
			
 
				-    if sa_mapq < min_mapq {
			
 
				+    if sa.mapq < min_mapq {
			
 
				         return None;
			
 
				     }
			
 
				 
			
@@ -642,13 +721,13 @@ pub fn fb_inv_from_record(
 
				         return None;
			
 
				     }
			
 
				     let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?;
			
 
				-    if sa_rname != primary_rname {
			
 
				+    if sa.chr != primary_rname {
			
 
				         return None;
			
 
				     }
			
 
				 
			
 
				     // Strand check: need opposite strands for fold-back inversion
			
 
				     let primary_strand_char = if record.is_reverse() { '-' } else { '+' };
			
 
				-    if primary_strand_char == sa_strand_char {
			
 
				+    if primary_strand_char == sa.strand {
			
 
				         return None;
			
 
				     }
			
 
				 
			
@@ -656,17 +735,14 @@ pub fn fb_inv_from_record(
 
				     let primary_start = record.pos();
			
 
				     let primary_end = record.cigar().end_pos();
			
 
				 
			
 
				-    // Parse SA CIGAR
			
 
				-    let sa_cigar_ops = parse_cigar_str(sa_cigar_str)?;
			
 
				+    // Parse SA CIGAR; pos is already 0-based from SaEntry
			
 
				+    let sa_cigar_ops = parse_cigar_str(sa.cigar)?;
			
 
				     let sa_ref_len = alignment_ref_length(&sa_cigar_ops);
			
 
				-
			
 
				-    // Validation: SA must have non-zero reference length
			
 
				     if sa_ref_len == 0 {
			
 
				         return None;
			
 
				     }
			
 
				 
			
 
				-    // SA ref coords: convert POS from 1-based to 0-based
			
 
				-    let sa_start = sa_pos_1based - 1;
			
 
				+    let sa_start = sa.pos as i64;
			
 
				     let sa_end = sa_start + sa_ref_len as i64;
			
 
				 
			
 
				     // Check proximity between alignments
			
@@ -692,7 +768,7 @@ pub fn fb_inv_from_record(
 
				     // Query coords for primary and SA
			
 
				     let primary_cigar: Vec<Cigar> = record.cigar().iter().cloned().collect();
			
 
				     let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char);
			
 
				-    let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa_strand_char);
			
 
				+    let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa.strand);
			
 
				 
			
 
				     // Decide which segment is first on the read
			
 
				     let first = if primary_qbeg <= sa_qbeg {
			
@@ -710,13 +786,13 @@ pub fn fb_inv_from_record(
 
				                 primary_strand_char,
			
 
				                 sa_start,
			
 
				                 sa_end,
			
 
				-                sa_strand_char,
			
 
				+                sa.strand,
			
 
				             )
			
 
				         } else {
			
 
				             (
			
 
				                 sa_start,
			
 
				                 sa_end,
			
 
				-                sa_strand_char,
			
 
				+                sa.strand,
			
 
				                 primary_start,
			
 
				                 primary_end,
			
 
				                 primary_strand_char,
			
@@ -758,40 +834,27 @@ pub fn fb_inv_from_record(
 
				     })
			
 
				 }
			
 
				 
			
 
				-// pub fn read_sm_tag(bam_path: &str) -> anyhow::Result<String> {
			
 
				-//     let reader = bam::Reader::from_path(bam_path)
			
 
				-//         .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
			
 
				-//     let header = bam::Header::from_template(reader.header());
			
 
				-//     let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
			
 
				-//
			
 
				-//     for line in header_text.lines() {
			
 
				-//         if line.starts_with("@RG") {
			
 
				-//             for field in line.split('\t') {
			
 
				-//                 if let Some(sm) = field.strip_prefix("SM:") {
			
 
				-//                     return Ok(sm.to_string());
			
 
				-//                 }
			
 
				-//             }
			
 
				-//         }
			
 
				-//     }
			
 
				-//     anyhow::bail!("No SM tag found in @RG header of {bam_path}")
			
 
				-// }
			
 
				-
			
 
				-// fn ensure_bam_sm_tag(id: &str, config: &Config) -> anyhow::Result<()> {
			
 
				-//     for bam in [config.normal_bam(id), config.tumoral_bam(id)] {
			
 
				-//         if read_sm_tag(&bam).is_err() {
			
 
				-//             let sample = Path::new(&bam)
			
 
				-//                 .file_stem()
			
 
				-//                 .unwrap_or_default()
			
 
				-//                 .to_string_lossy()
			
 
				-//                 .to_string();
			
 
				-//             info!("Injecting missing @RG SM:{sample} into {bam}");
			
 
				-//             SamtoolsReheader::from_config(config, &bam, &sample).run()?;
			
 
				-//             SamtoolsIndex::from_config(config, &bam).run()?;
			
 
				-//         }
			
 
				-//     }
			
 
				-//     Ok(())
			
 
				-// }
			
 
				-
			
 
				+/// Read the `SM` tag from a BAM's `@RG` lines, injecting it if absent or inconsistent.
			
 
				+///
			
 
				+/// Inspects every `@RG` line in the header. If all lines carry an identical `SM` value,
			
 
				+/// that value is returned. If any line is missing `SM`, or if multiple distinct `SM`
			
 
				+/// values are present, `samtools reheader` is invoked to set `SM` uniformly to
			
 
				+/// `fallback_sample`. The file modification time is preserved after reheadering.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `bam_path` - Path to the BAM file (must be writable if reheadering is needed)
			
 
				+/// * `fallback_sample` - Sample name to inject when the header is missing or ambiguous
			
 
				+/// * `config` - Pipeline configuration used to invoke [`SamtoolsReheader`]
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// The `SM` value present in (or injected into) the header.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the BAM cannot be opened, if reheadering fails, or if the
			
 
				+/// header contains no `@RG` lines at all.
			
 
				 pub fn read_sm_tag_or_inject(
			
 
				     bam_path: &str,
			
 
				     fallback_sample: &str,
			
@@ -800,7 +863,7 @@ pub fn read_sm_tag_or_inject(
 
				     let reader = bam::Reader::from_path(bam_path)
			
 
				         .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
			
 
				     let header = bam::Header::from_template(reader.header());
			
 
				-    let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
			
 
				+    let header_text = String::from_utf8_lossy(&header.to_bytes()).into_owned();
			
 
				 
			
 
				     let mut sm_values: Vec<String> = Vec::new();
			
 
				     let mut all_have_sm = true;
			
@@ -817,10 +880,8 @@ pub fn read_sm_tag_or_inject(
 
				         }
			
 
				     }
			
 
				 
			
 
				-    // Determine whether reheadering is needed:
			
 
				-    // 1. Any @RG missing SM
			
 
				-    // 2. SM values are not all identical
			
 
				-    // 3. No @RG lines at all
			
 
				+    // Reheader if any @RG lacks SM or if multiple distinct SM values exist.
			
 
				+    // No @RG lines at all falls through to the error at the end.
			
 
				     let unique_sm: std::collections::HashSet<&str> = sm_values.iter().map(|s| s.as_str()).collect();
			
 
				 
			
 
				     let needs_reheader = !all_have_sm || unique_sm.len() > 1;
			
@@ -856,29 +917,31 @@ pub fn read_sm_tag_or_inject(
 
				         .context(format!("No @RG lines found in {bam_path}"))
			
 
				 }
			
 
				 
			
 
				-/// Fetch primary records from a BAM file, with optional region and qname filtering.
			
 
				+/// Fetch primary alignment records from an indexed BAM, with optional region and name filtering.
			
 
				+///
			
 
				+/// Supplementary alignments within the fetch region are resolved to their primaries via
			
 
				+/// [`primary_records`]. Duplicates introduced by that resolution step are removed by
			
 
				+/// query name. Secondary alignments are always excluded.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				-/// * `bam_path`     - Path to indexed BAM file
			
 
				-/// * `region`       - Optional region to fetch (chr, start, end). None = whole file.
			
 
				-/// * `qname_filter` - Optional qname filter. None = no filter.
			
 
				 ///
			
 
				-/// # Notes
			
 
				-/// - Supplementary alignments are resolved to their primary via SA tag using `primary_records`
			
 
				-/// - White/blacklist are mutually exclusive — passing both is a logic error and will panic in debug
			
 
				+/// * `bam_path` - Path to an indexed BAM file (`.bai` index must exist)
			
 
				+/// * `region` - If `Some((chr, start, end))`, restrict the fetch to that 0-based half-open
			
 
				+///   interval; if `None`, the whole file is scanned
			
 
				+/// * `qname_filter` - Optional [`QnameFilter`] to retain or exclude specific read names
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// A deduplicated vector of primary records, optionally filtered by query name.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the BAM cannot be opened or if the fetch operation fails.
			
 
				 pub fn fetch_primary_records(
			
 
				     bam_path: &Path,
			
 
				     region: Option<(&str, i64, i64)>,
			
 
				     qname_filter: Option<QnameFilter>,
			
 
				 ) -> anyhow::Result<Vec<bam::Record>> {
			
 
				-    debug_assert!(
			
 
				-        !matches!(
			
 
				-            &qname_filter,
			
 
				-            Some(QnameFilter::Whitelist(_)) | Some(QnameFilter::Blacklist(_))
			
 
				-        ) || qname_filter.is_some(),
			
 
				-        "Cannot pass both whitelist and blacklist"
			
 
				-    );
			
 
				-
			
 
				     let mut reader = bam::IndexedReader::from_path(bam_path)
			
 
				         .with_context(|| format!("Cannot open BAM: {}", bam_path.display()))?;
			
 
				 
			
@@ -924,66 +987,31 @@ pub fn fetch_primary_records(
 
				     Ok(records)
			
 
				 }
			
 
				 
			
 
				+/// Query-name filter for [`fetch_primary_records`].
			
 
				 pub enum QnameFilter {
			
 
				+    /// Retain only records whose query name is in the set.
			
 
				     Whitelist(HashSet<Vec<u8>>),
			
 
				+    /// Exclude records whose query name is in the set.
			
 
				     Blacklist(HashSet<Vec<u8>>),
			
 
				 }
			
 
				 
			
 
				-/// Convert an input BAM/CRAM file into a BED file containing one interval per
			
 
				-/// aligned record.
			
 
				+/// Convert a BAM/CRAM file to a BED4 file with one interval per aligned record.
			
 
				 ///
			
 
				-/// Each output line corresponds to a single aligned read and has the form:
			
 
				-///
			
 
				-/// ```text
			
 
				-/// chrom    start    end    read_name
			
 
				-/// ```
			
 
				-///
			
 
				-/// # Behavior
			
 
				-///
			
 
				-/// - Unmapped records are skipped.
			
 
				-/// - Chromosome names are taken from the BAM header.
			
 
				-/// - Coordinates follow BED conventions:
			
 
				-///   - `start` is **0-based inclusive**
			
 
				-///   - `end` is **0-based exclusive**
			
 
				-/// - The interval `[start, end)` is computed using:
			
 
				-///   - [`bam::Record::pos`] for the start
			
 
				-///   - [`bam::Record::reference_end`] for the end
			
 
				-/// - The end position accounts for all CIGAR operations consuming reference
			
 
				-///   (`M`, `D`, `N`, `=`, `X`).
			
 
				+/// Each output line has the form `chrom\tstart\tend\tread_name`. Coordinates are
			
 
				+/// 0-based half-open (`start` from [`Record::pos`], `end` from [`Record::reference_end`]),
			
 
				+/// consistent with BED convention. Unmapped records and records with a negative `tid`
			
 
				+/// are skipped. Secondary and supplementary alignments are **not** filtered — add
			
 
				+/// upstream filtering if stricter selection is needed.
			
 
				 ///
			
 
				 /// # Arguments
			
 
				 ///
			
 
				-/// * `input_bam` - Path to input BAM or CRAM file
			
 
				-/// * `output_bed` - Path to output BED file
			
 
				+/// * `input_bam` - Path to the input BAM or CRAM file
			
 
				+/// * `output_bed` - Path to the output BED file (created or overwritten)
			
 
				 ///
			
 
				 /// # Errors
			
 
				 ///
			
 
				-/// Returns an error if:
			
 
				-/// - The input file cannot be opened or read
			
 
				-/// - The output file cannot be created or written
			
 
				-/// - UTF-8 conversion of reference names or read names fails
			
 
				-///
			
 
				-/// # Example
			
 
				-///
			
 
				-/// ```no_run
			
 
				-/// use std::path::Path;
			
 
				-///
			
 
				-/// fn main() -> anyhow::Result<()> {
			
 
				-///     bam_to_aligned_bed("input.bam", "output.bed")?;
			
 
				-///     Ok(())
			
 
				-/// }
			
 
				-/// ```
			
 
				-///
			
 
				-/// # Notes
			
 
				-///
			
 
				-/// - Secondary, supplementary, and duplicate reads are **not filtered**.
			
 
				-///   Add additional flags if stricter filtering is required.
			
 
				-/// - For paired-end reads, each alignment record is emitted independently.
			
 
				-///
			
 
				-/// # See also
			
 
				-///
			
 
				-/// - [`rust_htslib::bam::Reader`]
			
 
				-/// - [`rust_htslib::bam::Record`]
			
 
				+/// Returns an error if the input cannot be read, the output cannot be written, or
			
 
				+/// if UTF-8 conversion of a reference name or query name fails.
			
 
				 pub fn bam_to_aligned_bed<P: AsRef<Path>>(
			
 
				     input_bam: P,
			
 
				     output_bed: P,
			
--- a/src/scan/bin.rs
+++ b/src/scan/bin.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;
 
				 
			
 
				 use anyhow::Context;
			
 
				 use csv::ByteRecord;
			
 
				-use log::error;
			
 
				+use log::{error, warn};
			
 
				 use rust_htslib::bam::{HeaderView, IndexedReader, Read, Record, ext::BamRecordExtensions, record::Aux};
			
 
				 
			
 
				 use crate::io::{bam::{fb_inv_from_record, primary_record, primary_records}, tsv::{parse_csv_u32_into, parse_u32}};
			
@@ -149,7 +149,11 @@ impl Bin {
 
				         self.reads_store
			
 
				             .values()
			
 
				             .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
			
 
				-            .map(|record| primary_record(bam_reader, record.clone()))
			
 
				+            .filter_map(|record| {
			
 
				+                primary_record(bam_reader, record.clone())
			
 
				+                    .map_err(|e| warn!("primary_record failed: {e}"))
			
 
				+                    .ok()
			
 
				+            })
			
 
				             .collect()
			
 
				     }
			
 
				 
			
@@ -301,7 +305,11 @@ impl Bin {
 
				         self.reads_store
			
 
				             .values()
			
 
				             .filter(|record| record.reference_start() as u32 == pos)
			
 
				-            .map(|record| primary_record(bam_reader, record.clone()))
			
 
				+            .filter_map(|record| {
			
 
				+                primary_record(bam_reader, record.clone())
			
 
				+                    .map_err(|e| warn!("primary_record failed: {e}"))
			
 
				+                    .ok()
			
 
				+            })
			
 
				             .collect()
			
 
				     }
			
 
				 
			
@@ -318,7 +326,11 @@ impl Bin {
 
				         self.reads_store
			
 
				             .values()
			
 
				             .filter(|record| record.reference_end() as u32 == pos)
			
 
				-            .map(|record| primary_record(bam_reader, record.clone()))
			
 
				+            .filter_map(|record| {
			
 
				+                primary_record(bam_reader, record.clone())
			
 
				+                    .map_err(|e| warn!("primary_record failed: {e}"))
			
 
				+                    .ok()
			
 
				+            })
			
 
				             .collect()
			
 
				     }