| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123 |
- use std::io::Write;
- use std::{
- collections::{HashMap, HashSet},
- fs::File,
- path::Path,
- };
- use anyhow::Context;
- use log::{info, warn};
- use rust_htslib::bam::{
- self,
- ext::BamRecordExtensions,
- record::{Aux, Cigar},
- FetchDefinition, IndexedReader, Read, Record,
- };
- use crate::{commands::samtools::SamtoolsReheader, config::Config, runners::Run};
- /// A single entry from a BAM SA (supplementary alignment) tag.
- ///
- /// SA tag format per the SAM spec: `rname,pos,strand,CIGAR,mapQ,NM` (semicolon-separated).
- /// `pos` is stored as 0-based after conversion from the 1-based SAM value.
- pub struct SaEntry<'a> {
- /// Reference/contig name
- pub chr: &'a str,
- /// Reference start position, 0-based
- pub pos: i32,
- /// Strand: `'+'` or `'-'`
- pub strand: char,
- /// CIGAR string (unparsed)
- pub cigar: &'a str,
- /// Mapping quality
- pub mapq: u8,
- /// Edit distance (NM tag value); `0` if absent or unparseable
- pub nm: u32,
- }
- /// Parse all entries from a BAM SA tag value into [`SaEntry`] structs.
- ///
- /// `pos` is converted from 1-based (SAM spec) to 0-based. Entries with fewer than
- /// six comma-separated fields or unparseable numeric fields are silently skipped.
- ///
- /// # Arguments
- ///
- /// * `sa` - Raw SA tag value (the string after `SA:Z:` in a SAM record)
- ///
- /// # Returns
- ///
- /// A vector of parsed entries. Empty if the tag contains no valid entries.
- pub fn parse_sa_entries(sa: &str) -> Vec<SaEntry<'_>> {
- sa.split(';')
- .filter(|s| !s.is_empty())
- .filter_map(|s| {
- let f: Vec<&str> = s.splitn(6, ',').collect();
- if f.len() < 6 {
- return None;
- }
- Some(SaEntry {
- chr: f[0],
- pos: f[1].parse::<i32>().ok()? - 1,
- strand: f[2].chars().next()?,
- cigar: f[3],
- mapq: f[4].parse().ok()?,
- nm: f[5].parse().unwrap_or(0),
- })
- })
- .collect()
- }
- /// Convenience wrapper over [`parse_sa_entries`] returning only `(chr, pos)` pairs.
- ///
- /// `pos` is 0-based. Useful when only fetch coordinates are needed.
- pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
- parse_sa_entries(sa)
- .into_iter()
- .map(|e| (e.chr, e.pos))
- .collect()
- }
- /// Resolve a supplementary alignment to its primary record via the SA tag.
- ///
- /// For supplementary alignments, each position listed in the SA tag is fetched and
- /// scanned for a non-supplementary record with the same query name. The first match
- /// is returned. If the record is not supplementary, or no primary is found across all
- /// SA positions, the original record is returned unchanged.
- ///
- /// # Arguments
- ///
- /// * `bam` - Indexed BAM reader used for random-access fetches
- /// * `record` - The record to resolve (typically a supplementary alignment)
- ///
- /// # Returns
- ///
- /// The primary alignment record, or the input record if resolution is not applicable
- /// or unsuccessful.
- ///
- /// # Errors
- ///
- /// Returns an error if a BAM fetch operation fails or if a fetched record is malformed.
- pub fn primary_record(bam: &mut IndexedReader, record: Record) -> anyhow::Result<Record> {
- if !record.is_supplementary() {
- return Ok(record);
- }
- let qname = record.qname();
- if let Ok(Aux::String(sa)) = record.aux(b"SA") {
- for (chr, start) in parse_sa_tag(sa) {
- bam.fetch((chr, start, start + 1))
- .with_context(|| format!("BAM fetch failed at {chr}:{start}"))?;
- for result in bam.records() {
- let candidate = result.context("Invalid BAM record")?;
- if candidate.qname() == qname
- && !candidate.is_supplementary()
- && !candidate.is_secondary()
- {
- return Ok(candidate);
- }
- }
- }
- }
- Ok(record)
- }
- /// Group 0-based positions by chromosome into bins spanning at most 1000 bp, sorted by bin size descending.
- ///
- /// Positions within 1000 bp of the first position in the current bin are merged into
- /// that bin. Larger bins are sorted first so that high-density regions are fetched
- /// before sparse ones, enabling earlier early-exit in callers.
- fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
- let mut sorted_positions = positions.to_vec();
- sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
- sorted_positions.dedup();
- let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
- for (chr, pos) in sorted_positions {
- let bins = grouped.entry(chr).or_default();
- if let Some(last_bin) = bins.last_mut() {
- if last_bin.is_empty() || pos - last_bin[0].1 <= 1000 {
- last_bin.push((chr, pos));
- } else {
- bins.push(vec![(chr, pos)]);
- }
- } else {
- bins.push(vec![(chr, pos)]);
- }
- }
- let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values().flatten().cloned().collect();
- flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
- flattened
- }
- /// Resolve supplementary alignments in a record set to their primary records.
- ///
- /// Primary and secondary records are separated in a first pass. For supplementary
- /// records with an SA tag, the SA positions are binned by chromosome and fetched in
- /// bulk. Fetching stops as soon as all target query names have been found (early exit).
- /// Records that cannot be resolved are silently dropped with a warning.
- ///
- /// # Arguments
- ///
- /// * `bam` - Indexed BAM reader used for random-access fetches
- /// * `records` - Mixed set of primary, secondary, and supplementary records
- ///
- /// # Returns
- ///
- /// A vector containing original non-supplementary records from the input plus any
- /// primaries fetched for supplementary inputs. Secondary records in the input are
- /// passed through unchanged; secondary records encountered during SA-position fetching
- /// are skipped. Order is not guaranteed.
- pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
- let mut res = Vec::with_capacity(records.len());
- let mut all_positions = Vec::new();
- let mut all_qnames_to_fetch = HashSet::new();
- // First pass: collect primary records and positions to fetch
- for record in records.iter() {
- if record.is_supplementary() {
- let qname = record.qname();
- // Safely extract and process the SA tag
- if let Ok(Aux::String(sa)) = record.aux(b"SA") {
- let positions = parse_sa_tag(sa);
- all_positions.extend(positions);
- match String::from_utf8(qname.to_vec()) {
- Ok(qname_str) => {
- all_qnames_to_fetch.insert(qname_str);
- }
- Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
- }
- }
- } else {
- res.push(record.clone());
- }
- }
- // Track which query names we've already found
- let mut primary_records: HashSet<String> = res
- .iter()
- .filter_map(|r| String::from_utf8(r.qname().to_vec()).ok())
- .collect();
- // Create position bins for efficient fetching
- let position_bins = create_position_bins(&all_positions);
- // Fetch records for each bin until we find all primaries
- for bin in position_bins {
- if bin.is_empty() {
- continue;
- }
- let (chr, start) = bin.first().unwrap();
- let end = bin.last().unwrap().1 + 1;
- // Fetch and process records in this region
- if let Err(e) = bam.fetch((*chr, *start, end)) {
- warn!("Failed to fetch region {}:{}-{}: {}", chr, start, end, e);
- continue;
- }
- for record_result in bam.records() {
- match record_result {
- Ok(record) => {
- // Skip secondary alignments
- if record.is_secondary() {
- continue;
- }
- // Check if this is a primary we're looking for
- match String::from_utf8(record.qname().to_vec()) {
- Ok(qname) => {
- if primary_records.contains(&qname) {
- continue;
- }
- if all_qnames_to_fetch.contains(&qname) {
- res.push(record);
- primary_records.insert(qname);
- }
- }
- Err(_) => continue,
- }
- }
- Err(e) => warn!("Error reading record: {}", e),
- }
- }
- // Early exit if we found all the records we were looking for
- let remaining = all_qnames_to_fetch
- .iter()
- .filter(|q| !primary_records.contains(*q))
- .count();
- if remaining == 0 {
- break;
- }
- }
- res
- }
- /// Extract contig names and lengths from a BAM header.
- ///
- /// Reads all `@SQ` lines and returns a map of sequence name (`SN`) to sequence
- /// length (`LN`).
- ///
- /// # Arguments
- ///
- /// * `header` - BAM header to parse
- ///
- /// # Returns
- ///
- /// A map from contig name to length in base pairs.
- ///
- /// # Errors
- ///
- /// Returns an error if the `LN` field of any `@SQ` line cannot be parsed as `u64`.
- pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
- let mut genome = HashMap::new();
- for (key, records) in header.to_hashmap() {
- for record in records {
- if key == "SQ" {
- genome.insert(
- record["SN"].to_string(),
- record["LN"].parse::<u64>().with_context(|| {
- format!("Failed to parse LN for contig {}", record["SN"])
- })?,
- );
- }
- }
- }
- Ok(genome)
- }
- /// Partition a genome into `n` balanced batches of samtools-style region strings.
- ///
- /// Contigs are sorted deterministically and sliced into segments so that each batch
- /// covers approximately `total_bp / n` bases. A contig may be split across batches.
- /// Region strings use 1-based inclusive coordinates (`ctg:start-end`) as expected
- /// by samtools `-r`.
- ///
- /// # Arguments
- ///
- /// * `genome` - Map of contig name to length in bp, typically from [`get_genome_sizes`]
- /// * `n` - Number of batches to produce
- ///
- /// # Returns
- ///
- /// A vector of `n` batches, each containing the region strings assigned to that batch.
- /// Returns an empty vector if `n` is zero or `genome` is empty.
- pub fn split_genome_into_n_regions(genome: &HashMap<String, u64>, n: usize) -> Vec<Vec<String>> {
- if n == 0 || genome.is_empty() {
- return Vec::new();
- }
- // Deterministic order over contigs
- let mut contigs: Vec<_> = genome.iter().collect();
- contigs.sort_by(|(a, _), (b, _)| a.cmp(b));
- let total_bp: u64 = contigs.iter().map(|(_, &len)| len).sum();
- let target_per_batch: u64 = total_bp.div_ceil(n as u64); // ceil
- let mut batches: Vec<Vec<String>> = vec![Vec::new(); n];
- let mut batch_idx = 0usize;
- let mut batch_bp = 0u64;
- for (ctg, &len) in contigs {
- let mut start: u64 = 1;
- while start <= len && batch_idx < n {
- let remaining_in_ctg = len - start + 1;
- let remaining_for_batch = if batch_bp >= target_per_batch {
- 1 // force close & move to next batch
- } else {
- target_per_batch - batch_bp
- };
- let seg_len = remaining_in_ctg.min(remaining_for_batch);
- let end = start + seg_len - 1;
- let region = format!("{ctg}:{start}-{end}");
- batches[batch_idx].push(region);
- batch_bp += seg_len;
- start = end + 1;
- if batch_bp >= target_per_batch && batch_idx + 1 < n {
- batch_idx += 1;
- batch_bp = 0;
- }
- }
- }
- batches
- }
- /// Indicates which alignment segment appears first on the read.
- #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
- pub enum SegmentOrder {
- /// Primary alignment appears before supplementary on the read
- PrimaryFirst,
- /// Supplementary alignment appears before primary on the read
- SuppFirst,
- }
- /// Fold-back inversion event detected from a primary/supplementary alignment pair.
- ///
- /// A fold-back inversion occurs when a single read produces two alignments on the same
- /// chromosome with opposite strand orientations. The read travels forward (alignment A),
- /// then folds back and runs in reverse (alignment B), covering the same reference region
- /// twice — producing a signature identical to an **inverted duplication**.
- ///
- /// SA tag parsing is handled by [`parse_sa_entries`] / [`SaEntry`], which perform the
- /// 1-based → 0-based coordinate conversion required by the SAM specification.
- ///
- /// # Coordinate System
- ///
- /// - All coordinates are 0-based, half-open `[start, end)`
- /// - Alignments A and B are ordered by their position on the read (A comes first)
- ///
- /// # Breakpoints
- ///
- /// `(a, b, c, d)` are alignment termini in **read order** (5′→3′ along the read).
- /// `b` is the **inner** (junction-proximal) breakpoint of A; `c` is the **inner**
- /// breakpoint of B. `a` and `d` are the **outer** (junction-distal) endpoints.
- ///
- /// For each strand, the 5′ end of the read maps to a different reference position:
- ///
- /// | Strand | 5′ terminus | 3′ terminus |
- /// |--------|-------------|-------------|
- /// | `+` | `start` (leftmost ref pos) | `end` (rightmost ref pos) |
- /// | `−` | `end` (rightmost ref pos) | `start` (leftmost ref pos) |
- ///
- /// When A comes first on the read: A's 3′ terminus (`b`) and B's 5′ terminus (`c`)
- /// are the **inner** (junction-proximal) breakpoints. A's 5′ (`a`) and B's 3′ (`d`)
- /// are the **outer** endpoints.
- ///
- /// ```text
- /// Case: A on '+' strand, B on '−' strand
- ///
- /// a b
- /// | |
- /// v v
- /// Reference:---[=====A=================>]---------
- /// --------[<=============B======]-------
- /// ^ ^
- /// | |
- /// d c
- ///
- /// Alignment A (+ strand): a = aln_a_start (5' of A, outer)
- /// b = aln_a_end (3' of A, inner — junction-proximal)
- /// Alignment B (− strand): c = aln_b_end (5' of B in read order, inner — junction-proximal)
- /// d = aln_b_start (3' of B in read order, outer)
- ///
- /// Distances:
- /// aln_a_len = |a − b| = alignment A length on reference
- /// aln_b_len = |c − d| = alignment B length on reference
- /// b_c = |b − c| = inner breakpoint distance (junction gap between A and B)
- /// a_d = |a − d| = outer span of the fold-back structure
- /// ```
- ///
- /// The region `[d, b]` (where `d < b` in the canonical case) is covered by both
- /// alignments and constitutes the **inverted duplication**. For a perfect duplication
- /// `b_c = 0` and `a_d = 0`. Events with `a_d < 150 bp` are typically classified as
- /// **sequencing artefacts** (short hairpin/palindrome) rather than true structural variants.
- ///
- /// # Divergence from reference implementation
- ///
- /// The reference Python script requires the two alignments to **overlap** on the reference
- /// (`overlap > 0 bp`). This implementation is more permissive: it also accepts a small
- /// reference-space **gap** between the alignments (controlled by `max_gap` in
- /// [`fb_inv_from_record`]). Set `max_gap = Some(0)` to match the strict Python behaviour,
- /// which requires overlap because the artefact mechanism — ssDNA forming a hairpin and
- /// re-translocating back through the nanopore — guarantees both alignments cover the
- /// same physical DNA strand.
- ///
- /// # References
- ///
- /// Geometry, breakpoint labelling, and the `a_d < 150 bp` artefact threshold are adapted from:
- /// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
- #[derive(Debug)]
- pub struct FbInv {
- /// Read name (QNAME from BAM)
- pub read_name: String,
- /// Chromosome/contig name
- pub ref_name: String,
- /// Alignment A reference start (0-based)
- pub aln_a_start: i64,
- /// Alignment A reference end (exclusive)
- pub aln_a_end: i64,
- /// Alignment A strand ('+' or '-')
- pub aln_a_strand: char,
- /// Alignment B reference start (0-based)
- pub aln_b_start: i64,
- /// Alignment B reference end (exclusive)
- pub aln_b_end: i64,
- /// Alignment B strand ('+' or '-')
- pub aln_b_strand: char,
- /// Alignment A length on reference (bp)
- pub aln_a_len: i64,
- /// Alignment B length on reference (bp)
- pub aln_b_len: i64,
- /// `|b - c|`: read-order junction gap — distance between the 3' end of A and the 5' end of B
- pub b_c: i64,
- /// `|a - d|`: read-order outer span — distance between the 5' end of A and the 3' end of B
- pub a_d: i64,
- /// Which alignment appears first on the read
- pub first: SegmentOrder,
- /// Primary alignment query start on read
- pub primary_qbeg: u32,
- /// Primary alignment query end on read
- pub primary_qend: u32,
- /// Supplementary alignment query start on read
- pub sa_qbeg: u32,
- /// Supplementary alignment query end on read
- pub sa_qend: u32,
- }
- /// Parse a SAM CIGAR string into a vector of [`Cigar`] operations.
- ///
- /// All standard CIGAR operations are supported: `M`, `I`, `D`, `N`, `S`, `H`, `P`, `=`, `X`.
- ///
- /// # Arguments
- ///
- /// * `s` - CIGAR string, e.g. `"10S100M5I50M20S"`
- ///
- /// # Returns
- ///
- /// `Some(ops)` on success, `None` if the string is empty, contains an unknown operation
- /// character, has an operator without a preceding length, or ends with trailing digits.
- pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
- if s.is_empty() {
- return None;
- }
- // Pre-allocate: rough estimate ~1 op per 2-3 chars
- let mut ops = Vec::with_capacity(s.len() / 2);
- let mut num = String::new();
- for c in s.chars() {
- if c.is_ascii_digit() {
- num.push(c);
- continue;
- }
- // We hit an operator letter: parse the accumulated number
- if num.is_empty() {
- return None; // operator without length
- }
- let len: u32 = num.parse().ok()?;
- num.clear();
- let op = match c {
- 'M' => Cigar::Match(len),
- 'I' => Cigar::Ins(len),
- 'D' => Cigar::Del(len),
- 'N' => Cigar::RefSkip(len),
- 'S' => Cigar::SoftClip(len),
- 'H' => Cigar::HardClip(len),
- 'P' => Cigar::Pad(len),
- '=' => Cigar::Equal(len),
- 'X' => Cigar::Diff(len),
- _ => return None, // unknown operator
- };
- ops.push(op);
- }
- // CIGAR ending with digits only is invalid
- if !num.is_empty() {
- return None;
- }
- // Validation: CIGAR must have at least one operation
- if ops.is_empty() {
- return None;
- }
- Some(ops)
- }
- /// Compute `(query_begin, query_end)` on the read from CIGAR operations.
- ///
- /// For `+` strand the first CIGAR op is at the read start; for `-` strand the last op
- /// is at the read start (CIGAR is encoded in reference order regardless of strand).
- /// Hard-clipped bases are excluded; soft-clip length sets `query_begin`.
- /// Only query-consuming operations (`M`, `I`, `=`, `X`) contribute to the alignment length.
- ///
- /// # Arguments
- ///
- /// * `ops` - Slice of CIGAR operations
- /// * `strand` - `'+'` or `'-'`
- ///
- /// # Returns
- ///
- /// `(query_begin, query_end)` as read-coordinate offsets (0-based, hard clips excluded).
- /// Returns `(0, 0)` for an empty slice.
- fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
- if ops.is_empty() {
- return (0, 0);
- }
- let query_beg = if strand == '+' {
- terminal_soft_clip(ops.iter())
- } else {
- terminal_soft_clip(ops.iter().rev())
- };
- // Alignment length on the read (consumes query bases)
- let aln_len: u32 = ops
- .iter()
- .filter_map(|op| match op {
- Cigar::Match(l) | Cigar::Ins(l) | Cigar::Equal(l) | Cigar::Diff(l) => Some(*l),
- _ => None,
- })
- .sum();
- (query_beg, query_beg + aln_len)
- }
- fn terminal_soft_clip<'a>(mut ops: impl Iterator<Item = &'a Cigar>) -> u32 {
- ops.find_map(|op| match op {
- Cigar::HardClip(_) | Cigar::Pad(_) => None,
- Cigar::SoftClip(len) => Some(*len),
- _ => Some(0),
- })
- .unwrap_or(0)
- }
- /// Compute the reference-consuming length of a CIGAR.
- ///
- /// Sums lengths of operations that advance the reference coordinate:
- /// `M`, `D`, `N`, `=`, `X`. Insertions, soft clips, hard clips, and pads are excluded.
- ///
- /// # Arguments
- ///
- /// * `ops` - Slice of CIGAR operations
- ///
- /// # Returns
- ///
- /// Reference-consuming length in base pairs. Returns `0` for an empty slice.
- fn alignment_ref_length(ops: &[Cigar]) -> u32 {
- ops.iter()
- .filter_map(|op| match op {
- Cigar::Match(l)
- | Cigar::Del(l)
- | Cigar::RefSkip(l)
- | Cigar::Equal(l)
- | Cigar::Diff(l) => Some(*l),
- _ => None,
- })
- .sum()
- }
- /// Compute the gap in base pairs between two half-open intervals.
- ///
- /// Both intervals are `[start, end)`. A negative return value indicates that the
- /// intervals overlap by that many bases; zero means they are adjacent; positive means
- /// they are separated.
- ///
- /// # Arguments
- ///
- /// * `a_start`, `a_end` - First interval `[a_start, a_end)`
- /// * `b_start`, `b_end` - Second interval `[b_start, b_end)`
- ///
- /// # Returns
- ///
- /// `max(starts) - min(ends)`: negative for overlap, 0 for adjacency, positive for gap.
- #[inline]
- fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 {
- a_start.max(b_start) - a_end.min(b_end)
- }
- /// Return `(5'_terminus, 3'_terminus)` in read order for a single alignment.
- ///
- /// For `+` strand the 5' end maps to the leftmost reference position (`start`) and the
- /// 3' end to the rightmost (`end`). For `-` strand the read runs right-to-left on the
- /// reference, so 5' = `end` and 3' = `start`.
- ///
- /// # Arguments
- ///
- /// * `start` - Alignment start, 0-based inclusive
- /// * `end` - Alignment end, 0-based exclusive
- /// * `strand` - `'+'` or `'-'`
- ///
- /// # Returns
- ///
- /// `(5'_coord, 3'_coord)` where both are 0-based reference positions.
- #[inline]
- fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
- if strand == '+' {
- (start, end)
- } else {
- (end, start)
- }
- }
- /// Attempt to extract a fold-back inversion event from a primary BAM record.
- ///
- /// Inspects the record's SA tag using [`parse_sa_entries`] and checks whether the single
- /// supplementary alignment forms a fold-back inversion with the primary: same chromosome,
- /// opposite strand, and reference coordinates within the specified proximity thresholds.
- /// Returns a populated [`FbInv`] with all breakpoint metrics on success.
- ///
- /// Alignments A and B in the returned struct are ordered by their position on the read
- /// (A comes first). All coordinates are 0-based half-open; the 1-based SA tag `pos` is
- /// converted automatically by [`SaEntry`].
- ///
- /// Recommended thresholds for **artefact detection**: `max_overlap = Some(150)`,
- /// `max_gap = Some(0)`. Requiring overlap (`max_gap = Some(0)`) matches the reference
- /// Python implementation and avoids capturing true SV junctions that happen to have
- /// opposite-strand supplementary alignments with a reference-space gap.
- ///
- /// # Arguments
- ///
- /// * `record` - BAM record to evaluate; must be a primary alignment
- /// * `header` - BAM header used to resolve the reference name from `tid`
- /// * `min_mapq` - Minimum mapping quality required for both the primary and the supplementary
- /// * `max_overlap` - Maximum allowed reference-space overlap in bp (`None` = no limit)
- /// * `max_gap` - Maximum allowed reference-space gap in bp (`None` = no limit)
- ///
- /// # Returns
- ///
- /// `Some(FbInv)` if all criteria are met, `None` otherwise.
- ///
- /// Returns `None` when any of the following conditions apply:
- /// - Record is unmapped, secondary, or supplementary
- /// - Primary or supplementary MAPQ is below `min_mapq`
- /// - SA tag is absent or contains more than one entry
- /// - Supplementary aligns to a different chromosome
- /// - Both alignments are on the same strand
- /// - Reference-space overlap exceeds `max_overlap`
- /// - Reference-space gap exceeds `max_gap`
- ///
- /// # References
- ///
- /// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
- pub fn fb_inv_from_record(
- record: &bam::Record,
- header: &bam::HeaderView,
- min_mapq: u8,
- max_overlap: Option<i64>,
- max_gap: Option<i64>,
- ) -> Option<FbInv> {
- // Basic filters (primary only, mapped, MAPQ)
- if record.is_unmapped()
- || record.is_secondary()
- || record.is_supplementary()
- || record.mapq() < min_mapq
- {
- return None;
- }
- // SA tag required; exactly one supplementary alignment expected
- let sa_str = match record.aux(b"SA") {
- Ok(Aux::String(s)) => s,
- _ => return None,
- };
- let sa_entries = parse_sa_entries(sa_str);
- if sa_entries.len() != 1 {
- return None;
- }
- let sa = &sa_entries[0];
- if sa.mapq < min_mapq {
- return None;
- }
- // Same chromosome: compare SA rname with primary reference name
- let tid = record.tid();
- if tid < 0 {
- return None;
- }
- let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?;
- if sa.chr != primary_rname {
- return None;
- }
- // Strand check: need opposite strands for fold-back inversion
- let primary_strand_char = if record.is_reverse() { '-' } else { '+' };
- if primary_strand_char == sa.strand {
- return None;
- }
- // Primary ref coords (0-based, half-open)
- let primary_start = record.pos();
- let primary_end = record.cigar().end_pos();
- // Parse SA CIGAR; pos is already 0-based from SaEntry
- let sa_cigar_ops = parse_cigar_str(sa.cigar)?;
- let sa_ref_len = alignment_ref_length(&sa_cigar_ops);
- if sa_ref_len == 0 {
- return None;
- }
- let sa_start = sa.pos as i64;
- let sa_end = sa_start + sa_ref_len as i64;
- // Check proximity between alignments
- // gap < 0: overlap of |gap| bp
- // gap = 0: adjacent (touching)
- // gap > 0: separated by gap bp
- let gap = interval_gap(primary_start, primary_end, sa_start, sa_end);
- // Check max_overlap: reject if overlap exceeds limit
- if let Some(max_ovl) = max_overlap {
- if gap < 0 && gap.abs() > max_ovl {
- return None;
- }
- }
- // Check max_gap: reject if gap exceeds limit
- if let Some(max_g) = max_gap {
- if gap > max_g {
- return None;
- }
- }
- // Query coords for primary and SA
- let primary_cigar: Vec<Cigar> = record.cigar().iter().cloned().collect();
- let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char);
- let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa.strand);
- // Decide which segment is first on the read
- let first = if primary_qbeg <= sa_qbeg {
- SegmentOrder::PrimaryFirst
- } else {
- SegmentOrder::SuppFirst
- };
- // Build FbInv with A/B ordered by read position
- let (aln_a_start, aln_a_end, aln_a_strand, aln_b_start, aln_b_end, aln_b_strand) =
- if first == SegmentOrder::PrimaryFirst {
- (
- primary_start,
- primary_end,
- primary_strand_char,
- sa_start,
- sa_end,
- sa.strand,
- )
- } else {
- (
- sa_start,
- sa_end,
- sa.strand,
- primary_start,
- primary_end,
- primary_strand_char,
- )
- };
- // Distances a_b, c_d, b_c, a_d (like the Python class)
- let (a, b) = breakpoints(aln_a_start, aln_a_end, aln_a_strand);
- let (c, d) = breakpoints(aln_b_start, aln_b_end, aln_b_strand);
- let aln_a_len = (a - b).abs();
- let aln_b_len = (c - d).abs();
- let b_c = (b - c).abs();
- let a_d = (a - d).abs();
- let read_name = std::str::from_utf8(record.qname())
- .map(|s| s.to_string())
- .unwrap_or_else(|_| String::from_utf8_lossy(record.qname()).into_owned());
- let ref_name = primary_rname.to_string();
- Some(FbInv {
- read_name,
- ref_name,
- aln_a_start,
- aln_a_end,
- aln_a_strand,
- aln_b_start,
- aln_b_end,
- aln_b_strand,
- aln_a_len,
- aln_b_len,
- b_c,
- a_d,
- first,
- primary_qbeg,
- primary_qend,
- sa_qbeg,
- sa_qend,
- })
- }
- /// Read the `SM` tag from a BAM's `@RG` lines, injecting it if absent or inconsistent.
- ///
- /// Inspects every `@RG` line in the header. If all lines carry an identical `SM` value,
- /// that value is returned. If any line is missing `SM`, or if multiple distinct `SM`
- /// values are present, `samtools reheader` is invoked to set `SM` uniformly to
- /// `fallback_sample`. The file modification time is preserved after reheadering.
- ///
- /// # Arguments
- ///
- /// * `bam_path` - Path to the BAM file (must be writable if reheadering is needed)
- /// * `fallback_sample` - Sample name to inject when the header is missing or ambiguous
- /// * `config` - Pipeline configuration used to invoke [`SamtoolsReheader`]
- ///
- /// # Returns
- ///
- /// The `SM` value present in (or injected into) the header.
- ///
- /// # Errors
- ///
- /// Returns an error if the BAM cannot be opened, if reheadering fails, or if the
- /// header contains no `@RG` lines at all.
- pub fn read_sm_tag_or_inject(
- bam_path: &str,
- fallback_sample: &str,
- config: &Config,
- ) -> anyhow::Result<String> {
- let reader = bam::Reader::from_path(bam_path)
- .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
- let header = bam::Header::from_template(reader.header());
- let header_text = String::from_utf8_lossy(&header.to_bytes()).into_owned();
- let mut sm_values: Vec<String> = Vec::new();
- let mut all_have_sm = true;
- for line in header_text.lines() {
- if line.starts_with("@RG") {
- match line
- .split('\t')
- .find_map(|f| f.strip_prefix("SM:").map(|s| s.to_string()))
- {
- Some(sm) => sm_values.push(sm),
- None => all_have_sm = false,
- }
- }
- }
- // Reheader if any @RG lacks SM or if multiple distinct SM values exist.
- // No @RG lines at all falls through to the error at the end.
- let unique_sm: std::collections::HashSet<&str> = sm_values.iter().map(|s| s.as_str()).collect();
- let needs_reheader = !all_have_sm || unique_sm.len() > 1;
- if needs_reheader {
- if !all_have_sm {
- info!("Some @RG lines in {bam_path} lack SM tag — injecting SM:{fallback_sample}");
- }
- if unique_sm.len() > 1 {
- info!(
- "Multiple distinct SM values in {bam_path} ({:?}) — collapsing to SM:{fallback_sample}",
- unique_sm
- );
- }
- let original_mtime =
- filetime::FileTime::from_last_modification_time(&std::fs::metadata(bam_path)?);
- let mut reheader = SamtoolsReheader::from_config(config, bam_path, fallback_sample);
- reheader.run()?;
- // let mut index = SamtoolsIndex::from_config(config, bam_path);
- // index.run()?;
- filetime::set_file_mtime(bam_path, original_mtime)?;
- return Ok(fallback_sample.to_string());
- }
- // All @RG lines present and SM is uniform — return the single value
- sm_values
- .into_iter()
- .next()
- .context(format!("No @RG lines found in {bam_path}"))
- }
- /// Fetch primary alignment records from an indexed BAM, with optional region and name filtering.
- ///
- /// Supplementary alignments within the fetch region are resolved to their primaries via
- /// [`primary_records`]. Duplicates introduced by that resolution step are removed by
- /// query name. Secondary alignments are always excluded.
- ///
- /// # Arguments
- ///
- /// * `bam_path` - Path to an indexed BAM file (`.bai` index must exist)
- /// * `region` - If `Some((chr, start, end))`, restrict the fetch to that 0-based half-open
- /// interval; if `None`, the whole file is scanned
- /// * `qname_filter` - Optional [`QnameFilter`] to retain or exclude specific read names
- ///
- /// # Returns
- ///
- /// A deduplicated vector of primary records, optionally filtered by query name.
- ///
- /// # Errors
- ///
- /// Returns an error if the BAM cannot be opened or if the fetch operation fails.
- pub fn fetch_primary_records(
- bam_path: &Path,
- region: Option<(&str, i64, i64)>,
- qname_filter: Option<QnameFilter>,
- ) -> anyhow::Result<Vec<bam::Record>> {
- let mut reader = bam::IndexedReader::from_path(bam_path)
- .with_context(|| format!("Cannot open BAM: {}", bam_path.display()))?;
- if let Some((chr, start, end)) = region {
- reader
- .fetch((chr, start, end))
- .with_context(|| format!("Fetch failed for {}:{}-{}", chr, start, end))?;
- } else {
- reader
- .fetch(FetchDefinition::All)
- .context("Fetch all failed")?;
- }
- let records: Vec<bam::Record> = reader
- .records()
- .filter_map(|r| r.ok())
- .filter(|r| !r.is_secondary())
- .collect();
- // Resolve supplementary → primary via SA tag
- let records = primary_records(&mut reader, records);
- // Deduplicate by qname (primary_records may introduce duplicates
- // if a primary was already in the fetch region)
- let mut seen = HashSet::new();
- let mut records: Vec<bam::Record> = records
- .into_iter()
- .filter(|r| seen.insert(r.qname().to_vec()))
- .collect();
- // Apply qname filter
- if let Some(filter) = qname_filter {
- match filter {
- QnameFilter::Whitelist(names) => {
- records.retain(|r| names.contains(r.qname()));
- }
- QnameFilter::Blacklist(names) => {
- records.retain(|r| !names.contains(r.qname()));
- }
- }
- }
- Ok(records)
- }
- /// Query-name filter for [`fetch_primary_records`].
- pub fn query_aligned_span(record: &rust_htslib::bam::Record) -> (usize, usize) {
- let cigar = record.cigar();
- let qstart = cigar
- .iter()
- .take_while(|c| matches!(c, Cigar::SoftClip(_) | Cigar::HardClip(_)))
- .filter_map(|c| match c {
- Cigar::SoftClip(len) => Some(*len as usize),
- _ => None,
- })
- .sum::<usize>();
- let qend = record.seq_len()
- - cigar
- .iter()
- .rev()
- .take_while(|c| matches!(c, Cigar::SoftClip(_) | Cigar::HardClip(_)))
- .filter_map(|c| match c {
- Cigar::SoftClip(len) => Some(*len as usize),
- _ => None,
- })
- .sum::<usize>();
- (qstart, qend)
- }
- pub enum QnameFilter {
- /// Retain only records whose query name is in the set.
- Whitelist(HashSet<Vec<u8>>),
- /// Exclude records whose query name is in the set.
- Blacklist(HashSet<Vec<u8>>),
- }
- /// Convert a BAM/CRAM file to a BED4 file with one interval per aligned record.
- ///
- /// Each output line has the form `chrom\tstart\tend\tread_name`. Coordinates are
- /// 0-based half-open (`start` from [`Record::pos`], `end` from [`Record::reference_end`]),
- /// consistent with BED convention. Unmapped records and records with a negative `tid`
- /// are skipped. Secondary and supplementary alignments are **not** filtered — add
- /// upstream filtering if stricter selection is needed.
- ///
- /// # Arguments
- ///
- /// * `input_bam` - Path to the input BAM or CRAM file
- /// * `output_bed` - Path to the output BED file (created or overwritten)
- ///
- /// # Errors
- ///
- /// Returns an error if the input cannot be read, the output cannot be written, or
- /// if UTF-8 conversion of a reference name or query name fails.
- pub fn bam_to_aligned_bed<P: AsRef<Path>>(input_bam: P, output_bed: P) -> anyhow::Result<()> {
- let mut bam = bam::Reader::from_path(input_bam)?;
- let header = bam.header().to_owned();
- let out = File::create(output_bed)?;
- let mut writer = std::io::BufWriter::new(out);
- for result in bam.records() {
- let record = result?;
- if record.is_unmapped() {
- continue;
- }
- let tid = record.tid();
- if tid < 0 {
- continue;
- }
- let chrom = std::str::from_utf8(header.tid2name(tid as u32))?;
- // BAM positions are already 0-based.
- let start = record.pos();
- // reference_end() accounts for CIGAR operations consuming reference:
- // M, D, N, =, X
- let end = record.reference_end();
- if end > start {
- let qname = std::str::from_utf8(record.qname())?;
- let (qstart, qend) = query_aligned_span(&record);
- writeln!(
- writer,
- "{chrom}\t{start}\t{end}\t{}:{}-{}",
- qname, qstart, qend
- )?;
- }
- }
- Ok(())
- }
- #[cfg(test)]
- mod tests {
- use super::*;
- use crate::helpers::test_init;
- #[test]
- fn sm_tag() -> anyhow::Result<()> {
- test_init();
- let config = Config::default();
- read_sm_tag_or_inject(&config.tumoral_bam("CHALO"), "CHALO_diag", &config)?;
- Ok(())
- }
- #[test]
- fn alignment_query_coords_counts_soft_clip_inside_hard_clip() {
- let ops = parse_cigar_str("5H10S90M").unwrap();
- assert_eq!(alignment_query_coords(&ops, '+'), (10, 100));
- let ops = parse_cigar_str("90M10S5H").unwrap();
- assert_eq!(alignment_query_coords(&ops, '-'), (10, 100));
- }
- }
|