use std::io::Write; use std::{ collections::{HashMap, HashSet}, fs::File, path::Path, }; use anyhow::Context; use log::{info, warn}; use rust_htslib::bam::{ self, ext::BamRecordExtensions, record::{Aux, Cigar}, FetchDefinition, IndexedReader, Read, Record, }; use crate::{commands::samtools::SamtoolsReheader, config::Config, runners::Run}; /// A single entry from a BAM SA (supplementary alignment) tag. /// /// SA tag format per the SAM spec: `rname,pos,strand,CIGAR,mapQ,NM` (semicolon-separated). /// `pos` is stored as 0-based after conversion from the 1-based SAM value. pub struct SaEntry<'a> { /// Reference/contig name pub chr: &'a str, /// Reference start position, 0-based pub pos: i32, /// Strand: `'+'` or `'-'` pub strand: char, /// CIGAR string (unparsed) pub cigar: &'a str, /// Mapping quality pub mapq: u8, /// Edit distance (NM tag value); `0` if absent or unparseable pub nm: u32, } /// Parse all entries from a BAM SA tag value into [`SaEntry`] structs. /// /// `pos` is converted from 1-based (SAM spec) to 0-based. Entries with fewer than /// six comma-separated fields or unparseable numeric fields are silently skipped. /// /// # Arguments /// /// * `sa` - Raw SA tag value (the string after `SA:Z:` in a SAM record) /// /// # Returns /// /// A vector of parsed entries. Empty if the tag contains no valid entries. pub fn parse_sa_entries(sa: &str) -> Vec> { sa.split(';') .filter(|s| !s.is_empty()) .filter_map(|s| { let f: Vec<&str> = s.splitn(6, ',').collect(); if f.len() < 6 { return None; } Some(SaEntry { chr: f[0], pos: f[1].parse::().ok()? - 1, strand: f[2].chars().next()?, cigar: f[3], mapq: f[4].parse().ok()?, nm: f[5].parse().unwrap_or(0), }) }) .collect() } /// Convenience wrapper over [`parse_sa_entries`] returning only `(chr, pos)` pairs. /// /// `pos` is 0-based. Useful when only fetch coordinates are needed. pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> { parse_sa_entries(sa) .into_iter() .map(|e| (e.chr, e.pos)) .collect() } /// Resolve a supplementary alignment to its primary record via the SA tag. /// /// For supplementary alignments, each position listed in the SA tag is fetched and /// scanned for a non-supplementary record with the same query name. The first match /// is returned. If the record is not supplementary, or no primary is found across all /// SA positions, the original record is returned unchanged. /// /// # Arguments /// /// * `bam` - Indexed BAM reader used for random-access fetches /// * `record` - The record to resolve (typically a supplementary alignment) /// /// # Returns /// /// The primary alignment record, or the input record if resolution is not applicable /// or unsuccessful. /// /// # Errors /// /// Returns an error if a BAM fetch operation fails or if a fetched record is malformed. pub fn primary_record(bam: &mut IndexedReader, record: Record) -> anyhow::Result { if !record.is_supplementary() { return Ok(record); } let qname = record.qname(); if let Ok(Aux::String(sa)) = record.aux(b"SA") { for (chr, start) in parse_sa_tag(sa) { bam.fetch((chr, start, start + 1)) .with_context(|| format!("BAM fetch failed at {chr}:{start}"))?; for result in bam.records() { let candidate = result.context("Invalid BAM record")?; if candidate.qname() == qname && !candidate.is_supplementary() && !candidate.is_secondary() { return Ok(candidate); } } } } Ok(record) } /// Group 0-based positions by chromosome into bins spanning at most 1000 bp, sorted by bin size descending. /// /// Positions within 1000 bp of the first position in the current bin are merged into /// that bin. Larger bins are sorted first so that high-density regions are fetched /// before sparse ones, enabling earlier early-exit in callers. fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec> { let mut sorted_positions = positions.to_vec(); sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos)); sorted_positions.dedup(); let mut grouped: HashMap<&str, Vec>> = HashMap::new(); for (chr, pos) in sorted_positions { let bins = grouped.entry(chr).or_default(); if let Some(last_bin) = bins.last_mut() { if last_bin.is_empty() || pos - last_bin[0].1 <= 1000 { last_bin.push((chr, pos)); } else { bins.push(vec![(chr, pos)]); } } else { bins.push(vec![(chr, pos)]); } } let mut flattened: Vec> = grouped.values().flatten().cloned().collect(); flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len())); flattened } /// Resolve supplementary alignments in a record set to their primary records. /// /// Primary and secondary records are separated in a first pass. For supplementary /// records with an SA tag, the SA positions are binned by chromosome and fetched in /// bulk. Fetching stops as soon as all target query names have been found (early exit). /// Records that cannot be resolved are silently dropped with a warning. /// /// # Arguments /// /// * `bam` - Indexed BAM reader used for random-access fetches /// * `records` - Mixed set of primary, secondary, and supplementary records /// /// # Returns /// /// A vector containing original non-supplementary records from the input plus any /// primaries fetched for supplementary inputs. Secondary records in the input are /// passed through unchanged; secondary records encountered during SA-position fetching /// are skipped. Order is not guaranteed. pub fn primary_records(bam: &mut IndexedReader, records: Vec) -> Vec { let mut res = Vec::with_capacity(records.len()); let mut all_positions = Vec::new(); let mut all_qnames_to_fetch = HashSet::new(); // First pass: collect primary records and positions to fetch for record in records.iter() { if record.is_supplementary() { let qname = record.qname(); // Safely extract and process the SA tag if let Ok(Aux::String(sa)) = record.aux(b"SA") { let positions = parse_sa_tag(sa); all_positions.extend(positions); match String::from_utf8(qname.to_vec()) { Ok(qname_str) => { all_qnames_to_fetch.insert(qname_str); } Err(e) => warn!("Invalid UTF-8 in qname: {}", e), } } } else { res.push(record.clone()); } } // Track which query names we've already found let mut primary_records: HashSet = res .iter() .filter_map(|r| String::from_utf8(r.qname().to_vec()).ok()) .collect(); // Create position bins for efficient fetching let position_bins = create_position_bins(&all_positions); // Fetch records for each bin until we find all primaries for bin in position_bins { if bin.is_empty() { continue; } let (chr, start) = bin.first().unwrap(); let end = bin.last().unwrap().1 + 1; // Fetch and process records in this region if let Err(e) = bam.fetch((*chr, *start, end)) { warn!("Failed to fetch region {}:{}-{}: {}", chr, start, end, e); continue; } for record_result in bam.records() { match record_result { Ok(record) => { // Skip secondary alignments if record.is_secondary() { continue; } // Check if this is a primary we're looking for match String::from_utf8(record.qname().to_vec()) { Ok(qname) => { if primary_records.contains(&qname) { continue; } if all_qnames_to_fetch.contains(&qname) { res.push(record); primary_records.insert(qname); } } Err(_) => continue, } } Err(e) => warn!("Error reading record: {}", e), } } // Early exit if we found all the records we were looking for let remaining = all_qnames_to_fetch .iter() .filter(|q| !primary_records.contains(*q)) .count(); if remaining == 0 { break; } } res } /// Extract contig names and lengths from a BAM header. /// /// Reads all `@SQ` lines and returns a map of sequence name (`SN`) to sequence /// length (`LN`). /// /// # Arguments /// /// * `header` - BAM header to parse /// /// # Returns /// /// A map from contig name to length in base pairs. /// /// # Errors /// /// Returns an error if the `LN` field of any `@SQ` line cannot be parsed as `u64`. pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result> { let mut genome = HashMap::new(); for (key, records) in header.to_hashmap() { for record in records { if key == "SQ" { genome.insert( record["SN"].to_string(), record["LN"].parse::().with_context(|| { format!("Failed to parse LN for contig {}", record["SN"]) })?, ); } } } Ok(genome) } /// Partition a genome into `n` balanced batches of samtools-style region strings. /// /// Contigs are sorted deterministically and sliced into segments so that each batch /// covers approximately `total_bp / n` bases. A contig may be split across batches. /// Region strings use 1-based inclusive coordinates (`ctg:start-end`) as expected /// by samtools `-r`. /// /// # Arguments /// /// * `genome` - Map of contig name to length in bp, typically from [`get_genome_sizes`] /// * `n` - Number of batches to produce /// /// # Returns /// /// A vector of `n` batches, each containing the region strings assigned to that batch. /// Returns an empty vector if `n` is zero or `genome` is empty. pub fn split_genome_into_n_regions(genome: &HashMap, n: usize) -> Vec> { if n == 0 || genome.is_empty() { return Vec::new(); } // Deterministic order over contigs let mut contigs: Vec<_> = genome.iter().collect(); contigs.sort_by(|(a, _), (b, _)| a.cmp(b)); let total_bp: u64 = contigs.iter().map(|(_, &len)| len).sum(); let target_per_batch: u64 = total_bp.div_ceil(n as u64); // ceil let mut batches: Vec> = vec![Vec::new(); n]; let mut batch_idx = 0usize; let mut batch_bp = 0u64; for (ctg, &len) in contigs { let mut start: u64 = 1; while start <= len && batch_idx < n { let remaining_in_ctg = len - start + 1; let remaining_for_batch = if batch_bp >= target_per_batch { 1 // force close & move to next batch } else { target_per_batch - batch_bp }; let seg_len = remaining_in_ctg.min(remaining_for_batch); let end = start + seg_len - 1; let region = format!("{ctg}:{start}-{end}"); batches[batch_idx].push(region); batch_bp += seg_len; start = end + 1; if batch_bp >= target_per_batch && batch_idx + 1 < n { batch_idx += 1; batch_bp = 0; } } } batches } /// Indicates which alignment segment appears first on the read. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum SegmentOrder { /// Primary alignment appears before supplementary on the read PrimaryFirst, /// Supplementary alignment appears before primary on the read SuppFirst, } /// Fold-back inversion event detected from a primary/supplementary alignment pair. /// /// A fold-back inversion occurs when a single read produces two alignments on the same /// chromosome with opposite strand orientations. The read travels forward (alignment A), /// then folds back and runs in reverse (alignment B), covering the same reference region /// twice — producing a signature identical to an **inverted duplication**. /// /// SA tag parsing is handled by [`parse_sa_entries`] / [`SaEntry`], which perform the /// 1-based → 0-based coordinate conversion required by the SAM specification. /// /// # Coordinate System /// /// - All coordinates are 0-based, half-open `[start, end)` /// - Alignments A and B are ordered by their position on the read (A comes first) /// /// # Breakpoints /// /// `(a, b, c, d)` are alignment termini in **read order** (5′→3′ along the read). /// `b` is the **inner** (junction-proximal) breakpoint of A; `c` is the **inner** /// breakpoint of B. `a` and `d` are the **outer** (junction-distal) endpoints. /// /// For each strand, the 5′ end of the read maps to a different reference position: /// /// | Strand | 5′ terminus | 3′ terminus | /// |--------|-------------|-------------| /// | `+` | `start` (leftmost ref pos) | `end` (rightmost ref pos) | /// | `−` | `end` (rightmost ref pos) | `start` (leftmost ref pos) | /// /// When A comes first on the read: A's 3′ terminus (`b`) and B's 5′ terminus (`c`) /// are the **inner** (junction-proximal) breakpoints. A's 5′ (`a`) and B's 3′ (`d`) /// are the **outer** endpoints. /// /// ```text /// Case: A on '+' strand, B on '−' strand /// /// a b /// | | /// v v /// Reference:---[=====A=================>]--------- /// --------[<=============B======]------- /// ^ ^ /// | | /// d c /// /// Alignment A (+ strand): a = aln_a_start (5' of A, outer) /// b = aln_a_end (3' of A, inner — junction-proximal) /// Alignment B (− strand): c = aln_b_end (5' of B in read order, inner — junction-proximal) /// d = aln_b_start (3' of B in read order, outer) /// /// Distances: /// aln_a_len = |a − b| = alignment A length on reference /// aln_b_len = |c − d| = alignment B length on reference /// b_c = |b − c| = inner breakpoint distance (junction gap between A and B) /// a_d = |a − d| = outer span of the fold-back structure /// ``` /// /// The region `[d, b]` (where `d < b` in the canonical case) is covered by both /// alignments and constitutes the **inverted duplication**. For a perfect duplication /// `b_c = 0` and `a_d = 0`. Events with `a_d < 150 bp` are typically classified as /// **sequencing artefacts** (short hairpin/palindrome) rather than true structural variants. /// /// # Divergence from reference implementation /// /// The reference Python script requires the two alignments to **overlap** on the reference /// (`overlap > 0 bp`). This implementation is more permissive: it also accepts a small /// reference-space **gap** between the alignments (controlled by `max_gap` in /// [`fb_inv_from_record`]). Set `max_gap = Some(0)` to match the strict Python behaviour, /// which requires overlap because the artefact mechanism — ssDNA forming a hairpin and /// re-translocating back through the nanopore — guarantees both alignments cover the /// same physical DNA strand. /// /// # References /// /// Geometry, breakpoint labelling, and the `a_d < 150 bp` artefact threshold are adapted from: /// #[derive(Debug)] pub struct FbInv { /// Read name (QNAME from BAM) pub read_name: String, /// Chromosome/contig name pub ref_name: String, /// Alignment A reference start (0-based) pub aln_a_start: i64, /// Alignment A reference end (exclusive) pub aln_a_end: i64, /// Alignment A strand ('+' or '-') pub aln_a_strand: char, /// Alignment B reference start (0-based) pub aln_b_start: i64, /// Alignment B reference end (exclusive) pub aln_b_end: i64, /// Alignment B strand ('+' or '-') pub aln_b_strand: char, /// Alignment A length on reference (bp) pub aln_a_len: i64, /// Alignment B length on reference (bp) pub aln_b_len: i64, /// `|b - c|`: read-order junction gap — distance between the 3' end of A and the 5' end of B pub b_c: i64, /// `|a - d|`: read-order outer span — distance between the 5' end of A and the 3' end of B pub a_d: i64, /// Which alignment appears first on the read pub first: SegmentOrder, /// Primary alignment query start on read pub primary_qbeg: u32, /// Primary alignment query end on read pub primary_qend: u32, /// Supplementary alignment query start on read pub sa_qbeg: u32, /// Supplementary alignment query end on read pub sa_qend: u32, } /// Parse a SAM CIGAR string into a vector of [`Cigar`] operations. /// /// All standard CIGAR operations are supported: `M`, `I`, `D`, `N`, `S`, `H`, `P`, `=`, `X`. /// /// # Arguments /// /// * `s` - CIGAR string, e.g. `"10S100M5I50M20S"` /// /// # Returns /// /// `Some(ops)` on success, `None` if the string is empty, contains an unknown operation /// character, has an operator without a preceding length, or ends with trailing digits. pub fn parse_cigar_str(s: &str) -> Option> { if s.is_empty() { return None; } // Pre-allocate: rough estimate ~1 op per 2-3 chars let mut ops = Vec::with_capacity(s.len() / 2); let mut num = String::new(); for c in s.chars() { if c.is_ascii_digit() { num.push(c); continue; } // We hit an operator letter: parse the accumulated number if num.is_empty() { return None; // operator without length } let len: u32 = num.parse().ok()?; num.clear(); let op = match c { 'M' => Cigar::Match(len), 'I' => Cigar::Ins(len), 'D' => Cigar::Del(len), 'N' => Cigar::RefSkip(len), 'S' => Cigar::SoftClip(len), 'H' => Cigar::HardClip(len), 'P' => Cigar::Pad(len), '=' => Cigar::Equal(len), 'X' => Cigar::Diff(len), _ => return None, // unknown operator }; ops.push(op); } // CIGAR ending with digits only is invalid if !num.is_empty() { return None; } // Validation: CIGAR must have at least one operation if ops.is_empty() { return None; } Some(ops) } /// Compute `(query_begin, query_end)` on the read from CIGAR operations. /// /// For `+` strand the first CIGAR op is at the read start; for `-` strand the last op /// is at the read start (CIGAR is encoded in reference order regardless of strand). /// Hard-clipped bases are excluded; soft-clip length sets `query_begin`. /// Only query-consuming operations (`M`, `I`, `=`, `X`) contribute to the alignment length. /// /// # Arguments /// /// * `ops` - Slice of CIGAR operations /// * `strand` - `'+'` or `'-'` /// /// # Returns /// /// `(query_begin, query_end)` as read-coordinate offsets (0-based, hard clips excluded). /// Returns `(0, 0)` for an empty slice. fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) { if ops.is_empty() { return (0, 0); } let query_beg = if strand == '+' { terminal_soft_clip(ops.iter()) } else { terminal_soft_clip(ops.iter().rev()) }; // Alignment length on the read (consumes query bases) let aln_len: u32 = ops .iter() .filter_map(|op| match op { Cigar::Match(l) | Cigar::Ins(l) | Cigar::Equal(l) | Cigar::Diff(l) => Some(*l), _ => None, }) .sum(); (query_beg, query_beg + aln_len) } fn terminal_soft_clip<'a>(mut ops: impl Iterator) -> u32 { ops.find_map(|op| match op { Cigar::HardClip(_) | Cigar::Pad(_) => None, Cigar::SoftClip(len) => Some(*len), _ => Some(0), }) .unwrap_or(0) } /// Compute the reference-consuming length of a CIGAR. /// /// Sums lengths of operations that advance the reference coordinate: /// `M`, `D`, `N`, `=`, `X`. Insertions, soft clips, hard clips, and pads are excluded. /// /// # Arguments /// /// * `ops` - Slice of CIGAR operations /// /// # Returns /// /// Reference-consuming length in base pairs. Returns `0` for an empty slice. fn alignment_ref_length(ops: &[Cigar]) -> u32 { ops.iter() .filter_map(|op| match op { Cigar::Match(l) | Cigar::Del(l) | Cigar::RefSkip(l) | Cigar::Equal(l) | Cigar::Diff(l) => Some(*l), _ => None, }) .sum() } /// Compute the gap in base pairs between two half-open intervals. /// /// Both intervals are `[start, end)`. A negative return value indicates that the /// intervals overlap by that many bases; zero means they are adjacent; positive means /// they are separated. /// /// # Arguments /// /// * `a_start`, `a_end` - First interval `[a_start, a_end)` /// * `b_start`, `b_end` - Second interval `[b_start, b_end)` /// /// # Returns /// /// `max(starts) - min(ends)`: negative for overlap, 0 for adjacency, positive for gap. #[inline] fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 { a_start.max(b_start) - a_end.min(b_end) } /// Return `(5'_terminus, 3'_terminus)` in read order for a single alignment. /// /// For `+` strand the 5' end maps to the leftmost reference position (`start`) and the /// 3' end to the rightmost (`end`). For `-` strand the read runs right-to-left on the /// reference, so 5' = `end` and 3' = `start`. /// /// # Arguments /// /// * `start` - Alignment start, 0-based inclusive /// * `end` - Alignment end, 0-based exclusive /// * `strand` - `'+'` or `'-'` /// /// # Returns /// /// `(5'_coord, 3'_coord)` where both are 0-based reference positions. #[inline] fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) { if strand == '+' { (start, end) } else { (end, start) } } /// Attempt to extract a fold-back inversion event from a primary BAM record. /// /// Inspects the record's SA tag using [`parse_sa_entries`] and checks whether the single /// supplementary alignment forms a fold-back inversion with the primary: same chromosome, /// opposite strand, and reference coordinates within the specified proximity thresholds. /// Returns a populated [`FbInv`] with all breakpoint metrics on success. /// /// Alignments A and B in the returned struct are ordered by their position on the read /// (A comes first). All coordinates are 0-based half-open; the 1-based SA tag `pos` is /// converted automatically by [`SaEntry`]. /// /// Recommended thresholds for **artefact detection**: `max_overlap = Some(150)`, /// `max_gap = Some(0)`. Requiring overlap (`max_gap = Some(0)`) matches the reference /// Python implementation and avoids capturing true SV junctions that happen to have /// opposite-strand supplementary alignments with a reference-space gap. /// /// # Arguments /// /// * `record` - BAM record to evaluate; must be a primary alignment /// * `header` - BAM header used to resolve the reference name from `tid` /// * `min_mapq` - Minimum mapping quality required for both the primary and the supplementary /// * `max_overlap` - Maximum allowed reference-space overlap in bp (`None` = no limit) /// * `max_gap` - Maximum allowed reference-space gap in bp (`None` = no limit) /// /// # Returns /// /// `Some(FbInv)` if all criteria are met, `None` otherwise. /// /// Returns `None` when any of the following conditions apply: /// - Record is unmapped, secondary, or supplementary /// - Primary or supplementary MAPQ is below `min_mapq` /// - SA tag is absent or contains more than one entry /// - Supplementary aligns to a different chromosome /// - Both alignments are on the same strand /// - Reference-space overlap exceeds `max_overlap` /// - Reference-space gap exceeds `max_gap` /// /// # References /// /// pub fn fb_inv_from_record( record: &bam::Record, header: &bam::HeaderView, min_mapq: u8, max_overlap: Option, max_gap: Option, ) -> Option { // Basic filters (primary only, mapped, MAPQ) if record.is_unmapped() || record.is_secondary() || record.is_supplementary() || record.mapq() < min_mapq { return None; } // SA tag required; exactly one supplementary alignment expected let sa_str = match record.aux(b"SA") { Ok(Aux::String(s)) => s, _ => return None, }; let sa_entries = parse_sa_entries(sa_str); if sa_entries.len() != 1 { return None; } let sa = &sa_entries[0]; if sa.mapq < min_mapq { return None; } // Same chromosome: compare SA rname with primary reference name let tid = record.tid(); if tid < 0 { return None; } let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?; if sa.chr != primary_rname { return None; } // Strand check: need opposite strands for fold-back inversion let primary_strand_char = if record.is_reverse() { '-' } else { '+' }; if primary_strand_char == sa.strand { return None; } // Primary ref coords (0-based, half-open) let primary_start = record.pos(); let primary_end = record.cigar().end_pos(); // Parse SA CIGAR; pos is already 0-based from SaEntry let sa_cigar_ops = parse_cigar_str(sa.cigar)?; let sa_ref_len = alignment_ref_length(&sa_cigar_ops); if sa_ref_len == 0 { return None; } let sa_start = sa.pos as i64; let sa_end = sa_start + sa_ref_len as i64; // Check proximity between alignments // gap < 0: overlap of |gap| bp // gap = 0: adjacent (touching) // gap > 0: separated by gap bp let gap = interval_gap(primary_start, primary_end, sa_start, sa_end); // Check max_overlap: reject if overlap exceeds limit if let Some(max_ovl) = max_overlap { if gap < 0 && gap.abs() > max_ovl { return None; } } // Check max_gap: reject if gap exceeds limit if let Some(max_g) = max_gap { if gap > max_g { return None; } } // Query coords for primary and SA let primary_cigar: Vec = record.cigar().iter().cloned().collect(); let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char); let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa.strand); // Decide which segment is first on the read let first = if primary_qbeg <= sa_qbeg { SegmentOrder::PrimaryFirst } else { SegmentOrder::SuppFirst }; // Build FbInv with A/B ordered by read position let (aln_a_start, aln_a_end, aln_a_strand, aln_b_start, aln_b_end, aln_b_strand) = if first == SegmentOrder::PrimaryFirst { ( primary_start, primary_end, primary_strand_char, sa_start, sa_end, sa.strand, ) } else { ( sa_start, sa_end, sa.strand, primary_start, primary_end, primary_strand_char, ) }; // Distances a_b, c_d, b_c, a_d (like the Python class) let (a, b) = breakpoints(aln_a_start, aln_a_end, aln_a_strand); let (c, d) = breakpoints(aln_b_start, aln_b_end, aln_b_strand); let aln_a_len = (a - b).abs(); let aln_b_len = (c - d).abs(); let b_c = (b - c).abs(); let a_d = (a - d).abs(); let read_name = std::str::from_utf8(record.qname()) .map(|s| s.to_string()) .unwrap_or_else(|_| String::from_utf8_lossy(record.qname()).into_owned()); let ref_name = primary_rname.to_string(); Some(FbInv { read_name, ref_name, aln_a_start, aln_a_end, aln_a_strand, aln_b_start, aln_b_end, aln_b_strand, aln_a_len, aln_b_len, b_c, a_d, first, primary_qbeg, primary_qend, sa_qbeg, sa_qend, }) } /// Read the `SM` tag from a BAM's `@RG` lines, injecting it if absent or inconsistent. /// /// Inspects every `@RG` line in the header. If all lines carry an identical `SM` value, /// that value is returned. If any line is missing `SM`, or if multiple distinct `SM` /// values are present, `samtools reheader` is invoked to set `SM` uniformly to /// `fallback_sample`. The file modification time is preserved after reheadering. /// /// # Arguments /// /// * `bam_path` - Path to the BAM file (must be writable if reheadering is needed) /// * `fallback_sample` - Sample name to inject when the header is missing or ambiguous /// * `config` - Pipeline configuration used to invoke [`SamtoolsReheader`] /// /// # Returns /// /// The `SM` value present in (or injected into) the header. /// /// # Errors /// /// Returns an error if the BAM cannot be opened, if reheadering fails, or if the /// header contains no `@RG` lines at all. pub fn read_sm_tag_or_inject( bam_path: &str, fallback_sample: &str, config: &Config, ) -> anyhow::Result { let reader = bam::Reader::from_path(bam_path) .with_context(|| format!("Failed to open BAM: {bam_path}"))?; let header = bam::Header::from_template(reader.header()); let header_text = String::from_utf8_lossy(&header.to_bytes()).into_owned(); let mut sm_values: Vec = Vec::new(); let mut all_have_sm = true; for line in header_text.lines() { if line.starts_with("@RG") { match line .split('\t') .find_map(|f| f.strip_prefix("SM:").map(|s| s.to_string())) { Some(sm) => sm_values.push(sm), None => all_have_sm = false, } } } // Reheader if any @RG lacks SM or if multiple distinct SM values exist. // No @RG lines at all falls through to the error at the end. let unique_sm: std::collections::HashSet<&str> = sm_values.iter().map(|s| s.as_str()).collect(); let needs_reheader = !all_have_sm || unique_sm.len() > 1; if needs_reheader { if !all_have_sm { info!("Some @RG lines in {bam_path} lack SM tag — injecting SM:{fallback_sample}"); } if unique_sm.len() > 1 { info!( "Multiple distinct SM values in {bam_path} ({:?}) — collapsing to SM:{fallback_sample}", unique_sm ); } let original_mtime = filetime::FileTime::from_last_modification_time(&std::fs::metadata(bam_path)?); let mut reheader = SamtoolsReheader::from_config(config, bam_path, fallback_sample); reheader.run()?; // let mut index = SamtoolsIndex::from_config(config, bam_path); // index.run()?; filetime::set_file_mtime(bam_path, original_mtime)?; return Ok(fallback_sample.to_string()); } // All @RG lines present and SM is uniform — return the single value sm_values .into_iter() .next() .context(format!("No @RG lines found in {bam_path}")) } /// Fetch primary alignment records from an indexed BAM, with optional region and name filtering. /// /// Supplementary alignments within the fetch region are resolved to their primaries via /// [`primary_records`]. Duplicates introduced by that resolution step are removed by /// query name. Secondary alignments are always excluded. /// /// # Arguments /// /// * `bam_path` - Path to an indexed BAM file (`.bai` index must exist) /// * `region` - If `Some((chr, start, end))`, restrict the fetch to that 0-based half-open /// interval; if `None`, the whole file is scanned /// * `qname_filter` - Optional [`QnameFilter`] to retain or exclude specific read names /// /// # Returns /// /// A deduplicated vector of primary records, optionally filtered by query name. /// /// # Errors /// /// Returns an error if the BAM cannot be opened or if the fetch operation fails. pub fn fetch_primary_records( bam_path: &Path, region: Option<(&str, i64, i64)>, qname_filter: Option, ) -> anyhow::Result> { let mut reader = bam::IndexedReader::from_path(bam_path) .with_context(|| format!("Cannot open BAM: {}", bam_path.display()))?; if let Some((chr, start, end)) = region { reader .fetch((chr, start, end)) .with_context(|| format!("Fetch failed for {}:{}-{}", chr, start, end))?; } else { reader .fetch(FetchDefinition::All) .context("Fetch all failed")?; } let records: Vec = reader .records() .filter_map(|r| r.ok()) .filter(|r| !r.is_secondary()) .collect(); // Resolve supplementary → primary via SA tag let records = primary_records(&mut reader, records); // Deduplicate by qname (primary_records may introduce duplicates // if a primary was already in the fetch region) let mut seen = HashSet::new(); let mut records: Vec = records .into_iter() .filter(|r| seen.insert(r.qname().to_vec())) .collect(); // Apply qname filter if let Some(filter) = qname_filter { match filter { QnameFilter::Whitelist(names) => { records.retain(|r| names.contains(r.qname())); } QnameFilter::Blacklist(names) => { records.retain(|r| !names.contains(r.qname())); } } } Ok(records) } /// Query-name filter for [`fetch_primary_records`]. pub fn query_aligned_span(record: &rust_htslib::bam::Record) -> (usize, usize) { let cigar = record.cigar(); let qstart = cigar .iter() .take_while(|c| matches!(c, Cigar::SoftClip(_) | Cigar::HardClip(_))) .filter_map(|c| match c { Cigar::SoftClip(len) => Some(*len as usize), _ => None, }) .sum::(); let qend = record.seq_len() - cigar .iter() .rev() .take_while(|c| matches!(c, Cigar::SoftClip(_) | Cigar::HardClip(_))) .filter_map(|c| match c { Cigar::SoftClip(len) => Some(*len as usize), _ => None, }) .sum::(); (qstart, qend) } pub enum QnameFilter { /// Retain only records whose query name is in the set. Whitelist(HashSet>), /// Exclude records whose query name is in the set. Blacklist(HashSet>), } /// Convert a BAM/CRAM file to a BED4 file with one interval per aligned record. /// /// Each output line has the form `chrom\tstart\tend\tread_name`. Coordinates are /// 0-based half-open (`start` from [`Record::pos`], `end` from [`Record::reference_end`]), /// consistent with BED convention. Unmapped records and records with a negative `tid` /// are skipped. Secondary and supplementary alignments are **not** filtered — add /// upstream filtering if stricter selection is needed. /// /// # Arguments /// /// * `input_bam` - Path to the input BAM or CRAM file /// * `output_bed` - Path to the output BED file (created or overwritten) /// /// # Errors /// /// Returns an error if the input cannot be read, the output cannot be written, or /// if UTF-8 conversion of a reference name or query name fails. pub fn bam_to_aligned_bed>(input_bam: P, output_bed: P) -> anyhow::Result<()> { let mut bam = bam::Reader::from_path(input_bam)?; let header = bam.header().to_owned(); let out = File::create(output_bed)?; let mut writer = std::io::BufWriter::new(out); for result in bam.records() { let record = result?; if record.is_unmapped() { continue; } let tid = record.tid(); if tid < 0 { continue; } let chrom = std::str::from_utf8(header.tid2name(tid as u32))?; // BAM positions are already 0-based. let start = record.pos(); // reference_end() accounts for CIGAR operations consuming reference: // M, D, N, =, X let end = record.reference_end(); if end > start { let qname = std::str::from_utf8(record.qname())?; let (qstart, qend) = query_aligned_span(&record); writeln!( writer, "{chrom}\t{start}\t{end}\t{}:{}-{}", qname, qstart, qend )?; } } Ok(()) } #[cfg(test)] mod tests { use super::*; use crate::helpers::test_init; #[test] fn sm_tag() -> anyhow::Result<()> { test_init(); let config = Config::default(); read_sm_tag_or_inject(&config.tumoral_bam("CHALO"), "CHALO_diag", &config)?; Ok(()) } #[test] fn alignment_query_coords_counts_soft_clip_inside_hard_clip() { let ops = parse_cigar_str("5H10S90M").unwrap(); assert_eq!(alignment_query_coords(&ops, '+'), (10, 100)); let ops = parse_cigar_str("90M10S5H").unwrap(); assert_eq!(alignment_query_coords(&ops, '-'), (10, 100)); } }