use std::collections::{HashMap, HashSet}; use anyhow::Context; use log::warn; use rust_htslib::bam::{self, record::Aux, record::Cigar, IndexedReader, Read, Record}; /// Parses an SA tag string and extracts chromosome and position information. /// /// # Arguments /// /// * `sa` - The SA tag string to parse /// /// # Returns /// /// A vector of tuples containing chromosome names and positions pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> { sa.split(';') .filter(|s| !s.is_empty()) .filter_map(|s| { let parts: Vec<&str> = s.split(',').take(2).collect(); if parts.len() < 2 { return None; } let chr = parts[0]; match parts[1].parse::() { Ok(position) => Some((chr, position)), Err(_) => None, } }) .collect() } /// Resolves the primary record for supplementary alignments using BAM SA tag /// /// For supplementary alignments, searches linked primary records using genomic positions /// listed in the SA tag. Returns the first non-supplementary record with matching query name. /// /// # Arguments /// * `bam` - Mutable reference to indexed BAM reader for random access /// * `record` - Input record to evaluate (typically a supplementary alignment) /// /// # Returns /// - Original record if it's not supplementary /// - First matching primary record found via SA tag positions /// - Original record if no primary matches found /// /// # Panics /// - If SA tag format is invalid (missing fields or non-integer positions) /// - If BAM fetch operation fails /// /// # Example /// ``` /// use rust_htslib::{bam::{IndexedReader, Read}}; /// /// let mut bam = IndexedReader::from_path("input.bam").unwrap(); /// let record = bam.records().next().unwrap().unwrap(); /// let primary = primary_record(&mut bam, record); /// ``` pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record { // Return immediately if not a supplementary alignment if !record.is_supplementary() { return record; } let qname = record.qname(); // Process SA tag if present if let Ok(Aux::String(sa)) = record.aux(b"SA") { // Search potential primary alignments at each SA position for (chr, start) in parse_sa_tag(sa) { bam.fetch((chr, start, start + 1)) .expect("BAM fetch failed"); for result in bam.records() { let candidate = result.expect("Invalid BAM record"); if candidate.qname() == qname && !candidate.is_supplementary() { return candidate.clone(); } } } } // Fallback to original record if no primary found record } /// Creates optimized position bins for fetching records. /// /// Groups positions by chromosome and creates bins of ±1000 bp. /// Sorts bins by size (descending) to prioritize regions with more alignment hits. /// /// # Arguments /// /// * `positions` - Vector of chromosome/position tuples /// /// # Returns /// /// A vector of position bins, sorted by bin size fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec> { // Sort positions by chromosome and position let mut sorted_positions = positions.to_vec(); sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos)); sorted_positions.dedup(); // Group by chromosome and create bins of ±1000 bp let mut grouped: HashMap<&str, Vec>> = HashMap::new(); for (chr, pos) in sorted_positions { let bins = grouped.entry(chr).or_default(); if let Some(last_bin) = bins.last_mut() { if last_bin.is_empty() || (pos - last_bin[0].1).abs() <= 1000 { last_bin.push((chr, pos)); } else { bins.push(vec![(chr, pos)]); } } else { bins.push(vec![(chr, pos)]); } } // Flatten and sort by bin size (descending) let mut flattened: Vec> = grouped.values().flatten().cloned().collect(); // Sort bins by size (descending) to prioritize regions with more hits flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len())); flattened } /// Retrieves primary alignment records based on a set of input records. /// /// This function processes a collection of BAM records and retrieves their primary alignments. /// When supplementary alignments are found (with SA tags), it fetches the corresponding /// primary alignments from the provided BAM file. /// /// # Arguments /// /// * `bam` - A mutable reference to an IndexedReader for the BAM file /// * `records` - A vector of input records to process /// /// # Returns /// /// A vector of records containing both: /// - Original primary alignments from the input /// - Primary alignments fetched for any supplementary records in the input /// /// # Examples /// /// ``` /// use rust_htslib::bam::{IndexedReader, Record}; /// let mut bam = IndexedReader::from_path("sample.bam").unwrap(); /// let records = vec![/* some records */]; /// let primary_alignments = primary_records(&mut bam, records); /// ``` pub fn primary_records(bam: &mut IndexedReader, records: Vec) -> Vec { let mut res = Vec::with_capacity(records.len()); let mut all_positions = Vec::new(); let mut all_qnames_to_fetch = Vec::new(); // First pass: collect primary records and positions to fetch for record in records.iter() { if record.is_supplementary() { let qname = record.qname(); // Safely extract and process the SA tag if let Ok(Aux::String(sa)) = record.aux(b"SA") { let positions = parse_sa_tag(sa); all_positions.extend(positions); match String::from_utf8(qname.to_vec()) { Ok(qname_str) => all_qnames_to_fetch.push(qname_str), Err(e) => warn!("Invalid UTF-8 in qname: {}", e), } } } else { res.push(record.clone()); } } // Track which query names we've already found let mut primary_records: HashSet = res .iter() .filter_map(|r| String::from_utf8(r.qname().to_vec()).ok()) .collect(); // Create position bins for efficient fetching let position_bins = create_position_bins(&all_positions); // Fetch records for each bin until we find all primaries for bin in position_bins { if bin.is_empty() { continue; } let (chr, start) = bin.first().unwrap(); let end = bin.last().unwrap().1 + 1; // Fetch and process records in this region if let Err(e) = bam.fetch((*chr, *start, end)) { warn!("Failed to fetch region {}:{}-{}: {}", chr, start, end, e); continue; } for record_result in bam.records() { match record_result { Ok(record) => { // Skip secondary alignments if record.is_secondary() { continue; } // Check if this is a primary we're looking for match String::from_utf8(record.qname().to_vec()) { Ok(qname) => { if primary_records.contains(&qname) { continue; } if all_qnames_to_fetch.contains(&qname) { res.push(record); primary_records.insert(qname); } } Err(_) => continue, } } Err(e) => warn!("Error reading record: {}", e), } } // Early exit if we found all the records we were looking for let remaining = all_qnames_to_fetch .iter() .filter(|q| !primary_records.contains(*q)) .count(); if remaining == 0 { break; } } res } pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result> { let mut genome = HashMap::new(); for (key, records) in header.to_hashmap() { for record in records { if key == "SQ" { genome.insert( record["SN"].to_string(), record["LN"] .parse::() .expect("Failed parsing length of chromosomes"), ); } } } Ok(genome) } /// Split the genome into N balanced region lists (`ctg:start-end` strings). /// Each element of the outer Vec is the list of `-r` regions for one batch. pub fn split_genome_into_n_regions(genome: &HashMap, n: usize) -> Vec> { if n == 0 || genome.is_empty() { return Vec::new(); } // Deterministic order over contigs let mut contigs: Vec<_> = genome.iter().collect(); contigs.sort_by(|(a, _), (b, _)| a.cmp(b)); let total_bp: u64 = contigs.iter().map(|(_, &len)| len).sum(); let target_per_batch: u64 = total_bp.div_ceil(n as u64); // ceil let mut batches: Vec> = vec![Vec::new(); n]; let mut batch_idx = 0usize; let mut batch_bp = 0u64; for (ctg, &len) in contigs { let mut start: u64 = 1; while start <= len && batch_idx < n { let remaining_in_ctg = len - start + 1; let remaining_for_batch = if batch_bp >= target_per_batch { 1 // force close & move to next batch } else { target_per_batch - batch_bp }; let seg_len = remaining_in_ctg.min(remaining_for_batch); let end = start + seg_len - 1; let region = format!("{ctg}:{start}-{end}"); batches[batch_idx].push(region); batch_bp += seg_len; start = end + 1; if batch_bp >= target_per_batch && batch_idx + 1 < n { batch_idx += 1; batch_bp = 0; } } } batches } /// Indicates which alignment segment appears first on the read. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum SegmentOrder { /// Primary alignment appears before supplementary on the read PrimaryFirst, /// Supplementary alignment appears before primary on the read SuppFirst, } /// Fold-back inversion event detected from a primary/supplementary alignment pair. /// Thanks to: https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py /// /// A fold-back inversion occurs when a read has two alignments on the same chromosome /// with opposite strands that overlap on the reference. This pattern suggests the /// presence of an inverted duplication or fold-back structure. /// /// # Coordinate System /// - All coordinates are 0-based, half-open `[start, end)` /// - Alignments A and B are ordered by their position on the read (A first) /// - Breakpoint distances (`b_c`, `a_d`) use strand-aware coordinates /// /// # Breakpoints /// Breakpoints (a, b, c, d) are strand-aware positions representing alignment termini: /// - For `+` strand: `a = start` (5'), `b = end` (3') /// - For `-` strand: `c = start` (3'), `d = end` (5') /// /// ```text /// Case: A on '+' strand, B on '-' strand /// /// a b /// | | /// v v /// Reference:---[=====A=================>]--------- /// --------[<=============B======]------- /// ^ ^ /// | | /// c d /// /// Alignment A (+ strand): a=aln_a_start, b=aln_a_end /// Alignment B (- strand): c=aln_b_start, d=aln_b_end /// /// Distances: /// aln_a_len = |a - b| = alignment A length /// aln_b_len = |c - d| = alignment B length /// b_c = |b - c| = inner breakpoint distance (3' ends) /// a_d = |a - d| = outer breakpoint distance (5' ends) /// ``` #[derive(Debug)] pub struct FbInv { /// Read name (QNAME from BAM) pub read_name: String, /// Chromosome/contig name pub ref_name: String, /// Alignment A reference start (0-based) pub aln_a_start: i64, /// Alignment A reference end (exclusive) pub aln_a_end: i64, /// Alignment A strand ('+' or '-') pub aln_a_strand: char, /// Alignment B reference start (0-based) pub aln_b_start: i64, /// Alignment B reference end (exclusive) pub aln_b_end: i64, /// Alignment B strand ('+' or '-') pub aln_b_strand: char, /// Alignment A length on reference (bp) pub aln_a_len: i64, /// Alignment B length on reference (bp) pub aln_b_len: i64, /// Distance between breakpoints B and C (strand-aware) pub b_c: i64, /// Distance between breakpoints A and D (strand-aware) pub a_d: i64, /// Which alignment appears first on the read pub first: SegmentOrder, /// Primary alignment query start on read pub primary_qbeg: u32, /// Primary alignment query end on read pub primary_qend: u32, /// Supplementary alignment query start on read pub sa_qbeg: u32, /// Supplementary alignment query end on read pub sa_qend: u32, } /// Parse a SAM CIGAR string into a vector of CIGAR operations. /// /// Supports all standard CIGAR operations: M, I, D, N, S, H, P, =, X /// /// # Arguments /// * `s` - CIGAR string (e.g., "10S100M5I50M20S") /// /// # Returns /// `Some(Vec)` if valid, `None` if malformed or empty /// /// # Examples /// ``` /// let ops = parse_cigar_str("10S100M").unwrap(); /// assert_eq!(ops.len(), 2); /// ``` pub fn parse_cigar_str(s: &str) -> Option> { if s.is_empty() { return None; } // Pre-allocate: rough estimate ~1 op per 2-3 chars let mut ops = Vec::with_capacity(s.len() / 2); let mut num = String::new(); for c in s.chars() { if c.is_ascii_digit() { num.push(c); continue; } // We hit an operator letter: parse the accumulated number if num.is_empty() { return None; // operator without length } let len: u32 = num.parse().ok()?; num.clear(); let op = match c { 'M' => Cigar::Match(len), 'I' => Cigar::Ins(len), 'D' => Cigar::Del(len), 'N' => Cigar::RefSkip(len), 'S' => Cigar::SoftClip(len), 'H' => Cigar::HardClip(len), 'P' => Cigar::Pad(len), '=' => Cigar::Equal(len), 'X' => Cigar::Diff(len), _ => return None, // unknown operator }; ops.push(op); } // CIGAR ending with digits only is invalid if !num.is_empty() { return None; } // Validation: CIGAR must have at least one operation if ops.is_empty() { return None; } Some(ops) } /// Compute query coordinates (start, end) on the read from a CIGAR. /// /// For plus strand, the first CIGAR operation corresponds to the read start. /// For minus strand, the last CIGAR operation corresponds to the read start. /// /// # Arguments /// * `ops` - CIGAR operations slice /// * `strand` - '+' or '-' /// /// # Returns /// Tuple of (query_begin, query_end) in read coordinates (excluding hard clips) fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) { if ops.is_empty() { return (0, 0); } // For minus strand, the "first" op on the read is the last in the CIGAR let first_op = if strand == '+' { &ops[0] } else { ops.last().unwrap() }; let query_beg = match first_op { Cigar::SoftClip(len) => *len, _ => 0, }; // Alignment length on the read (consumes query bases) let aln_len: u32 = ops .iter() .filter_map(|op| match op { Cigar::Match(l) | Cigar::Ins(l) | Cigar::Equal(l) | Cigar::Diff(l) => Some(*l), _ => None, }) .sum(); (query_beg, query_beg + aln_len) } /// Compute alignment length on the reference from CIGAR operations. /// /// Counts bases consumed by M, D, N, =, X operations. /// /// # Arguments /// * `ops` - CIGAR operations slice /// /// # Returns /// Reference-consuming length in base pairs fn alignment_ref_length(ops: &[Cigar]) -> u32 { ops.iter() .filter_map(|op| match op { Cigar::Match(l) | Cigar::Del(l) | Cigar::RefSkip(l) | Cigar::Equal(l) | Cigar::Diff(l) => Some(*l), _ => None, }) .sum() } /// Compute gap between two intervals. Negative value means overlap. /// /// # Arguments /// * `a_start`, `a_end` - First interval `[a_start, a_end)` /// * `b_start`, `b_end` - Second interval `[b_start, b_end)` /// /// # Returns /// Gap in bp (negative if overlapping, 0 if adjacent, positive if separated) #[inline] fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 { a_start.max(b_start) - a_end.min(b_end) } /// Compute strand-aware breakpoint coordinates. /// /// For fold-back inversions, breakpoints depend on strand orientation: /// - Plus strand: breakpoints are (start, end) - 5' to 3' direction /// - Minus strand: breakpoints are (end, start) - reversed direction /// /// # Arguments /// * `start` - Alignment start (0-based) /// * `end` - Alignment end (exclusive) /// * `strand` - '+' or '-' /// /// # Returns /// Tuple of (breakpoint_a, breakpoint_b) in strand-aware order #[inline] fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) { if strand == '+' { (start, end) } else { (end, start) } } /// Extract a fold-back inversion event from a BAM record. /// /// Analyzes a primary alignment and its supplementary alignment (from SA tag) /// to detect fold-back inversion patterns characterized by: /// - Same chromosome /// - Opposite strands /// - Reference coordinates within proximity threshold /// - Exactly one supplementary alignment /// /// # Arguments /// * `record` - BAM record (must be primary alignment) /// * `header` - BAM header for reference name lookup /// * `min_mapq` - Minimum mapping quality threshold for both alignments /// * `max_overlap` - Maximum allowed overlap in bp (None = no limit) /// * `max_gap` - Maximum allowed gap in bp (None = no limit, 0 = require overlap or adjacent) /// default should be [-150, 200) Some(150), Some(200) /// /// # Returns /// `Some(FbInv)` if a valid fold-back pattern is detected, `None` otherwise /// /// # Filtering /// Returns `None` if any of these conditions are met: /// - Record is unmapped, secondary, or supplementary /// - MAPQ below threshold (primary or supplementary) /// - No SA tag or multiple supplementary alignments /// - Supplementary on different chromosome /// - Same strand orientation /// - Overlap exceeds `max_overlap` (if set) /// - Gap exceeds `max_gap` (if set) pub fn fb_inv_from_record( record: &bam::Record, header: &bam::HeaderView, min_mapq: u8, max_overlap: Option, max_gap: Option, ) -> Option { // Basic filters (primary only, mapped, MAPQ) if record.is_unmapped() || record.is_secondary() || record.is_supplementary() || record.mapq() < min_mapq { return None; } // SA tag required let sa_str = match record.aux(b"SA") { Ok(Aux::String(s)) => s, _ => return None, }; let sa_entries: Vec<&str> = sa_str.split(';').filter(|s| !s.is_empty()).collect(); // Require exactly one SA (like the Python code) if sa_entries.len() != 1 { return None; } let sa = sa_entries[0]; // SA format: chr,pos,strand,cigar,mapq,nm let fields: Vec<&str> = sa.split(',').collect(); if fields.len() < 6 { return None; } let sa_rname = fields[0]; let sa_pos_1based: i64 = fields[1].parse().ok()?; // SAM is 1-based let sa_strand_char: char = fields[2].chars().next()?; let sa_cigar_str = fields[3]; let sa_mapq: u8 = fields[4].parse().ok()?; if sa_mapq < min_mapq { return None; } // Same chromosome: compare SA rname with primary reference name let tid = record.tid(); if tid < 0 { return None; } let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?; if sa_rname != primary_rname { return None; } // Strand check: need opposite strands for fold-back inversion let primary_strand_char = if record.is_reverse() { '-' } else { '+' }; if primary_strand_char == sa_strand_char { return None; } // Primary ref coords (0-based, half-open) let primary_start = record.pos(); let primary_end = record.cigar().end_pos(); // Parse SA CIGAR let sa_cigar_ops = parse_cigar_str(sa_cigar_str)?; let sa_ref_len = alignment_ref_length(&sa_cigar_ops); // Validation: SA must have non-zero reference length if sa_ref_len == 0 { return None; } // SA ref coords: convert POS from 1-based to 0-based let sa_start = sa_pos_1based - 1; let sa_end = sa_start + sa_ref_len as i64; // Check proximity between alignments // gap < 0: overlap of |gap| bp // gap = 0: adjacent (touching) // gap > 0: separated by gap bp let gap = interval_gap(primary_start, primary_end, sa_start, sa_end); // Check max_overlap: reject if overlap exceeds limit if let Some(max_ovl) = max_overlap { if gap < 0 && gap.abs() > max_ovl { return None; } } // Check max_gap: reject if gap exceeds limit if let Some(max_g) = max_gap { if gap > max_g { return None; } } // Query coords for primary and SA let primary_cigar: Vec = record.cigar().iter().cloned().collect(); let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char); let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa_strand_char); // Decide which segment is first on the read let first = if primary_qbeg <= sa_qbeg { SegmentOrder::PrimaryFirst } else { SegmentOrder::SuppFirst }; // Build FbInv with A/B ordered by read position let (aln_a_start, aln_a_end, aln_a_strand, aln_b_start, aln_b_end, aln_b_strand) = if first == SegmentOrder::PrimaryFirst { ( primary_start, primary_end, primary_strand_char, sa_start, sa_end, sa_strand_char, ) } else { ( sa_start, sa_end, sa_strand_char, primary_start, primary_end, primary_strand_char, ) }; // Distances a_b, c_d, b_c, a_d (like the Python class) let (a, b) = breakpoints(aln_a_start, aln_a_end, aln_a_strand); let (c, d) = breakpoints(aln_b_start, aln_b_end, aln_b_strand); let aln_a_len = (a - b).abs(); let aln_b_len = (c - d).abs(); let b_c = (b - c).abs(); let a_d = (a - d).abs(); let read_name = std::str::from_utf8(record.qname()) .map(|s| s.to_string()) .unwrap_or_else(|_| String::from_utf8_lossy(record.qname()).into_owned()); let ref_name = primary_rname.to_string(); Some(FbInv { read_name, ref_name, aln_a_start, aln_a_end, aln_a_strand, aln_b_start, aln_b_end, aln_b_strand, aln_a_len, aln_b_len, b_c, a_d, first, primary_qbeg, primary_qend, sa_qbeg, sa_qend, }) } pub fn read_sm_tag(bam_path: &str) -> anyhow::Result { let reader = bam::Reader::from_path(bam_path) .with_context(|| format!("Failed to open BAM: {bam_path}"))?; let header = bam::Header::from_template(reader.header()); let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string(); for line in header_text.lines() { if line.starts_with("@RG") { for field in line.split('\t') { if let Some(sm) = field.strip_prefix("SM:") { return Ok(sm.to_string()); } } } } anyhow::bail!("No SM tag found in @RG header of {bam_path}") }