bam.rs 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770
  1. use std::collections::{HashMap, HashSet};
  2. use anyhow::Context;
  3. use log::warn;
  4. use rust_htslib::bam::{self, record::Aux, record::Cigar, IndexedReader, Read, Record};
  5. /// Parses an SA tag string and extracts chromosome and position information.
  6. ///
  7. /// # Arguments
  8. ///
  9. /// * `sa` - The SA tag string to parse
  10. ///
  11. /// # Returns
  12. ///
  13. /// A vector of tuples containing chromosome names and positions
  14. pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
  15. sa.split(';')
  16. .filter(|s| !s.is_empty())
  17. .filter_map(|s| {
  18. let parts: Vec<&str> = s.split(',').take(2).collect();
  19. if parts.len() < 2 {
  20. return None;
  21. }
  22. let chr = parts[0];
  23. match parts[1].parse::<i32>() {
  24. Ok(position) => Some((chr, position)),
  25. Err(_) => None,
  26. }
  27. })
  28. .collect()
  29. }
  30. /// Resolves the primary record for supplementary alignments using BAM SA tag
  31. ///
  32. /// For supplementary alignments, searches linked primary records using genomic positions
  33. /// listed in the SA tag. Returns the first non-supplementary record with matching query name.
  34. ///
  35. /// # Arguments
  36. /// * `bam` - Mutable reference to indexed BAM reader for random access
  37. /// * `record` - Input record to evaluate (typically a supplementary alignment)
  38. ///
  39. /// # Returns
  40. /// - Original record if it's not supplementary
  41. /// - First matching primary record found via SA tag positions
  42. /// - Original record if no primary matches found
  43. ///
  44. /// # Panics
  45. /// - If SA tag format is invalid (missing fields or non-integer positions)
  46. /// - If BAM fetch operation fails
  47. ///
  48. /// # Example
  49. /// ```
  50. /// use rust_htslib::{bam::{IndexedReader, Read}};
  51. ///
  52. /// let mut bam = IndexedReader::from_path("input.bam").unwrap();
  53. /// let record = bam.records().next().unwrap().unwrap();
  54. /// let primary = primary_record(&mut bam, record);
  55. /// ```
  56. pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
  57. // Return immediately if not a supplementary alignment
  58. if !record.is_supplementary() {
  59. return record;
  60. }
  61. let qname = record.qname();
  62. // Process SA tag if present
  63. if let Ok(Aux::String(sa)) = record.aux(b"SA") {
  64. // Search potential primary alignments at each SA position
  65. for (chr, start) in parse_sa_tag(sa) {
  66. bam.fetch((chr, start, start + 1))
  67. .expect("BAM fetch failed");
  68. for result in bam.records() {
  69. let candidate = result.expect("Invalid BAM record");
  70. if candidate.qname() == qname && !candidate.is_supplementary() {
  71. return candidate.clone();
  72. }
  73. }
  74. }
  75. }
  76. // Fallback to original record if no primary found
  77. record
  78. }
  79. /// Creates optimized position bins for fetching records.
  80. ///
  81. /// Groups positions by chromosome and creates bins of ±1000 bp.
  82. /// Sorts bins by size (descending) to prioritize regions with more alignment hits.
  83. ///
  84. /// # Arguments
  85. ///
  86. /// * `positions` - Vector of chromosome/position tuples
  87. ///
  88. /// # Returns
  89. ///
  90. /// A vector of position bins, sorted by bin size
  91. fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
  92. // Sort positions by chromosome and position
  93. let mut sorted_positions = positions.to_vec();
  94. sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
  95. sorted_positions.dedup();
  96. // Group by chromosome and create bins of ±1000 bp
  97. let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
  98. for (chr, pos) in sorted_positions {
  99. let bins = grouped.entry(chr).or_default();
  100. if let Some(last_bin) = bins.last_mut() {
  101. if last_bin.is_empty() || (pos - last_bin[0].1).abs() <= 1000 {
  102. last_bin.push((chr, pos));
  103. } else {
  104. bins.push(vec![(chr, pos)]);
  105. }
  106. } else {
  107. bins.push(vec![(chr, pos)]);
  108. }
  109. }
  110. // Flatten and sort by bin size (descending)
  111. let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values().flatten().cloned().collect();
  112. // Sort bins by size (descending) to prioritize regions with more hits
  113. flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
  114. flattened
  115. }
  116. /// Retrieves primary alignment records based on a set of input records.
  117. ///
  118. /// This function processes a collection of BAM records and retrieves their primary alignments.
  119. /// When supplementary alignments are found (with SA tags), it fetches the corresponding
  120. /// primary alignments from the provided BAM file.
  121. ///
  122. /// # Arguments
  123. ///
  124. /// * `bam` - A mutable reference to an IndexedReader for the BAM file
  125. /// * `records` - A vector of input records to process
  126. ///
  127. /// # Returns
  128. ///
  129. /// A vector of records containing both:
  130. /// - Original primary alignments from the input
  131. /// - Primary alignments fetched for any supplementary records in the input
  132. ///
  133. /// # Examples
  134. ///
  135. /// ```
  136. /// use rust_htslib::bam::{IndexedReader, Record};
  137. /// let mut bam = IndexedReader::from_path("sample.bam").unwrap();
  138. /// let records = vec![/* some records */];
  139. /// let primary_alignments = primary_records(&mut bam, records);
  140. /// ```
  141. pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
  142. let mut res = Vec::with_capacity(records.len());
  143. let mut all_positions = Vec::new();
  144. let mut all_qnames_to_fetch = Vec::new();
  145. // First pass: collect primary records and positions to fetch
  146. for record in records.iter() {
  147. if record.is_supplementary() {
  148. let qname = record.qname();
  149. // Safely extract and process the SA tag
  150. if let Ok(Aux::String(sa)) = record.aux(b"SA") {
  151. let positions = parse_sa_tag(sa);
  152. all_positions.extend(positions);
  153. match String::from_utf8(qname.to_vec()) {
  154. Ok(qname_str) => all_qnames_to_fetch.push(qname_str),
  155. Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
  156. }
  157. }
  158. } else {
  159. res.push(record.clone());
  160. }
  161. }
  162. // Track which query names we've already found
  163. let mut primary_records: HashSet<String> = res
  164. .iter()
  165. .filter_map(|r| String::from_utf8(r.qname().to_vec()).ok())
  166. .collect();
  167. // Create position bins for efficient fetching
  168. let position_bins = create_position_bins(&all_positions);
  169. // Fetch records for each bin until we find all primaries
  170. for bin in position_bins {
  171. if bin.is_empty() {
  172. continue;
  173. }
  174. let (chr, start) = bin.first().unwrap();
  175. let end = bin.last().unwrap().1 + 1;
  176. // Fetch and process records in this region
  177. if let Err(e) = bam.fetch((*chr, *start, end)) {
  178. warn!("Failed to fetch region {}:{}-{}: {}", chr, start, end, e);
  179. continue;
  180. }
  181. for record_result in bam.records() {
  182. match record_result {
  183. Ok(record) => {
  184. // Skip secondary alignments
  185. if record.is_secondary() {
  186. continue;
  187. }
  188. // Check if this is a primary we're looking for
  189. match String::from_utf8(record.qname().to_vec()) {
  190. Ok(qname) => {
  191. if primary_records.contains(&qname) {
  192. continue;
  193. }
  194. if all_qnames_to_fetch.contains(&qname) {
  195. res.push(record);
  196. primary_records.insert(qname);
  197. }
  198. }
  199. Err(_) => continue,
  200. }
  201. }
  202. Err(e) => warn!("Error reading record: {}", e),
  203. }
  204. }
  205. // Early exit if we found all the records we were looking for
  206. let remaining = all_qnames_to_fetch
  207. .iter()
  208. .filter(|q| !primary_records.contains(*q))
  209. .count();
  210. if remaining == 0 {
  211. break;
  212. }
  213. }
  214. res
  215. }
  216. pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
  217. let mut genome = HashMap::new();
  218. for (key, records) in header.to_hashmap() {
  219. for record in records {
  220. if key == "SQ" {
  221. genome.insert(
  222. record["SN"].to_string(),
  223. record["LN"]
  224. .parse::<u64>()
  225. .expect("Failed parsing length of chromosomes"),
  226. );
  227. }
  228. }
  229. }
  230. Ok(genome)
  231. }
  232. /// Split the genome into N balanced region lists (`ctg:start-end` strings).
  233. /// Each element of the outer Vec is the list of `-r` regions for one batch.
  234. pub fn split_genome_into_n_regions(genome: &HashMap<String, u64>, n: usize) -> Vec<Vec<String>> {
  235. if n == 0 || genome.is_empty() {
  236. return Vec::new();
  237. }
  238. // Deterministic order over contigs
  239. let mut contigs: Vec<_> = genome.iter().collect();
  240. contigs.sort_by(|(a, _), (b, _)| a.cmp(b));
  241. let total_bp: u64 = contigs.iter().map(|(_, &len)| len).sum();
  242. let target_per_batch: u64 = total_bp.div_ceil(n as u64); // ceil
  243. let mut batches: Vec<Vec<String>> = vec![Vec::new(); n];
  244. let mut batch_idx = 0usize;
  245. let mut batch_bp = 0u64;
  246. for (ctg, &len) in contigs {
  247. let mut start: u64 = 1;
  248. while start <= len && batch_idx < n {
  249. let remaining_in_ctg = len - start + 1;
  250. let remaining_for_batch = if batch_bp >= target_per_batch {
  251. 1 // force close & move to next batch
  252. } else {
  253. target_per_batch - batch_bp
  254. };
  255. let seg_len = remaining_in_ctg.min(remaining_for_batch);
  256. let end = start + seg_len - 1;
  257. let region = format!("{ctg}:{start}-{end}");
  258. batches[batch_idx].push(region);
  259. batch_bp += seg_len;
  260. start = end + 1;
  261. if batch_bp >= target_per_batch && batch_idx + 1 < n {
  262. batch_idx += 1;
  263. batch_bp = 0;
  264. }
  265. }
  266. }
  267. batches
  268. }
  269. /// Indicates which alignment segment appears first on the read.
  270. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
  271. pub enum SegmentOrder {
  272. /// Primary alignment appears before supplementary on the read
  273. PrimaryFirst,
  274. /// Supplementary alignment appears before primary on the read
  275. SuppFirst,
  276. }
  277. /// Fold-back inversion event detected from a primary/supplementary alignment pair.
  278. /// Thanks to: https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py
  279. ///
  280. /// A fold-back inversion occurs when a read has two alignments on the same chromosome
  281. /// with opposite strands that overlap on the reference. This pattern suggests the
  282. /// presence of an inverted duplication or fold-back structure.
  283. ///
  284. /// # Coordinate System
  285. /// - All coordinates are 0-based, half-open `[start, end)`
  286. /// - Alignments A and B are ordered by their position on the read (A first)
  287. /// - Breakpoint distances (`b_c`, `a_d`) use strand-aware coordinates
  288. ///
  289. /// # Breakpoints
  290. /// Breakpoints (a, b, c, d) are strand-aware positions representing alignment termini:
  291. /// - For `+` strand: `a = start` (5'), `b = end` (3')
  292. /// - For `-` strand: `c = start` (3'), `d = end` (5')
  293. ///
  294. /// ```text
  295. /// Case: A on '+' strand, B on '-' strand
  296. ///
  297. /// a b
  298. /// | |
  299. /// v v
  300. /// Reference:---[=====A=================>]---------
  301. /// --------[<=============B======]-------
  302. /// ^ ^
  303. /// | |
  304. /// c d
  305. ///
  306. /// Alignment A (+ strand): a=aln_a_start, b=aln_a_end
  307. /// Alignment B (- strand): c=aln_b_start, d=aln_b_end
  308. ///
  309. /// Distances:
  310. /// aln_a_len = |a - b| = alignment A length
  311. /// aln_b_len = |c - d| = alignment B length
  312. /// b_c = |b - c| = inner breakpoint distance (3' ends)
  313. /// a_d = |a - d| = outer breakpoint distance (5' ends)
  314. /// ```
  315. #[derive(Debug)]
  316. pub struct FbInv {
  317. /// Read name (QNAME from BAM)
  318. pub read_name: String,
  319. /// Chromosome/contig name
  320. pub ref_name: String,
  321. /// Alignment A reference start (0-based)
  322. pub aln_a_start: i64,
  323. /// Alignment A reference end (exclusive)
  324. pub aln_a_end: i64,
  325. /// Alignment A strand ('+' or '-')
  326. pub aln_a_strand: char,
  327. /// Alignment B reference start (0-based)
  328. pub aln_b_start: i64,
  329. /// Alignment B reference end (exclusive)
  330. pub aln_b_end: i64,
  331. /// Alignment B strand ('+' or '-')
  332. pub aln_b_strand: char,
  333. /// Alignment A length on reference (bp)
  334. pub aln_a_len: i64,
  335. /// Alignment B length on reference (bp)
  336. pub aln_b_len: i64,
  337. /// Distance between breakpoints B and C (strand-aware)
  338. pub b_c: i64,
  339. /// Distance between breakpoints A and D (strand-aware)
  340. pub a_d: i64,
  341. /// Which alignment appears first on the read
  342. pub first: SegmentOrder,
  343. /// Primary alignment query start on read
  344. pub primary_qbeg: u32,
  345. /// Primary alignment query end on read
  346. pub primary_qend: u32,
  347. /// Supplementary alignment query start on read
  348. pub sa_qbeg: u32,
  349. /// Supplementary alignment query end on read
  350. pub sa_qend: u32,
  351. }
  352. /// Parse a SAM CIGAR string into a vector of CIGAR operations.
  353. ///
  354. /// Supports all standard CIGAR operations: M, I, D, N, S, H, P, =, X
  355. ///
  356. /// # Arguments
  357. /// * `s` - CIGAR string (e.g., "10S100M5I50M20S")
  358. ///
  359. /// # Returns
  360. /// `Some(Vec<Cigar>)` if valid, `None` if malformed or empty
  361. ///
  362. /// # Examples
  363. /// ```
  364. /// let ops = parse_cigar_str("10S100M").unwrap();
  365. /// assert_eq!(ops.len(), 2);
  366. /// ```
  367. pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
  368. if s.is_empty() {
  369. return None;
  370. }
  371. // Pre-allocate: rough estimate ~1 op per 2-3 chars
  372. let mut ops = Vec::with_capacity(s.len() / 2);
  373. let mut num = String::new();
  374. for c in s.chars() {
  375. if c.is_ascii_digit() {
  376. num.push(c);
  377. continue;
  378. }
  379. // We hit an operator letter: parse the accumulated number
  380. if num.is_empty() {
  381. return None; // operator without length
  382. }
  383. let len: u32 = num.parse().ok()?;
  384. num.clear();
  385. let op = match c {
  386. 'M' => Cigar::Match(len),
  387. 'I' => Cigar::Ins(len),
  388. 'D' => Cigar::Del(len),
  389. 'N' => Cigar::RefSkip(len),
  390. 'S' => Cigar::SoftClip(len),
  391. 'H' => Cigar::HardClip(len),
  392. 'P' => Cigar::Pad(len),
  393. '=' => Cigar::Equal(len),
  394. 'X' => Cigar::Diff(len),
  395. _ => return None, // unknown operator
  396. };
  397. ops.push(op);
  398. }
  399. // CIGAR ending with digits only is invalid
  400. if !num.is_empty() {
  401. return None;
  402. }
  403. // Validation: CIGAR must have at least one operation
  404. if ops.is_empty() {
  405. return None;
  406. }
  407. Some(ops)
  408. }
  409. /// Compute query coordinates (start, end) on the read from a CIGAR.
  410. ///
  411. /// For plus strand, the first CIGAR operation corresponds to the read start.
  412. /// For minus strand, the last CIGAR operation corresponds to the read start.
  413. ///
  414. /// # Arguments
  415. /// * `ops` - CIGAR operations slice
  416. /// * `strand` - '+' or '-'
  417. ///
  418. /// # Returns
  419. /// Tuple of (query_begin, query_end) in read coordinates (excluding hard clips)
  420. fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
  421. if ops.is_empty() {
  422. return (0, 0);
  423. }
  424. // For minus strand, the "first" op on the read is the last in the CIGAR
  425. let first_op = if strand == '+' {
  426. &ops[0]
  427. } else {
  428. ops.last().unwrap()
  429. };
  430. let query_beg = match first_op {
  431. Cigar::SoftClip(len) => *len,
  432. _ => 0,
  433. };
  434. // Alignment length on the read (consumes query bases)
  435. let aln_len: u32 = ops
  436. .iter()
  437. .filter_map(|op| match op {
  438. Cigar::Match(l) | Cigar::Ins(l) | Cigar::Equal(l) | Cigar::Diff(l) => Some(*l),
  439. _ => None,
  440. })
  441. .sum();
  442. (query_beg, query_beg + aln_len)
  443. }
  444. /// Compute alignment length on the reference from CIGAR operations.
  445. ///
  446. /// Counts bases consumed by M, D, N, =, X operations.
  447. ///
  448. /// # Arguments
  449. /// * `ops` - CIGAR operations slice
  450. ///
  451. /// # Returns
  452. /// Reference-consuming length in base pairs
  453. fn alignment_ref_length(ops: &[Cigar]) -> u32 {
  454. ops.iter()
  455. .filter_map(|op| match op {
  456. Cigar::Match(l)
  457. | Cigar::Del(l)
  458. | Cigar::RefSkip(l)
  459. | Cigar::Equal(l)
  460. | Cigar::Diff(l) => Some(*l),
  461. _ => None,
  462. })
  463. .sum()
  464. }
  465. /// Compute gap between two intervals. Negative value means overlap.
  466. ///
  467. /// # Arguments
  468. /// * `a_start`, `a_end` - First interval `[a_start, a_end)`
  469. /// * `b_start`, `b_end` - Second interval `[b_start, b_end)`
  470. ///
  471. /// # Returns
  472. /// Gap in bp (negative if overlapping, 0 if adjacent, positive if separated)
  473. #[inline]
  474. fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 {
  475. a_start.max(b_start) - a_end.min(b_end)
  476. }
  477. /// Compute strand-aware breakpoint coordinates.
  478. ///
  479. /// For fold-back inversions, breakpoints depend on strand orientation:
  480. /// - Plus strand: breakpoints are (start, end) - 5' to 3' direction
  481. /// - Minus strand: breakpoints are (end, start) - reversed direction
  482. ///
  483. /// # Arguments
  484. /// * `start` - Alignment start (0-based)
  485. /// * `end` - Alignment end (exclusive)
  486. /// * `strand` - '+' or '-'
  487. ///
  488. /// # Returns
  489. /// Tuple of (breakpoint_a, breakpoint_b) in strand-aware order
  490. #[inline]
  491. fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
  492. if strand == '+' {
  493. (start, end)
  494. } else {
  495. (end, start)
  496. }
  497. }
  498. /// Extract a fold-back inversion event from a BAM record.
  499. ///
  500. /// Analyzes a primary alignment and its supplementary alignment (from SA tag)
  501. /// to detect fold-back inversion patterns characterized by:
  502. /// - Same chromosome
  503. /// - Opposite strands
  504. /// - Reference coordinates within proximity threshold
  505. /// - Exactly one supplementary alignment
  506. ///
  507. /// # Arguments
  508. /// * `record` - BAM record (must be primary alignment)
  509. /// * `header` - BAM header for reference name lookup
  510. /// * `min_mapq` - Minimum mapping quality threshold for both alignments
  511. /// * `max_overlap` - Maximum allowed overlap in bp (None = no limit)
  512. /// * `max_gap` - Maximum allowed gap in bp (None = no limit, 0 = require overlap or adjacent)
  513. /// default should be [-150, 200) Some(150), Some(200)
  514. ///
  515. /// # Returns
  516. /// `Some(FbInv)` if a valid fold-back pattern is detected, `None` otherwise
  517. ///
  518. /// # Filtering
  519. /// Returns `None` if any of these conditions are met:
  520. /// - Record is unmapped, secondary, or supplementary
  521. /// - MAPQ below threshold (primary or supplementary)
  522. /// - No SA tag or multiple supplementary alignments
  523. /// - Supplementary on different chromosome
  524. /// - Same strand orientation
  525. /// - Overlap exceeds `max_overlap` (if set)
  526. /// - Gap exceeds `max_gap` (if set)
  527. pub fn fb_inv_from_record(
  528. record: &bam::Record,
  529. header: &bam::HeaderView,
  530. min_mapq: u8,
  531. max_overlap: Option<i64>,
  532. max_gap: Option<i64>,
  533. ) -> Option<FbInv> {
  534. // Basic filters (primary only, mapped, MAPQ)
  535. if record.is_unmapped()
  536. || record.is_secondary()
  537. || record.is_supplementary()
  538. || record.mapq() < min_mapq
  539. {
  540. return None;
  541. }
  542. // SA tag required
  543. let sa_str = match record.aux(b"SA") {
  544. Ok(Aux::String(s)) => s,
  545. _ => return None,
  546. };
  547. let sa_entries: Vec<&str> = sa_str.split(';').filter(|s| !s.is_empty()).collect();
  548. // Require exactly one SA (like the Python code)
  549. if sa_entries.len() != 1 {
  550. return None;
  551. }
  552. let sa = sa_entries[0];
  553. // SA format: chr,pos,strand,cigar,mapq,nm
  554. let fields: Vec<&str> = sa.split(',').collect();
  555. if fields.len() < 6 {
  556. return None;
  557. }
  558. let sa_rname = fields[0];
  559. let sa_pos_1based: i64 = fields[1].parse().ok()?; // SAM is 1-based
  560. let sa_strand_char: char = fields[2].chars().next()?;
  561. let sa_cigar_str = fields[3];
  562. let sa_mapq: u8 = fields[4].parse().ok()?;
  563. if sa_mapq < min_mapq {
  564. return None;
  565. }
  566. // Same chromosome: compare SA rname with primary reference name
  567. let tid = record.tid();
  568. if tid < 0 {
  569. return None;
  570. }
  571. let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?;
  572. if sa_rname != primary_rname {
  573. return None;
  574. }
  575. // Strand check: need opposite strands for fold-back inversion
  576. let primary_strand_char = if record.is_reverse() { '-' } else { '+' };
  577. if primary_strand_char == sa_strand_char {
  578. return None;
  579. }
  580. // Primary ref coords (0-based, half-open)
  581. let primary_start = record.pos();
  582. let primary_end = record.cigar().end_pos();
  583. // Parse SA CIGAR
  584. let sa_cigar_ops = parse_cigar_str(sa_cigar_str)?;
  585. let sa_ref_len = alignment_ref_length(&sa_cigar_ops);
  586. // Validation: SA must have non-zero reference length
  587. if sa_ref_len == 0 {
  588. return None;
  589. }
  590. // SA ref coords: convert POS from 1-based to 0-based
  591. let sa_start = sa_pos_1based - 1;
  592. let sa_end = sa_start + sa_ref_len as i64;
  593. // Check proximity between alignments
  594. // gap < 0: overlap of |gap| bp
  595. // gap = 0: adjacent (touching)
  596. // gap > 0: separated by gap bp
  597. let gap = interval_gap(primary_start, primary_end, sa_start, sa_end);
  598. // Check max_overlap: reject if overlap exceeds limit
  599. if let Some(max_ovl) = max_overlap {
  600. if gap < 0 && gap.abs() > max_ovl {
  601. return None;
  602. }
  603. }
  604. // Check max_gap: reject if gap exceeds limit
  605. if let Some(max_g) = max_gap {
  606. if gap > max_g {
  607. return None;
  608. }
  609. }
  610. // Query coords for primary and SA
  611. let primary_cigar: Vec<Cigar> = record.cigar().iter().cloned().collect();
  612. let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char);
  613. let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa_strand_char);
  614. // Decide which segment is first on the read
  615. let first = if primary_qbeg <= sa_qbeg {
  616. SegmentOrder::PrimaryFirst
  617. } else {
  618. SegmentOrder::SuppFirst
  619. };
  620. // Build FbInv with A/B ordered by read position
  621. let (aln_a_start, aln_a_end, aln_a_strand, aln_b_start, aln_b_end, aln_b_strand) =
  622. if first == SegmentOrder::PrimaryFirst {
  623. (
  624. primary_start,
  625. primary_end,
  626. primary_strand_char,
  627. sa_start,
  628. sa_end,
  629. sa_strand_char,
  630. )
  631. } else {
  632. (
  633. sa_start,
  634. sa_end,
  635. sa_strand_char,
  636. primary_start,
  637. primary_end,
  638. primary_strand_char,
  639. )
  640. };
  641. // Distances a_b, c_d, b_c, a_d (like the Python class)
  642. let (a, b) = breakpoints(aln_a_start, aln_a_end, aln_a_strand);
  643. let (c, d) = breakpoints(aln_b_start, aln_b_end, aln_b_strand);
  644. let aln_a_len = (a - b).abs();
  645. let aln_b_len = (c - d).abs();
  646. let b_c = (b - c).abs();
  647. let a_d = (a - d).abs();
  648. let read_name = std::str::from_utf8(record.qname())
  649. .map(|s| s.to_string())
  650. .unwrap_or_else(|_| String::from_utf8_lossy(record.qname()).into_owned());
  651. let ref_name = primary_rname.to_string();
  652. Some(FbInv {
  653. read_name,
  654. ref_name,
  655. aln_a_start,
  656. aln_a_end,
  657. aln_a_strand,
  658. aln_b_start,
  659. aln_b_end,
  660. aln_b_strand,
  661. aln_a_len,
  662. aln_b_len,
  663. b_c,
  664. a_d,
  665. first,
  666. primary_qbeg,
  667. primary_qend,
  668. sa_qbeg,
  669. sa_qend,
  670. })
  671. }
  672. pub fn read_sm_tag(bam_path: &str) -> anyhow::Result<String> {
  673. let reader = bam::Reader::from_path(bam_path)
  674. .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
  675. let header = bam::Header::from_template(reader.header());
  676. let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
  677. for line in header_text.lines() {
  678. if line.starts_with("@RG") {
  679. for field in line.split('\t') {
  680. if let Some(sm) = field.strip_prefix("SM:") {
  681. return Ok(sm.to_string());
  682. }
  683. }
  684. }
  685. }
  686. anyhow::bail!("No SM tag found in @RG header of {bam_path}")
  687. }