bam.rs 37 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123
  1. use std::io::Write;
  2. use std::{
  3. collections::{HashMap, HashSet},
  4. fs::File,
  5. path::Path,
  6. };
  7. use anyhow::Context;
  8. use log::{info, warn};
  9. use rust_htslib::bam::{
  10. self,
  11. ext::BamRecordExtensions,
  12. record::{Aux, Cigar},
  13. FetchDefinition, IndexedReader, Read, Record,
  14. };
  15. use crate::{commands::samtools::SamtoolsReheader, config::Config, runners::Run};
  16. /// A single entry from a BAM SA (supplementary alignment) tag.
  17. ///
  18. /// SA tag format per the SAM spec: `rname,pos,strand,CIGAR,mapQ,NM` (semicolon-separated).
  19. /// `pos` is stored as 0-based after conversion from the 1-based SAM value.
  20. pub struct SaEntry<'a> {
  21. /// Reference/contig name
  22. pub chr: &'a str,
  23. /// Reference start position, 0-based
  24. pub pos: i32,
  25. /// Strand: `'+'` or `'-'`
  26. pub strand: char,
  27. /// CIGAR string (unparsed)
  28. pub cigar: &'a str,
  29. /// Mapping quality
  30. pub mapq: u8,
  31. /// Edit distance (NM tag value); `0` if absent or unparseable
  32. pub nm: u32,
  33. }
  34. /// Parse all entries from a BAM SA tag value into [`SaEntry`] structs.
  35. ///
  36. /// `pos` is converted from 1-based (SAM spec) to 0-based. Entries with fewer than
  37. /// six comma-separated fields or unparseable numeric fields are silently skipped.
  38. ///
  39. /// # Arguments
  40. ///
  41. /// * `sa` - Raw SA tag value (the string after `SA:Z:` in a SAM record)
  42. ///
  43. /// # Returns
  44. ///
  45. /// A vector of parsed entries. Empty if the tag contains no valid entries.
  46. pub fn parse_sa_entries(sa: &str) -> Vec<SaEntry<'_>> {
  47. sa.split(';')
  48. .filter(|s| !s.is_empty())
  49. .filter_map(|s| {
  50. let f: Vec<&str> = s.splitn(6, ',').collect();
  51. if f.len() < 6 {
  52. return None;
  53. }
  54. Some(SaEntry {
  55. chr: f[0],
  56. pos: f[1].parse::<i32>().ok()? - 1,
  57. strand: f[2].chars().next()?,
  58. cigar: f[3],
  59. mapq: f[4].parse().ok()?,
  60. nm: f[5].parse().unwrap_or(0),
  61. })
  62. })
  63. .collect()
  64. }
  65. /// Convenience wrapper over [`parse_sa_entries`] returning only `(chr, pos)` pairs.
  66. ///
  67. /// `pos` is 0-based. Useful when only fetch coordinates are needed.
  68. pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
  69. parse_sa_entries(sa)
  70. .into_iter()
  71. .map(|e| (e.chr, e.pos))
  72. .collect()
  73. }
  74. /// Resolve a supplementary alignment to its primary record via the SA tag.
  75. ///
  76. /// For supplementary alignments, each position listed in the SA tag is fetched and
  77. /// scanned for a non-supplementary record with the same query name. The first match
  78. /// is returned. If the record is not supplementary, or no primary is found across all
  79. /// SA positions, the original record is returned unchanged.
  80. ///
  81. /// # Arguments
  82. ///
  83. /// * `bam` - Indexed BAM reader used for random-access fetches
  84. /// * `record` - The record to resolve (typically a supplementary alignment)
  85. ///
  86. /// # Returns
  87. ///
  88. /// The primary alignment record, or the input record if resolution is not applicable
  89. /// or unsuccessful.
  90. ///
  91. /// # Errors
  92. ///
  93. /// Returns an error if a BAM fetch operation fails or if a fetched record is malformed.
  94. pub fn primary_record(bam: &mut IndexedReader, record: Record) -> anyhow::Result<Record> {
  95. if !record.is_supplementary() {
  96. return Ok(record);
  97. }
  98. let qname = record.qname();
  99. if let Ok(Aux::String(sa)) = record.aux(b"SA") {
  100. for (chr, start) in parse_sa_tag(sa) {
  101. bam.fetch((chr, start, start + 1))
  102. .with_context(|| format!("BAM fetch failed at {chr}:{start}"))?;
  103. for result in bam.records() {
  104. let candidate = result.context("Invalid BAM record")?;
  105. if candidate.qname() == qname
  106. && !candidate.is_supplementary()
  107. && !candidate.is_secondary()
  108. {
  109. return Ok(candidate);
  110. }
  111. }
  112. }
  113. }
  114. Ok(record)
  115. }
  116. /// Group 0-based positions by chromosome into bins spanning at most 1000 bp, sorted by bin size descending.
  117. ///
  118. /// Positions within 1000 bp of the first position in the current bin are merged into
  119. /// that bin. Larger bins are sorted first so that high-density regions are fetched
  120. /// before sparse ones, enabling earlier early-exit in callers.
  121. fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
  122. let mut sorted_positions = positions.to_vec();
  123. sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
  124. sorted_positions.dedup();
  125. let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
  126. for (chr, pos) in sorted_positions {
  127. let bins = grouped.entry(chr).or_default();
  128. if let Some(last_bin) = bins.last_mut() {
  129. if last_bin.is_empty() || pos - last_bin[0].1 <= 1000 {
  130. last_bin.push((chr, pos));
  131. } else {
  132. bins.push(vec![(chr, pos)]);
  133. }
  134. } else {
  135. bins.push(vec![(chr, pos)]);
  136. }
  137. }
  138. let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values().flatten().cloned().collect();
  139. flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
  140. flattened
  141. }
  142. /// Resolve supplementary alignments in a record set to their primary records.
  143. ///
  144. /// Primary and secondary records are separated in a first pass. For supplementary
  145. /// records with an SA tag, the SA positions are binned by chromosome and fetched in
  146. /// bulk. Fetching stops as soon as all target query names have been found (early exit).
  147. /// Records that cannot be resolved are silently dropped with a warning.
  148. ///
  149. /// # Arguments
  150. ///
  151. /// * `bam` - Indexed BAM reader used for random-access fetches
  152. /// * `records` - Mixed set of primary, secondary, and supplementary records
  153. ///
  154. /// # Returns
  155. ///
  156. /// A vector containing original non-supplementary records from the input plus any
  157. /// primaries fetched for supplementary inputs. Secondary records in the input are
  158. /// passed through unchanged; secondary records encountered during SA-position fetching
  159. /// are skipped. Order is not guaranteed.
  160. pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
  161. let mut res = Vec::with_capacity(records.len());
  162. let mut all_positions = Vec::new();
  163. let mut all_qnames_to_fetch = HashSet::new();
  164. // First pass: collect primary records and positions to fetch
  165. for record in records.iter() {
  166. if record.is_supplementary() {
  167. let qname = record.qname();
  168. // Safely extract and process the SA tag
  169. if let Ok(Aux::String(sa)) = record.aux(b"SA") {
  170. let positions = parse_sa_tag(sa);
  171. all_positions.extend(positions);
  172. match String::from_utf8(qname.to_vec()) {
  173. Ok(qname_str) => {
  174. all_qnames_to_fetch.insert(qname_str);
  175. }
  176. Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
  177. }
  178. }
  179. } else {
  180. res.push(record.clone());
  181. }
  182. }
  183. // Track which query names we've already found
  184. let mut primary_records: HashSet<String> = res
  185. .iter()
  186. .filter_map(|r| String::from_utf8(r.qname().to_vec()).ok())
  187. .collect();
  188. // Create position bins for efficient fetching
  189. let position_bins = create_position_bins(&all_positions);
  190. // Fetch records for each bin until we find all primaries
  191. for bin in position_bins {
  192. if bin.is_empty() {
  193. continue;
  194. }
  195. let (chr, start) = bin.first().unwrap();
  196. let end = bin.last().unwrap().1 + 1;
  197. // Fetch and process records in this region
  198. if let Err(e) = bam.fetch((*chr, *start, end)) {
  199. warn!("Failed to fetch region {}:{}-{}: {}", chr, start, end, e);
  200. continue;
  201. }
  202. for record_result in bam.records() {
  203. match record_result {
  204. Ok(record) => {
  205. // Skip secondary alignments
  206. if record.is_secondary() {
  207. continue;
  208. }
  209. // Check if this is a primary we're looking for
  210. match String::from_utf8(record.qname().to_vec()) {
  211. Ok(qname) => {
  212. if primary_records.contains(&qname) {
  213. continue;
  214. }
  215. if all_qnames_to_fetch.contains(&qname) {
  216. res.push(record);
  217. primary_records.insert(qname);
  218. }
  219. }
  220. Err(_) => continue,
  221. }
  222. }
  223. Err(e) => warn!("Error reading record: {}", e),
  224. }
  225. }
  226. // Early exit if we found all the records we were looking for
  227. let remaining = all_qnames_to_fetch
  228. .iter()
  229. .filter(|q| !primary_records.contains(*q))
  230. .count();
  231. if remaining == 0 {
  232. break;
  233. }
  234. }
  235. res
  236. }
  237. /// Extract contig names and lengths from a BAM header.
  238. ///
  239. /// Reads all `@SQ` lines and returns a map of sequence name (`SN`) to sequence
  240. /// length (`LN`).
  241. ///
  242. /// # Arguments
  243. ///
  244. /// * `header` - BAM header to parse
  245. ///
  246. /// # Returns
  247. ///
  248. /// A map from contig name to length in base pairs.
  249. ///
  250. /// # Errors
  251. ///
  252. /// Returns an error if the `LN` field of any `@SQ` line cannot be parsed as `u64`.
  253. pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
  254. let mut genome = HashMap::new();
  255. for (key, records) in header.to_hashmap() {
  256. for record in records {
  257. if key == "SQ" {
  258. genome.insert(
  259. record["SN"].to_string(),
  260. record["LN"].parse::<u64>().with_context(|| {
  261. format!("Failed to parse LN for contig {}", record["SN"])
  262. })?,
  263. );
  264. }
  265. }
  266. }
  267. Ok(genome)
  268. }
  269. /// Partition a genome into `n` balanced batches of samtools-style region strings.
  270. ///
  271. /// Contigs are sorted deterministically and sliced into segments so that each batch
  272. /// covers approximately `total_bp / n` bases. A contig may be split across batches.
  273. /// Region strings use 1-based inclusive coordinates (`ctg:start-end`) as expected
  274. /// by samtools `-r`.
  275. ///
  276. /// # Arguments
  277. ///
  278. /// * `genome` - Map of contig name to length in bp, typically from [`get_genome_sizes`]
  279. /// * `n` - Number of batches to produce
  280. ///
  281. /// # Returns
  282. ///
  283. /// A vector of `n` batches, each containing the region strings assigned to that batch.
  284. /// Returns an empty vector if `n` is zero or `genome` is empty.
  285. pub fn split_genome_into_n_regions(genome: &HashMap<String, u64>, n: usize) -> Vec<Vec<String>> {
  286. if n == 0 || genome.is_empty() {
  287. return Vec::new();
  288. }
  289. // Deterministic order over contigs
  290. let mut contigs: Vec<_> = genome.iter().collect();
  291. contigs.sort_by(|(a, _), (b, _)| a.cmp(b));
  292. let total_bp: u64 = contigs.iter().map(|(_, &len)| len).sum();
  293. let target_per_batch: u64 = total_bp.div_ceil(n as u64); // ceil
  294. let mut batches: Vec<Vec<String>> = vec![Vec::new(); n];
  295. let mut batch_idx = 0usize;
  296. let mut batch_bp = 0u64;
  297. for (ctg, &len) in contigs {
  298. let mut start: u64 = 1;
  299. while start <= len && batch_idx < n {
  300. let remaining_in_ctg = len - start + 1;
  301. let remaining_for_batch = if batch_bp >= target_per_batch {
  302. 1 // force close & move to next batch
  303. } else {
  304. target_per_batch - batch_bp
  305. };
  306. let seg_len = remaining_in_ctg.min(remaining_for_batch);
  307. let end = start + seg_len - 1;
  308. let region = format!("{ctg}:{start}-{end}");
  309. batches[batch_idx].push(region);
  310. batch_bp += seg_len;
  311. start = end + 1;
  312. if batch_bp >= target_per_batch && batch_idx + 1 < n {
  313. batch_idx += 1;
  314. batch_bp = 0;
  315. }
  316. }
  317. }
  318. batches
  319. }
  320. /// Indicates which alignment segment appears first on the read.
  321. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
  322. pub enum SegmentOrder {
  323. /// Primary alignment appears before supplementary on the read
  324. PrimaryFirst,
  325. /// Supplementary alignment appears before primary on the read
  326. SuppFirst,
  327. }
  328. /// Fold-back inversion event detected from a primary/supplementary alignment pair.
  329. ///
  330. /// A fold-back inversion occurs when a single read produces two alignments on the same
  331. /// chromosome with opposite strand orientations. The read travels forward (alignment A),
  332. /// then folds back and runs in reverse (alignment B), covering the same reference region
  333. /// twice — producing a signature identical to an **inverted duplication**.
  334. ///
  335. /// SA tag parsing is handled by [`parse_sa_entries`] / [`SaEntry`], which perform the
  336. /// 1-based → 0-based coordinate conversion required by the SAM specification.
  337. ///
  338. /// # Coordinate System
  339. ///
  340. /// - All coordinates are 0-based, half-open `[start, end)`
  341. /// - Alignments A and B are ordered by their position on the read (A comes first)
  342. ///
  343. /// # Breakpoints
  344. ///
  345. /// `(a, b, c, d)` are alignment termini in **read order** (5′→3′ along the read).
  346. /// `b` is the **inner** (junction-proximal) breakpoint of A; `c` is the **inner**
  347. /// breakpoint of B. `a` and `d` are the **outer** (junction-distal) endpoints.
  348. ///
  349. /// For each strand, the 5′ end of the read maps to a different reference position:
  350. ///
  351. /// | Strand | 5′ terminus | 3′ terminus |
  352. /// |--------|-------------|-------------|
  353. /// | `+` | `start` (leftmost ref pos) | `end` (rightmost ref pos) |
  354. /// | `−` | `end` (rightmost ref pos) | `start` (leftmost ref pos) |
  355. ///
  356. /// When A comes first on the read: A's 3′ terminus (`b`) and B's 5′ terminus (`c`)
  357. /// are the **inner** (junction-proximal) breakpoints. A's 5′ (`a`) and B's 3′ (`d`)
  358. /// are the **outer** endpoints.
  359. ///
  360. /// ```text
  361. /// Case: A on '+' strand, B on '−' strand
  362. ///
  363. /// a b
  364. /// | |
  365. /// v v
  366. /// Reference:---[=====A=================>]---------
  367. /// --------[<=============B======]-------
  368. /// ^ ^
  369. /// | |
  370. /// d c
  371. ///
  372. /// Alignment A (+ strand): a = aln_a_start (5' of A, outer)
  373. /// b = aln_a_end (3' of A, inner — junction-proximal)
  374. /// Alignment B (− strand): c = aln_b_end (5' of B in read order, inner — junction-proximal)
  375. /// d = aln_b_start (3' of B in read order, outer)
  376. ///
  377. /// Distances:
  378. /// aln_a_len = |a − b| = alignment A length on reference
  379. /// aln_b_len = |c − d| = alignment B length on reference
  380. /// b_c = |b − c| = inner breakpoint distance (junction gap between A and B)
  381. /// a_d = |a − d| = outer span of the fold-back structure
  382. /// ```
  383. ///
  384. /// The region `[d, b]` (where `d < b` in the canonical case) is covered by both
  385. /// alignments and constitutes the **inverted duplication**. For a perfect duplication
  386. /// `b_c = 0` and `a_d = 0`. Events with `a_d < 150 bp` are typically classified as
  387. /// **sequencing artefacts** (short hairpin/palindrome) rather than true structural variants.
  388. ///
  389. /// # Divergence from reference implementation
  390. ///
  391. /// The reference Python script requires the two alignments to **overlap** on the reference
  392. /// (`overlap > 0 bp`). This implementation is more permissive: it also accepts a small
  393. /// reference-space **gap** between the alignments (controlled by `max_gap` in
  394. /// [`fb_inv_from_record`]). Set `max_gap = Some(0)` to match the strict Python behaviour,
  395. /// which requires overlap because the artefact mechanism — ssDNA forming a hairpin and
  396. /// re-translocating back through the nanopore — guarantees both alignments cover the
  397. /// same physical DNA strand.
  398. ///
  399. /// # References
  400. ///
  401. /// Geometry, breakpoint labelling, and the `a_d < 150 bp` artefact threshold are adapted from:
  402. /// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
  403. #[derive(Debug)]
  404. pub struct FbInv {
  405. /// Read name (QNAME from BAM)
  406. pub read_name: String,
  407. /// Chromosome/contig name
  408. pub ref_name: String,
  409. /// Alignment A reference start (0-based)
  410. pub aln_a_start: i64,
  411. /// Alignment A reference end (exclusive)
  412. pub aln_a_end: i64,
  413. /// Alignment A strand ('+' or '-')
  414. pub aln_a_strand: char,
  415. /// Alignment B reference start (0-based)
  416. pub aln_b_start: i64,
  417. /// Alignment B reference end (exclusive)
  418. pub aln_b_end: i64,
  419. /// Alignment B strand ('+' or '-')
  420. pub aln_b_strand: char,
  421. /// Alignment A length on reference (bp)
  422. pub aln_a_len: i64,
  423. /// Alignment B length on reference (bp)
  424. pub aln_b_len: i64,
  425. /// `|b - c|`: read-order junction gap — distance between the 3' end of A and the 5' end of B
  426. pub b_c: i64,
  427. /// `|a - d|`: read-order outer span — distance between the 5' end of A and the 3' end of B
  428. pub a_d: i64,
  429. /// Which alignment appears first on the read
  430. pub first: SegmentOrder,
  431. /// Primary alignment query start on read
  432. pub primary_qbeg: u32,
  433. /// Primary alignment query end on read
  434. pub primary_qend: u32,
  435. /// Supplementary alignment query start on read
  436. pub sa_qbeg: u32,
  437. /// Supplementary alignment query end on read
  438. pub sa_qend: u32,
  439. }
  440. /// Parse a SAM CIGAR string into a vector of [`Cigar`] operations.
  441. ///
  442. /// All standard CIGAR operations are supported: `M`, `I`, `D`, `N`, `S`, `H`, `P`, `=`, `X`.
  443. ///
  444. /// # Arguments
  445. ///
  446. /// * `s` - CIGAR string, e.g. `"10S100M5I50M20S"`
  447. ///
  448. /// # Returns
  449. ///
  450. /// `Some(ops)` on success, `None` if the string is empty, contains an unknown operation
  451. /// character, has an operator without a preceding length, or ends with trailing digits.
  452. pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
  453. if s.is_empty() {
  454. return None;
  455. }
  456. // Pre-allocate: rough estimate ~1 op per 2-3 chars
  457. let mut ops = Vec::with_capacity(s.len() / 2);
  458. let mut num = String::new();
  459. for c in s.chars() {
  460. if c.is_ascii_digit() {
  461. num.push(c);
  462. continue;
  463. }
  464. // We hit an operator letter: parse the accumulated number
  465. if num.is_empty() {
  466. return None; // operator without length
  467. }
  468. let len: u32 = num.parse().ok()?;
  469. num.clear();
  470. let op = match c {
  471. 'M' => Cigar::Match(len),
  472. 'I' => Cigar::Ins(len),
  473. 'D' => Cigar::Del(len),
  474. 'N' => Cigar::RefSkip(len),
  475. 'S' => Cigar::SoftClip(len),
  476. 'H' => Cigar::HardClip(len),
  477. 'P' => Cigar::Pad(len),
  478. '=' => Cigar::Equal(len),
  479. 'X' => Cigar::Diff(len),
  480. _ => return None, // unknown operator
  481. };
  482. ops.push(op);
  483. }
  484. // CIGAR ending with digits only is invalid
  485. if !num.is_empty() {
  486. return None;
  487. }
  488. // Validation: CIGAR must have at least one operation
  489. if ops.is_empty() {
  490. return None;
  491. }
  492. Some(ops)
  493. }
  494. /// Compute `(query_begin, query_end)` on the read from CIGAR operations.
  495. ///
  496. /// For `+` strand the first CIGAR op is at the read start; for `-` strand the last op
  497. /// is at the read start (CIGAR is encoded in reference order regardless of strand).
  498. /// Hard-clipped bases are excluded; soft-clip length sets `query_begin`.
  499. /// Only query-consuming operations (`M`, `I`, `=`, `X`) contribute to the alignment length.
  500. ///
  501. /// # Arguments
  502. ///
  503. /// * `ops` - Slice of CIGAR operations
  504. /// * `strand` - `'+'` or `'-'`
  505. ///
  506. /// # Returns
  507. ///
  508. /// `(query_begin, query_end)` as read-coordinate offsets (0-based, hard clips excluded).
  509. /// Returns `(0, 0)` for an empty slice.
  510. fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
  511. if ops.is_empty() {
  512. return (0, 0);
  513. }
  514. let query_beg = if strand == '+' {
  515. terminal_soft_clip(ops.iter())
  516. } else {
  517. terminal_soft_clip(ops.iter().rev())
  518. };
  519. // Alignment length on the read (consumes query bases)
  520. let aln_len: u32 = ops
  521. .iter()
  522. .filter_map(|op| match op {
  523. Cigar::Match(l) | Cigar::Ins(l) | Cigar::Equal(l) | Cigar::Diff(l) => Some(*l),
  524. _ => None,
  525. })
  526. .sum();
  527. (query_beg, query_beg + aln_len)
  528. }
  529. fn terminal_soft_clip<'a>(mut ops: impl Iterator<Item = &'a Cigar>) -> u32 {
  530. ops.find_map(|op| match op {
  531. Cigar::HardClip(_) | Cigar::Pad(_) => None,
  532. Cigar::SoftClip(len) => Some(*len),
  533. _ => Some(0),
  534. })
  535. .unwrap_or(0)
  536. }
  537. /// Compute the reference-consuming length of a CIGAR.
  538. ///
  539. /// Sums lengths of operations that advance the reference coordinate:
  540. /// `M`, `D`, `N`, `=`, `X`. Insertions, soft clips, hard clips, and pads are excluded.
  541. ///
  542. /// # Arguments
  543. ///
  544. /// * `ops` - Slice of CIGAR operations
  545. ///
  546. /// # Returns
  547. ///
  548. /// Reference-consuming length in base pairs. Returns `0` for an empty slice.
  549. fn alignment_ref_length(ops: &[Cigar]) -> u32 {
  550. ops.iter()
  551. .filter_map(|op| match op {
  552. Cigar::Match(l)
  553. | Cigar::Del(l)
  554. | Cigar::RefSkip(l)
  555. | Cigar::Equal(l)
  556. | Cigar::Diff(l) => Some(*l),
  557. _ => None,
  558. })
  559. .sum()
  560. }
  561. /// Compute the gap in base pairs between two half-open intervals.
  562. ///
  563. /// Both intervals are `[start, end)`. A negative return value indicates that the
  564. /// intervals overlap by that many bases; zero means they are adjacent; positive means
  565. /// they are separated.
  566. ///
  567. /// # Arguments
  568. ///
  569. /// * `a_start`, `a_end` - First interval `[a_start, a_end)`
  570. /// * `b_start`, `b_end` - Second interval `[b_start, b_end)`
  571. ///
  572. /// # Returns
  573. ///
  574. /// `max(starts) - min(ends)`: negative for overlap, 0 for adjacency, positive for gap.
  575. #[inline]
  576. fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 {
  577. a_start.max(b_start) - a_end.min(b_end)
  578. }
  579. /// Return `(5'_terminus, 3'_terminus)` in read order for a single alignment.
  580. ///
  581. /// For `+` strand the 5' end maps to the leftmost reference position (`start`) and the
  582. /// 3' end to the rightmost (`end`). For `-` strand the read runs right-to-left on the
  583. /// reference, so 5' = `end` and 3' = `start`.
  584. ///
  585. /// # Arguments
  586. ///
  587. /// * `start` - Alignment start, 0-based inclusive
  588. /// * `end` - Alignment end, 0-based exclusive
  589. /// * `strand` - `'+'` or `'-'`
  590. ///
  591. /// # Returns
  592. ///
  593. /// `(5'_coord, 3'_coord)` where both are 0-based reference positions.
  594. #[inline]
  595. fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
  596. if strand == '+' {
  597. (start, end)
  598. } else {
  599. (end, start)
  600. }
  601. }
  602. /// Attempt to extract a fold-back inversion event from a primary BAM record.
  603. ///
  604. /// Inspects the record's SA tag using [`parse_sa_entries`] and checks whether the single
  605. /// supplementary alignment forms a fold-back inversion with the primary: same chromosome,
  606. /// opposite strand, and reference coordinates within the specified proximity thresholds.
  607. /// Returns a populated [`FbInv`] with all breakpoint metrics on success.
  608. ///
  609. /// Alignments A and B in the returned struct are ordered by their position on the read
  610. /// (A comes first). All coordinates are 0-based half-open; the 1-based SA tag `pos` is
  611. /// converted automatically by [`SaEntry`].
  612. ///
  613. /// Recommended thresholds for **artefact detection**: `max_overlap = Some(150)`,
  614. /// `max_gap = Some(0)`. Requiring overlap (`max_gap = Some(0)`) matches the reference
  615. /// Python implementation and avoids capturing true SV junctions that happen to have
  616. /// opposite-strand supplementary alignments with a reference-space gap.
  617. ///
  618. /// # Arguments
  619. ///
  620. /// * `record` - BAM record to evaluate; must be a primary alignment
  621. /// * `header` - BAM header used to resolve the reference name from `tid`
  622. /// * `min_mapq` - Minimum mapping quality required for both the primary and the supplementary
  623. /// * `max_overlap` - Maximum allowed reference-space overlap in bp (`None` = no limit)
  624. /// * `max_gap` - Maximum allowed reference-space gap in bp (`None` = no limit)
  625. ///
  626. /// # Returns
  627. ///
  628. /// `Some(FbInv)` if all criteria are met, `None` otherwise.
  629. ///
  630. /// Returns `None` when any of the following conditions apply:
  631. /// - Record is unmapped, secondary, or supplementary
  632. /// - Primary or supplementary MAPQ is below `min_mapq`
  633. /// - SA tag is absent or contains more than one entry
  634. /// - Supplementary aligns to a different chromosome
  635. /// - Both alignments are on the same strand
  636. /// - Reference-space overlap exceeds `max_overlap`
  637. /// - Reference-space gap exceeds `max_gap`
  638. ///
  639. /// # References
  640. ///
  641. /// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
  642. pub fn fb_inv_from_record(
  643. record: &bam::Record,
  644. header: &bam::HeaderView,
  645. min_mapq: u8,
  646. max_overlap: Option<i64>,
  647. max_gap: Option<i64>,
  648. ) -> Option<FbInv> {
  649. // Basic filters (primary only, mapped, MAPQ)
  650. if record.is_unmapped()
  651. || record.is_secondary()
  652. || record.is_supplementary()
  653. || record.mapq() < min_mapq
  654. {
  655. return None;
  656. }
  657. // SA tag required; exactly one supplementary alignment expected
  658. let sa_str = match record.aux(b"SA") {
  659. Ok(Aux::String(s)) => s,
  660. _ => return None,
  661. };
  662. let sa_entries = parse_sa_entries(sa_str);
  663. if sa_entries.len() != 1 {
  664. return None;
  665. }
  666. let sa = &sa_entries[0];
  667. if sa.mapq < min_mapq {
  668. return None;
  669. }
  670. // Same chromosome: compare SA rname with primary reference name
  671. let tid = record.tid();
  672. if tid < 0 {
  673. return None;
  674. }
  675. let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?;
  676. if sa.chr != primary_rname {
  677. return None;
  678. }
  679. // Strand check: need opposite strands for fold-back inversion
  680. let primary_strand_char = if record.is_reverse() { '-' } else { '+' };
  681. if primary_strand_char == sa.strand {
  682. return None;
  683. }
  684. // Primary ref coords (0-based, half-open)
  685. let primary_start = record.pos();
  686. let primary_end = record.cigar().end_pos();
  687. // Parse SA CIGAR; pos is already 0-based from SaEntry
  688. let sa_cigar_ops = parse_cigar_str(sa.cigar)?;
  689. let sa_ref_len = alignment_ref_length(&sa_cigar_ops);
  690. if sa_ref_len == 0 {
  691. return None;
  692. }
  693. let sa_start = sa.pos as i64;
  694. let sa_end = sa_start + sa_ref_len as i64;
  695. // Check proximity between alignments
  696. // gap < 0: overlap of |gap| bp
  697. // gap = 0: adjacent (touching)
  698. // gap > 0: separated by gap bp
  699. let gap = interval_gap(primary_start, primary_end, sa_start, sa_end);
  700. // Check max_overlap: reject if overlap exceeds limit
  701. if let Some(max_ovl) = max_overlap {
  702. if gap < 0 && gap.abs() > max_ovl {
  703. return None;
  704. }
  705. }
  706. // Check max_gap: reject if gap exceeds limit
  707. if let Some(max_g) = max_gap {
  708. if gap > max_g {
  709. return None;
  710. }
  711. }
  712. // Query coords for primary and SA
  713. let primary_cigar: Vec<Cigar> = record.cigar().iter().cloned().collect();
  714. let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char);
  715. let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa.strand);
  716. // Decide which segment is first on the read
  717. let first = if primary_qbeg <= sa_qbeg {
  718. SegmentOrder::PrimaryFirst
  719. } else {
  720. SegmentOrder::SuppFirst
  721. };
  722. // Build FbInv with A/B ordered by read position
  723. let (aln_a_start, aln_a_end, aln_a_strand, aln_b_start, aln_b_end, aln_b_strand) =
  724. if first == SegmentOrder::PrimaryFirst {
  725. (
  726. primary_start,
  727. primary_end,
  728. primary_strand_char,
  729. sa_start,
  730. sa_end,
  731. sa.strand,
  732. )
  733. } else {
  734. (
  735. sa_start,
  736. sa_end,
  737. sa.strand,
  738. primary_start,
  739. primary_end,
  740. primary_strand_char,
  741. )
  742. };
  743. // Distances a_b, c_d, b_c, a_d (like the Python class)
  744. let (a, b) = breakpoints(aln_a_start, aln_a_end, aln_a_strand);
  745. let (c, d) = breakpoints(aln_b_start, aln_b_end, aln_b_strand);
  746. let aln_a_len = (a - b).abs();
  747. let aln_b_len = (c - d).abs();
  748. let b_c = (b - c).abs();
  749. let a_d = (a - d).abs();
  750. let read_name = std::str::from_utf8(record.qname())
  751. .map(|s| s.to_string())
  752. .unwrap_or_else(|_| String::from_utf8_lossy(record.qname()).into_owned());
  753. let ref_name = primary_rname.to_string();
  754. Some(FbInv {
  755. read_name,
  756. ref_name,
  757. aln_a_start,
  758. aln_a_end,
  759. aln_a_strand,
  760. aln_b_start,
  761. aln_b_end,
  762. aln_b_strand,
  763. aln_a_len,
  764. aln_b_len,
  765. b_c,
  766. a_d,
  767. first,
  768. primary_qbeg,
  769. primary_qend,
  770. sa_qbeg,
  771. sa_qend,
  772. })
  773. }
  774. /// Read the `SM` tag from a BAM's `@RG` lines, injecting it if absent or inconsistent.
  775. ///
  776. /// Inspects every `@RG` line in the header. If all lines carry an identical `SM` value,
  777. /// that value is returned. If any line is missing `SM`, or if multiple distinct `SM`
  778. /// values are present, `samtools reheader` is invoked to set `SM` uniformly to
  779. /// `fallback_sample`. The file modification time is preserved after reheadering.
  780. ///
  781. /// # Arguments
  782. ///
  783. /// * `bam_path` - Path to the BAM file (must be writable if reheadering is needed)
  784. /// * `fallback_sample` - Sample name to inject when the header is missing or ambiguous
  785. /// * `config` - Pipeline configuration used to invoke [`SamtoolsReheader`]
  786. ///
  787. /// # Returns
  788. ///
  789. /// The `SM` value present in (or injected into) the header.
  790. ///
  791. /// # Errors
  792. ///
  793. /// Returns an error if the BAM cannot be opened, if reheadering fails, or if the
  794. /// header contains no `@RG` lines at all.
  795. pub fn read_sm_tag_or_inject(
  796. bam_path: &str,
  797. fallback_sample: &str,
  798. config: &Config,
  799. ) -> anyhow::Result<String> {
  800. let reader = bam::Reader::from_path(bam_path)
  801. .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
  802. let header = bam::Header::from_template(reader.header());
  803. let header_text = String::from_utf8_lossy(&header.to_bytes()).into_owned();
  804. let mut sm_values: Vec<String> = Vec::new();
  805. let mut all_have_sm = true;
  806. for line in header_text.lines() {
  807. if line.starts_with("@RG") {
  808. match line
  809. .split('\t')
  810. .find_map(|f| f.strip_prefix("SM:").map(|s| s.to_string()))
  811. {
  812. Some(sm) => sm_values.push(sm),
  813. None => all_have_sm = false,
  814. }
  815. }
  816. }
  817. // Reheader if any @RG lacks SM or if multiple distinct SM values exist.
  818. // No @RG lines at all falls through to the error at the end.
  819. let unique_sm: std::collections::HashSet<&str> = sm_values.iter().map(|s| s.as_str()).collect();
  820. let needs_reheader = !all_have_sm || unique_sm.len() > 1;
  821. if needs_reheader {
  822. if !all_have_sm {
  823. info!("Some @RG lines in {bam_path} lack SM tag — injecting SM:{fallback_sample}");
  824. }
  825. if unique_sm.len() > 1 {
  826. info!(
  827. "Multiple distinct SM values in {bam_path} ({:?}) — collapsing to SM:{fallback_sample}",
  828. unique_sm
  829. );
  830. }
  831. let original_mtime =
  832. filetime::FileTime::from_last_modification_time(&std::fs::metadata(bam_path)?);
  833. let mut reheader = SamtoolsReheader::from_config(config, bam_path, fallback_sample);
  834. reheader.run()?;
  835. // let mut index = SamtoolsIndex::from_config(config, bam_path);
  836. // index.run()?;
  837. filetime::set_file_mtime(bam_path, original_mtime)?;
  838. return Ok(fallback_sample.to_string());
  839. }
  840. // All @RG lines present and SM is uniform — return the single value
  841. sm_values
  842. .into_iter()
  843. .next()
  844. .context(format!("No @RG lines found in {bam_path}"))
  845. }
  846. /// Fetch primary alignment records from an indexed BAM, with optional region and name filtering.
  847. ///
  848. /// Supplementary alignments within the fetch region are resolved to their primaries via
  849. /// [`primary_records`]. Duplicates introduced by that resolution step are removed by
  850. /// query name. Secondary alignments are always excluded.
  851. ///
  852. /// # Arguments
  853. ///
  854. /// * `bam_path` - Path to an indexed BAM file (`.bai` index must exist)
  855. /// * `region` - If `Some((chr, start, end))`, restrict the fetch to that 0-based half-open
  856. /// interval; if `None`, the whole file is scanned
  857. /// * `qname_filter` - Optional [`QnameFilter`] to retain or exclude specific read names
  858. ///
  859. /// # Returns
  860. ///
  861. /// A deduplicated vector of primary records, optionally filtered by query name.
  862. ///
  863. /// # Errors
  864. ///
  865. /// Returns an error if the BAM cannot be opened or if the fetch operation fails.
  866. pub fn fetch_primary_records(
  867. bam_path: &Path,
  868. region: Option<(&str, i64, i64)>,
  869. qname_filter: Option<QnameFilter>,
  870. ) -> anyhow::Result<Vec<bam::Record>> {
  871. let mut reader = bam::IndexedReader::from_path(bam_path)
  872. .with_context(|| format!("Cannot open BAM: {}", bam_path.display()))?;
  873. if let Some((chr, start, end)) = region {
  874. reader
  875. .fetch((chr, start, end))
  876. .with_context(|| format!("Fetch failed for {}:{}-{}", chr, start, end))?;
  877. } else {
  878. reader
  879. .fetch(FetchDefinition::All)
  880. .context("Fetch all failed")?;
  881. }
  882. let records: Vec<bam::Record> = reader
  883. .records()
  884. .filter_map(|r| r.ok())
  885. .filter(|r| !r.is_secondary())
  886. .collect();
  887. // Resolve supplementary → primary via SA tag
  888. let records = primary_records(&mut reader, records);
  889. // Deduplicate by qname (primary_records may introduce duplicates
  890. // if a primary was already in the fetch region)
  891. let mut seen = HashSet::new();
  892. let mut records: Vec<bam::Record> = records
  893. .into_iter()
  894. .filter(|r| seen.insert(r.qname().to_vec()))
  895. .collect();
  896. // Apply qname filter
  897. if let Some(filter) = qname_filter {
  898. match filter {
  899. QnameFilter::Whitelist(names) => {
  900. records.retain(|r| names.contains(r.qname()));
  901. }
  902. QnameFilter::Blacklist(names) => {
  903. records.retain(|r| !names.contains(r.qname()));
  904. }
  905. }
  906. }
  907. Ok(records)
  908. }
  909. /// Query-name filter for [`fetch_primary_records`].
  910. pub fn query_aligned_span(record: &rust_htslib::bam::Record) -> (usize, usize) {
  911. let cigar = record.cigar();
  912. let qstart = cigar
  913. .iter()
  914. .take_while(|c| matches!(c, Cigar::SoftClip(_) | Cigar::HardClip(_)))
  915. .filter_map(|c| match c {
  916. Cigar::SoftClip(len) => Some(*len as usize),
  917. _ => None,
  918. })
  919. .sum::<usize>();
  920. let qend = record.seq_len()
  921. - cigar
  922. .iter()
  923. .rev()
  924. .take_while(|c| matches!(c, Cigar::SoftClip(_) | Cigar::HardClip(_)))
  925. .filter_map(|c| match c {
  926. Cigar::SoftClip(len) => Some(*len as usize),
  927. _ => None,
  928. })
  929. .sum::<usize>();
  930. (qstart, qend)
  931. }
  932. pub enum QnameFilter {
  933. /// Retain only records whose query name is in the set.
  934. Whitelist(HashSet<Vec<u8>>),
  935. /// Exclude records whose query name is in the set.
  936. Blacklist(HashSet<Vec<u8>>),
  937. }
  938. /// Convert a BAM/CRAM file to a BED4 file with one interval per aligned record.
  939. ///
  940. /// Each output line has the form `chrom\tstart\tend\tread_name`. Coordinates are
  941. /// 0-based half-open (`start` from [`Record::pos`], `end` from [`Record::reference_end`]),
  942. /// consistent with BED convention. Unmapped records and records with a negative `tid`
  943. /// are skipped. Secondary and supplementary alignments are **not** filtered — add
  944. /// upstream filtering if stricter selection is needed.
  945. ///
  946. /// # Arguments
  947. ///
  948. /// * `input_bam` - Path to the input BAM or CRAM file
  949. /// * `output_bed` - Path to the output BED file (created or overwritten)
  950. ///
  951. /// # Errors
  952. ///
  953. /// Returns an error if the input cannot be read, the output cannot be written, or
  954. /// if UTF-8 conversion of a reference name or query name fails.
  955. pub fn bam_to_aligned_bed<P: AsRef<Path>>(input_bam: P, output_bed: P) -> anyhow::Result<()> {
  956. let mut bam = bam::Reader::from_path(input_bam)?;
  957. let header = bam.header().to_owned();
  958. let out = File::create(output_bed)?;
  959. let mut writer = std::io::BufWriter::new(out);
  960. for result in bam.records() {
  961. let record = result?;
  962. if record.is_unmapped() {
  963. continue;
  964. }
  965. let tid = record.tid();
  966. if tid < 0 {
  967. continue;
  968. }
  969. let chrom = std::str::from_utf8(header.tid2name(tid as u32))?;
  970. // BAM positions are already 0-based.
  971. let start = record.pos();
  972. // reference_end() accounts for CIGAR operations consuming reference:
  973. // M, D, N, =, X
  974. let end = record.reference_end();
  975. if end > start {
  976. let qname = std::str::from_utf8(record.qname())?;
  977. let (qstart, qend) = query_aligned_span(&record);
  978. writeln!(
  979. writer,
  980. "{chrom}\t{start}\t{end}\t{}:{}-{}",
  981. qname, qstart, qend
  982. )?;
  983. }
  984. }
  985. Ok(())
  986. }
  987. #[cfg(test)]
  988. mod tests {
  989. use super::*;
  990. use crate::helpers::test_init;
  991. #[test]
  992. fn sm_tag() -> anyhow::Result<()> {
  993. test_init();
  994. let config = Config::default();
  995. read_sm_tag_or_inject(&config.tumoral_bam("CHALO"), "CHALO_diag", &config)?;
  996. Ok(())
  997. }
  998. #[test]
  999. fn alignment_query_coords_counts_soft_clip_inside_hard_clip() {
  1000. let ops = parse_cigar_str("5H10S90M").unwrap();
  1001. assert_eq!(alignment_query_coords(&ops, '+'), (10, 100));
  1002. let ops = parse_cigar_str("90M10S5H").unwrap();
  1003. assert_eq!(alignment_query_coords(&ops, '-'), (10, 100));
  1004. }
  1005. }