| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770 |
- use std::collections::{HashMap, HashSet};
- use anyhow::Context;
- use log::warn;
- use rust_htslib::bam::{self, record::Aux, record::Cigar, IndexedReader, Read, Record};
- /// Parses an SA tag string and extracts chromosome and position information.
- ///
- /// # Arguments
- ///
- /// * `sa` - The SA tag string to parse
- ///
- /// # Returns
- ///
- /// A vector of tuples containing chromosome names and positions
- pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
- sa.split(';')
- .filter(|s| !s.is_empty())
- .filter_map(|s| {
- let parts: Vec<&str> = s.split(',').take(2).collect();
- if parts.len() < 2 {
- return None;
- }
- let chr = parts[0];
- match parts[1].parse::<i32>() {
- Ok(position) => Some((chr, position)),
- Err(_) => None,
- }
- })
- .collect()
- }
- /// Resolves the primary record for supplementary alignments using BAM SA tag
- ///
- /// For supplementary alignments, searches linked primary records using genomic positions
- /// listed in the SA tag. Returns the first non-supplementary record with matching query name.
- ///
- /// # Arguments
- /// * `bam` - Mutable reference to indexed BAM reader for random access
- /// * `record` - Input record to evaluate (typically a supplementary alignment)
- ///
- /// # Returns
- /// - Original record if it's not supplementary
- /// - First matching primary record found via SA tag positions
- /// - Original record if no primary matches found
- ///
- /// # Panics
- /// - If SA tag format is invalid (missing fields or non-integer positions)
- /// - If BAM fetch operation fails
- ///
- /// # Example
- /// ```
- /// use rust_htslib::{bam::{IndexedReader, Read}};
- ///
- /// let mut bam = IndexedReader::from_path("input.bam").unwrap();
- /// let record = bam.records().next().unwrap().unwrap();
- /// let primary = primary_record(&mut bam, record);
- /// ```
- pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
- // Return immediately if not a supplementary alignment
- if !record.is_supplementary() {
- return record;
- }
- let qname = record.qname();
- // Process SA tag if present
- if let Ok(Aux::String(sa)) = record.aux(b"SA") {
- // Search potential primary alignments at each SA position
- for (chr, start) in parse_sa_tag(sa) {
- bam.fetch((chr, start, start + 1))
- .expect("BAM fetch failed");
- for result in bam.records() {
- let candidate = result.expect("Invalid BAM record");
- if candidate.qname() == qname && !candidate.is_supplementary() {
- return candidate.clone();
- }
- }
- }
- }
- // Fallback to original record if no primary found
- record
- }
- /// Creates optimized position bins for fetching records.
- ///
- /// Groups positions by chromosome and creates bins of ±1000 bp.
- /// Sorts bins by size (descending) to prioritize regions with more alignment hits.
- ///
- /// # Arguments
- ///
- /// * `positions` - Vector of chromosome/position tuples
- ///
- /// # Returns
- ///
- /// A vector of position bins, sorted by bin size
- fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
- // Sort positions by chromosome and position
- let mut sorted_positions = positions.to_vec();
- sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
- sorted_positions.dedup();
- // Group by chromosome and create bins of ±1000 bp
- let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
- for (chr, pos) in sorted_positions {
- let bins = grouped.entry(chr).or_default();
- if let Some(last_bin) = bins.last_mut() {
- if last_bin.is_empty() || (pos - last_bin[0].1).abs() <= 1000 {
- last_bin.push((chr, pos));
- } else {
- bins.push(vec![(chr, pos)]);
- }
- } else {
- bins.push(vec![(chr, pos)]);
- }
- }
- // Flatten and sort by bin size (descending)
- let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values().flatten().cloned().collect();
- // Sort bins by size (descending) to prioritize regions with more hits
- flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
- flattened
- }
- /// Retrieves primary alignment records based on a set of input records.
- ///
- /// This function processes a collection of BAM records and retrieves their primary alignments.
- /// When supplementary alignments are found (with SA tags), it fetches the corresponding
- /// primary alignments from the provided BAM file.
- ///
- /// # Arguments
- ///
- /// * `bam` - A mutable reference to an IndexedReader for the BAM file
- /// * `records` - A vector of input records to process
- ///
- /// # Returns
- ///
- /// A vector of records containing both:
- /// - Original primary alignments from the input
- /// - Primary alignments fetched for any supplementary records in the input
- ///
- /// # Examples
- ///
- /// ```
- /// use rust_htslib::bam::{IndexedReader, Record};
- /// let mut bam = IndexedReader::from_path("sample.bam").unwrap();
- /// let records = vec![/* some records */];
- /// let primary_alignments = primary_records(&mut bam, records);
- /// ```
- pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
- let mut res = Vec::with_capacity(records.len());
- let mut all_positions = Vec::new();
- let mut all_qnames_to_fetch = Vec::new();
- // First pass: collect primary records and positions to fetch
- for record in records.iter() {
- if record.is_supplementary() {
- let qname = record.qname();
- // Safely extract and process the SA tag
- if let Ok(Aux::String(sa)) = record.aux(b"SA") {
- let positions = parse_sa_tag(sa);
- all_positions.extend(positions);
- match String::from_utf8(qname.to_vec()) {
- Ok(qname_str) => all_qnames_to_fetch.push(qname_str),
- Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
- }
- }
- } else {
- res.push(record.clone());
- }
- }
- // Track which query names we've already found
- let mut primary_records: HashSet<String> = res
- .iter()
- .filter_map(|r| String::from_utf8(r.qname().to_vec()).ok())
- .collect();
- // Create position bins for efficient fetching
- let position_bins = create_position_bins(&all_positions);
- // Fetch records for each bin until we find all primaries
- for bin in position_bins {
- if bin.is_empty() {
- continue;
- }
- let (chr, start) = bin.first().unwrap();
- let end = bin.last().unwrap().1 + 1;
- // Fetch and process records in this region
- if let Err(e) = bam.fetch((*chr, *start, end)) {
- warn!("Failed to fetch region {}:{}-{}: {}", chr, start, end, e);
- continue;
- }
- for record_result in bam.records() {
- match record_result {
- Ok(record) => {
- // Skip secondary alignments
- if record.is_secondary() {
- continue;
- }
- // Check if this is a primary we're looking for
- match String::from_utf8(record.qname().to_vec()) {
- Ok(qname) => {
- if primary_records.contains(&qname) {
- continue;
- }
- if all_qnames_to_fetch.contains(&qname) {
- res.push(record);
- primary_records.insert(qname);
- }
- }
- Err(_) => continue,
- }
- }
- Err(e) => warn!("Error reading record: {}", e),
- }
- }
- // Early exit if we found all the records we were looking for
- let remaining = all_qnames_to_fetch
- .iter()
- .filter(|q| !primary_records.contains(*q))
- .count();
- if remaining == 0 {
- break;
- }
- }
- res
- }
- pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
- let mut genome = HashMap::new();
- for (key, records) in header.to_hashmap() {
- for record in records {
- if key == "SQ" {
- genome.insert(
- record["SN"].to_string(),
- record["LN"]
- .parse::<u64>()
- .expect("Failed parsing length of chromosomes"),
- );
- }
- }
- }
- Ok(genome)
- }
- /// Split the genome into N balanced region lists (`ctg:start-end` strings).
- /// Each element of the outer Vec is the list of `-r` regions for one batch.
- pub fn split_genome_into_n_regions(genome: &HashMap<String, u64>, n: usize) -> Vec<Vec<String>> {
- if n == 0 || genome.is_empty() {
- return Vec::new();
- }
- // Deterministic order over contigs
- let mut contigs: Vec<_> = genome.iter().collect();
- contigs.sort_by(|(a, _), (b, _)| a.cmp(b));
- let total_bp: u64 = contigs.iter().map(|(_, &len)| len).sum();
- let target_per_batch: u64 = total_bp.div_ceil(n as u64); // ceil
- let mut batches: Vec<Vec<String>> = vec![Vec::new(); n];
- let mut batch_idx = 0usize;
- let mut batch_bp = 0u64;
- for (ctg, &len) in contigs {
- let mut start: u64 = 1;
- while start <= len && batch_idx < n {
- let remaining_in_ctg = len - start + 1;
- let remaining_for_batch = if batch_bp >= target_per_batch {
- 1 // force close & move to next batch
- } else {
- target_per_batch - batch_bp
- };
- let seg_len = remaining_in_ctg.min(remaining_for_batch);
- let end = start + seg_len - 1;
- let region = format!("{ctg}:{start}-{end}");
- batches[batch_idx].push(region);
- batch_bp += seg_len;
- start = end + 1;
- if batch_bp >= target_per_batch && batch_idx + 1 < n {
- batch_idx += 1;
- batch_bp = 0;
- }
- }
- }
- batches
- }
- /// Indicates which alignment segment appears first on the read.
- #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
- pub enum SegmentOrder {
- /// Primary alignment appears before supplementary on the read
- PrimaryFirst,
- /// Supplementary alignment appears before primary on the read
- SuppFirst,
- }
- /// Fold-back inversion event detected from a primary/supplementary alignment pair.
- /// Thanks to: https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py
- ///
- /// A fold-back inversion occurs when a read has two alignments on the same chromosome
- /// with opposite strands that overlap on the reference. This pattern suggests the
- /// presence of an inverted duplication or fold-back structure.
- ///
- /// # Coordinate System
- /// - All coordinates are 0-based, half-open `[start, end)`
- /// - Alignments A and B are ordered by their position on the read (A first)
- /// - Breakpoint distances (`b_c`, `a_d`) use strand-aware coordinates
- ///
- /// # Breakpoints
- /// Breakpoints (a, b, c, d) are strand-aware positions representing alignment termini:
- /// - For `+` strand: `a = start` (5'), `b = end` (3')
- /// - For `-` strand: `c = start` (3'), `d = end` (5')
- ///
- /// ```text
- /// Case: A on '+' strand, B on '-' strand
- ///
- /// a b
- /// | |
- /// v v
- /// Reference:---[=====A=================>]---------
- /// --------[<=============B======]-------
- /// ^ ^
- /// | |
- /// c d
- ///
- /// Alignment A (+ strand): a=aln_a_start, b=aln_a_end
- /// Alignment B (- strand): c=aln_b_start, d=aln_b_end
- ///
- /// Distances:
- /// aln_a_len = |a - b| = alignment A length
- /// aln_b_len = |c - d| = alignment B length
- /// b_c = |b - c| = inner breakpoint distance (3' ends)
- /// a_d = |a - d| = outer breakpoint distance (5' ends)
- /// ```
- #[derive(Debug)]
- pub struct FbInv {
- /// Read name (QNAME from BAM)
- pub read_name: String,
- /// Chromosome/contig name
- pub ref_name: String,
- /// Alignment A reference start (0-based)
- pub aln_a_start: i64,
- /// Alignment A reference end (exclusive)
- pub aln_a_end: i64,
- /// Alignment A strand ('+' or '-')
- pub aln_a_strand: char,
- /// Alignment B reference start (0-based)
- pub aln_b_start: i64,
- /// Alignment B reference end (exclusive)
- pub aln_b_end: i64,
- /// Alignment B strand ('+' or '-')
- pub aln_b_strand: char,
- /// Alignment A length on reference (bp)
- pub aln_a_len: i64,
- /// Alignment B length on reference (bp)
- pub aln_b_len: i64,
- /// Distance between breakpoints B and C (strand-aware)
- pub b_c: i64,
- /// Distance between breakpoints A and D (strand-aware)
- pub a_d: i64,
- /// Which alignment appears first on the read
- pub first: SegmentOrder,
- /// Primary alignment query start on read
- pub primary_qbeg: u32,
- /// Primary alignment query end on read
- pub primary_qend: u32,
- /// Supplementary alignment query start on read
- pub sa_qbeg: u32,
- /// Supplementary alignment query end on read
- pub sa_qend: u32,
- }
- /// Parse a SAM CIGAR string into a vector of CIGAR operations.
- ///
- /// Supports all standard CIGAR operations: M, I, D, N, S, H, P, =, X
- ///
- /// # Arguments
- /// * `s` - CIGAR string (e.g., "10S100M5I50M20S")
- ///
- /// # Returns
- /// `Some(Vec<Cigar>)` if valid, `None` if malformed or empty
- ///
- /// # Examples
- /// ```
- /// let ops = parse_cigar_str("10S100M").unwrap();
- /// assert_eq!(ops.len(), 2);
- /// ```
- pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
- if s.is_empty() {
- return None;
- }
- // Pre-allocate: rough estimate ~1 op per 2-3 chars
- let mut ops = Vec::with_capacity(s.len() / 2);
- let mut num = String::new();
- for c in s.chars() {
- if c.is_ascii_digit() {
- num.push(c);
- continue;
- }
- // We hit an operator letter: parse the accumulated number
- if num.is_empty() {
- return None; // operator without length
- }
- let len: u32 = num.parse().ok()?;
- num.clear();
- let op = match c {
- 'M' => Cigar::Match(len),
- 'I' => Cigar::Ins(len),
- 'D' => Cigar::Del(len),
- 'N' => Cigar::RefSkip(len),
- 'S' => Cigar::SoftClip(len),
- 'H' => Cigar::HardClip(len),
- 'P' => Cigar::Pad(len),
- '=' => Cigar::Equal(len),
- 'X' => Cigar::Diff(len),
- _ => return None, // unknown operator
- };
- ops.push(op);
- }
- // CIGAR ending with digits only is invalid
- if !num.is_empty() {
- return None;
- }
- // Validation: CIGAR must have at least one operation
- if ops.is_empty() {
- return None;
- }
- Some(ops)
- }
- /// Compute query coordinates (start, end) on the read from a CIGAR.
- ///
- /// For plus strand, the first CIGAR operation corresponds to the read start.
- /// For minus strand, the last CIGAR operation corresponds to the read start.
- ///
- /// # Arguments
- /// * `ops` - CIGAR operations slice
- /// * `strand` - '+' or '-'
- ///
- /// # Returns
- /// Tuple of (query_begin, query_end) in read coordinates (excluding hard clips)
- fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
- if ops.is_empty() {
- return (0, 0);
- }
- // For minus strand, the "first" op on the read is the last in the CIGAR
- let first_op = if strand == '+' {
- &ops[0]
- } else {
- ops.last().unwrap()
- };
- let query_beg = match first_op {
- Cigar::SoftClip(len) => *len,
- _ => 0,
- };
- // Alignment length on the read (consumes query bases)
- let aln_len: u32 = ops
- .iter()
- .filter_map(|op| match op {
- Cigar::Match(l) | Cigar::Ins(l) | Cigar::Equal(l) | Cigar::Diff(l) => Some(*l),
- _ => None,
- })
- .sum();
- (query_beg, query_beg + aln_len)
- }
- /// Compute alignment length on the reference from CIGAR operations.
- ///
- /// Counts bases consumed by M, D, N, =, X operations.
- ///
- /// # Arguments
- /// * `ops` - CIGAR operations slice
- ///
- /// # Returns
- /// Reference-consuming length in base pairs
- fn alignment_ref_length(ops: &[Cigar]) -> u32 {
- ops.iter()
- .filter_map(|op| match op {
- Cigar::Match(l)
- | Cigar::Del(l)
- | Cigar::RefSkip(l)
- | Cigar::Equal(l)
- | Cigar::Diff(l) => Some(*l),
- _ => None,
- })
- .sum()
- }
- /// Compute gap between two intervals. Negative value means overlap.
- ///
- /// # Arguments
- /// * `a_start`, `a_end` - First interval `[a_start, a_end)`
- /// * `b_start`, `b_end` - Second interval `[b_start, b_end)`
- ///
- /// # Returns
- /// Gap in bp (negative if overlapping, 0 if adjacent, positive if separated)
- #[inline]
- fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 {
- a_start.max(b_start) - a_end.min(b_end)
- }
- /// Compute strand-aware breakpoint coordinates.
- ///
- /// For fold-back inversions, breakpoints depend on strand orientation:
- /// - Plus strand: breakpoints are (start, end) - 5' to 3' direction
- /// - Minus strand: breakpoints are (end, start) - reversed direction
- ///
- /// # Arguments
- /// * `start` - Alignment start (0-based)
- /// * `end` - Alignment end (exclusive)
- /// * `strand` - '+' or '-'
- ///
- /// # Returns
- /// Tuple of (breakpoint_a, breakpoint_b) in strand-aware order
- #[inline]
- fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
- if strand == '+' {
- (start, end)
- } else {
- (end, start)
- }
- }
- /// Extract a fold-back inversion event from a BAM record.
- ///
- /// Analyzes a primary alignment and its supplementary alignment (from SA tag)
- /// to detect fold-back inversion patterns characterized by:
- /// - Same chromosome
- /// - Opposite strands
- /// - Reference coordinates within proximity threshold
- /// - Exactly one supplementary alignment
- ///
- /// # Arguments
- /// * `record` - BAM record (must be primary alignment)
- /// * `header` - BAM header for reference name lookup
- /// * `min_mapq` - Minimum mapping quality threshold for both alignments
- /// * `max_overlap` - Maximum allowed overlap in bp (None = no limit)
- /// * `max_gap` - Maximum allowed gap in bp (None = no limit, 0 = require overlap or adjacent)
- /// default should be [-150, 200) Some(150), Some(200)
- ///
- /// # Returns
- /// `Some(FbInv)` if a valid fold-back pattern is detected, `None` otherwise
- ///
- /// # Filtering
- /// Returns `None` if any of these conditions are met:
- /// - Record is unmapped, secondary, or supplementary
- /// - MAPQ below threshold (primary or supplementary)
- /// - No SA tag or multiple supplementary alignments
- /// - Supplementary on different chromosome
- /// - Same strand orientation
- /// - Overlap exceeds `max_overlap` (if set)
- /// - Gap exceeds `max_gap` (if set)
- pub fn fb_inv_from_record(
- record: &bam::Record,
- header: &bam::HeaderView,
- min_mapq: u8,
- max_overlap: Option<i64>,
- max_gap: Option<i64>,
- ) -> Option<FbInv> {
- // Basic filters (primary only, mapped, MAPQ)
- if record.is_unmapped()
- || record.is_secondary()
- || record.is_supplementary()
- || record.mapq() < min_mapq
- {
- return None;
- }
- // SA tag required
- let sa_str = match record.aux(b"SA") {
- Ok(Aux::String(s)) => s,
- _ => return None,
- };
- let sa_entries: Vec<&str> = sa_str.split(';').filter(|s| !s.is_empty()).collect();
- // Require exactly one SA (like the Python code)
- if sa_entries.len() != 1 {
- return None;
- }
- let sa = sa_entries[0];
- // SA format: chr,pos,strand,cigar,mapq,nm
- let fields: Vec<&str> = sa.split(',').collect();
- if fields.len() < 6 {
- return None;
- }
- let sa_rname = fields[0];
- let sa_pos_1based: i64 = fields[1].parse().ok()?; // SAM is 1-based
- let sa_strand_char: char = fields[2].chars().next()?;
- let sa_cigar_str = fields[3];
- let sa_mapq: u8 = fields[4].parse().ok()?;
- if sa_mapq < min_mapq {
- return None;
- }
- // Same chromosome: compare SA rname with primary reference name
- let tid = record.tid();
- if tid < 0 {
- return None;
- }
- let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?;
- if sa_rname != primary_rname {
- return None;
- }
- // Strand check: need opposite strands for fold-back inversion
- let primary_strand_char = if record.is_reverse() { '-' } else { '+' };
- if primary_strand_char == sa_strand_char {
- return None;
- }
- // Primary ref coords (0-based, half-open)
- let primary_start = record.pos();
- let primary_end = record.cigar().end_pos();
- // Parse SA CIGAR
- let sa_cigar_ops = parse_cigar_str(sa_cigar_str)?;
- let sa_ref_len = alignment_ref_length(&sa_cigar_ops);
- // Validation: SA must have non-zero reference length
- if sa_ref_len == 0 {
- return None;
- }
- // SA ref coords: convert POS from 1-based to 0-based
- let sa_start = sa_pos_1based - 1;
- let sa_end = sa_start + sa_ref_len as i64;
- // Check proximity between alignments
- // gap < 0: overlap of |gap| bp
- // gap = 0: adjacent (touching)
- // gap > 0: separated by gap bp
- let gap = interval_gap(primary_start, primary_end, sa_start, sa_end);
- // Check max_overlap: reject if overlap exceeds limit
- if let Some(max_ovl) = max_overlap {
- if gap < 0 && gap.abs() > max_ovl {
- return None;
- }
- }
- // Check max_gap: reject if gap exceeds limit
- if let Some(max_g) = max_gap {
- if gap > max_g {
- return None;
- }
- }
- // Query coords for primary and SA
- let primary_cigar: Vec<Cigar> = record.cigar().iter().cloned().collect();
- let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char);
- let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa_strand_char);
- // Decide which segment is first on the read
- let first = if primary_qbeg <= sa_qbeg {
- SegmentOrder::PrimaryFirst
- } else {
- SegmentOrder::SuppFirst
- };
- // Build FbInv with A/B ordered by read position
- let (aln_a_start, aln_a_end, aln_a_strand, aln_b_start, aln_b_end, aln_b_strand) =
- if first == SegmentOrder::PrimaryFirst {
- (
- primary_start,
- primary_end,
- primary_strand_char,
- sa_start,
- sa_end,
- sa_strand_char,
- )
- } else {
- (
- sa_start,
- sa_end,
- sa_strand_char,
- primary_start,
- primary_end,
- primary_strand_char,
- )
- };
- // Distances a_b, c_d, b_c, a_d (like the Python class)
- let (a, b) = breakpoints(aln_a_start, aln_a_end, aln_a_strand);
- let (c, d) = breakpoints(aln_b_start, aln_b_end, aln_b_strand);
- let aln_a_len = (a - b).abs();
- let aln_b_len = (c - d).abs();
- let b_c = (b - c).abs();
- let a_d = (a - d).abs();
- let read_name = std::str::from_utf8(record.qname())
- .map(|s| s.to_string())
- .unwrap_or_else(|_| String::from_utf8_lossy(record.qname()).into_owned());
- let ref_name = primary_rname.to_string();
- Some(FbInv {
- read_name,
- ref_name,
- aln_a_start,
- aln_a_end,
- aln_a_strand,
- aln_b_start,
- aln_b_end,
- aln_b_strand,
- aln_a_len,
- aln_b_len,
- b_c,
- a_d,
- first,
- primary_qbeg,
- primary_qend,
- sa_qbeg,
- sa_qend,
- })
- }
- pub fn read_sm_tag(bam_path: &str) -> anyhow::Result<String> {
- let reader = bam::Reader::from_path(bam_path)
- .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
- let header = bam::Header::from_template(reader.header());
- let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
- for line in header_text.lines() {
- if line.starts_with("@RG") {
- for field in line.split('\t') {
- if let Some(sm) = field.strip_prefix("SM:") {
- return Ok(sm.to_string());
- }
- }
- }
- }
- anyhow::bail!("No SM tag found in @RG header of {bam_path}")
- }
|