|
@@ -11,113 +11,125 @@ use rust_htslib::bam::{
|
|
|
|
|
|
|
|
use crate::{commands::samtools::SamtoolsReheader, config::Config, runners::Run};
|
|
use crate::{commands::samtools::SamtoolsReheader, config::Config, runners::Run};
|
|
|
|
|
|
|
|
-/// Parses an SA tag string and extracts chromosome and position information.
|
|
|
|
|
|
|
+/// A single entry from a BAM SA (supplementary alignment) tag.
|
|
|
|
|
+///
|
|
|
|
|
+/// SA tag format per the SAM spec: `rname,pos,strand,CIGAR,mapQ,NM` (semicolon-separated).
|
|
|
|
|
+/// `pos` is stored as 0-based after conversion from the 1-based SAM value.
|
|
|
|
|
+pub struct SaEntry<'a> {
|
|
|
|
|
+ /// Reference/contig name
|
|
|
|
|
+ pub chr: &'a str,
|
|
|
|
|
+ /// Reference start position, 0-based
|
|
|
|
|
+ pub pos: i32,
|
|
|
|
|
+ /// Strand: `'+'` or `'-'`
|
|
|
|
|
+ pub strand: char,
|
|
|
|
|
+ /// CIGAR string (unparsed)
|
|
|
|
|
+ pub cigar: &'a str,
|
|
|
|
|
+ /// Mapping quality
|
|
|
|
|
+ pub mapq: u8,
|
|
|
|
|
+ /// Edit distance (NM tag value); `0` if absent or unparseable
|
|
|
|
|
+ pub nm: u32,
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+/// Parse all entries from a BAM SA tag value into [`SaEntry`] structs.
|
|
|
|
|
+///
|
|
|
|
|
+/// `pos` is converted from 1-based (SAM spec) to 0-based. Entries with fewer than
|
|
|
|
|
+/// six comma-separated fields or unparseable numeric fields are silently skipped.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
///
|
|
///
|
|
|
-/// * `sa` - The SA tag string to parse
|
|
|
|
|
|
|
+/// * `sa` - Raw SA tag value (the string after `SA:Z:` in a SAM record)
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
///
|
|
///
|
|
|
-/// A vector of tuples containing chromosome names and positions
|
|
|
|
|
-pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
|
|
|
|
|
|
|
+/// A vector of parsed entries. Empty if the tag contains no valid entries.
|
|
|
|
|
+pub fn parse_sa_entries(sa: &str) -> Vec<SaEntry<'_>> {
|
|
|
sa.split(';')
|
|
sa.split(';')
|
|
|
.filter(|s| !s.is_empty())
|
|
.filter(|s| !s.is_empty())
|
|
|
.filter_map(|s| {
|
|
.filter_map(|s| {
|
|
|
- let parts: Vec<&str> = s.split(',').take(2).collect();
|
|
|
|
|
- if parts.len() < 2 {
|
|
|
|
|
|
|
+ let f: Vec<&str> = s.splitn(6, ',').collect();
|
|
|
|
|
+ if f.len() < 6 {
|
|
|
return None;
|
|
return None;
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
- let chr = parts[0];
|
|
|
|
|
- match parts[1].parse::<i32>() {
|
|
|
|
|
- Ok(position) => Some((chr, position)),
|
|
|
|
|
- Err(_) => None,
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ Some(SaEntry {
|
|
|
|
|
+ chr: f[0],
|
|
|
|
|
+ pos: f[1].parse::<i32>().ok()? - 1,
|
|
|
|
|
+ strand: f[2].chars().next()?,
|
|
|
|
|
+ cigar: f[3],
|
|
|
|
|
+ mapq: f[4].parse().ok()?,
|
|
|
|
|
+ nm: f[5].parse().unwrap_or(0),
|
|
|
|
|
+ })
|
|
|
})
|
|
})
|
|
|
.collect()
|
|
.collect()
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Resolves the primary record for supplementary alignments using BAM SA tag
|
|
|
|
|
|
|
+/// Convenience wrapper over [`parse_sa_entries`] returning only `(chr, pos)` pairs.
|
|
|
|
|
+///
|
|
|
|
|
+/// `pos` is 0-based. Useful when only fetch coordinates are needed.
|
|
|
|
|
+pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
|
|
|
|
|
+ parse_sa_entries(sa).into_iter().map(|e| (e.chr, e.pos)).collect()
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+/// Resolve a supplementary alignment to its primary record via the SA tag.
|
|
|
///
|
|
///
|
|
|
-/// For supplementary alignments, searches linked primary records using genomic positions
|
|
|
|
|
-/// listed in the SA tag. Returns the first non-supplementary record with matching query name.
|
|
|
|
|
|
|
+/// For supplementary alignments, each position listed in the SA tag is fetched and
|
|
|
|
|
+/// scanned for a non-supplementary record with the same query name. The first match
|
|
|
|
|
+/// is returned. If the record is not supplementary, or no primary is found across all
|
|
|
|
|
+/// SA positions, the original record is returned unchanged.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
-/// * `bam` - Mutable reference to indexed BAM reader for random access
|
|
|
|
|
-/// * `record` - Input record to evaluate (typically a supplementary alignment)
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// * `bam` - Indexed BAM reader used for random-access fetches
|
|
|
|
|
+/// * `record` - The record to resolve (typically a supplementary alignment)
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
-/// - Original record if it's not supplementary
|
|
|
|
|
-/// - First matching primary record found via SA tag positions
|
|
|
|
|
-/// - Original record if no primary matches found
|
|
|
|
|
///
|
|
///
|
|
|
-/// # Panics
|
|
|
|
|
-/// - If SA tag format is invalid (missing fields or non-integer positions)
|
|
|
|
|
-/// - If BAM fetch operation fails
|
|
|
|
|
|
|
+/// The primary alignment record, or the input record if resolution is not applicable
|
|
|
|
|
+/// or unsuccessful.
|
|
|
///
|
|
///
|
|
|
-/// # Example
|
|
|
|
|
-/// ```
|
|
|
|
|
-/// use rust_htslib::{bam::{IndexedReader, Read}};
|
|
|
|
|
|
|
+/// # Errors
|
|
|
///
|
|
///
|
|
|
-/// let mut bam = IndexedReader::from_path("input.bam").unwrap();
|
|
|
|
|
-/// let record = bam.records().next().unwrap().unwrap();
|
|
|
|
|
-/// let primary = primary_record(&mut bam, record);
|
|
|
|
|
-/// ```
|
|
|
|
|
-pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
|
|
|
|
|
- // Return immediately if not a supplementary alignment
|
|
|
|
|
|
|
+/// Returns an error if a BAM fetch operation fails or if a fetched record is malformed.
|
|
|
|
|
+pub fn primary_record(bam: &mut IndexedReader, record: Record) -> anyhow::Result<Record> {
|
|
|
if !record.is_supplementary() {
|
|
if !record.is_supplementary() {
|
|
|
- return record;
|
|
|
|
|
|
|
+ return Ok(record);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
let qname = record.qname();
|
|
let qname = record.qname();
|
|
|
|
|
|
|
|
- // Process SA tag if present
|
|
|
|
|
if let Ok(Aux::String(sa)) = record.aux(b"SA") {
|
|
if let Ok(Aux::String(sa)) = record.aux(b"SA") {
|
|
|
- // Search potential primary alignments at each SA position
|
|
|
|
|
for (chr, start) in parse_sa_tag(sa) {
|
|
for (chr, start) in parse_sa_tag(sa) {
|
|
|
bam.fetch((chr, start, start + 1))
|
|
bam.fetch((chr, start, start + 1))
|
|
|
- .expect("BAM fetch failed");
|
|
|
|
|
|
|
+ .with_context(|| format!("BAM fetch failed at {chr}:{start}"))?;
|
|
|
|
|
|
|
|
for result in bam.records() {
|
|
for result in bam.records() {
|
|
|
- let candidate = result.expect("Invalid BAM record");
|
|
|
|
|
- if candidate.qname() == qname && !candidate.is_supplementary() {
|
|
|
|
|
- return candidate.clone();
|
|
|
|
|
|
|
+ let candidate = result.context("Invalid BAM record")?;
|
|
|
|
|
+ if candidate.qname() == qname && !candidate.is_supplementary() && !candidate.is_secondary() {
|
|
|
|
|
+ return Ok(candidate);
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // Fallback to original record if no primary found
|
|
|
|
|
- record
|
|
|
|
|
|
|
+ Ok(record)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Creates optimized position bins for fetching records.
|
|
|
|
|
|
|
+/// Group 0-based positions by chromosome into bins spanning at most 1000 bp, sorted by bin size descending.
|
|
|
///
|
|
///
|
|
|
-/// Groups positions by chromosome and creates bins of ±1000 bp.
|
|
|
|
|
-/// Sorts bins by size (descending) to prioritize regions with more alignment hits.
|
|
|
|
|
-///
|
|
|
|
|
-/// # Arguments
|
|
|
|
|
-///
|
|
|
|
|
-/// * `positions` - Vector of chromosome/position tuples
|
|
|
|
|
-///
|
|
|
|
|
-/// # Returns
|
|
|
|
|
-///
|
|
|
|
|
-/// A vector of position bins, sorted by bin size
|
|
|
|
|
|
|
+/// Positions within 1000 bp of the first position in the current bin are merged into
|
|
|
|
|
+/// that bin. Larger bins are sorted first so that high-density regions are fetched
|
|
|
|
|
+/// before sparse ones, enabling earlier early-exit in callers.
|
|
|
fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
|
|
fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
|
|
|
- // Sort positions by chromosome and position
|
|
|
|
|
let mut sorted_positions = positions.to_vec();
|
|
let mut sorted_positions = positions.to_vec();
|
|
|
sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
|
|
sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
|
|
|
sorted_positions.dedup();
|
|
sorted_positions.dedup();
|
|
|
|
|
|
|
|
- // Group by chromosome and create bins of ±1000 bp
|
|
|
|
|
let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
|
|
let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
|
|
|
|
|
|
|
|
for (chr, pos) in sorted_positions {
|
|
for (chr, pos) in sorted_positions {
|
|
|
let bins = grouped.entry(chr).or_default();
|
|
let bins = grouped.entry(chr).or_default();
|
|
|
|
|
|
|
|
if let Some(last_bin) = bins.last_mut() {
|
|
if let Some(last_bin) = bins.last_mut() {
|
|
|
- if last_bin.is_empty() || (pos - last_bin[0].1).abs() <= 1000 {
|
|
|
|
|
|
|
+ if last_bin.is_empty() || pos - last_bin[0].1 <= 1000 {
|
|
|
last_bin.push((chr, pos));
|
|
last_bin.push((chr, pos));
|
|
|
} else {
|
|
} else {
|
|
|
bins.push(vec![(chr, pos)]);
|
|
bins.push(vec![(chr, pos)]);
|
|
@@ -127,44 +139,33 @@ fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // Flatten and sort by bin size (descending)
|
|
|
|
|
let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values().flatten().cloned().collect();
|
|
let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values().flatten().cloned().collect();
|
|
|
-
|
|
|
|
|
- // Sort bins by size (descending) to prioritize regions with more hits
|
|
|
|
|
flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
|
|
flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
|
|
|
-
|
|
|
|
|
flattened
|
|
flattened
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Retrieves primary alignment records based on a set of input records.
|
|
|
|
|
|
|
+/// Resolve supplementary alignments in a record set to their primary records.
|
|
|
///
|
|
///
|
|
|
-/// This function processes a collection of BAM records and retrieves their primary alignments.
|
|
|
|
|
-/// When supplementary alignments are found (with SA tags), it fetches the corresponding
|
|
|
|
|
-/// primary alignments from the provided BAM file.
|
|
|
|
|
|
|
+/// Primary and secondary records are separated in a first pass. For supplementary
|
|
|
|
|
+/// records with an SA tag, the SA positions are binned by chromosome and fetched in
|
|
|
|
|
+/// bulk. Fetching stops as soon as all target query names have been found (early exit).
|
|
|
|
|
+/// Records that cannot be resolved are silently dropped with a warning.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
///
|
|
///
|
|
|
-/// * `bam` - A mutable reference to an IndexedReader for the BAM file
|
|
|
|
|
-/// * `records` - A vector of input records to process
|
|
|
|
|
|
|
+/// * `bam` - Indexed BAM reader used for random-access fetches
|
|
|
|
|
+/// * `records` - Mixed set of primary, secondary, and supplementary records
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
///
|
|
///
|
|
|
-/// A vector of records containing both:
|
|
|
|
|
-/// - Original primary alignments from the input
|
|
|
|
|
-/// - Primary alignments fetched for any supplementary records in the input
|
|
|
|
|
-///
|
|
|
|
|
-/// # Examples
|
|
|
|
|
-///
|
|
|
|
|
-/// ```
|
|
|
|
|
-/// use rust_htslib::bam::{IndexedReader, Record};
|
|
|
|
|
-/// let mut bam = IndexedReader::from_path("sample.bam").unwrap();
|
|
|
|
|
-/// let records = vec![/* some records */];
|
|
|
|
|
-/// let primary_alignments = primary_records(&mut bam, records);
|
|
|
|
|
-/// ```
|
|
|
|
|
|
|
+/// A vector containing original non-supplementary records from the input plus any
|
|
|
|
|
+/// primaries fetched for supplementary inputs. Secondary records in the input are
|
|
|
|
|
+/// passed through unchanged; secondary records encountered during SA-position fetching
|
|
|
|
|
+/// are skipped. Order is not guaranteed.
|
|
|
pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
|
|
pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
|
|
|
let mut res = Vec::with_capacity(records.len());
|
|
let mut res = Vec::with_capacity(records.len());
|
|
|
let mut all_positions = Vec::new();
|
|
let mut all_positions = Vec::new();
|
|
|
- let mut all_qnames_to_fetch = Vec::new();
|
|
|
|
|
|
|
+ let mut all_qnames_to_fetch = HashSet::new();
|
|
|
|
|
|
|
|
// First pass: collect primary records and positions to fetch
|
|
// First pass: collect primary records and positions to fetch
|
|
|
for record in records.iter() {
|
|
for record in records.iter() {
|
|
@@ -176,7 +177,7 @@ pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Rec
|
|
|
all_positions.extend(positions);
|
|
all_positions.extend(positions);
|
|
|
|
|
|
|
|
match String::from_utf8(qname.to_vec()) {
|
|
match String::from_utf8(qname.to_vec()) {
|
|
|
- Ok(qname_str) => all_qnames_to_fetch.push(qname_str),
|
|
|
|
|
|
|
+ Ok(qname_str) => { all_qnames_to_fetch.insert(qname_str); }
|
|
|
Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
|
|
Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -250,6 +251,22 @@ pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Rec
|
|
|
res
|
|
res
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/// Extract contig names and lengths from a BAM header.
|
|
|
|
|
+///
|
|
|
|
|
+/// Reads all `@SQ` lines and returns a map of sequence name (`SN`) to sequence
|
|
|
|
|
+/// length (`LN`).
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `header` - BAM header to parse
|
|
|
|
|
+///
|
|
|
|
|
+/// # Returns
|
|
|
|
|
+///
|
|
|
|
|
+/// A map from contig name to length in base pairs.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the `LN` field of any `@SQ` line cannot be parsed as `u64`.
|
|
|
pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
|
|
pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
|
|
|
let mut genome = HashMap::new();
|
|
let mut genome = HashMap::new();
|
|
|
for (key, records) in header.to_hashmap() {
|
|
for (key, records) in header.to_hashmap() {
|
|
@@ -259,7 +276,7 @@ pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<Has
|
|
|
record["SN"].to_string(),
|
|
record["SN"].to_string(),
|
|
|
record["LN"]
|
|
record["LN"]
|
|
|
.parse::<u64>()
|
|
.parse::<u64>()
|
|
|
- .expect("Failed parsing length of chromosomes"),
|
|
|
|
|
|
|
+ .with_context(|| format!("Failed to parse LN for contig {}", record["SN"]))?,
|
|
|
);
|
|
);
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -267,8 +284,22 @@ pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<Has
|
|
|
Ok(genome)
|
|
Ok(genome)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Split the genome into N balanced region lists (`ctg:start-end` strings).
|
|
|
|
|
-/// Each element of the outer Vec is the list of `-r` regions for one batch.
|
|
|
|
|
|
|
+/// Partition a genome into `n` balanced batches of samtools-style region strings.
|
|
|
|
|
+///
|
|
|
|
|
+/// Contigs are sorted deterministically and sliced into segments so that each batch
|
|
|
|
|
+/// covers approximately `total_bp / n` bases. A contig may be split across batches.
|
|
|
|
|
+/// Region strings use 1-based inclusive coordinates (`ctg:start-end`) as expected
|
|
|
|
|
+/// by samtools `-r`.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `genome` - Map of contig name to length in bp, typically from [`get_genome_sizes`]
|
|
|
|
|
+/// * `n` - Number of batches to produce
|
|
|
|
|
+///
|
|
|
|
|
+/// # Returns
|
|
|
|
|
+///
|
|
|
|
|
+/// A vector of `n` batches, each containing the region strings assigned to that batch.
|
|
|
|
|
+/// Returns an empty vector if `n` is zero or `genome` is empty.
|
|
|
pub fn split_genome_into_n_regions(genome: &HashMap<String, u64>, n: usize) -> Vec<Vec<String>> {
|
|
pub fn split_genome_into_n_regions(genome: &HashMap<String, u64>, n: usize) -> Vec<Vec<String>> {
|
|
|
if n == 0 || genome.is_empty() {
|
|
if n == 0 || genome.is_empty() {
|
|
|
return Vec::new();
|
|
return Vec::new();
|
|
@@ -324,24 +355,39 @@ pub enum SegmentOrder {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/// Fold-back inversion event detected from a primary/supplementary alignment pair.
|
|
/// Fold-back inversion event detected from a primary/supplementary alignment pair.
|
|
|
-/// Thanks to: https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py
|
|
|
|
|
///
|
|
///
|
|
|
-/// A fold-back inversion occurs when a read has two alignments on the same chromosome
|
|
|
|
|
-/// with opposite strands that overlap on the reference. This pattern suggests the
|
|
|
|
|
-/// presence of an inverted duplication or fold-back structure.
|
|
|
|
|
|
|
+/// A fold-back inversion occurs when a single read produces two alignments on the same
|
|
|
|
|
+/// chromosome with opposite strand orientations. The read travels forward (alignment A),
|
|
|
|
|
+/// then folds back and runs in reverse (alignment B), covering the same reference region
|
|
|
|
|
+/// twice — producing a signature identical to an **inverted duplication**.
|
|
|
|
|
+///
|
|
|
|
|
+/// SA tag parsing is handled by [`parse_sa_entries`] / [`SaEntry`], which perform the
|
|
|
|
|
+/// 1-based → 0-based coordinate conversion required by the SAM specification.
|
|
|
///
|
|
///
|
|
|
/// # Coordinate System
|
|
/// # Coordinate System
|
|
|
|
|
+///
|
|
|
/// - All coordinates are 0-based, half-open `[start, end)`
|
|
/// - All coordinates are 0-based, half-open `[start, end)`
|
|
|
-/// - Alignments A and B are ordered by their position on the read (A first)
|
|
|
|
|
-/// - Breakpoint distances (`b_c`, `a_d`) use strand-aware coordinates
|
|
|
|
|
|
|
+/// - Alignments A and B are ordered by their position on the read (A comes first)
|
|
|
///
|
|
///
|
|
|
/// # Breakpoints
|
|
/// # Breakpoints
|
|
|
-/// Breakpoints (a, b, c, d) are strand-aware positions representing alignment termini:
|
|
|
|
|
-/// - For `+` strand: `a = start` (5'), `b = end` (3')
|
|
|
|
|
-/// - For `-` strand: `c = start` (3'), `d = end` (5')
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// `(a, b, c, d)` are alignment termini in **read order** (5′→3′ along the read).
|
|
|
|
|
+/// `b` is the **inner** (junction-proximal) breakpoint of A; `c` is the **inner**
|
|
|
|
|
+/// breakpoint of B. `a` and `d` are the **outer** (junction-distal) endpoints.
|
|
|
|
|
+///
|
|
|
|
|
+/// For each strand, the 5′ end of the read maps to a different reference position:
|
|
|
|
|
+///
|
|
|
|
|
+/// | Strand | 5′ terminus | 3′ terminus |
|
|
|
|
|
+/// |--------|-------------|-------------|
|
|
|
|
|
+/// | `+` | `start` (leftmost ref pos) | `end` (rightmost ref pos) |
|
|
|
|
|
+/// | `−` | `end` (rightmost ref pos) | `start` (leftmost ref pos) |
|
|
|
|
|
+///
|
|
|
|
|
+/// When A comes first on the read: A's 3′ terminus (`b`) and B's 5′ terminus (`c`)
|
|
|
|
|
+/// are the **inner** (junction-proximal) breakpoints. A's 5′ (`a`) and B's 3′ (`d`)
|
|
|
|
|
+/// are the **outer** endpoints.
|
|
|
///
|
|
///
|
|
|
/// ```text
|
|
/// ```text
|
|
|
-/// Case: A on '+' strand, B on '-' strand
|
|
|
|
|
|
|
+/// Case: A on '+' strand, B on '−' strand
|
|
|
///
|
|
///
|
|
|
/// a b
|
|
/// a b
|
|
|
/// | |
|
|
/// | |
|
|
@@ -350,17 +396,39 @@ pub enum SegmentOrder {
|
|
|
/// --------[<=============B======]-------
|
|
/// --------[<=============B======]-------
|
|
|
/// ^ ^
|
|
/// ^ ^
|
|
|
/// | |
|
|
/// | |
|
|
|
-/// c d
|
|
|
|
|
|
|
+/// d c
|
|
|
///
|
|
///
|
|
|
-/// Alignment A (+ strand): a=aln_a_start, b=aln_a_end
|
|
|
|
|
-/// Alignment B (- strand): c=aln_b_start, d=aln_b_end
|
|
|
|
|
|
|
+/// Alignment A (+ strand): a = aln_a_start (5' of A, outer)
|
|
|
|
|
+/// b = aln_a_end (3' of A, inner — junction-proximal)
|
|
|
|
|
+/// Alignment B (− strand): c = aln_b_end (5' of B in read order, inner — junction-proximal)
|
|
|
|
|
+/// d = aln_b_start (3' of B in read order, outer)
|
|
|
///
|
|
///
|
|
|
/// Distances:
|
|
/// Distances:
|
|
|
-/// aln_a_len = |a - b| = alignment A length
|
|
|
|
|
-/// aln_b_len = |c - d| = alignment B length
|
|
|
|
|
-/// b_c = |b - c| = inner breakpoint distance (3' ends)
|
|
|
|
|
-/// a_d = |a - d| = outer breakpoint distance (5' ends)
|
|
|
|
|
|
|
+/// aln_a_len = |a − b| = alignment A length on reference
|
|
|
|
|
+/// aln_b_len = |c − d| = alignment B length on reference
|
|
|
|
|
+/// b_c = |b − c| = inner breakpoint distance (junction gap between A and B)
|
|
|
|
|
+/// a_d = |a − d| = outer span of the fold-back structure
|
|
|
/// ```
|
|
/// ```
|
|
|
|
|
+///
|
|
|
|
|
+/// The region `[d, b]` (where `d < b` in the canonical case) is covered by both
|
|
|
|
|
+/// alignments and constitutes the **inverted duplication**. For a perfect duplication
|
|
|
|
|
+/// `b_c = 0` and `a_d = 0`. Events with `a_d < 150 bp` are typically classified as
|
|
|
|
|
+/// **sequencing artefacts** (short hairpin/palindrome) rather than true structural variants.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Divergence from reference implementation
|
|
|
|
|
+///
|
|
|
|
|
+/// The reference Python script requires the two alignments to **overlap** on the reference
|
|
|
|
|
+/// (`overlap > 0 bp`). This implementation is more permissive: it also accepts a small
|
|
|
|
|
+/// reference-space **gap** between the alignments (controlled by `max_gap` in
|
|
|
|
|
+/// [`fb_inv_from_record`]). Set `max_gap = Some(0)` to match the strict Python behaviour,
|
|
|
|
|
+/// which requires overlap because the artefact mechanism — ssDNA forming a hairpin and
|
|
|
|
|
+/// re-translocating back through the nanopore — guarantees both alignments cover the
|
|
|
|
|
+/// same physical DNA strand.
|
|
|
|
|
+///
|
|
|
|
|
+/// # References
|
|
|
|
|
+///
|
|
|
|
|
+/// Geometry, breakpoint labelling, and the `a_d < 150 bp` artefact threshold are adapted from:
|
|
|
|
|
+/// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
|
|
|
#[derive(Debug)]
|
|
#[derive(Debug)]
|
|
|
pub struct FbInv {
|
|
pub struct FbInv {
|
|
|
/// Read name (QNAME from BAM)
|
|
/// Read name (QNAME from BAM)
|
|
@@ -383,9 +451,9 @@ pub struct FbInv {
|
|
|
pub aln_a_len: i64,
|
|
pub aln_a_len: i64,
|
|
|
/// Alignment B length on reference (bp)
|
|
/// Alignment B length on reference (bp)
|
|
|
pub aln_b_len: i64,
|
|
pub aln_b_len: i64,
|
|
|
- /// Distance between breakpoints B and C (strand-aware)
|
|
|
|
|
|
|
+ /// `|b - c|`: read-order junction gap — distance between the 3' end of A and the 5' end of B
|
|
|
pub b_c: i64,
|
|
pub b_c: i64,
|
|
|
- /// Distance between breakpoints A and D (strand-aware)
|
|
|
|
|
|
|
+ /// `|a - d|`: read-order outer span — distance between the 5' end of A and the 3' end of B
|
|
|
pub a_d: i64,
|
|
pub a_d: i64,
|
|
|
/// Which alignment appears first on the read
|
|
/// Which alignment appears first on the read
|
|
|
pub first: SegmentOrder,
|
|
pub first: SegmentOrder,
|
|
@@ -399,21 +467,18 @@ pub struct FbInv {
|
|
|
pub sa_qend: u32,
|
|
pub sa_qend: u32,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Parse a SAM CIGAR string into a vector of CIGAR operations.
|
|
|
|
|
|
|
+/// Parse a SAM CIGAR string into a vector of [`Cigar`] operations.
|
|
|
///
|
|
///
|
|
|
-/// Supports all standard CIGAR operations: M, I, D, N, S, H, P, =, X
|
|
|
|
|
|
|
+/// All standard CIGAR operations are supported: `M`, `I`, `D`, `N`, `S`, `H`, `P`, `=`, `X`.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
-/// * `s` - CIGAR string (e.g., "10S100M5I50M20S")
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// * `s` - CIGAR string, e.g. `"10S100M5I50M20S"`
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
-/// `Some(Vec<Cigar>)` if valid, `None` if malformed or empty
|
|
|
|
|
///
|
|
///
|
|
|
-/// # Examples
|
|
|
|
|
-/// ```
|
|
|
|
|
-/// let ops = parse_cigar_str("10S100M").unwrap();
|
|
|
|
|
-/// assert_eq!(ops.len(), 2);
|
|
|
|
|
-/// ```
|
|
|
|
|
|
|
+/// `Some(ops)` on success, `None` if the string is empty, contains an unknown operation
|
|
|
|
|
+/// character, has an operator without a preceding length, or ends with trailing digits.
|
|
|
pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
|
|
pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
|
|
|
if s.is_empty() {
|
|
if s.is_empty() {
|
|
|
return None;
|
|
return None;
|
|
@@ -465,17 +530,22 @@ pub fn parse_cigar_str(s: &str) -> Option<Vec<Cigar>> {
|
|
|
Some(ops)
|
|
Some(ops)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Compute query coordinates (start, end) on the read from a CIGAR.
|
|
|
|
|
|
|
+/// Compute `(query_begin, query_end)` on the read from CIGAR operations.
|
|
|
///
|
|
///
|
|
|
-/// For plus strand, the first CIGAR operation corresponds to the read start.
|
|
|
|
|
-/// For minus strand, the last CIGAR operation corresponds to the read start.
|
|
|
|
|
|
|
+/// For `+` strand the first CIGAR op is at the read start; for `-` strand the last op
|
|
|
|
|
+/// is at the read start (CIGAR is encoded in reference order regardless of strand).
|
|
|
|
|
+/// Hard-clipped bases are excluded; soft-clip length sets `query_begin`.
|
|
|
|
|
+/// Only query-consuming operations (`M`, `I`, `=`, `X`) contribute to the alignment length.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
-/// * `ops` - CIGAR operations slice
|
|
|
|
|
-/// * `strand` - '+' or '-'
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// * `ops` - Slice of CIGAR operations
|
|
|
|
|
+/// * `strand` - `'+'` or `'-'`
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
-/// Tuple of (query_begin, query_end) in read coordinates (excluding hard clips)
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// `(query_begin, query_end)` as read-coordinate offsets (0-based, hard clips excluded).
|
|
|
|
|
+/// Returns `(0, 0)` for an empty slice.
|
|
|
fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
|
|
fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
|
|
|
if ops.is_empty() {
|
|
if ops.is_empty() {
|
|
|
return (0, 0);
|
|
return (0, 0);
|
|
@@ -505,15 +575,18 @@ fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
|
|
|
(query_beg, query_beg + aln_len)
|
|
(query_beg, query_beg + aln_len)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Compute alignment length on the reference from CIGAR operations.
|
|
|
|
|
|
|
+/// Compute the reference-consuming length of a CIGAR.
|
|
|
///
|
|
///
|
|
|
-/// Counts bases consumed by M, D, N, =, X operations.
|
|
|
|
|
|
|
+/// Sums lengths of operations that advance the reference coordinate:
|
|
|
|
|
+/// `M`, `D`, `N`, `=`, `X`. Insertions, soft clips, hard clips, and pads are excluded.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
-/// * `ops` - CIGAR operations slice
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// * `ops` - Slice of CIGAR operations
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
-/// Reference-consuming length in base pairs
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// Reference-consuming length in base pairs. Returns `0` for an empty slice.
|
|
|
fn alignment_ref_length(ops: &[Cigar]) -> u32 {
|
|
fn alignment_ref_length(ops: &[Cigar]) -> u32 {
|
|
|
ops.iter()
|
|
ops.iter()
|
|
|
.filter_map(|op| match op {
|
|
.filter_map(|op| match op {
|
|
@@ -527,32 +600,40 @@ fn alignment_ref_length(ops: &[Cigar]) -> u32 {
|
|
|
.sum()
|
|
.sum()
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Compute gap between two intervals. Negative value means overlap.
|
|
|
|
|
|
|
+/// Compute the gap in base pairs between two half-open intervals.
|
|
|
|
|
+///
|
|
|
|
|
+/// Both intervals are `[start, end)`. A negative return value indicates that the
|
|
|
|
|
+/// intervals overlap by that many bases; zero means they are adjacent; positive means
|
|
|
|
|
+/// they are separated.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
|
|
+///
|
|
|
/// * `a_start`, `a_end` - First interval `[a_start, a_end)`
|
|
/// * `a_start`, `a_end` - First interval `[a_start, a_end)`
|
|
|
/// * `b_start`, `b_end` - Second interval `[b_start, b_end)`
|
|
/// * `b_start`, `b_end` - Second interval `[b_start, b_end)`
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
-/// Gap in bp (negative if overlapping, 0 if adjacent, positive if separated)
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// `max(starts) - min(ends)`: negative for overlap, 0 for adjacency, positive for gap.
|
|
|
#[inline]
|
|
#[inline]
|
|
|
fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 {
|
|
fn interval_gap(a_start: i64, a_end: i64, b_start: i64, b_end: i64) -> i64 {
|
|
|
a_start.max(b_start) - a_end.min(b_end)
|
|
a_start.max(b_start) - a_end.min(b_end)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Compute strand-aware breakpoint coordinates.
|
|
|
|
|
|
|
+/// Return `(5'_terminus, 3'_terminus)` in read order for a single alignment.
|
|
|
///
|
|
///
|
|
|
-/// For fold-back inversions, breakpoints depend on strand orientation:
|
|
|
|
|
-/// - Plus strand: breakpoints are (start, end) - 5' to 3' direction
|
|
|
|
|
-/// - Minus strand: breakpoints are (end, start) - reversed direction
|
|
|
|
|
|
|
+/// For `+` strand the 5' end maps to the leftmost reference position (`start`) and the
|
|
|
|
|
+/// 3' end to the rightmost (`end`). For `-` strand the read runs right-to-left on the
|
|
|
|
|
+/// reference, so 5' = `end` and 3' = `start`.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
-/// * `start` - Alignment start (0-based)
|
|
|
|
|
-/// * `end` - Alignment end (exclusive)
|
|
|
|
|
-/// * `strand` - '+' or '-'
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// * `start` - Alignment start, 0-based inclusive
|
|
|
|
|
+/// * `end` - Alignment end, 0-based exclusive
|
|
|
|
|
+/// * `strand` - `'+'` or `'-'`
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
-/// Tuple of (breakpoint_a, breakpoint_b) in strand-aware order
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// `(5'_coord, 3'_coord)` where both are 0-based reference positions.
|
|
|
#[inline]
|
|
#[inline]
|
|
|
fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
|
|
fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
|
|
|
if strand == '+' {
|
|
if strand == '+' {
|
|
@@ -562,35 +643,46 @@ fn breakpoints(start: i64, end: i64, strand: char) -> (i64, i64) {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Extract a fold-back inversion event from a BAM record.
|
|
|
|
|
|
|
+/// Attempt to extract a fold-back inversion event from a primary BAM record.
|
|
|
///
|
|
///
|
|
|
-/// Analyzes a primary alignment and its supplementary alignment (from SA tag)
|
|
|
|
|
-/// to detect fold-back inversion patterns characterized by:
|
|
|
|
|
-/// - Same chromosome
|
|
|
|
|
-/// - Opposite strands
|
|
|
|
|
-/// - Reference coordinates within proximity threshold
|
|
|
|
|
-/// - Exactly one supplementary alignment
|
|
|
|
|
|
|
+/// Inspects the record's SA tag using [`parse_sa_entries`] and checks whether the single
|
|
|
|
|
+/// supplementary alignment forms a fold-back inversion with the primary: same chromosome,
|
|
|
|
|
+/// opposite strand, and reference coordinates within the specified proximity thresholds.
|
|
|
|
|
+/// Returns a populated [`FbInv`] with all breakpoint metrics on success.
|
|
|
|
|
+///
|
|
|
|
|
+/// Alignments A and B in the returned struct are ordered by their position on the read
|
|
|
|
|
+/// (A comes first). All coordinates are 0-based half-open; the 1-based SA tag `pos` is
|
|
|
|
|
+/// converted automatically by [`SaEntry`].
|
|
|
|
|
+///
|
|
|
|
|
+/// Recommended thresholds for **artefact detection**: `max_overlap = Some(150)`,
|
|
|
|
|
+/// `max_gap = Some(0)`. Requiring overlap (`max_gap = Some(0)`) matches the reference
|
|
|
|
|
+/// Python implementation and avoids capturing true SV junctions that happen to have
|
|
|
|
|
+/// opposite-strand supplementary alignments with a reference-space gap.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
-/// * `record` - BAM record (must be primary alignment)
|
|
|
|
|
-/// * `header` - BAM header for reference name lookup
|
|
|
|
|
-/// * `min_mapq` - Minimum mapping quality threshold for both alignments
|
|
|
|
|
-/// * `max_overlap` - Maximum allowed overlap in bp (None = no limit)
|
|
|
|
|
-/// * `max_gap` - Maximum allowed gap in bp (None = no limit, 0 = require overlap or adjacent)
|
|
|
|
|
-/// default should be [-150, 200) Some(150), Some(200)
|
|
|
|
|
|
|
+///
|
|
|
|
|
+/// * `record` - BAM record to evaluate; must be a primary alignment
|
|
|
|
|
+/// * `header` - BAM header used to resolve the reference name from `tid`
|
|
|
|
|
+/// * `min_mapq` - Minimum mapping quality required for both the primary and the supplementary
|
|
|
|
|
+/// * `max_overlap` - Maximum allowed reference-space overlap in bp (`None` = no limit)
|
|
|
|
|
+/// * `max_gap` - Maximum allowed reference-space gap in bp (`None` = no limit)
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
-/// `Some(FbInv)` if a valid fold-back pattern is detected, `None` otherwise
|
|
|
|
|
///
|
|
///
|
|
|
-/// # Filtering
|
|
|
|
|
-/// Returns `None` if any of these conditions are met:
|
|
|
|
|
|
|
+/// `Some(FbInv)` if all criteria are met, `None` otherwise.
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns `None` when any of the following conditions apply:
|
|
|
/// - Record is unmapped, secondary, or supplementary
|
|
/// - Record is unmapped, secondary, or supplementary
|
|
|
-/// - MAPQ below threshold (primary or supplementary)
|
|
|
|
|
-/// - No SA tag or multiple supplementary alignments
|
|
|
|
|
-/// - Supplementary on different chromosome
|
|
|
|
|
-/// - Same strand orientation
|
|
|
|
|
-/// - Overlap exceeds `max_overlap` (if set)
|
|
|
|
|
-/// - Gap exceeds `max_gap` (if set)
|
|
|
|
|
|
|
+/// - Primary or supplementary MAPQ is below `min_mapq`
|
|
|
|
|
+/// - SA tag is absent or contains more than one entry
|
|
|
|
|
+/// - Supplementary aligns to a different chromosome
|
|
|
|
|
+/// - Both alignments are on the same strand
|
|
|
|
|
+/// - Reference-space overlap exceeds `max_overlap`
|
|
|
|
|
+/// - Reference-space gap exceeds `max_gap`
|
|
|
|
|
+///
|
|
|
|
|
+/// # References
|
|
|
|
|
+///
|
|
|
|
|
+/// <https://github.com/cortes-ciriano-lab/ont_fb-inv_artifacts/blob/main/fb-inv_artefact_rates.py>
|
|
|
pub fn fb_inv_from_record(
|
|
pub fn fb_inv_from_record(
|
|
|
record: &bam::Record,
|
|
record: &bam::Record,
|
|
|
header: &bam::HeaderView,
|
|
header: &bam::HeaderView,
|
|
@@ -607,32 +699,19 @@ pub fn fb_inv_from_record(
|
|
|
return None;
|
|
return None;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // SA tag required
|
|
|
|
|
|
|
+ // SA tag required; exactly one supplementary alignment expected
|
|
|
let sa_str = match record.aux(b"SA") {
|
|
let sa_str = match record.aux(b"SA") {
|
|
|
Ok(Aux::String(s)) => s,
|
|
Ok(Aux::String(s)) => s,
|
|
|
_ => return None,
|
|
_ => return None,
|
|
|
};
|
|
};
|
|
|
|
|
|
|
|
- let sa_entries: Vec<&str> = sa_str.split(';').filter(|s| !s.is_empty()).collect();
|
|
|
|
|
-
|
|
|
|
|
- // Require exactly one SA (like the Python code)
|
|
|
|
|
|
|
+ let sa_entries = parse_sa_entries(sa_str);
|
|
|
if sa_entries.len() != 1 {
|
|
if sa_entries.len() != 1 {
|
|
|
return None;
|
|
return None;
|
|
|
}
|
|
}
|
|
|
- let sa = sa_entries[0];
|
|
|
|
|
-
|
|
|
|
|
- // SA format: chr,pos,strand,cigar,mapq,nm
|
|
|
|
|
- let fields: Vec<&str> = sa.split(',').collect();
|
|
|
|
|
- if fields.len() < 6 {
|
|
|
|
|
- return None;
|
|
|
|
|
- }
|
|
|
|
|
- let sa_rname = fields[0];
|
|
|
|
|
- let sa_pos_1based: i64 = fields[1].parse().ok()?; // SAM is 1-based
|
|
|
|
|
- let sa_strand_char: char = fields[2].chars().next()?;
|
|
|
|
|
- let sa_cigar_str = fields[3];
|
|
|
|
|
- let sa_mapq: u8 = fields[4].parse().ok()?;
|
|
|
|
|
|
|
+ let sa = &sa_entries[0];
|
|
|
|
|
|
|
|
- if sa_mapq < min_mapq {
|
|
|
|
|
|
|
+ if sa.mapq < min_mapq {
|
|
|
return None;
|
|
return None;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -642,13 +721,13 @@ pub fn fb_inv_from_record(
|
|
|
return None;
|
|
return None;
|
|
|
}
|
|
}
|
|
|
let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?;
|
|
let primary_rname = std::str::from_utf8(header.tid2name(tid as u32)).ok()?;
|
|
|
- if sa_rname != primary_rname {
|
|
|
|
|
|
|
+ if sa.chr != primary_rname {
|
|
|
return None;
|
|
return None;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// Strand check: need opposite strands for fold-back inversion
|
|
// Strand check: need opposite strands for fold-back inversion
|
|
|
let primary_strand_char = if record.is_reverse() { '-' } else { '+' };
|
|
let primary_strand_char = if record.is_reverse() { '-' } else { '+' };
|
|
|
- if primary_strand_char == sa_strand_char {
|
|
|
|
|
|
|
+ if primary_strand_char == sa.strand {
|
|
|
return None;
|
|
return None;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -656,17 +735,14 @@ pub fn fb_inv_from_record(
|
|
|
let primary_start = record.pos();
|
|
let primary_start = record.pos();
|
|
|
let primary_end = record.cigar().end_pos();
|
|
let primary_end = record.cigar().end_pos();
|
|
|
|
|
|
|
|
- // Parse SA CIGAR
|
|
|
|
|
- let sa_cigar_ops = parse_cigar_str(sa_cigar_str)?;
|
|
|
|
|
|
|
+ // Parse SA CIGAR; pos is already 0-based from SaEntry
|
|
|
|
|
+ let sa_cigar_ops = parse_cigar_str(sa.cigar)?;
|
|
|
let sa_ref_len = alignment_ref_length(&sa_cigar_ops);
|
|
let sa_ref_len = alignment_ref_length(&sa_cigar_ops);
|
|
|
-
|
|
|
|
|
- // Validation: SA must have non-zero reference length
|
|
|
|
|
if sa_ref_len == 0 {
|
|
if sa_ref_len == 0 {
|
|
|
return None;
|
|
return None;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // SA ref coords: convert POS from 1-based to 0-based
|
|
|
|
|
- let sa_start = sa_pos_1based - 1;
|
|
|
|
|
|
|
+ let sa_start = sa.pos as i64;
|
|
|
let sa_end = sa_start + sa_ref_len as i64;
|
|
let sa_end = sa_start + sa_ref_len as i64;
|
|
|
|
|
|
|
|
// Check proximity between alignments
|
|
// Check proximity between alignments
|
|
@@ -692,7 +768,7 @@ pub fn fb_inv_from_record(
|
|
|
// Query coords for primary and SA
|
|
// Query coords for primary and SA
|
|
|
let primary_cigar: Vec<Cigar> = record.cigar().iter().cloned().collect();
|
|
let primary_cigar: Vec<Cigar> = record.cigar().iter().cloned().collect();
|
|
|
let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char);
|
|
let (primary_qbeg, primary_qend) = alignment_query_coords(&primary_cigar, primary_strand_char);
|
|
|
- let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa_strand_char);
|
|
|
|
|
|
|
+ let (sa_qbeg, sa_qend) = alignment_query_coords(&sa_cigar_ops, sa.strand);
|
|
|
|
|
|
|
|
// Decide which segment is first on the read
|
|
// Decide which segment is first on the read
|
|
|
let first = if primary_qbeg <= sa_qbeg {
|
|
let first = if primary_qbeg <= sa_qbeg {
|
|
@@ -710,13 +786,13 @@ pub fn fb_inv_from_record(
|
|
|
primary_strand_char,
|
|
primary_strand_char,
|
|
|
sa_start,
|
|
sa_start,
|
|
|
sa_end,
|
|
sa_end,
|
|
|
- sa_strand_char,
|
|
|
|
|
|
|
+ sa.strand,
|
|
|
)
|
|
)
|
|
|
} else {
|
|
} else {
|
|
|
(
|
|
(
|
|
|
sa_start,
|
|
sa_start,
|
|
|
sa_end,
|
|
sa_end,
|
|
|
- sa_strand_char,
|
|
|
|
|
|
|
+ sa.strand,
|
|
|
primary_start,
|
|
primary_start,
|
|
|
primary_end,
|
|
primary_end,
|
|
|
primary_strand_char,
|
|
primary_strand_char,
|
|
@@ -758,40 +834,27 @@ pub fn fb_inv_from_record(
|
|
|
})
|
|
})
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-// pub fn read_sm_tag(bam_path: &str) -> anyhow::Result<String> {
|
|
|
|
|
-// let reader = bam::Reader::from_path(bam_path)
|
|
|
|
|
-// .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
|
|
|
|
|
-// let header = bam::Header::from_template(reader.header());
|
|
|
|
|
-// let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
|
|
|
|
|
-//
|
|
|
|
|
-// for line in header_text.lines() {
|
|
|
|
|
-// if line.starts_with("@RG") {
|
|
|
|
|
-// for field in line.split('\t') {
|
|
|
|
|
-// if let Some(sm) = field.strip_prefix("SM:") {
|
|
|
|
|
-// return Ok(sm.to_string());
|
|
|
|
|
-// }
|
|
|
|
|
-// }
|
|
|
|
|
-// }
|
|
|
|
|
-// }
|
|
|
|
|
-// anyhow::bail!("No SM tag found in @RG header of {bam_path}")
|
|
|
|
|
-// }
|
|
|
|
|
-
|
|
|
|
|
-// fn ensure_bam_sm_tag(id: &str, config: &Config) -> anyhow::Result<()> {
|
|
|
|
|
-// for bam in [config.normal_bam(id), config.tumoral_bam(id)] {
|
|
|
|
|
-// if read_sm_tag(&bam).is_err() {
|
|
|
|
|
-// let sample = Path::new(&bam)
|
|
|
|
|
-// .file_stem()
|
|
|
|
|
-// .unwrap_or_default()
|
|
|
|
|
-// .to_string_lossy()
|
|
|
|
|
-// .to_string();
|
|
|
|
|
-// info!("Injecting missing @RG SM:{sample} into {bam}");
|
|
|
|
|
-// SamtoolsReheader::from_config(config, &bam, &sample).run()?;
|
|
|
|
|
-// SamtoolsIndex::from_config(config, &bam).run()?;
|
|
|
|
|
-// }
|
|
|
|
|
-// }
|
|
|
|
|
-// Ok(())
|
|
|
|
|
-// }
|
|
|
|
|
-
|
|
|
|
|
|
|
+/// Read the `SM` tag from a BAM's `@RG` lines, injecting it if absent or inconsistent.
|
|
|
|
|
+///
|
|
|
|
|
+/// Inspects every `@RG` line in the header. If all lines carry an identical `SM` value,
|
|
|
|
|
+/// that value is returned. If any line is missing `SM`, or if multiple distinct `SM`
|
|
|
|
|
+/// values are present, `samtools reheader` is invoked to set `SM` uniformly to
|
|
|
|
|
+/// `fallback_sample`. The file modification time is preserved after reheadering.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `bam_path` - Path to the BAM file (must be writable if reheadering is needed)
|
|
|
|
|
+/// * `fallback_sample` - Sample name to inject when the header is missing or ambiguous
|
|
|
|
|
+/// * `config` - Pipeline configuration used to invoke [`SamtoolsReheader`]
|
|
|
|
|
+///
|
|
|
|
|
+/// # Returns
|
|
|
|
|
+///
|
|
|
|
|
+/// The `SM` value present in (or injected into) the header.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the BAM cannot be opened, if reheadering fails, or if the
|
|
|
|
|
+/// header contains no `@RG` lines at all.
|
|
|
pub fn read_sm_tag_or_inject(
|
|
pub fn read_sm_tag_or_inject(
|
|
|
bam_path: &str,
|
|
bam_path: &str,
|
|
|
fallback_sample: &str,
|
|
fallback_sample: &str,
|
|
@@ -800,7 +863,7 @@ pub fn read_sm_tag_or_inject(
|
|
|
let reader = bam::Reader::from_path(bam_path)
|
|
let reader = bam::Reader::from_path(bam_path)
|
|
|
.with_context(|| format!("Failed to open BAM: {bam_path}"))?;
|
|
.with_context(|| format!("Failed to open BAM: {bam_path}"))?;
|
|
|
let header = bam::Header::from_template(reader.header());
|
|
let header = bam::Header::from_template(reader.header());
|
|
|
- let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
|
|
|
|
|
|
|
+ let header_text = String::from_utf8_lossy(&header.to_bytes()).into_owned();
|
|
|
|
|
|
|
|
let mut sm_values: Vec<String> = Vec::new();
|
|
let mut sm_values: Vec<String> = Vec::new();
|
|
|
let mut all_have_sm = true;
|
|
let mut all_have_sm = true;
|
|
@@ -817,10 +880,8 @@ pub fn read_sm_tag_or_inject(
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // Determine whether reheadering is needed:
|
|
|
|
|
- // 1. Any @RG missing SM
|
|
|
|
|
- // 2. SM values are not all identical
|
|
|
|
|
- // 3. No @RG lines at all
|
|
|
|
|
|
|
+ // Reheader if any @RG lacks SM or if multiple distinct SM values exist.
|
|
|
|
|
+ // No @RG lines at all falls through to the error at the end.
|
|
|
let unique_sm: std::collections::HashSet<&str> = sm_values.iter().map(|s| s.as_str()).collect();
|
|
let unique_sm: std::collections::HashSet<&str> = sm_values.iter().map(|s| s.as_str()).collect();
|
|
|
|
|
|
|
|
let needs_reheader = !all_have_sm || unique_sm.len() > 1;
|
|
let needs_reheader = !all_have_sm || unique_sm.len() > 1;
|
|
@@ -856,29 +917,31 @@ pub fn read_sm_tag_or_inject(
|
|
|
.context(format!("No @RG lines found in {bam_path}"))
|
|
.context(format!("No @RG lines found in {bam_path}"))
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Fetch primary records from a BAM file, with optional region and qname filtering.
|
|
|
|
|
|
|
+/// Fetch primary alignment records from an indexed BAM, with optional region and name filtering.
|
|
|
|
|
+///
|
|
|
|
|
+/// Supplementary alignments within the fetch region are resolved to their primaries via
|
|
|
|
|
+/// [`primary_records`]. Duplicates introduced by that resolution step are removed by
|
|
|
|
|
+/// query name. Secondary alignments are always excluded.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
-/// * `bam_path` - Path to indexed BAM file
|
|
|
|
|
-/// * `region` - Optional region to fetch (chr, start, end). None = whole file.
|
|
|
|
|
-/// * `qname_filter` - Optional qname filter. None = no filter.
|
|
|
|
|
///
|
|
///
|
|
|
-/// # Notes
|
|
|
|
|
-/// - Supplementary alignments are resolved to their primary via SA tag using `primary_records`
|
|
|
|
|
-/// - White/blacklist are mutually exclusive — passing both is a logic error and will panic in debug
|
|
|
|
|
|
|
+/// * `bam_path` - Path to an indexed BAM file (`.bai` index must exist)
|
|
|
|
|
+/// * `region` - If `Some((chr, start, end))`, restrict the fetch to that 0-based half-open
|
|
|
|
|
+/// interval; if `None`, the whole file is scanned
|
|
|
|
|
+/// * `qname_filter` - Optional [`QnameFilter`] to retain or exclude specific read names
|
|
|
|
|
+///
|
|
|
|
|
+/// # Returns
|
|
|
|
|
+///
|
|
|
|
|
+/// A deduplicated vector of primary records, optionally filtered by query name.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the BAM cannot be opened or if the fetch operation fails.
|
|
|
pub fn fetch_primary_records(
|
|
pub fn fetch_primary_records(
|
|
|
bam_path: &Path,
|
|
bam_path: &Path,
|
|
|
region: Option<(&str, i64, i64)>,
|
|
region: Option<(&str, i64, i64)>,
|
|
|
qname_filter: Option<QnameFilter>,
|
|
qname_filter: Option<QnameFilter>,
|
|
|
) -> anyhow::Result<Vec<bam::Record>> {
|
|
) -> anyhow::Result<Vec<bam::Record>> {
|
|
|
- debug_assert!(
|
|
|
|
|
- !matches!(
|
|
|
|
|
- &qname_filter,
|
|
|
|
|
- Some(QnameFilter::Whitelist(_)) | Some(QnameFilter::Blacklist(_))
|
|
|
|
|
- ) || qname_filter.is_some(),
|
|
|
|
|
- "Cannot pass both whitelist and blacklist"
|
|
|
|
|
- );
|
|
|
|
|
-
|
|
|
|
|
let mut reader = bam::IndexedReader::from_path(bam_path)
|
|
let mut reader = bam::IndexedReader::from_path(bam_path)
|
|
|
.with_context(|| format!("Cannot open BAM: {}", bam_path.display()))?;
|
|
.with_context(|| format!("Cannot open BAM: {}", bam_path.display()))?;
|
|
|
|
|
|
|
@@ -924,66 +987,31 @@ pub fn fetch_primary_records(
|
|
|
Ok(records)
|
|
Ok(records)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/// Query-name filter for [`fetch_primary_records`].
|
|
|
pub enum QnameFilter {
|
|
pub enum QnameFilter {
|
|
|
|
|
+ /// Retain only records whose query name is in the set.
|
|
|
Whitelist(HashSet<Vec<u8>>),
|
|
Whitelist(HashSet<Vec<u8>>),
|
|
|
|
|
+ /// Exclude records whose query name is in the set.
|
|
|
Blacklist(HashSet<Vec<u8>>),
|
|
Blacklist(HashSet<Vec<u8>>),
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Convert an input BAM/CRAM file into a BED file containing one interval per
|
|
|
|
|
-/// aligned record.
|
|
|
|
|
|
|
+/// Convert a BAM/CRAM file to a BED4 file with one interval per aligned record.
|
|
|
///
|
|
///
|
|
|
-/// Each output line corresponds to a single aligned read and has the form:
|
|
|
|
|
-///
|
|
|
|
|
-/// ```text
|
|
|
|
|
-/// chrom start end read_name
|
|
|
|
|
-/// ```
|
|
|
|
|
-///
|
|
|
|
|
-/// # Behavior
|
|
|
|
|
-///
|
|
|
|
|
-/// - Unmapped records are skipped.
|
|
|
|
|
-/// - Chromosome names are taken from the BAM header.
|
|
|
|
|
-/// - Coordinates follow BED conventions:
|
|
|
|
|
-/// - `start` is **0-based inclusive**
|
|
|
|
|
-/// - `end` is **0-based exclusive**
|
|
|
|
|
-/// - The interval `[start, end)` is computed using:
|
|
|
|
|
-/// - [`bam::Record::pos`] for the start
|
|
|
|
|
-/// - [`bam::Record::reference_end`] for the end
|
|
|
|
|
-/// - The end position accounts for all CIGAR operations consuming reference
|
|
|
|
|
-/// (`M`, `D`, `N`, `=`, `X`).
|
|
|
|
|
|
|
+/// Each output line has the form `chrom\tstart\tend\tread_name`. Coordinates are
|
|
|
|
|
+/// 0-based half-open (`start` from [`Record::pos`], `end` from [`Record::reference_end`]),
|
|
|
|
|
+/// consistent with BED convention. Unmapped records and records with a negative `tid`
|
|
|
|
|
+/// are skipped. Secondary and supplementary alignments are **not** filtered — add
|
|
|
|
|
+/// upstream filtering if stricter selection is needed.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
///
|
|
///
|
|
|
-/// * `input_bam` - Path to input BAM or CRAM file
|
|
|
|
|
-/// * `output_bed` - Path to output BED file
|
|
|
|
|
|
|
+/// * `input_bam` - Path to the input BAM or CRAM file
|
|
|
|
|
+/// * `output_bed` - Path to the output BED file (created or overwritten)
|
|
|
///
|
|
///
|
|
|
/// # Errors
|
|
/// # Errors
|
|
|
///
|
|
///
|
|
|
-/// Returns an error if:
|
|
|
|
|
-/// - The input file cannot be opened or read
|
|
|
|
|
-/// - The output file cannot be created or written
|
|
|
|
|
-/// - UTF-8 conversion of reference names or read names fails
|
|
|
|
|
-///
|
|
|
|
|
-/// # Example
|
|
|
|
|
-///
|
|
|
|
|
-/// ```no_run
|
|
|
|
|
-/// use std::path::Path;
|
|
|
|
|
-///
|
|
|
|
|
-/// fn main() -> anyhow::Result<()> {
|
|
|
|
|
-/// bam_to_aligned_bed("input.bam", "output.bed")?;
|
|
|
|
|
-/// Ok(())
|
|
|
|
|
-/// }
|
|
|
|
|
-/// ```
|
|
|
|
|
-///
|
|
|
|
|
-/// # Notes
|
|
|
|
|
-///
|
|
|
|
|
-/// - Secondary, supplementary, and duplicate reads are **not filtered**.
|
|
|
|
|
-/// Add additional flags if stricter filtering is required.
|
|
|
|
|
-/// - For paired-end reads, each alignment record is emitted independently.
|
|
|
|
|
-///
|
|
|
|
|
-/// # See also
|
|
|
|
|
-///
|
|
|
|
|
-/// - [`rust_htslib::bam::Reader`]
|
|
|
|
|
-/// - [`rust_htslib::bam::Record`]
|
|
|
|
|
|
|
+/// Returns an error if the input cannot be read, the output cannot be written, or
|
|
|
|
|
+/// if UTF-8 conversion of a reference name or query name fails.
|
|
|
pub fn bam_to_aligned_bed<P: AsRef<Path>>(
|
|
pub fn bam_to_aligned_bed<P: AsRef<Path>>(
|
|
|
input_bam: P,
|
|
input_bam: P,
|
|
|
output_bed: P,
|
|
output_bed: P,
|