use std::collections::HashMap; use anyhow::Context; use log::error; use rust_htslib::bam::{ext::BamRecordExtensions, record::Aux, IndexedReader, Read, Record}; use crate::io::bam::{primary_record, primary_records}; /// A genomic bin containing reads from a specific region. /// /// This struct represents a segment of a BAM file, storing reads that overlap /// a specific genomic region. It provides methods to analyze read distributions /// and extract primary alignments for supplementary alignments. #[derive(Debug)] pub struct Bin { /// The contig/chromosome name pub contig: String, /// The 0-based inclusive start position pub start: u32, /// The 0-based inclusive end position pub end: u32, /// Map of read query names to their BAM records pub reads_store: HashMap, Record>, /// Indexed BAM reader for fetching additional records // pub bam_reader: IndexedReader, /// Average read length within this bin // pub reads_mean_len: u32, /// Number of reads filtered due to low mapping quality pub n_low_mapq: u32, /// Vec of depths pub depths: Vec, } impl Bin { /// Creates a new genomic bin from a BAM file. /// /// # Arguments /// /// * `bam_path` - Path to the BAM file /// * `contig` - Chromosome/contig name /// * `start` - 0-based inclusive start position /// * `length` - Length of the region to extract /// /// # Returns /// /// A Result containing the new Bin if successful, or an error if the BAM file /// couldn't be read or the region couldn't be fetched. /// /// # Examples /// /// ``` /// let bin = Bin::new("sample.bam", "chr1", 1000000, 1000)?; /// println!("Loaded {} reads", bin.n_reads()); /// ``` pub fn new( bam_reader: &mut IndexedReader, contig: &str, start: u32, length: u32, min_mapq: u8, ) -> anyhow::Result { let end = start + length - 1; bam_reader .fetch((contig, start as i64, end as i64)) .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?; let mut reads_store: HashMap, Record> = HashMap::new(); let mut n_low_mapq = 0; let mut depths = vec![0u32; length as usize]; // Optional pseudo-depth for record_result in bam_reader.rc_records() { let rc_record = match record_result { Ok(rc) => rc, Err(e) => { error!( "Failed to parse record in {}:{}-{}: {}", contig, start, end, e ); continue; } }; let record = rc_record.as_ref(); if record.mapq() < min_mapq { n_low_mapq += 1; continue; } if let Some((read_start, read_end)) = read_range(record) { if read_end < start || read_start > end { continue; // Not overlapping this bin } // Clone the underlying Record reads_store.insert(record.qname().to_vec(), record.clone()); // Optional: depth accumulation let local_start = start.max(read_start); let local_end = end.min(read_end); for pos in local_start..=local_end { let i = (pos - start) as usize; if i < depths.len() { depths[i] += 1; } } } } Ok(Bin { contig: contig.to_string(), start, end, reads_store, n_low_mapq, depths, }) } /// Returns the total number of reads in this bin. pub fn n_reads(&self) -> usize { self.reads_store.len() } /// Returns the number of reads with SA (supplementary alignment) tags. pub fn n_sa(&self) -> usize { self.reads_store .values() .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_)))) .count() } /// Retrieves primary alignments for all reads with SA tags in this bin. /// /// # Returns /// /// A vector of primary alignment records for reads with supplementary alignments. pub fn sa_primary(&mut self, bam_reader: &mut IndexedReader) -> Vec { self.reads_store .values() .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_)))) .map(|record| primary_record(bam_reader, record.clone())) .collect() } /// Finds the position with the highest concentration of read starts or ends. /// /// # Returns /// /// A tuple containing (position, count) of the position with the most read starts or ends. pub fn max_start_or_end(&self) -> (u32, usize) { let mut position_counts: HashMap = HashMap::new(); for record in self.reads_store.values() { let reference_start = record.reference_start() as u32; let reference_end = record.reference_end() as u32; // Count start positions within the bin if reference_start >= self.start && reference_start <= self.end { *position_counts.entry(reference_start).or_default() += 1; } // Count end positions within the bin if reference_end >= self.start && reference_end <= self.end { *position_counts.entry(reference_end).or_default() += 1; } } position_counts .into_iter() .max_by_key(|(_, count)| *count) .unwrap_or((0, 0)) } /// Computes the number of reads with SA tags, and finds the highest /// counts of read starts and ends at the same position. /// /// # Returns /// /// A tuple containing: /// - Number of reads with SA tags /// - Highest count of reads starting at any single position /// - Highest count of reads ending at any single position pub fn count_reads_sa_start_end(&self) -> (usize, usize, usize) { let mut n_sa = 0; let mut start_counts: HashMap = HashMap::new(); let mut end_counts: HashMap = HashMap::new(); for record in self.reads_store.values() { // Count reads with SA tags if matches!(record.aux(b"SA"), Ok(Aux::String(_))) { n_sa += 1; } let reference_start = record.reference_start() as u32; let reference_end = record.reference_end() as u32; // Count start positions within the bin if reference_start >= self.start && reference_start <= self.end { *start_counts.entry(reference_start).or_default() += 1; } // Count end positions within the bin if reference_end >= self.start && reference_end <= self.end { *end_counts.entry(reference_end).or_default() += 1; } } let max_start_count = start_counts.values().max().copied().unwrap_or(0); let max_end_count = end_counts.values().max().copied().unwrap_or(0); (n_sa, max_start_count, max_end_count) } /// Finds the position with the highest concentration of read starts. /// /// # Returns /// /// A tuple containing (position, count) of the position with the most read starts. pub fn max_start(&self) -> (u32, usize) { let mut start_counts: HashMap = HashMap::new(); for record in self.reads_store.values() { let reference_start = record.reference_start() as u32; if reference_start >= self.start && reference_start <= self.end { *start_counts.entry(reference_start).or_default() += 1; } } start_counts .into_iter() .max_by_key(|(_, count)| *count) .unwrap_or((0, 0)) } /// Finds the position with the highest concentration of read ends. /// /// # Returns /// /// A tuple containing (position, count) of the position with the most read ends. pub fn max_end(&self) -> (u32, usize) { let mut end_counts: HashMap = HashMap::new(); for record in self.reads_store.values() { let reference_end = record.reference_end() as u32; if reference_end >= self.start && reference_end <= self.end { *end_counts.entry(reference_end).or_default() += 1; } } end_counts .into_iter() .max_by_key(|(_, count)| *count) .unwrap_or((0, 0)) } /// Gets primary alignments for reads that start or end at a specific position. /// /// # Arguments /// /// * `pos` - The position to check for read starts or ends /// /// # Returns /// /// A vector of primary alignment records. pub fn se_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec { let records: Vec = self .reads_store .values() .filter(|record| { record.reference_start() as u32 == pos || record.reference_end() as u32 == pos }) .cloned() .collect(); primary_records(bam_reader, records) } /// Gets primary alignments for reads that start at a specific position. /// /// # Arguments /// /// * `pos` - The position to check for read starts /// /// # Returns /// /// A vector of primary alignment records. pub fn s_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec { self.reads_store .values() .filter(|record| record.reference_start() as u32 == pos) .map(|record| primary_record(bam_reader, record.clone())) .collect() } /// Gets primary alignments for reads that end at a specific position. /// /// # Arguments /// /// * `pos` - The position to check for read ends /// /// # Returns /// /// A vector of primary alignment records. pub fn e_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec { self.reads_store .values() .filter(|record| record.reference_end() as u32 == pos) .map(|record| primary_record(bam_reader, record.clone())) .collect() } /// Calculates the mean coverage by considering the actual /// span of each read in the bin. /// /// # Returns /// /// The mean bin coverage as a floating-point value. pub fn mean_coverage(&self) -> f64 { // If bin has no length or no reads, return 0 if self.end <= self.start || self.reads_store.is_empty() { return 0.0; } // Calculate the total length of the bin let bin_length = (self.end - self.start + 1) as f64; // Calculate the total bases covered by all reads within the bin let mut total_bases_covered = 0.0; for record in self.reads_store.values() { let read_start = record.reference_start() as u32; let read_end = record.reference_end() as u32; // Skip reads that don't overlap with our bin if read_end < self.start || read_start > self.end { continue; } // Calculate the overlapping region let overlap_start = read_start.max(self.start); let overlap_end = read_end.min(self.end); // Add the number of bases this read covers within our bin total_bases_covered += (overlap_end - overlap_start + 1) as f64; } // Divide by the bin length to get average coverage total_bases_covered / bin_length } pub fn mean_coverage_from_depths(&self) -> f64 { self.depths.iter().sum::() as f64 / self.depths.len() as f64 } } /// Helper: get start and end position of a read fn read_range(record: &Record) -> Option<(u32, u32)> { let start = record.pos(); let end = record.cigar().end_pos(); if start >= 0 && end > start { Some((start as u32, end as u32 - 1)) } else { None } }