use std::collections::HashMap; use anyhow::Context; use csv::ByteRecord; use log::error; use rust_htslib::bam::{HeaderView, IndexedReader, Read, Record, ext::BamRecordExtensions, record::Aux}; use crate::io::{bam::{fb_inv_from_record, primary_record, primary_records}, tsv::{parse_csv_u32_into, parse_u32}}; /// A genomic bin containing reads from a specific region. /// /// This struct represents a segment of a BAM file, storing reads that overlap /// a specific genomic region. It provides methods to analyze read distributions /// and extract primary alignments for supplementary alignments. #[derive(Debug)] pub struct Bin { /// The contig/chromosome name pub contig: String, /// The 0-based inclusive start position pub start: u32, /// The 0-based inclusive end position pub end: u32, /// Map of read query names to their BAM records pub reads_store: HashMap, Record>, /// Indexed BAM reader for fetching additional records // pub bam_reader: IndexedReader, /// Average read length within this bin // pub reads_mean_len: u32, /// Number of reads filtered due to low mapping quality pub low_qualities: Vec, /// Vec of depths pub depths: Vec, pub fb_invs: Vec, } impl Bin { /// Creates a new genomic bin from a BAM file. /// /// # Arguments /// /// * `bam_path` - Path to the BAM file /// * `contig` - Chromosome/contig name /// * `start` - 0-based inclusive start position /// * `length` - Length of the region to extract /// /// # Returns /// /// A Result containing the new Bin if successful, or an error if the BAM file /// couldn't be read or the region couldn't be fetched. /// /// # Examples /// /// ``` /// let bin = Bin::new("sample.bam", "chr1", 1000000, 1000)?; /// println!("Loaded {} reads", bin.n_reads()); /// ``` pub fn new( bam_reader: &mut IndexedReader, contig: &str, start: u32, length: u32, min_mapq: u8, ) -> anyhow::Result { anyhow::ensure!(length > 0, "bin length must be > 0"); let end = start + length - 1; bam_reader .fetch((contig, start as i64, end as i64)) .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?; let header = bam_reader.header().clone(); let mut reads_store: HashMap, Record> = HashMap::new(); let mut depths = vec![0u32; length as usize]; // Optional pseudo-depth let mut low_qualities = vec![0u32; length as usize]; // Optional pseudo-depth let mut fb_invs = vec![0u32; length as usize]; for record_result in bam_reader.rc_records() { let rc_record = match record_result { Ok(rc) => rc, Err(e) => { error!( "Failed to parse record in {}:{}-{}: {}", contig, start, end, e ); continue; } }; let record = rc_record.as_ref(); if let Some((read_start, read_end)) = read_range(record) { if read_end < start || read_start > end { continue; // Not overlapping this bin } let has_fbinv = fb_inv_from_record(record, &header, min_mapq, Some(150), Some(200)).is_some(); // Clone the underlying Record reads_store.insert(record.qname().to_vec(), record.clone()); // Optional: depth accumulation let local_start = start.max(read_start); let local_end = end.min(read_end); for pos in local_start..=local_end { let i = (pos - start) as usize; if i < depths.len() { depths[i] += 1; if record.mapq() < min_mapq { low_qualities[i] += 1; } if has_fbinv { fb_invs[i] += 1; } } } } } Ok(Bin { contig: contig.to_string(), start, end, reads_store, low_qualities, depths, fb_invs, }) } /// Returns the total number of reads in this bin. pub fn n_reads(&self) -> usize { self.reads_store.len() } /// Returns the number of reads with SA (supplementary alignment) tags. pub fn n_sa(&self) -> usize { self.reads_store .values() .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_)))) .count() } /// Retrieves primary alignments for all reads with SA tags in this bin. /// /// # Returns /// /// A vector of primary alignment records for reads with supplementary alignments. pub fn sa_primary(&mut self, bam_reader: &mut IndexedReader) -> Vec { self.reads_store .values() .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_)))) .map(|record| primary_record(bam_reader, record.clone())) .collect() } /// Finds the position with the highest concentration of read starts or ends. /// /// # Returns /// /// A tuple containing (position, count) of the position with the most read starts or ends. pub fn max_start_or_end(&self) -> (u32, usize) { let mut position_counts: HashMap = HashMap::new(); for record in self.reads_store.values() { let reference_start = record.reference_start() as u32; let reference_end = record.reference_end() as u32; // Count start positions within the bin if reference_start >= self.start && reference_start <= self.end { *position_counts.entry(reference_start).or_default() += 1; } // Count end positions within the bin if reference_end >= self.start && reference_end <= self.end { *position_counts.entry(reference_end).or_default() += 1; } } position_counts .into_iter() .max_by_key(|(_, count)| *count) .unwrap_or((0, 0)) } /// Computes the number of reads with SA tags, and finds the highest /// counts of read starts and ends at the same position. /// /// # Returns /// /// A tuple containing: /// - Number of reads with SA tags /// - Highest count of reads starting at any single position /// - Highest count of reads ending at any single position pub fn count_reads_sa_start_end(&self) -> (usize, usize, usize) { let mut n_sa = 0; let mut start_counts: HashMap = HashMap::new(); let mut end_counts: HashMap = HashMap::new(); for record in self.reads_store.values() { // Count reads with SA tags if matches!(record.aux(b"SA"), Ok(Aux::String(_))) { n_sa += 1; } let reference_start = record.reference_start() as u32; let reference_end = record.reference_end() as u32; // Count start positions within the bin if reference_start >= self.start && reference_start <= self.end { *start_counts.entry(reference_start).or_default() += 1; } // Count end positions within the bin if reference_end >= self.start && reference_end <= self.end { *end_counts.entry(reference_end).or_default() += 1; } } let max_start_count = start_counts.values().max().copied().unwrap_or(0); let max_end_count = end_counts.values().max().copied().unwrap_or(0); (n_sa, max_start_count, max_end_count) } /// Finds the position with the highest concentration of read starts. /// /// # Returns /// /// A tuple containing (position, count) of the position with the most read starts. pub fn max_start(&self) -> (u32, usize) { let mut start_counts: HashMap = HashMap::new(); for record in self.reads_store.values() { let reference_start = record.reference_start() as u32; if reference_start >= self.start && reference_start <= self.end { *start_counts.entry(reference_start).or_default() += 1; } } start_counts .into_iter() .max_by_key(|(_, count)| *count) .unwrap_or((0, 0)) } /// Finds the position with the highest concentration of read ends. /// /// # Returns /// /// A tuple containing (position, count) of the position with the most read ends. pub fn max_end(&self) -> (u32, usize) { let mut end_counts: HashMap = HashMap::new(); for record in self.reads_store.values() { let reference_end = record.reference_end() as u32; if reference_end >= self.start && reference_end <= self.end { *end_counts.entry(reference_end).or_default() += 1; } } end_counts .into_iter() .max_by_key(|(_, count)| *count) .unwrap_or((0, 0)) } /// Gets primary alignments for reads that start or end at a specific position. /// /// # Arguments /// /// * `pos` - The position to check for read starts or ends /// /// # Returns /// /// A vector of primary alignment records. pub fn se_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec { let records: Vec = self .reads_store .values() .filter(|record| { record.reference_start() as u32 == pos || record.reference_end() as u32 == pos }) .cloned() .collect(); primary_records(bam_reader, records) } /// Gets primary alignments for reads that start at a specific position. /// /// # Arguments /// /// * `pos` - The position to check for read starts /// /// # Returns /// /// A vector of primary alignment records. pub fn s_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec { self.reads_store .values() .filter(|record| record.reference_start() as u32 == pos) .map(|record| primary_record(bam_reader, record.clone())) .collect() } /// Gets primary alignments for reads that end at a specific position. /// /// # Arguments /// /// * `pos` - The position to check for read ends /// /// # Returns /// /// A vector of primary alignment records. pub fn e_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec { self.reads_store .values() .filter(|record| record.reference_end() as u32 == pos) .map(|record| primary_record(bam_reader, record.clone())) .collect() } /// Calculates the mean coverage by considering the actual /// span of each read in the bin. /// /// # Returns /// /// The mean bin coverage as a floating-point value. pub fn mean_coverage(&self) -> f64 { // If bin has no length or no reads, return 0 if self.end <= self.start || self.reads_store.is_empty() { return 0.0; } // Calculate the total length of the bin let bin_length = (self.end - self.start + 1) as f64; // Calculate the total bases covered by all reads within the bin let mut total_bases_covered = 0.0; for record in self.reads_store.values() { let read_start = record.reference_start() as u32; let read_end = record.reference_end() as u32; // Skip reads that don't overlap with our bin if read_end < self.start || read_start > self.end { continue; } // Calculate the overlapping region let overlap_start = read_start.max(self.start); let overlap_end = read_end.min(self.end); // Add the number of bases this read covers within our bin total_bases_covered += (overlap_end - overlap_start + 1) as f64; } // Divide by the bin length to get average coverage total_bases_covered / bin_length } pub fn mean_coverage_from_depths(&self) -> f64 { if self.depths.is_empty() { return 0.0; } self.depths.iter().sum::() as f64 / self.depths.len() as f64 } } /// Helper: get start and end position of a read fn read_range(record: &Record) -> Option<(u32, u32)> { let start = record.pos(); let end = record.cigar().end_pos(); if start >= 0 && end > start { Some((start as u32, end as u32 - 1)) } else { None } } /// Reused per-reader parse buffers #[derive(Default)] pub struct BinRowBuf { pub depths: Vec, pub lowq: Vec, } /// Example: parse one TSV record into buffers. /// Returns (start, depths_slice, lowq_slice) pub fn parse_bin_record_into<'a>( rec: &'a ByteRecord, buf: &'a mut BinRowBuf, contig_expected: &str, ) -> anyhow::Result<(u32, &'a [u32], &'a [u32])> { let i_contig = 0usize; let i_start = 1usize; let i_end = 2usize; // adjust if your file has more/less scalar columns, but for your pasted line: let i_depths = 9usize; // big CSV list let i_lowq = 10usize; // big CSV list let contig = std::str::from_utf8(rec.get(i_contig).context("missing contig")?) .context("non-utf8 contig")?; anyhow::ensure!(contig == contig_expected, "unexpected contig"); let start = parse_u32(rec.get(i_start).context("missing start")?)?; let end = parse_u32(rec.get(i_end).context("missing end")?)?; anyhow::ensure!(end >= start, "invalid bin coordinates: end < start ({start} > {end})"); parse_csv_u32_into(&mut buf.depths, rec.get(i_depths).context("missing depths")?) .context("parse depths")?; parse_csv_u32_into(&mut buf.lowq, rec.get(i_lowq).context("missing lowq")?) .context("parse lowq")?; // critical sanity check: end-start+1 should match vector length for per-base bins anyhow::ensure!( (end - start + 1) as usize == buf.depths.len(), "bin width mismatch: {}..{} has width {}, depths has len {}", start, end, end - start + 1, buf.depths.len() ); anyhow::ensure!(buf.depths.len() == buf.lowq.len(), "depth/lowq len mismatch"); Ok((start, &buf.depths, &buf.lowq)) } #[derive(Debug)] pub struct BinStats { pub contig: String, pub start: u32, pub end: u32, pub n_reads: u32, pub n_sa: u32, pub max_start_count: u32, pub max_end_count: u32, pub depths: Vec, pub low_qualities: Vec, // keep fb_invs too if you want pub fb_invs: Vec, } impl BinStats { #[inline] pub fn mean_coverage_from_depths(&self) -> f64 { if self.depths.is_empty() { return 0.0; } self.depths.iter().sum::() as f64 / self.depths.len() as f64 } } impl BinStats { pub fn from_bam_region( bam_reader: &mut IndexedReader, header: &HeaderView, contig: &str, start: u32, length: u32, min_mapq: u8, ) -> anyhow::Result { anyhow::ensure!(length > 0, "bin length must be > 0"); let end = start + length - 1; bam_reader .fetch((contig, start as i64, end as i64)) .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?; let mut depths = vec![0u32; length as usize]; let mut low_qualities = vec![0u32; length as usize]; let mut fb_invs = vec![0u32; length as usize]; let mut n_reads: u32 = 0; let mut n_sa: u32 = 0; let mut start_counts: HashMap = HashMap::new(); let mut end_counts: HashMap = HashMap::new(); for record_result in bam_reader.rc_records() { let rc_record = match record_result { Ok(rc) => rc, Err(e) => { error!( "Failed to parse record in {}:{}-{}: {}", contig, start, end, e ); continue; } }; let record: &Record = rc_record.as_ref(); // read genomic span (inclusive) let rs_i64 = record.pos(); let re_i64 = record.cigar().end_pos(); // exclusive if rs_i64 < 0 || re_i64 <= rs_i64 { continue; } let read_start = rs_i64 as u32; let read_end = (re_i64 as u32).saturating_sub(1); if read_end < start || read_start > end { continue; } // counts n_reads += 1; if matches!(record.aux(b"SA"), Ok(Aux::String(_))) { n_sa += 1; } let aln_start = record.reference_start() as u32; let aln_end = record.reference_end() as u32; if (start..=end).contains(&aln_start) { *start_counts.entry(aln_start).or_insert(0) += 1; } if (start..=end).contains(&aln_end) { *end_counts.entry(aln_end).or_insert(0) += 1; } // per-record flags (hoisted) let is_lowq = record.mapq() < min_mapq; let has_fbinv = fb_inv_from_record(record, header, min_mapq, Some(150), Some(200)).is_some(); // per-base accumulation (same semantics as your original) let local_start = start.max(read_start); let local_end = end.min(read_end); for pos in local_start..=local_end { let i = (pos - start) as usize; depths[i] += 1; if is_lowq { low_qualities[i] += 1; } if has_fbinv { fb_invs[i] += 1; } } } let max_start_count = start_counts.values().copied().max().unwrap_or(0); let max_end_count = end_counts.values().copied().max().unwrap_or(0); Ok(Self { contig: contig.to_string(), start, end, n_reads, n_sa, max_start_count, max_end_count, depths, low_qualities, fb_invs, }) } }