| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558 |
- use std::collections::HashMap;
- use anyhow::Context;
- use csv::ByteRecord;
- use log::error;
- use rust_htslib::bam::{HeaderView, IndexedReader, Read, Record, ext::BamRecordExtensions, record::Aux};
- use crate::io::{bam::{fb_inv_from_record, primary_record, primary_records}, tsv::{parse_csv_u32_into, parse_u32}};
- /// A genomic bin containing reads from a specific region.
- ///
- /// This struct represents a segment of a BAM file, storing reads that overlap
- /// a specific genomic region. It provides methods to analyze read distributions
- /// and extract primary alignments for supplementary alignments.
- #[derive(Debug)]
- pub struct Bin {
- /// The contig/chromosome name
- pub contig: String,
- /// The 0-based inclusive start position
- pub start: u32,
- /// The 0-based inclusive end position
- pub end: u32,
- /// Map of read query names to their BAM records
- pub reads_store: HashMap<Vec<u8>, Record>,
- /// Indexed BAM reader for fetching additional records
- // pub bam_reader: IndexedReader,
- /// Average read length within this bin
- // pub reads_mean_len: u32,
- /// Number of reads filtered due to low mapping quality
- pub low_qualities: Vec<u32>,
- /// Vec of depths
- pub depths: Vec<u32>,
- pub fb_invs: Vec<u32>,
- }
- impl Bin {
- /// Creates a new genomic bin from a BAM file.
- ///
- /// # Arguments
- ///
- /// * `bam_path` - Path to the BAM file
- /// * `contig` - Chromosome/contig name
- /// * `start` - 0-based inclusive start position
- /// * `length` - Length of the region to extract
- ///
- /// # Returns
- ///
- /// A Result containing the new Bin if successful, or an error if the BAM file
- /// couldn't be read or the region couldn't be fetched.
- ///
- /// # Examples
- ///
- /// ```
- /// let bin = Bin::new("sample.bam", "chr1", 1000000, 1000)?;
- /// println!("Loaded {} reads", bin.n_reads());
- /// ```
- pub fn new(
- bam_reader: &mut IndexedReader,
- contig: &str,
- start: u32,
- length: u32,
- min_mapq: u8,
- ) -> anyhow::Result<Self> {
- anyhow::ensure!(length > 0, "bin length must be > 0");
- let end = start + length - 1;
- bam_reader
- .fetch((contig, start as i64, end as i64))
- .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?;
- let header = bam_reader.header().clone();
- let mut reads_store: HashMap<Vec<u8>, Record> = HashMap::new();
- let mut depths = vec![0u32; length as usize]; // Optional pseudo-depth
- let mut low_qualities = vec![0u32; length as usize]; // Optional pseudo-depth
- let mut fb_invs = vec![0u32; length as usize];
- for record_result in bam_reader.rc_records() {
- let rc_record = match record_result {
- Ok(rc) => rc,
- Err(e) => {
- error!(
- "Failed to parse record in {}:{}-{}: {}",
- contig, start, end, e
- );
- continue;
- }
- };
- let record = rc_record.as_ref();
- if let Some((read_start, read_end)) = read_range(record) {
- if read_end < start || read_start > end {
- continue; // Not overlapping this bin
- }
- let has_fbinv =
- fb_inv_from_record(record, &header, min_mapq, Some(150), Some(200)).is_some();
- // Clone the underlying Record
- reads_store.insert(record.qname().to_vec(), record.clone());
- // Optional: depth accumulation
- let local_start = start.max(read_start);
- let local_end = end.min(read_end);
- for pos in local_start..=local_end {
- let i = (pos - start) as usize;
- if i < depths.len() {
- depths[i] += 1;
- if record.mapq() < min_mapq {
- low_qualities[i] += 1;
- }
- if has_fbinv {
- fb_invs[i] += 1;
- }
- }
- }
- }
- }
- Ok(Bin {
- contig: contig.to_string(),
- start,
- end,
- reads_store,
- low_qualities,
- depths,
- fb_invs,
- })
- }
- /// Returns the total number of reads in this bin.
- pub fn n_reads(&self) -> usize {
- self.reads_store.len()
- }
- /// Returns the number of reads with SA (supplementary alignment) tags.
- pub fn n_sa(&self) -> usize {
- self.reads_store
- .values()
- .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
- .count()
- }
- /// Retrieves primary alignments for all reads with SA tags in this bin.
- ///
- /// # Returns
- ///
- /// A vector of primary alignment records for reads with supplementary alignments.
- pub fn sa_primary(&mut self, bam_reader: &mut IndexedReader) -> Vec<Record> {
- self.reads_store
- .values()
- .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
- .map(|record| primary_record(bam_reader, record.clone()))
- .collect()
- }
- /// Finds the position with the highest concentration of read starts or ends.
- ///
- /// # Returns
- ///
- /// A tuple containing (position, count) of the position with the most read starts or ends.
- pub fn max_start_or_end(&self) -> (u32, usize) {
- let mut position_counts: HashMap<u32, usize> = HashMap::new();
- for record in self.reads_store.values() {
- let reference_start = record.reference_start() as u32;
- let reference_end = record.reference_end() as u32;
- // Count start positions within the bin
- if reference_start >= self.start && reference_start <= self.end {
- *position_counts.entry(reference_start).or_default() += 1;
- }
- // Count end positions within the bin
- if reference_end >= self.start && reference_end <= self.end {
- *position_counts.entry(reference_end).or_default() += 1;
- }
- }
- position_counts
- .into_iter()
- .max_by_key(|(_, count)| *count)
- .unwrap_or((0, 0))
- }
- /// Computes the number of reads with SA tags, and finds the highest
- /// counts of read starts and ends at the same position.
- ///
- /// # Returns
- ///
- /// A tuple containing:
- /// - Number of reads with SA tags
- /// - Highest count of reads starting at any single position
- /// - Highest count of reads ending at any single position
- pub fn count_reads_sa_start_end(&self) -> (usize, usize, usize) {
- let mut n_sa = 0;
- let mut start_counts: HashMap<u32, usize> = HashMap::new();
- let mut end_counts: HashMap<u32, usize> = HashMap::new();
- for record in self.reads_store.values() {
- // Count reads with SA tags
- if matches!(record.aux(b"SA"), Ok(Aux::String(_))) {
- n_sa += 1;
- }
- let reference_start = record.reference_start() as u32;
- let reference_end = record.reference_end() as u32;
- // Count start positions within the bin
- if reference_start >= self.start && reference_start <= self.end {
- *start_counts.entry(reference_start).or_default() += 1;
- }
- // Count end positions within the bin
- if reference_end >= self.start && reference_end <= self.end {
- *end_counts.entry(reference_end).or_default() += 1;
- }
- }
- let max_start_count = start_counts.values().max().copied().unwrap_or(0);
- let max_end_count = end_counts.values().max().copied().unwrap_or(0);
- (n_sa, max_start_count, max_end_count)
- }
- /// Finds the position with the highest concentration of read starts.
- ///
- /// # Returns
- ///
- /// A tuple containing (position, count) of the position with the most read starts.
- pub fn max_start(&self) -> (u32, usize) {
- let mut start_counts: HashMap<u32, usize> = HashMap::new();
- for record in self.reads_store.values() {
- let reference_start = record.reference_start() as u32;
- if reference_start >= self.start && reference_start <= self.end {
- *start_counts.entry(reference_start).or_default() += 1;
- }
- }
- start_counts
- .into_iter()
- .max_by_key(|(_, count)| *count)
- .unwrap_or((0, 0))
- }
- /// Finds the position with the highest concentration of read ends.
- ///
- /// # Returns
- ///
- /// A tuple containing (position, count) of the position with the most read ends.
- pub fn max_end(&self) -> (u32, usize) {
- let mut end_counts: HashMap<u32, usize> = HashMap::new();
- for record in self.reads_store.values() {
- let reference_end = record.reference_end() as u32;
- if reference_end >= self.start && reference_end <= self.end {
- *end_counts.entry(reference_end).or_default() += 1;
- }
- }
- end_counts
- .into_iter()
- .max_by_key(|(_, count)| *count)
- .unwrap_or((0, 0))
- }
- /// Gets primary alignments for reads that start or end at a specific position.
- ///
- /// # Arguments
- ///
- /// * `pos` - The position to check for read starts or ends
- ///
- /// # Returns
- ///
- /// A vector of primary alignment records.
- pub fn se_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
- let records: Vec<Record> = self
- .reads_store
- .values()
- .filter(|record| {
- record.reference_start() as u32 == pos || record.reference_end() as u32 == pos
- })
- .cloned()
- .collect();
- primary_records(bam_reader, records)
- }
- /// Gets primary alignments for reads that start at a specific position.
- ///
- /// # Arguments
- ///
- /// * `pos` - The position to check for read starts
- ///
- /// # Returns
- ///
- /// A vector of primary alignment records.
- pub fn s_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
- self.reads_store
- .values()
- .filter(|record| record.reference_start() as u32 == pos)
- .map(|record| primary_record(bam_reader, record.clone()))
- .collect()
- }
- /// Gets primary alignments for reads that end at a specific position.
- ///
- /// # Arguments
- ///
- /// * `pos` - The position to check for read ends
- ///
- /// # Returns
- ///
- /// A vector of primary alignment records.
- pub fn e_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
- self.reads_store
- .values()
- .filter(|record| record.reference_end() as u32 == pos)
- .map(|record| primary_record(bam_reader, record.clone()))
- .collect()
- }
- /// Calculates the mean coverage by considering the actual
- /// span of each read in the bin.
- ///
- /// # Returns
- ///
- /// The mean bin coverage as a floating-point value.
- pub fn mean_coverage(&self) -> f64 {
- // If bin has no length or no reads, return 0
- if self.end <= self.start || self.reads_store.is_empty() {
- return 0.0;
- }
- // Calculate the total length of the bin
- let bin_length = (self.end - self.start + 1) as f64;
- // Calculate the total bases covered by all reads within the bin
- let mut total_bases_covered = 0.0;
- for record in self.reads_store.values() {
- let read_start = record.reference_start() as u32;
- let read_end = record.reference_end() as u32;
- // Skip reads that don't overlap with our bin
- if read_end < self.start || read_start > self.end {
- continue;
- }
- // Calculate the overlapping region
- let overlap_start = read_start.max(self.start);
- let overlap_end = read_end.min(self.end);
- // Add the number of bases this read covers within our bin
- total_bases_covered += (overlap_end - overlap_start + 1) as f64;
- }
- // Divide by the bin length to get average coverage
- total_bases_covered / bin_length
- }
- pub fn mean_coverage_from_depths(&self) -> f64 {
- if self.depths.is_empty() {
- return 0.0;
- }
- self.depths.iter().sum::<u32>() as f64 / self.depths.len() as f64
- }
- }
- /// Helper: get start and end position of a read
- fn read_range(record: &Record) -> Option<(u32, u32)> {
- let start = record.pos();
- let end = record.cigar().end_pos();
- if start >= 0 && end > start {
- Some((start as u32, end as u32 - 1))
- } else {
- None
- }
- }
- /// Reused per-reader parse buffers
- #[derive(Default)]
- pub struct BinRowBuf {
- pub depths: Vec<u32>,
- pub lowq: Vec<u32>,
- }
- /// Example: parse one TSV record into buffers.
- /// Returns (start, depths_slice, lowq_slice)
- pub fn parse_bin_record_into<'a>(
- rec: &'a ByteRecord,
- buf: &'a mut BinRowBuf,
- contig_expected: &str,
- ) -> anyhow::Result<(u32, &'a [u32], &'a [u32])> {
- let i_contig = 0usize;
- let i_start = 1usize;
- let i_end = 2usize;
- // adjust if your file has more/less scalar columns, but for your pasted line:
- let i_depths = 9usize; // big CSV list
- let i_lowq = 10usize; // big CSV list
- let contig = std::str::from_utf8(rec.get(i_contig).context("missing contig")?)
- .context("non-utf8 contig")?;
- anyhow::ensure!(contig == contig_expected, "unexpected contig");
- let start = parse_u32(rec.get(i_start).context("missing start")?)?;
- let end = parse_u32(rec.get(i_end).context("missing end")?)?;
- anyhow::ensure!(end >= start, "invalid bin coordinates: end < start ({start} > {end})");
- parse_csv_u32_into(&mut buf.depths, rec.get(i_depths).context("missing depths")?)
- .context("parse depths")?;
- parse_csv_u32_into(&mut buf.lowq, rec.get(i_lowq).context("missing lowq")?)
- .context("parse lowq")?;
- // critical sanity check: end-start+1 should match vector length for per-base bins
- anyhow::ensure!(
- (end - start + 1) as usize == buf.depths.len(),
- "bin width mismatch: {}..{} has width {}, depths has len {}",
- start, end, end - start + 1, buf.depths.len()
- );
- anyhow::ensure!(buf.depths.len() == buf.lowq.len(), "depth/lowq len mismatch");
- Ok((start, &buf.depths, &buf.lowq))
- }
- #[derive(Debug)]
- pub struct BinStats {
- pub contig: String,
- pub start: u32,
- pub end: u32,
- pub n_reads: u32,
- pub n_sa: u32,
- pub max_start_count: u32,
- pub max_end_count: u32,
- pub depths: Vec<u32>,
- pub low_qualities: Vec<u32>,
- // keep fb_invs too if you want
- pub fb_invs: Vec<u32>,
- }
- impl BinStats {
- #[inline]
- pub fn mean_coverage_from_depths(&self) -> f64 {
- if self.depths.is_empty() { return 0.0; }
- self.depths.iter().sum::<u32>() as f64 / self.depths.len() as f64
- }
- }
- impl BinStats {
- pub fn from_bam_region(
- bam_reader: &mut IndexedReader,
- header: &HeaderView,
- contig: &str,
- start: u32,
- length: u32,
- min_mapq: u8,
- ) -> anyhow::Result<Self> {
- anyhow::ensure!(length > 0, "bin length must be > 0");
- let end = start + length - 1;
- bam_reader
- .fetch((contig, start as i64, end as i64))
- .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?;
- let mut depths = vec![0u32; length as usize];
- let mut low_qualities = vec![0u32; length as usize];
- let mut fb_invs = vec![0u32; length as usize];
- let mut n_reads: u32 = 0;
- let mut n_sa: u32 = 0;
- let mut start_counts: HashMap<u32, u32> = HashMap::new();
- let mut end_counts: HashMap<u32, u32> = HashMap::new();
- for record_result in bam_reader.rc_records() {
- let rc_record = match record_result {
- Ok(rc) => rc,
- Err(e) => {
- error!(
- "Failed to parse record in {}:{}-{}: {}",
- contig, start, end, e
- );
- continue;
- }
- };
- let record: &Record = rc_record.as_ref();
- // read genomic span (inclusive)
- let rs_i64 = record.pos();
- let re_i64 = record.cigar().end_pos(); // exclusive
- if rs_i64 < 0 || re_i64 <= rs_i64 {
- continue;
- }
- let read_start = rs_i64 as u32;
- let read_end = (re_i64 as u32).saturating_sub(1);
- if read_end < start || read_start > end {
- continue;
- }
- // counts
- n_reads += 1;
- if matches!(record.aux(b"SA"), Ok(Aux::String(_))) {
- n_sa += 1;
- }
- let aln_start = record.reference_start() as u32;
- let aln_end = record.reference_end() as u32;
- if (start..=end).contains(&aln_start) {
- *start_counts.entry(aln_start).or_insert(0) += 1;
- }
- if (start..=end).contains(&aln_end) {
- *end_counts.entry(aln_end).or_insert(0) += 1;
- }
- // per-record flags (hoisted)
- let is_lowq = record.mapq() < min_mapq;
- let has_fbinv =
- fb_inv_from_record(record, header, min_mapq, Some(150), Some(200)).is_some();
- // per-base accumulation (same semantics as your original)
- let local_start = start.max(read_start);
- let local_end = end.min(read_end);
- for pos in local_start..=local_end {
- let i = (pos - start) as usize;
- depths[i] += 1;
- if is_lowq {
- low_qualities[i] += 1;
- }
- if has_fbinv {
- fb_invs[i] += 1;
- }
- }
- }
- let max_start_count = start_counts.values().copied().max().unwrap_or(0);
- let max_end_count = end_counts.values().copied().max().unwrap_or(0);
- Ok(Self {
- contig: contig.to_string(),
- start,
- end,
- n_reads,
- n_sa,
- max_start_count,
- max_end_count,
- depths,
- low_qualities,
- fb_invs,
- })
- }
- }
|