bin.rs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. use std::collections::HashMap;
  2. use anyhow::Context;
  3. use log::error;
  4. use rust_htslib::bam::{ext::BamRecordExtensions, record::Aux, IndexedReader, Read, Record};
  5. use crate::io::bam::{primary_record, primary_records};
  6. /// A genomic bin containing reads from a specific region.
  7. ///
  8. /// This struct represents a segment of a BAM file, storing reads that overlap
  9. /// a specific genomic region. It provides methods to analyze read distributions
  10. /// and extract primary alignments for supplementary alignments.
  11. #[derive(Debug)]
  12. pub struct Bin {
  13. /// The contig/chromosome name
  14. pub contig: String,
  15. /// The 0-based inclusive start position
  16. pub start: u32,
  17. /// The 0-based inclusive end position
  18. pub end: u32,
  19. /// Map of read query names to their BAM records
  20. pub reads_store: HashMap<Vec<u8>, Record>,
  21. /// Indexed BAM reader for fetching additional records
  22. // pub bam_reader: IndexedReader,
  23. /// Average read length within this bin
  24. // pub reads_mean_len: u32,
  25. /// Number of reads filtered due to low mapping quality
  26. pub n_low_mapq: u32,
  27. /// Vec of depths
  28. pub depths: Vec<u32>,
  29. }
  30. impl Bin {
  31. /// Creates a new genomic bin from a BAM file.
  32. ///
  33. /// # Arguments
  34. ///
  35. /// * `bam_path` - Path to the BAM file
  36. /// * `contig` - Chromosome/contig name
  37. /// * `start` - 0-based inclusive start position
  38. /// * `length` - Length of the region to extract
  39. ///
  40. /// # Returns
  41. ///
  42. /// A Result containing the new Bin if successful, or an error if the BAM file
  43. /// couldn't be read or the region couldn't be fetched.
  44. ///
  45. /// # Examples
  46. ///
  47. /// ```
  48. /// let bin = Bin::new("sample.bam", "chr1", 1000000, 1000)?;
  49. /// println!("Loaded {} reads", bin.n_reads());
  50. /// ```
  51. pub fn new(
  52. bam_reader: &mut IndexedReader,
  53. contig: &str,
  54. start: u32,
  55. length: u32,
  56. min_mapq: u8,
  57. ) -> anyhow::Result<Self> {
  58. let end = start + length - 1;
  59. bam_reader
  60. .fetch((contig, start as i64, end as i64))
  61. .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?;
  62. let mut reads_store: HashMap<Vec<u8>, Record> = HashMap::new();
  63. let mut n_low_mapq = 0;
  64. let mut depths = vec![0u32; length as usize]; // Optional pseudo-depth
  65. for record_result in bam_reader.rc_records() {
  66. let rc_record = match record_result {
  67. Ok(rc) => rc,
  68. Err(e) => {
  69. error!(
  70. "Failed to parse record in {}:{}-{}: {}",
  71. contig, start, end, e
  72. );
  73. continue;
  74. }
  75. };
  76. let record = rc_record.as_ref();
  77. if record.mapq() < min_mapq {
  78. n_low_mapq += 1;
  79. continue;
  80. }
  81. if let Some((read_start, read_end)) = read_range(record) {
  82. if read_end < start || read_start > end {
  83. continue; // Not overlapping this bin
  84. }
  85. // Clone the underlying Record
  86. reads_store.insert(record.qname().to_vec(), record.clone());
  87. // Optional: depth accumulation
  88. let local_start = start.max(read_start);
  89. let local_end = end.min(read_end);
  90. for pos in local_start..=local_end {
  91. let i = (pos - start) as usize;
  92. if i < depths.len() {
  93. depths[i] += 1;
  94. }
  95. }
  96. }
  97. }
  98. Ok(Bin {
  99. contig: contig.to_string(),
  100. start,
  101. end,
  102. reads_store,
  103. n_low_mapq,
  104. depths,
  105. })
  106. }
  107. /// Returns the total number of reads in this bin.
  108. pub fn n_reads(&self) -> usize {
  109. self.reads_store.len()
  110. }
  111. /// Returns the number of reads with SA (supplementary alignment) tags.
  112. pub fn n_sa(&self) -> usize {
  113. self.reads_store
  114. .values()
  115. .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
  116. .count()
  117. }
  118. /// Retrieves primary alignments for all reads with SA tags in this bin.
  119. ///
  120. /// # Returns
  121. ///
  122. /// A vector of primary alignment records for reads with supplementary alignments.
  123. pub fn sa_primary(&mut self, bam_reader: &mut IndexedReader) -> Vec<Record> {
  124. self.reads_store
  125. .values()
  126. .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
  127. .map(|record| primary_record(bam_reader, record.clone()))
  128. .collect()
  129. }
  130. /// Finds the position with the highest concentration of read starts or ends.
  131. ///
  132. /// # Returns
  133. ///
  134. /// A tuple containing (position, count) of the position with the most read starts or ends.
  135. pub fn max_start_or_end(&self) -> (u32, usize) {
  136. let mut position_counts: HashMap<u32, usize> = HashMap::new();
  137. for record in self.reads_store.values() {
  138. let reference_start = record.reference_start() as u32;
  139. let reference_end = record.reference_end() as u32;
  140. // Count start positions within the bin
  141. if reference_start >= self.start && reference_start <= self.end {
  142. *position_counts.entry(reference_start).or_default() += 1;
  143. }
  144. // Count end positions within the bin
  145. if reference_end >= self.start && reference_end <= self.end {
  146. *position_counts.entry(reference_end).or_default() += 1;
  147. }
  148. }
  149. position_counts
  150. .into_iter()
  151. .max_by_key(|(_, count)| *count)
  152. .unwrap_or((0, 0))
  153. }
  154. /// Computes the number of reads with SA tags, and finds the highest
  155. /// counts of read starts and ends at the same position.
  156. ///
  157. /// # Returns
  158. ///
  159. /// A tuple containing:
  160. /// - Number of reads with SA tags
  161. /// - Highest count of reads starting at any single position
  162. /// - Highest count of reads ending at any single position
  163. pub fn count_reads_sa_start_end(&self) -> (usize, usize, usize) {
  164. let mut n_sa = 0;
  165. let mut start_counts: HashMap<u32, usize> = HashMap::new();
  166. let mut end_counts: HashMap<u32, usize> = HashMap::new();
  167. for record in self.reads_store.values() {
  168. // Count reads with SA tags
  169. if matches!(record.aux(b"SA"), Ok(Aux::String(_))) {
  170. n_sa += 1;
  171. }
  172. let reference_start = record.reference_start() as u32;
  173. let reference_end = record.reference_end() as u32;
  174. // Count start positions within the bin
  175. if reference_start >= self.start && reference_start <= self.end {
  176. *start_counts.entry(reference_start).or_default() += 1;
  177. }
  178. // Count end positions within the bin
  179. if reference_end >= self.start && reference_end <= self.end {
  180. *end_counts.entry(reference_end).or_default() += 1;
  181. }
  182. }
  183. let max_start_count = start_counts.values().max().copied().unwrap_or(0);
  184. let max_end_count = end_counts.values().max().copied().unwrap_or(0);
  185. (n_sa, max_start_count, max_end_count)
  186. }
  187. /// Finds the position with the highest concentration of read starts.
  188. ///
  189. /// # Returns
  190. ///
  191. /// A tuple containing (position, count) of the position with the most read starts.
  192. pub fn max_start(&self) -> (u32, usize) {
  193. let mut start_counts: HashMap<u32, usize> = HashMap::new();
  194. for record in self.reads_store.values() {
  195. let reference_start = record.reference_start() as u32;
  196. if reference_start >= self.start && reference_start <= self.end {
  197. *start_counts.entry(reference_start).or_default() += 1;
  198. }
  199. }
  200. start_counts
  201. .into_iter()
  202. .max_by_key(|(_, count)| *count)
  203. .unwrap_or((0, 0))
  204. }
  205. /// Finds the position with the highest concentration of read ends.
  206. ///
  207. /// # Returns
  208. ///
  209. /// A tuple containing (position, count) of the position with the most read ends.
  210. pub fn max_end(&self) -> (u32, usize) {
  211. let mut end_counts: HashMap<u32, usize> = HashMap::new();
  212. for record in self.reads_store.values() {
  213. let reference_end = record.reference_end() as u32;
  214. if reference_end >= self.start && reference_end <= self.end {
  215. *end_counts.entry(reference_end).or_default() += 1;
  216. }
  217. }
  218. end_counts
  219. .into_iter()
  220. .max_by_key(|(_, count)| *count)
  221. .unwrap_or((0, 0))
  222. }
  223. /// Gets primary alignments for reads that start or end at a specific position.
  224. ///
  225. /// # Arguments
  226. ///
  227. /// * `pos` - The position to check for read starts or ends
  228. ///
  229. /// # Returns
  230. ///
  231. /// A vector of primary alignment records.
  232. pub fn se_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
  233. let records: Vec<Record> = self
  234. .reads_store
  235. .values()
  236. .filter(|record| {
  237. record.reference_start() as u32 == pos || record.reference_end() as u32 == pos
  238. })
  239. .cloned()
  240. .collect();
  241. primary_records(bam_reader, records)
  242. }
  243. /// Gets primary alignments for reads that start at a specific position.
  244. ///
  245. /// # Arguments
  246. ///
  247. /// * `pos` - The position to check for read starts
  248. ///
  249. /// # Returns
  250. ///
  251. /// A vector of primary alignment records.
  252. pub fn s_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
  253. self.reads_store
  254. .values()
  255. .filter(|record| record.reference_start() as u32 == pos)
  256. .map(|record| primary_record(bam_reader, record.clone()))
  257. .collect()
  258. }
  259. /// Gets primary alignments for reads that end at a specific position.
  260. ///
  261. /// # Arguments
  262. ///
  263. /// * `pos` - The position to check for read ends
  264. ///
  265. /// # Returns
  266. ///
  267. /// A vector of primary alignment records.
  268. pub fn e_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
  269. self.reads_store
  270. .values()
  271. .filter(|record| record.reference_end() as u32 == pos)
  272. .map(|record| primary_record(bam_reader, record.clone()))
  273. .collect()
  274. }
  275. /// Calculates the mean coverage by considering the actual
  276. /// span of each read in the bin.
  277. ///
  278. /// # Returns
  279. ///
  280. /// The mean bin coverage as a floating-point value.
  281. pub fn mean_coverage(&self) -> f64 {
  282. // If bin has no length or no reads, return 0
  283. if self.end <= self.start || self.reads_store.is_empty() {
  284. return 0.0;
  285. }
  286. // Calculate the total length of the bin
  287. let bin_length = (self.end - self.start + 1) as f64;
  288. // Calculate the total bases covered by all reads within the bin
  289. let mut total_bases_covered = 0.0;
  290. for record in self.reads_store.values() {
  291. let read_start = record.reference_start() as u32;
  292. let read_end = record.reference_end() as u32;
  293. // Skip reads that don't overlap with our bin
  294. if read_end < self.start || read_start > self.end {
  295. continue;
  296. }
  297. // Calculate the overlapping region
  298. let overlap_start = read_start.max(self.start);
  299. let overlap_end = read_end.min(self.end);
  300. // Add the number of bases this read covers within our bin
  301. total_bases_covered += (overlap_end - overlap_start + 1) as f64;
  302. }
  303. // Divide by the bin length to get average coverage
  304. total_bases_covered / bin_length
  305. }
  306. pub fn mean_coverage_from_depths(&self) -> f64 {
  307. self.depths.iter().sum::<u32>() as f64 / self.depths.len() as f64
  308. }
  309. }
  310. /// Helper: get start and end position of a read
  311. fn read_range(record: &Record) -> Option<(u32, u32)> {
  312. let start = record.pos();
  313. let end = record.cigar().end_pos();
  314. if start >= 0 && end > start {
  315. Some((start as u32, end as u32 - 1))
  316. } else {
  317. None
  318. }
  319. }