bin.rs 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558
  1. use std::collections::HashMap;
  2. use anyhow::Context;
  3. use csv::ByteRecord;
  4. use log::error;
  5. use rust_htslib::bam::{HeaderView, IndexedReader, Read, Record, ext::BamRecordExtensions, record::Aux};
  6. use crate::io::{bam::{fb_inv_from_record, primary_record, primary_records}, tsv::{parse_csv_u32_into, parse_u32}};
  7. /// A genomic bin containing reads from a specific region.
  8. ///
  9. /// This struct represents a segment of a BAM file, storing reads that overlap
  10. /// a specific genomic region. It provides methods to analyze read distributions
  11. /// and extract primary alignments for supplementary alignments.
  12. #[derive(Debug)]
  13. pub struct Bin {
  14. /// The contig/chromosome name
  15. pub contig: String,
  16. /// The 0-based inclusive start position
  17. pub start: u32,
  18. /// The 0-based inclusive end position
  19. pub end: u32,
  20. /// Map of read query names to their BAM records
  21. pub reads_store: HashMap<Vec<u8>, Record>,
  22. /// Indexed BAM reader for fetching additional records
  23. // pub bam_reader: IndexedReader,
  24. /// Average read length within this bin
  25. // pub reads_mean_len: u32,
  26. /// Number of reads filtered due to low mapping quality
  27. pub low_qualities: Vec<u32>,
  28. /// Vec of depths
  29. pub depths: Vec<u32>,
  30. pub fb_invs: Vec<u32>,
  31. }
  32. impl Bin {
  33. /// Creates a new genomic bin from a BAM file.
  34. ///
  35. /// # Arguments
  36. ///
  37. /// * `bam_path` - Path to the BAM file
  38. /// * `contig` - Chromosome/contig name
  39. /// * `start` - 0-based inclusive start position
  40. /// * `length` - Length of the region to extract
  41. ///
  42. /// # Returns
  43. ///
  44. /// A Result containing the new Bin if successful, or an error if the BAM file
  45. /// couldn't be read or the region couldn't be fetched.
  46. ///
  47. /// # Examples
  48. ///
  49. /// ```
  50. /// let bin = Bin::new("sample.bam", "chr1", 1000000, 1000)?;
  51. /// println!("Loaded {} reads", bin.n_reads());
  52. /// ```
  53. pub fn new(
  54. bam_reader: &mut IndexedReader,
  55. contig: &str,
  56. start: u32,
  57. length: u32,
  58. min_mapq: u8,
  59. ) -> anyhow::Result<Self> {
  60. anyhow::ensure!(length > 0, "bin length must be > 0");
  61. let end = start + length - 1;
  62. bam_reader
  63. .fetch((contig, start as i64, end as i64))
  64. .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?;
  65. let header = bam_reader.header().clone();
  66. let mut reads_store: HashMap<Vec<u8>, Record> = HashMap::new();
  67. let mut depths = vec![0u32; length as usize]; // Optional pseudo-depth
  68. let mut low_qualities = vec![0u32; length as usize]; // Optional pseudo-depth
  69. let mut fb_invs = vec![0u32; length as usize];
  70. for record_result in bam_reader.rc_records() {
  71. let rc_record = match record_result {
  72. Ok(rc) => rc,
  73. Err(e) => {
  74. error!(
  75. "Failed to parse record in {}:{}-{}: {}",
  76. contig, start, end, e
  77. );
  78. continue;
  79. }
  80. };
  81. let record = rc_record.as_ref();
  82. if let Some((read_start, read_end)) = read_range(record) {
  83. if read_end < start || read_start > end {
  84. continue; // Not overlapping this bin
  85. }
  86. let has_fbinv =
  87. fb_inv_from_record(record, &header, min_mapq, Some(150), Some(200)).is_some();
  88. // Clone the underlying Record
  89. reads_store.insert(record.qname().to_vec(), record.clone());
  90. // Optional: depth accumulation
  91. let local_start = start.max(read_start);
  92. let local_end = end.min(read_end);
  93. for pos in local_start..=local_end {
  94. let i = (pos - start) as usize;
  95. if i < depths.len() {
  96. depths[i] += 1;
  97. if record.mapq() < min_mapq {
  98. low_qualities[i] += 1;
  99. }
  100. if has_fbinv {
  101. fb_invs[i] += 1;
  102. }
  103. }
  104. }
  105. }
  106. }
  107. Ok(Bin {
  108. contig: contig.to_string(),
  109. start,
  110. end,
  111. reads_store,
  112. low_qualities,
  113. depths,
  114. fb_invs,
  115. })
  116. }
  117. /// Returns the total number of reads in this bin.
  118. pub fn n_reads(&self) -> usize {
  119. self.reads_store.len()
  120. }
  121. /// Returns the number of reads with SA (supplementary alignment) tags.
  122. pub fn n_sa(&self) -> usize {
  123. self.reads_store
  124. .values()
  125. .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
  126. .count()
  127. }
  128. /// Retrieves primary alignments for all reads with SA tags in this bin.
  129. ///
  130. /// # Returns
  131. ///
  132. /// A vector of primary alignment records for reads with supplementary alignments.
  133. pub fn sa_primary(&mut self, bam_reader: &mut IndexedReader) -> Vec<Record> {
  134. self.reads_store
  135. .values()
  136. .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
  137. .map(|record| primary_record(bam_reader, record.clone()))
  138. .collect()
  139. }
  140. /// Finds the position with the highest concentration of read starts or ends.
  141. ///
  142. /// # Returns
  143. ///
  144. /// A tuple containing (position, count) of the position with the most read starts or ends.
  145. pub fn max_start_or_end(&self) -> (u32, usize) {
  146. let mut position_counts: HashMap<u32, usize> = HashMap::new();
  147. for record in self.reads_store.values() {
  148. let reference_start = record.reference_start() as u32;
  149. let reference_end = record.reference_end() as u32;
  150. // Count start positions within the bin
  151. if reference_start >= self.start && reference_start <= self.end {
  152. *position_counts.entry(reference_start).or_default() += 1;
  153. }
  154. // Count end positions within the bin
  155. if reference_end >= self.start && reference_end <= self.end {
  156. *position_counts.entry(reference_end).or_default() += 1;
  157. }
  158. }
  159. position_counts
  160. .into_iter()
  161. .max_by_key(|(_, count)| *count)
  162. .unwrap_or((0, 0))
  163. }
  164. /// Computes the number of reads with SA tags, and finds the highest
  165. /// counts of read starts and ends at the same position.
  166. ///
  167. /// # Returns
  168. ///
  169. /// A tuple containing:
  170. /// - Number of reads with SA tags
  171. /// - Highest count of reads starting at any single position
  172. /// - Highest count of reads ending at any single position
  173. pub fn count_reads_sa_start_end(&self) -> (usize, usize, usize) {
  174. let mut n_sa = 0;
  175. let mut start_counts: HashMap<u32, usize> = HashMap::new();
  176. let mut end_counts: HashMap<u32, usize> = HashMap::new();
  177. for record in self.reads_store.values() {
  178. // Count reads with SA tags
  179. if matches!(record.aux(b"SA"), Ok(Aux::String(_))) {
  180. n_sa += 1;
  181. }
  182. let reference_start = record.reference_start() as u32;
  183. let reference_end = record.reference_end() as u32;
  184. // Count start positions within the bin
  185. if reference_start >= self.start && reference_start <= self.end {
  186. *start_counts.entry(reference_start).or_default() += 1;
  187. }
  188. // Count end positions within the bin
  189. if reference_end >= self.start && reference_end <= self.end {
  190. *end_counts.entry(reference_end).or_default() += 1;
  191. }
  192. }
  193. let max_start_count = start_counts.values().max().copied().unwrap_or(0);
  194. let max_end_count = end_counts.values().max().copied().unwrap_or(0);
  195. (n_sa, max_start_count, max_end_count)
  196. }
  197. /// Finds the position with the highest concentration of read starts.
  198. ///
  199. /// # Returns
  200. ///
  201. /// A tuple containing (position, count) of the position with the most read starts.
  202. pub fn max_start(&self) -> (u32, usize) {
  203. let mut start_counts: HashMap<u32, usize> = HashMap::new();
  204. for record in self.reads_store.values() {
  205. let reference_start = record.reference_start() as u32;
  206. if reference_start >= self.start && reference_start <= self.end {
  207. *start_counts.entry(reference_start).or_default() += 1;
  208. }
  209. }
  210. start_counts
  211. .into_iter()
  212. .max_by_key(|(_, count)| *count)
  213. .unwrap_or((0, 0))
  214. }
  215. /// Finds the position with the highest concentration of read ends.
  216. ///
  217. /// # Returns
  218. ///
  219. /// A tuple containing (position, count) of the position with the most read ends.
  220. pub fn max_end(&self) -> (u32, usize) {
  221. let mut end_counts: HashMap<u32, usize> = HashMap::new();
  222. for record in self.reads_store.values() {
  223. let reference_end = record.reference_end() as u32;
  224. if reference_end >= self.start && reference_end <= self.end {
  225. *end_counts.entry(reference_end).or_default() += 1;
  226. }
  227. }
  228. end_counts
  229. .into_iter()
  230. .max_by_key(|(_, count)| *count)
  231. .unwrap_or((0, 0))
  232. }
  233. /// Gets primary alignments for reads that start or end at a specific position.
  234. ///
  235. /// # Arguments
  236. ///
  237. /// * `pos` - The position to check for read starts or ends
  238. ///
  239. /// # Returns
  240. ///
  241. /// A vector of primary alignment records.
  242. pub fn se_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
  243. let records: Vec<Record> = self
  244. .reads_store
  245. .values()
  246. .filter(|record| {
  247. record.reference_start() as u32 == pos || record.reference_end() as u32 == pos
  248. })
  249. .cloned()
  250. .collect();
  251. primary_records(bam_reader, records)
  252. }
  253. /// Gets primary alignments for reads that start at a specific position.
  254. ///
  255. /// # Arguments
  256. ///
  257. /// * `pos` - The position to check for read starts
  258. ///
  259. /// # Returns
  260. ///
  261. /// A vector of primary alignment records.
  262. pub fn s_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
  263. self.reads_store
  264. .values()
  265. .filter(|record| record.reference_start() as u32 == pos)
  266. .map(|record| primary_record(bam_reader, record.clone()))
  267. .collect()
  268. }
  269. /// Gets primary alignments for reads that end at a specific position.
  270. ///
  271. /// # Arguments
  272. ///
  273. /// * `pos` - The position to check for read ends
  274. ///
  275. /// # Returns
  276. ///
  277. /// A vector of primary alignment records.
  278. pub fn e_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
  279. self.reads_store
  280. .values()
  281. .filter(|record| record.reference_end() as u32 == pos)
  282. .map(|record| primary_record(bam_reader, record.clone()))
  283. .collect()
  284. }
  285. /// Calculates the mean coverage by considering the actual
  286. /// span of each read in the bin.
  287. ///
  288. /// # Returns
  289. ///
  290. /// The mean bin coverage as a floating-point value.
  291. pub fn mean_coverage(&self) -> f64 {
  292. // If bin has no length or no reads, return 0
  293. if self.end <= self.start || self.reads_store.is_empty() {
  294. return 0.0;
  295. }
  296. // Calculate the total length of the bin
  297. let bin_length = (self.end - self.start + 1) as f64;
  298. // Calculate the total bases covered by all reads within the bin
  299. let mut total_bases_covered = 0.0;
  300. for record in self.reads_store.values() {
  301. let read_start = record.reference_start() as u32;
  302. let read_end = record.reference_end() as u32;
  303. // Skip reads that don't overlap with our bin
  304. if read_end < self.start || read_start > self.end {
  305. continue;
  306. }
  307. // Calculate the overlapping region
  308. let overlap_start = read_start.max(self.start);
  309. let overlap_end = read_end.min(self.end);
  310. // Add the number of bases this read covers within our bin
  311. total_bases_covered += (overlap_end - overlap_start + 1) as f64;
  312. }
  313. // Divide by the bin length to get average coverage
  314. total_bases_covered / bin_length
  315. }
  316. pub fn mean_coverage_from_depths(&self) -> f64 {
  317. if self.depths.is_empty() {
  318. return 0.0;
  319. }
  320. self.depths.iter().sum::<u32>() as f64 / self.depths.len() as f64
  321. }
  322. }
  323. /// Helper: get start and end position of a read
  324. fn read_range(record: &Record) -> Option<(u32, u32)> {
  325. let start = record.pos();
  326. let end = record.cigar().end_pos();
  327. if start >= 0 && end > start {
  328. Some((start as u32, end as u32 - 1))
  329. } else {
  330. None
  331. }
  332. }
/// Reused per-reader parse buffers.
///
/// Scratch vectors that `parse_bin_record_into` refills on each call,
/// avoiding a fresh allocation per TSV record.
#[derive(Default)]
pub struct BinRowBuf {
    /// Per-base depth values parsed from the record's depths column
    pub depths: Vec<u32>,
    /// Per-base low-mapping-quality counts parsed from the lowq column
    pub lowq: Vec<u32>,
}
  339. /// Example: parse one TSV record into buffers.
  340. /// Returns (start, depths_slice, lowq_slice)
  341. pub fn parse_bin_record_into<'a>(
  342. rec: &'a ByteRecord,
  343. buf: &'a mut BinRowBuf,
  344. contig_expected: &str,
  345. ) -> anyhow::Result<(u32, &'a [u32], &'a [u32])> {
  346. let i_contig = 0usize;
  347. let i_start = 1usize;
  348. let i_end = 2usize;
  349. // adjust if your file has more/less scalar columns, but for your pasted line:
  350. let i_depths = 9usize; // big CSV list
  351. let i_lowq = 10usize; // big CSV list
  352. let contig = std::str::from_utf8(rec.get(i_contig).context("missing contig")?)
  353. .context("non-utf8 contig")?;
  354. anyhow::ensure!(contig == contig_expected, "unexpected contig");
  355. let start = parse_u32(rec.get(i_start).context("missing start")?)?;
  356. let end = parse_u32(rec.get(i_end).context("missing end")?)?;
  357. anyhow::ensure!(end >= start, "invalid bin coordinates: end < start ({start} > {end})");
  358. parse_csv_u32_into(&mut buf.depths, rec.get(i_depths).context("missing depths")?)
  359. .context("parse depths")?;
  360. parse_csv_u32_into(&mut buf.lowq, rec.get(i_lowq).context("missing lowq")?)
  361. .context("parse lowq")?;
  362. // critical sanity check: end-start+1 should match vector length for per-base bins
  363. anyhow::ensure!(
  364. (end - start + 1) as usize == buf.depths.len(),
  365. "bin width mismatch: {}..{} has width {}, depths has len {}",
  366. start, end, end - start + 1, buf.depths.len()
  367. );
  368. anyhow::ensure!(buf.depths.len() == buf.lowq.len(), "depth/lowq len mismatch");
  369. Ok((start, &buf.depths, &buf.lowq))
  370. }
/// Summary statistics for one genomic bin.
///
/// Unlike `Bin`, this type keeps only aggregate counters and per-base
/// vectors; the BAM records themselves are not retained.
#[derive(Debug)]
pub struct BinStats {
    /// The contig/chromosome name
    pub contig: String,
    /// The 0-based inclusive start position
    pub start: u32,
    /// The 0-based inclusive end position
    pub end: u32,
    /// Number of reads overlapping the bin
    pub n_reads: u32,
    /// Number of overlapping reads carrying an SA (supplementary alignment) tag
    pub n_sa: u32,
    /// Highest count of reads whose alignment starts at any single position in the bin
    pub max_start_count: u32,
    /// Highest count of reads whose alignment ends at any single position in the bin
    pub max_end_count: u32,
    /// Per-base read depth, one slot per position in `[start, end]`
    pub depths: Vec<u32>,
    /// Per-base count of reads below the minimum mapping quality
    pub low_qualities: Vec<u32>,
    /// Per-base count of reads flagged as foldback inversions
    pub fb_invs: Vec<u32>,
}
  385. impl BinStats {
  386. #[inline]
  387. pub fn mean_coverage_from_depths(&self) -> f64 {
  388. if self.depths.is_empty() { return 0.0; }
  389. self.depths.iter().sum::<u32>() as f64 / self.depths.len() as f64
  390. }
  391. }
  392. impl BinStats {
  393. pub fn from_bam_region(
  394. bam_reader: &mut IndexedReader,
  395. header: &HeaderView,
  396. contig: &str,
  397. start: u32,
  398. length: u32,
  399. min_mapq: u8,
  400. ) -> anyhow::Result<Self> {
  401. anyhow::ensure!(length > 0, "bin length must be > 0");
  402. let end = start + length - 1;
  403. bam_reader
  404. .fetch((contig, start as i64, end as i64))
  405. .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?;
  406. let mut depths = vec![0u32; length as usize];
  407. let mut low_qualities = vec![0u32; length as usize];
  408. let mut fb_invs = vec![0u32; length as usize];
  409. let mut n_reads: u32 = 0;
  410. let mut n_sa: u32 = 0;
  411. let mut start_counts: HashMap<u32, u32> = HashMap::new();
  412. let mut end_counts: HashMap<u32, u32> = HashMap::new();
  413. for record_result in bam_reader.rc_records() {
  414. let rc_record = match record_result {
  415. Ok(rc) => rc,
  416. Err(e) => {
  417. error!(
  418. "Failed to parse record in {}:{}-{}: {}",
  419. contig, start, end, e
  420. );
  421. continue;
  422. }
  423. };
  424. let record: &Record = rc_record.as_ref();
  425. // read genomic span (inclusive)
  426. let rs_i64 = record.pos();
  427. let re_i64 = record.cigar().end_pos(); // exclusive
  428. if rs_i64 < 0 || re_i64 <= rs_i64 {
  429. continue;
  430. }
  431. let read_start = rs_i64 as u32;
  432. let read_end = (re_i64 as u32).saturating_sub(1);
  433. if read_end < start || read_start > end {
  434. continue;
  435. }
  436. // counts
  437. n_reads += 1;
  438. if matches!(record.aux(b"SA"), Ok(Aux::String(_))) {
  439. n_sa += 1;
  440. }
  441. let aln_start = record.reference_start() as u32;
  442. let aln_end = record.reference_end() as u32;
  443. if (start..=end).contains(&aln_start) {
  444. *start_counts.entry(aln_start).or_insert(0) += 1;
  445. }
  446. if (start..=end).contains(&aln_end) {
  447. *end_counts.entry(aln_end).or_insert(0) += 1;
  448. }
  449. // per-record flags (hoisted)
  450. let is_lowq = record.mapq() < min_mapq;
  451. let has_fbinv =
  452. fb_inv_from_record(record, header, min_mapq, Some(150), Some(200)).is_some();
  453. // per-base accumulation (same semantics as your original)
  454. let local_start = start.max(read_start);
  455. let local_end = end.min(read_end);
  456. for pos in local_start..=local_end {
  457. let i = (pos - start) as usize;
  458. depths[i] += 1;
  459. if is_lowq {
  460. low_qualities[i] += 1;
  461. }
  462. if has_fbinv {
  463. fb_invs[i] += 1;
  464. }
  465. }
  466. }
  467. let max_start_count = start_counts.values().copied().max().unwrap_or(0);
  468. let max_end_count = end_counts.values().copied().max().unwrap_or(0);
  469. Ok(Self {
  470. contig: contig.to_string(),
  471. start,
  472. end,
  473. n_reads,
  474. n_sa,
  475. max_start_count,
  476. max_end_count,
  477. depths,
  478. low_qualities,
  479. fb_invs,
  480. })
  481. }
  482. }