bam.rs 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832
  1. use std::{
  2. collections::HashMap,
  3. fs::{self, File},
  4. path::PathBuf,
  5. };
  6. use anyhow::{anyhow, Context};
  7. use chrono::{DateTime, Utc};
  8. use dashmap::DashMap;
  9. use glob::glob;
  10. use log::{debug, info, warn};
  11. use rand::{rng, Rng};
  12. use rayon::prelude::*;
  13. use rust_htslib::bam::{ext::BamRecordExtensions, record::Cigar, Read};
  14. use serde::{Deserialize, Serialize};
  15. use crate::config::Config;
  16. use super::flowcells::FlowCell;
  17. #[derive(Debug, Clone, Deserialize, Serialize)]
  18. pub struct WGSBam {
  19. pub id: String,
  20. pub time_point: String,
  21. pub reference_genome: String,
  22. // pub bam_type: BamType,
  23. pub path: PathBuf,
  24. pub modified: DateTime<Utc>,
  25. pub bam_stats: WGSBamStats,
  26. // pub cramino: Option<CraminoRes>,
  27. pub composition: Vec<(String, String, f64)>, // acquisition id, fn
  28. }
  29. // #[derive(Debug, PartialEq, Clone, Deserialize, Serialize)]
  30. // pub enum BamType {
  31. // WGS,
  32. // Panel(String),
  33. // ChIP(String),
  34. // }
  35. //
  36. impl WGSBam {
  37. pub fn new(path: PathBuf) -> anyhow::Result<Self> {
  38. let stem = path
  39. .clone()
  40. .file_stem()
  41. .context(format!("Can't parse stem from {}", path.display()))?
  42. .to_string_lossy()
  43. .to_string();
  44. let stem: Vec<&str> = stem.split('_').collect();
  45. if stem.len() != 3 {
  46. return Err(anyhow!("Error in bam name: {}", path.display()));
  47. }
  48. let id = stem[0].to_string();
  49. let time_point = stem[1].to_string();
  50. let reference_genome = stem
  51. .last()
  52. .context("Can't get last from stem {stem}")?
  53. .to_string();
  54. // let bam_type = if stem.len() == 4 {
  55. // match stem[2] {
  56. // "oncoT" => BamType::Panel("oncoT".to_string()),
  57. // "H3K27ac" => BamType::ChIP("H3K27ac".to_string()),
  58. // "H3K4me3" => BamType::ChIP("H3K4me3".to_string()),
  59. // _ => return Err(anyhow!("Error in bam name: {}", path.display())),
  60. // }
  61. // } else {
  62. // BamType::WGS
  63. // };
  64. let file_metadata = fs::metadata(&path)?;
  65. let modified = file_metadata.modified()?;
  66. let tp_dir = path
  67. .parent()
  68. .context("Can't parse parent from: {bam_path}")?;
  69. let json_path = format!(
  70. "{}/{id}_{time_point}_{reference_genome}_info.json",
  71. tp_dir.to_string_lossy()
  72. );
  73. let json_file = PathBuf::from(&json_path);
  74. if json_file.exists() && json_file.metadata()?.modified()? > modified {
  75. match Self::load_json(&json_path) {
  76. Ok(s) => return Ok(s),
  77. Err(e) => {
  78. warn!("Error while loading {}.\n{e}", json_path);
  79. }
  80. }
  81. }
  82. // let cramino_path = format!(
  83. // "{}/{id}_{time_point}_{reference_genome}_cramino.txt",
  84. // tp_dir.to_string_lossy()
  85. // );
  86. // let cramino = if bam_type == BamType::WGS {
  87. // if !PathBuf::from_str(&cramino_path)?.exists() {
  88. // // return Err(anyhow!("Cramino file missing {cramino_path}"));
  89. // Cramino::default().with_bam(path.to_str().unwrap())?;
  90. // }
  91. // let mut cramino = Cramino::default().with_result_path(&cramino_path);
  92. // cramino
  93. // .parse_results()
  94. // .context(format!("Error while parsing cramino for {cramino_path}"))?;
  95. //
  96. // if let Some(cramino) = cramino.results {
  97. // Some(cramino)
  98. // } else {
  99. // return Err(anyhow!("Cramino results parsing failed"));
  100. // }
  101. // } else {
  102. // None
  103. // };
  104. let composition = bam_compo(path.to_string_lossy().as_ref(), 20_000).context(format!(
  105. "Error while reading BAM composition for {}",
  106. path.display()
  107. ))?;
  108. let s = Self {
  109. path,
  110. bam_stats: WGSBamStats::new(&id, &time_point, Config::default())?,
  111. // cramino,
  112. id: id.to_string(),
  113. time_point: time_point.to_string(),
  114. // bam_type,
  115. reference_genome,
  116. composition,
  117. modified: modified.into(),
  118. };
  119. s.save_json(&json_path)?;
  120. Ok(s)
  121. }
  122. pub fn load_json(path: &str) -> anyhow::Result<Self> {
  123. let f = File::open(path)?;
  124. let s: Self = serde_json::from_reader(f)?;
  125. Ok(s)
  126. }
  127. pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
  128. let f = File::create(path)?;
  129. serde_json::to_writer(f, self)?;
  130. Ok(())
  131. }
  132. }
  133. #[derive(Debug, Default, Clone)]
  134. pub struct BamCollection {
  135. pub bams: Vec<WGSBam>,
  136. }
  137. impl BamCollection {
  138. pub fn new(result_dir: &str) -> Self {
  139. load_bam_collection(result_dir)
  140. }
  141. pub fn by_acquisition_id(&self) -> HashMap<String, Vec<&WGSBam>> {
  142. let mut acq: HashMap<String, Vec<&WGSBam>> = HashMap::new();
  143. for bam in self.bams.iter() {
  144. for (acq_id, _, _) in bam.composition.iter() {
  145. if let Some(entry) = acq.get_mut(acq_id) {
  146. entry.push(bam);
  147. } else {
  148. acq.insert(acq_id.to_string(), vec![bam]);
  149. }
  150. }
  151. }
  152. acq
  153. }
  154. pub fn get(&self, id: &str, time_point: &str) -> Vec<&WGSBam> {
  155. self.bams
  156. .iter()
  157. .filter(|b| b.id == id && b.time_point == time_point)
  158. .collect()
  159. }
  160. pub fn by_id_completed(&self, min_diag_cov: f32, min_mrd_cov: f32) -> Vec<WGSBam> {
  161. self.bams
  162. .iter()
  163. // .filter(|b| matches!(b.bam_type, BamType::WGS))
  164. .filter(|b| match b.time_point.as_str() {
  165. "diag" => b.bam_stats.global_coverage >= min_diag_cov as f64,
  166. "mrd" => b.bam_stats.global_coverage >= min_mrd_cov as f64,
  167. _ => false,
  168. })
  169. // .filter(|b| match &b.cramino {
  170. // Some(cramino) => match b.time_point.as_str() {
  171. // "diag" => cramino.mean_length >= min_diag_cov as f64,
  172. // "mrd" => cramino.mean_length >= min_mrd_cov as f64,
  173. // _ => false,
  174. // },
  175. // _ => false,
  176. // })
  177. .cloned()
  178. .collect()
  179. }
  180. pub fn ids(&self) -> Vec<String> {
  181. let mut ids: Vec<String> = self.bams.iter().map(|b| b.id.clone()).collect();
  182. ids.sort();
  183. ids.dedup();
  184. ids
  185. }
  186. }
  187. pub fn load_bam_collection(result_dir: &str) -> BamCollection {
  188. let pattern = format!("{}/*/*/*.bam", result_dir);
  189. let bams = glob(&pattern)
  190. .expect("Failed to read glob pattern")
  191. // .par_bridge()
  192. .filter_map(|entry| {
  193. match entry {
  194. Ok(path) => match WGSBam::new(path) {
  195. Ok(bam) => return Some(bam),
  196. Err(e) => warn!("{e}"),
  197. },
  198. Err(e) => warn!("Error: {:?}", e),
  199. }
  200. None
  201. })
  202. .collect();
  203. BamCollection { bams }
  204. }
  205. pub fn bam_compo(file_path: &str, sample_size: usize) -> anyhow::Result<Vec<(String, String, f64)>> {
  206. let mut bam = rust_htslib::bam::Reader::from_path(file_path)?;
  207. let mut rgs: HashMap<( String, String ), u64> = HashMap::new();
  208. for result in bam.records().filter_map(Result::ok).take(sample_size) {
  209. if let ( rust_htslib::bam::record::Aux::String(s), rust_htslib::bam::record::Aux::String(b)) = ( result.aux(b"RG")?, result.aux(b"fn")? ) {
  210. *rgs.entry(( s.to_string(), b.to_string())).or_default() += 1;
  211. }
  212. }
  213. Ok(rgs
  214. .into_iter()
  215. .map(|(k, n)| (k.0.to_string(), k.1.to_string(), n as f64 * 100.0 / sample_size as f64))
  216. .map(|(rg, f, p)| {
  217. let (a, _) = rg.split_once('_').unwrap();
  218. let (b, _) = f.split_once('_').unwrap();
  219. (a.to_string(), b.to_string(), p)
  220. })
  221. .collect())
  222. }
  223. pub fn bam_has_fc(bam: &WGSBam, flow_cell: &FlowCell) -> anyhow::Result<bool> {
  224. let mut bam = rust_htslib::bam::Reader::from_path(&bam.path)?;
  225. let mut rgs: HashMap<String, u64> = HashMap::new();
  226. for result in bam.records().filter_map(Result::ok).take(2_000) {
  227. if let rust_htslib::bam::record::Aux::String(s) = result.aux(b"fn")? {
  228. *rgs.entry(s.to_string()).or_default() += 1;
  229. }
  230. }
  231. for k in rgs.keys() {
  232. if k.contains(&flow_cell.sample_sheet.flow_cell_id) {
  233. return Ok(true);
  234. }
  235. }
  236. Ok(false)
  237. }
  238. pub struct BamReadsSampler {
  239. pub positions: Vec<(String, u64)>,
  240. pub reader: rust_htslib::bam::IndexedReader,
  241. current_index: usize,
  242. }
  243. impl BamReadsSampler {
  244. pub fn new(path: &str, n: usize) -> anyhow::Result<Self> {
  245. debug!("Reading BAM file: {path}");
  246. let reader = rust_htslib::bam::IndexedReader::from_path(path)?;
  247. let header = reader.header();
  248. let contig_len: Vec<(String, u64)> = header
  249. .target_names()
  250. .into_iter()
  251. .map(|target_name| {
  252. let tid = header.tid(target_name).unwrap();
  253. (
  254. String::from_utf8(target_name.to_vec()).unwrap(),
  255. header.target_len(tid).unwrap(),
  256. )
  257. })
  258. .collect();
  259. let positions = sample_random_positions(&contig_len, n);
  260. Ok(Self {
  261. positions,
  262. reader,
  263. current_index: 0,
  264. })
  265. }
  266. }
  267. impl Iterator for BamReadsSampler {
  268. type Item = Result<rust_htslib::bam::Record, rust_htslib::errors::Error>;
  269. fn next(&mut self) -> Option<Self::Item> {
  270. loop {
  271. if self.current_index < self.positions.len() {
  272. let (contig, pos) = &self.positions[self.current_index];
  273. match self.reader.fetch((contig, *pos, pos + 1)) {
  274. Ok(_) => (),
  275. Err(e) => warn!("Error while reading BAM {e}"),
  276. }
  277. let result = self.reader.records().next();
  278. self.current_index += 1;
  279. if result.is_some() {
  280. return result;
  281. }
  282. } else {
  283. return None;
  284. }
  285. }
  286. }
  287. }
  288. pub fn n_mapped_reads(bam_path: &str) -> anyhow::Result<u64> {
  289. let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path)?;
  290. let mapped = bam
  291. .index_stats()?
  292. .into_iter()
  293. .filter(|(tid, _, _, _)| *tid < 24)
  294. .map(|(_, _, m, _)| m)
  295. .sum();
  296. Ok(mapped)
  297. }
  298. // 0-based inclusif
  299. pub fn nt_pileup(
  300. bam: &mut rust_htslib::bam::IndexedReader,
  301. chr: &str,
  302. position: u32,
  303. with_next_ins: bool,
  304. ) -> anyhow::Result<Vec<u8>> {
  305. use rust_htslib::{bam, bam::Read};
  306. let mut bases = Vec::new();
  307. bam.fetch((chr, position, position + 1))?;
  308. let mut bam_pileup = Vec::new();
  309. for p in bam.pileup() {
  310. let pileup = p.context(format!(
  311. "Can't pileup bam at position {}:{} (0-based)",
  312. chr, position
  313. ))?;
  314. let cur_position = pileup.pos();
  315. if cur_position == position {
  316. for alignment in pileup.alignments() {
  317. match alignment.indel() {
  318. bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
  319. bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
  320. _ => {
  321. let record = alignment.record();
  322. if record.seq_len() > 0 {
  323. if let Some(b) = base_at(&record, position, with_next_ins)? {
  324. bases.push(b);
  325. }
  326. } else if alignment.is_del() {
  327. bases.push(b'D');
  328. }
  329. }
  330. }
  331. }
  332. }
  333. }
  334. Ok(bases)
  335. }
  336. pub fn base_at(
  337. record: &rust_htslib::bam::record::Record,
  338. at_pos: u32,
  339. with_next_ins: bool,
  340. ) -> anyhow::Result<Option<u8>> {
  341. let cigar = record.cigar();
  342. let seq = record.seq();
  343. let pos = cigar.pos() as u32;
  344. let mut read_i = 0u32;
  345. // let at_pos = at_pos - 1;
  346. let mut ref_pos = pos;
  347. if ref_pos > at_pos {
  348. return Ok(None);
  349. }
  350. for (id, op) in cigar.iter().enumerate() {
  351. let (add_read, add_ref) = match *op {
  352. Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
  353. Cigar::Ins(len) => (len, 0),
  354. Cigar::Del(len) => (0, len),
  355. Cigar::RefSkip(len) => (0, len),
  356. Cigar::SoftClip(len) => (len, 0),
  357. Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
  358. };
  359. // If at the end of the op len and next op is Ins return I
  360. if with_next_ins && ref_pos + add_read == at_pos + 1 {
  361. if let Some(Cigar::Ins(_)) = cigar.get(id + 1) {
  362. return Ok(Some(b'I'));
  363. }
  364. }
  365. if ref_pos + add_ref > at_pos {
  366. // Handle deletions directly
  367. if let Cigar::Del(_) = *op {
  368. return Ok(Some(b'D'));
  369. } else if let Cigar::RefSkip(_) = op {
  370. return Ok(None);
  371. } else {
  372. let diff = at_pos - ref_pos;
  373. let p = read_i + diff;
  374. return Ok(Some(seq[p as usize]));
  375. }
  376. }
  377. read_i += add_read;
  378. ref_pos += add_ref;
  379. }
  380. Ok(None)
  381. }
  382. pub fn counts_at(
  383. bam: &mut rust_htslib::bam::IndexedReader,
  384. chr: &str,
  385. position: u32,
  386. ) -> anyhow::Result<HashMap<String, i32>> {
  387. let p = nt_pileup(bam, chr, position, false)?
  388. .iter()
  389. .map(|e| String::from_utf8(vec![*e]).unwrap())
  390. .collect::<Vec<_>>();
  391. let mut counts = HashMap::new();
  392. for item in p.iter() {
  393. *counts.entry(item.to_string()).or_insert(0) += 1;
  394. }
  395. Ok(counts)
  396. }
  397. pub fn ins_pileup(
  398. bam: &mut rust_htslib::bam::IndexedReader,
  399. chr: &str,
  400. position: u32,
  401. ) -> anyhow::Result<Vec<String>> {
  402. let mut bases = Vec::new();
  403. bam.fetch((chr, position, position + 10))?;
  404. for p in bam.pileup() {
  405. let pileup = p.context(format!(
  406. "Can't pileup bam at position {}:{} (0-based)",
  407. chr, position
  408. ))?;
  409. let cur_position = pileup.pos();
  410. // Ins in the next position
  411. if cur_position == position + 1 {
  412. // debug!("{cur_position}");
  413. for alignment in pileup.alignments() {
  414. let record = alignment.record();
  415. if record.seq_len() > 0 {
  416. if let Some(b) = ins_at(&record, position)? {
  417. bases.push(b);
  418. }
  419. }
  420. }
  421. }
  422. }
  423. Ok(bases)
  424. }
  425. pub fn ins_at(
  426. record: &rust_htslib::bam::record::Record,
  427. at_pos: u32,
  428. ) -> anyhow::Result<Option<String>> {
  429. use rust_htslib::bam::record::Cigar;
  430. let cigar = record.cigar();
  431. let seq = record.seq();
  432. let pos = cigar.pos() as u32;
  433. let mut read_i = 0u32;
  434. let mut ref_pos = pos;
  435. if ref_pos > at_pos {
  436. return Ok(None);
  437. }
  438. // debug!(
  439. // "read: {}",
  440. // String::from_utf8(record.qname().to_vec()).unwrap()
  441. // );
  442. for op in cigar.iter() {
  443. let (add_read, add_ref) = match *op {
  444. Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
  445. Cigar::Ins(len) => (len, 0),
  446. Cigar::Del(len) => (0, len),
  447. Cigar::RefSkip(len) => (0, len),
  448. Cigar::SoftClip(len) => (len, 0),
  449. Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
  450. };
  451. if ref_pos + add_read > at_pos && ref_pos + add_read < at_pos + 10 {
  452. if let Cigar::Ins(v) = *op {
  453. // debug!(
  454. // "ins size {v} @ {} (corrected {})",
  455. // ref_pos + add_read,
  456. // (ref_pos + add_read) - v - 1
  457. // );
  458. if (ref_pos + add_read) - v - 1 == at_pos {
  459. let inserted_seq =
  460. seq.as_bytes()[read_i as usize..(read_i + v) as usize].to_vec();
  461. return Ok(Some(String::from_utf8(inserted_seq)?));
  462. }
  463. }
  464. }
  465. read_i += add_read;
  466. ref_pos += add_ref;
  467. }
  468. Ok(None)
  469. }
  470. pub fn counts_ins_at(
  471. bam: &mut rust_htslib::bam::IndexedReader,
  472. chr: &str,
  473. position: u32,
  474. ) -> anyhow::Result<HashMap<String, i32>> {
  475. let p = ins_pileup(bam, chr, position)?;
  476. let mut counts = HashMap::new();
  477. for item in p.iter() {
  478. *counts.entry(item.to_string()).or_insert(0) += 1;
  479. }
  480. Ok(counts)
  481. }
  482. pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Vec<(String, u64)> {
  483. let mut rng = rng();
  484. let total_length: u64 = chromosomes.iter().map(|(_, length)| length).sum();
  485. (0..n)
  486. .map(|_| {
  487. let random_position = rng.random_range(0..total_length);
  488. let mut cumulative_length = 0;
  489. for (chr, length) in chromosomes {
  490. cumulative_length += length;
  491. if random_position < cumulative_length {
  492. let position_within_chr = random_position - (cumulative_length - length);
  493. return (chr.clone(), position_within_chr);
  494. }
  495. }
  496. unreachable!("Should always find a chromosome")
  497. })
  498. .collect()
  499. }
  500. #[derive(Debug, Clone, Deserialize, Serialize)]
  501. pub struct WGSBamStats {
  502. pub all_records: u128,
  503. pub n_reads: u128,
  504. pub mapped_yield: u128,
  505. pub global_coverage: f64,
  506. pub karyotype: Vec<(i32, u64, String, u128)>,
  507. pub n50: u128,
  508. pub by_lengths: Vec<(u128, u32)>,
  509. }
  510. impl WGSBamStats {
  511. pub fn new(id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
  512. let bam_path = config.solo_bam(id, time);
  513. let Config {
  514. bam_min_mapq,
  515. bam_n_threads,
  516. ..
  517. } = config;
  518. let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
  519. let h = bam.header().clone();
  520. let header = rust_htslib::bam::Header::from_template(&h);
  521. bam.set_threads(bam_n_threads as usize)?;
  522. let mut mapped_lengths = Vec::new();
  523. let mut all_records = 0;
  524. let mut n_reads = 0;
  525. let mut lengths_by_tid: HashMap<i32, Vec<u128>> = HashMap::new();
  526. for read in bam
  527. .rc_records()
  528. .map(|x| x.expect("Failure parsing Bam file"))
  529. .inspect(|_| all_records += 1)
  530. .filter(|read| {
  531. read.flags()
  532. & (rust_htslib::htslib::BAM_FUNMAP
  533. | rust_htslib::htslib::BAM_FSECONDARY
  534. | rust_htslib::htslib::BAM_FQCFAIL
  535. | rust_htslib::htslib::BAM_FDUP) as u16
  536. == 0
  537. })
  538. .filter(|r| r.mapq() >= bam_min_mapq)
  539. {
  540. n_reads += 1;
  541. let mapped_len = (read.reference_end() - read.pos()) as u128;
  542. mapped_lengths.push(mapped_len);
  543. lengths_by_tid
  544. .entry(read.tid())
  545. .or_default()
  546. .push(mapped_len);
  547. if n_reads % 500_000 == 0 {
  548. info!("{n_reads} reads parsed");
  549. }
  550. }
  551. let mapped_yield = mapped_lengths.par_iter().sum::<u128>();
  552. let genome = get_genome_sizes(&header)?;
  553. let genome_size = genome.values().map(|v| *v as u128).sum::<u128>();
  554. let global_coverage = mapped_yield as f64 / genome_size as f64;
  555. let by_lengths: DashMap<u128, u32> = DashMap::new();
  556. mapped_lengths
  557. .par_iter()
  558. .for_each(|l| *by_lengths.entry(*l).or_default() += 1);
  559. let mut by_lengths: Vec<(u128, u32)> =
  560. by_lengths.iter().map(|e| (*e.key(), *e.value())).collect();
  561. by_lengths.par_sort_by(|a, b| a.0.cmp(&b.0));
  562. // This is not n50
  563. let n50 = by_lengths[by_lengths.len() / 2].0;
  564. let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid
  565. .iter()
  566. .map(|(tid, lengths)| {
  567. let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec()).unwrap();
  568. (
  569. *tid,
  570. *genome.get(&contig).unwrap(),
  571. contig,
  572. lengths.par_iter().sum::<u128>(),
  573. )
  574. })
  575. .collect();
  576. karyotype.sort_by(|(ac, _, _, _), (bc, _, _, _)| ac.cmp(bc));
  577. Ok(Self {
  578. all_records,
  579. n_reads,
  580. mapped_yield,
  581. global_coverage,
  582. karyotype,
  583. n50,
  584. by_lengths,
  585. })
  586. }
  587. pub fn print(&self) {
  588. println!("N records\t{}", self.all_records);
  589. println!("N counted reads\t{}", self.n_reads);
  590. println!("Mapped yield [Gb]\t{:.2}", self.mapped_yield as f64 / 1e9);
  591. println!("Mean coverage\t{:.2}", self.global_coverage);
  592. self.karyotype
  593. .iter()
  594. .for_each(|(_, contig_len, contig, contig_yield)| {
  595. println!(
  596. "{contig}\t{:.2}",
  597. (*contig_yield as f64 / *contig_len as f64) / self.global_coverage
  598. );
  599. });
  600. }
  601. }
  602. pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
  603. let mut genome = HashMap::new();
  604. for (key, records) in header.to_hashmap() {
  605. for record in records {
  606. if key == "SQ" {
  607. genome.insert(
  608. record["SN"].to_string(),
  609. record["LN"]
  610. .parse::<u64>()
  611. .expect("Failed parsing length of chromosomes"),
  612. );
  613. }
  614. }
  615. }
  616. Ok(genome)
  617. }
  618. /// Result of querying a read at a reference position.
  619. #[derive(Clone, Debug, Eq, PartialEq)]
  620. pub enum PileBase {
  621. A,
  622. C,
  623. G,
  624. T,
  625. N,
  626. Ins, // insertion immediately after the queried base
  627. Del((Vec<u8>, u32)), // deletion covering the queried base
  628. Skip, // reference skip (N)
  629. }
  630. /// Decode one HTSlib 4bit nucleotide (0=A,1=C,2=G,3=T,4=N, …) to `Base`
  631. #[inline]
  632. fn decode(b: u8) -> PileBase {
  633. match b & 0b1111 {
  634. 0 => PileBase::A,
  635. 1 => PileBase::C,
  636. 2 => PileBase::G,
  637. 3 => PileBase::T,
  638. _ => PileBase::N,
  639. }
  640. }
  641. pub fn base_at_new(
  642. record: &rust_htslib::bam::Record,
  643. at_pos: i64,
  644. with_next_ins: bool,
  645. ) -> anyhow::Result<Option<PileBase>> {
  646. if record.pos() > at_pos {
  647. return Ok(None);
  648. }
  649. let mut ref_pos = record.pos();
  650. let mut read_pos = 0_i64;
  651. let cigar = record.cigar();
  652. let seq = record.seq().as_bytes(); // already expanded to ASCII (A,C,G,T,N)
  653. for (i, op) in cigar.iter().enumerate() {
  654. let (consume_r, consume_ref) = match op {
  655. Cigar::Match(l) | Cigar::Equal(l) | Cigar::Diff(l) => (*l as i64, *l as i64),
  656. Cigar::Ins(l) | Cigar::SoftClip(l) => (*l as i64, 0),
  657. Cigar::Del(l) | Cigar::RefSkip(l) => (0, *l as i64),
  658. _ => (0, 0),
  659. };
  660. /* look ahead for an insertion immediately AFTER the queried base */
  661. if with_next_ins
  662. && ref_pos + consume_ref == at_pos + 1
  663. && matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
  664. {
  665. return Ok(Some(PileBase::Ins));
  666. }
  667. /* Does the queried reference coordinate fall inside this CIGAR op? */
  668. if ref_pos + consume_ref > at_pos {
  669. return Ok(Some(match op {
  670. Cigar::Del(e) => PileBase::Del((record.qname().to_vec(), *e)),
  671. Cigar::RefSkip(_) => PileBase::Skip,
  672. _ => {
  673. let offset = (at_pos - ref_pos) as usize;
  674. if read_pos as usize + offset >= seq.len() {
  675. return Ok(None); // out‑of‑range, malformed BAM
  676. }
  677. decode(seq[read_pos as usize + offset])
  678. }
  679. }));
  680. }
  681. read_pos += consume_r;
  682. ref_pos += consume_ref;
  683. }
  684. Ok(None) // beyond alignment
  685. }
  686. /// Return the pile‑up of **all reads** at `chr:pos` (0‑based, inclusive).
  687. ///
  688. /// * `bam` – an open [`rust_htslib::bam::IndexedReader`].
  689. /// * `chr` / `pos` – reference contig name and coordinate.
  690. /// * `with_next_ins` – if `true`, report [`Base::Ins`] when an insertion starts
  691. /// **right after** the queried base.
  692. ///
  693. /// The function bounds the internal pile‑up depth to 10 000 reads to protect
  694. /// against malformed BAM files that could otherwise allocate unbounded memory.
  695. ///
  696. /// # Errors
  697. ///
  698. /// Propagates any I/O or parsing error from *rust‑htslib*, wrapped in
  699. /// [`anyhow::Error`].
  700. pub fn nt_pileup_new(
  701. bam: &mut rust_htslib::bam::IndexedReader,
  702. chr: &str,
  703. pos: u32,
  704. with_next_ins: bool,
  705. ) -> anyhow::Result<Vec<PileBase>> {
  706. // Storage for the per‑read observations.
  707. let mut pile = Vec::new();
  708. // Restrict BAM iterator to the one‑base span of interest.
  709. bam.fetch((chr, pos, pos + 1))?;
  710. // Hard cap on depth (adjust if your use‑case needs deeper coverage).
  711. bam.pileup().set_max_depth(10_000);
  712. // Stream the pileup; HTSlib walks the reads for us.
  713. for pileup in bam.pileup() {
  714. // Attach context to any low‑level error.
  715. let pileup = pileup.context(format!("can't pile up BAM at {chr}:{pos}"))?;
  716. // Skip other positions quickly.
  717. if pileup.pos() != pos {
  718. continue;
  719. }
  720. for aln in pileup.alignments() {
  721. // Handle the three indel states reported by HTSlib.
  722. match aln.indel() {
  723. rust_htslib::bam::pileup::Indel::Ins(_) => {
  724. pile.push(PileBase::Ins); // insertion *starts at* this base
  725. }
  726. rust_htslib::bam::pileup::Indel::Del(e) => {
  727. let qname = aln.record().qname().to_vec();
  728. pile.push(PileBase::Del((qname, e))); // deletion length > 1, but still covers here
  729. }
  730. rust_htslib::bam::pileup::Indel::None => {
  731. // Regular match/mismatch: delegate to `base_at`.
  732. if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins)? {
  733. pile.push(base);
  734. }
  735. }
  736. }
  737. }
  738. }
  739. Ok(pile)
  740. }