bam.rs 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855
  1. use std::{
  2. collections::HashMap,
  3. fs::{self, File},
  4. path::PathBuf,
  5. };
  6. use anyhow::{anyhow, Context};
  7. use chrono::{DateTime, Utc};
  8. use dashmap::DashMap;
  9. use glob::glob;
  10. use log::{debug, info, warn};
  11. use rand::{rng, Rng};
  12. use rayon::prelude::*;
  13. use rust_htslib::bam::{ext::BamRecordExtensions, record::Cigar, Read};
  14. use serde::{Deserialize, Serialize};
  15. use crate::config::Config;
  16. use super::flowcells::FlowCell;
  17. #[derive(Debug, Clone, Deserialize, Serialize)]
  18. pub struct WGSBam {
  19. pub id: String,
  20. pub time_point: String,
  21. pub reference_genome: String,
  22. // pub bam_type: BamType,
  23. pub path: PathBuf,
  24. pub modified: DateTime<Utc>,
  25. pub bam_stats: WGSBamStats,
  26. // pub cramino: Option<CraminoRes>,
  27. pub composition: Vec<(String, String, f64)>, // acquisition id, fn
  28. }
  29. // #[derive(Debug, PartialEq, Clone, Deserialize, Serialize)]
  30. // pub enum BamType {
  31. // WGS,
  32. // Panel(String),
  33. // ChIP(String),
  34. // }
  35. //
  36. impl WGSBam {
  37. pub fn new(path: PathBuf) -> anyhow::Result<Self> {
  38. let stem = path
  39. .clone()
  40. .file_stem()
  41. .context(format!("Can't parse stem from {}", path.display()))?
  42. .to_string_lossy()
  43. .to_string();
  44. let stem: Vec<&str> = stem.split('_').collect();
  45. if stem.len() != 3 {
  46. return Err(anyhow!("Error in bam name: {}", path.display()));
  47. }
  48. let id = stem[0].to_string();
  49. let time_point = stem[1].to_string();
  50. let reference_genome = stem
  51. .last()
  52. .context("Can't get last from stem {stem}")?
  53. .to_string();
  54. // let bam_type = if stem.len() == 4 {
  55. // match stem[2] {
  56. // "oncoT" => BamType::Panel("oncoT".to_string()),
  57. // "H3K27ac" => BamType::ChIP("H3K27ac".to_string()),
  58. // "H3K4me3" => BamType::ChIP("H3K4me3".to_string()),
  59. // _ => return Err(anyhow!("Error in bam name: {}", path.display())),
  60. // }
  61. // } else {
  62. // BamType::WGS
  63. // };
  64. let file_metadata = fs::metadata(&path)?;
  65. let modified = file_metadata.modified()?;
  66. let tp_dir = path
  67. .parent()
  68. .context("Can't parse parent from: {bam_path}")?;
  69. let json_path = format!(
  70. "{}/{id}_{time_point}_{reference_genome}_info.json",
  71. tp_dir.to_string_lossy()
  72. );
  73. let json_file = PathBuf::from(&json_path);
  74. if json_file.exists() && json_file.metadata()?.modified()? > modified {
  75. match Self::load_json(&json_path) {
  76. Ok(s) => return Ok(s),
  77. Err(e) => {
  78. warn!("Error while loading {}.\n{e}", json_path);
  79. }
  80. }
  81. }
  82. // let cramino_path = format!(
  83. // "{}/{id}_{time_point}_{reference_genome}_cramino.txt",
  84. // tp_dir.to_string_lossy()
  85. // );
  86. // let cramino = if bam_type == BamType::WGS {
  87. // if !PathBuf::from_str(&cramino_path)?.exists() {
  88. // // return Err(anyhow!("Cramino file missing {cramino_path}"));
  89. // Cramino::default().with_bam(path.to_str().unwrap())?;
  90. // }
  91. // let mut cramino = Cramino::default().with_result_path(&cramino_path);
  92. // cramino
  93. // .parse_results()
  94. // .context(format!("Error while parsing cramino for {cramino_path}"))?;
  95. //
  96. // if let Some(cramino) = cramino.results {
  97. // Some(cramino)
  98. // } else {
  99. // return Err(anyhow!("Cramino results parsing failed"));
  100. // }
  101. // } else {
  102. // None
  103. // };
  104. let composition = bam_compo(path.to_string_lossy().as_ref(), 20_000).context(format!(
  105. "Error while reading BAM composition for {}",
  106. path.display()
  107. ))?;
  108. let s = Self {
  109. path,
  110. bam_stats: WGSBamStats::new(&id, &time_point, Config::default())?,
  111. // cramino,
  112. id: id.to_string(),
  113. time_point: time_point.to_string(),
  114. // bam_type,
  115. reference_genome,
  116. composition,
  117. modified: modified.into(),
  118. };
  119. s.save_json(&json_path)?;
  120. Ok(s)
  121. }
  122. pub fn load_json(path: &str) -> anyhow::Result<Self> {
  123. let f = File::open(path)?;
  124. let s: Self = serde_json::from_reader(f)?;
  125. Ok(s)
  126. }
  127. pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
  128. let f = File::create(path)?;
  129. serde_json::to_writer(f, self)?;
  130. Ok(())
  131. }
  132. }
  133. #[derive(Debug, Default, Clone)]
  134. pub struct BamCollection {
  135. pub bams: Vec<WGSBam>,
  136. }
  137. impl BamCollection {
  138. pub fn new(result_dir: &str) -> Self {
  139. load_bam_collection(result_dir)
  140. }
  141. pub fn by_acquisition_id(&self) -> HashMap<String, Vec<&WGSBam>> {
  142. let mut acq: HashMap<String, Vec<&WGSBam>> = HashMap::new();
  143. for bam in self.bams.iter() {
  144. for (acq_id, _, _) in bam.composition.iter() {
  145. if let Some(entry) = acq.get_mut(acq_id) {
  146. entry.push(bam);
  147. } else {
  148. acq.insert(acq_id.to_string(), vec![bam]);
  149. }
  150. }
  151. }
  152. acq
  153. }
  154. pub fn get(&self, id: &str, time_point: &str) -> Vec<&WGSBam> {
  155. self.bams
  156. .iter()
  157. .filter(|b| b.id == id && b.time_point == time_point)
  158. .collect()
  159. }
  160. pub fn by_id_completed(&self, min_diag_cov: f32, min_mrd_cov: f32) -> Vec<WGSBam> {
  161. self.bams
  162. .iter()
  163. // .filter(|b| matches!(b.bam_type, BamType::WGS))
  164. .filter(|b| match b.time_point.as_str() {
  165. "diag" => b.bam_stats.global_coverage >= min_diag_cov as f64,
  166. "mrd" => b.bam_stats.global_coverage >= min_mrd_cov as f64,
  167. _ => false,
  168. })
  169. // .filter(|b| match &b.cramino {
  170. // Some(cramino) => match b.time_point.as_str() {
  171. // "diag" => cramino.mean_length >= min_diag_cov as f64,
  172. // "mrd" => cramino.mean_length >= min_mrd_cov as f64,
  173. // _ => false,
  174. // },
  175. // _ => false,
  176. // })
  177. .cloned()
  178. .collect()
  179. }
  180. pub fn ids(&self) -> Vec<String> {
  181. let mut ids: Vec<String> = self.bams.iter().map(|b| b.id.clone()).collect();
  182. ids.sort();
  183. ids.dedup();
  184. ids
  185. }
  186. }
  187. pub fn load_bam_collection(result_dir: &str) -> BamCollection {
  188. let pattern = format!("{}/*/*/*.bam", result_dir);
  189. let bams = glob(&pattern)
  190. .expect("Failed to read glob pattern")
  191. // .par_bridge()
  192. .filter_map(|entry| {
  193. match entry {
  194. Ok(path) => match WGSBam::new(path) {
  195. Ok(bam) => return Some(bam),
  196. Err(e) => warn!("{e}"),
  197. },
  198. Err(e) => warn!("Error: {:?}", e),
  199. }
  200. None
  201. })
  202. .collect();
  203. BamCollection { bams }
  204. }
  205. pub fn bam_compo(
  206. file_path: &str,
  207. sample_size: usize,
  208. ) -> anyhow::Result<Vec<(String, String, f64)>> {
  209. let mut bam = rust_htslib::bam::Reader::from_path(file_path)?;
  210. let mut rgs: HashMap<(String, String), u64> = HashMap::new();
  211. for result in bam.records().filter_map(Result::ok).take(sample_size) {
  212. if let (
  213. rust_htslib::bam::record::Aux::String(s),
  214. rust_htslib::bam::record::Aux::String(b),
  215. ) = (result.aux(b"RG")?, result.aux(b"fn")?)
  216. {
  217. *rgs.entry((s.to_string(), b.to_string())).or_default() += 1;
  218. }
  219. }
  220. Ok(rgs
  221. .into_iter()
  222. .map(|(k, n)| {
  223. (
  224. k.0.to_string(),
  225. k.1.to_string(),
  226. n as f64 * 100.0 / sample_size as f64,
  227. )
  228. })
  229. .map(|(rg, f, p)| {
  230. let (a, _) = rg.split_once('_').unwrap();
  231. let (b, _) = f.split_once('_').unwrap();
  232. (a.to_string(), b.to_string(), p)
  233. })
  234. .collect())
  235. }
  236. pub fn bam_has_fc(bam: &WGSBam, flow_cell: &FlowCell) -> anyhow::Result<bool> {
  237. let mut bam = rust_htslib::bam::Reader::from_path(&bam.path)?;
  238. let mut rgs: HashMap<String, u64> = HashMap::new();
  239. for result in bam.records().filter_map(Result::ok).take(2_000) {
  240. if let rust_htslib::bam::record::Aux::String(s) = result.aux(b"fn")? {
  241. *rgs.entry(s.to_string()).or_default() += 1;
  242. }
  243. }
  244. for k in rgs.keys() {
  245. if k.contains(&flow_cell.sample_sheet.flow_cell_id) {
  246. return Ok(true);
  247. }
  248. }
  249. Ok(false)
  250. }
  251. pub struct BamReadsSampler {
  252. pub positions: Vec<(String, u64)>,
  253. pub reader: rust_htslib::bam::IndexedReader,
  254. current_index: usize,
  255. }
  256. impl BamReadsSampler {
  257. pub fn new(path: &str, n: usize) -> anyhow::Result<Self> {
  258. debug!("Reading BAM file: {path}");
  259. let reader = rust_htslib::bam::IndexedReader::from_path(path)?;
  260. let header = reader.header();
  261. let contig_len: Vec<(String, u64)> = header
  262. .target_names()
  263. .into_iter()
  264. .map(|target_name| {
  265. let tid = header.tid(target_name).unwrap();
  266. (
  267. String::from_utf8(target_name.to_vec()).unwrap(),
  268. header.target_len(tid).unwrap(),
  269. )
  270. })
  271. .collect();
  272. let positions = sample_random_positions(&contig_len, n);
  273. Ok(Self {
  274. positions,
  275. reader,
  276. current_index: 0,
  277. })
  278. }
  279. }
  280. impl Iterator for BamReadsSampler {
  281. type Item = Result<rust_htslib::bam::Record, rust_htslib::errors::Error>;
  282. fn next(&mut self) -> Option<Self::Item> {
  283. loop {
  284. if self.current_index < self.positions.len() {
  285. let (contig, pos) = &self.positions[self.current_index];
  286. match self.reader.fetch((contig, *pos, pos + 1)) {
  287. Ok(_) => (),
  288. Err(e) => warn!("Error while reading BAM {e}"),
  289. }
  290. let result = self.reader.records().next();
  291. self.current_index += 1;
  292. if result.is_some() {
  293. return result;
  294. }
  295. } else {
  296. return None;
  297. }
  298. }
  299. }
  300. }
  301. pub fn n_mapped_reads(bam_path: &str) -> anyhow::Result<u64> {
  302. let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path)?;
  303. let mapped = bam
  304. .index_stats()?
  305. .into_iter()
  306. .filter(|(tid, _, _, _)| *tid < 24)
  307. .map(|(_, _, m, _)| m)
  308. .sum();
  309. Ok(mapped)
  310. }
  311. // 0-based inclusif
  312. pub fn nt_pileup(
  313. bam: &mut rust_htslib::bam::IndexedReader,
  314. chr: &str,
  315. position: u32,
  316. with_next_ins: bool,
  317. ) -> anyhow::Result<Vec<u8>> {
  318. use rust_htslib::{bam, bam::Read};
  319. let mut bases = Vec::new();
  320. bam.fetch((chr, position, position + 1))?;
  321. let mut bam_pileup = Vec::new();
  322. for p in bam.pileup() {
  323. let pileup = p.context(format!(
  324. "Can't pileup bam at position {}:{} (0-based)",
  325. chr, position
  326. ))?;
  327. let cur_position = pileup.pos();
  328. if cur_position == position {
  329. for alignment in pileup.alignments() {
  330. match alignment.indel() {
  331. bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
  332. bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
  333. _ => {
  334. let record = alignment.record();
  335. if record.seq_len() > 0 {
  336. if let Some(b) = base_at(&record, position, with_next_ins)? {
  337. bases.push(b);
  338. }
  339. } else if alignment.is_del() {
  340. bases.push(b'D');
  341. }
  342. }
  343. }
  344. }
  345. }
  346. }
  347. Ok(bases)
  348. }
  349. pub fn base_at(
  350. record: &rust_htslib::bam::record::Record,
  351. at_pos: u32,
  352. with_next_ins: bool,
  353. ) -> anyhow::Result<Option<u8>> {
  354. let cigar = record.cigar();
  355. let seq = record.seq();
  356. let pos = cigar.pos() as u32;
  357. let mut read_i = 0u32;
  358. // let at_pos = at_pos - 1;
  359. let mut ref_pos = pos;
  360. if ref_pos > at_pos {
  361. return Ok(None);
  362. }
  363. for (id, op) in cigar.iter().enumerate() {
  364. let (add_read, add_ref) = match *op {
  365. Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
  366. Cigar::Ins(len) => (len, 0),
  367. Cigar::Del(len) => (0, len),
  368. Cigar::RefSkip(len) => (0, len),
  369. Cigar::SoftClip(len) => (len, 0),
  370. Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
  371. };
  372. // If at the end of the op len and next op is Ins return I
  373. if with_next_ins && ref_pos + add_read == at_pos + 1 {
  374. if let Some(Cigar::Ins(_)) = cigar.get(id + 1) {
  375. return Ok(Some(b'I'));
  376. }
  377. }
  378. if ref_pos + add_ref > at_pos {
  379. // Handle deletions directly
  380. if let Cigar::Del(_) = *op {
  381. return Ok(Some(b'D'));
  382. } else if let Cigar::RefSkip(_) = op {
  383. return Ok(None);
  384. } else {
  385. let diff = at_pos - ref_pos;
  386. let p = read_i + diff;
  387. return Ok(Some(seq[p as usize]));
  388. }
  389. }
  390. read_i += add_read;
  391. ref_pos += add_ref;
  392. }
  393. Ok(None)
  394. }
  395. pub fn counts_at(
  396. bam: &mut rust_htslib::bam::IndexedReader,
  397. chr: &str,
  398. position: u32,
  399. ) -> anyhow::Result<HashMap<String, i32>> {
  400. let p = nt_pileup(bam, chr, position, false)?
  401. .iter()
  402. .map(|e| String::from_utf8(vec![*e]).unwrap())
  403. .collect::<Vec<_>>();
  404. let mut counts = HashMap::new();
  405. for item in p.iter() {
  406. *counts.entry(item.to_string()).or_insert(0) += 1;
  407. }
  408. Ok(counts)
  409. }
  410. pub fn ins_pileup(
  411. bam: &mut rust_htslib::bam::IndexedReader,
  412. chr: &str,
  413. position: u32,
  414. ) -> anyhow::Result<Vec<String>> {
  415. let mut bases = Vec::new();
  416. bam.fetch((chr, position, position + 10))?;
  417. for p in bam.pileup() {
  418. let pileup = p.context(format!(
  419. "Can't pileup bam at position {}:{} (0-based)",
  420. chr, position
  421. ))?;
  422. let cur_position = pileup.pos();
  423. // Ins in the next position
  424. if cur_position == position + 1 {
  425. // debug!("{cur_position}");
  426. for alignment in pileup.alignments() {
  427. let record = alignment.record();
  428. if record.seq_len() > 0 {
  429. if let Some(b) = ins_at(&record, position)? {
  430. bases.push(b);
  431. }
  432. }
  433. }
  434. }
  435. }
  436. Ok(bases)
  437. }
  438. pub fn ins_at(
  439. record: &rust_htslib::bam::record::Record,
  440. at_pos: u32,
  441. ) -> anyhow::Result<Option<String>> {
  442. use rust_htslib::bam::record::Cigar;
  443. let cigar = record.cigar();
  444. let seq = record.seq();
  445. let pos = cigar.pos() as u32;
  446. let mut read_i = 0u32;
  447. let mut ref_pos = pos;
  448. if ref_pos > at_pos {
  449. return Ok(None);
  450. }
  451. // debug!(
  452. // "read: {}",
  453. // String::from_utf8(record.qname().to_vec()).unwrap()
  454. // );
  455. for op in cigar.iter() {
  456. let (add_read, add_ref) = match *op {
  457. Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
  458. Cigar::Ins(len) => (len, 0),
  459. Cigar::Del(len) => (0, len),
  460. Cigar::RefSkip(len) => (0, len),
  461. Cigar::SoftClip(len) => (len, 0),
  462. Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
  463. };
  464. if ref_pos + add_read > at_pos && ref_pos + add_read < at_pos + 10 {
  465. if let Cigar::Ins(v) = *op {
  466. // debug!(
  467. // "ins size {v} @ {} (corrected {})",
  468. // ref_pos + add_read,
  469. // (ref_pos + add_read) - v - 1
  470. // );
  471. if (ref_pos + add_read) - v - 1 == at_pos {
  472. let inserted_seq =
  473. seq.as_bytes()[read_i as usize..(read_i + v) as usize].to_vec();
  474. return Ok(Some(String::from_utf8(inserted_seq)?));
  475. }
  476. }
  477. }
  478. read_i += add_read;
  479. ref_pos += add_ref;
  480. }
  481. Ok(None)
  482. }
  483. pub fn counts_ins_at(
  484. bam: &mut rust_htslib::bam::IndexedReader,
  485. chr: &str,
  486. position: u32,
  487. ) -> anyhow::Result<HashMap<String, i32>> {
  488. let p = ins_pileup(bam, chr, position)?;
  489. let mut counts = HashMap::new();
  490. for item in p.iter() {
  491. *counts.entry(item.to_string()).or_insert(0) += 1;
  492. }
  493. Ok(counts)
  494. }
  495. pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Vec<(String, u64)> {
  496. let mut rng = rng();
  497. let total_length: u64 = chromosomes.iter().map(|(_, length)| length).sum();
  498. (0..n)
  499. .map(|_| {
  500. let random_position = rng.random_range(0..total_length);
  501. let mut cumulative_length = 0;
  502. for (chr, length) in chromosomes {
  503. cumulative_length += length;
  504. if random_position < cumulative_length {
  505. let position_within_chr = random_position - (cumulative_length - length);
  506. return (chr.clone(), position_within_chr);
  507. }
  508. }
  509. unreachable!("Should always find a chromosome")
  510. })
  511. .collect()
  512. }
  513. #[derive(Debug, Clone, Deserialize, Serialize)]
  514. pub struct WGSBamStats {
  515. pub all_records: u128,
  516. pub n_reads: u128,
  517. pub mapped_yield: u128,
  518. pub global_coverage: f64,
  519. pub karyotype: Vec<(i32, u64, String, u128)>,
  520. pub n50: u128,
  521. pub by_lengths: Vec<(u128, u32)>,
  522. }
  523. impl WGSBamStats {
  524. pub fn new(id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
  525. let bam_path = config.solo_bam(id, time);
  526. let Config {
  527. bam_min_mapq,
  528. bam_n_threads,
  529. ..
  530. } = config;
  531. let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
  532. let h = bam.header().clone();
  533. let header = rust_htslib::bam::Header::from_template(&h);
  534. bam.set_threads(bam_n_threads as usize)?;
  535. let mut mapped_lengths = Vec::new();
  536. let mut all_records = 0;
  537. let mut n_reads = 0;
  538. let mut lengths_by_tid: HashMap<i32, Vec<u128>> = HashMap::new();
  539. for read in bam
  540. .rc_records()
  541. .map(|x| x.expect("Failure parsing Bam file"))
  542. .inspect(|_| all_records += 1)
  543. .filter(|read| {
  544. read.flags()
  545. & (rust_htslib::htslib::BAM_FUNMAP
  546. | rust_htslib::htslib::BAM_FSECONDARY
  547. | rust_htslib::htslib::BAM_FQCFAIL
  548. | rust_htslib::htslib::BAM_FDUP) as u16
  549. == 0
  550. })
  551. .filter(|r| r.mapq() >= bam_min_mapq)
  552. {
  553. n_reads += 1;
  554. let mapped_len = (read.reference_end() - read.pos()) as u128;
  555. mapped_lengths.push(mapped_len);
  556. lengths_by_tid
  557. .entry(read.tid())
  558. .or_default()
  559. .push(mapped_len);
  560. if n_reads % 500_000 == 0 {
  561. info!("{n_reads} reads parsed");
  562. }
  563. }
  564. let mapped_yield = mapped_lengths.par_iter().sum::<u128>();
  565. let genome = get_genome_sizes(&header)?;
  566. let genome_size = genome.values().map(|v| *v as u128).sum::<u128>();
  567. let global_coverage = mapped_yield as f64 / genome_size as f64;
  568. let by_lengths: DashMap<u128, u32> = DashMap::new();
  569. mapped_lengths
  570. .par_iter()
  571. .for_each(|l| *by_lengths.entry(*l).or_default() += 1);
  572. let mut by_lengths: Vec<(u128, u32)> =
  573. by_lengths.iter().map(|e| (*e.key(), *e.value())).collect();
  574. by_lengths.par_sort_by(|a, b| a.0.cmp(&b.0));
  575. // This is not n50
  576. let n50 = by_lengths[by_lengths.len() / 2].0;
  577. let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid
  578. .iter()
  579. .map(|(tid, lengths)| {
  580. let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec()).unwrap();
  581. (
  582. *tid,
  583. *genome.get(&contig).unwrap(),
  584. contig,
  585. lengths.par_iter().sum::<u128>(),
  586. )
  587. })
  588. .collect();
  589. karyotype.sort_by(|(ac, _, _, _), (bc, _, _, _)| ac.cmp(bc));
  590. Ok(Self {
  591. all_records,
  592. n_reads,
  593. mapped_yield,
  594. global_coverage,
  595. karyotype,
  596. n50,
  597. by_lengths,
  598. })
  599. }
  600. pub fn print(&self) {
  601. println!("N records\t{}", self.all_records);
  602. println!("N counted reads\t{}", self.n_reads);
  603. println!("Mapped yield [Gb]\t{:.2}", self.mapped_yield as f64 / 1e9);
  604. println!("Mean coverage\t{:.2}", self.global_coverage);
  605. self.karyotype
  606. .iter()
  607. .for_each(|(_, contig_len, contig, contig_yield)| {
  608. println!(
  609. "{contig}\t{:.2}",
  610. (*contig_yield as f64 / *contig_len as f64) / self.global_coverage
  611. );
  612. });
  613. }
  614. }
  615. pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
  616. let mut genome = HashMap::new();
  617. for (key, records) in header.to_hashmap() {
  618. for record in records {
  619. if key == "SQ" {
  620. genome.insert(
  621. record["SN"].to_string(),
  622. record["LN"]
  623. .parse::<u64>()
  624. .expect("Failed parsing length of chromosomes"),
  625. );
  626. }
  627. }
  628. }
  629. Ok(genome)
  630. }
  631. /// Result of querying a read at a reference position.
  632. #[derive(Clone, Debug, Eq, PartialEq)]
  633. pub enum PileBase {
  634. A,
  635. C,
  636. G,
  637. T,
  638. N,
  639. Ins, // insertion immediately after the queried base
  640. Del((Vec<u8>, u32)), // deletion covering the queried base
  641. Skip, // reference skip (N)
  642. }
  643. /// Decode one HTSlib 4bit nucleotide (0=A,1=C,2=G,3=T,4=N, …) to `Base`
  644. #[inline]
  645. fn decode(b: u8) -> PileBase {
  646. match b & 0b1111 {
  647. 0 => PileBase::A,
  648. 1 => PileBase::C,
  649. 2 => PileBase::G,
  650. 3 => PileBase::T,
  651. _ => PileBase::N,
  652. }
  653. }
  654. pub fn base_at_new(
  655. record: &rust_htslib::bam::Record,
  656. at_pos: i64,
  657. with_next_ins: bool,
  658. ) -> Option<PileBase> {
  659. if record.pos() > at_pos {
  660. return None;
  661. }
  662. let mut ref_pos = record.pos();
  663. let mut read_pos = 0_i64;
  664. let cigar = record.cigar();
  665. let seq = record.seq().as_bytes(); // already expanded to ASCII (A,C,G,T,N)
  666. for (i, op) in cigar.iter().enumerate() {
  667. let l = op.len() as i64;
  668. let (consume_read, consume_ref) = match op.char() {
  669. 'M' | '=' | 'X' => (l, l),
  670. 'I' | 'S' => (l, 0),
  671. 'D' | 'N' => (0, l),
  672. 'H' | 'P' => (0, 0),
  673. _ => unreachable!(),
  674. };
  675. /* look ahead for an insertion immediately AFTER the queried base */
  676. if with_next_ins
  677. && ref_pos + consume_ref == at_pos + 1
  678. && matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
  679. {
  680. return Some(PileBase::Ins);
  681. }
  682. // Instead of PileBase::Ins, consider
  683. // extracting the inserted sequence and returning its length or actual bases:
  684. // if let Some(Cigar::Ins(len)) = cigar.get(i + 1) {
  685. // let ins_seq = &seq[read_pos as usize + (consume_ref as usize)
  686. // ..read_pos as usize + (consume_ref as usize) + (*len as usize)];
  687. // return Some(PileBase::Ins(ins_seq.to_vec()));
  688. // }
  689. /* Does the queried reference coordinate fall inside this CIGAR op? */
  690. if ref_pos + consume_ref > at_pos {
  691. return Some(match op {
  692. Cigar::Del(e) => PileBase::Del((record.qname().to_vec(), *e)),
  693. Cigar::RefSkip(_) => PileBase::Skip,
  694. _ => {
  695. let offset = (at_pos - ref_pos) as usize;
  696. if read_pos as usize + offset >= seq.len() {
  697. return None; // out of range, malformed BAM
  698. }
  699. decode(seq[read_pos as usize + offset])
  700. }
  701. });
  702. }
  703. read_pos += consume_read;
  704. ref_pos += consume_ref;
  705. }
  706. None // beyond alignment
  707. }
  708. /// Return the pile‑up of **all reads** at `chr:pos` (0‑based, inclusive).
  709. ///
  710. /// * `bam` – an open [`rust_htslib::bam::IndexedReader`].
  711. /// * `chr` / `pos` – reference contig name and coordinate.
  712. /// * `with_next_ins` – if `true`, report [`Base::Ins`] when an insertion starts
  713. /// **right after** the queried base.
  714. ///
  715. /// The function bounds the internal pile‑up depth to 10 000 reads to protect
  716. /// against malformed BAM files that could otherwise allocate unbounded memory.
  717. ///
  718. /// # Errors
  719. ///
  720. /// Propagates any I/O or parsing error from *rust‑htslib*, wrapped in
  721. /// [`anyhow::Error`].
  722. pub fn nt_pileup_new(
  723. bam: &mut rust_htslib::bam::IndexedReader,
  724. chr: &str,
  725. pos: u32,
  726. with_next_ins: bool,
  727. ) -> anyhow::Result<Vec<PileBase>> {
  728. // Storage for the per‑read observations.
  729. let mut pile = Vec::new();
  730. // Restrict BAM iterator to the one‑base span of interest.
  731. bam.fetch((chr, pos, pos + 1))?;
  732. // Hard cap on depth (adjust if your use‑case needs deeper coverage).
  733. bam.pileup().set_max_depth(10_000);
  734. // Stream the pileup; HTSlib walks the reads for us.
  735. for pileup in bam.pileup() {
  736. // Attach context to any low‑level error.
  737. let pileup = pileup.context(format!("can't pile up BAM at {chr}:{pos}"))?;
  738. // Skip other positions quickly.
  739. if pileup.pos() != pos {
  740. continue;
  741. }
  742. for aln in pileup.alignments() {
  743. // Handle the three indel states reported by HTSlib.
  744. match aln.indel() {
  745. rust_htslib::bam::pileup::Indel::Ins(_) => {
  746. pile.push(PileBase::Ins); // insertion *starts at* this base
  747. }
  748. rust_htslib::bam::pileup::Indel::Del(e) => {
  749. let qname = aln.record().qname().to_vec();
  750. pile.push(PileBase::Del((qname, e))); // deletion length > 1, but still covers here
  751. }
  752. rust_htslib::bam::pileup::Indel::None => {
  753. // Regular match/mismatch: delegate to `base_at`.
  754. if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins) {
  755. pile.push(base);
  756. }
  757. }
  758. }
  759. }
  760. }
  761. Ok(pile)
  762. }