| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
7037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120 |
- use core::fmt;
- use std::{
- collections::{BTreeMap, HashMap},
- fs::{self, File},
- path::PathBuf,
- };
- use anyhow::{anyhow, Context};
- use chrono::{DateTime, Utc};
- use dashmap::DashMap;
- use glob::glob;
- use log::{debug, info, warn};
- use rand::{rng, Rng};
- use rayon::prelude::*;
- use rust_htslib::bam::{ext::BamRecordExtensions, record::Cigar, Read};
- use serde::{Deserialize, Serialize};
- use crate::config::Config;
- use super::flowcells::FlowCell;
/// A whole-genome-sequencing BAM file plus cached metadata, persisted as a
/// `*_info.json` sidecar next to the BAM (see `WGSBam::new` / `save_json`).
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct WGSBam {
    /// Case/sample identifier (first `_`-separated field of the file stem).
    pub id: String,
    /// Time point, e.g. "diag" or "mrd" (second field of the file stem).
    pub time_point: String,
    /// Reference genome label (last field of the file stem).
    pub reference_genome: String,
    // pub bam_type: BamType,
    /// Path to the BAM file on disk.
    pub path: PathBuf,
    /// Last-modified timestamp of the BAM file, used to invalidate the JSON sidecar.
    pub modified: DateTime<Utc>,
    /// Mapping statistics computed from a full scan of the BAM.
    pub bam_stats: WGSBamStats,
    // pub cramino: Option<CraminoRes>,
    /// Sampled tag composition: (run id from `RG`, flowcell id from `fn`, percentage).
    pub composition: Vec<(String, String, f64)>, // acquisition id, fn
}
- // #[derive(Debug, PartialEq, Clone, Deserialize, Serialize)]
- // pub enum BamType {
- // WGS,
- // Panel(String),
- // ChIP(String),
- // }
- //
- impl WGSBam {
- pub fn new(path: PathBuf) -> anyhow::Result<Self> {
- let stem = path
- .clone()
- .file_stem()
- .context(format!("Can't parse stem from {}", path.display()))?
- .to_string_lossy()
- .to_string();
- let stem: Vec<&str> = stem.split('_').collect();
- if stem.len() != 3 {
- return Err(anyhow!("Error in bam name: {}", path.display()));
- }
- let id = stem[0].to_string();
- let time_point = stem[1].to_string();
- let reference_genome = stem
- .last()
- .context("Can't get last from stem {stem}")?
- .to_string();
- // let bam_type = if stem.len() == 4 {
- // match stem[2] {
- // "oncoT" => BamType::Panel("oncoT".to_string()),
- // "H3K27ac" => BamType::ChIP("H3K27ac".to_string()),
- // "H3K4me3" => BamType::ChIP("H3K4me3".to_string()),
- // _ => return Err(anyhow!("Error in bam name: {}", path.display())),
- // }
- // } else {
- // BamType::WGS
- // };
- let file_metadata = fs::metadata(&path)?;
- let modified = file_metadata.modified()?;
- let tp_dir = path
- .parent()
- .context("Can't parse parent from: {bam_path}")?;
- let json_path = format!(
- "{}/{id}_{time_point}_{reference_genome}_info.json",
- tp_dir.to_string_lossy()
- );
- let json_file = PathBuf::from(&json_path);
- if json_file.exists() && json_file.metadata()?.modified()? > modified {
- match Self::load_json(&json_path) {
- Ok(s) => return Ok(s),
- Err(e) => {
- warn!("Error while loading {}.\n{e}", json_path);
- }
- }
- }
- // let cramino_path = format!(
- // "{}/{id}_{time_point}_{reference_genome}_cramino.txt",
- // tp_dir.to_string_lossy()
- // );
- // let cramino = if bam_type == BamType::WGS {
- // if !PathBuf::from_str(&cramino_path)?.exists() {
- // // return Err(anyhow!("Cramino file missing {cramino_path}"));
- // Cramino::default().with_bam(path.to_str().unwrap())?;
- // }
- // let mut cramino = Cramino::default().with_result_path(&cramino_path);
- // cramino
- // .parse_results()
- // .context(format!("Error while parsing cramino for {cramino_path}"))?;
- //
- // if let Some(cramino) = cramino.results {
- // Some(cramino)
- // } else {
- // return Err(anyhow!("Cramino results parsing failed"));
- // }
- // } else {
- // None
- // };
- let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(format!(
- "Error while reading BAM composition for {}",
- path.display()
- ))?;
- let s = Self {
- path,
- bam_stats: WGSBamStats::new(&id, &time_point, &Config::default())?,
- // cramino,
- id: id.to_string(),
- time_point: time_point.to_string(),
- // bam_type,
- reference_genome,
- composition,
- modified: modified.into(),
- };
- s.save_json(&json_path)?;
- Ok(s)
- }
- pub fn load_json(path: &str) -> anyhow::Result<Self> {
- let f = File::open(path)?;
- let s: Self = serde_json::from_reader(f)?;
- Ok(s)
- }
- pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
- let f = File::create(path)?;
- serde_json::to_writer(f, self)?;
- Ok(())
- }
- }
/// An in-memory collection of [`WGSBam`] entries discovered under a result directory.
#[derive(Debug, Default, Clone)]
pub struct BamCollection {
    /// All successfully parsed BAMs (failures are logged and skipped at load time).
    pub bams: Vec<WGSBam>,
}
- impl BamCollection {
- pub fn new(result_dir: &str) -> Self {
- load_bam_collection(result_dir)
- }
- pub fn by_acquisition_id(&self) -> HashMap<String, Vec<&WGSBam>> {
- let mut acq: HashMap<String, Vec<&WGSBam>> = HashMap::new();
- for bam in self.bams.iter() {
- for (acq_id, _, _) in bam.composition.iter() {
- if let Some(entry) = acq.get_mut(acq_id) {
- entry.push(bam);
- } else {
- acq.insert(acq_id.to_string(), vec![bam]);
- }
- }
- }
- acq
- }
- pub fn get(&self, id: &str, time_point: &str) -> Vec<&WGSBam> {
- self.bams
- .iter()
- .filter(|b| b.id == id && b.time_point == time_point)
- .collect()
- }
- pub fn by_id_completed(&self, min_diag_cov: f32, min_mrd_cov: f32) -> Vec<WGSBam> {
- self.bams
- .iter()
- // .filter(|b| matches!(b.bam_type, BamType::WGS))
- .filter(|b| match b.time_point.as_str() {
- "diag" => b.bam_stats.global_coverage >= min_diag_cov as f64,
- "mrd" => b.bam_stats.global_coverage >= min_mrd_cov as f64,
- _ => false,
- })
- // .filter(|b| match &b.cramino {
- // Some(cramino) => match b.time_point.as_str() {
- // "diag" => cramino.mean_length >= min_diag_cov as f64,
- // "mrd" => cramino.mean_length >= min_mrd_cov as f64,
- // _ => false,
- // },
- // _ => false,
- // })
- .cloned()
- .collect()
- }
- pub fn ids(&self) -> Vec<String> {
- let mut ids: Vec<String> = self.bams.iter().map(|b| b.id.clone()).collect();
- ids.sort();
- ids.dedup();
- ids
- }
- }
- pub fn load_bam_collection(result_dir: &str) -> BamCollection {
- let pattern = format!("{}/*/*/*.bam", result_dir);
- let bams = glob(&pattern)
- .expect("Failed to read glob pattern.")
- // .par_bridge()
- .filter_map(|entry| {
- match entry {
- Ok(path) => match WGSBam::new(path) {
- Ok(bam) => return Some(bam),
- Err(e) => warn!("{e}"),
- },
- Err(e) => warn!("Error: {:?}", e),
- }
- None
- })
- .collect();
- BamCollection { bams }
- }
- /// Computes the composition of `(RG, fn)` tag prefixes in a BAM file sample.
- ///
- /// This function parses a BAM file and, for a subset of records (`sample_size`),
- /// extracts the `RG` (read group) and `fn` (typically a file or flowcell ID) string tags from each record.
- /// It groups records by the prefix (before `_`, if present) of each tag,
- /// counts occurrences for each pair, and reports their proportion in the sample.
- ///
- /// # Arguments
- ///
- /// * `file_path` - Path to the BAM file.
- /// * `sample_size` - Maximum number of records to sample and process.
- ///
- /// # Returns
- ///
- /// Returns a vector of tuples, each containing:
- /// - `String`: the <runid> (up to the first underscore of the `RG` tag, or full tag if no underscore)
- /// - `String`: the <flowcell-id> (up to the first underscore of the `fn` tag, or full tag if no underscore)
- /// - `f64`: percentage of records with this `(runid, flowcell-id)` prefix pair, relative to total successfully processed records
- ///
- /// # Errors
- ///
- /// Returns an error if the BAM file cannot be opened or read.
- /// Any record in the sampled set does not have both required tags, or the tags are not strings.
- ///
- /// # Examples
- ///
- /// ```no_run
- /// # use anyhow::Result;
- /// let stats = bam_composition("reads.bam", 10_000)?;
- /// for (rg, flowcell, pct) in stats {
- /// println!("{} / {}: {:.2}%", rg, flowcell, pct);
- /// }
- /// # Ok::<(), anyhow::Error>(())
- /// ```
- ///
- /// # Notes
- ///
- /// - Only the first `sample_size` records in the BAM are processed.
- /// - Percentages are normalized by the number of records that had both required tags.
- /// - This function uses the `rust-htslib` crate for BAM parsing.
- ///
- /// # Tag requirements
- ///
- /// - Both `RG` and `fn` tags must be present and of string type.
- /// - Tag values are split on the first underscore; only the prefix is used for grouping.
- ///
- /// # See also
- ///
- /// - [SAM/BAM tag conventions](https://samtools.github.io/hts-specs/SAMv1.pdf)
- /// - [Dorado SAM specifications](https://github.com/nanoporetech/dorado/blob/release-v1.0/documentation/SAM.md)
- ///
- pub fn bam_composition(
- file_path: &str,
- sample_size: usize,
- ) -> anyhow::Result<Vec<(String, String, f64)>> {
- use std::collections::HashMap;
- use rust_htslib::bam::{Reader, record::Aux};
- let mut bam = Reader::from_path(file_path)?;
- let mut rgs: HashMap<(String, String), u64> = HashMap::new();
- let mut processed = 0u64;
- for (i, rec) in bam.records().filter_map(Result::ok).take(sample_size).enumerate() {
- let rg = match rec.aux(b"RG") {
- Ok(Aux::String(s)) => s,
- Ok(_) => return Err(anyhow::anyhow!("Record {}: RG tag is not a string", i + 1)),
- Err(_) => return Err(anyhow::anyhow!("Record {}: RG tag missing", i + 1)),
- };
- let fn_tag = match rec.aux(b"fn") {
- Ok(Aux::String(b)) => b,
- Ok(_) => return Err(anyhow::anyhow!("Record {}: fn tag is not a string", i + 1)),
- Err(_) => return Err(anyhow::anyhow!("Record {}: fn tag missing", i + 1)),
- };
- let rg_prefix = rg.split_once('_').map(|(a, _)| a).unwrap_or(rg);
- let fn_prefix = fn_tag.split_once('_').map(|(b, _)| b).unwrap_or(fn_tag);
- *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string())).or_default() += 1;
- processed += 1;
- }
- let results: Vec<_> = rgs
- .into_iter()
- .map(|((rg, fn_tag), n)| (
- rg,
- fn_tag,
- n as f64 * 100.0 / processed as f64
- ))
- .collect();
- Ok(results)
- }
- pub fn bam_has_fc(bam: &WGSBam, flow_cell: &FlowCell) -> anyhow::Result<bool> {
- let mut bam = rust_htslib::bam::Reader::from_path(&bam.path)?;
- let mut rgs: HashMap<String, u64> = HashMap::new();
- for result in bam.records().filter_map(Result::ok).take(2_000) {
- if let rust_htslib::bam::record::Aux::String(s) = result.aux(b"fn")? {
- *rgs.entry(s.to_string()).or_default() += 1;
- }
- }
- for k in rgs.keys() {
- if k.contains(&flow_cell.sample_sheet.flow_cell_id) {
- return Ok(true);
- }
- }
- Ok(false)
- }
/// Iterator-style sampler yielding one read at each of a set of random genome
/// positions (drawn length-weighted by [`sample_random_positions`]).
pub struct BamReadsSampler {
    /// Random (contig name, 0-based position) pairs to visit.
    pub positions: Vec<(String, u64)>,
    /// Indexed BAM reader used to fetch reads at each position.
    pub reader: rust_htslib::bam::IndexedReader,
    // Index of the next entry of `positions` to sample.
    current_index: usize,
}
- impl BamReadsSampler {
- pub fn new(path: &str, n: usize) -> anyhow::Result<Self> {
- debug!("Reading BAM file: {path}");
- let reader = rust_htslib::bam::IndexedReader::from_path(path)?;
- let header = reader.header();
- let contig_len: Vec<(String, u64)> = header
- .target_names()
- .into_iter()
- .map(|target_name| {
- let tid = header.tid(target_name).unwrap();
- (
- String::from_utf8(target_name.to_vec()).unwrap(),
- header.target_len(tid).unwrap(),
- )
- })
- .collect();
- let positions = sample_random_positions(&contig_len, n);
- Ok(Self {
- positions,
- reader,
- current_index: 0,
- })
- }
- }
- impl Iterator for BamReadsSampler {
- type Item = Result<rust_htslib::bam::Record, rust_htslib::errors::Error>;
- fn next(&mut self) -> Option<Self::Item> {
- loop {
- if self.current_index < self.positions.len() {
- let (contig, pos) = &self.positions[self.current_index];
- match self.reader.fetch((contig, *pos, pos + 1)) {
- Ok(_) => (),
- Err(e) => warn!("Error while reading BAM {e}"),
- }
- let result = self.reader.records().next();
- self.current_index += 1;
- if result.is_some() {
- return result;
- }
- } else {
- return None;
- }
- }
- }
- }
- pub fn n_mapped_reads(bam_path: &str) -> anyhow::Result<u64> {
- let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path)?;
- let mapped = bam
- .index_stats()?
- .into_iter()
- .filter(|(tid, _, _, _)| *tid < 24)
- .map(|(_, _, m, _)| m)
- .sum();
- Ok(mapped)
- }
- // 0-based inclusif
- pub fn nt_pileup(
- bam: &mut rust_htslib::bam::IndexedReader,
- chr: &str,
- position: u32,
- with_next_ins: bool,
- ) -> anyhow::Result<Vec<u8>> {
- use rust_htslib::{bam, bam::Read};
- let mut bases = Vec::new();
- bam.fetch((chr, position, position + 1))?;
- let mut bam_pileup = Vec::new();
- for p in bam.pileup() {
- let pileup = p.context(format!(
- "Can't pileup bam at position {}:{} (0-based)",
- chr, position
- ))?;
- let cur_position = pileup.pos();
- if cur_position == position {
- for alignment in pileup.alignments() {
- match alignment.indel() {
- bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
- bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
- _ => {
- let record = alignment.record();
- if record.seq_len() > 0 {
- if let Some(b) = base_at(&record, position, with_next_ins)? {
- bases.push(b);
- }
- } else if alignment.is_del() {
- bases.push(b'D');
- }
- }
- }
- }
- }
- }
- Ok(bases)
- }
/// Returns the base of `record` aligned over reference position `at_pos`
/// (0-based), or `None` when the read does not cover that position.
///
/// Walks the CIGAR tracking read and reference offsets:
/// * `Some(b'D')` – a deletion covers `at_pos`;
/// * `None` – a reference skip (`N`) covers `at_pos`, or the read ends before it;
/// * `Some(base)` – the read base (as stored in `record.seq()`) over `at_pos`;
/// * with `with_next_ins`, `Some(b'I')` when an insertion starts right after `at_pos`.
pub fn base_at(
    record: &rust_htslib::bam::record::Record,
    at_pos: u32,
    with_next_ins: bool,
) -> anyhow::Result<Option<u8>> {
    let cigar = record.cigar();
    let seq = record.seq();
    let pos = cigar.pos() as u32;
    let mut read_i = 0u32;
    // let at_pos = at_pos - 1;
    let mut ref_pos = pos;
    // The read starts after the queried position: it cannot cover it.
    if ref_pos > at_pos {
        return Ok(None);
    }
    for (id, op) in cigar.iter().enumerate() {
        // How many bases this op consumes on the read and on the reference.
        let (add_read, add_ref) = match *op {
            Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
            Cigar::Ins(len) => (len, 0),
            Cigar::Del(len) => (0, len),
            Cigar::RefSkip(len) => (0, len),
            Cigar::SoftClip(len) => (len, 0),
            Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
        };
        // If at the end of the op len and next op is Ins return I
        // NOTE(review): this uses `add_read` while `base_at_new` uses the
        // reference-consumed length; both agree on M/=/X ops but may differ
        // around soft clips and insertions — confirm which is intended.
        if with_next_ins && ref_pos + add_read == at_pos + 1 {
            if let Some(Cigar::Ins(_)) = cigar.get(id + 1) {
                return Ok(Some(b'I'));
            }
        }
        if ref_pos + add_ref > at_pos {
            // Handle deletions directly
            if let Cigar::Del(_) = *op {
                return Ok(Some(b'D'));
            } else if let Cigar::RefSkip(_) = op {
                return Ok(None);
            } else {
                // Position falls inside a read-consuming op: index into the read.
                let diff = at_pos - ref_pos;
                let p = read_i + diff;
                return Ok(Some(seq[p as usize]));
            }
        }
        read_i += add_read;
        ref_pos += add_ref;
    }
    Ok(None)
}
- pub fn counts_at(
- bam: &mut rust_htslib::bam::IndexedReader,
- chr: &str,
- position: u32,
- ) -> anyhow::Result<HashMap<String, i32>> {
- let p = nt_pileup(bam, chr, position, false)?
- .iter()
- .map(|e| String::from_utf8(vec![*e]).unwrap())
- .collect::<Vec<_>>();
- let mut counts = HashMap::new();
- for item in p.iter() {
- *counts.entry(item.to_string()).or_insert(0) += 1;
- }
- Ok(counts)
- }
/// Collects the sequences inserted in reads immediately after `chr:position`
/// (0-based): fetches a 10 bp window and inspects the pileup column at
/// `position + 1`, delegating per-read extraction to [`ins_at`].
pub fn ins_pileup(
    bam: &mut rust_htslib::bam::IndexedReader,
    chr: &str,
    position: u32,
) -> anyhow::Result<Vec<String>> {
    let mut bases = Vec::new();
    bam.fetch((chr, position, position + 10))?;
    for p in bam.pileup() {
        let pileup = p.context(format!(
            "Can't pileup bam at position {}:{} (0-based)",
            chr, position
        ))?;
        let cur_position = pileup.pos();
        // Ins in the next position
        if cur_position == position + 1 {
            // debug!("{cur_position}");
            for alignment in pileup.alignments() {
                let record = alignment.record();
                // Skip records that carry no stored sequence.
                if record.seq_len() > 0 {
                    if let Some(b) = ins_at(&record, position)? {
                        bases.push(b);
                    }
                }
            }
        }
    }
    Ok(bases)
}
/// Returns the sequence inserted in `record` immediately after reference
/// position `at_pos` (0-based), or `None` when the read carries no such
/// insertion.
///
/// Walks the CIGAR tracking read/reference offsets; when an `I` op ends within
/// `(at_pos, at_pos + 10)` and its left anchor base maps to `at_pos`, the
/// inserted read bases are returned as a `String`.
pub fn ins_at(
    record: &rust_htslib::bam::record::Record,
    at_pos: u32,
) -> anyhow::Result<Option<String>> {
    use rust_htslib::bam::record::Cigar;
    let cigar = record.cigar();
    let seq = record.seq();
    let pos = cigar.pos() as u32;
    let mut read_i = 0u32;
    let mut ref_pos = pos;
    // The read starts after the queried position: it cannot carry the insertion.
    if ref_pos > at_pos {
        return Ok(None);
    }
    // debug!(
    //     "read: {}",
    //     String::from_utf8(record.qname().to_vec()).unwrap()
    // );
    for op in cigar.iter() {
        // How many bases this op consumes on the read and on the reference.
        let (add_read, add_ref) = match *op {
            Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
            Cigar::Ins(len) => (len, 0),
            Cigar::Del(len) => (0, len),
            Cigar::RefSkip(len) => (0, len),
            Cigar::SoftClip(len) => (len, 0),
            Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
        };
        // Only consider ops ending in a small window after the queried base.
        if ref_pos + add_read > at_pos && ref_pos + add_read < at_pos + 10 {
            if let Cigar::Ins(v) = *op {
                // debug!(
                //     "ins size {v} @ {} (corrected {})",
                //     ref_pos + add_read,
                //     (ref_pos + add_read) - v - 1
                // );
                // The insertion is anchored at `at_pos` when the base just
                // before the inserted segment maps to `at_pos`.
                if (ref_pos + add_read) - v - 1 == at_pos {
                    let inserted_seq =
                        seq.as_bytes()[read_i as usize..(read_i + v) as usize].to_vec();
                    return Ok(Some(String::from_utf8(inserted_seq)?));
                }
            }
        }
        read_i += add_read;
        ref_pos += add_ref;
    }
    Ok(None)
}
- pub fn counts_ins_at(
- bam: &mut rust_htslib::bam::IndexedReader,
- chr: &str,
- position: u32,
- ) -> anyhow::Result<HashMap<String, i32>> {
- let p = ins_pileup(bam, chr, position)?;
- let mut counts = HashMap::new();
- for item in p.iter() {
- *counts.entry(item.to_string()).or_insert(0) += 1;
- }
- Ok(counts)
- }
- pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Vec<(String, u64)> {
- let mut rng = rng();
- let total_length: u64 = chromosomes.iter().map(|(_, length)| length).sum();
- (0..n)
- .map(|_| {
- let random_position = rng.random_range(0..total_length);
- let mut cumulative_length = 0;
- for (chr, length) in chromosomes {
- cumulative_length += length;
- if random_position < cumulative_length {
- let position_within_chr = random_position - (cumulative_length - length);
- return (chr.clone(), position_within_chr);
- }
- }
- unreachable!("Should always find a chromosome")
- })
- .collect()
- }
/// High-level mapping statistics for a WGS BAM file.
///
/// This struct aggregates essential quality control and mapping metrics, suitable for reporting or downstream QC.
/// Unless otherwise specified, units are in base pairs (bp) or counts of reads.
///
/// # Fields
/// - `all_records`: Total number of records in the BAM, before any filtering.
/// - `n_reads`: Number of primary, mapped, QC-passed, non-duplicate reads counted for statistics.
/// - `mapped_fraction`: Fraction of mapped (counted) reads relative to all records.
/// - `n_unmapped`: Number of unmapped reads (`BAM_FUNMAP`).
/// - `n_duplicates`: Number of reads marked as duplicate (`BAM_FDUP`).
/// - `n_lowq`: Number of reads below the minimum mapping quality.
/// - `mapped_yield`: Total sum of mapped read lengths (bp).
/// - `mean_read_length`: Mean length of mapped reads (bp).
/// - `median_read_length`: Median mapped read length (bp).
/// - `coverage_stddev`: Standard deviation of per-read global coverage contribution.
/// - `global_coverage`: Mean mapped yield divided by total genome size.
/// - `karyotype`: Per-contig summary: (TID, contig length, contig name, mapped yield, mean coverage, coverage stddev, coverage ratio).
/// - `n50`: N50 statistic for mapped read lengths (bp).
/// - `by_lengths`: Histogram of mapped read lengths as (length, count), sorted by length.
///
/// # Filtering Rule
/// Only records that are:
/// - not secondary (`BAM_FSECONDARY`; note that supplementary alignments are *not* filtered),
/// - mapped (`!BAM_FUNMAP`),
/// - pass vendor quality checks (`!BAM_FQCFAIL`),
/// - not marked as duplicate (`!BAM_FDUP`),
/// - and with mapping quality ≥ `bam_min_mapq`,
/// are counted as mapped reads.
///
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct WGSBamStats {
    /// Total number of BAM records (including unmapped, filtered, duplicates).
    pub all_records: u64,
    /// Number of mapped, primary, QC-passed, non-duplicate reads (final filtered count).
    pub n_reads: u64,
    /// Fraction of counted mapped reads over all BAM records.
    pub mapped_fraction: f64,
    /// Number of unmapped records (flag 0x4).
    pub n_unmapped: u64,
    /// Number of duplicate records (flag 0x400).
    pub n_duplicates: u64,
    /// Number of records below the minimum mapping quality (`bam_min_mapq`).
    pub n_lowq: u64,
    /// Total sum of mapped read lengths (bp).
    pub mapped_yield: u64,
    /// Mean mapped read length (bp).
    pub mean_read_length: f64,
    /// Median mapped read length (bp).
    pub median_read_length: u64,
    /// Standard deviation of per-read global coverage contributions.
    pub coverage_stddev: f64,
    /// Mean global coverage: `mapped_yield / genome_size`.
    pub global_coverage: f64,
    /// (tid, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev, coverage_ratio)
    pub karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)>,
    /// N50 value of mapped read lengths (bp).
    pub n50: u64,
    /// Histogram of mapped read lengths: (length, count), sorted by length.
    pub by_lengths: Vec<(u64, u32)>,
}
- impl WGSBamStats {
- /// Computes `WGSBamStats` by scanning a BAM file and summarizing essential statistics.
- ///
- /// # Parameters
- /// - `case_id`: Sample/case identifier used to locate the BAM file via config.
- /// - `time`: Timestamp string for progress/logging.
- /// - `config`: Configuration struct. Must provide:
- /// - `solo_bam(case_id, time)`: path to BAM file.
- /// - `bam_min_mapq`: minimum mapping quality to include read.
- /// - `bam_n_threads`: number of BAM decompression threads.
- ///
- /// # Filtering Rule
- /// Only primary, mapped, non-duplicate, QC-passed reads with sufficient mapping quality are included (see struct doc).
- ///
- /// # Returns
- /// On success, returns a fully populated `WGSBamStats` with all core and extended fields.
- /// Returns an error if the BAM cannot be parsed or genome sizes cannot be retrieved.
- pub fn new(case_id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
- let bam_path = config.solo_bam(case_id, time);
- let bam_min_mapq = config.bam_min_mapq;
- let bam_n_threads = config.bam_n_threads;
- let mut bam = rust_htslib::bam::Reader::from_path(&bam_path)?;
- let h = bam.header().clone();
- let header = rust_htslib::bam::Header::from_template(&h);
- bam.set_threads(bam_n_threads as usize)?;
- info!("Parsing BAM file: {bam_path}");
- let mut all_records = 0u64;
- let mut n_reads = 0u64;
- let mut n_unmapped = 0u64;
- let mut n_duplicates = 0u64;
- let mut n_lowq = 0u64;
- let mut mapped_lengths = Vec::new();
- let mut lengths_by_tid: HashMap<i32, Vec<u64>> = HashMap::new();
- // Main scan loop: apply flag and MAPQ filters.
- for rec in bam.rc_records() {
- all_records += 1;
- let r = rec.with_context(|| "failed to parse BAM record")?;
- let flags = r.flags();
- // Count and exclude unmapped.
- if flags & (rust_htslib::htslib::BAM_FUNMAP as u16) != 0 {
- n_unmapped += 1;
- continue;
- }
- // Count and exclude duplicates.
- if flags & (rust_htslib::htslib::BAM_FDUP as u16) != 0 {
- n_duplicates += 1;
- continue;
- }
- // Filter by mapping quality.
- if r.mapq() < bam_min_mapq {
- n_lowq += 1;
- continue;
- }
- // Exclude secondary and QC-failed.
- let bad_flags = rust_htslib::htslib::BAM_FSECONDARY | rust_htslib::htslib::BAM_FQCFAIL;
- if (flags & (bad_flags as u16)) != 0 {
- continue;
- }
- n_reads += 1;
- let start = r.pos();
- let end = r.reference_end();
- let len = if end > start { (end - start) as u64 } else { 0 };
- mapped_lengths.push(len);
- lengths_by_tid.entry(r.tid()).or_default().push(len);
- if n_reads % 500_000 == 0 {
- info!("{case_id} {time}: processed {n_reads} mapped reads");
- }
- }
- let mapped_fraction = if all_records > 0 {
- n_reads as f64 / all_records as f64
- } else {
- 0.0
- };
- // Compute mean, median, N50 (single sort).
- let mut lens = mapped_lengths.clone();
- lens.sort_unstable();
- let mean_read_length = if n_reads > 0 {
- mapped_lengths.iter().sum::<u64>() as f64 / n_reads as f64
- } else {
- 0.0
- };
- let median_read_length = if lens.is_empty() {
- 0
- } else {
- lens[lens.len() / 2]
- };
- let mapped_yield: u64 = mapped_lengths.iter().sum();
- // N50: minimal length L such that sum(lengths >= L) >= 50% yield.
- let mut cum = 0u64;
- let half = mapped_yield / 2;
- let n50 = lens
- .iter()
- .rev()
- .find_map(|&l| {
- cum += l;
- if cum >= half {
- Some(l)
- } else {
- None
- }
- })
- .unwrap_or(0);
- // Histogram (sorted by length).
- let mut histo = BTreeMap::new();
- for &len in &mapped_lengths {
- *histo.entry(len).or_insert(0) += 1;
- }
- let by_lengths: Vec<_> = histo.into_iter().collect();
- // Genome/coverage.
- let genome = get_genome_sizes(&header)?;
- let genome_size: u64 = genome.values().sum();
- let global_coverage = if genome_size > 0 {
- mapped_yield as f64 / genome_size as f64
- } else {
- 0.0
- };
- // Global coverage stdev: stdev of per-read contributions to global coverage.
- let mut total_sq_cov = 0.0;
- for &len in &mapped_lengths {
- let cov = if genome_size > 0 {
- len as f64 / genome_size as f64
- } else {
- 0.0
- };
- total_sq_cov += cov * cov;
- }
- let mean_cov = global_coverage;
- let variance = if n_reads > 0 {
- let var = total_sq_cov / n_reads as f64 - mean_cov.powi(2);
- if var < 0.0 {
- 0.0
- } else {
- var
- }
- } else {
- 0.0
- };
- let coverage_stddev = variance.sqrt();
- // Per-contig: (TID, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev , coverage_ratio).
- let mut karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)> = lengths_by_tid
- .iter()
- .map(|(tid, lengths)| {
- let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec()).unwrap();
- let contig_len = genome.get(&contig).copied().unwrap_or(0);
- let mapped_sum: u64 = lengths.iter().sum();
- // Per-contig mean coverage
- let mean_cov = if contig_len > 0 {
- mapped_sum as f64 / contig_len as f64
- } else {
- 0.0
- };
- let coverage_ratio = if global_coverage > 0.0 {
- mean_cov / global_coverage
- } else {
- 0.0
- };
- // Per-contig coverage stdev (over all reads mapped to this contig)
- let var = if contig_len > 0 && !lengths.is_empty() {
- lengths
- .iter()
- .map(|&l| {
- let cov = l as f64 / contig_len as f64;
- (cov - mean_cov).powi(2)
- })
- .sum::<f64>()
- / lengths.len() as f64
- } else {
- 0.0
- };
- let stddev = var.sqrt();
- (
- *tid,
- contig_len,
- contig,
- mapped_sum,
- mean_cov,
- stddev,
- coverage_ratio,
- )
- })
- .collect();
- karyotype.sort_unstable_by_key(|&(tid, _, _, _, _, _, _)| tid);
- Ok(Self {
- all_records,
- n_reads,
- mapped_fraction,
- n_unmapped,
- n_duplicates,
- mapped_yield,
- mean_read_length,
- median_read_length,
- coverage_stddev,
- global_coverage,
- karyotype,
- n50,
- by_lengths,
- n_lowq,
- })
- }
- }
- impl fmt::Display for WGSBamStats {
- /// Formats the BAM statistics as a readable summary table.
- ///
- /// Shows all global stats and a per-contig breakdown with coverage stdev.
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- writeln!(f, "BAM statistics summary:")?;
- writeln!(f, " All BAM records: {}", self.all_records)?;
- writeln!(f, " Unmapped reads: {}", self.n_unmapped)?;
- writeln!(f, " Duplicate reads: {}", self.n_duplicates)?;
- writeln!(f, " Low Q reads: {}", self.n_lowq)?;
- writeln!(f, " Counted (mapped) reads: {}", self.n_reads)?;
- writeln!(f, " Mapped fraction: {:.4}", self.mapped_fraction)?;
- writeln!(
- f,
- " Mapped yield [Gb]: {:.2}",
- self.mapped_yield as f64 / 1e9
- )?;
- writeln!(f, " Mean read length [bp]: {:.2}", self.mean_read_length)?;
- writeln!(f, " Median read length [bp]: {}", self.median_read_length)?;
- writeln!(f, " N50 [bp]: {}", self.n50)?;
- writeln!(f, " Global mean coverage: {:.2}", self.global_coverage)?;
- writeln!(f, " Global coverage stdev: {:.2}", self.coverage_stddev)?;
- writeln!(f, " Per-contig stats:")?;
- writeln!(
- f,
- " {:<8} {:<10} {:<12} {:<14} {:<12} {:<10} {:<10}",
- "TID", "Length", "Name", "MappedYield", "CovMean", "CovStdev", "CovRatio"
- )?;
- for (tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio) in
- &self.karyotype
- {
- writeln!(
- f,
- " {:<8} {:<10} {:<12} {:<14} {:<12.4} {:.4} {:.2}",
- tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio
- )?;
- }
- Ok(())
- }
- }
- pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
- let mut genome = HashMap::new();
- for (key, records) in header.to_hashmap() {
- for record in records {
- if key == "SQ" {
- genome.insert(
- record["SN"].to_string(),
- record["LN"]
- .parse::<u64>()
- .expect("Failed parsing length of chromosomes"),
- );
- }
- }
- }
- Ok(genome)
- }
/// Result of querying a read at a reference position.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum PileBase {
    A,
    C,
    G,
    T,
    N,
    Ins,                 // insertion immediately after the queried base
    Del((Vec<u8>, u32)), // deletion covering the queried base: (qname, length)
    Skip,                // reference skip (N)
}

/// Decodes one ASCII nucleotide byte (as produced by `Seq::as_bytes`, e.g.
/// `b'A'`/`b'a'`) to a [`PileBase`]; anything that is not A/C/G/T maps to `N`.
///
/// The previous implementation masked the low nibble and matched codes 0-3,
/// which corresponds to neither htslib's 4-bit encoding (A=1, C=2, G=4, T=8)
/// nor ASCII, so every base was misclassified.
#[inline]
fn decode(b: u8) -> PileBase {
    match b.to_ascii_uppercase() {
        b'A' => PileBase::A,
        b'C' => PileBase::C,
        b'G' => PileBase::G,
        b'T' => PileBase::T,
        _ => PileBase::N,
    }
}
/// Returns the [`PileBase`] of `record` over reference position `at_pos`
/// (0-based), or `None` when the read does not cover that position (or the
/// computed read index falls outside the stored sequence).
pub fn base_at_new(
    record: &rust_htslib::bam::Record,
    at_pos: i64,
    with_next_ins: bool,
) -> Option<PileBase> {
    // The read starts after the queried position: it cannot cover it.
    if record.pos() > at_pos {
        return None;
    }
    let mut ref_pos = record.pos();
    let mut read_pos = 0_i64;
    let cigar = record.cigar();
    let seq = record.seq().as_bytes(); // already expanded to ASCII (A,C,G,T,N)
    for (i, op) in cigar.iter().enumerate() {
        let l = op.len() as i64;
        // How many bases this op consumes on the read and on the reference.
        // NOTE(review): the catch-all panics on exotic ops such as 'B' — confirm
        // inputs can never contain them.
        let (consume_read, consume_ref) = match op.char() {
            'M' | '=' | 'X' => (l, l),
            'I' | 'S' => (l, 0),
            'D' | 'N' => (0, l),
            'H' | 'P' => (0, 0),
            _ => unreachable!(),
        };
        /* look ahead for an insertion immediately AFTER the queried base */
        if with_next_ins
            && ref_pos + consume_ref == at_pos + 1
            && matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
        {
            return Some(PileBase::Ins);
        }
        // Instead of PileBase::Ins, consider
        // extracting the inserted sequence and returning its length or actual bases:
        // if let Some(Cigar::Ins(len)) = cigar.get(i + 1) {
        //     let ins_seq = &seq[read_pos as usize + (consume_ref as usize)
        //         ..read_pos as usize + (consume_ref as usize) + (*len as usize)];
        //     return Some(PileBase::Ins(ins_seq.to_vec()));
        // }
        /* Does the queried reference coordinate fall inside this CIGAR op? */
        if ref_pos + consume_ref > at_pos {
            return Some(match op {
                Cigar::Del(e) => PileBase::Del((record.qname().to_vec(), *e)),
                Cigar::RefSkip(_) => PileBase::Skip,
                _ => {
                    let offset = (at_pos - ref_pos) as usize;
                    if read_pos as usize + offset >= seq.len() {
                        return None; // out of range, malformed BAM
                    }
                    // NOTE(review): `seq` holds ASCII bases here, so `decode`
                    // must accept ASCII input — confirm the pairing.
                    decode(seq[read_pos as usize + offset])
                }
            });
        }
        read_pos += consume_read;
        ref_pos += consume_ref;
    }
    None // beyond alignment
}
- /// Return the pile‑up of **all reads** at `chr:pos` (0‑based, inclusive).
- ///
- /// * `bam` – an open [`rust_htslib::bam::IndexedReader`].
- /// * `chr` / `pos` – reference contig name and coordinate.
- /// * `with_next_ins` – if `true`, report [`Base::Ins`] when an insertion starts
- /// **right after** the queried base.
- ///
- /// The function bounds the internal pile‑up depth to 10 000 reads to protect
- /// against malformed BAM files that could otherwise allocate unbounded memory.
- ///
- /// # Errors
- ///
- /// Propagates any I/O or parsing error from *rust‑htslib*, wrapped in
- /// [`anyhow::Error`].
- pub fn nt_pileup_new(
- bam: &mut rust_htslib::bam::IndexedReader,
- chr: &str,
- pos: u32,
- with_next_ins: bool,
- ) -> anyhow::Result<Vec<PileBase>> {
- // Storage for the per‑read observations.
- let mut pile = Vec::new();
- // Restrict BAM iterator to the one‑base span of interest.
- bam.fetch((chr, pos, pos + 1))?;
- // Hard cap on depth (adjust if your use‑case needs deeper coverage).
- bam.pileup().set_max_depth(10_000);
- // Stream the pileup; HTSlib walks the reads for us.
- for pileup in bam.pileup() {
- // Attach context to any low‑level error.
- let pileup = pileup.context(format!("Failed to pileup BAM at {chr}:{pos}"))?;
- // Skip other positions quickly.
- if pileup.pos() != pos {
- continue;
- }
- for aln in pileup.alignments() {
- // Handle the three indel states reported by HTSlib.
- match aln.indel() {
- rust_htslib::bam::pileup::Indel::Ins(_) => {
- pile.push(PileBase::Ins); // insertion *starts at* this base
- }
- rust_htslib::bam::pileup::Indel::Del(e) => {
- let qname = aln.record().qname().to_vec();
- pile.push(PileBase::Del((qname, e))); // deletion length > 1, but still covers here
- }
- rust_htslib::bam::pileup::Indel::None => {
- // Regular match/mismatch: delegate to `base_at`.
- if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins) {
- pile.push(base);
- }
- }
- }
- }
- }
- Ok(pile)
- }
|