use core::fmt; use std::{ collections::{BTreeMap, HashMap}, fs::{self, File}, path::PathBuf, }; use anyhow::{anyhow, Context}; use chrono::{DateTime, Utc}; use dashmap::DashMap; use glob::glob; use log::{debug, info, warn}; use rand::{rng, Rng}; use rayon::prelude::*; use rust_htslib::bam::{ext::BamRecordExtensions, record::Cigar, Read}; use serde::{Deserialize, Serialize}; use crate::config::Config; use super::flowcells::FlowCell; #[derive(Debug, Clone, Deserialize, Serialize)] pub struct WGSBam { pub id: String, pub time_point: String, pub reference_genome: String, // pub bam_type: BamType, pub path: PathBuf, pub modified: DateTime, pub bam_stats: WGSBamStats, // pub cramino: Option, pub composition: Vec<(String, String, f64)>, // acquisition id, fn } // #[derive(Debug, PartialEq, Clone, Deserialize, Serialize)] // pub enum BamType { // WGS, // Panel(String), // ChIP(String), // } // impl WGSBam { pub fn new(path: PathBuf) -> anyhow::Result { let stem = path .clone() .file_stem() .context(format!("Can't parse stem from {}", path.display()))? .to_string_lossy() .to_string(); let stem: Vec<&str> = stem.split('_').collect(); if stem.len() != 3 { return Err(anyhow!("Error in bam name: {}", path.display())); } let id = stem[0].to_string(); let time_point = stem[1].to_string(); let reference_genome = stem .last() .context("Can't get last from stem {stem}")? .to_string(); // let bam_type = if stem.len() == 4 { // match stem[2] { // "oncoT" => BamType::Panel("oncoT".to_string()), // "H3K27ac" => BamType::ChIP("H3K27ac".to_string()), // "H3K4me3" => BamType::ChIP("H3K4me3".to_string()), // _ => return Err(anyhow!("Error in bam name: {}", path.display())), // } // } else { // BamType::WGS // }; let file_metadata = fs::metadata(&path)?; let modified = file_metadata.modified()?; let tp_dir = path .parent() .context("Can't parse parent from: {bam_path}")?; let json_path = format!( "{}/{id}_{time_point}_{reference_genome}_info.json", tp_dir.to_string_lossy() ); let json_file = PathBuf::from(&json_path); if json_file.exists() && json_file.metadata()?.modified()? > modified { match Self::load_json(&json_path) { Ok(s) => return Ok(s), Err(e) => { warn!("Error while loading {}.\n{e}", json_path); } } } // let cramino_path = format!( // "{}/{id}_{time_point}_{reference_genome}_cramino.txt", // tp_dir.to_string_lossy() // ); // let cramino = if bam_type == BamType::WGS { // if !PathBuf::from_str(&cramino_path)?.exists() { // // return Err(anyhow!("Cramino file missing {cramino_path}")); // Cramino::default().with_bam(path.to_str().unwrap())?; // } // let mut cramino = Cramino::default().with_result_path(&cramino_path); // cramino // .parse_results() // .context(format!("Error while parsing cramino for {cramino_path}"))?; // // if let Some(cramino) = cramino.results { // Some(cramino) // } else { // return Err(anyhow!("Cramino results parsing failed")); // } // } else { // None // }; let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(format!( "Error while reading BAM composition for {}", path.display() ))?; let s = Self { path, bam_stats: WGSBamStats::new(&id, &time_point, &Config::default())?, // cramino, id: id.to_string(), time_point: time_point.to_string(), // bam_type, reference_genome, composition, modified: modified.into(), }; s.save_json(&json_path)?; Ok(s) } pub fn load_json(path: &str) -> anyhow::Result { let f = File::open(path)?; let s: Self = serde_json::from_reader(f)?; Ok(s) } pub fn save_json(&self, path: &str) -> anyhow::Result<()> { let f = File::create(path)?; serde_json::to_writer(f, self)?; Ok(()) } } #[derive(Debug, Default, Clone)] pub struct BamCollection { pub bams: Vec, } impl BamCollection { pub fn new(result_dir: &str) -> Self { load_bam_collection(result_dir) } pub fn by_acquisition_id(&self) -> HashMap> { let mut acq: HashMap> = HashMap::new(); for bam in self.bams.iter() { for (acq_id, _, _) in bam.composition.iter() { if let Some(entry) = acq.get_mut(acq_id) { entry.push(bam); } else { acq.insert(acq_id.to_string(), vec![bam]); } } } acq } pub fn get(&self, id: &str, time_point: &str) -> Vec<&WGSBam> { self.bams .iter() .filter(|b| b.id == id && b.time_point == time_point) .collect() } pub fn by_id_completed(&self, min_diag_cov: f32, min_mrd_cov: f32) -> Vec { self.bams .iter() // .filter(|b| matches!(b.bam_type, BamType::WGS)) .filter(|b| match b.time_point.as_str() { "diag" => b.bam_stats.global_coverage >= min_diag_cov as f64, "mrd" => b.bam_stats.global_coverage >= min_mrd_cov as f64, _ => false, }) // .filter(|b| match &b.cramino { // Some(cramino) => match b.time_point.as_str() { // "diag" => cramino.mean_length >= min_diag_cov as f64, // "mrd" => cramino.mean_length >= min_mrd_cov as f64, // _ => false, // }, // _ => false, // }) .cloned() .collect() } pub fn ids(&self) -> Vec { let mut ids: Vec = self.bams.iter().map(|b| b.id.clone()).collect(); ids.sort(); ids.dedup(); ids } } pub fn load_bam_collection(result_dir: &str) -> BamCollection { let pattern = format!("{}/*/*/*.bam", result_dir); let bams = glob(&pattern) .expect("Failed to read glob pattern.") // .par_bridge() .filter_map(|entry| { match entry { Ok(path) => match WGSBam::new(path) { Ok(bam) => return Some(bam), Err(e) => warn!("{e}"), }, Err(e) => warn!("Error: {:?}", e), } None }) .collect(); BamCollection { bams } } /// Computes the composition of `(RG, fn)` tag prefixes in a BAM file sample. /// /// This function parses a BAM file and, for a subset of records (`sample_size`), /// extracts the `RG` (read group) and `fn` (typically a file or flowcell ID) string tags from each record. /// It groups records by the prefix (before `_`, if present) of each tag, /// counts occurrences for each pair, and reports their proportion in the sample. /// /// # Arguments /// /// * `file_path` - Path to the BAM file. /// * `sample_size` - Maximum number of records to sample and process. /// /// # Returns /// /// Returns a vector of tuples, each containing: /// - `String`: the (up to the first underscore of the `RG` tag, or full tag if no underscore) /// - `String`: the (up to the first underscore of the `fn` tag, or full tag if no underscore) /// - `f64`: percentage of records with this `(runid, flowcell-id)` prefix pair, relative to total successfully processed records /// /// # Errors /// /// Returns an error if the BAM file cannot be opened or read. /// Any record in the sampled set does not have both required tags, or the tags are not strings. /// /// # Examples /// /// ```no_run /// # use anyhow::Result; /// let stats = bam_composition("reads.bam", 10_000)?; /// for (rg, flowcell, pct) in stats { /// println!("{} / {}: {:.2}%", rg, flowcell, pct); /// } /// # Ok::<(), anyhow::Error>(()) /// ``` /// /// # Notes /// /// - Only the first `sample_size` records in the BAM are processed. /// - Percentages are normalized by the number of records that had both required tags. /// - This function uses the `rust-htslib` crate for BAM parsing. /// /// # Tag requirements /// /// - Both `RG` and `fn` tags must be present and of string type. /// - Tag values are split on the first underscore; only the prefix is used for grouping. /// /// # See also /// /// - [SAM/BAM tag conventions](https://samtools.github.io/hts-specs/SAMv1.pdf) /// - [Dorado SAM specifications](https://github.com/nanoporetech/dorado/blob/release-v1.0/documentation/SAM.md) /// pub fn bam_composition( file_path: &str, sample_size: usize, ) -> anyhow::Result> { use std::collections::HashMap; use rust_htslib::bam::{Reader, record::Aux}; let mut bam = Reader::from_path(file_path)?; let mut rgs: HashMap<(String, String), u64> = HashMap::new(); let mut processed = 0u64; for (i, rec) in bam.records().filter_map(Result::ok).take(sample_size).enumerate() { let rg = match rec.aux(b"RG") { Ok(Aux::String(s)) => s, Ok(_) => return Err(anyhow::anyhow!("Record {}: RG tag is not a string", i + 1)), Err(_) => return Err(anyhow::anyhow!("Record {}: RG tag missing", i + 1)), }; let fn_tag = match rec.aux(b"fn") { Ok(Aux::String(b)) => b, Ok(_) => return Err(anyhow::anyhow!("Record {}: fn tag is not a string", i + 1)), Err(_) => return Err(anyhow::anyhow!("Record {}: fn tag missing", i + 1)), }; let rg_prefix = rg.split_once('_').map(|(a, _)| a).unwrap_or(rg); let fn_prefix = fn_tag.split_once('_').map(|(b, _)| b).unwrap_or(fn_tag); *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string())).or_default() += 1; processed += 1; } let results: Vec<_> = rgs .into_iter() .map(|((rg, fn_tag), n)| ( rg, fn_tag, n as f64 * 100.0 / processed as f64 )) .collect(); Ok(results) } pub fn bam_has_fc(bam: &WGSBam, flow_cell: &FlowCell) -> anyhow::Result { let mut bam = rust_htslib::bam::Reader::from_path(&bam.path)?; let mut rgs: HashMap = HashMap::new(); for result in bam.records().filter_map(Result::ok).take(2_000) { if let rust_htslib::bam::record::Aux::String(s) = result.aux(b"fn")? { *rgs.entry(s.to_string()).or_default() += 1; } } for k in rgs.keys() { if k.contains(&flow_cell.sample_sheet.flow_cell_id) { return Ok(true); } } Ok(false) } pub struct BamReadsSampler { pub positions: Vec<(String, u64)>, pub reader: rust_htslib::bam::IndexedReader, current_index: usize, } impl BamReadsSampler { pub fn new(path: &str, n: usize) -> anyhow::Result { debug!("Reading BAM file: {path}"); let reader = rust_htslib::bam::IndexedReader::from_path(path)?; let header = reader.header(); let contig_len: Vec<(String, u64)> = header .target_names() .into_iter() .map(|target_name| { let tid = header.tid(target_name).unwrap(); ( String::from_utf8(target_name.to_vec()).unwrap(), header.target_len(tid).unwrap(), ) }) .collect(); let positions = sample_random_positions(&contig_len, n); Ok(Self { positions, reader, current_index: 0, }) } } impl Iterator for BamReadsSampler { type Item = Result; fn next(&mut self) -> Option { loop { if self.current_index < self.positions.len() { let (contig, pos) = &self.positions[self.current_index]; match self.reader.fetch((contig, *pos, pos + 1)) { Ok(_) => (), Err(e) => warn!("Error while reading BAM {e}"), } let result = self.reader.records().next(); self.current_index += 1; if result.is_some() { return result; } } else { return None; } } } } pub fn n_mapped_reads(bam_path: &str) -> anyhow::Result { let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path)?; let mapped = bam .index_stats()? .into_iter() .filter(|(tid, _, _, _)| *tid < 24) .map(|(_, _, m, _)| m) .sum(); Ok(mapped) } // 0-based inclusif pub fn nt_pileup( bam: &mut rust_htslib::bam::IndexedReader, chr: &str, position: u32, with_next_ins: bool, ) -> anyhow::Result> { use rust_htslib::{bam, bam::Read}; let mut bases = Vec::new(); bam.fetch((chr, position, position + 1))?; let mut bam_pileup = Vec::new(); for p in bam.pileup() { let pileup = p.context(format!( "Can't pileup bam at position {}:{} (0-based)", chr, position ))?; let cur_position = pileup.pos(); if cur_position == position { for alignment in pileup.alignments() { match alignment.indel() { bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'), bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'), _ => { let record = alignment.record(); if record.seq_len() > 0 { if let Some(b) = base_at(&record, position, with_next_ins)? { bases.push(b); } } else if alignment.is_del() { bases.push(b'D'); } } } } } } Ok(bases) } pub fn base_at( record: &rust_htslib::bam::record::Record, at_pos: u32, with_next_ins: bool, ) -> anyhow::Result> { let cigar = record.cigar(); let seq = record.seq(); let pos = cigar.pos() as u32; let mut read_i = 0u32; // let at_pos = at_pos - 1; let mut ref_pos = pos; if ref_pos > at_pos { return Ok(None); } for (id, op) in cigar.iter().enumerate() { let (add_read, add_ref) = match *op { Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len), Cigar::Ins(len) => (len, 0), Cigar::Del(len) => (0, len), Cigar::RefSkip(len) => (0, len), Cigar::SoftClip(len) => (len, 0), Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0), }; // If at the end of the op len and next op is Ins return I if with_next_ins && ref_pos + add_read == at_pos + 1 { if let Some(Cigar::Ins(_)) = cigar.get(id + 1) { return Ok(Some(b'I')); } } if ref_pos + add_ref > at_pos { // Handle deletions directly if let Cigar::Del(_) = *op { return Ok(Some(b'D')); } else if let Cigar::RefSkip(_) = op { return Ok(None); } else { let diff = at_pos - ref_pos; let p = read_i + diff; return Ok(Some(seq[p as usize])); } } read_i += add_read; ref_pos += add_ref; } Ok(None) } pub fn counts_at( bam: &mut rust_htslib::bam::IndexedReader, chr: &str, position: u32, ) -> anyhow::Result> { let p = nt_pileup(bam, chr, position, false)? .iter() .map(|e| String::from_utf8(vec![*e]).unwrap()) .collect::>(); let mut counts = HashMap::new(); for item in p.iter() { *counts.entry(item.to_string()).or_insert(0) += 1; } Ok(counts) } pub fn ins_pileup( bam: &mut rust_htslib::bam::IndexedReader, chr: &str, position: u32, ) -> anyhow::Result> { let mut bases = Vec::new(); bam.fetch((chr, position, position + 10))?; for p in bam.pileup() { let pileup = p.context(format!( "Can't pileup bam at position {}:{} (0-based)", chr, position ))?; let cur_position = pileup.pos(); // Ins in the next position if cur_position == position + 1 { // debug!("{cur_position}"); for alignment in pileup.alignments() { let record = alignment.record(); if record.seq_len() > 0 { if let Some(b) = ins_at(&record, position)? { bases.push(b); } } } } } Ok(bases) } pub fn ins_at( record: &rust_htslib::bam::record::Record, at_pos: u32, ) -> anyhow::Result> { use rust_htslib::bam::record::Cigar; let cigar = record.cigar(); let seq = record.seq(); let pos = cigar.pos() as u32; let mut read_i = 0u32; let mut ref_pos = pos; if ref_pos > at_pos { return Ok(None); } // debug!( // "read: {}", // String::from_utf8(record.qname().to_vec()).unwrap() // ); for op in cigar.iter() { let (add_read, add_ref) = match *op { Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len), Cigar::Ins(len) => (len, 0), Cigar::Del(len) => (0, len), Cigar::RefSkip(len) => (0, len), Cigar::SoftClip(len) => (len, 0), Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0), }; if ref_pos + add_read > at_pos && ref_pos + add_read < at_pos + 10 { if let Cigar::Ins(v) = *op { // debug!( // "ins size {v} @ {} (corrected {})", // ref_pos + add_read, // (ref_pos + add_read) - v - 1 // ); if (ref_pos + add_read) - v - 1 == at_pos { let inserted_seq = seq.as_bytes()[read_i as usize..(read_i + v) as usize].to_vec(); return Ok(Some(String::from_utf8(inserted_seq)?)); } } } read_i += add_read; ref_pos += add_ref; } Ok(None) } pub fn counts_ins_at( bam: &mut rust_htslib::bam::IndexedReader, chr: &str, position: u32, ) -> anyhow::Result> { let p = ins_pileup(bam, chr, position)?; let mut counts = HashMap::new(); for item in p.iter() { *counts.entry(item.to_string()).or_insert(0) += 1; } Ok(counts) } pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Vec<(String, u64)> { let mut rng = rng(); let total_length: u64 = chromosomes.iter().map(|(_, length)| length).sum(); (0..n) .map(|_| { let random_position = rng.random_range(0..total_length); let mut cumulative_length = 0; for (chr, length) in chromosomes { cumulative_length += length; if random_position < cumulative_length { let position_within_chr = random_position - (cumulative_length - length); return (chr.clone(), position_within_chr); } } unreachable!("Should always find a chromosome") }) .collect() } /// High-level mapping statistics for a WGS BAM file. /// /// This struct aggregates essential quality control and mapping metrics, suitable for reporting or downstream QC. /// Unless otherwise specified, units are in base pairs (bp) or counts of reads. /// /// # Fields /// - `all_records`: Total number of records in the BAM, before any filtering. /// - `n_reads`: Number of primary, mapped, QC-passed, non-duplicate reads counted for statistics. /// - `mapped_fraction`: Fraction of mapped (counted) reads relative to all records. /// - `n_unmapped`: Number of unmapped reads (`BAM_FUNMAP`). /// - `n_duplicates`: Number of reads marked as duplicate (`BAM_FDUP`). /// - `mapped_yield`: Total sum of mapped read lengths (bp). /// - `mean_read_length`: Mean length of mapped reads (bp). /// - `median_read_length`: Median mapped read length (bp). /// - `coverage_stddev`: Standard deviation of per-read global coverage contribution. /// - `global_coverage`: Mean mapped yield divided by total genome size. /// - `karyotype`: Per-contig summary: (TID, contig length, contig name, mapped yield, coverage stddev). /// - `n50`: N50 statistic for mapped read lengths (bp). /// - `by_lengths`: Histogram of mapped read lengths as (length, count), sorted by length. /// /// # Filtering Rule /// Only records that are: /// - primary alignments (not secondary/supplementary), /// - mapped (`!BAM_FUNMAP`), /// - pass vendor quality checks (`!BAM_FQCFAIL`), /// - not marked as duplicate (`!BAM_FDUP`), /// - and with mapping quality ≥ `bam_min_mapq`, /// are counted as mapped reads. /// #[derive(Debug, Clone, Deserialize, Serialize)] pub struct WGSBamStats { /// Total number of BAM records (including unmapped, filtered, duplicates). pub all_records: u64, /// Number of mapped, primary, QC-passed, non-duplicate reads (final filtered count). pub n_reads: u64, /// Fraction of counted mapped reads over all BAM records. pub mapped_fraction: f64, /// Number of unmapped records (flag 0x4). pub n_unmapped: u64, /// Number of duplicate records (flag 0x400). pub n_duplicates: u64, /// Number of duplicate records (flag 0x400). pub n_lowq: u64, /// Total sum of mapped read lengths (bp). pub mapped_yield: u64, /// Mean mapped read length (bp). pub mean_read_length: f64, /// Median mapped read length (bp). pub median_read_length: u64, /// Standard deviation of per-read global coverage contributions. pub coverage_stddev: f64, /// Mean global coverage: `mapped_yield / genome_size`. pub global_coverage: f64, /// (tid, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev, coverage_ratio) pub karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)>, /// N50 value of mapped read lengths (bp). pub n50: u64, /// Histogram of mapped read lengths: (length, count), sorted by length. pub by_lengths: Vec<(u64, u32)>, } impl WGSBamStats { /// Computes `WGSBamStats` by scanning a BAM file and summarizing essential statistics. /// /// # Parameters /// - `case_id`: Sample/case identifier used to locate the BAM file via config. /// - `time`: Timestamp string for progress/logging. /// - `config`: Configuration struct. Must provide: /// - `solo_bam(case_id, time)`: path to BAM file. /// - `bam_min_mapq`: minimum mapping quality to include read. /// - `bam_n_threads`: number of BAM decompression threads. /// /// # Filtering Rule /// Only primary, mapped, non-duplicate, QC-passed reads with sufficient mapping quality are included (see struct doc). /// /// # Returns /// On success, returns a fully populated `WGSBamStats` with all core and extended fields. /// Returns an error if the BAM cannot be parsed or genome sizes cannot be retrieved. pub fn new(case_id: &str, time: &str, config: &Config) -> anyhow::Result { let bam_path = config.solo_bam(case_id, time); let bam_min_mapq = config.bam_min_mapq; let bam_n_threads = config.bam_n_threads; let mut bam = rust_htslib::bam::Reader::from_path(&bam_path)?; let h = bam.header().clone(); let header = rust_htslib::bam::Header::from_template(&h); bam.set_threads(bam_n_threads as usize)?; info!("Parsing BAM file: {bam_path}"); let mut all_records = 0u64; let mut n_reads = 0u64; let mut n_unmapped = 0u64; let mut n_duplicates = 0u64; let mut n_lowq = 0u64; let mut mapped_lengths = Vec::new(); let mut lengths_by_tid: HashMap> = HashMap::new(); // Main scan loop: apply flag and MAPQ filters. for rec in bam.rc_records() { all_records += 1; let r = rec.with_context(|| "failed to parse BAM record")?; let flags = r.flags(); // Count and exclude unmapped. if flags & (rust_htslib::htslib::BAM_FUNMAP as u16) != 0 { n_unmapped += 1; continue; } // Count and exclude duplicates. if flags & (rust_htslib::htslib::BAM_FDUP as u16) != 0 { n_duplicates += 1; continue; } // Filter by mapping quality. if r.mapq() < bam_min_mapq { n_lowq += 1; continue; } // Exclude secondary and QC-failed. let bad_flags = rust_htslib::htslib::BAM_FSECONDARY | rust_htslib::htslib::BAM_FQCFAIL; if (flags & (bad_flags as u16)) != 0 { continue; } n_reads += 1; let start = r.pos(); let end = r.reference_end(); let len = if end > start { (end - start) as u64 } else { 0 }; mapped_lengths.push(len); lengths_by_tid.entry(r.tid()).or_default().push(len); if n_reads % 500_000 == 0 { info!("{case_id} {time}: processed {n_reads} mapped reads"); } } let mapped_fraction = if all_records > 0 { n_reads as f64 / all_records as f64 } else { 0.0 }; // Compute mean, median, N50 (single sort). let mut lens = mapped_lengths.clone(); lens.sort_unstable(); let mean_read_length = if n_reads > 0 { mapped_lengths.iter().sum::() as f64 / n_reads as f64 } else { 0.0 }; let median_read_length = if lens.is_empty() { 0 } else { lens[lens.len() / 2] }; let mapped_yield: u64 = mapped_lengths.iter().sum(); // N50: minimal length L such that sum(lengths >= L) >= 50% yield. let mut cum = 0u64; let half = mapped_yield / 2; let n50 = lens .iter() .rev() .find_map(|&l| { cum += l; if cum >= half { Some(l) } else { None } }) .unwrap_or(0); // Histogram (sorted by length). let mut histo = BTreeMap::new(); for &len in &mapped_lengths { *histo.entry(len).or_insert(0) += 1; } let by_lengths: Vec<_> = histo.into_iter().collect(); // Genome/coverage. let genome = get_genome_sizes(&header)?; let genome_size: u64 = genome.values().sum(); let global_coverage = if genome_size > 0 { mapped_yield as f64 / genome_size as f64 } else { 0.0 }; // Global coverage stdev: stdev of per-read contributions to global coverage. let mut total_sq_cov = 0.0; for &len in &mapped_lengths { let cov = if genome_size > 0 { len as f64 / genome_size as f64 } else { 0.0 }; total_sq_cov += cov * cov; } let mean_cov = global_coverage; let variance = if n_reads > 0 { let var = total_sq_cov / n_reads as f64 - mean_cov.powi(2); if var < 0.0 { 0.0 } else { var } } else { 0.0 }; let coverage_stddev = variance.sqrt(); // Per-contig: (TID, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev , coverage_ratio). let mut karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)> = lengths_by_tid .iter() .map(|(tid, lengths)| { let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec()).unwrap(); let contig_len = genome.get(&contig).copied().unwrap_or(0); let mapped_sum: u64 = lengths.iter().sum(); // Per-contig mean coverage let mean_cov = if contig_len > 0 { mapped_sum as f64 / contig_len as f64 } else { 0.0 }; let coverage_ratio = if global_coverage > 0.0 { mean_cov / global_coverage } else { 0.0 }; // Per-contig coverage stdev (over all reads mapped to this contig) let var = if contig_len > 0 && !lengths.is_empty() { lengths .iter() .map(|&l| { let cov = l as f64 / contig_len as f64; (cov - mean_cov).powi(2) }) .sum::() / lengths.len() as f64 } else { 0.0 }; let stddev = var.sqrt(); ( *tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio, ) }) .collect(); karyotype.sort_unstable_by_key(|&(tid, _, _, _, _, _, _)| tid); Ok(Self { all_records, n_reads, mapped_fraction, n_unmapped, n_duplicates, mapped_yield, mean_read_length, median_read_length, coverage_stddev, global_coverage, karyotype, n50, by_lengths, n_lowq, }) } } impl fmt::Display for WGSBamStats { /// Formats the BAM statistics as a readable summary table. /// /// Shows all global stats and a per-contig breakdown with coverage stdev. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "BAM statistics summary:")?; writeln!(f, " All BAM records: {}", self.all_records)?; writeln!(f, " Unmapped reads: {}", self.n_unmapped)?; writeln!(f, " Duplicate reads: {}", self.n_duplicates)?; writeln!(f, " Low Q reads: {}", self.n_lowq)?; writeln!(f, " Counted (mapped) reads: {}", self.n_reads)?; writeln!(f, " Mapped fraction: {:.4}", self.mapped_fraction)?; writeln!( f, " Mapped yield [Gb]: {:.2}", self.mapped_yield as f64 / 1e9 )?; writeln!(f, " Mean read length [bp]: {:.2}", self.mean_read_length)?; writeln!(f, " Median read length [bp]: {}", self.median_read_length)?; writeln!(f, " N50 [bp]: {}", self.n50)?; writeln!(f, " Global mean coverage: {:.2}", self.global_coverage)?; writeln!(f, " Global coverage stdev: {:.2}", self.coverage_stddev)?; writeln!(f, " Per-contig stats:")?; writeln!( f, " {:<8} {:<10} {:<12} {:<14} {:<12} {:<10} {:<10}", "TID", "Length", "Name", "MappedYield", "CovMean", "CovStdev", "CovRatio" )?; for (tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio) in &self.karyotype { writeln!( f, " {:<8} {:<10} {:<12} {:<14} {:<12.4} {:.4} {:.2}", tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio )?; } Ok(()) } } pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result> { let mut genome = HashMap::new(); for (key, records) in header.to_hashmap() { for record in records { if key == "SQ" { genome.insert( record["SN"].to_string(), record["LN"] .parse::() .expect("Failed parsing length of chromosomes"), ); } } } Ok(genome) } /// Result of querying a read at a reference position. #[derive(Clone, Debug, Eq, PartialEq)] pub enum PileBase { A, C, G, T, N, Ins, // insertion immediately after the queried base Del((Vec, u32)), // deletion covering the queried base Skip, // reference skip (N) } /// Decode one HTSlib 4bit nucleotide (0=A,1=C,2=G,3=T,4=N, …) to `Base` #[inline] fn decode(b: u8) -> PileBase { match b & 0b1111 { 0 => PileBase::A, 1 => PileBase::C, 2 => PileBase::G, 3 => PileBase::T, _ => PileBase::N, } } pub fn base_at_new( record: &rust_htslib::bam::Record, at_pos: i64, with_next_ins: bool, ) -> Option { if record.pos() > at_pos { return None; } let mut ref_pos = record.pos(); let mut read_pos = 0_i64; let cigar = record.cigar(); let seq = record.seq().as_bytes(); // already expanded to ASCII (A,C,G,T,N) for (i, op) in cigar.iter().enumerate() { let l = op.len() as i64; let (consume_read, consume_ref) = match op.char() { 'M' | '=' | 'X' => (l, l), 'I' | 'S' => (l, 0), 'D' | 'N' => (0, l), 'H' | 'P' => (0, 0), _ => unreachable!(), }; /* look ahead for an insertion immediately AFTER the queried base */ if with_next_ins && ref_pos + consume_ref == at_pos + 1 && matches!(cigar.get(i + 1), Some(Cigar::Ins(_))) { return Some(PileBase::Ins); } // Instead of PileBase::Ins, consider // extracting the inserted sequence and returning its length or actual bases: // if let Some(Cigar::Ins(len)) = cigar.get(i + 1) { // let ins_seq = &seq[read_pos as usize + (consume_ref as usize) // ..read_pos as usize + (consume_ref as usize) + (*len as usize)]; // return Some(PileBase::Ins(ins_seq.to_vec())); // } /* Does the queried reference coordinate fall inside this CIGAR op? */ if ref_pos + consume_ref > at_pos { return Some(match op { Cigar::Del(e) => PileBase::Del((record.qname().to_vec(), *e)), Cigar::RefSkip(_) => PileBase::Skip, _ => { let offset = (at_pos - ref_pos) as usize; if read_pos as usize + offset >= seq.len() { return None; // out of range, malformed BAM } decode(seq[read_pos as usize + offset]) } }); } read_pos += consume_read; ref_pos += consume_ref; } None // beyond alignment } /// Return the pile‑up of **all reads** at `chr:pos` (0‑based, inclusive). /// /// * `bam` – an open [`rust_htslib::bam::IndexedReader`]. /// * `chr` / `pos` – reference contig name and coordinate. /// * `with_next_ins` – if `true`, report [`Base::Ins`] when an insertion starts /// **right after** the queried base. /// /// The function bounds the internal pile‑up depth to 10 000 reads to protect /// against malformed BAM files that could otherwise allocate unbounded memory. /// /// # Errors /// /// Propagates any I/O or parsing error from *rust‑htslib*, wrapped in /// [`anyhow::Error`]. pub fn nt_pileup_new( bam: &mut rust_htslib::bam::IndexedReader, chr: &str, pos: u32, with_next_ins: bool, ) -> anyhow::Result> { // Storage for the per‑read observations. let mut pile = Vec::new(); // Restrict BAM iterator to the one‑base span of interest. bam.fetch((chr, pos, pos + 1))?; // Hard cap on depth (adjust if your use‑case needs deeper coverage). bam.pileup().set_max_depth(10_000); // Stream the pileup; HTSlib walks the reads for us. for pileup in bam.pileup() { // Attach context to any low‑level error. let pileup = pileup.context(format!("Failed to pileup BAM at {chr}:{pos}"))?; // Skip other positions quickly. if pileup.pos() != pos { continue; } for aln in pileup.alignments() { // Handle the three indel states reported by HTSlib. match aln.indel() { rust_htslib::bam::pileup::Indel::Ins(_) => { pile.push(PileBase::Ins); // insertion *starts at* this base } rust_htslib::bam::pileup::Indel::Del(e) => { let qname = aln.record().qname().to_vec(); pile.push(PileBase::Del((qname, e))); // deletion length > 1, but still covers here } rust_htslib::bam::pileup::Indel::None => { // Regular match/mismatch: delegate to `base_at`. if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins) { pile.push(base); } } } } } Ok(pile) }