@@ -0,0 +1,910 @@
+use std::{
+    collections::BTreeMap,
+    fmt, fs,
+    hash::{Hash, Hasher},
+    io::{Read, Write},
+    path::{Path, PathBuf},
+};
+
+use anyhow::Context;
+use log::info;
+use rust_htslib::{
+    bam::{ext::BamRecordExtensions, record::Aux},
+    htslib::{BAM_FDUP, BAM_FQCFAIL, BAM_FSECONDARY, BAM_FSUPPLEMENTARY, BAM_FUNMAP},
+};
+use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
+use serde::{Deserialize, Serialize};
+
+use crate::config::Config;
+
+const SKIP_FLAGS: u16 = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FSUPPLEMENTARY) as u16;
+const UNMAP_FLAG: u16 = BAM_FUNMAP as u16;
+const DUP_FLAG: u16 = BAM_FDUP as u16;
+const LEN_BIN: u64 = 10; // Bin width for the read-length histogram, in bp (e.g. 1234 -> bin 1230)
+
+/// FxHash of raw bytes; fast and non-cryptographic, used only to key the
+/// in-memory per-RG / per-source-file accumulators.
+#[inline(always)]
+fn hash_bytes(bytes: &[u8]) -> u64 {
+    let mut hasher = FxHasher::default();
+    bytes.hash(&mut hasher);
+    hasher.finish()
+}
+
+/// Per-read-group summary (RG aux tag).
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct RGStats {
+    pub rg_id: String,
+    pub n_reads: u64,
+    pub mapped_yield: u64,
+    pub mean_read_length: f64,
+}
+
+/// Per-source-file summary (fn aux tag).
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct FnStats {
+    pub filename: String,
+    pub n_reads: u64,
+    pub mapped_yield: u64,
+    pub mean_read_length: f64,
+}
+
+/// Counts of set SAM FLAG bits across all records.
+#[derive(Debug, Clone, Default, Deserialize, Serialize)]
+pub struct FlagStats {
+    pub paired: u64,
+    pub proper_pair: u64,
+    pub unmap: u64,
+    pub munmap: u64,
+    pub reverse: u64,
+    pub mreverse: u64,
+    pub read1: u64,
+    pub read2: u64,
+    pub secondary: u64,
+    pub qcfail: u64,
+    pub dup: u64,
+    pub supplementary: u64,
+}
+
+impl FlagStats {
+    /// Increment every counter whose SAM FLAG bit is set (branchless: each
+    /// bit test is cast from bool to u64).
+    #[inline(always)]
+    fn update(&mut self, flags: u16) {
+        self.paired += ((flags & 0x1) != 0) as u64;
+        self.proper_pair += ((flags & 0x2) != 0) as u64;
+        self.unmap += ((flags & 0x4) != 0) as u64;
+        self.munmap += ((flags & 0x8) != 0) as u64;
+        self.reverse += ((flags & 0x10) != 0) as u64;
+        self.mreverse += ((flags & 0x20) != 0) as u64;
+        self.read1 += ((flags & 0x40) != 0) as u64;
+        self.read2 += ((flags & 0x80) != 0) as u64;
+        self.secondary += ((flags & 0x100) != 0) as u64;
+        self.qcfail += ((flags & 0x200) != 0) as u64;
+        self.dup += ((flags & 0x400) != 0) as u64;
+        self.supplementary += ((flags & 0x800) != 0) as u64;
+    }
+
+    fn as_vec(&self) -> Vec<(&'static str, u64)> {
+        vec![
+            ("PAIRED", self.paired),
+            ("PROPER_PAIR", self.proper_pair),
+            ("UNMAP", self.unmap),
+            ("MUNMAP", self.munmap),
+            ("REVERSE", self.reverse),
+            ("MREVERSE", self.mreverse),
+            ("READ1", self.read1),
+            ("READ2", self.read2),
+            ("SECONDARY", self.secondary),
+            ("QCFAIL", self.qcfail),
+            ("DUP", self.dup),
+            ("SUPPLEMENTARY", self.supplementary),
+        ]
+    }
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct WGSBamStats {
+    pub all_records: u64,
+    pub n_reads: u64,
+    pub mapped_fraction: f64,
+    pub n_unmapped: u64,
+    pub n_duplicates: u64,
+    pub n_lowq: u64,
+    pub mapped_yield: u64,
+    pub mean_read_length: f64,
+    pub median_read_length: u64,
+    pub global_coverage: f64,
+    /// Per-contig rows: (tid, contig_len, contig_name, mapped_yield, mean_cov, cov / global_cov)
+    pub karyotype: Vec<(i32, u64, String, u64, f64, f64)>,
+    pub n50: u64,
+    pub by_lengths: Vec<(u64, u64)>,
+    pub by_rg: Vec<RGStats>,
+    pub by_fn: Vec<FnStats>,
+    pub flag_stats: FlagStats,
+}
+
+impl WGSBamStats {
+    /// Open stats for a BAM path: load the JSON cache, or compute the stats
+    /// and QNAME set and cache both when the cache is missing or stale.
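+    ///
+    /// A minimal usage sketch (the BAM path is hypothetical; `Config` is this
+    /// crate's config type):
+    ///
+    /// ```ignore
+    /// let config = Config::default();
+    /// let stats = WGSBamStats::from_bam("/data/34528_norm_hs1.bam", &config)?;
+    /// println!("{stats}");
+    /// ```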
+    pub fn from_bam(bam_path: impl AsRef<Path>, config: &Config) -> anyhow::Result<Self> {
+        let bam_path = bam_path.as_ref();
+        let json_path = Self::json_path_from_bam(bam_path);
+
+        let should_recompute = if !json_path.exists() {
+            info!("No cached stats found, computing...");
+            true
+        } else if Self::is_bam_newer(bam_path, &json_path)? {
+            info!("BAM is newer than cached stats, recomputing...");
+            true
+        } else {
+            false
+        };
+
+        if should_recompute {
+            let (stats, qnames) = Self::from_bam_path(bam_path, config)?;
+
+            if let Some(parent) = json_path.parent() {
+                fs::create_dir_all(parent)?;
+            }
+
+            stats.save_json(&json_path)?;
+            info!("Saved stats to: {}", json_path.display());
+
+            let qnames_path = Self::qnames_path_from_bam(bam_path);
+            qnames.save(&qnames_path)?;
+            info!("Saved qnames to: {}", qnames_path.display());
+            Ok(stats)
+        } else {
+            info!("Loading cached stats from: {}", json_path.display());
+            Self::load_json(&json_path)
+        }
+    }
+
+    /// Cache-file prefix from the BAM stem, keeping the first two
+    /// '_'-separated parts, e.g. "34528_norm_hs1" -> "34528_norm".
+    fn stem_prefix(bam_path: &Path) -> String {
+        let stem = bam_path
+            .file_stem()
+            .map(|s| s.to_string_lossy())
+            .unwrap_or_default();
+        let parts: Vec<&str> = stem.split('_').collect();
+        if parts.len() >= 2 {
+            format!("{}_{}", parts[0], parts[1])
+        } else {
+            stem.to_string()
+        }
+    }
+
+    /// Derive the stats JSON path from a BAM path
+    /// e.g., /path/to/34528_norm_hs1.bam -> /path/to/34528_norm_bam_stats.json
+    fn json_path_from_bam(bam_path: &Path) -> PathBuf {
+        let dir = bam_path.parent().unwrap_or(Path::new("."));
+        dir.join(format!("{}_bam_stats.json", Self::stem_prefix(bam_path)))
+    }
+
+    /// Derive the QNAME cache path from a BAM path
+    /// e.g., /path/to/34528_norm_hs1.bam -> /path/to/34528_norm_bam_qnames.bin
+    pub fn qnames_path_from_bam(bam_path: &Path) -> PathBuf {
+        let dir = bam_path.parent().unwrap_or(Path::new("."));
+        dir.join(format!("{}_bam_qnames.bin", Self::stem_prefix(bam_path)))
+    }
+
+    /// Open stats from JSON cache, or compute and save if:
+    /// - JSON doesn't exist
+    /// - BAM is newer than JSON
+    pub fn open(case_id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
+        let bam_path = PathBuf::from(config.solo_bam(case_id, time));
+
+        Self::from_bam(&bam_path, config)
+    }
+
+    /// Check if BAM file is newer than JSON cache
+    fn is_bam_newer(bam_path: &Path, json_path: &Path) -> anyhow::Result<bool> {
+        let bam_modified = bam_path
+            .metadata()
+            .with_context(|| format!("Failed to get BAM metadata: {}", bam_path.display()))?
+            .modified()
+            .with_context(|| format!("Failed to get BAM modified time: {}", bam_path.display()))?;
+
+        let json_modified = json_path
+            .metadata()
+            .with_context(|| format!("Failed to get JSON metadata: {}", json_path.display()))?
+            .modified()
+            .with_context(|| {
+                format!("Failed to get JSON modified time: {}", json_path.display())
+            })?;
+
+        Ok(bam_modified > json_modified)
+    }
+
+    /// Get the JSON cache path for a case
+    fn json_path(case_id: &str, time: &str, config: &Config) -> PathBuf {
+        PathBuf::from(&config.result_dir)
+            .join(case_id)
+            .join(time)
+            .join(format!("{}_{}_bam_stats.json", case_id, time))
+    }
+
+    /// Get the QNAME cache path for a case
+    fn qnames_path(case_id: &str, time: &str, config: &Config) -> PathBuf {
+        PathBuf::from(&config.result_dir)
+            .join(case_id)
+            .join(time)
+            .join(format!("{}_{}_bam_qnames.bin", case_id, time))
+    }
+
+    /// Force recompute stats (ignore cache)
+    pub fn recompute(case_id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
+        info!("Recomputing stats for {} {}", case_id, time);
+        let bam_path = PathBuf::from(config.solo_bam(case_id, time));
+        let (stats, qnames) = Self::from_bam_path(&bam_path, config)?;
+        let json_path = Self::json_path(case_id, time, config);
+        let qnames_path = Self::qnames_path(case_id, time, config);
+
+        if let Some(parent) = json_path.parent() {
+            fs::create_dir_all(parent)?;
+        }
+
+        stats.save_json(&json_path)?;
+        info!("Saved stats to: {}", json_path.display());
+
+        qnames.save(&qnames_path)?;
+        info!("Saved qnames to: {}", qnames_path.display());
+        Ok(stats)
+    }
+
+    /// Save stats to JSON file
+    pub fn save_json(&self, path: impl AsRef<Path>) -> anyhow::Result<()> {
+        let file = fs::File::create(path.as_ref())
+            .with_context(|| format!("Failed to create JSON file: {}", path.as_ref().display()))?;
+        let writer = std::io::BufWriter::new(file);
+        serde_json::to_writer_pretty(writer, self)
+            .with_context(|| format!("Failed to write JSON to: {}", path.as_ref().display()))?;
+        Ok(())
+    }
+
+    /// Load stats from JSON file
+    pub fn load_json(path: impl AsRef<Path>) -> anyhow::Result<Self> {
+        let file = fs::File::open(path.as_ref())
+            .with_context(|| format!("Failed to open JSON file: {}", path.as_ref().display()))?;
+        let reader = std::io::BufReader::new(file);
+        let stats: Self = serde_json::from_reader(reader)
+            .with_context(|| format!("Failed to parse JSON from: {}", path.as_ref().display()))?;
+        Ok(stats)
+    }
+
+    /// Compute stats directly from BAM path
+    fn from_bam_path(bam_path: &Path, config: &Config) -> anyhow::Result<(Self, QNameSet)> {
+        use rust_htslib::bam::Read;
+
+        let bam_min_mapq = config.bam_min_mapq;
+        let bam_n_threads = config.bam_n_threads;
+
+        let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
+        let h = bam.header().clone();
+        let header = rust_htslib::bam::Header::from_template(&h);
+        bam.set_threads(bam_n_threads as usize)?;
+
+        info!("Parsing BAM file: {}", bam_path.display());
+
+        let mut all_records = 0u64;
+        let mut qnames = QNameSet::default();
+        let mut n_reads = 0u64;
+        let mut n_unmapped = 0u64;
+        let mut n_duplicates = 0u64;
+        let mut n_lowq = 0u64;
+
+        let mut flag_stats = FlagStats::default();
+        let mut length_hist: FxHashMap<u64, u64> = FxHashMap::default();
+        let mut yield_by_tid: FxHashMap<i32, u64> = FxHashMap::default();
+
+        // Hash -> (count, yield)
+        let mut rg_stats: FxHashMap<u64, (u64, u64)> = FxHashMap::default();
+        let mut fn_stats: FxHashMap<u64, (u64, u64)> = FxHashMap::default();
+        // Hash -> name (only store once)
+        let mut rg_names: FxHashMap<u64, String> = FxHashMap::default();
+        let mut fn_names: FxHashMap<u64, String> = FxHashMap::default();
+
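+        // Single pass over the BAM: every record is flag-counted, then
+        // unmapped/secondary/qcfail/supplementary, duplicate, and low-MAPQ
+        // records are dropped; surviving reads feed the length histogram,
+        // per-contig yield, and the per-RG / per-source-file accumulators.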
+        for rec in bam.rc_records() {
+            all_records += 1;
+            let r = rec.with_context(|| "failed to parse BAM record")?;
+            qnames.add_qname_bytes(r.qname());
+
+            let flags = r.flags();
+
+            flag_stats.update(flags);
+
+            if flags & SKIP_FLAGS != 0 {
+                n_unmapped += ((flags & UNMAP_FLAG) != 0) as u64;
+                continue;
+            }
+
+            if flags & DUP_FLAG != 0 {
+                n_duplicates += 1;
+                continue;
+            }
+
+            if r.mapq() < bam_min_mapq {
+                n_lowq += 1;
+                continue;
+            }
+
+            n_reads += 1;
+
+            // Reference span of the alignment, in bp
+            let len = {
+                let start = r.pos();
+                let end = r.reference_end();
+                if end > start {
+                    (end - start) as u64
+                } else {
+                    0
+                }
+            };
+
+            *length_hist.entry(len / LEN_BIN * LEN_BIN).or_insert(0) += 1;
+            *yield_by_tid.entry(r.tid()).or_insert(0) += len;
+
+            if let Ok(Aux::String(rg)) = r.aux(b"RG") {
+                let h = hash_bytes(rg.as_bytes());
+                let entry = rg_stats.entry(h).or_insert((0, 0));
+                entry.0 += 1;
+                entry.1 += len;
+                rg_names.entry(h).or_insert_with(|| rg.to_string());
+            }
+
+            if let Ok(Aux::String(fn_tag)) = r.aux(b"fn") {
+                let h = hash_bytes(fn_tag.as_bytes());
+                let entry = fn_stats.entry(h).or_insert((0, 0));
+                entry.0 += 1;
+                entry.1 += len;
+                fn_names.entry(h).or_insert_with(|| fn_tag.to_string());
+            }
+
+            if n_reads.is_multiple_of(500_000) {
+                info!("{}: processed {n_reads} mapped reads", bam_path.display());
+            }
+        }
+
+        // Yield from the binned histogram (bin starts x counts); with 10 bp
+        // bins this undercounts the exact yield by at most ~LEN_BIN per read.
+        let mapped_yield: u64 = length_hist.iter().map(|(len, count)| len * count).sum();
+
+        let mapped_fraction = if all_records > 0 {
+            n_reads as f64 / all_records as f64
+        } else {
+            0.0
+        };
+
+        let mean_read_length = if n_reads > 0 {
+            mapped_yield as f64 / n_reads as f64
+        } else {
+            0.0
+        };
+
+        // Convert to BTreeMap for median/N50 calculation
+        let sorted_hist: BTreeMap<u64, u64> = length_hist.into_iter().collect();
+        let median_read_length = median_from_hist(&sorted_hist, n_reads);
+        let n50 = n50_from_hist(&sorted_hist, mapped_yield);
+
+        let genome = get_genome_sizes(&header)?;
+        let genome_size: u64 = genome.values().sum();
+        let global_coverage = if genome_size > 0 {
+            mapped_yield as f64 / genome_size as f64
+        } else {
+            0.0
+        };
+
+        let mut karyotype: Vec<_> = yield_by_tid
+            .iter()
+            .map(|(tid, &mapped_sum)| {
+                let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec())
+                    .unwrap_or_else(|_| format!("tid_{}", tid));
+                let contig_len = genome.get(&contig).copied().unwrap_or(0);
+
+                let mean_cov = if contig_len > 0 {
+                    mapped_sum as f64 / contig_len as f64
+                } else {
+                    0.0
+                };
+
+                let coverage_ratio = if global_coverage > 0.0 {
+                    mean_cov / global_coverage
+                } else {
+                    0.0
+                };
+
+                (
+                    *tid,
+                    contig_len,
+                    contig,
+                    mapped_sum,
+                    mean_cov,
+                    coverage_ratio,
+                )
+            })
+            .collect();
+
+        karyotype.sort_unstable_by_key(|(tid, _, _, _, _, _)| *tid);
+
+        // Convert hash-based stats to final structs
+        let mut by_rg: Vec<RGStats> = rg_stats
+            .into_iter()
+            .map(|(h, (count, yield_sum))| {
+                let rg_id = rg_names
+                    .get(&h)
+                    .cloned()
+                    .unwrap_or_else(|| "unknown".into());
+                RGStats {
+                    rg_id,
+                    n_reads: count,
+                    mapped_yield: yield_sum,
+                    mean_read_length: if count > 0 {
+                        yield_sum as f64 / count as f64
+                    } else {
+                        0.0
+                    },
+                }
+            })
+            .collect();
+        by_rg.sort_by(|a, b| a.rg_id.cmp(&b.rg_id));
+
+        let mut by_fn: Vec<FnStats> = fn_stats
+            .into_iter()
+            .map(|(h, (count, yield_sum))| {
+                let filename = fn_names
+                    .get(&h)
+                    .cloned()
+                    .unwrap_or_else(|| "unknown".into());
+                FnStats {
+                    filename,
+                    n_reads: count,
+                    mapped_yield: yield_sum,
+                    mean_read_length: if count > 0 {
+                        yield_sum as f64 / count as f64
+                    } else {
+                        0.0
+                    },
+                }
+            })
+            .collect();
+        by_fn.sort_by(|a, b| a.filename.cmp(&b.filename));
+
+        let by_lengths: Vec<_> = sorted_hist.into_iter().collect();
+
+        Ok((
+            Self {
+                all_records,
+                n_reads,
+                mapped_fraction,
+                n_unmapped,
+                n_duplicates,
+                n_lowq,
+                mapped_yield,
+                mean_read_length,
+                median_read_length,
+                global_coverage,
+                karyotype,
+                n50,
+                by_lengths,
+                by_rg,
+                by_fn,
+                flag_stats,
+            },
+            qnames,
+        ))
+    }
+}
+
+/// Median mapped length from the binned histogram: the first bin at which the
+/// cumulative read count reaches n_reads / 2 (a binned approximation).
+pub fn median_from_hist(hist: &BTreeMap<u64, u64>, n_reads: u64) -> u64 {
+    if n_reads == 0 {
+        return 0;
+    }
+
+    let mid = n_reads / 2;
+    let mut cum = 0u64;
+
+    for (&len, &count) in hist {
+        cum += count;
+        if cum >= mid {
+            return len;
+        }
+    }
+
+    0
+}
+
+/// N50 from the binned histogram: scanning from the longest bin down, the bin
+/// length at which cumulative yield first reaches half the total mapped yield.
+pub fn n50_from_hist(hist: &BTreeMap<u64, u64>, mapped_yield: u64) -> u64 {
+    if mapped_yield == 0 {
+        return 0;
+    }
+
+    let half = mapped_yield / 2;
+    let mut cum = 0u64;
+
+    for (&len, &count) in hist.iter().rev() {
+        cum += len * count;
+        if cum >= half {
+            return len;
+        }
+    }
+
+    0
+}
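+
+// Illustrative check with synthetic numbers (not from the source): for bins
+// {100: 2, 200: 1, 300: 1}, n_reads = 4 and mapped_yield = 700. The median
+// scan stops at the 100 bin (cum 2 >= mid 2), and the N50 scan, walking from
+// the longest bin down, stops at 200 (300 + 200 = 500 >= 350); the tests at
+// the bottom of this file exercise the same numbers.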
+
+/// Contig name -> length map parsed from the header's @SQ records (SN/LN)
+fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<FxHashMap<String, u64>> {
+    let mut sizes = FxHashMap::default();
+    for (_, records) in header.to_hashmap() {
+        for record in records {
+            if let (Some(sn), Some(ln)) = (record.get("SN"), record.get("LN")) {
+                if let Ok(len) = ln.parse::<u64>() {
+                    sizes.insert(sn.clone(), len);
+                }
+            }
+        }
+    }
+    Ok(sizes)
+}
+
+/// Display a horizontal histogram with ASCII bars
+pub fn fmt_histogram(
+    f: &mut fmt::Formatter<'_>,
+    data: &[(u64, u64)],
+    title: &str,
+    max_width: usize,
+) -> fmt::Result {
+    if data.is_empty() {
+        return Ok(());
+    }
+
+    writeln!(f)?;
+    writeln!(f, " {}:", title)?;
+
+    let total: u64 = data.iter().map(|(_, c)| *c).sum();
+
+    // Show up to 25 raw bins as-is; otherwise regroup into ~20 display bins
+    let display_data: Vec<(String, u64, f64)> = if data.len() <= 25 {
+        data.iter()
+            .map(|(len, count)| {
+                let pct = 100.0 * *count as f64 / total as f64;
+                (format!("{:>6}", len), *count, pct)
+            })
+            .collect()
+    } else {
+        let min_len = data.first().map(|(l, _)| *l).unwrap_or(0);
+        let max_len = data.last().map(|(l, _)| *l).unwrap_or(0);
+        let bin_size = ((max_len - min_len) / 20).max(LEN_BIN);
+
+        let mut bins: Vec<(u64, u64, u64)> = Vec::new();
+        let mut current_start = min_len;
+        let mut current_count = 0u64;
+
+        for (len, count) in data {
+            // Flush the finished display bin, then skip over any empty ones
+            if *len >= current_start + bin_size && current_count > 0 {
+                bins.push((current_start, current_start + bin_size - 1, current_count));
+                current_start += bin_size;
+                current_count = 0;
+            }
+            while *len >= current_start + bin_size {
+                current_start += bin_size;
+            }
+            current_count += count;
+        }
+        if current_count > 0 {
+            bins.push((current_start, current_start + bin_size - 1, current_count));
+        }
+
+        bins.iter()
+            .map(|(start, end, count)| {
+                let pct = 100.0 * *count as f64 / total as f64;
+                (format!("{:>6}-{:<6}", start, end), *count, pct)
+            })
+            .collect()
+    };
+
+    let max_count_display = display_data.iter().map(|(_, c, _)| *c).max().unwrap_or(1);
+
+    for (label, count, pct) in &display_data {
+        let bar_len = (*count as f64 / max_count_display as f64 * max_width as f64) as usize;
+        let bar: String = "█".repeat(bar_len);
+        writeln!(
+            f,
+            " {} │{:<width$}│ {:>10} ({:>5.1}%)",
+            label,
+            bar,
+            count,
+            pct,
+            width = max_width
+        )?;
+    }
+
+    Ok(())
+}
+
+/// Truncate a string with a trailing ellipsis if too long (char-based, so
+/// non-ASCII names cannot panic on a byte boundary)
+fn truncate_str(s: &str, max_len: usize) -> String {
+    if s.chars().count() <= max_len {
+        s.to_string()
+    } else if max_len <= 3 {
+        s.chars().take(max_len).collect()
+    } else {
+        let head: String = s.chars().take(max_len - 3).collect();
+        format!("{}...", head)
+    }
+}
+
+impl fmt::Display for WGSBamStats {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "BAM statistics summary:")?;
+        writeln!(f, " All BAM records: {}", self.all_records)?;
+        writeln!(f, " Unmapped reads: {}", self.n_unmapped)?;
+        writeln!(f, " Duplicate reads: {}", self.n_duplicates)?;
+        writeln!(f, " Low MAPQ reads: {}", self.n_lowq)?;
+        writeln!(f, " Counted (mapped) reads: {}", self.n_reads)?;
+        writeln!(f, " Mapped fraction: {:.4}", self.mapped_fraction)?;
+        writeln!(
+            f,
+            " Mapped yield [Gb]: {:.2}",
+            self.mapped_yield as f64 / 1e9
+        )?;
+        writeln!(f, " Mean read length [bp]: {:.2}", self.mean_read_length)?;
+        writeln!(f, " Median read length [bp]: {}", self.median_read_length)?;
+        writeln!(f, " N50 [bp]: {}", self.n50)?;
+        writeln!(f, " Global mean coverage: {:.2}x", self.global_coverage)?;
+
+        // Flag stats
+        writeln!(f)?;
+        writeln!(f, " Flag statistics:")?;
+        let flag_data = self.flag_stats.as_vec();
+        let max_flag = flag_data.iter().map(|(_, c)| *c).max().unwrap_or(1);
+        for (name, count) in &flag_data {
+            let pct = if self.all_records > 0 {
+                100.0 * *count as f64 / self.all_records as f64
+            } else {
+                0.0
+            };
+            let bar_len = (*count as f64 / max_flag as f64 * 30.0) as usize;
+            let bar: String = "█".repeat(bar_len);
+            writeln!(
+                f,
+                " {:<14} │{:<30}│ {:>12} ({:>6.2}%)",
+                name, bar, count, pct
+            )?;
+        }
+
+        // Read groups
+        writeln!(f)?;
+        writeln!(f, " Read groups ({}):", self.by_rg.len())?;
+        if !self.by_rg.is_empty() {
+            let max_rg = self.by_rg.iter().map(|r| r.n_reads).max().unwrap_or(1);
+            let max_rg_len = self.by_rg.iter().map(|r| r.rg_id.len()).max().unwrap_or(40);
+
+            writeln!(
+                f,
+                " {:<width$} {:>12} {:>14} {:>10}",
+                "RG",
+                "Reads",
+                "Yield [Mb]",
+                "MeanLen",
+                width = max_rg_len
+            )?;
+            for rg in &self.by_rg {
+                let bar_len = (rg.n_reads as f64 / max_rg as f64 * 20.0) as usize;
+                let bar: String = "▓".repeat(bar_len);
+                writeln!(
+                    f,
+                    " {:<width$} {:>12} {:>14.2} {:>10.1} │{:<20}│",
+                    rg.rg_id,
+                    rg.n_reads,
+                    rg.mapped_yield as f64 / 1e6,
+                    rg.mean_read_length,
+                    bar,
+                    width = max_rg_len
+                )?;
+            }
+        }
+
+        // Source files
+        writeln!(f)?;
+        writeln!(f, " Source files ({}):", self.by_fn.len())?;
+        if !self.by_fn.is_empty() {
+            let max_fn = self.by_fn.iter().map(|r| r.n_reads).max().unwrap_or(1);
+            writeln!(
+                f,
+                " {:<30} {:>12} {:>14} {:>10}",
+                "Filename", "Reads", "Yield [Mb]", "MeanLen"
+            )?;
+            for fn_stat in &self.by_fn {
+                let bar_len = (fn_stat.n_reads as f64 / max_fn as f64 * 20.0) as usize;
+                let bar: String = "▓".repeat(bar_len);
+                writeln!(
+                    f,
+                    " {:<30} {:>12} {:>14.2} {:>10.1} │{:<20}│",
+                    truncate_str(&fn_stat.filename, 30),
+                    fn_stat.n_reads,
+                    fn_stat.mapped_yield as f64 / 1e6,
+                    fn_stat.mean_read_length,
+                    bar
+                )?;
+            }
+        }
+
+        // Length histogram
+        fmt_histogram(f, &self.by_lengths, "Read length distribution [bp]", 40)?;
+
+        // Per-contig stats
+        writeln!(f)?;
+        writeln!(f, " Per-contig stats:")?;
+        if !self.karyotype.is_empty() {
+            // Separate chrM from rest
+            let (chrm, regular): (Vec<_>, Vec<_>) = self
+                .karyotype
+                .iter()
+                .partition(|(_, _, contig, _, _, _)| contig == "chrM");
+
+            // Max coverage excluding chrM for histogram scaling
+            let max_cov = regular
+                .iter()
+                .map(|(_, _, _, _, c, _)| *c)
+                .fold(0.0f64, |a, b| a.max(b));
+
+            writeln!(
+                f,
+                " {:<6} {:>12} {:<12} {:>14} {:>8}",
+                "TID", "Length", "Name", "MappedYield", "Cov"
+            )?;
+
+            // Display regular contigs with histogram
+            for (tid, contig_len, contig, mapped_sum, mean_cov, _) in &regular {
+                let bar_len = if max_cov > 0.0 {
+                    (*mean_cov / max_cov * 20.0) as usize
+                } else {
+                    0
+                };
+                let bar: String = "▒".repeat(bar_len);
+                writeln!(
+                    f,
+                    " {:<6} {:>12} {:<12} {:>14} {:>7.1}x │{:<20}│",
+                    tid,
+                    contig_len,
+                    truncate_str(contig, 12),
+                    mapped_sum,
+                    mean_cov,
+                    bar
+                )?;
+            }
+
+            // Display chrM without histogram bar
+            for (tid, contig_len, contig, mapped_sum, mean_cov, _) in &chrm {
+                writeln!(
+                    f,
+                    " {:<6} {:>12} {:<12} {:>14} {:>7.1}x",
+                    tid,
+                    contig_len,
+                    truncate_str(contig, 12),
+                    mapped_sum,
+                    mean_cov,
+                )?;
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Set of read QNAMEs, each stored as the 16 raw bytes of its UUID.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct QNameSet {
+    pub qnames: FxHashSet<Vec<u8>>, // 16 raw UUID bytes each
+}
+
+impl QNameSet {
+    /// Load QNAMEs from a BAM file into memory (no stats, no disk write)
+    pub fn from_bam_in_memory(
+        bam_path: impl AsRef<Path>,
+        bam_min_mapq: u8,
+        n_threads: usize,
+    ) -> anyhow::Result<Self> {
+        use rust_htslib::bam::Read;
+
+        let mut bam = rust_htslib::bam::Reader::from_path(bam_path.as_ref())?;
+        bam.set_threads(n_threads)?;
+        let mut qs = Self::default();
+
+        for rec in bam.rc_records() {
+            let r = rec?;
+            let flags = r.flags();
+            // Same filters as the stats pass: skip flagged, duplicate,
+            // and low-MAPQ records
+            if flags & SKIP_FLAGS != 0 || flags & DUP_FLAG != 0 || r.mapq() < bam_min_mapq {
+                continue;
+            }
+            qs.add_qname_bytes(r.qname());
+        }
+        Ok(qs)
+    }
+
+    /// Add a QNAME given as a hex or dashed UUID string, stored as 16 bytes.
+    /// Returns true only when a well-formed UUID was newly inserted.
+    pub fn add(&mut self, qname: &str) -> bool {
+        let clean = qname.replace('-', "");
+        if clean.len() != 32 {
+            return false;
+        }
+        match hex::decode(&clean) {
+            Ok(bytes) => self.qnames.insert(bytes),
+            Err(_) => false,
+        }
+    }
+
+    /// Add a QNAME directly from a BAM record (`&[u8]`), expected to be an ASCII UUID
+    pub fn add_qname_bytes(&mut self, qname: &[u8]) -> bool {
+        match std::str::from_utf8(qname) {
+            Ok(s) => self.add(s), // reuses the hex/dash parsing in add()
+            Err(_) => false,
+        }
+    }
+
+    /// Save raw bytes (binary format), 16 bytes per QNAME
+    pub fn save(&self, path: impl AsRef<Path>) -> anyhow::Result<()> {
+        let file = fs::File::create(path.as_ref()).context("write QNameSet")?;
+        let mut w = std::io::BufWriter::new(file);
+        for q in &self.qnames {
+            w.write_all(q)?;
+        }
+        w.flush()?;
+        Ok(())
+    }
+
+    /// Load raw bytes, assuming 16 bytes per QNAME
+    pub fn load(path: impl AsRef<Path>) -> anyhow::Result<Self> {
+        let mut buf = Vec::new();
+        fs::File::open(path.as_ref())?.read_to_end(&mut buf)?;
+        let mut qs = Self::default();
+
+        // chunks_exact(16) silently drops any trailing partial record
+        for chunk in buf.chunks_exact(16) {
+            qs.qnames.insert(chunk.to_vec());
+        }
+
+        Ok(qs)
+    }
+
+    pub fn exists(&self, qname: &str) -> bool {
+        let clean = qname.replace('-', "");
+        if let Ok(bytes) = hex::decode(&clean) {
+            self.qnames.contains(&bytes)
+        } else {
+            false
+        }
+    }
+
+    pub fn len(&self) -> usize {
+        self.qnames.len()
+    }
+
+    pub fn clear(&mut self) {
+        self.qnames.clear()
+    }
+
+    /// Intersect two sets and return:
+    /// (intersection_set, overlapping_fraction)
+    pub fn intersect(&self, other: &Self) -> (Self, f64) {
+        let mut inter = Self::default();
+        for q in &self.qnames {
+            if other.qnames.contains(q) {
+                inter.qnames.insert(q.clone());
+            }
+        }
+
+        // The fraction is relative to self, so the operation is asymmetric
+        let frac = if self.qnames.is_empty() {
+            0.0
+        } else {
+            inter.qnames.len() as f64 / self.qnames.len() as f64
+        };
+
+        (inter, frac)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::helpers::test_init;
+
+    #[test]
+    fn bam_stats() -> anyhow::Result<()> {
+        test_init();
+
+        let config = Config::default();
+        let stats = WGSBamStats::open("36167", "norm", &config)?;
+        println!("{stats}");
+
+        Ok(())
+    }
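+
+    // Added checks with synthetic numbers (not from the source); the bins
+    // mirror the worked example above `n50_from_hist`.
+    #[test]
+    fn median_and_n50_from_hist_small() {
+        let hist: BTreeMap<u64, u64> = [(100, 2), (200, 1), (300, 1)].into_iter().collect();
+        // 4 reads, mapped_yield = 2*100 + 200 + 300 = 700
+        assert_eq!(median_from_hist(&hist, 4), 100);
+        assert_eq!(n50_from_hist(&hist, 700), 200);
+        // Empty histograms return 0
+        assert_eq!(median_from_hist(&BTreeMap::new(), 0), 0);
+        assert_eq!(n50_from_hist(&BTreeMap::new(), 0), 0);
+    }
+
+    #[test]
+    fn qname_set_add_and_intersect() {
+        let mut a = QNameSet::default();
+        // Dashed and undashed forms of the same UUID collapse to one entry
+        assert!(a.add("550e8400-e29b-41d4-a716-446655440000"));
+        assert!(!a.add("550e8400e29b41d4a716446655440000"));
+        assert!(!a.add("not-a-uuid"));
+        assert_eq!(a.len(), 1);
+        assert!(a.exists("550e8400e29b41d4a716446655440000"));
+
+        let (inter, frac) = a.intersect(&a);
+        assert_eq!(inter.len(), 1);
+        assert!((frac - 1.0).abs() < f64::EPSILON);
+    }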
+}
|