|
|
@@ -1,5 +1,6 @@
|
|
|
+use core::fmt;
|
|
|
use std::{
|
|
|
- collections::HashMap,
|
|
|
+ collections::{BTreeMap, HashMap},
|
|
|
fs::{self, File},
|
|
|
path::PathBuf,
|
|
|
};
|
|
|
@@ -116,14 +117,14 @@ impl WGSBam {
|
|
|
// None
|
|
|
// };
|
|
|
|
|
|
- let composition = bam_compo(path.to_string_lossy().as_ref(), 20_000).context(format!(
|
|
|
+ let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(format!(
|
|
|
"Error while reading BAM composition for {}",
|
|
|
path.display()
|
|
|
))?;
|
|
|
|
|
|
let s = Self {
|
|
|
path,
|
|
|
- bam_stats: WGSBamStats::new(&id, &time_point, Config::default())?,
|
|
|
+ bam_stats: WGSBamStats::new(&id, &time_point, &Config::default())?,
|
|
|
// cramino,
|
|
|
id: id.to_string(),
|
|
|
time_point: time_point.to_string(),
|
|
|
@@ -229,39 +230,92 @@ pub fn load_bam_collection(result_dir: &str) -> BamCollection {
|
|
|
BamCollection { bams }
|
|
|
}
|
|
|
|
|
|
-pub fn bam_compo(
|
|
|
+/// Computes the composition of `(RG, fn)` tag prefixes in a BAM file sample.
|
|
|
+///
|
|
|
+/// This function parses a BAM file and, for a subset of records (`sample_size`),
|
|
|
+/// extracts the `RG` (read group) and `fn` (typically a file or flowcell ID) string tags from each record.
|
|
|
+/// It groups records by the prefix (before `_`, if present) of each tag,
|
|
|
+/// counts occurrences for each pair, and reports their proportion in the sample.
|
|
|
+///
|
|
|
+/// # Arguments
|
|
|
+///
|
|
|
+/// * `file_path` - Path to the BAM file.
|
|
|
+/// * `sample_size` - Maximum number of records to sample and process.
|
|
|
+///
|
|
|
+/// # Returns
|
|
|
+///
|
|
|
+/// Returns a vector of tuples, each containing:
|
|
|
+/// - `String`: the prefix of the `RG` tag (up to the first underscore, or full tag if no underscore)
|
|
|
+/// - `String`: the prefix of the `fn` tag (similarly processed)
|
|
|
+/// - `f64`: percentage of records with this `(RG, fn)` prefix pair, relative to total successfully processed records
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if the BAM file cannot be opened or read.
|
|
|
+/// Records missing the `RG` or `fn` tag (or with wrong type) are silently skipped.
|
|
|
+///
|
|
|
+/// # Examples
|
|
|
+///
|
|
|
+/// ```ignore
|
|
|
+/// # use anyhow::Result;
|
|
|
+/// let stats = bam_composition("reads.bam", 10_000)?;
|
|
|
+/// for (rg, flowcell, pct) in stats {
|
|
|
+/// println!("{} / {}: {:.2}%", rg, flowcell, pct);
|
|
|
+/// }
|
|
|
+/// # Ok::<(), anyhow::Error>(())
|
|
|
+/// ```
|
|
|
+///
|
|
|
+/// # Notes
|
|
|
+///
|
|
|
+/// - Only the first `sample_size` records in the BAM are processed.
|
|
|
+/// - Percentages are normalized by the number of records that had both required tags.
|
|
|
+/// - This function uses the `rust-htslib` crate for BAM parsing.
|
|
|
+///
|
|
|
+/// # Tag requirements
|
|
|
+///
|
|
|
+/// - Both `RG` and `fn` tags must be present and of string type.
|
|
|
+/// - Tag values are split on the first underscore; only the prefix is used for grouping.
|
|
|
+///
|
|
|
+/// # See also
|
|
|
+///
|
|
|
+/// - [SAM/BAM tag conventions](https://samtools.github.io/hts-specs/SAMv1.pdf)
|
|
|
+///
|
|
|
+pub fn bam_composition(
|
|
|
file_path: &str,
|
|
|
sample_size: usize,
|
|
|
) -> anyhow::Result<Vec<(String, String, f64)>> {
|
|
|
- let mut bam = rust_htslib::bam::Reader::from_path(file_path)
|
|
|
- .map_err(|e| anyhow::anyhow!("Failed to open bam from path: {file_path}\n\t{e}"))?;
|
|
|
+ use std::collections::HashMap;
|
|
|
+ use rust_htslib::bam::{Reader, record::Aux};
|
|
|
|
|
|
+ let mut bam = Reader::from_path(file_path)?;
|
|
|
let mut rgs: HashMap<(String, String), u64> = HashMap::new();
|
|
|
- for result in bam.records().filter_map(Result::ok).take(sample_size) {
|
|
|
- if let (
|
|
|
- rust_htslib::bam::record::Aux::String(s),
|
|
|
- rust_htslib::bam::record::Aux::String(b),
|
|
|
- ) = (result.aux(b"RG")?, result.aux(b"fn")?)
|
|
|
- {
|
|
|
- *rgs.entry((s.to_string(), b.to_string())).or_default() += 1;
|
|
|
- }
|
|
|
+ let mut total = 0u64;
|
|
|
+
|
|
|
+ for rec in bam.records().filter_map(Result::ok).take(sample_size) {
|
|
|
+ let rg = match rec.aux(b"RG") {
|
|
|
+ Ok(Aux::String(s)) => s,
|
|
|
+ _ => continue,
|
|
|
+ };
|
|
|
+ let fn_tag = match rec.aux(b"fn") {
|
|
|
+ Ok(Aux::String(b)) => b,
|
|
|
+ _ => continue,
|
|
|
+ };
|
|
|
+ let rg_prefix = rg.split_once('_').map(|(a, _)| a).unwrap_or(rg);
|
|
|
+ let fn_prefix = fn_tag.split_once('_').map(|(b, _)| b).unwrap_or(fn_tag);
|
|
|
+ *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string())).or_default() += 1;
|
|
|
+ total += 1;
|
|
|
}
|
|
|
|
|
|
- Ok(rgs
|
|
|
+ let results: Vec<_> = rgs
|
|
|
.into_iter()
|
|
|
- .map(|(k, n)| {
|
|
|
- (
|
|
|
- k.0.to_string(),
|
|
|
- k.1.to_string(),
|
|
|
- n as f64 * 100.0 / sample_size as f64,
|
|
|
- )
|
|
|
- })
|
|
|
- .map(|(rg, f, p)| {
|
|
|
- let (a, _) = rg.split_once('_').unwrap();
|
|
|
- let (b, _) = f.split_once('_').unwrap();
|
|
|
- (a.to_string(), b.to_string(), p)
|
|
|
- })
|
|
|
- .collect())
|
|
|
+ .map(|((rg, fn_tag), n)| (
|
|
|
+ rg,
|
|
|
+ fn_tag,
|
|
|
+ n as f64 * 100.0 / total as f64
|
|
|
+ ))
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ Ok(results)
|
|
|
}
|
|
|
|
|
|
pub fn bam_has_fc(bam: &WGSBam, flow_cell: &FlowCell) -> anyhow::Result<bool> {
|
|
|
@@ -576,161 +630,324 @@ pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Ve
|
|
|
.collect()
|
|
|
}
|
|
|
|
|
|
+/// High-level mapping statistics for a WGS BAM file.
|
|
|
+///
|
|
|
+/// This struct aggregates essential quality control and mapping metrics, suitable for reporting or downstream QC.
|
|
|
+/// Unless otherwise specified, units are in base pairs (bp) or counts of reads.
|
|
|
+///
|
|
|
+/// # Fields
|
|
|
+/// - `all_records`: Total number of records in the BAM, before any filtering.
|
|
|
+/// - `n_reads`: Number of primary, mapped, QC-passed, non-duplicate reads counted for statistics.
|
|
|
+/// - `mapped_fraction`: Fraction of mapped (counted) reads relative to all records.
|
|
|
+/// - `n_unmapped`: Number of unmapped reads (`BAM_FUNMAP`).
|
|
|
+/// - `n_duplicates`: Number of reads marked as duplicate (`BAM_FDUP`).
|
|
|
+/// - `mapped_yield`: Total sum of mapped read lengths (bp).
|
|
|
+/// - `mean_read_length`: Mean length of mapped reads (bp).
|
|
|
+/// - `median_read_length`: Median mapped read length (bp).
|
|
|
+/// - `coverage_stddev`: Standard deviation of per-read global coverage contribution.
|
|
|
+/// - `global_coverage`: Mean mapped yield divided by total genome size.
|
|
|
+/// - `karyotype`: Per-contig summary: (TID, contig length, contig name, mapped yield, mean coverage, coverage stddev, coverage ratio).
|
|
|
+/// - `n50`: N50 statistic for mapped read lengths (bp).
|
|
|
+/// - `by_lengths`: Histogram of mapped read lengths as (length, count), sorted by length.
|
|
|
+///
|
|
|
+/// # Filtering Rule
|
|
|
+/// Only records that are:
|
|
|
+/// - not secondary alignments (`!BAM_FSECONDARY`; supplementary alignments are currently not excluded),
|
|
|
+/// - mapped (`!BAM_FUNMAP`),
|
|
|
+/// - pass vendor quality checks (`!BAM_FQCFAIL`),
|
|
|
+/// - not marked as duplicate (`!BAM_FDUP`),
|
|
|
+/// - and with mapping quality ≥ `bam_min_mapq`,
|
|
|
+/// are counted as mapped reads.
|
|
|
+///
|
|
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
|
|
pub struct WGSBamStats {
|
|
|
- pub all_records: u128,
|
|
|
- pub n_reads: u128,
|
|
|
- pub mapped_yield: u128,
|
|
|
+ /// Total number of BAM records (including unmapped, filtered, duplicates).
|
|
|
+ pub all_records: u64,
|
|
|
+ /// Number of mapped, primary, QC-passed, non-duplicate reads (final filtered count).
|
|
|
+ pub n_reads: u64,
|
|
|
+ /// Fraction of counted mapped reads over all BAM records.
|
|
|
+ pub mapped_fraction: f64,
|
|
|
+ /// Number of unmapped records (flag 0x4).
|
|
|
+ pub n_unmapped: u64,
|
|
|
+ /// Number of duplicate records (flag 0x400).
|
|
|
+ pub n_duplicates: u64,
|
|
|
+ /// Number of reads excluded for mapping quality below `bam_min_mapq`.
|
|
|
+ pub n_lowq: u64,
|
|
|
+ /// Total sum of mapped read lengths (bp).
|
|
|
+ pub mapped_yield: u64,
|
|
|
+ /// Mean mapped read length (bp).
|
|
|
+ pub mean_read_length: f64,
|
|
|
+ /// Median mapped read length (bp).
|
|
|
+ pub median_read_length: u64,
|
|
|
+ /// Standard deviation of per-read global coverage contributions.
|
|
|
+ pub coverage_stddev: f64,
|
|
|
+ /// Mean global coverage: `mapped_yield / genome_size`.
|
|
|
pub global_coverage: f64,
|
|
|
- pub karyotype: Vec<(i32, u64, String, u128)>,
|
|
|
- pub n50: u128,
|
|
|
- pub by_lengths: Vec<(u128, u32)>,
|
|
|
+ /// (tid, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev, coverage_ratio)
|
|
|
+ pub karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)>,
|
|
|
+ /// N50 value of mapped read lengths (bp).
|
|
|
+ pub n50: u64,
|
|
|
+ /// Histogram of mapped read lengths: (length, count), sorted by length.
|
|
|
+ pub by_lengths: Vec<(u64, u32)>,
|
|
|
}
|
|
|
|
|
|
impl WGSBamStats {
|
|
|
- /// Creates a new `WGSBamStats` by scanning the BAM file for a given case and time,
|
|
|
- /// computing mapping statistics and coverage.
|
|
|
+ /// Computes `WGSBamStats` by scanning a BAM file and summarizing essential statistics.
|
|
|
///
|
|
|
/// # Parameters
|
|
|
+ /// - `case_id`: Sample/case identifier used to locate the BAM file via config.
|
|
|
+ /// - `time`: Timestamp string for progress/logging.
|
|
|
+ /// - `config`: Configuration struct. Must provide:
|
|
|
+ /// - `solo_bam(case_id, time)`: path to BAM file.
|
|
|
+ /// - `bam_min_mapq`: minimum mapping quality to include read.
|
|
|
+ /// - `bam_n_threads`: number of BAM decompression threads.
|
|
|
///
|
|
|
- /// - `case_id`: Identifier for the sample/case, used to locate the BAM via `config.solo_bam`.
|
|
|
- /// - `time`: Timestamp string included in progress log messages.
|
|
|
- /// - `config`: Configuration struct providing:
|
|
|
- /// - `solo_bam(case_id, time)`: path to the BAM file,
|
|
|
- /// - `bam_min_mapq`: minimum mapping quality filter,
|
|
|
- /// - `bam_n_threads`: number of threads for decompression.
|
|
|
+ /// # Filtering Rule
|
|
|
+ /// Only primary, mapped, non-duplicate, QC-passed reads with sufficient mapping quality are included (see struct doc).
|
|
|
///
|
|
|
/// # Returns
|
|
|
- ///
|
|
|
- /// A populated `YourStruct` with fields:
|
|
|
- /// - `all_records`: total BAM records before filtering,
|
|
|
- /// - `n_reads`: number of reads that passed all filters,
|
|
|
- /// - `mapped_yield`: total sum of mapped read lengths,
|
|
|
- /// - `global_coverage`: `mapped_yield / genome_size`,
|
|
|
- /// - `karyotype`: vector of `(tid, contig_len, contig_name, mapped_sum)` per contig,
|
|
|
- /// - `n50`: N50 of the mapped lengths,
|
|
|
- /// - `by_lengths`: histogram of mapped-read lengths as `(length, count)`.
|
|
|
- ///
|
|
|
- /// # Errors
|
|
|
- ///
|
|
|
- /// Returns an error if:
|
|
|
- /// - the BAM file cannot be opened or parsed,
|
|
|
- /// - genome sizes cannot be retrieved.
|
|
|
- pub fn new(case_id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
|
|
|
+ /// On success, returns a fully populated `WGSBamStats` with all core and extended fields.
|
|
|
+ /// Returns an error if the BAM cannot be parsed or genome sizes cannot be retrieved.
|
|
|
+ pub fn new(case_id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
|
|
|
let bam_path = config.solo_bam(case_id, time);
|
|
|
- let Config {
|
|
|
- bam_min_mapq,
|
|
|
- bam_n_threads,
|
|
|
- ..
|
|
|
- } = config;
|
|
|
- let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
|
|
|
+ let bam_min_mapq = config.bam_min_mapq;
|
|
|
+ let bam_n_threads = config.bam_n_threads;
|
|
|
+
|
|
|
+ let mut bam = rust_htslib::bam::Reader::from_path(&bam_path)?;
|
|
|
let h = bam.header().clone();
|
|
|
let header = rust_htslib::bam::Header::from_template(&h);
|
|
|
bam.set_threads(bam_n_threads as usize)?;
|
|
|
|
|
|
- // 2) Sequential scan
|
|
|
- let mut all_records = 0;
|
|
|
- let mut n_reads = 0;
|
|
|
+ info!("Parsing BAM file: {bam_path}");
|
|
|
+
|
|
|
+ let mut all_records = 0u64;
|
|
|
+ let mut n_reads = 0u64;
|
|
|
+ let mut n_unmapped = 0u64;
|
|
|
+ let mut n_duplicates = 0u64;
|
|
|
+ let mut n_lowq = 0u64;
|
|
|
let mut mapped_lengths = Vec::new();
|
|
|
- let mut lengths_by_tid: HashMap<i32, Vec<u128>> = HashMap::new();
|
|
|
+ let mut lengths_by_tid: HashMap<i32, Vec<u64>> = HashMap::new();
|
|
|
|
|
|
+ // Main scan loop: apply flag and MAPQ filters.
|
|
|
for rec in bam.rc_records() {
|
|
|
all_records += 1;
|
|
|
let r = rec.with_context(|| "failed to parse BAM record")?;
|
|
|
|
|
|
- if (r.flags()
|
|
|
- & (rust_htslib::htslib::BAM_FUNMAP
|
|
|
- | rust_htslib::htslib::BAM_FSECONDARY
|
|
|
- | rust_htslib::htslib::BAM_FQCFAIL
|
|
|
- | rust_htslib::htslib::BAM_FDUP) as u16
|
|
|
- == 0)
|
|
|
- || r.mapq() < bam_min_mapq
|
|
|
- {
|
|
|
+ let flags = r.flags();
|
|
|
+ // Count and exclude unmapped.
|
|
|
+ if flags & (rust_htslib::htslib::BAM_FUNMAP as u16) != 0 {
|
|
|
+ n_unmapped += 1;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // Count and exclude duplicates.
|
|
|
+ if flags & (rust_htslib::htslib::BAM_FDUP as u16) != 0 {
|
|
|
+ n_duplicates += 1;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // Filter by mapping quality.
|
|
|
+ if r.mapq() < bam_min_mapq {
|
|
|
+ n_lowq += 1;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // Exclude secondary and QC-failed.
|
|
|
+ let bad_flags = rust_htslib::htslib::BAM_FSECONDARY | rust_htslib::htslib::BAM_FQCFAIL;
|
|
|
+ if (flags & (bad_flags as u16)) != 0 {
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
n_reads += 1;
|
|
|
- let len = (r.reference_end() - r.pos()) as u128;
|
|
|
+ let start = r.pos();
|
|
|
+ let end = r.reference_end();
|
|
|
+ let len = if end > start { (end - start) as u64 } else { 0 };
|
|
|
mapped_lengths.push(len);
|
|
|
lengths_by_tid.entry(r.tid()).or_default().push(len);
|
|
|
|
|
|
if n_reads % 500_000 == 0 {
|
|
|
- info!("{case_id} {time}: processed {} mapped reads", n_reads);
|
|
|
+ info!("{case_id} {time}: processed {n_reads} mapped reads");
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- // 3) Yield & coverage
|
|
|
- let mapped_yield: u128 = mapped_lengths.iter().sum();
|
|
|
- let genome = get_genome_sizes(&header)?;
|
|
|
- let genome_size: u128 = genome.values().map(|&v| v as u128).sum();
|
|
|
- let global_coverage = mapped_yield as f64 / genome_size as f64;
|
|
|
+ let mapped_fraction = if all_records > 0 {
|
|
|
+ n_reads as f64 / all_records as f64
|
|
|
+ } else {
|
|
|
+ 0.0
|
|
|
+ };
|
|
|
|
|
|
- // 4) N50 (parallel sort)
|
|
|
+ // Compute mean, median, N50 (single sort).
|
|
|
let mut lens = mapped_lengths.clone();
|
|
|
- lens.par_sort_unstable();
|
|
|
+ lens.sort_unstable();
|
|
|
+
|
|
|
+ let mean_read_length = if n_reads > 0 {
|
|
|
+ mapped_lengths.iter().sum::<u64>() as f64 / n_reads as f64
|
|
|
+ } else {
|
|
|
+ 0.0
|
|
|
+ };
|
|
|
+ let median_read_length = if lens.is_empty() {
|
|
|
+ 0
|
|
|
+ } else {
|
|
|
+ lens[lens.len() / 2]
|
|
|
+ };
|
|
|
+ let mapped_yield: u64 = mapped_lengths.iter().sum();
|
|
|
+ // N50: minimal length L such that sum(lengths >= L) >= 50% yield.
|
|
|
+ let mut cum = 0u64;
|
|
|
let half = mapped_yield / 2;
|
|
|
- let mut cum = 0;
|
|
|
let n50 = lens
|
|
|
- .into_iter()
|
|
|
+ .iter()
|
|
|
.rev()
|
|
|
- .find(|&l| {
|
|
|
+ .find_map(|&l| {
|
|
|
cum += l;
|
|
|
- cum >= half
|
|
|
+ if cum >= half {
|
|
|
+ Some(l)
|
|
|
+ } else {
|
|
|
+ None
|
|
|
+ }
|
|
|
})
|
|
|
.unwrap_or(0);
|
|
|
|
|
|
- // 5) Length histogram (parallel)
|
|
|
- let by_lengths: Vec<(u128, u32)> = {
|
|
|
- let freq: DashMap<u128, u32> = DashMap::new();
|
|
|
- mapped_lengths.par_iter().for_each(|&l| {
|
|
|
- *freq.entry(l).or_default() += 1;
|
|
|
- });
|
|
|
- let mut v: Vec<_> = freq.iter().map(|e| (*e.key(), *e.value())).collect();
|
|
|
- v.par_sort_unstable_by_key(|&(len, _)| len);
|
|
|
- v
|
|
|
+ // Histogram (sorted by length).
|
|
|
+ let mut histo = BTreeMap::new();
|
|
|
+ for &len in &mapped_lengths {
|
|
|
+ *histo.entry(len).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+ let by_lengths: Vec<_> = histo.into_iter().collect();
|
|
|
+
|
|
|
+ // Genome/coverage.
|
|
|
+ let genome = get_genome_sizes(&header)?;
|
|
|
+ let genome_size: u64 = genome.values().sum();
|
|
|
+ let global_coverage = if genome_size > 0 {
|
|
|
+ mapped_yield as f64 / genome_size as f64
|
|
|
+ } else {
|
|
|
+ 0.0
|
|
|
};
|
|
|
|
|
|
- // 6) Karyotype
|
|
|
- let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid
|
|
|
+ // Global coverage stdev: stdev of per-read contributions to global coverage.
|
|
|
+ let mut total_sq_cov = 0.0;
|
|
|
+ for &len in &mapped_lengths {
|
|
|
+ let cov = if genome_size > 0 {
|
|
|
+ len as f64 / genome_size as f64
|
|
|
+ } else {
|
|
|
+ 0.0
|
|
|
+ };
|
|
|
+ total_sq_cov += cov * cov;
|
|
|
+ }
|
|
|
+ let mean_cov = global_coverage;
|
|
|
+ let variance = if n_reads > 0 {
|
|
|
+ let var = total_sq_cov / n_reads as f64 - (mean_cov / n_reads as f64).powi(2);
|
|
|
+ if var < 0.0 {
|
|
|
+ 0.0
|
|
|
+ } else {
|
|
|
+ var
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ 0.0
|
|
|
+ };
|
|
|
+ let coverage_stddev = variance.sqrt();
|
|
|
+
|
|
|
+ // Per-contig: (TID, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev, coverage_ratio).
|
|
|
+ let mut karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)> = lengths_by_tid
|
|
|
.iter()
|
|
|
.map(|(tid, lengths)| {
|
|
|
let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec()).unwrap();
|
|
|
+ let contig_len = genome.get(&contig).copied().unwrap_or(0);
|
|
|
+ let mapped_sum: u64 = lengths.iter().sum();
|
|
|
+ // Per-contig mean coverage
|
|
|
+ let mean_cov = if contig_len > 0 {
|
|
|
+ mapped_sum as f64 / contig_len as f64
|
|
|
+ } else {
|
|
|
+ 0.0
|
|
|
+ };
|
|
|
+ let coverage_ratio = if global_coverage > 0.0 {
|
|
|
+ mean_cov / global_coverage
|
|
|
+ } else {
|
|
|
+ 0.0
|
|
|
+ };
|
|
|
+
|
|
|
+ // Per-contig coverage stdev (over all reads mapped to this contig)
|
|
|
+ let var = if contig_len > 0 && !lengths.is_empty() {
|
|
|
+ lengths
|
|
|
+ .iter()
|
|
|
+ .map(|&l| {
|
|
|
+ let cov = l as f64 / contig_len as f64;
|
|
|
+ (cov - mean_cov).powi(2)
|
|
|
+ })
|
|
|
+ .sum::<f64>()
|
|
|
+ / lengths.len() as f64
|
|
|
+ } else {
|
|
|
+ 0.0
|
|
|
+ };
|
|
|
+ let stddev = var.sqrt();
|
|
|
(
|
|
|
*tid,
|
|
|
- *genome.get(&contig).unwrap(),
|
|
|
+ contig_len,
|
|
|
contig,
|
|
|
- lengths.par_iter().sum::<u128>(),
|
|
|
+ mapped_sum,
|
|
|
+ mean_cov,
|
|
|
+ stddev,
|
|
|
+ coverage_ratio,
|
|
|
)
|
|
|
})
|
|
|
.collect();
|
|
|
|
|
|
- karyotype.par_sort_unstable_by_key(|&(tid, _, _, _)| tid);
|
|
|
+ karyotype.sort_unstable_by_key(|&(tid, _, _, _, _, _, _)| tid);
|
|
|
|
|
|
Ok(Self {
|
|
|
all_records,
|
|
|
n_reads,
|
|
|
+ mapped_fraction,
|
|
|
+ n_unmapped,
|
|
|
+ n_duplicates,
|
|
|
mapped_yield,
|
|
|
+ mean_read_length,
|
|
|
+ median_read_length,
|
|
|
+ coverage_stddev,
|
|
|
global_coverage,
|
|
|
karyotype,
|
|
|
n50,
|
|
|
by_lengths,
|
|
|
+ n_lowq,
|
|
|
})
|
|
|
}
|
|
|
+}
|
|
|
|
|
|
- pub fn print(&self) {
|
|
|
- println!("N records\t{}", self.all_records);
|
|
|
- println!("N counted reads\t{}", self.n_reads);
|
|
|
+impl fmt::Display for WGSBamStats {
|
|
|
+ /// Formats the BAM statistics as a readable summary table.
|
|
|
+ ///
|
|
|
+ /// Shows all global stats and a per-contig breakdown with coverage stdev.
|
|
|
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
+ writeln!(f, "BAM statistics summary:")?;
|
|
|
+ writeln!(f, " All BAM records: {}", self.all_records)?;
|
|
|
+ writeln!(f, " Unmapped reads: {}", self.n_unmapped)?;
|
|
|
+ writeln!(f, " Duplicate reads: {}", self.n_duplicates)?;
|
|
|
+ writeln!(f, " Low Q reads: {}", self.n_lowq)?;
|
|
|
+ writeln!(f, " Counted (mapped) reads: {}", self.n_reads)?;
|
|
|
+ writeln!(f, " Mapped fraction: {:.4}", self.mapped_fraction)?;
|
|
|
+ writeln!(
|
|
|
+ f,
|
|
|
+ " Mapped yield [Gb]: {:.2}",
|
|
|
+ self.mapped_yield as f64 / 1e9
|
|
|
+ )?;
|
|
|
+ writeln!(f, " Mean read length [bp]: {:.2}", self.mean_read_length)?;
|
|
|
+ writeln!(f, " Median read length [bp]: {}", self.median_read_length)?;
|
|
|
+ writeln!(f, " N50 [bp]: {}", self.n50)?;
|
|
|
+ writeln!(f, " Global mean coverage: {:.2}", self.global_coverage)?;
|
|
|
+ writeln!(f, " Global coverage stdev: {:.2}", self.coverage_stddev)?;
|
|
|
+ writeln!(f, " Per-contig stats:")?;
|
|
|
+ writeln!(
|
|
|
+ f,
|
|
|
+ " {:<8} {:<10} {:<12} {:<14} {:<12} {:<10} {:<10}",
|
|
|
+ "TID", "Length", "Name", "MappedYield", "CovMean", "CovStdev", "CovRatio"
|
|
|
+ )?;
|
|
|
+ for (tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio) in
|
|
|
+ &self.karyotype
|
|
|
+ {
|
|
|
+ writeln!(
|
|
|
+ f,
|
|
|
+ " {:<8} {:<10} {:<12} {:<14} {:<12.4} {:.4} {:.2}",
|
|
|
+ tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio
|
|
|
+ )?;
|
|
|
+ }
|
|
|
|
|
|
- println!("Mapped yield [Gb]\t{:.2}", self.mapped_yield as f64 / 1e9);
|
|
|
- println!("Mean coverage\t{:.2}", self.global_coverage);
|
|
|
- self.karyotype
|
|
|
- .iter()
|
|
|
- .for_each(|(_, contig_len, contig, contig_yield)| {
|
|
|
- println!(
|
|
|
- "{contig}\t{:.2}",
|
|
|
- (*contig_yield as f64 / *contig_len as f64) / self.global_coverage
|
|
|
- );
|
|
|
- });
|
|
|
+ Ok(())
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -870,7 +1087,7 @@ pub fn nt_pileup_new(
|
|
|
// Stream the pileup; HTSlib walks the reads for us.
|
|
|
for pileup in bam.pileup() {
|
|
|
// Attach context to any low‑level error.
|
|
|
- let pileup = pileup.context(format!("can't pile up BAM at {chr}:{pos}"))?;
|
|
|
+ let pileup = pileup.context(format!("Failed to pileup BAM at {chr}:{pos}"))?;
|
|
|
|
|
|
// Skip other positions quickly.
|
|
|
if pileup.pos() != pos {
|