Bladeren bron

better bam_compo // WGSBamStats

Thomas 7 maanden geleden
bovenliggende
commit
5d43d529e8
6 gewijzigde bestanden met toevoegingen van 423 en 129 verwijderingen
  1. 22 0
      TODO.md
  2. 334 117
      src/collection/bam.rs
  3. 3 0
      src/collection/mod.rs
  4. 61 0
      src/collection/tests.rs
  5. 3 3
      src/commands/dorado.rs
  6. 0 9
      src/lib.rs

+ 22 - 0
TODO.md

@@ -0,0 +1,22 @@
+
+
+## Constit deletion not detected:
+Bonnerave chr8:4,875,950_4,877,070 (1,121)
+
+## DUP as DEL
+CUNY chr15:26,711,767_26,721,508 (9,742) chr15:30,618,667_30,627,964 (9,298) chr15:48,194,912_48,227,648 (32,737)
+CUNY: chr16:69,402,657_69,410,431 (7,775) chr17:32,924,913_32,933,324 (8,412) 	
+chr17:74,332,063_74,335,643 chr21:33,231,440_33,244,738 (13,299) chr1:11,640,521_11,642,684 (2,164)
+
+RAOUL: 	
+chr20:30,065,516_30,320,534 (255,019)
+
+BITANG
+chr20:8,843,900_8,847,600 (3,701)
+
+DESLANDES:
+chr20:42,758,275_46,295,435 (3,537,161)
+
+
+SNV artefacts:
+chr1:404,924 G>A TAMOURTBIR MORIN

+ 334 - 117
src/collection/bam.rs

@@ -1,5 +1,6 @@
+use core::fmt;
 use std::{
-    collections::HashMap,
+    collections::{BTreeMap, HashMap},
     fs::{self, File},
     path::PathBuf,
 };
@@ -116,14 +117,14 @@ impl WGSBam {
         //     None
         // };
 
-        let composition = bam_compo(path.to_string_lossy().as_ref(), 20_000).context(format!(
+        let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(format!(
             "Error while reading BAM composition for {}",
             path.display()
         ))?;
 
         let s = Self {
             path,
-            bam_stats: WGSBamStats::new(&id, &time_point, Config::default())?,
+            bam_stats: WGSBamStats::new(&id, &time_point, &Config::default())?,
             // cramino,
             id: id.to_string(),
             time_point: time_point.to_string(),
@@ -229,39 +230,92 @@ pub fn load_bam_collection(result_dir: &str) -> BamCollection {
     BamCollection { bams }
 }
 
-pub fn bam_compo(
+/// Computes the composition of `(RG, fn)` tag prefixes in a BAM file sample.
+///
+/// This function parses a BAM file and, for a subset of records (`sample_size`),
+/// extracts the `RG` (read group) and `fn` (typically a file or flowcell ID) string tags from each record.
+/// It groups records by the prefix (before `_`, if present) of each tag,
+/// counts occurrences for each pair, and reports their proportion in the sample.
+///
+/// # Arguments
+///
+/// * `file_path` - Path to the BAM file.
+/// * `sample_size` - Maximum number of records to sample and process.
+///
+/// # Returns
+///
+/// Returns a vector of tuples, each containing:
+///   - `String`: the prefix of the `RG` tag (up to the first underscore, or full tag if no underscore)
+///   - `String`: the prefix of the `fn` tag (similarly processed)
+///   - `f64`: percentage of records with this `(RG, fn)` prefix pair, relative to total successfully processed records
+///
+/// # Errors
+///
+/// Returns an error if the BAM file cannot be opened or read.
+/// Records missing the `RG` or `fn` tag (or with wrong type) are silently skipped.
+///
+/// # Examples
+///
+/// ```no_run
+/// # use anyhow::Result;
+/// let stats = bam_composition("reads.bam", 10_000)?;
+/// for (rg, flowcell, pct) in stats {
+///     println!("{} / {}: {:.2}%", rg, flowcell, pct);
+/// }
+/// # Ok::<(), anyhow::Error>(())
+/// ```
+///
+/// # Notes
+///
+/// - Only the first `sample_size` records in the BAM are processed.
+/// - Percentages are normalized by the number of records that had both required tags.
+/// - This function uses the `rust-htslib` crate for BAM parsing.
+///
+/// # Tag requirements
+///
+/// - Both `RG` and `fn` tags must be present and of string type.
+/// - Tag values are split on the first underscore; only the prefix is used for grouping.
+///
+/// # See also
+///
+/// - [SAM/BAM tag conventions](https://samtools.github.io/hts-specs/SAMv1.pdf)
+///
+pub fn bam_composition(
     file_path: &str,
     sample_size: usize,
 ) -> anyhow::Result<Vec<(String, String, f64)>> {
-    let mut bam = rust_htslib::bam::Reader::from_path(file_path)
-        .map_err(|e| anyhow::anyhow!("Failed to open bam from path: {file_path}\n\t{e}"))?;
+    use std::collections::HashMap;
+    use rust_htslib::bam::{Reader, record::Aux};
 
+    let mut bam = Reader::from_path(file_path)?;
     let mut rgs: HashMap<(String, String), u64> = HashMap::new();
-    for result in bam.records().filter_map(Result::ok).take(sample_size) {
-        if let (
-            rust_htslib::bam::record::Aux::String(s),
-            rust_htslib::bam::record::Aux::String(b),
-        ) = (result.aux(b"RG")?, result.aux(b"fn")?)
-        {
-            *rgs.entry((s.to_string(), b.to_string())).or_default() += 1;
-        }
+    let mut total = 0u64;
+
+    for rec in bam.records().filter_map(Result::ok).take(sample_size) {
+        let rg = match rec.aux(b"RG") {
+            Ok(Aux::String(s)) => s,
+            _ => continue,
+        };
+        let fn_tag = match rec.aux(b"fn") {
+            Ok(Aux::String(b)) => b,
+            _ => continue,
+        };
+        let rg_prefix = rg.split_once('_').map(|(a, _)| a).unwrap_or(rg);
+        let fn_prefix = fn_tag.split_once('_').map(|(b, _)| b).unwrap_or(fn_tag);
+        *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string())).or_default() += 1;
+        total += 1;
     }
 
-    Ok(rgs
+    let results: Vec<_> = rgs
         .into_iter()
-        .map(|(k, n)| {
-            (
-                k.0.to_string(),
-                k.1.to_string(),
-                n as f64 * 100.0 / sample_size as f64,
-            )
-        })
-        .map(|(rg, f, p)| {
-            let (a, _) = rg.split_once('_').unwrap();
-            let (b, _) = f.split_once('_').unwrap();
-            (a.to_string(), b.to_string(), p)
-        })
-        .collect())
+        .map(|((rg, fn_tag), n)| (
+            rg,
+            fn_tag,
+            n as f64 * 100.0 / total as f64
+        ))
+        .collect();
+
+    Ok(results)
 }
 
 pub fn bam_has_fc(bam: &WGSBam, flow_cell: &FlowCell) -> anyhow::Result<bool> {
@@ -576,161 +630,324 @@ pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Ve
         .collect()
 }
 
+/// High-level mapping statistics for a WGS BAM file.
+///
+/// This struct aggregates essential quality control and mapping metrics, suitable for reporting or downstream QC.
+/// Unless otherwise specified, units are in base pairs (bp) or counts of reads.
+///
+/// # Fields
+/// - `all_records`: Total number of records in the BAM, before any filtering.
+/// - `n_reads`: Number of primary, mapped, QC-passed, non-duplicate reads counted for statistics.
+/// - `mapped_fraction`: Fraction of mapped (counted) reads relative to all records.
+/// - `n_unmapped`: Number of unmapped reads (`BAM_FUNMAP`).
+/// - `n_duplicates`: Reads marked duplicate (`BAM_FDUP`); `n_lowq`: reads below `bam_min_mapq`.
+/// - `mapped_yield`: Total sum of mapped read lengths (bp).
+/// - `mean_read_length`: Mean length of mapped reads (bp).
+/// - `median_read_length`: Median mapped read length (bp).
+/// - `coverage_stddev`: Standard deviation of per-read global coverage contribution.
+/// - `global_coverage`: Mean mapped yield divided by total genome size.
+/// - `karyotype`: Per-contig summary: (TID, contig length, contig name, mapped yield, coverage stddev).
+/// - `n50`: N50 statistic for mapped read lengths (bp).
+/// - `by_lengths`: Histogram of mapped read lengths as (length, count), sorted by length.
+///
+/// # Filtering Rule
+/// Only records that are:
+/// - primary alignments (not secondary/supplementary),
+/// - mapped (`!BAM_FUNMAP`),
+/// - pass vendor quality checks (`!BAM_FQCFAIL`),
+/// - not marked as duplicate (`!BAM_FDUP`),
+/// - and with mapping quality ≥ `bam_min_mapq`,
+/// are counted as mapped reads.
+///
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct WGSBamStats {
-    pub all_records: u128,
-    pub n_reads: u128,
-    pub mapped_yield: u128,
+    /// Total number of BAM records (including unmapped, filtered, duplicates).
+    pub all_records: u64,
+    /// Number of mapped, primary, QC-passed, non-duplicate reads (final filtered count).
+    pub n_reads: u64,
+    /// Fraction of counted mapped reads over all BAM records.
+    pub mapped_fraction: f64,
+    /// Number of unmapped records (flag 0x4).
+    pub n_unmapped: u64,
+    /// Number of duplicate records (flag 0x400).
+    pub n_duplicates: u64,
+    /// Number of reads excluded for mapping quality below `bam_min_mapq`.
+    pub n_lowq: u64,
+    /// Total sum of mapped read lengths (bp).
+    pub mapped_yield: u64,
+    /// Mean mapped read length (bp).
+    pub mean_read_length: f64,
+    /// Median mapped read length (bp).
+    pub median_read_length: u64,
+    /// Standard deviation of per-read global coverage contributions.
+    pub coverage_stddev: f64,
+    /// Mean global coverage: `mapped_yield / genome_size`.
     pub global_coverage: f64,
-    pub karyotype: Vec<(i32, u64, String, u128)>,
-    pub n50: u128,
-    pub by_lengths: Vec<(u128, u32)>,
+    /// (tid, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev, coverage_ratio)
+    pub karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)>,
+    /// N50 value of mapped read lengths (bp).
+    pub n50: u64,
+    /// Histogram of mapped read lengths: (length, count), sorted by length.
+    pub by_lengths: Vec<(u64, u32)>,
 }
 
 impl WGSBamStats {
-    /// Creates a new `WGSBamStats` by scanning the BAM file for a given case and time,
-    /// computing mapping statistics and coverage.
+    /// Computes `WGSBamStats` by scanning a BAM file and summarizing essential statistics.
     ///
     /// # Parameters
+    /// - `case_id`: Sample/case identifier used to locate the BAM file via config.
+    /// - `time`: Timestamp string for progress/logging.
+    /// - `config`: Configuration struct. Must provide:
+    ///     - `solo_bam(case_id, time)`: path to BAM file.
+    ///     - `bam_min_mapq`: minimum mapping quality to include read.
+    ///     - `bam_n_threads`: number of BAM decompression threads.
     ///
-    /// - `case_id`: Identifier for the sample/case, used to locate the BAM via `config.solo_bam`.
-    /// - `time`: Timestamp string included in progress log messages.
-    /// - `config`: Configuration struct providing:
-    ///     - `solo_bam(case_id, time)`: path to the BAM file,
-    ///     - `bam_min_mapq`: minimum mapping quality filter,
-    ///     - `bam_n_threads`: number of threads for decompression.
+    /// # Filtering Rule
+    /// Only primary, mapped, non-duplicate, QC-passed reads with sufficient mapping quality are included (see struct doc).
     ///
     /// # Returns
-    ///
-    /// A populated `YourStruct` with fields:
-    /// - `all_records`: total BAM records before filtering,
-    /// - `n_reads`: number of reads that passed all filters,
-    /// - `mapped_yield`: total sum of mapped read lengths,
-    /// - `global_coverage`: `mapped_yield / genome_size`,
-    /// - `karyotype`: vector of `(tid, contig_len, contig_name, mapped_sum)` per contig,
-    /// - `n50`: N50 of the mapped lengths,
-    /// - `by_lengths`: histogram of mapped-read lengths as `(length, count)`.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if:
-    /// - the BAM file cannot be opened or parsed,
-    /// - genome sizes cannot be retrieved.
-    pub fn new(case_id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
+    /// On success, returns a fully populated `WGSBamStats` with all core and extended fields.
+    /// Returns an error if the BAM cannot be parsed or genome sizes cannot be retrieved.
+    pub fn new(case_id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
         let bam_path = config.solo_bam(case_id, time);
-        let Config {
-            bam_min_mapq,
-            bam_n_threads,
-            ..
-        } = config;
-        let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
+        let bam_min_mapq = config.bam_min_mapq;
+        let bam_n_threads = config.bam_n_threads;
+
+        let mut bam = rust_htslib::bam::Reader::from_path(&bam_path)?;
         let h = bam.header().clone();
         let header = rust_htslib::bam::Header::from_template(&h);
         bam.set_threads(bam_n_threads as usize)?;
 
-        // 2) Sequential scan
-        let mut all_records = 0;
-        let mut n_reads = 0;
+        info!("Parsing BAM file: {bam_path}");
+
+        let mut all_records = 0u64;
+        let mut n_reads = 0u64;
+        let mut n_unmapped = 0u64;
+        let mut n_duplicates = 0u64;
+        let mut n_lowq = 0u64;
         let mut mapped_lengths = Vec::new();
-        let mut lengths_by_tid: HashMap<i32, Vec<u128>> = HashMap::new();
+        let mut lengths_by_tid: HashMap<i32, Vec<u64>> = HashMap::new();
 
+        // Main scan loop: apply flag and MAPQ filters.
         for rec in bam.rc_records() {
             all_records += 1;
             let r = rec.with_context(|| "failed to parse BAM record")?;
 
-            if (r.flags()
-                & (rust_htslib::htslib::BAM_FUNMAP
-                    | rust_htslib::htslib::BAM_FSECONDARY
-                    | rust_htslib::htslib::BAM_FQCFAIL
-                    | rust_htslib::htslib::BAM_FDUP) as u16
-                == 0)
-                || r.mapq() < bam_min_mapq
-            {
+            let flags = r.flags();
+            // Count and exclude unmapped.
+            if flags & (rust_htslib::htslib::BAM_FUNMAP as u16) != 0 {
+                n_unmapped += 1;
+                continue;
+            }
+            // Count and exclude duplicates.
+            if flags & (rust_htslib::htslib::BAM_FDUP as u16) != 0 {
+                n_duplicates += 1;
+                continue;
+            }
+            // Filter by mapping quality.
+            if r.mapq() < bam_min_mapq {
+                n_lowq += 1;
+                continue;
+            }
+            // Exclude secondary and QC-failed.
+            let bad_flags = rust_htslib::htslib::BAM_FSECONDARY | rust_htslib::htslib::BAM_FQCFAIL;
+            if (flags & (bad_flags as u16)) != 0 {
                 continue;
             }
 
             n_reads += 1;
-            let len = (r.reference_end() - r.pos()) as u128;
+            let start = r.pos();
+            let end = r.reference_end();
+            let len = if end > start { (end - start) as u64 } else { 0 };
             mapped_lengths.push(len);
             lengths_by_tid.entry(r.tid()).or_default().push(len);
 
             if n_reads % 500_000 == 0 {
-                info!("{case_id} {time}: processed {} mapped reads", n_reads);
+                info!("{case_id} {time}: processed {n_reads} mapped reads");
             }
         }
 
-        // 3) Yield & coverage
-        let mapped_yield: u128 = mapped_lengths.iter().sum();
-        let genome = get_genome_sizes(&header)?;
-        let genome_size: u128 = genome.values().map(|&v| v as u128).sum();
-        let global_coverage = mapped_yield as f64 / genome_size as f64;
+        let mapped_fraction = if all_records > 0 {
+            n_reads as f64 / all_records as f64
+        } else {
+            0.0
+        };
 
-        // 4) N50 (parallel sort)
+        // Compute mean, median, N50 (single sort).
         let mut lens = mapped_lengths.clone();
-        lens.par_sort_unstable();
+        lens.sort_unstable();
+
+        let mean_read_length = if n_reads > 0 {
+            mapped_lengths.iter().sum::<u64>() as f64 / n_reads as f64
+        } else {
+            0.0
+        };
+        let median_read_length = if lens.is_empty() {
+            0
+        } else {
+            lens[lens.len() / 2]
+        };
+        let mapped_yield: u64 = mapped_lengths.iter().sum();
+        // N50: minimal length L such that sum(lengths >= L) >= 50% yield.
+        let mut cum = 0u64;
         let half = mapped_yield / 2;
-        let mut cum = 0;
         let n50 = lens
-            .into_iter()
+            .iter()
             .rev()
-            .find(|&l| {
+            .find_map(|&l| {
                 cum += l;
-                cum >= half
+                if cum >= half {
+                    Some(l)
+                } else {
+                    None
+                }
             })
             .unwrap_or(0);
 
-        // 5) Length histogram (parallel)
-        let by_lengths: Vec<(u128, u32)> = {
-            let freq: DashMap<u128, u32> = DashMap::new();
-            mapped_lengths.par_iter().for_each(|&l| {
-                *freq.entry(l).or_default() += 1;
-            });
-            let mut v: Vec<_> = freq.iter().map(|e| (*e.key(), *e.value())).collect();
-            v.par_sort_unstable_by_key(|&(len, _)| len);
-            v
+        // Histogram (sorted by length).
+        let mut histo = BTreeMap::new();
+        for &len in &mapped_lengths {
+            *histo.entry(len).or_insert(0) += 1;
+        }
+        let by_lengths: Vec<_> = histo.into_iter().collect();
+
+        // Genome/coverage.
+        let genome = get_genome_sizes(&header)?;
+        let genome_size: u64 = genome.values().sum();
+        let global_coverage = if genome_size > 0 {
+            mapped_yield as f64 / genome_size as f64
+        } else {
+            0.0
         };
 
-        // 6) Karyotype
-        let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid
+        // Global coverage stdev: stdev of per-read contributions to global coverage.
+        let mut total_sq_cov = 0.0;
+        for &len in &mapped_lengths {
+            let cov = if genome_size > 0 {
+                len as f64 / genome_size as f64
+            } else {
+                0.0
+            };
+            total_sq_cov += cov * cov;
+        }
+        let mean_cov = if n_reads > 0 { global_coverage / n_reads as f64 } else { 0.0 }; // mean per-read contribution, not total coverage
+        let variance = if n_reads > 0 {
+            let var = total_sq_cov / n_reads as f64 - mean_cov.powi(2);
+            if var < 0.0 {
+                0.0
+            } else {
+                var
+            }
+        } else {
+            0.0
+        };
+        let coverage_stddev = variance.sqrt();
+
+        // Per-contig: (TID, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev , coverage_ratio).
+        let mut karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)> = lengths_by_tid
             .iter()
             .map(|(tid, lengths)| {
                 let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec()).unwrap();
+                let contig_len = genome.get(&contig).copied().unwrap_or(0);
+                let mapped_sum: u64 = lengths.iter().sum();
+                // Per-contig mean coverage
+                let mean_cov = if contig_len > 0 {
+                    mapped_sum as f64 / contig_len as f64
+                } else {
+                    0.0
+                };
+                let coverage_ratio = if global_coverage > 0.0 {
+                    mean_cov / global_coverage
+                } else {
+                    0.0
+                };
+
+                // Per-contig coverage stdev (over all reads mapped to this contig)
+                let var = if contig_len > 0 && !lengths.is_empty() {
+                    lengths
+                        .iter()
+                        .map(|&l| {
+                            let cov = l as f64 / contig_len as f64;
+                            (cov - mean_cov / lengths.len() as f64).powi(2)
+                        })
+                        .sum::<f64>()
+                        / lengths.len() as f64
+                } else {
+                    0.0
+                };
+                let stddev = var.sqrt();
                 (
                     *tid,
-                    *genome.get(&contig).unwrap(),
+                    contig_len,
                     contig,
-                    lengths.par_iter().sum::<u128>(),
+                    mapped_sum,
+                    mean_cov,
+                    stddev,
+                    coverage_ratio,
                 )
             })
             .collect();
 
-        karyotype.par_sort_unstable_by_key(|&(tid, _, _, _)| tid);
+        karyotype.sort_unstable_by_key(|&(tid, _, _, _, _, _, _)| tid);
 
         Ok(Self {
             all_records,
             n_reads,
+            mapped_fraction,
+            n_unmapped,
+            n_duplicates,
             mapped_yield,
+            mean_read_length,
+            median_read_length,
+            coverage_stddev,
             global_coverage,
             karyotype,
             n50,
             by_lengths,
+            n_lowq,
         })
     }
+}
 
-    pub fn print(&self) {
-        println!("N records\t{}", self.all_records);
-        println!("N counted reads\t{}", self.n_reads);
+impl fmt::Display for WGSBamStats {
+    /// Formats the BAM statistics as a readable summary table.
+    ///
+    /// Shows all global stats and a per-contig breakdown with coverage stdev.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "BAM statistics summary:")?;
+        writeln!(f, "  All BAM records:         {}", self.all_records)?;
+        writeln!(f, "  Unmapped reads:          {}", self.n_unmapped)?;
+        writeln!(f, "  Duplicate reads:         {}", self.n_duplicates)?;
+        writeln!(f, "  Low Q reads:             {}", self.n_lowq)?;
+        writeln!(f, "  Counted (mapped) reads:  {}", self.n_reads)?;
+        writeln!(f, "  Mapped fraction:         {:.4}", self.mapped_fraction)?;
+        writeln!(
+            f,
+            "  Mapped yield [Gb]:       {:.2}",
+            self.mapped_yield as f64 / 1e9
+        )?;
+        writeln!(f, "  Mean read length [bp]:   {:.2}", self.mean_read_length)?;
+        writeln!(f, "  Median read length [bp]: {}", self.median_read_length)?;
+        writeln!(f, "  N50 [bp]:                {}", self.n50)?;
+        writeln!(f, "  Global mean coverage:    {:.2}", self.global_coverage)?;
+        writeln!(f, "  Global coverage stdev:   {:.2}", self.coverage_stddev)?;
+        writeln!(f, "  Per-contig stats:")?;
+        writeln!(
+            f,
+            "    {:<8} {:<10} {:<12} {:<14} {:<12} {:<10} {:<10}",
+            "TID", "Length", "Name", "MappedYield", "CovMean", "CovStdev", "CovRatio"
+        )?;
+        for (tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio) in
+            &self.karyotype
+        {
+            writeln!(
+                f,
+                "    {:<8} {:<10} {:<12} {:<14} {:<12.4} {:.4} {:.2}",
+                tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio
+            )?;
+        }
 
-        println!("Mapped yield [Gb]\t{:.2}", self.mapped_yield as f64 / 1e9);
-        println!("Mean coverage\t{:.2}", self.global_coverage);
-        self.karyotype
-            .iter()
-            .for_each(|(_, contig_len, contig, contig_yield)| {
-                println!(
-                    "{contig}\t{:.2}",
-                    (*contig_yield as f64 / *contig_len as f64) / self.global_coverage
-                );
-            });
+        Ok(())
     }
 }
 
@@ -870,7 +1087,7 @@ pub fn nt_pileup_new(
     // Stream the pileup; HTSlib walks the reads for us.
     for pileup in bam.pileup() {
         // Attach context to any low‑level error.
-        let pileup = pileup.context(format!("can't pile up BAM at {chr}:{pos}"))?;
+        let pileup = pileup.context(format!("Failed to pileup BAM at {chr}:{pos}"))?;
 
         // Skip other positions quickly.
         if pileup.pos() != pos {

+ 3 - 0
src/collection/mod.rs

@@ -38,6 +38,9 @@ pub mod vcf;
 pub mod flowcells;
 pub mod minknow;
 
+#[cfg(test)]
+mod tests;
+
 #[derive(Debug, Clone)]
 pub struct CollectionsConfig {
     pub pod_dir: String,

+ 61 - 0
src/collection/tests.rs

@@ -0,0 +1,61 @@
+use log::info;
+
+use crate::{collection::bam::{bam_composition, WGSBam, WGSBamStats}, config::Config};
+
+fn init() {
+    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
+        .is_test(true)
+        .try_init();
+}
+
+#[test]
+fn wgs_bam_stats() -> anyhow::Result<()> {
+    init();
+    let case_id = "HOULNE";
+    let time = "diag";
+
+    info!("WGSBamStats::new");
+
+    let r = WGSBamStats::new(case_id, time, &Config::default())?;
+    println!("{r}");
+    assert_eq!(r.all_records, 22417713);
+    let c = r.karyotype.iter().find_map(|e| if e.2 == "chr9" { Some(e.6) } else { None }).unwrap();
+    assert_eq!(c, 0.8800890147031387);
+
+    Ok(())
+}
+
+#[test]
+fn wgs_bam_new() -> anyhow::Result<()> {
+    init();
+    let case_id = "HOULNE";
+    let time = "diag";
+
+    info!("WGSBam::new");
+
+    let c = Config::default();
+
+    let b = WGSBam::new(c.solo_bam(case_id, time).into())?;
+
+    println!("{:?}", b.composition);
+
+    Ok(())
+}
+
+#[test]
+fn bam_c() -> anyhow::Result<()> {
+    init();
+    let case_id = "HOULNE";
+    let time = "diag";
+
+    info!("WGSBam::new");
+
+    let c = Config::default();
+
+    let compo = bam_composition(&c.solo_bam(case_id, time), 20000)?;
+
+
+    println!("{:?}", compo);
+
+    Ok(())
+}

+ 3 - 3
src/commands/dorado.rs

@@ -9,7 +9,7 @@ use log::{debug, info, warn};
 use uuid::Uuid;
 
 use crate::{
-    collection::{bam::bam_compo, flowcells::FlowCell, pod5::FlowCellCase},
+    collection::{bam::bam_composition, flowcells::FlowCell, pod5::FlowCellCase},
     config::Config,
     helpers::find_unique_file,
     io::pod5_infos::Pod5Info,
@@ -209,11 +209,11 @@ impl Dorado {
     }
 
     pub fn merge_bam(&self, bam: &Path) -> anyhow::Result<()> {
-        let composition_a: Vec<String> = bam_compo(bam.to_string_lossy().as_ref(), 20000)?
+        let composition_a: Vec<String> = bam_composition(bam.to_string_lossy().as_ref(), 20000)?
             .iter()
             .map(|(i, _, _)| i.clone())
             .collect();
-        let composition_b: Vec<String> = bam_compo(&self.bam, 20000)?
+        let composition_b: Vec<String> = bam_composition(&self.bam, 20000)?
             .iter()
             .map(|(i, _, _)| i.clone())
             .collect();

+ 0 - 9
src/lib.rs

@@ -1070,15 +1070,6 @@ let variant: VcfVariant = row.parse()?;
         Ok(())
     }
 
-    #[test]
-    fn coverage() -> anyhow::Result<()> {
-        init();
-        let id = "CAMARA";
-        let time = "diag";
-        WGSBamStats::new(id, time, Config::default())?.print();
-        Ok(())
-    }
-
     #[test]
     fn load_bam() -> anyhow::Result<()> {
         init();