Bladeren bron

better bam_compo // WGSBamStats

Thomas 7 maanden geleden
bovenliggende
commit
5d43d529e8
6 gewijzigde bestanden met toevoegingen van 423 en 129 verwijderingen
  1. 22 0
      TODO.md
  2. 334 117
      src/collection/bam.rs
  3. 3 0
      src/collection/mod.rs
  4. 61 0
      src/collection/tests.rs
  5. 3 3
      src/commands/dorado.rs
  6. 0 9
      src/lib.rs

+ 22 - 0
TODO.md

@@ -0,0 +1,22 @@
+
+
+## Constit deletion not detected:
+Bonnerave chr8:4,875,950_4,877,070 (1,121)
+
+## DUP as DEL
+CUNY chr15:26,711,767_26,721,508 (9,742) chr15:30,618,667_30,627,964 (9,298) chr15:48,194,912_48,227,648 (32,737)
+CUNY: chr16:69,402,657_69,410,431 (7,775) chr17:32,924,913_32,933,324 (8,412) 	
+chr17:74,332,063_74,335,643 chr21:33,231,440_33,244,738 (13,299) chr1:11,640,521_11,642,684 (2,164)
+
+RAOUL: 	
+chr20:30,065,516_30,320,534 (255,019)
+
+BITANG
+chr20:8,843,900_8,847,600 (3,701)
+
+DESLANDES:
+chr20:42,758,275_46,295,435 (3,537,161)
+
+
+SNV artefacts:
+chr1:404,924 G>A TAMOURTBIR MORIN

+ 334 - 117
src/collection/bam.rs

@@ -1,5 +1,6 @@
+use core::fmt;
 use std::{
-    collections::HashMap,
+    collections::{BTreeMap, HashMap},
     fs::{self, File},
     path::PathBuf,
 };
@@ -116,14 +117,14 @@ impl WGSBam {
         //     None
         // };
 
-        let composition = bam_compo(path.to_string_lossy().as_ref(), 20_000).context(format!(
+        let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(format!(
             "Error while reading BAM composition for {}",
             path.display()
         ))?;
 
         let s = Self {
             path,
-            bam_stats: WGSBamStats::new(&id, &time_point, Config::default())?,
+            bam_stats: WGSBamStats::new(&id, &time_point, &Config::default())?,
             // cramino,
             id: id.to_string(),
             time_point: time_point.to_string(),
@@ -229,39 +230,92 @@ pub fn load_bam_collection(result_dir: &str) -> BamCollection {
     BamCollection { bams }
 }
 
-pub fn bam_compo(
+/// Computes the composition of `(RG, fn)` tag prefixes in a BAM file sample.
+///
+/// This function parses a BAM file and, for a subset of records (`sample_size`),
+/// extracts the `RG` (read group) and `fn` (typically a file or flowcell ID) string tags from each record.
+/// It groups records by the prefix (before `_`, if present) of each tag,
+/// counts occurrences for each pair, and reports their proportion in the sample.
+///
+/// # Arguments
+///
+/// * `file_path` - Path to the BAM file.
+/// * `sample_size` - Maximum number of records to sample and process.
+///
+/// # Returns
+///
+/// Returns a vector of tuples, each containing:
+///   - `String`: the prefix of the `RG` tag (up to the first underscore, or full tag if no underscore)
+///   - `String`: the prefix of the `fn` tag (similarly processed)
+///   - `f64`: percentage of records with this `(RG, fn)` prefix pair, relative to total successfully processed records
+///
+/// # Errors
+///
+/// Returns an error if the BAM file cannot be opened or read.
+/// Records missing the `RG` or `fn` tag (or with wrong type) are silently skipped.
+///
+/// # Examples
+///
+/// ```no_run
+/// # use anyhow::Result;
+/// let stats = bam_composition("reads.bam", 10_000)?;
+/// for (rg, flowcell, pct) in stats {
+///     println!("{} / {}: {:.2}%", rg, flowcell, pct);
+/// }
+/// # Ok::<(), anyhow::Error>(())
+/// ```
+///
+/// # Notes
+///
+/// - Only the first `sample_size` records in the BAM are processed.
+/// - Percentages are normalized by the number of records that had both required tags.
+/// - This function uses the `rust-htslib` crate for BAM parsing.
+///
+/// # Tag requirements
+///
+/// - Both `RG` and `fn` tags must be present and of string type.
+/// - Tag values are split on the first underscore; only the prefix is used for grouping.
+///
+/// # See also
+///
+/// - [SAM/BAM tag conventions](https://samtools.github.io/hts-specs/SAMv1.pdf)
+///
+pub fn bam_composition(
     file_path: &str,
     sample_size: usize,
 ) -> anyhow::Result<Vec<(String, String, f64)>> {
-    let mut bam = rust_htslib::bam::Reader::from_path(file_path)
-        .map_err(|e| anyhow::anyhow!("Failed to open bam from path: {file_path}\n\t{e}"))?;
+    use std::collections::HashMap;
+    use rust_htslib::bam::{Reader, record::Aux};
 
+    let mut bam = Reader::from_path(file_path)?;
     let mut rgs: HashMap<(String, String), u64> = HashMap::new();
-    for result in bam.records().filter_map(Result::ok).take(sample_size) {
-        if let (
-            rust_htslib::bam::record::Aux::String(s),
-            rust_htslib::bam::record::Aux::String(b),
-        ) = (result.aux(b"RG")?, result.aux(b"fn")?)
-        {
-            *rgs.entry((s.to_string(), b.to_string())).or_default() += 1;
-        }
+    let mut total = 0u64;
+
+    for rec in bam.records().filter_map(Result::ok).take(sample_size) {
+        let rg = match rec.aux(b"RG") {
+            Ok(Aux::String(s)) => s,
+            _ => continue,
+        };
+        let fn_tag = match rec.aux(b"fn") {
+            Ok(Aux::String(b)) => b,
+            _ => continue,
+        };
+        let rg_prefix = rg.split_once('_').map(|(a, _)| a).unwrap_or(rg);
+        let fn_prefix = fn_tag.split_once('_').map(|(b, _)| b).unwrap_or(fn_tag);
+        *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string())).or_default() += 1;
+        total += 1;
     }
 
-    Ok(rgs
+    let results: Vec<_> = rgs
         .into_iter()
-        .map(|(k, n)| {
-            (
-                k.0.to_string(),
-                k.1.to_string(),
-                n as f64 * 100.0 / sample_size as f64,
-            )
-        })
-        .map(|(rg, f, p)| {
-            let (a, _) = rg.split_once('_').unwrap();
-            let (b, _) = f.split_once('_').unwrap();
-            (a.to_string(), b.to_string(), p)
-        })
-        .collect())
+        .map(|((rg, fn_tag), n)| (
+            rg,
+            fn_tag,
+            n as f64 * 100.0 / total as f64
+        ))
+        .collect();
+
+    Ok(results)
 }
 
 pub fn bam_has_fc(bam: &WGSBam, flow_cell: &FlowCell) -> anyhow::Result<bool> {
@@ -576,161 +630,324 @@ pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Ve
         .collect()
 }
 
+/// High-level mapping statistics for a WGS BAM file.
+///
+/// This struct aggregates essential quality control and mapping metrics, suitable for reporting or downstream QC.
+/// Unless otherwise specified, units are in base pairs (bp) or counts of reads.
+///
+/// # Fields
+/// - `all_records`: Total number of records in the BAM, before any filtering.
+/// - `n_reads`: Number of primary, mapped, QC-passed, non-duplicate reads counted for statistics.
+/// - `mapped_fraction`: Fraction of mapped (counted) reads relative to all records.
+/// - `n_unmapped`: Number of unmapped reads (`BAM_FUNMAP`).
+/// - `n_duplicates`: Reads marked duplicate (`BAM_FDUP`); `n_lowq`: reads below `bam_min_mapq`.
+/// - `mapped_yield`: Total sum of mapped read lengths (bp).
+/// - `mean_read_length`: Mean length of mapped reads (bp).
+/// - `median_read_length`: Median mapped read length (bp).
+/// - `coverage_stddev`: Standard deviation of per-read global coverage contribution.
+/// - `global_coverage`: Mean mapped yield divided by total genome size.
+/// - `karyotype`: Per-contig summary: (TID, contig length, contig name, mapped yield, coverage stddev).
+/// - `n50`: N50 statistic for mapped read lengths (bp).
+/// - `by_lengths`: Histogram of mapped read lengths as (length, count), sorted by length.
+///
+/// # Filtering Rule
+/// Only records that are:
+/// - primary alignments (not secondary/supplementary),
+/// - mapped (`!BAM_FUNMAP`),
+/// - pass vendor quality checks (`!BAM_FQCFAIL`),
+/// - not marked as duplicate (`!BAM_FDUP`),
+/// - and with mapping quality ≥ `bam_min_mapq`,
+/// are counted as mapped reads.
+///
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct WGSBamStats {
-    pub all_records: u128,
-    pub n_reads: u128,
-    pub mapped_yield: u128,
+    /// Total number of BAM records (including unmapped, filtered, duplicates).
+    pub all_records: u64,
+    /// Number of mapped, primary, QC-passed, non-duplicate reads (final filtered count).
+    pub n_reads: u64,
+    /// Fraction of counted mapped reads over all BAM records.
+    pub mapped_fraction: f64,
+    /// Number of unmapped records (flag 0x4).
+    pub n_unmapped: u64,
+    /// Number of duplicate records (flag 0x400).
+    pub n_duplicates: u64,
+    /// Number of reads excluded for mapping quality below `bam_min_mapq`.
+    pub n_lowq: u64,
+    /// Total sum of mapped read lengths (bp).
+    pub mapped_yield: u64,
+    /// Mean mapped read length (bp).
+    pub mean_read_length: f64,
+    /// Median mapped read length (bp).
+    pub median_read_length: u64,
+    /// Standard deviation of per-read global coverage contributions.
+    pub coverage_stddev: f64,
+    /// Mean global coverage: `mapped_yield / genome_size`.
     pub global_coverage: f64,
-    pub karyotype: Vec<(i32, u64, String, u128)>,
-    pub n50: u128,
-    pub by_lengths: Vec<(u128, u32)>,
+    /// (tid, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev, coverage_ratio)
+    pub karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)>,
+    /// N50 value of mapped read lengths (bp).
+    pub n50: u64,
+    /// Histogram of mapped read lengths: (length, count), sorted by length.
+    pub by_lengths: Vec<(u64, u32)>,
 }
 
 impl WGSBamStats {
-    /// Creates a new `WGSBamStats` by scanning the BAM file for a given case and time,
-    /// computing mapping statistics and coverage.
+    /// Computes `WGSBamStats` by scanning a BAM file and summarizing essential statistics.
     ///
     /// # Parameters
+    /// - `case_id`: Sample/case identifier used to locate the BAM file via config.
+    /// - `time`: Timestamp string for progress/logging.
+    /// - `config`: Configuration struct. Must provide:
+    ///     - `solo_bam(case_id, time)`: path to BAM file.
+    ///     - `bam_min_mapq`: minimum mapping quality to include read.
+    ///     - `bam_n_threads`: number of BAM decompression threads.
     ///
-    /// - `case_id`: Identifier for the sample/case, used to locate the BAM via `config.solo_bam`.
-    /// - `time`: Timestamp string included in progress log messages.
-    /// - `config`: Configuration struct providing:
-    ///     - `solo_bam(case_id, time)`: path to the BAM file,
-    ///     - `bam_min_mapq`: minimum mapping quality filter,
-    ///     - `bam_n_threads`: number of threads for decompression.
+    /// # Filtering Rule
+    /// Only primary, mapped, non-duplicate, QC-passed reads with sufficient mapping quality are included (see struct doc).
     ///
     /// # Returns
-    ///
-    /// A populated `YourStruct` with fields:
-    /// - `all_records`: total BAM records before filtering,
-    /// - `n_reads`: number of reads that passed all filters,
-    /// - `mapped_yield`: total sum of mapped read lengths,
-    /// - `global_coverage`: `mapped_yield / genome_size`,
-    /// - `karyotype`: vector of `(tid, contig_len, contig_name, mapped_sum)` per contig,
-    /// - `n50`: N50 of the mapped lengths,
-    /// - `by_lengths`: histogram of mapped-read lengths as `(length, count)`.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if:
-    /// - the BAM file cannot be opened or parsed,
-    /// - genome sizes cannot be retrieved.
-    pub fn new(case_id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
+    /// On success, returns a fully populated `WGSBamStats` with all core and extended fields.
+    /// Returns an error if the BAM cannot be parsed or genome sizes cannot be retrieved.
+    pub fn new(case_id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
         let bam_path = config.solo_bam(case_id, time);
-        let Config {
-            bam_min_mapq,
-            bam_n_threads,
-            ..
-        } = config;
-        let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
+        let bam_min_mapq = config.bam_min_mapq;
+        let bam_n_threads = config.bam_n_threads;
+
+        let mut bam = rust_htslib::bam::Reader::from_path(&bam_path)?;
         let h = bam.header().clone();
         let header = rust_htslib::bam::Header::from_template(&h);
         bam.set_threads(bam_n_threads as usize)?;
 
-        // 2) Sequential scan
-        let mut all_records = 0;
-        let mut n_reads = 0;
+        info!("Parsing BAM file: {bam_path}");
+
+        let mut all_records = 0u64;
+        let mut n_reads = 0u64;
+        let mut n_unmapped = 0u64;
+        let mut n_duplicates = 0u64;
+        let mut n_lowq = 0u64;
         let mut mapped_lengths = Vec::new();
-        let mut lengths_by_tid: HashMap<i32, Vec<u128>> = HashMap::new();
+        let mut lengths_by_tid: HashMap<i32, Vec<u64>> = HashMap::new();
 
+        // Main scan loop: apply flag and MAPQ filters.
         for rec in bam.rc_records() {
             all_records += 1;
             let r = rec.with_context(|| "failed to parse BAM record")?;
 
-            if (r.flags()
-                & (rust_htslib::htslib::BAM_FUNMAP
-                    | rust_htslib::htslib::BAM_FSECONDARY
-                    | rust_htslib::htslib::BAM_FQCFAIL
-                    | rust_htslib::htslib::BAM_FDUP) as u16
-                == 0)
-                || r.mapq() < bam_min_mapq
-            {
+            let flags = r.flags();
+            // Count and exclude unmapped.
+            if flags & (rust_htslib::htslib::BAM_FUNMAP as u16) != 0 {
+                n_unmapped += 1;
+                continue;
+            }
+            // Count and exclude duplicates.
+            if flags & (rust_htslib::htslib::BAM_FDUP as u16) != 0 {
+                n_duplicates += 1;
+                continue;
+            }
+            // Filter by mapping quality.
+            if r.mapq() < bam_min_mapq {
+                n_lowq += 1;
+                continue;
+            }
+            // Exclude secondary and QC-failed.
+            let bad_flags = rust_htslib::htslib::BAM_FSECONDARY | rust_htslib::htslib::BAM_FQCFAIL;
+            if (flags & (bad_flags as u16)) != 0 {
                 continue;
             }
 
             n_reads += 1;
-            let len = (r.reference_end() - r.pos()) as u128;
+            let start = r.pos();
+            let end = r.reference_end();
+            let len = if end > start { (end - start) as u64 } else { 0 };
             mapped_lengths.push(len);
             lengths_by_tid.entry(r.tid()).or_default().push(len);
 
             if n_reads % 500_000 == 0 {
-                info!("{case_id} {time}: processed {} mapped reads", n_reads);
+                info!("{case_id} {time}: processed {n_reads} mapped reads");
             }
         }
 
-        // 3) Yield & coverage
-        let mapped_yield: u128 = mapped_lengths.iter().sum();
-        let genome = get_genome_sizes(&header)?;
-        let genome_size: u128 = genome.values().map(|&v| v as u128).sum();
-        let global_coverage = mapped_yield as f64 / genome_size as f64;
+        let mapped_fraction = if all_records > 0 {
+            n_reads as f64 / all_records as f64
+        } else {
+            0.0
+        };
 
-        // 4) N50 (parallel sort)
+        // Compute mean, median, N50 (single sort).
         let mut lens = mapped_lengths.clone();
-        lens.par_sort_unstable();
+        lens.sort_unstable();
+
+        let mean_read_length = if n_reads > 0 {
+            mapped_lengths.iter().sum::<u64>() as f64 / n_reads as f64
+        } else {
+            0.0
+        };
+        let median_read_length = if lens.is_empty() {
+            0
+        } else {
+            lens[lens.len() / 2]
+        };
+        let mapped_yield: u64 = mapped_lengths.iter().sum();
+        // N50: minimal length L such that sum(lengths >= L) >= 50% yield.
+        let mut cum = 0u64;
         let half = mapped_yield / 2;
-        let mut cum = 0;
         let n50 = lens
-            .into_iter()
+            .iter()
             .rev()
-            .find(|&l| {
+            .find_map(|&l| {
                 cum += l;
-                cum >= half
+                if cum >= half {
+                    Some(l)
+                } else {
+                    None
+                }
             })
             .unwrap_or(0);
 
-        // 5) Length histogram (parallel)
-        let by_lengths: Vec<(u128, u32)> = {
-            let freq: DashMap<u128, u32> = DashMap::new();
-            mapped_lengths.par_iter().for_each(|&l| {
-                *freq.entry(l).or_default() += 1;
-            });
-            let mut v: Vec<_> = freq.iter().map(|e| (*e.key(), *e.value())).collect();
-            v.par_sort_unstable_by_key(|&(len, _)| len);
-            v
+        // Histogram (sorted by length).
+        let mut histo = BTreeMap::new();
+        for &len in &mapped_lengths {
+            *histo.entry(len).or_insert(0) += 1;
+        }
+        let by_lengths: Vec<_> = histo.into_iter().collect();
+
+        // Genome/coverage.
+        let genome = get_genome_sizes(&header)?;
+        let genome_size: u64 = genome.values().sum();
+        let global_coverage = if genome_size > 0 {
+            mapped_yield as f64 / genome_size as f64
+        } else {
+            0.0
         };
 
-        // 6) Karyotype
-        let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid
+        // Global coverage stdev: stdev of per-read contributions to global coverage.
+        let mut total_sq_cov = 0.0;
+        for &len in &mapped_lengths {
+            let cov = if genome_size > 0 {
+                len as f64 / genome_size as f64
+            } else {
+                0.0
+            };
+            total_sq_cov += cov * cov;
+        }
+        let mean_cov = if n_reads > 0 { global_coverage / n_reads as f64 } else { 0.0 }; // mean per-read contribution, not total coverage
+        let variance = if n_reads > 0 {
+            let var = total_sq_cov / n_reads as f64 - mean_cov.powi(2);
+            if var < 0.0 {
+                0.0
+            } else {
+                var
+            }
+        } else {
+            0.0
+        };
+        let coverage_stddev = variance.sqrt();
+
+        // Per-contig: (TID, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev , coverage_ratio).
+        let mut karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)> = lengths_by_tid
             .iter()
             .map(|(tid, lengths)| {
                 let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec()).unwrap();
+                let contig_len = genome.get(&contig).copied().unwrap_or(0);
+                let mapped_sum: u64 = lengths.iter().sum();
+                // Per-contig mean coverage
+                let mean_cov = if contig_len > 0 {
+                    mapped_sum as f64 / contig_len as f64
+                } else {
+                    0.0
+                };
+                let coverage_ratio = if global_coverage > 0.0 {
+                    mean_cov / global_coverage
+                } else {
+                    0.0
+                };
+
+                // Per-contig coverage stdev (over all reads mapped to this contig)
+                let var = if contig_len > 0 && !lengths.is_empty() {
+                    lengths
+                        .iter()
+                        .map(|&l| {
+                            let cov = l as f64 / contig_len as f64;
+                            (cov - mean_cov / lengths.len() as f64).powi(2)
+                        })
+                        .sum::<f64>()
+                        / lengths.len() as f64
+                } else {
+                    0.0
+                };
+                let stddev = var.sqrt();
                 (
                     *tid,
-                    *genome.get(&contig).unwrap(),
+                    contig_len,
                     contig,
-                    lengths.par_iter().sum::<u128>(),
+                    mapped_sum,
+                    mean_cov,
+                    stddev,
+                    coverage_ratio,
                 )
             })
             .collect();
 
-        karyotype.par_sort_unstable_by_key(|&(tid, _, _, _)| tid);
+        karyotype.sort_unstable_by_key(|&(tid, _, _, _, _, _, _)| tid);
 
         Ok(Self {
             all_records,
             n_reads,
+            mapped_fraction,
+            n_unmapped,
+            n_duplicates,
             mapped_yield,
+            mean_read_length,
+            median_read_length,
+            coverage_stddev,
             global_coverage,
             karyotype,
             n50,
             by_lengths,
+            n_lowq,
         })
     }
+}
 
-    pub fn print(&self) {
-        println!("N records\t{}", self.all_records);
-        println!("N counted reads\t{}", self.n_reads);
+impl fmt::Display for WGSBamStats {
+    /// Formats the BAM statistics as a readable summary table.
+    ///
+    /// Shows all global stats and a per-contig breakdown with coverage stdev.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "BAM statistics summary:")?;
+        writeln!(f, "  All BAM records:         {}", self.all_records)?;
+        writeln!(f, "  Unmapped reads:          {}", self.n_unmapped)?;
+        writeln!(f, "  Duplicate reads:         {}", self.n_duplicates)?;
+        writeln!(f, "  Low Q reads:             {}", self.n_lowq)?;
+        writeln!(f, "  Counted (mapped) reads:  {}", self.n_reads)?;
+        writeln!(f, "  Mapped fraction:         {:.4}", self.mapped_fraction)?;
+        writeln!(
+            f,
+            "  Mapped yield [Gb]:       {:.2}",
+            self.mapped_yield as f64 / 1e9
+        )?;
+        writeln!(f, "  Mean read length [bp]:   {:.2}", self.mean_read_length)?;
+        writeln!(f, "  Median read length [bp]: {}", self.median_read_length)?;
+        writeln!(f, "  N50 [bp]:                {}", self.n50)?;
+        writeln!(f, "  Global mean coverage:    {:.2}", self.global_coverage)?;
+        writeln!(f, "  Global coverage stdev:   {:.2}", self.coverage_stddev)?;
+        writeln!(f, "  Per-contig stats:")?;
+        writeln!(
+            f,
+            "    {:<8} {:<10} {:<12} {:<14} {:<12} {:<10} {:<10}",
+            "TID", "Length", "Name", "MappedYield", "CovMean", "CovStdev", "CovRatio"
+        )?;
+        for (tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio) in
+            &self.karyotype
+        {
+            writeln!(
+                f,
+                "    {:<8} {:<10} {:<12} {:<14} {:<12.4} {:.4} {:.2}",
+                tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio
+            )?;
+        }
 
-        println!("Mapped yield [Gb]\t{:.2}", self.mapped_yield as f64 / 1e9);
-        println!("Mean coverage\t{:.2}", self.global_coverage);
-        self.karyotype
-            .iter()
-            .for_each(|(_, contig_len, contig, contig_yield)| {
-                println!(
-                    "{contig}\t{:.2}",
-                    (*contig_yield as f64 / *contig_len as f64) / self.global_coverage
-                );
-            });
+        Ok(())
     }
 }
 
@@ -870,7 +1087,7 @@ pub fn nt_pileup_new(
     // Stream the pileup; HTSlib walks the reads for us.
     for pileup in bam.pileup() {
         // Attach context to any low‑level error.
-        let pileup = pileup.context(format!("can't pile up BAM at {chr}:{pos}"))?;
+        let pileup = pileup.context(format!("Failed to pileup BAM at {chr}:{pos}"))?;
 
         // Skip other positions quickly.
         if pileup.pos() != pos {

+ 3 - 0
src/collection/mod.rs

@@ -38,6 +38,9 @@ pub mod vcf;
 pub mod flowcells;
 pub mod minknow;
 
+#[cfg(test)]
+mod tests;
+
 #[derive(Debug, Clone)]
 pub struct CollectionsConfig {
     pub pod_dir: String,

+ 61 - 0
src/collection/tests.rs

@@ -0,0 +1,61 @@
+use log::info;
+
+use crate::{collection::bam::{bam_composition, WGSBam, WGSBamStats}, config::Config};
+
+fn init() {
+    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
+        .is_test(true)
+        .try_init();
+}
+
+#[test]
+fn wgs_bam_stats() -> anyhow::Result<()> {
+    init();
+    let case_id = "HOULNE";
+    let time = "diag";
+
+    info!("WGSBamStats::new");
+
+    let r = WGSBamStats::new(case_id, time, &Config::default())?;
+    println!("{r}");
+    assert_eq!(r.all_records, 22417713);
+    let c = r.karyotype.iter().find_map(|e| if e.2 == "chr9" { Some(e.6) } else { None }).unwrap();
+    assert_eq!(c, 0.8800890147031387);
+
+    Ok(())
+}
+
+#[test]
+fn wgs_bam_new() -> anyhow::Result<()> {
+    init();
+    let case_id = "HOULNE";
+    let time = "diag";
+
+    info!("WGSBam::new");
+
+    let c = Config::default();
+
+    let b = WGSBam::new(c.solo_bam(case_id, time).into())?;
+
+    println!("{:?}", b.composition);
+
+    Ok(())
+}
+
+#[test]
+fn bam_c() -> anyhow::Result<()> {
+    init();
+    let case_id = "HOULNE";
+    let time = "diag";
+
+    info!("WGSBam::new");
+
+    let c = Config::default();
+
+    let compo = bam_composition(&c.solo_bam(case_id, time), 20000)?;
+
+
+    println!("{:?}", compo);
+
+    Ok(())
+}

+ 3 - 3
src/commands/dorado.rs

@@ -9,7 +9,7 @@ use log::{debug, info, warn};
 use uuid::Uuid;
 
 use crate::{
-    collection::{bam::bam_compo, flowcells::FlowCell, pod5::FlowCellCase},
+    collection::{bam::bam_composition, flowcells::FlowCell, pod5::FlowCellCase},
     config::Config,
     helpers::find_unique_file,
     io::pod5_infos::Pod5Info,
@@ -209,11 +209,11 @@ impl Dorado {
     }
 
     pub fn merge_bam(&self, bam: &Path) -> anyhow::Result<()> {
-        let composition_a: Vec<String> = bam_compo(bam.to_string_lossy().as_ref(), 20000)?
+        let composition_a: Vec<String> = bam_composition(bam.to_string_lossy().as_ref(), 20000)?
             .iter()
             .map(|(i, _, _)| i.clone())
             .collect();
-        let composition_b: Vec<String> = bam_compo(&self.bam, 20000)?
+        let composition_b: Vec<String> = bam_composition(&self.bam, 20000)?
             .iter()
             .map(|(i, _, _)| i.clone())
             .collect();

+ 0 - 9
src/lib.rs

@@ -1070,15 +1070,6 @@ let variant: VcfVariant = row.parse()?;
         Ok(())
     }
 
-    #[test]
-    fn coverage() -> anyhow::Result<()> {
-        init();
-        let id = "CAMARA";
-        let time = "diag";
-        WGSBamStats::new(id, time, Config::default())?.print();
-        Ok(())
-    }
-
     #[test]
     fn load_bam() -> anyhow::Result<()> {
         init();