Thomas 6 months ago
parent
commit
c7fed14ed0
1 changed file with 80 additions and 36 deletions
  1. 80 36
      src/collection/bam.rs

+ 80 - 36
src/collection/bam.rs

@@ -587,8 +587,36 @@ pub struct WGSBamStats {
 }
 
 impl WGSBamStats {
-    pub fn new(id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
-        let bam_path = config.solo_bam(id, time);
+    /// Creates a new `WGSBamStats` by scanning the BAM file for a given case and time,
+    /// computing mapping statistics and coverage.
+    ///
+    /// # Parameters
+    ///
+    /// - `case_id`: Identifier for the sample/case, used to locate the BAM via `config.solo_bam`.
+    /// - `time`: Timestamp string included in progress log messages.
+    /// - `config`: Configuration struct providing:
+    ///     - `solo_bam(case_id, time)`: path to the BAM file,
+    ///     - `bam_min_mapq`: minimum mapping quality filter,
+    ///     - `bam_n_threads`: number of threads for decompression.
+    ///
+    /// # Returns
+    ///
+    /// A populated `WGSBamStats` with fields:
+    /// - `all_records`: total BAM records before filtering,
+    /// - `n_reads`: number of reads that passed all filters,
+    /// - `mapped_yield`: total sum of mapped read lengths,
+    /// - `global_coverage`: `mapped_yield / genome_size`,
+    /// - `karyotype`: vector of `(tid, contig_len, contig_name, mapped_sum)` per contig,
+    /// - `n50`: N50 of the mapped lengths,
+    /// - `by_lengths`: histogram of mapped-read lengths as `(length, count)`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - the BAM file cannot be opened or parsed,
+    /// - genome sizes cannot be retrieved.
+    pub fn new(case_id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
+        let bam_path = config.solo_bam(case_id, time);
         let Config {
             bam_min_mapq,
             bam_n_threads,
@@ -599,53 +627,69 @@ impl WGSBamStats {
         let header = rust_htslib::bam::Header::from_template(&h);
         bam.set_threads(bam_n_threads as usize)?;
 
-        let mut mapped_lengths = Vec::new();
+        // 2) Sequential scan
         let mut all_records = 0;
         let mut n_reads = 0;
+        let mut mapped_lengths = Vec::new();
         let mut lengths_by_tid: HashMap<i32, Vec<u128>> = HashMap::new();
 
-        for read in bam
-            .rc_records()
-            .map(|x| x.expect("Failure parsing Bam file"))
-            .inspect(|_| all_records += 1)
-            .filter(|read| {
-                read.flags()
-                    & (rust_htslib::htslib::BAM_FUNMAP
-                        | rust_htslib::htslib::BAM_FSECONDARY
-                        | rust_htslib::htslib::BAM_FQCFAIL
-                        | rust_htslib::htslib::BAM_FDUP) as u16
-                    == 0
-            })
-            .filter(|r| r.mapq() >= bam_min_mapq)
-        {
+        for rec in bam.rc_records() {
+            all_records += 1;
+            let r = rec.with_context(|| "failed to parse BAM record")?;
+
+            if (r.flags()
+                & (rust_htslib::htslib::BAM_FUNMAP
+                    | rust_htslib::htslib::BAM_FSECONDARY
+                    | rust_htslib::htslib::BAM_FQCFAIL
+                    | rust_htslib::htslib::BAM_FDUP) as u16
+                != 0)
+                || r.mapq() < bam_min_mapq
+            {
+                continue;
+            }
+
             n_reads += 1;
-            let mapped_len = (read.reference_end() - read.pos()) as u128;
-            mapped_lengths.push(mapped_len);
-            lengths_by_tid
-                .entry(read.tid())
-                .or_default()
-                .push(mapped_len);
+            let len = (r.reference_end() - r.pos()) as u128;
+            mapped_lengths.push(len);
+            lengths_by_tid.entry(r.tid()).or_default().push(len);
+
             if n_reads % 500_000 == 0 {
-                info!("{n_reads} reads parsed");
+                info!("{case_id} {time}: processed {} mapped reads", n_reads);
             }
         }
 
-        let mapped_yield = mapped_lengths.par_iter().sum::<u128>();
+        // 3) Yield & coverage
+        let mapped_yield: u128 = mapped_lengths.iter().sum();
         let genome = get_genome_sizes(&header)?;
-        let genome_size = genome.values().map(|v| *v as u128).sum::<u128>();
+        let genome_size: u128 = genome.values().map(|&v| v as u128).sum();
         let global_coverage = mapped_yield as f64 / genome_size as f64;
 
-        let by_lengths: DashMap<u128, u32> = DashMap::new();
-        mapped_lengths
-            .par_iter()
-            .for_each(|l| *by_lengths.entry(*l).or_default() += 1);
+        // 4) N50 (parallel sort)
+        let mut lens = mapped_lengths.clone();
+        lens.par_sort_unstable();
+        let half = mapped_yield / 2;
+        let mut cum = 0;
+        let n50 = lens
+            .into_iter()
+            .rev()
+            .find(|&l| {
+                cum += l;
+                cum >= half
+            })
+            .unwrap_or(0);
 
-        let mut by_lengths: Vec<(u128, u32)> =
-            by_lengths.iter().map(|e| (*e.key(), *e.value())).collect();
-        by_lengths.par_sort_by(|a, b| a.0.cmp(&b.0));
-        // This is not n50
-        let n50 = by_lengths[by_lengths.len() / 2].0;
+        // 5) Length histogram (parallel)
+        let by_lengths: Vec<(u128, u32)> = {
+            let freq: DashMap<u128, u32> = DashMap::new();
+            mapped_lengths.par_iter().for_each(|&l| {
+                *freq.entry(l).or_default() += 1;
+            });
+            let mut v: Vec<_> = freq.iter().map(|e| (*e.key(), *e.value())).collect();
+            v.par_sort_unstable_by_key(|&(len, _)| len);
+            v
+        };
 
+        // 6) Karyotype
         let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid
             .iter()
             .map(|(tid, lengths)| {
@@ -659,7 +703,7 @@ impl WGSBamStats {
             })
             .collect();
 
-        karyotype.sort_by(|(ac, _, _, _), (bc, _, _, _)| ac.cmp(bc));
+        karyotype.par_sort_unstable_by_key(|&(tid, _, _, _)| tid);
 
         Ok(Self {
             all_records,