@@ -587,8 +587,36 @@ pub struct WGSBamStats {
 }

 impl WGSBamStats {
-    pub fn new(id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
-        let bam_path = config.solo_bam(id, time);
+    /// Creates a new `WGSBamStats` by scanning the BAM file for a given case and time,
+    /// computing mapping statistics and coverage.
+    ///
+    /// # Parameters
+    ///
+    /// - `case_id`: Identifier for the sample/case, used to locate the BAM via `config.solo_bam`.
+    /// - `time`: Timepoint identifier, used with `case_id` to locate the BAM and included in progress log messages.
+    /// - `config`: Configuration struct providing:
+    ///   - `solo_bam(case_id, time)`: path to the BAM file,
+    ///   - `bam_min_mapq`: minimum mapping quality filter,
+    ///   - `bam_n_threads`: number of threads for decompression.
+    ///
+    /// # Returns
+    ///
+    /// A populated `WGSBamStats` with fields:
+    /// - `all_records`: total BAM records before filtering,
+    /// - `n_reads`: number of reads that passed all filters,
+    /// - `mapped_yield`: total sum of mapped read lengths,
+    /// - `global_coverage`: `mapped_yield / genome_size`,
+    /// - `karyotype`: vector of `(tid, contig_len, contig_name, mapped_sum)` per contig,
+    /// - `n50`: N50 of the mapped lengths,
+    /// - `by_lengths`: histogram of mapped-read lengths as `(length, count)`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - the BAM file cannot be opened or parsed,
+    /// - genome sizes cannot be retrieved.
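+    ///
+    /// # Example
+    ///
+    /// A minimal usage sketch; the case/time identifiers and the way the `Config`
+    /// value is obtained are illustrative, not part of this API:
+    ///
+    /// ```ignore
+    /// let config = load_config()?; // however the surrounding code builds its Config
+    /// let stats = WGSBamStats::new("case_001", "T0", config)?;
+    /// println!("global coverage: {:.2}x", stats.global_coverage);
+    /// ```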
+    pub fn new(case_id: &str, time: &str, config: Config) -> anyhow::Result<Self> {
+        let bam_path = config.solo_bam(case_id, time);
         let Config {
             bam_min_mapq,
             bam_n_threads,
@@ -599,53 +627,69 @@ impl WGSBamStats {
         let header = rust_htslib::bam::Header::from_template(&h);
         bam.set_threads(bam_n_threads as usize)?;

-        let mut mapped_lengths = Vec::new();
+        // 2) Sequential scan
         let mut all_records = 0;
         let mut n_reads = 0;
+        let mut mapped_lengths = Vec::new();
         let mut lengths_by_tid: HashMap<i32, Vec<u128>> = HashMap::new();

-        for read in bam
-            .rc_records()
-            .map(|x| x.expect("Failure parsing Bam file"))
-            .inspect(|_| all_records += 1)
-            .filter(|read| {
-                read.flags()
-                    & (rust_htslib::htslib::BAM_FUNMAP
-                        | rust_htslib::htslib::BAM_FSECONDARY
-                        | rust_htslib::htslib::BAM_FQCFAIL
-                        | rust_htslib::htslib::BAM_FDUP) as u16
-                    == 0
-            })
-            .filter(|r| r.mapq() >= bam_min_mapq)
-        {
+        for rec in bam.rc_records() {
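+            // Count every record seen, including those filtered out below.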
+            all_records += 1;
+            let r = rec.with_context(|| "failed to parse BAM record")?;
+
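+            // Skip unmapped, secondary, QC-failed, duplicate, and low-MAPQ reads.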
+            if (r.flags()
+                & (rust_htslib::htslib::BAM_FUNMAP
+                    | rust_htslib::htslib::BAM_FSECONDARY
+                    | rust_htslib::htslib::BAM_FQCFAIL
+                    | rust_htslib::htslib::BAM_FDUP) as u16
+                != 0)
+                || r.mapq() < bam_min_mapq
+            {
+                continue;
+            }
+
             n_reads += 1;
-            let mapped_len = (read.reference_end() - read.pos()) as u128;
-            mapped_lengths.push(mapped_len);
-            lengths_by_tid
-                .entry(read.tid())
-                .or_default()
-                .push(mapped_len);
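+            // Mapped length = reference span covered by the alignment (reference_end - pos).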
+            let len = (r.reference_end() - r.pos()) as u128;
+            mapped_lengths.push(len);
+            lengths_by_tid.entry(r.tid()).or_default().push(len);
+
             if n_reads % 500_000 == 0 {
-                info!("{n_reads} reads parsed");
+                info!("{case_id} {time}: processed {} mapped reads", n_reads);
             }
         }

-        let mapped_yield = mapped_lengths.par_iter().sum::<u128>();
+        // 3) Yield & coverage
+        let mapped_yield: u128 = mapped_lengths.iter().sum();
         let genome = get_genome_sizes(&header)?;
-        let genome_size = genome.values().map(|v| *v as u128).sum::<u128>();
+        let genome_size: u128 = genome.values().map(|&v| v as u128).sum();
         let global_coverage = mapped_yield as f64 / genome_size as f64;

-        let by_lengths: DashMap<u128, u32> = DashMap::new();
-        mapped_lengths
-            .par_iter()
-            .for_each(|l| *by_lengths.entry(*l).or_default() += 1);
+        // 4) N50 (parallel sort)
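+        // N50: length L such that reads of length >= L make up at least half of the mapped yield.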
+        let mut lens = mapped_lengths.clone();
+        lens.par_sort_unstable();
+        let half = mapped_yield / 2;
+        let mut cum = 0;
+        let n50 = lens
+            .into_iter()
+            .rev()
+            .find(|&l| {
+                cum += l;
+                cum >= half
+            })
+            .unwrap_or(0);

-        let mut by_lengths: Vec<(u128, u32)> =
-            by_lengths.iter().map(|e| (*e.key(), *e.value())).collect();
-        by_lengths.par_sort_by(|a, b| a.0.cmp(&b.0));
-        // This is not n50
-        let n50 = by_lengths[by_lengths.len() / 2].0;
+        // 5) Length histogram (parallel)
+        let by_lengths: Vec<(u128, u32)> = {
+            let freq: DashMap<u128, u32> = DashMap::new();
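+            // DashMap allows concurrent updates from rayon worker threads (internally sharded locking).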
+            mapped_lengths.par_iter().for_each(|&l| {
+                *freq.entry(l).or_default() += 1;
+            });
+            let mut v: Vec<_> = freq.iter().map(|e| (*e.key(), *e.value())).collect();
+            v.par_sort_unstable_by_key(|&(len, _)| len);
+            v
+        };

+        // 6) Karyotype
         let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid
             .iter()
             .map(|(tid, lengths)| {
@@ -659,7 +703,7 @@ impl WGSBamStats {
             })
             .collect();

-        karyotype.sort_by(|(ac, _, _, _), (bc, _, _, _)| ac.cmp(bc));
+        karyotype.par_sort_unstable_by_key(|&(tid, _, _, _)| tid);

         Ok(Self {
             all_records,