Thomas 7 月之前
父节点
当前提交
2c78c509a2
共有 2 个文件被更改,包括 30 次插入18 次删除
  1. 13 10
      src/collection/bam.rs
  2. 17 8
      src/collection/tests.rs

+ 13 - 10
src/collection/bam.rs

@@ -245,14 +245,14 @@ pub fn load_bam_collection(result_dir: &str) -> BamCollection {
 /// # Returns
 ///
 /// Returns a vector of tuples, each containing:
-///   - `String`: the prefix of the `RG` tag (up to the first underscore, or full tag if no underscore)
-///   - `String`: the prefix of the `fn` tag (similarly processed)
-///   - `f64`: percentage of records with this `(RG, fn)` prefix pair, relative to total successfully processed records
+///   - `String`: the <runid> (up to the first underscore of the `RG` tag, or full tag if no underscore)
+///   - `String`: the <flowcell-id> (up to the first underscore of the `fn` tag, or full tag if no underscore)
+///   - `f64`: percentage of records with this `(runid, flowcell-id)` prefix pair, relative to total successfully processed records
 ///
 /// # Errors
 ///
 /// Returns an error if the BAM file cannot be opened or read.
-/// Records missing the `RG` or `fn` tag (or with wrong type) are silently skipped.
+/// Any record in the sampled set does not have both required tags, or the tags are not strings.
 ///
 /// # Examples
 ///
@@ -279,6 +279,7 @@ pub fn load_bam_collection(result_dir: &str) -> BamCollection {
 /// # See also
 ///
 /// - [SAM/BAM tag conventions](https://samtools.github.io/hts-specs/SAMv1.pdf)
+/// - [Dorado SAM specifications](https://github.com/nanoporetech/dorado/blob/release-v1.0/documentation/SAM.md)
 ///
 pub fn bam_composition(
     file_path: &str,
@@ -289,21 +290,23 @@ pub fn bam_composition(
 
     let mut bam = Reader::from_path(file_path)?;
     let mut rgs: HashMap<(String, String), u64> = HashMap::new();
-    let mut total = 0u64;
 
-    for rec in bam.records().filter_map(Result::ok).take(sample_size) {
+    let mut processed = 0u64;
+    for (i, rec) in bam.records().filter_map(Result::ok).take(sample_size).enumerate() {
         let rg = match rec.aux(b"RG") {
             Ok(Aux::String(s)) => s,
-            _ => continue,
+            Ok(_) => return Err(anyhow::anyhow!("Record {}: RG tag is not a string", i + 1)),
+            Err(_) => return Err(anyhow::anyhow!("Record {}: RG tag missing", i + 1)),
         };
         let fn_tag = match rec.aux(b"fn") {
             Ok(Aux::String(b)) => b,
-            _ => continue,
+            Ok(_) => return Err(anyhow::anyhow!("Record {}: fn tag is not a string", i + 1)),
+            Err(_) => return Err(anyhow::anyhow!("Record {}: fn tag missing", i + 1)),
         };
         let rg_prefix = rg.split_once('_').map(|(a, _)| a).unwrap_or(rg);
         let fn_prefix = fn_tag.split_once('_').map(|(b, _)| b).unwrap_or(fn_tag);
         *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string())).or_default() += 1;
-        total += 1;
+        processed += 1;
     }
 
     let results: Vec<_> = rgs
@@ -311,7 +314,7 @@ pub fn bam_composition(
         .map(|((rg, fn_tag), n)| (
             rg,
             fn_tag,
-            n as f64 * 100.0 / total as f64
+            n as f64 * 100.0 / processed as f64
         ))
         .collect();
 

+ 17 - 8
src/collection/tests.rs

@@ -1,6 +1,9 @@
 use log::info;
 
-use crate::{collection::bam::{bam_composition, WGSBam, WGSBamStats}, config::Config};
+use crate::{
+    collection::bam::{bam_composition, WGSBam, WGSBamStats},
+    config::Config,
+};
 
 fn init() {
     let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
@@ -9,7 +12,7 @@ fn init() {
 }
 
 #[test]
-fn wgs_bam_stats() -> anyhow::Result<()> {
+fn bam_stats() -> anyhow::Result<()> {
     init();
     let case_id = "HOULNE";
     let time = "diag";
@@ -19,14 +22,18 @@ fn wgs_bam_stats() -> anyhow::Result<()> {
     let r = WGSBamStats::new(case_id, time, &Config::default())?;
     println!("{r}");
     assert_eq!(r.all_records, 22417713);
-    let c = r.karyotype.iter().find_map(|e| if e.2 == "chr9" { Some(e.6) } else { None }).unwrap();
+    let c = r
+        .karyotype
+        .iter()
+        .find_map(|e| if e.2 == "chr9" { Some(e.6) } else { None })
+        .unwrap();
     assert_eq!(c, 0.8800890147031387);
 
     Ok(())
 }
 
 #[test]
-fn wgs_bam_new() -> anyhow::Result<()> {
+fn bam_new() -> anyhow::Result<()> {
     init();
     let case_id = "HOULNE";
     let time = "diag";
@@ -48,14 +55,16 @@ fn bam_c() -> anyhow::Result<()> {
     let case_id = "HOULNE";
     let time = "diag";
 
-    info!("WGSBam::new");
+    info!("bam_composition");
 
     let c = Config::default();
 
-    let compo = bam_composition(&c.solo_bam(case_id, time), 20000)?;
-
+    let compo = bam_composition(&c.solo_bam(case_id, time), 20_000)?;
+    assert_eq!(compo.len(), 2);
 
-    println!("{:?}", compo);
+    for (rg, flowcell, pct) in compo {
+        println!("{flowcell} {rg}: {pct:.2}%");
+    }
 
     Ok(())
 }