Thomas 6 months ago
parent
commit
91aa4d1732
2 changed files with 46 additions and 21 deletions
  1. 42 19
      src/collection/bam.rs
  2. 4 2
      src/collection/flowcells.rs

+ 42 - 19
src/collection/bam.rs

@@ -229,19 +229,32 @@ pub fn load_bam_collection(result_dir: &str) -> BamCollection {
     BamCollection { bams }
 }
 
-pub fn bam_compo(file_path: &str, sample_size: usize) -> anyhow::Result<Vec<(String, String, f64)>> {
+pub fn bam_compo(
+    file_path: &str,
+    sample_size: usize,
+) -> anyhow::Result<Vec<(String, String, f64)>> {
     let mut bam = rust_htslib::bam::Reader::from_path(file_path)?;
 
-    let mut rgs: HashMap<( String, String ), u64> = HashMap::new();
+    let mut rgs: HashMap<(String, String), u64> = HashMap::new();
     for result in bam.records().filter_map(Result::ok).take(sample_size) {
-        if let ( rust_htslib::bam::record::Aux::String(s),  rust_htslib::bam::record::Aux::String(b)) = ( result.aux(b"RG")?, result.aux(b"fn")? ) {
-            *rgs.entry(( s.to_string(), b.to_string())).or_default() += 1;
+        if let (
+            rust_htslib::bam::record::Aux::String(s),
+            rust_htslib::bam::record::Aux::String(b),
+        ) = (result.aux(b"RG")?, result.aux(b"fn")?)
+        {
+            *rgs.entry((s.to_string(), b.to_string())).or_default() += 1;
         }
     }
 
     Ok(rgs
         .into_iter()
-        .map(|(k, n)| (k.0.to_string(), k.1.to_string(), n as f64 * 100.0 / sample_size as f64))
+        .map(|(k, n)| {
+            (
+                k.0.to_string(),
+                k.1.to_string(),
+                n as f64 * 100.0 / sample_size as f64,
+            )
+        })
         .map(|(rg, f, p)| {
             let (a, _) = rg.split_once('_').unwrap();
             let (b, _) = f.split_once('_').unwrap();
@@ -722,9 +735,9 @@ pub fn base_at_new(
     record: &rust_htslib::bam::Record,
     at_pos: i64,
     with_next_ins: bool,
-) -> anyhow::Result<Option<PileBase>> {
+) -> Option<PileBase> {
     if record.pos() > at_pos {
-        return Ok(None);
+        return None;
     }
 
     let mut ref_pos = record.pos();
@@ -733,11 +746,13 @@ pub fn base_at_new(
     let seq = record.seq().as_bytes(); // already expanded to ASCII (A,C,G,T,N)
 
     for (i, op) in cigar.iter().enumerate() {
-        let (consume_r, consume_ref) = match op {
-            Cigar::Match(l) | Cigar::Equal(l) | Cigar::Diff(l) => (*l as i64, *l as i64),
-            Cigar::Ins(l) | Cigar::SoftClip(l) => (*l as i64, 0),
-            Cigar::Del(l) | Cigar::RefSkip(l) => (0, *l as i64),
-            _ => (0, 0),
+        let l = op.len() as i64;
+        let (consume_read, consume_ref) = match op.char() {
+            'M' | '=' | 'X' => (l, l),
+            'I' | 'S' => (l, 0),
+            'D' | 'N' => (0, l),
+            'H' | 'P' => (0, 0),
+            _ => unreachable!(),
         };
 
         /* look ahead for an insertion immediately AFTER the queried base */
@@ -745,29 +760,37 @@ pub fn base_at_new(
             && ref_pos + consume_ref == at_pos + 1
             && matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
         {
-            return Ok(Some(PileBase::Ins));
+            return Some(PileBase::Ins);
         }
 
+        // Instead of PileBase::Ins, consider
+        // extracting the inserted sequence and returning its length or actual bases:
+        // if let Some(Cigar::Ins(len)) = cigar.get(i + 1) {
+        //     let ins_seq = &seq[read_pos as usize + (consume_ref as usize)
+        //         ..read_pos as usize + (consume_ref as usize) + (*len as usize)];
+        //     return Some(PileBase::Ins(ins_seq.to_vec()));
+        // }
+
         /* Does the queried reference coordinate fall inside this CIGAR op? */
         if ref_pos + consume_ref > at_pos {
-            return Ok(Some(match op {
+            return Some(match op {
                 Cigar::Del(e) => PileBase::Del((record.qname().to_vec(), *e)),
                 Cigar::RefSkip(_) => PileBase::Skip,
                 _ => {
                     let offset = (at_pos - ref_pos) as usize;
                     if read_pos as usize + offset >= seq.len() {
-                        return Ok(None); // out‑of‑range, malformed BAM
+                        return None; // out of range, malformed BAM
                     }
                     decode(seq[read_pos as usize + offset])
                 }
-            }));
+            });
         }
 
-        read_pos += consume_r;
+        read_pos += consume_read;
         ref_pos += consume_ref;
     }
 
-    Ok(None) // beyond alignment
+    None // beyond alignment
 }
 
 /// Return the pile‑up of **all reads** at `chr:pos` (0‑based, inclusive).
@@ -821,7 +844,7 @@ pub fn nt_pileup_new(
                 }
                 rust_htslib::bam::pileup::Indel::None => {
                     // Regular match/mismatch: delegate to `base_at`.
-                    if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins)? {
+                    if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins) {
                         pile.push(base);
                     }
                 }

+ 4 - 2
src/collection/flowcells.rs

@@ -15,7 +15,8 @@ use serde::{Deserialize, Serialize};
 
 use crate::{
     collection::minknow::{parse_pore_activity_from_reader, parse_throughput_from_reader},
-    helpers::{find_files, list_directories}, io::{readers::get_gz_reader, writers::get_gz_writer},
+    helpers::{find_files, list_directories},
+    io::{readers::get_gz_reader, writers::get_gz_writer},
 };
 
 use super::minknow::{MinKnowSampleSheet, PoreStateEntry, PoreStateEntryExt, ThroughputEntry};
@@ -328,7 +329,8 @@ impl FlowCells {
         // Load existing archive, if any
         let mut all: Vec<FlowCell> = if Path::new(save_path).exists() {
             let file = File::open(save_path)?;
-            serde_json::from_reader(BufReader::new(file))?
+            serde_json::from_reader(BufReader::new(file))
+                .map_err(|e| anyhow::anyhow!("Failed to parse: {save_path}.\n\t{e}"))?
         } else {
             Vec::new()
         };