Thomas преди 6 месеца
родител
ревизия
df2a122947
променени са 1 файла, в които са добавени 53 реда и са изтрити 70 реда
  1. 53 70
      src/collection/bam.rs

+ 53 - 70
src/collection/bam.rs

@@ -117,10 +117,9 @@ impl WGSBam {
         //     None
         // };
 
-        let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(format!(
-            "Error while reading BAM composition for {}",
-            path.display()
-        ))?;
+        let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(
+            format!("Error while reading BAM composition for {}", path.display()),
+        )?;
 
         let s = Self {
             path,
@@ -285,14 +284,19 @@ pub fn bam_composition(
     file_path: &str,
     sample_size: usize,
 ) -> anyhow::Result<Vec<(String, String, f64)>> {
+    use rust_htslib::bam::{record::Aux, Reader};
     use std::collections::HashMap;
-    use rust_htslib::bam::{Reader, record::Aux};
 
     let mut bam = Reader::from_path(file_path)?;
     let mut rgs: HashMap<(String, String), u64> = HashMap::new();
 
     let mut processed = 0u64;
-    for (i, rec) in bam.records().filter_map(Result::ok).take(sample_size).enumerate() {
+    for (i, rec) in bam
+        .records()
+        .filter_map(Result::ok)
+        .take(sample_size)
+        .enumerate()
+    {
         let rg = match rec.aux(b"RG") {
             Ok(Aux::String(s)) => s,
             Ok(_) => return Err(anyhow::anyhow!("Record {}: RG tag is not a string", i + 1)),
@@ -305,17 +309,14 @@ pub fn bam_composition(
         };
         let rg_prefix = rg.split_once('_').map(|(a, _)| a).unwrap_or(rg);
         let fn_prefix = fn_tag.split_once('_').map(|(b, _)| b).unwrap_or(fn_tag);
-        *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string())).or_default() += 1;
+        *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string()))
+            .or_default() += 1;
         processed += 1;
     }
 
     let results: Vec<_> = rgs
         .into_iter()
-        .map(|((rg, fn_tag), n)| (
-            rg,
-            fn_tag,
-            n as f64 * 100.0 / processed as f64
-        ))
+        .map(|((rg, fn_tag), n)| (rg, fn_tag, n as f64 * 100.0 / processed as f64))
         .collect();
 
     Ok(results)
@@ -979,15 +980,18 @@ pub enum PileBase {
     G,
     T,
     N,
-    Ins,                 // insertion immediately after the queried base
-    Del((Vec<u8>, u32)), // deletion covering the queried base
-    Skip,                // reference skip (N)
+    /// insertion immediately after the queried base
+    Ins,
+    /// deletion covering the queried base: (qname, del_len)
+    Del((Vec<u8>, u32)),
+    /// reference skip (N)
+    Skip,
 }
 
-/// Decode one HTSlib 4bit nucleotide (0=A,1=C,2=G,3=T,4=N, …) to `Base`
+/// Decode one HTSlib 4-bit nucleotide (0=A,1=C,2=G,3=T,4=N, …) to `PileBase`
 #[inline]
-fn decode(b: u8) -> PileBase {
-    match b & 0b1111 {
+fn decode(nibble: u8) -> PileBase {
+    match nibble & 0x0f {
         0 => PileBase::A,
         1 => PileBase::C,
         2 => PileBase::G,
@@ -996,31 +1000,24 @@ fn decode(b: u8) -> PileBase {
     }
 }
 
-pub fn base_at_new(
-    record: &rust_htslib::bam::Record,
-    at_pos: i64,
-    with_next_ins: bool,
-) -> Option<PileBase> {
+pub fn base_at_new(record: &rust_htslib::bam::Record, at_pos: i64, with_next_ins: bool) -> Option<PileBase> {
     if record.pos() > at_pos {
         return None;
     }
 
     let mut ref_pos = record.pos();
-    let mut read_pos = 0_i64;
+    let mut read_pos: i64 = 0;
     let cigar = record.cigar();
-    let seq = record.seq().as_bytes(); // already expanded to ASCII (A,C,G,T,N)
 
     for (i, op) in cigar.iter().enumerate() {
-        let l = op.len() as i64;
-        let (consume_read, consume_ref) = match op.char() {
-            'M' | '=' | 'X' => (l, l),
-            'I' | 'S' => (l, 0),
-            'D' | 'N' => (0, l),
-            'H' | 'P' => (0, 0),
-            _ => unreachable!(),
+        let (consume_read, consume_ref) = match *op {
+            Cigar::Match(l) | Cigar::Equal(l) | Cigar::Diff(l) => (l as i64, l as i64),
+            Cigar::Ins(l) | Cigar::SoftClip(l) => (l as i64, 0),
+            Cigar::Del(l) | Cigar::RefSkip(l) => (0, l as i64),
+            Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
         };
 
-        /* look ahead for an insertion immediately AFTER the queried base */
+        // Look-ahead: insertion immediately AFTER the queried base?
         if with_next_ins
             && ref_pos + consume_ref == at_pos + 1
             && matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
@@ -1028,25 +1025,21 @@ pub fn base_at_new(
             return Some(PileBase::Ins);
         }
 
-        // Instead of PileBase::Ins, consider
-        // extracting the inserted sequence and returning its length or actual bases:
-        // if let Some(Cigar::Ins(len)) = cigar.get(i + 1) {
-        //     let ins_seq = &seq[read_pos as usize + (consume_ref as usize)
-        //         ..read_pos as usize + (consume_ref as usize) + (*len as usize)];
-        //     return Some(PileBase::Ins(ins_seq.to_vec()));
-        // }
-
-        /* Does the queried reference coordinate fall inside this CIGAR op? */
+        // Does the queried reference coordinate fall inside this CIGAR op?
         if ref_pos + consume_ref > at_pos {
-            return Some(match op {
-                Cigar::Del(e) => PileBase::Del((record.qname().to_vec(), *e)),
+            return Some(match *op {
+                Cigar::Del(l) => PileBase::Del((record.qname().to_vec(), l)),
                 Cigar::RefSkip(_) => PileBase::Skip,
+                // Match/mismatch or soft-clipped covering (only match/mismatch should consume ref)
                 _ => {
                     let offset = (at_pos - ref_pos) as usize;
-                    if read_pos as usize + offset >= seq.len() {
-                        return None; // out of range, malformed BAM
+                    // Guard against malformed records
+                    let seq_len = record.seq_len();
+                    if (read_pos as usize) + offset >= seq_len {
+                        return None;
                     }
-                    decode(seq[read_pos as usize + offset])
+                    let b4 = record.seq().encoded_base((read_pos as usize) + offset);
+                    decode(b4)
                 }
             });
         }
@@ -1058,57 +1051,47 @@ pub fn base_at_new(
     None // beyond alignment
 }
 
-/// Return the pile‑up of **all reads** at `chr:pos` (0‑based, inclusive).
+/// Return the pile-up of **all reads** at `chr:pos` (0-based, inclusive).
 ///
-/// * `bam`           – an open [`rust_htslib::bam::IndexedReader`].  
-/// * `chr` / `pos`   – reference contig name and coordinate.  
-/// * `with_next_ins` – if `true`, report [`Base::Ins`] when an insertion starts
+/// * `bam`           – an open `rust_htslib::bam::IndexedReader`.
+/// * `chr` / `pos`   – reference contig name and coordinate.
+/// * `with_next_ins` – if `true`, report `PileBase::Ins` when an insertion starts
 ///                     **right after** the queried base.
 ///
-/// The function bounds the internal pile‑up depth to 10 000 reads to protect
+/// The function bounds the internal pile-up depth to 10 000 reads to protect
 /// against malformed BAM files that could otherwise allocate unbounded memory.
-///
-/// # Errors
-///
-/// Propagates any I/O or parsing error from *rust‑htslib*, wrapped in
-/// [`anyhow::Error`].
 pub fn nt_pileup_new(
     bam: &mut rust_htslib::bam::IndexedReader,
     chr: &str,
     pos: u32,
     with_next_ins: bool,
 ) -> anyhow::Result<Vec<PileBase>> {
-    // Storage for the per‑read observations.
     let mut pile = Vec::new();
 
-    // Restrict BAM iterator to the onebase span of interest.
+    // Restrict BAM iterator to the one-base span of interest.
     bam.fetch((chr, pos, pos + 1))?;
 
-    // Hard cap on depth (adjust if your use‑case needs deeper coverage).
-    bam.pileup().set_max_depth(10_000);
+    // Apply the depth cap to the SAME iterator we traverse.
+    let mut pls = bam.pileup();
+    pls.set_max_depth(10_000);
 
-    // Stream the pileup; HTSlib walks the reads for us.
-    for pileup in bam.pileup() {
-        // Attach context to any low‑level error.
+    for pileup in pls {
         let pileup = pileup.context(format!("Failed to pileup BAM at {chr}:{pos}"))?;
-
-        // Skip other positions quickly.
         if pileup.pos() != pos {
             continue;
         }
 
         for aln in pileup.alignments() {
-            // Handle the three indel states reported by HTSlib.
             match aln.indel() {
                 rust_htslib::bam::pileup::Indel::Ins(_) => {
-                    pile.push(PileBase::Ins); // insertion *starts at* this base
+                    // insertion *starts at* this reference position
+                    pile.push(PileBase::Ins);
                 }
                 rust_htslib::bam::pileup::Indel::Del(e) => {
                     let qname = aln.record().qname().to_vec();
-                    pile.push(PileBase::Del((qname, e))); // deletion length > 1, but still covers here
+                    pile.push(PileBase::Del((qname, e)));
                 }
                 rust_htslib::bam::pileup::Indel::None => {
-                    // Regular match/mismatch: delegate to `base_at`.
                     if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins) {
                         pile.push(base);
                     }