Browse Source

SA tag CIGAR != corresponding supplementary CIGAR!

Thomas 1 year ago
parent
commit
f84fb56ccd
3 changed files with 199 additions and 84 deletions
  1. 125 0
      src/bam.rs
  2. 44 82
      src/bin.rs
  3. 30 2
      src/lib.rs

+ 125 - 0
src/bam.rs

@@ -0,0 +1,125 @@
+use std::collections::HashMap;
+
+use anyhow::Context;
+use rust_htslib::bam::{
+    ext::BamRecordExtensions, record::{Aux, Cigar, CigarString}, Header, IndexedReader, Read, Record,
+};
+
+pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
+    if record.is_supplementary() {
+        let qname = record.qname();
+
+        if let Ok(Aux::String(sa)) = record.aux(b"SA") {
+            let positions: Vec<(&str, i32)> = sa
+                .split(';')
+                .filter(|s| !s.is_empty())
+                .map(|s| {
+                    let parts: Vec<&str> = s.split(',').take(2).collect();
+                    let chr = parts.first().unwrap();
+                    let position: i32 = parts.get(1).unwrap().parse().unwrap();
+                    (*chr, position)
+                })
+                .collect();
+
+            for (chr, start) in positions {
+                bam.fetch((chr, start, start + 1)).unwrap();
+                for record in bam.records().flatten() {
+                    if qname == record.qname() && !record.is_supplementary() {
+                        return record.clone();
+                    }
+                }
+            }
+        }
+    }
+    record
+}
+
+pub fn get_all_positions(record: &Record, tid_2_contig: &HashMap<i32, String>, bam: &mut IndexedReader) {
+    let mut positions = Vec::new();
+    let qname = record.qname();
+    if let Some(contig) = tid_2_contig.get(&record.tid()) {
+        positions.push((contig.to_string(), record.reference_start() as u32));
+    }
+
+    if let Ok(Aux::String(sa)) = record.aux(b"SA") {
+        let positions: Vec<(&str, u32, u32)> = sa
+            .split(';')
+            .filter(|s| !s.is_empty())
+            .map(|s| {
+                let parts: Vec<&str> = s.split(',').take(4).collect();
+                let chr = parts.first().unwrap();
+                let start: u32 = parts.get(1).unwrap().parse().unwrap();
+                // SAM format is 1-based!
+                let start = start - 1;
+                let strand = parts.get(2).unwrap();
+                let cigar_str = *parts.get(3).unwrap();
+                let end = calculate_end_position(start, cigar_str).unwrap();
+
+                let mut founded = false;
+                bam.fetch((chr, start, start + 1)).unwrap();
+                for record in bam.records().flatten() {
+                        let rc = record.cigar().to_string();
+                    if qname == record.qname() && cigar_str == rc {
+                        founded = true;
+                        assert_eq!(start, record.reference_start() as u32, "start {chr}:{start}-{end} {strand} {cigar_str} {}", record.cigar());
+                        if end != record.reference_end() as u32 {
+                            println!("end {chr}:{start}-{end} {strand} {cigar_str} {}", record.cigar());
+                        }
+                    }
+                }
+                if !founded {
+                    println!("NOT FouNDED {chr}:{start}-{end} {strand} {cigar_str}");
+
+                }
+
+                (*chr, start, end)
+            })
+            .collect();
+
+        // for (chr, start) in positions {
+        //     bam.fetch((chr, start, start + 1)).unwrap();
+        //     for record in bam.records().flatten() {
+        //         if qname == record.qname() && !record.is_supplementary() {
+        //             return record.clone();
+        //         }
+        //     }
+        // }
+    }
+}
+
+fn calculate_end_position(start: u32, cigar_str: &str) -> Result<u32, String> {
+    let mut end = start;
+    let mut count = String::new();
+
+    for c in cigar_str.chars() {
+        if c.is_digit(10) {
+            count.push(c);
+        } else {
+            let len: u32 = count.parse().map_err(|e| format!("Parse error: {}", e))?;
+            count.clear();
+            match c {
+                'M' | 'D' | 'N' | '=' | 'X' => {
+                    end += len;
+                }
+                'I' | 'S' | 'H' | 'P' => {
+                    // These operations do not consume reference positions
+                }
+                _ => return Err(format!("Invalid CIGAR operation: {}", c)),
+            }
+        }
+    }
+
+    Ok(end)
+}
+
+pub fn create_tid_2_contig(bam_path: &str) -> anyhow::Result<HashMap<i32, String>> {
+    let bam_reader = rust_htslib::bam::IndexedReader::from_path(bam_path)
+            .context(format!("Can't open {}", bam_path))?;
+    let h = bam_reader.header();
+    let mut r = HashMap::new();
+    h.target_names().iter().enumerate().for_each(|(i, name)| {
+        let tid = (i + 1) as i32;
+        r.insert(tid, String::from_utf8(name.to_vec()).unwrap());
+    });
+    Ok(r)
+}

+ 44 - 82
src/bin.rs

@@ -2,7 +2,9 @@ use rayon::prelude::*;
 use std::{collections::HashMap, usize};
 
 use anyhow::Context;
-use rust_htslib::bam::{ext::BamRecordExtensions, record::Aux, Read, Record};
+use rust_htslib::bam::{ext::BamRecordExtensions, record::Aux, IndexedReader, Read, Record};
+
+use crate::bam::primary_record;
 
 /// Enforce that reads should have unique qnames
 #[derive(Debug)]
@@ -11,6 +13,7 @@ pub struct Bin {
     pub start: u32, // 0-based inclusif
     pub end: u32,
     pub reads_store: HashMap<Vec<u8>, Record>,
+    pub bam_reader: IndexedReader,
 }
 
 impl Bin {
@@ -39,6 +42,7 @@ impl Bin {
             start,
             end,
             reads_store,
+            bam_reader,
         })
     }
 
@@ -53,91 +57,41 @@ impl Bin {
             .count()
     }
 
+    pub fn sa_primary(&mut self) -> Vec<Record> {
+        self.reads_store
+            .values()
+            .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
+            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
+            .collect()
+    }
+
     pub fn max_start_or_end(&self) -> (u32, usize) {
-        let mut starts: HashMap<u32, usize> = HashMap::new();
-        let mut ends: HashMap<u32, usize> = HashMap::new();
-        self.reads_store.values().for_each(|record| {
+        let mut se: HashMap<u32, usize> = HashMap::new();
+        for record in self.reads_store.values() {
             let reference_start = record.reference_start() as u32;
             let reference_end = record.reference_end() as u32;
 
             if reference_start >= self.start && reference_start <= self.end {
-                *starts.entry(reference_start).or_default() += 1;
+                *se.entry(reference_start).or_default() += 1;
             }
             if reference_end >= self.start && reference_end <= self.end {
-                *ends.entry(reference_end).or_default() += 1;
+                *se.entry(reference_end).or_default() += 1;
             }
-        });
-
-        let max_pos_start = starts.into_iter().max_by_key(|(_, v)| *v);
-        let max_pos_end = ends.into_iter().max_by_key(|(_, v)| *v);
-
-        if let (Some(s), Some(e)) = (max_pos_start, max_pos_end) {
-            if s > e {
-                s
-            } else {
-                e
-            }
-        } else {
-            (0, 0)
         }
+
+        let max_pos = se.into_iter().max_by_key(|(_, v)| *v);
+        max_pos.unwrap_or((0, 0))
     }
 
-    // Initiate
-    // let mut reads_starts: Vec<Vec<Record>> = Vec::new();
-    // reads_starts.resize(length as usize, vec![]);
-    // let mut reads_ends: Vec<Vec<Record>> = Vec::new();
-    // reads_ends.resize(length as usize, vec![]);
-    //
-    // for read in bam_reader.records() {
-    //     let record = read.context(format!("Error while parsing record"))?;
-    //     // Skip reads with low mapping quality
-    //     if record.mapq() < mapq {
-    //         continue;
-    //     }
-    //
-    //     let read_start = record.reference_start() as u32;
-    //     let read_end = record.reference_end() as u32;
-    //
-    //     if read_start >= start && read_start < end {
-    //         let index = read_start - start;
-    //         let at_pos = reads_starts.get_mut(index as usize).unwrap();
-    //         at_pos.push(record.clone());
-    //     }
-    //
-    //     if read_end >= start && read_end < end {
-    //         let index = read_end - start;
-    //         let at_pos = reads_ends.get_mut(index as usize).unwrap();
-    //         at_pos.push(record.clone());
-    //     }
-    // }
-
-    // let mut bam_pileup = Vec::new();
-    // for p in bam.pileup() {
-    //     let pileup = p.context(format!(
-    //         "Can't pilup bam at position {}:{}-{}",
-    //         chr, start, stop
-    //     ))?;
-    //     let position = pileup.pos() as i32;
-    //     if position == start {
-    //         for alignment in pileup.alignments() {
-    //             match alignment.indel() {
-    //                 bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
-    //                 bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
-    //                 _ => {
-    //                     let record = alignment.record();
-    //                     if record.seq_len() > 0 {
-    //                         if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)?
-    //                         {
-    //                             bases.push((record.clone(), b));
-    //                         }
-    //                     } else if alignment.is_del() {
-    //                         bases.push((record.clone(), b'D'));
-    //                     }
-    //                 }
-    //             }
-    //         }
-    //     }
-    // }
+    pub fn se_primary(&mut self, pos: u32) -> Vec<Record> {
+        self.reads_store
+            .values()
+            .filter(|record| {
+                record.reference_start() as u32 == pos || record.reference_end() as u32 == pos
+            })
+            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
+            .collect()
+    }
 }
 
 pub fn scan(
@@ -271,7 +225,13 @@ fn compute_mad(data: &[f64], median: f64) -> f64 {
     compute_median(&deviations)
 }
 
-pub fn scan_outliers(bam_path: &str, contig: &str, start: u32, end: u32, length: u32) -> Vec<(u32, usize, f64, bool, f64, bool)> {
+pub fn scan_outliers(
+    bam_path: &str,
+    contig: &str,
+    start: u32,
+    end: u32,
+    length: u32,
+) -> Vec<(u32, usize, f64, bool, f64, bool)> {
     let mut starts = Vec::new();
     let mut current = start;
     while current <= end {
@@ -317,10 +277,12 @@ pub fn scan_outliers(bam_path: &str, contig: &str, start: u32, end: u32, length:
         filter_outliers_modified_z_score_with_indices(sa_ratios, indices.clone());
     let filtered_se_indices = filter_outliers_modified_z_score_with_indices(se_ratios, indices);
 
-    ratios.iter().map(|(p, n, sa, se)| {
-        let sa_outlier = filtered_sa_indices.contains(p);
-        let se_outlier = filtered_se_indices.contains(p);
-        (*p, *n, *sa, sa_outlier, *se, se_outlier)
-
-    }).collect()
+    ratios
+        .iter()
+        .map(|(p, n, sa, se)| {
+            let sa_outlier = filtered_sa_indices.contains(p);
+            let se_outlier = filtered_se_indices.contains(p);
+            (*p, *n, *sa, sa_outlier, *se, se_outlier)
+        })
+        .collect()
 }

+ 30 - 2
src/lib.rs

@@ -1,4 +1,5 @@
 pub mod bin;
+pub mod bam;
 
 // fn get_se_diag_mrd(
 //     diag_bam_path: &str,
@@ -138,8 +139,11 @@ fn std_deviation(data: &[f64]) -> Option<f64> {
 #[cfg(test)]
 mod tests {
     use log::{info, warn};
+    use rust_htslib::bam::Record;
     use std::iter::FromIterator;
 
+    use crate::bam::{create_tid_2_contig, get_all_positions};
+
     use self::bin::{
         filter_outliers_modified_z_score, filter_outliers_modified_z_score_with_indices, scan,
         scan_outliers, Bin,
@@ -249,9 +253,33 @@ mod tests {
         let result_dir = "/data/longreads_basic_pipe";
         let diag_bam_path = format!("{result_dir}/{id}/diag/{id}_diag_hs1.bam");
 
-        scan_outliers(&diag_bam_path, contig, start, end, bin_length)
+        let outliers_records: Vec<Vec<Record>> = scan_outliers(&diag_bam_path, contig, start, end, bin_length)
             .iter()
             .filter(|(_, _, _, sa_outlier, _, se_outlier)| *sa_outlier || *se_outlier)
-            .for_each(|(pos, n, sa, _, se, _)| info!("{}:{pos}-{}\t{n}\t{sa}\t{se}", contig, pos + bin_length - 1));
+            .map(|(start, n, sa, sa_outlier, se, se_outlier)| {
+                let mut bin = Bin::new(&diag_bam_path, contig, *start, bin_length).unwrap();
+                let mut records = Vec::new();
+                if *sa_outlier {
+                    records.extend(bin.sa_primary());
+                } else if *se_outlier {
+                    let (pos, _) = bin.max_start_or_end();
+                    records.extend(bin.se_primary(pos));
+                };
+                info!("{}:{start}-{}\t{n}\t{sa}\t{se}\t{}", contig, start + bin_length - 1, records.len());
+                records
+            }).collect();
+        info!("n group of records {}", outliers_records.len());
+
+        let mut bam = rust_htslib::bam::IndexedReader::from_path(&diag_bam_path)
+            .unwrap();
+        let tid_2_contig = create_tid_2_contig(&diag_bam_path).unwrap();
+        let mut n_recors = 0;
+        for outlier_record in outliers_records {
+            n_recors += outlier_record.len();
+            outlier_record.iter().for_each(|r| {
+                get_all_positions(r, &tid_2_contig, &mut bam)
+            })
+        }
+        println!("n records: {n_recors}");
     }
 }