|
|
@@ -1,8 +1,11 @@
|
|
|
-use std::collections::HashMap;
|
|
|
+use std::{collections::HashMap, u32};
|
|
|
|
|
|
use anyhow::Context;
|
|
|
+use log::warn;
|
|
|
use rust_htslib::bam::{
|
|
|
- ext::BamRecordExtensions, record::{Aux, Cigar, CigarString}, Header, IndexedReader, Read, Record,
|
|
|
+ ext::BamRecordExtensions,
|
|
|
+ record::{Aux, Cigar, CigarString},
|
|
|
+ Header, HeaderView, IndexedReader, Read, Record,
|
|
|
};
|
|
|
|
|
|
pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
|
|
|
@@ -34,87 +37,55 @@ pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
|
|
|
record
|
|
|
}
|
|
|
|
|
|
-pub fn get_all_positions(record: &Record, tid_2_contig: &HashMap<i32, String>, bam: &mut IndexedReader) {
|
|
|
+pub fn get_all_positions(
|
|
|
+ record: &Record,
|
|
|
+ header: &HeaderView,
|
|
|
+ bam: &mut IndexedReader,
|
|
|
+) -> anyhow::Result<Vec<(String, u32, u32)>> {
|
|
|
let mut positions = Vec::new();
|
|
|
let qname = record.qname();
|
|
|
- if let Some(contig) = tid_2_contig.get(&record.tid()) {
|
|
|
- positions.push((contig.to_string(), record.reference_start() as u32));
|
|
|
- }
|
|
|
+ let contig = String::from_utf8(header.tid2name(record.tid().try_into()?).to_vec())?;
|
|
|
+ positions.push((
|
|
|
+ contig,
|
|
|
+ record.reference_start() as u32,
|
|
|
+ record.reference_end() as u32,
|
|
|
+ ));
|
|
|
|
|
|
if let Ok(Aux::String(sa)) = record.aux(b"SA") {
|
|
|
- let positions: Vec<(&str, u32, u32)> = sa
|
|
|
- .split(';')
|
|
|
- .filter(|s| !s.is_empty())
|
|
|
- .map(|s| {
|
|
|
- let parts: Vec<&str> = s.split(',').take(4).collect();
|
|
|
- let chr = parts.first().unwrap();
|
|
|
- let start: u32 = parts.get(1).unwrap().parse().unwrap();
|
|
|
- // SAM format is 1-based!
|
|
|
- let start = start - 1;
|
|
|
- let strand = parts.get(2).unwrap();
|
|
|
- let cigar_str = *parts.get(3).unwrap();
|
|
|
- let end = calculate_end_position(start, cigar_str).unwrap();
|
|
|
-
|
|
|
- let mut founded = false;
|
|
|
- bam.fetch((chr, start, start + 1)).unwrap();
|
|
|
- for record in bam.records().flatten() {
|
|
|
- let rc = record.cigar().to_string();
|
|
|
- if qname == record.qname() && cigar_str == rc {
|
|
|
- founded = true;
|
|
|
- assert_eq!(start, record.reference_start() as u32, "start {chr}:{start}-{end} {strand} {cigar_str} {}", record.cigar());
|
|
|
- if end != record.reference_end() as u32 {
|
|
|
- println!("end {chr}:{start}-{end} {strand} {cigar_str} {}", record.cigar());
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- if !founded {
|
|
|
- println!("NOT FouNDED {chr}:{start}-{end} {strand} {cigar_str}");
|
|
|
-
|
|
|
- }
|
|
|
+ let sa_splits: Vec<&str> = sa.split(';').filter(|s| !s.is_empty()).collect();
|
|
|
+ for s in sa_splits {
|
|
|
+ let parts: Vec<&str> = s.split(',').take(4).collect();
|
|
|
+ let chr = parts.first().unwrap();
|
|
|
+ let start: u32 = parts.get(1).unwrap().parse().unwrap();
|
|
|
+ // SAM format is 1-based!
|
|
|
+ let start = start - 1;
|
|
|
+ // let strand = parts.get(2).unwrap();
|
|
|
+ // // CIGAR SA != CIGAR can't calculate the end from it.
|
|
|
+ // let cigar_str = *parts.get(3).unwrap();
|
|
|
+ // let end = calculate_end_position(start, cigar_str).unwrap();
|
|
|
|
|
|
- (*chr, start, end)
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- // for (chr, start) in positions {
|
|
|
- // bam.fetch((chr, start, start + 1)).unwrap();
|
|
|
- // for record in bam.records().flatten() {
|
|
|
- // if qname == record.qname() && !record.is_supplementary() {
|
|
|
- // return record.clone();
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-fn calculate_end_position(start: u32, cigar_str: &str) -> Result<u32, String> {
|
|
|
- let mut end = start;
|
|
|
- let mut count = String::new();
|
|
|
-
|
|
|
- for c in cigar_str.chars() {
|
|
|
- if c.is_digit(10) {
|
|
|
- count.push(c);
|
|
|
- } else {
|
|
|
- let len: u32 = count.parse().map_err(|e| format!("Parse error: {}", e))?;
|
|
|
- count.clear();
|
|
|
- match c {
|
|
|
- 'M' | 'D' | 'N' | '=' | 'X' => {
|
|
|
- end += len;
|
|
|
- }
|
|
|
- 'I' | 'S' | 'H' | 'P' => {
|
|
|
- // These operations do not consume reference positions
|
|
|
+ bam.fetch((chr, start, start + 1))?;
|
|
|
+ let mut founded = false;
|
|
|
+ for record in bam.records().flatten() {
|
|
|
+ if qname == record.qname() {
|
|
|
+ founded = true;
|
|
|
+ positions.push((chr.to_string(), start, record.reference_end() as u32));
|
|
|
}
|
|
|
- _ => return Err(format!("Invalid CIGAR operation: {}", c)),
|
|
|
+ }
|
|
|
+ if !founded {
|
|
|
+ warn!("Not founded");
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- Ok(end)
|
|
|
+ positions.sort_by(|a, b| a.1.cmp(&b.1));
|
|
|
+ positions.sort_by(|a, b| a.0.cmp(&b.0));
|
|
|
+ positions.dedup();
|
|
|
+ Ok(positions)
|
|
|
}
|
|
|
|
|
|
pub fn create_tid_2_contig(bam_path: &str) -> anyhow::Result<HashMap<i32, String>> {
|
|
|
let bam_reader = rust_htslib::bam::IndexedReader::from_path(bam_path)
|
|
|
- .context(format!("Can't open {}", bam_path))?;
|
|
|
+ .context(format!("Can't open {}", bam_path))?;
|
|
|
let h = bam_reader.header();
|
|
|
let mut r = HashMap::new();
|
|
|
h.target_names().iter().enumerate().for_each(|(i, name)| {
|