|
|
@@ -0,0 +1,343 @@
|
|
|
+use std::{fs, io::{Read, BufRead}};
|
|
|
+use bgzip::{BGZFReader, BGZFError};
|
|
|
+
|
|
|
+#[derive(Debug)]
|
|
|
+pub enum BamNucleotid {
|
|
|
+ Equal, A, C, M, G, R, S, V, T, W, Y, H, K, D, B, N
|
|
|
+}
|
|
|
+#[derive(Debug)]
|
|
|
+pub struct BamSequence(Vec<BamNucleotid>);
|
|
|
+
|
|
|
+impl BamSequence {
|
|
|
+ pub fn new() -> BamSequence {
|
|
|
+ BamSequence(Vec::new())
|
|
|
+ }
|
|
|
+ pub fn get_nt(e: &u8) -> BamNucleotid {
|
|
|
+ match e {
|
|
|
+ 0 => BamNucleotid::Equal,
|
|
|
+ 1 => BamNucleotid::A,
|
|
|
+ 2 => BamNucleotid::C,
|
|
|
+ 3 => BamNucleotid::M,
|
|
|
+ 4 => BamNucleotid::G,
|
|
|
+ 5 => BamNucleotid::R,
|
|
|
+ 6 => BamNucleotid::S,
|
|
|
+ 7 => BamNucleotid::V,
|
|
|
+ 8 => BamNucleotid::T,
|
|
|
+ 9 => BamNucleotid::W,
|
|
|
+ 10 => BamNucleotid::Y,
|
|
|
+ 11 => BamNucleotid::H,
|
|
|
+ 12 => BamNucleotid::K,
|
|
|
+ 13 => BamNucleotid::D,
|
|
|
+ 14 => BamNucleotid::B,
|
|
|
+ 15 => BamNucleotid::N,
|
|
|
+ _ => panic!("Parsing error")
|
|
|
+ }
|
|
|
+ }
|
|
|
+ pub fn push(&mut self, e: &u8) {
|
|
|
+ self.0.push(BamSequence::get_nt(e))
|
|
|
+ }
|
|
|
+}
|
|
|
+#[derive(Debug, Clone)]
|
|
|
+pub enum TagValue {
|
|
|
+ Int(i8),
|
|
|
+ U(u8),
|
|
|
+ Int16(i16),
|
|
|
+ U16(u16),
|
|
|
+ Int32(i32),
|
|
|
+ U32(u32),
|
|
|
+ F32(f32),
|
|
|
+ Str(String)
|
|
|
+}
|
|
|
+
|
|
|
+pub type BamRead = (
|
|
|
+ i32, // ref_id
|
|
|
+ i32, // pos
|
|
|
+ u8, // mapq
|
|
|
+ u16, // flag
|
|
|
+ String, // read_name
|
|
|
+ Vec<(String, u32)>, // cigar
|
|
|
+ BamSequence, // sequence
|
|
|
+ Vec<u8>, // phred
|
|
|
+ Vec<(String, TagValue)> // tags
|
|
|
+);
|
|
|
+
|
|
|
+pub struct BamReader {
|
|
|
+ reader: BGZFReader<fs::File>,
|
|
|
+ references: Vec<(String, u32)>
|
|
|
+}
|
|
|
+
|
|
|
+impl BamReader {
|
|
|
+ pub fn new(bam_path: &str) -> Result<BamReader, BGZFError> {
|
|
|
+ let bam = fs::File::open(bam_path).unwrap();
|
|
|
+ let reader = BGZFReader::new(bam);
|
|
|
+ let mut bam_reader = BamReader { reader, references: Vec::new() };
|
|
|
+ bam_reader.read_header()?;
|
|
|
+ Ok(bam_reader)
|
|
|
+ }
|
|
|
+
|
|
|
+ fn read_header(&mut self) -> Result<(), BGZFError> {
|
|
|
+ let mut buffer_4 = [0u8; 4];
|
|
|
+ let reader = &mut self.reader;
|
|
|
+
|
|
|
+ // MAGIC
|
|
|
+ reader.read_exact(&mut buffer_4)?;
|
|
|
+ assert_eq!(buffer_4, [66, 65, 77, 1]);
|
|
|
+
|
|
|
+ // l_text
|
|
|
+ // size of the header
|
|
|
+ reader.read_exact(&mut buffer_4)?;
|
|
|
+ let l_text: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
|
|
|
+
|
|
|
+ // header
|
|
|
+ let mut buffer_header = vec![0u8; l_text as usize];
|
|
|
+ reader.read_exact(&mut buffer_header)?;
|
|
|
+ // let header_txt = String::from_utf8(buffer_header).expect("Can't read the BAM header");
|
|
|
+ // println!("{}", header_txt);
|
|
|
+
|
|
|
+ // List of reference information (n=n_ref)
|
|
|
+ // # reference sequences
|
|
|
+ reader.read_exact(&mut buffer_4)?;
|
|
|
+ let n_ref: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap());
|
|
|
+ // println!("n_ref {}", n_ref);
|
|
|
+
|
|
|
+ // reference vec: refrence_name refertence_size
|
|
|
+ let mut references: Vec<(String, u32)> = Vec::new();
|
|
|
+ for _ in 0..n_ref {
|
|
|
+ // Length of the reference name plus 1 (including NUL)
|
|
|
+ reader.read_exact(&mut buffer_4)?;
|
|
|
+ let l_name: u32 = u32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
|
|
|
+
|
|
|
+ // Reference sequence name; NUL-terminated
|
|
|
+ let mut buffer_name = vec![0u8; l_name as usize];
|
|
|
+ reader.read_exact(&mut buffer_name)?;
|
|
|
+ let name = String::from_utf8(buffer_name).expect("Can't read the reference name");
|
|
|
+
|
|
|
+ // Length of the reference sequence
|
|
|
+ reader.read_exact(&mut buffer_4)?;
|
|
|
+ let l_ref: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
|
|
|
+
|
|
|
+ references.push((name, l_ref));
|
|
|
+ }
|
|
|
+
|
|
|
+ self.references = references;
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ fn parse_read(&mut self) -> Result<BamRead, BGZFError> {
|
|
|
+ // buffers alloc
|
|
|
+ let mut buffer_1 = [0u8; 1];
|
|
|
+ let mut buffer_2 = [0u8; 2];
|
|
|
+ let mut buffer_4 = [0u8; 4];
|
|
|
+ // let mut reader = &self.reader;
|
|
|
+ // Total length of the alignment record, excluding this field
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ // if r.is_err() { break; }; // triggered by eof
|
|
|
+ let block_size: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
|
|
|
+ // println!("block size {}", block_size);
|
|
|
+
|
|
|
+ let pos_start = self.reader.bgzf_pos();
|
|
|
+
|
|
|
+ // Reference sequence ID, −1 ≤ refID < n ref; -1 for a read
|
|
|
+ // without a mapping position
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ let ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
|
|
|
+ // println!("ref_id {}", ref_id);
|
|
|
+ // println!("ref_name {}", references[ref_id as usize].0);
|
|
|
+
|
|
|
+ // 0-based leftmost coordinate (= POS − 1)
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ let pos: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
|
|
|
+ // println!("pos {}", pos);
|
|
|
+
|
|
|
+ // Length of read name below (= length(QNAME) + 1)
|
|
|
+ self.reader.read_exact(&mut buffer_1)?;
|
|
|
+ let l_read_name: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error"));
|
|
|
+ // println!("l_read_name {}", l_read_name);
|
|
|
+
|
|
|
+ // Mapping quality (=MAPQ)
|
|
|
+ self.reader.read_exact(&mut buffer_1)?;
|
|
|
+ let mapq: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error"));
|
|
|
+ // println!("mapq {}", mapq);
|
|
|
+
|
|
|
+ // BAI index bin
|
|
|
+ self.reader.read_exact(&mut buffer_2)?;
|
|
|
+ // let bin: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
|
|
|
+ // println!("bin {}", bin);
|
|
|
+
|
|
|
+ // Number of operations in CIGAR,
|
|
|
+ self.reader.read_exact(&mut buffer_2)?;
|
|
|
+ let n_cigar_op: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
|
|
|
+ // println!("n_cigar_op {}", n_cigar_op);
|
|
|
+
|
|
|
+ // Bitwise flags (= FLAG)^29
|
|
|
+ self.reader.read_exact(&mut buffer_2)?;
|
|
|
+ let flag: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
|
|
|
+
|
|
|
+ // println!("flag {}", flag);
|
|
|
+
|
|
|
+ // Length of SEQ
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ let l_seq: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap());
|
|
|
+ // println!("l_seq {}", l_seq);
|
|
|
+
|
|
|
+ // Ref-ID of the next segment (−1 ≤ next refID < n ref)
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ // let next_ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
|
|
|
+ // println!("next_ref_id {}", next_ref_id);
|
|
|
+
|
|
|
+ // 0-based leftmost pos of the next segment (= PNEXT − 1)
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ // let next_pos: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
|
|
|
+ // println!("next_pos {}", next_pos);
|
|
|
+
|
|
|
+ // Template length (= TLEN)
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ // let tlen: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
|
|
|
+ // println!("tlen {}", tlen);
|
|
|
+
|
|
|
+ // Read name, NUL-terminated (QNAME with trailing ‘\0’)^30
|
|
|
+ let mut buffer_read_name = vec![0u8; l_read_name as usize];
|
|
|
+ self.reader.read_exact(&mut buffer_read_name)?;
|
|
|
+ buffer_read_name.pop();
|
|
|
+ let read_name = String::from_utf8(buffer_read_name).expect("Can't read the read name");
|
|
|
+ // println!("read_name {}", read_name);
|
|
|
+
|
|
|
+ // CIGAR: op len<<4|op. ‘MIDNSHP=X’→‘012345678’
|
|
|
+ let l_buffer_cigar = (n_cigar_op as usize) * 4;
|
|
|
+ let mut buffer_cigar = vec![0u8; l_buffer_cigar];
|
|
|
+ self.reader.read_exact(&mut buffer_cigar)?;
|
|
|
+ let cigar: Vec<(String, u32)> = buffer_cigar.chunks(4)
|
|
|
+ .map(|c| {
|
|
|
+ let u = u32::from_ne_bytes(c.try_into().unwrap());
|
|
|
+ let op = u << 28 >> 28;
|
|
|
+ let n = u >> 4;
|
|
|
+ match op {
|
|
|
+ 0 => ("M".to_string(), n),
|
|
|
+ 1 => ("I".to_string(), n),
|
|
|
+ 2 => ("D".to_string(), n),
|
|
|
+ 3 => ("N".to_string(), n),
|
|
|
+ 4 => ("S".to_string(), n),
|
|
|
+ 5 => ("H".to_string(), n),
|
|
|
+ 6 => ("P".to_string(), n),
|
|
|
+ 7 => ("=".to_string(), n),
|
|
|
+ 8 => ("X".to_string(), n),
|
|
|
+ _ => panic!("Error parsing cigar")
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+ // println!("cigar {:?}", cigar);
|
|
|
+
|
|
|
+ let l_buffer_seq = (l_seq as usize + 1) / 2;
|
|
|
+ let mut buffer_seq = vec![0u8; l_buffer_seq];
|
|
|
+ self.reader.read_exact(&mut buffer_seq)?;
|
|
|
+ let mut sequence = BamSequence::new();
|
|
|
+ for e in buffer_seq.into_iter() {
|
|
|
+ let right = e << 4 >> 4;
|
|
|
+ let left = e >> 4;
|
|
|
+ sequence.push(&left);
|
|
|
+ if right != 0 { sequence.push(&right) };
|
|
|
+ }
|
|
|
+ // println!("sequence {:?}", sequence);
|
|
|
+
|
|
|
+ let mut phred = vec![0; l_seq as usize];
|
|
|
+ self.reader.read_exact(&mut phred)?;
|
|
|
+
|
|
|
+ let mut tags: Vec<(String, TagValue)> = Vec::new();
|
|
|
+ loop {
|
|
|
+ let mut tag_buffer = vec![0u8; 2];
|
|
|
+ self.reader.read_exact(&mut tag_buffer)?;
|
|
|
+ let tag = String::from_utf8(tag_buffer).unwrap();
|
|
|
+
|
|
|
+ let mut tag_type = vec![0u8];
|
|
|
+ self.reader.read_exact(&mut tag_type)?;
|
|
|
+
|
|
|
+ let tag_v = match tag_type[0] {
|
|
|
+ b'c' => {
|
|
|
+ self.reader.read_exact(&mut buffer_1)?;
|
|
|
+ TagValue::Int(buffer_1.clone()[0] as i8)
|
|
|
+ },
|
|
|
+ b'C' => {
|
|
|
+ self.reader.read_exact(&mut buffer_1)?;
|
|
|
+ TagValue::U(buffer_1.clone()[0])
|
|
|
+ },
|
|
|
+ b's' => {
|
|
|
+ self.reader.read_exact(&mut buffer_2)?;
|
|
|
+ TagValue::Int16(
|
|
|
+ i16::from_ne_bytes(buffer_2.try_into().expect("Parse error"))
|
|
|
+ )
|
|
|
+ },
|
|
|
+ b'S' => {
|
|
|
+ self.reader.read_exact(&mut buffer_2)?;
|
|
|
+ TagValue::U16(
|
|
|
+ u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"))
|
|
|
+ )
|
|
|
+ }
|
|
|
+ b'i' => {
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ TagValue::Int32(
|
|
|
+ i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
|
|
|
+ )
|
|
|
+ },
|
|
|
+ b'I' => {
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ TagValue::U32(
|
|
|
+ u32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
|
|
|
+ )
|
|
|
+ },
|
|
|
+ b'f' => {
|
|
|
+ self.reader.read_exact(&mut buffer_4)?;
|
|
|
+ TagValue::F32(
|
|
|
+ f32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
|
|
|
+ )
|
|
|
+ },
|
|
|
+ b'Z' => {
|
|
|
+ let mut buf: Vec<u8> = Vec::new();
|
|
|
+ self.reader.read_until(b'\0', &mut buf)?;
|
|
|
+ buf.pop();
|
|
|
+ TagValue::Str(String::from_utf8(buf).unwrap())
|
|
|
+ }
|
|
|
+ e => panic!("Error unknown tag type. {:?} tag: {:?}", e, tag)
|
|
|
+ };
|
|
|
+ tags.push((tag, tag_v));
|
|
|
+
|
|
|
+ // if block_size - current_read == 0 { break; }
|
|
|
+ if self.reader.bgzf_pos() - pos_start >= block_size as u64 { break; }
|
|
|
+ }
|
|
|
+ Ok((ref_id, pos, mapq, flag, read_name, cigar, sequence, phred, tags))
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl Iterator for BamReader {
|
|
|
+ type Item = BamRead;
|
|
|
+
|
|
|
+ fn next(&mut self) -> Option<Self::Item> {
|
|
|
+ if let Ok(r) = self.parse_read() {
|
|
|
+ Some(r)
|
|
|
+ } else {
|
|
|
+ None
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+#[cfg(test)]
|
|
|
+mod tests {
|
|
|
+ use super::*;
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn it_works() {
|
|
|
+ let bam_path = "/Users/steimle/Documents/Programmes/sv-finder/betya.bam";
|
|
|
+ let flags: Vec<u16> = vec![81, 161, 97, 145, 65, 129, 113, 177];
|
|
|
+
|
|
|
+ let bam_reader = BamReader::new(bam_path).unwrap();
|
|
|
+
|
|
|
+ let reads = bam_reader
|
|
|
+ .filter(|br| flags.contains(&br.3))
|
|
|
+ .take(1000)
|
|
|
+ .collect::<Vec<BamRead>>();
|
|
|
+
|
|
|
+ assert_eq!(
|
|
|
+ reads[0].4,
|
|
|
+ "NB501645:337:HCCMVAFX2:3:11505:17842:2102_GATGGGACGG".to_string()
|
|
|
+ );
|
|
|
+ }
|
|
|
+}
|