use std::{fs, io::{Read, BufRead}}; use bgzip::{BGZFReader, BGZFError}; #[derive(Debug)] pub enum BamNucleotid { Equal, A, C, M, G, R, S, V, T, W, Y, H, K, D, B, N } #[derive(Debug)] pub struct BamSequence(Vec); impl BamSequence { pub fn new() -> BamSequence { BamSequence(Vec::new()) } pub fn get_nt(e: &u8) -> BamNucleotid { match e { 0 => BamNucleotid::Equal, 1 => BamNucleotid::A, 2 => BamNucleotid::C, 3 => BamNucleotid::M, 4 => BamNucleotid::G, 5 => BamNucleotid::R, 6 => BamNucleotid::S, 7 => BamNucleotid::V, 8 => BamNucleotid::T, 9 => BamNucleotid::W, 10 => BamNucleotid::Y, 11 => BamNucleotid::H, 12 => BamNucleotid::K, 13 => BamNucleotid::D, 14 => BamNucleotid::B, 15 => BamNucleotid::N, _ => panic!("Parsing error") } } pub fn push(&mut self, e: &u8) { self.0.push(BamSequence::get_nt(e)) } } #[derive(Debug, Clone)] pub enum TagValue { Int(i8), U(u8), Int16(i16), U16(u16), Int32(i32), U32(u32), F32(f32), Str(String) } pub struct BamRead { ref_id: i32, // ref_id pos: i32, // pos mapq: u8, // mapq flag: u16, // flag read_name: String, // read_name cigar: Vec<(String, u32)>, // cigar sequence: BamSequence, // sequence phred: Vec, // phred tags: Vec<(String, TagValue)> // tags } pub struct BamReader { reader: BGZFReader, references: Vec<(String, u32)> } impl BamReader { pub fn new(bam_path: &str) -> Result { let bam = fs::File::open(bam_path).unwrap(); let reader = BGZFReader::new(bam); let mut bam_reader = BamReader { reader, references: Vec::new() }; bam_reader.read_header()?; Ok(bam_reader) } fn read_header(&mut self) -> Result<(), BGZFError> { let mut buffer_4 = [0u8; 4]; let reader = &mut self.reader; // MAGIC reader.read_exact(&mut buffer_4)?; assert_eq!(buffer_4, [66, 65, 77, 1]); // l_text // size of the header reader.read_exact(&mut buffer_4)?; let l_text: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap()); // header let mut buffer_header = vec![0u8; l_text as usize]; reader.read_exact(&mut buffer_header)?; // let header_txt = String::from_utf8(buffer_header).expect("Can't read the BAM header"); // println!("{}", header_txt); // List of reference information (n=n_ref) // # reference sequences reader.read_exact(&mut buffer_4)?; let n_ref: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap()); // println!("n_ref {}", n_ref); // reference vec: refrence_name refertence_size let mut references: Vec<(String, u32)> = Vec::new(); for _ in 0..n_ref { // Length of the reference name plus 1 (including NUL) reader.read_exact(&mut buffer_4)?; let l_name: u32 = u32::from_ne_bytes(buffer_4.try_into().expect("Parse error")); // Reference sequence name; NUL-terminated let mut buffer_name = vec![0u8; l_name as usize]; reader.read_exact(&mut buffer_name)?; let name = String::from_utf8(buffer_name).expect("Can't read the reference name"); // Length of the reference sequence reader.read_exact(&mut buffer_4)?; let l_ref: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap()); references.push((name, l_ref)); } self.references = references; Ok(()) } fn parse_read(&mut self) -> Result { // buffers alloc let mut buffer_1 = [0u8; 1]; let mut buffer_2 = [0u8; 2]; let mut buffer_4 = [0u8; 4]; // let mut reader = &self.reader; // Total length of the alignment record, excluding this field self.reader.read_exact(&mut buffer_4)?; // if r.is_err() { break; }; // triggered by eof let block_size: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap()); // println!("block size {}", block_size); let pos_start = self.reader.bgzf_pos(); // Reference sequence ID, −1 ≤ refID < n ref; -1 for a read // without a mapping position self.reader.read_exact(&mut buffer_4)?; let ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error")); // println!("ref_id {}", ref_id); // println!("ref_name {}", references[ref_id as usize].0); // 0-based leftmost coordinate (= POS − 1) self.reader.read_exact(&mut buffer_4)?; let pos: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error")); // println!("pos {}", pos); // Length of read name below (= length(QNAME) + 1) self.reader.read_exact(&mut buffer_1)?; let l_read_name: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error")); // println!("l_read_name {}", l_read_name); // Mapping quality (=MAPQ) self.reader.read_exact(&mut buffer_1)?; let mapq: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error")); // println!("mapq {}", mapq); // BAI index bin self.reader.read_exact(&mut buffer_2)?; // let bin: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error")); // println!("bin {}", bin); // Number of operations in CIGAR, self.reader.read_exact(&mut buffer_2)?; let n_cigar_op: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error")); // println!("n_cigar_op {}", n_cigar_op); // Bitwise flags (= FLAG)^29 self.reader.read_exact(&mut buffer_2)?; let flag: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error")); // println!("flag {}", flag); // Length of SEQ self.reader.read_exact(&mut buffer_4)?; let l_seq: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap()); // println!("l_seq {}", l_seq); // Ref-ID of the next segment (−1 ≤ next refID < n ref) self.reader.read_exact(&mut buffer_4)?; // let next_ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap()); // println!("next_ref_id {}", next_ref_id); // 0-based leftmost pos of the next segment (= PNEXT − 1) self.reader.read_exact(&mut buffer_4)?; // let next_pos: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap()); // println!("next_pos {}", next_pos); // Template length (= TLEN) self.reader.read_exact(&mut buffer_4)?; // let tlen: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap()); // println!("tlen {}", tlen); // Read name, NUL-terminated (QNAME with trailing ‘\0’)^30 let mut buffer_read_name = vec![0u8; l_read_name as usize]; self.reader.read_exact(&mut buffer_read_name)?; buffer_read_name.pop(); let read_name = String::from_utf8(buffer_read_name).expect("Can't read the read name"); // println!("read_name {}", read_name); // CIGAR: op len<<4|op. ‘MIDNSHP=X’→‘012345678’ let l_buffer_cigar = (n_cigar_op as usize) * 4; let mut buffer_cigar = vec![0u8; l_buffer_cigar]; self.reader.read_exact(&mut buffer_cigar)?; let cigar: Vec<(String, u32)> = buffer_cigar.chunks(4) .map(|c| { let u = u32::from_ne_bytes(c.try_into().unwrap()); let op = u << 28 >> 28; let n = u >> 4; match op { 0 => ("M".to_string(), n), 1 => ("I".to_string(), n), 2 => ("D".to_string(), n), 3 => ("N".to_string(), n), 4 => ("S".to_string(), n), 5 => ("H".to_string(), n), 6 => ("P".to_string(), n), 7 => ("=".to_string(), n), 8 => ("X".to_string(), n), _ => panic!("Error parsing cigar") } }) .collect(); // println!("cigar {:?}", cigar); let l_buffer_seq = (l_seq as usize + 1) / 2; let mut buffer_seq = vec![0u8; l_buffer_seq]; self.reader.read_exact(&mut buffer_seq)?; let mut sequence = BamSequence::new(); for e in buffer_seq.into_iter() { let right = e << 4 >> 4; let left = e >> 4; sequence.push(&left); if right != 0 { sequence.push(&right) }; } // println!("sequence {:?}", sequence); let mut phred = vec![0; l_seq as usize]; self.reader.read_exact(&mut phred)?; let mut tags: Vec<(String, TagValue)> = Vec::new(); loop { let mut tag_buffer = vec![0u8; 2]; self.reader.read_exact(&mut tag_buffer)?; let tag = String::from_utf8(tag_buffer).unwrap(); let mut tag_type = vec![0u8]; self.reader.read_exact(&mut tag_type)?; let tag_v = match tag_type[0] { b'c' => { self.reader.read_exact(&mut buffer_1)?; TagValue::Int(buffer_1.clone()[0] as i8) }, b'C' => { self.reader.read_exact(&mut buffer_1)?; TagValue::U(buffer_1.clone()[0]) }, b's' => { self.reader.read_exact(&mut buffer_2)?; TagValue::Int16( i16::from_ne_bytes(buffer_2.try_into().expect("Parse error")) ) }, b'S' => { self.reader.read_exact(&mut buffer_2)?; TagValue::U16( u16::from_ne_bytes(buffer_2.try_into().expect("Parse error")) ) } b'i' => { self.reader.read_exact(&mut buffer_4)?; TagValue::Int32( i32::from_ne_bytes(buffer_4.try_into().expect("Parse error")) ) }, b'I' => { self.reader.read_exact(&mut buffer_4)?; TagValue::U32( u32::from_ne_bytes(buffer_4.try_into().expect("Parse error")) ) }, b'f' => { self.reader.read_exact(&mut buffer_4)?; TagValue::F32( f32::from_ne_bytes(buffer_4.try_into().expect("Parse error")) ) }, b'Z' => { let mut buf: Vec = Vec::new(); self.reader.read_until(b'\0', &mut buf)?; buf.pop(); TagValue::Str(String::from_utf8(buf).unwrap()) } e => panic!("Error unknown tag type. {:?} tag: {:?}", e, tag) }; tags.push((tag, tag_v)); if self.reader.bgzf_pos() - pos_start >= block_size as u64 { break; } } Ok(BamRead {ref_id, pos, mapq, flag, read_name, cigar, sequence, phred, tags}) } } impl Iterator for BamReader { type Item = BamRead; fn next(&mut self) -> Option { match self.parse_read() { Ok(r) => Some(r), Err(err) => { println!("Error: {}", err); None } } } } #[cfg(test)] mod tests { use super::*; #[test] fn it_works() { let bam_path = "/Users/steimle/Documents/Programmes/sv-finder/betya.bam"; let flags: Vec = vec![81, 161, 97, 145, 65, 129, 113, 177]; let bam_reader = BamReader::new(bam_path).unwrap(); let reads = bam_reader .filter(|br| flags.contains(&br.flag)) .take(1000) .collect::>(); assert_eq!( reads[0].read_name, "NB501645:337:HCCMVAFX2:3:11505:17842:2102_GATGGGACGG".to_string() ); } }