| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343 |
- use std::{fs, io::{Read, BufRead}};
- use bgzip::{BGZFReader, BGZFError};
- #[derive(Debug)]
- pub enum BamNucleotid {
- Equal, A, C, M, G, R, S, V, T, W, Y, H, K, D, B, N
- }
- #[derive(Debug)]
- pub struct BamSequence(Vec<BamNucleotid>);
- impl BamSequence {
- pub fn new() -> BamSequence {
- BamSequence(Vec::new())
- }
- pub fn get_nt(e: &u8) -> BamNucleotid {
- match e {
- 0 => BamNucleotid::Equal,
- 1 => BamNucleotid::A,
- 2 => BamNucleotid::C,
- 3 => BamNucleotid::M,
- 4 => BamNucleotid::G,
- 5 => BamNucleotid::R,
- 6 => BamNucleotid::S,
- 7 => BamNucleotid::V,
- 8 => BamNucleotid::T,
- 9 => BamNucleotid::W,
- 10 => BamNucleotid::Y,
- 11 => BamNucleotid::H,
- 12 => BamNucleotid::K,
- 13 => BamNucleotid::D,
- 14 => BamNucleotid::B,
- 15 => BamNucleotid::N,
- _ => panic!("Parsing error")
- }
- }
- pub fn push(&mut self, e: &u8) {
- self.0.push(BamSequence::get_nt(e))
- }
- }
- #[derive(Debug, Clone)]
- pub enum TagValue {
- Int(i8),
- U(u8),
- Int16(i16),
- U16(u16),
- Int32(i32),
- U32(u32),
- F32(f32),
- Str(String)
- }
- pub struct BamRead {
- ref_id: i32, // ref_id
- pos: i32, // pos
- mapq: u8, // mapq
- flag: u16, // flag
- read_name: String, // read_name
- cigar: Vec<(String, u32)>, // cigar
- sequence: BamSequence, // sequence
- phred: Vec<u8>, // phred
- tags: Vec<(String, TagValue)> // tags
- }
- pub struct BamReader {
- reader: BGZFReader<fs::File>,
- references: Vec<(String, u32)>
- }
- impl BamReader {
- pub fn new(bam_path: &str) -> Result<BamReader, BGZFError> {
- let bam = fs::File::open(bam_path).unwrap();
- let reader = BGZFReader::new(bam);
- let mut bam_reader = BamReader { reader, references: Vec::new() };
- bam_reader.read_header()?;
- Ok(bam_reader)
- }
- fn read_header(&mut self) -> Result<(), BGZFError> {
- let mut buffer_4 = [0u8; 4];
- let reader = &mut self.reader;
-
- // MAGIC
- reader.read_exact(&mut buffer_4)?;
- assert_eq!(buffer_4, [66, 65, 77, 1]);
-
- // l_text
- // size of the header
- reader.read_exact(&mut buffer_4)?;
- let l_text: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
-
- // header
- let mut buffer_header = vec![0u8; l_text as usize];
- reader.read_exact(&mut buffer_header)?;
- // let header_txt = String::from_utf8(buffer_header).expect("Can't read the BAM header");
- // println!("{}", header_txt);
-
- // List of reference information (n=n_ref)
- // # reference sequences
- reader.read_exact(&mut buffer_4)?;
- let n_ref: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap());
- // println!("n_ref {}", n_ref);
-
- // reference vec: refrence_name refertence_size
- let mut references: Vec<(String, u32)> = Vec::new();
- for _ in 0..n_ref {
- // Length of the reference name plus 1 (including NUL)
- reader.read_exact(&mut buffer_4)?;
- let l_name: u32 = u32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
-
- // Reference sequence name; NUL-terminated
- let mut buffer_name = vec![0u8; l_name as usize];
- reader.read_exact(&mut buffer_name)?;
- let name = String::from_utf8(buffer_name).expect("Can't read the reference name");
-
- // Length of the reference sequence
- reader.read_exact(&mut buffer_4)?;
- let l_ref: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
-
- references.push((name, l_ref));
- }
-
- self.references = references;
- Ok(())
- }
- fn parse_read(&mut self) -> Result<BamRead, BGZFError> {
- // buffers alloc
- let mut buffer_1 = [0u8; 1];
- let mut buffer_2 = [0u8; 2];
- let mut buffer_4 = [0u8; 4];
- // let mut reader = &self.reader;
- // Total length of the alignment record, excluding this field
- self.reader.read_exact(&mut buffer_4)?;
- // if r.is_err() { break; }; // triggered by eof
- let block_size: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
- // println!("block size {}", block_size);
- let pos_start = self.reader.bgzf_pos();
- // Reference sequence ID, −1 ≤ refID < n ref; -1 for a read
- // without a mapping position
- self.reader.read_exact(&mut buffer_4)?;
- let ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
- // println!("ref_id {}", ref_id);
- // println!("ref_name {}", references[ref_id as usize].0);
- // 0-based leftmost coordinate (= POS − 1)
- self.reader.read_exact(&mut buffer_4)?;
- let pos: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
- // println!("pos {}", pos);
- // Length of read name below (= length(QNAME) + 1)
- self.reader.read_exact(&mut buffer_1)?;
- let l_read_name: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error"));
- // println!("l_read_name {}", l_read_name);
- // Mapping quality (=MAPQ)
- self.reader.read_exact(&mut buffer_1)?;
- let mapq: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error"));
- // println!("mapq {}", mapq);
- // BAI index bin
- self.reader.read_exact(&mut buffer_2)?;
- // let bin: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
- // println!("bin {}", bin);
- // Number of operations in CIGAR,
- self.reader.read_exact(&mut buffer_2)?;
- let n_cigar_op: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
- // println!("n_cigar_op {}", n_cigar_op);
- // Bitwise flags (= FLAG)^29
- self.reader.read_exact(&mut buffer_2)?;
- let flag: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
-
- // println!("flag {}", flag);
- // Length of SEQ
- self.reader.read_exact(&mut buffer_4)?;
- let l_seq: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap());
- // println!("l_seq {}", l_seq);
- // Ref-ID of the next segment (−1 ≤ next refID < n ref)
- self.reader.read_exact(&mut buffer_4)?;
- // let next_ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
- // println!("next_ref_id {}", next_ref_id);
- // 0-based leftmost pos of the next segment (= PNEXT − 1)
- self.reader.read_exact(&mut buffer_4)?;
- // let next_pos: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
- // println!("next_pos {}", next_pos);
- // Template length (= TLEN)
- self.reader.read_exact(&mut buffer_4)?;
- // let tlen: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
- // println!("tlen {}", tlen);
- // Read name, NUL-terminated (QNAME with trailing ‘\0’)^30
- let mut buffer_read_name = vec![0u8; l_read_name as usize];
- self.reader.read_exact(&mut buffer_read_name)?;
- buffer_read_name.pop();
- let read_name = String::from_utf8(buffer_read_name).expect("Can't read the read name");
- // println!("read_name {}", read_name);
- // CIGAR: op len<<4|op. ‘MIDNSHP=X’→‘012345678’
- let l_buffer_cigar = (n_cigar_op as usize) * 4;
- let mut buffer_cigar = vec![0u8; l_buffer_cigar];
- self.reader.read_exact(&mut buffer_cigar)?;
- let cigar: Vec<(String, u32)> = buffer_cigar.chunks(4)
- .map(|c| {
- let u = u32::from_ne_bytes(c.try_into().unwrap());
- let op = u << 28 >> 28;
- let n = u >> 4;
- match op {
- 0 => ("M".to_string(), n),
- 1 => ("I".to_string(), n),
- 2 => ("D".to_string(), n),
- 3 => ("N".to_string(), n),
- 4 => ("S".to_string(), n),
- 5 => ("H".to_string(), n),
- 6 => ("P".to_string(), n),
- 7 => ("=".to_string(), n),
- 8 => ("X".to_string(), n),
- _ => panic!("Error parsing cigar")
- }
- })
- .collect();
- // println!("cigar {:?}", cigar);
- let l_buffer_seq = (l_seq as usize + 1) / 2;
- let mut buffer_seq = vec![0u8; l_buffer_seq];
- self.reader.read_exact(&mut buffer_seq)?;
- let mut sequence = BamSequence::new();
- for e in buffer_seq.into_iter() {
- let right = e << 4 >> 4;
- let left = e >> 4;
- sequence.push(&left);
- if right != 0 { sequence.push(&right) };
- }
- // println!("sequence {:?}", sequence);
-
- let mut phred = vec![0; l_seq as usize];
- self.reader.read_exact(&mut phred)?;
- let mut tags: Vec<(String, TagValue)> = Vec::new();
- loop {
- let mut tag_buffer = vec![0u8; 2];
- self.reader.read_exact(&mut tag_buffer)?;
- let tag = String::from_utf8(tag_buffer).unwrap();
-
- let mut tag_type = vec![0u8];
- self.reader.read_exact(&mut tag_type)?;
- let tag_v = match tag_type[0] {
- b'c' => {
- self.reader.read_exact(&mut buffer_1)?;
- TagValue::Int(buffer_1.clone()[0] as i8)
- },
- b'C' => {
- self.reader.read_exact(&mut buffer_1)?;
- TagValue::U(buffer_1.clone()[0])
- },
- b's' => {
- self.reader.read_exact(&mut buffer_2)?;
- TagValue::Int16(
- i16::from_ne_bytes(buffer_2.try_into().expect("Parse error"))
- )
- },
- b'S' => {
- self.reader.read_exact(&mut buffer_2)?;
- TagValue::U16(
- u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"))
- )
- }
- b'i' => {
- self.reader.read_exact(&mut buffer_4)?;
- TagValue::Int32(
- i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
- )
- },
- b'I' => {
- self.reader.read_exact(&mut buffer_4)?;
- TagValue::U32(
- u32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
- )
- },
- b'f' => {
- self.reader.read_exact(&mut buffer_4)?;
- TagValue::F32(
- f32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
- )
- },
- b'Z' => {
- let mut buf: Vec<u8> = Vec::new();
- self.reader.read_until(b'\0', &mut buf)?;
- buf.pop();
- TagValue::Str(String::from_utf8(buf).unwrap())
- }
- e => panic!("Error unknown tag type. {:?} tag: {:?}", e, tag)
- };
- tags.push((tag, tag_v));
- if self.reader.bgzf_pos() - pos_start >= block_size as u64 { break; }
- }
- Ok(BamRead {ref_id, pos, mapq, flag, read_name, cigar, sequence, phred, tags})
- }
- }
- impl Iterator for BamReader {
- type Item = BamRead;
- fn next(&mut self) -> Option<Self::Item> {
- match self.parse_read() {
- Ok(r) => Some(r),
- Err(err) => {
- println!("Error: {}", err);
- None
- }
- }
- }
- }
- #[cfg(test)]
- mod tests {
- use super::*;
- #[test]
- fn it_works() {
- let bam_path = "/Users/steimle/Documents/Programmes/sv-finder/betya.bam";
- let flags: Vec<u16> = vec![81, 161, 97, 145, 65, 129, 113, 177];
-
- let bam_reader = BamReader::new(bam_path).unwrap();
-
- let reads = bam_reader
- .filter(|br| flags.contains(&br.flag))
- .take(1000)
- .collect::<Vec<BamRead>>();
- assert_eq!(
- reads[0].read_name,
- "NB501645:337:HCCMVAFX2:3:11505:17842:2102_GATGGGACGG".to_string()
- );
- }
- }
|