Thomas 3 年 前
コミット
91a4787fa8
1 ファイル変更343 行追加0 行削除
  1. 343 0
      src/lib.rs

+ 343 - 0
src/lib.rs

@@ -0,0 +1,343 @@
+use std::{fs, io::{Read, BufRead}};
+use bgzip::{BGZFReader, BGZFError};
+
+#[derive(Debug)]
+pub enum BamNucleotid {
+    Equal, A, C, M, G, R, S, V, T, W, Y, H, K, D, B, N
+}
+#[derive(Debug)]
+pub struct  BamSequence(Vec<BamNucleotid>);
+
+impl BamSequence {
+    pub fn new() -> BamSequence {
+        BamSequence(Vec::new())
+    }
+    pub fn get_nt(e: &u8) -> BamNucleotid {
+        match e {
+            0  => BamNucleotid::Equal,
+            1  => BamNucleotid::A,
+            2  => BamNucleotid::C,
+            3  => BamNucleotid::M,
+            4  => BamNucleotid::G,
+            5  => BamNucleotid::R,
+            6  => BamNucleotid::S,
+            7  => BamNucleotid::V,
+            8  => BamNucleotid::T,
+            9  => BamNucleotid::W,
+            10 => BamNucleotid::Y,
+            11 => BamNucleotid::H,
+            12 => BamNucleotid::K,
+            13 => BamNucleotid::D,
+            14 => BamNucleotid::B,
+            15 => BamNucleotid::N,
+            _  => panic!("Parsing error")
+        }
+    }
+    pub fn push(&mut self, e: &u8) {
+        self.0.push(BamSequence::get_nt(e))
+    }
+}
+#[derive(Debug, Clone)]
+pub enum TagValue {
+    Int(i8),
+    U(u8),
+    Int16(i16),
+    U16(u16),
+    Int32(i32),
+    U32(u32),
+    F32(f32),
+    Str(String)
+}
+
+pub type BamRead = (
+    i32,                    // ref_id
+    i32,                    // pos
+    u8,                     // mapq
+    u16,                    // flag
+    String,                 // read_name
+    Vec<(String, u32)>,     // cigar
+    BamSequence,            // sequence
+    Vec<u8>,                // phred
+    Vec<(String, TagValue)> // tags
+);
+
+pub struct BamReader {
+    reader: BGZFReader<fs::File>,
+    references: Vec<(String, u32)>
+}
+
+impl BamReader {
+    pub fn new(bam_path: &str) -> Result<BamReader, BGZFError> {
+        let bam = fs::File::open(bam_path).unwrap();
+        let reader = BGZFReader::new(bam);
+        let mut bam_reader = BamReader { reader, references: Vec::new() };
+        bam_reader.read_header()?;
+        Ok(bam_reader)
+    }
+
+    fn read_header(&mut self) -> Result<(), BGZFError> {
+        let mut buffer_4 = [0u8; 4];
+        let reader = &mut self.reader;
+        
+        // MAGIC
+        reader.read_exact(&mut buffer_4)?;
+        assert_eq!(buffer_4, [66, 65, 77, 1]);
+    
+        // l_text
+        // size of the header
+        reader.read_exact(&mut buffer_4)?;
+        let l_text: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
+    
+        // header
+        let mut buffer_header = vec![0u8; l_text as usize];
+        reader.read_exact(&mut buffer_header)?;
+        // let header_txt = String::from_utf8(buffer_header).expect("Can't read the BAM header");
+        // println!("{}", header_txt);
+    
+        // List of reference information (n=n_ref)
+        // # reference sequences
+        reader.read_exact(&mut buffer_4)?;
+        let n_ref: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap());
+        // println!("n_ref {}", n_ref);
+    
+        // reference vec: refrence_name refertence_size
+        let mut references: Vec<(String, u32)> = Vec::new();
+        for _ in 0..n_ref {
+            // Length of the reference name plus 1 (including NUL)
+            reader.read_exact(&mut buffer_4)?;
+            let l_name: u32 = u32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
+    
+            // Reference sequence name; NUL-terminated
+            let mut buffer_name = vec![0u8; l_name as usize];
+            reader.read_exact(&mut buffer_name)?;
+            let name = String::from_utf8(buffer_name).expect("Can't read the reference name");
+    
+            // Length of the reference sequence
+            reader.read_exact(&mut buffer_4)?;
+            let l_ref: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
+    
+            references.push((name, l_ref));
+        }
+    
+        self.references = references;
+        Ok(())
+    }
+
+    fn parse_read(&mut self) -> Result<BamRead, BGZFError> {
+        // buffers alloc
+        let mut buffer_1 = [0u8; 1];
+        let mut buffer_2 = [0u8; 2];
+        let mut buffer_4 = [0u8; 4];
+        // let mut reader = &self.reader;
+        // Total length of the alignment record, excluding this field
+        self.reader.read_exact(&mut buffer_4)?;
+        // if r.is_err() { break; }; // triggered by eof
+        let block_size: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
+        // println!("block size {}", block_size);
+
+        let pos_start = self.reader.bgzf_pos();
+
+        // Reference sequence ID, −1 ≤ refID < n ref; -1 for a read
+        // without a mapping position
+        self.reader.read_exact(&mut buffer_4)?;
+        let ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
+        // println!("ref_id {}", ref_id);
+        // println!("ref_name {}", references[ref_id as usize].0);
+
+        // 0-based leftmost coordinate (= POS − 1)
+        self.reader.read_exact(&mut buffer_4)?;
+        let pos: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
+        // println!("pos {}", pos);
+
+        // Length of read name below (= length(QNAME) + 1)
+        self.reader.read_exact(&mut buffer_1)?;
+        let l_read_name: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error"));
+        // println!("l_read_name {}", l_read_name);
+
+        // Mapping quality (=MAPQ)
+        self.reader.read_exact(&mut buffer_1)?;
+        let mapq: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error"));
+        // println!("mapq {}", mapq);
+
+        // BAI index bin
+        self.reader.read_exact(&mut buffer_2)?;
+        // let bin: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
+        // println!("bin {}", bin);
+
+        // Number of operations in CIGAR,
+        self.reader.read_exact(&mut buffer_2)?;
+        let n_cigar_op: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
+        // println!("n_cigar_op {}", n_cigar_op);
+
+        // Bitwise flags (= FLAG)^29
+        self.reader.read_exact(&mut buffer_2)?;
+        let flag: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
+        
+        // println!("flag {}", flag);
+
+        // Length of SEQ 
+        self.reader.read_exact(&mut buffer_4)?;
+        let l_seq: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap());
+        // println!("l_seq {}", l_seq);
+
+        // Ref-ID of the next segment (−1 ≤ next refID < n ref) 
+        self.reader.read_exact(&mut buffer_4)?;
+        // let next_ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
+        // println!("next_ref_id {}", next_ref_id);
+
+        // 0-based leftmost pos of the next segment  (= PNEXT − 1)
+        self.reader.read_exact(&mut buffer_4)?;
+        // let next_pos: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
+        // println!("next_pos {}", next_pos);
+
+        // Template length (= TLEN)
+        self.reader.read_exact(&mut buffer_4)?;
+        // let tlen: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
+        // println!("tlen {}", tlen);
+
+        // Read name, NUL-terminated (QNAME with trailing ‘\0’)^30
+        let mut buffer_read_name = vec![0u8; l_read_name as usize];
+        self.reader.read_exact(&mut buffer_read_name)?;
+        buffer_read_name.pop();
+        let read_name = String::from_utf8(buffer_read_name).expect("Can't read the read name");
+        // println!("read_name {}", read_name);
+
+        // CIGAR: op len<<4|op. ‘MIDNSHP=X’→‘012345678’ 
+        let l_buffer_cigar = (n_cigar_op as usize) * 4;
+        let mut buffer_cigar = vec![0u8; l_buffer_cigar];
+        self.reader.read_exact(&mut buffer_cigar)?;
+        let cigar: Vec<(String, u32)> = buffer_cigar.chunks(4)
+        .map(|c| { 
+            let u = u32::from_ne_bytes(c.try_into().unwrap());
+            let op = u << 28 >> 28;
+            let n = u >> 4;
+            match op {
+                0 => ("M".to_string(), n),
+                1 => ("I".to_string(), n),
+                2 => ("D".to_string(), n),
+                3 => ("N".to_string(), n),
+                4 => ("S".to_string(), n),
+                5 => ("H".to_string(), n),
+                6 => ("P".to_string(), n),
+                7 => ("=".to_string(), n),
+                8 => ("X".to_string(), n),
+                _ => panic!("Error parsing cigar")
+            }
+        })
+        .collect();
+        // println!("cigar {:?}", cigar);
+
+        let l_buffer_seq = (l_seq as usize + 1) / 2;
+        let mut buffer_seq = vec![0u8; l_buffer_seq];
+        self.reader.read_exact(&mut buffer_seq)?;
+        let mut sequence = BamSequence::new();
+        for e in buffer_seq.into_iter() {
+            let right = e << 4 >> 4;
+            let left = e >> 4;
+            sequence.push(&left);
+            if right != 0 { sequence.push(&right) };
+        }
+        // println!("sequence {:?}", sequence);
+        
+        let mut phred = vec![0; l_seq as usize];
+        self.reader.read_exact(&mut phred)?;
+
+        let mut tags: Vec<(String, TagValue)> = Vec::new();
+        loop {
+            let mut tag_buffer = vec![0u8; 2];
+            self.reader.read_exact(&mut tag_buffer)?;
+            let tag = String::from_utf8(tag_buffer).unwrap();
+            
+            let mut tag_type = vec![0u8];
+            self.reader.read_exact(&mut tag_type)?;
+
+            let tag_v = match tag_type[0] {
+                b'c' => {
+                    self.reader.read_exact(&mut buffer_1)?;
+                    TagValue::Int(buffer_1.clone()[0] as i8)
+                },
+                b'C' => {
+                    self.reader.read_exact(&mut buffer_1)?;
+                    TagValue::U(buffer_1.clone()[0])
+                },
+                b's' => {
+                    self.reader.read_exact(&mut buffer_2)?;
+                    TagValue::Int16(
+                        i16::from_ne_bytes(buffer_2.try_into().expect("Parse error"))
+                    )
+                },
+                b'S' => {
+                    self.reader.read_exact(&mut buffer_2)?;
+                    TagValue::U16(
+                        u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"))
+                    )
+                }
+                b'i' => {
+                    self.reader.read_exact(&mut buffer_4)?;
+                    TagValue::Int32(
+                        i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
+                    )
+                },
+                b'I' => {
+                    self.reader.read_exact(&mut buffer_4)?;
+                    TagValue::U32(
+                        u32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
+                    )
+                },
+                b'f' => {
+                    self.reader.read_exact(&mut buffer_4)?;
+                    TagValue::F32(
+                        f32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
+                    )
+                },
+                b'Z' => {
+                    let mut buf: Vec<u8> = Vec::new();
+                    self.reader.read_until(b'\0', &mut buf)?;
+                    buf.pop();
+                    TagValue::Str(String::from_utf8(buf).unwrap())
+                }
+                e => panic!("Error unknown tag type. {:?} tag: {:?}", e, tag)
+            };
+            tags.push((tag, tag_v));
+
+            // if block_size - current_read == 0 { break; }
+            if self.reader.bgzf_pos() - pos_start >= block_size as u64 { break; }
+        }
+        Ok((ref_id, pos, mapq, flag, read_name, cigar, sequence, phred, tags))
+    }
+}
+
+impl Iterator for BamReader {
+    type Item = BamRead;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Ok(r) = self.parse_read() {
+            Some(r)
+        } else {
+            None
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let bam_path = "/Users/steimle/Documents/Programmes/sv-finder/betya.bam";
+        let flags: Vec<u16> = vec![81, 161, 97, 145, 65, 129, 113, 177];
+        
+        let bam_reader = BamReader::new(bam_path).unwrap();
+        
+        let reads = bam_reader
+        .filter(|br| flags.contains(&br.3))
+        .take(1000)
+        .collect::<Vec<BamRead>>();
+
+        assert_eq!(
+            reads[0].4,
+            "NB501645:337:HCCMVAFX2:3:11505:17842:2102_GATGGGACGG".to_string()
+        );
+    }
+}