lib.rs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. use std::{fs, io::{Read, BufRead}};
  2. use bgzip::{BGZFReader, BGZFError};
  3. #[derive(Debug)]
  4. pub enum BamNucleotid {
  5. Equal, A, C, M, G, R, S, V, T, W, Y, H, K, D, B, N
  6. }
  7. #[derive(Debug)]
  8. pub struct BamSequence(Vec<BamNucleotid>);
  9. impl BamSequence {
  10. pub fn new() -> BamSequence {
  11. BamSequence(Vec::new())
  12. }
  13. pub fn get_nt(e: &u8) -> BamNucleotid {
  14. match e {
  15. 0 => BamNucleotid::Equal,
  16. 1 => BamNucleotid::A,
  17. 2 => BamNucleotid::C,
  18. 3 => BamNucleotid::M,
  19. 4 => BamNucleotid::G,
  20. 5 => BamNucleotid::R,
  21. 6 => BamNucleotid::S,
  22. 7 => BamNucleotid::V,
  23. 8 => BamNucleotid::T,
  24. 9 => BamNucleotid::W,
  25. 10 => BamNucleotid::Y,
  26. 11 => BamNucleotid::H,
  27. 12 => BamNucleotid::K,
  28. 13 => BamNucleotid::D,
  29. 14 => BamNucleotid::B,
  30. 15 => BamNucleotid::N,
  31. _ => panic!("Parsing error")
  32. }
  33. }
  34. pub fn push(&mut self, e: &u8) {
  35. self.0.push(BamSequence::get_nt(e))
  36. }
  37. }
  38. #[derive(Debug, Clone)]
  39. pub enum TagValue {
  40. Int(i8),
  41. U(u8),
  42. Int16(i16),
  43. U16(u16),
  44. Int32(i32),
  45. U32(u32),
  46. F32(f32),
  47. Str(String)
  48. }
  49. pub struct BamRead {
  50. ref_id: i32, // ref_id
  51. pos: i32, // pos
  52. mapq: u8, // mapq
  53. flag: u16, // flag
  54. read_name: String, // read_name
  55. cigar: Vec<(String, u32)>, // cigar
  56. sequence: BamSequence, // sequence
  57. phred: Vec<u8>, // phred
  58. tags: Vec<(String, TagValue)> // tags
  59. }
  60. pub struct BamReader {
  61. reader: BGZFReader<fs::File>,
  62. references: Vec<(String, u32)>
  63. }
  64. impl BamReader {
  65. pub fn new(bam_path: &str) -> Result<BamReader, BGZFError> {
  66. let bam = fs::File::open(bam_path).unwrap();
  67. let reader = BGZFReader::new(bam);
  68. let mut bam_reader = BamReader { reader, references: Vec::new() };
  69. bam_reader.read_header()?;
  70. Ok(bam_reader)
  71. }
  72. fn read_header(&mut self) -> Result<(), BGZFError> {
  73. let mut buffer_4 = [0u8; 4];
  74. let reader = &mut self.reader;
  75. // MAGIC
  76. reader.read_exact(&mut buffer_4)?;
  77. assert_eq!(buffer_4, [66, 65, 77, 1]);
  78. // l_text
  79. // size of the header
  80. reader.read_exact(&mut buffer_4)?;
  81. let l_text: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
  82. // header
  83. let mut buffer_header = vec![0u8; l_text as usize];
  84. reader.read_exact(&mut buffer_header)?;
  85. // let header_txt = String::from_utf8(buffer_header).expect("Can't read the BAM header");
  86. // println!("{}", header_txt);
  87. // List of reference information (n=n_ref)
  88. // # reference sequences
  89. reader.read_exact(&mut buffer_4)?;
  90. let n_ref: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap());
  91. // println!("n_ref {}", n_ref);
  92. // reference vec: refrence_name refertence_size
  93. let mut references: Vec<(String, u32)> = Vec::new();
  94. for _ in 0..n_ref {
  95. // Length of the reference name plus 1 (including NUL)
  96. reader.read_exact(&mut buffer_4)?;
  97. let l_name: u32 = u32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
  98. // Reference sequence name; NUL-terminated
  99. let mut buffer_name = vec![0u8; l_name as usize];
  100. reader.read_exact(&mut buffer_name)?;
  101. let name = String::from_utf8(buffer_name).expect("Can't read the reference name");
  102. // Length of the reference sequence
  103. reader.read_exact(&mut buffer_4)?;
  104. let l_ref: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
  105. references.push((name, l_ref));
  106. }
  107. self.references = references;
  108. Ok(())
  109. }
  110. fn parse_read(&mut self) -> Result<BamRead, BGZFError> {
  111. // buffers alloc
  112. let mut buffer_1 = [0u8; 1];
  113. let mut buffer_2 = [0u8; 2];
  114. let mut buffer_4 = [0u8; 4];
  115. // let mut reader = &self.reader;
  116. // Total length of the alignment record, excluding this field
  117. self.reader.read_exact(&mut buffer_4)?;
  118. // if r.is_err() { break; }; // triggered by eof
  119. let block_size: u32 = u32::from_ne_bytes(buffer_4.clone().try_into().unwrap());
  120. // println!("block size {}", block_size);
  121. let pos_start = self.reader.bgzf_pos();
  122. // Reference sequence ID, −1 ≤ refID < n ref; -1 for a read
  123. // without a mapping position
  124. self.reader.read_exact(&mut buffer_4)?;
  125. let ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
  126. // println!("ref_id {}", ref_id);
  127. // println!("ref_name {}", references[ref_id as usize].0);
  128. // 0-based leftmost coordinate (= POS − 1)
  129. self.reader.read_exact(&mut buffer_4)?;
  130. let pos: i32 = i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"));
  131. // println!("pos {}", pos);
  132. // Length of read name below (= length(QNAME) + 1)
  133. self.reader.read_exact(&mut buffer_1)?;
  134. let l_read_name: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error"));
  135. // println!("l_read_name {}", l_read_name);
  136. // Mapping quality (=MAPQ)
  137. self.reader.read_exact(&mut buffer_1)?;
  138. let mapq: u8 = u8::from_ne_bytes(buffer_1.try_into().expect("Parse error"));
  139. // println!("mapq {}", mapq);
  140. // BAI index bin
  141. self.reader.read_exact(&mut buffer_2)?;
  142. // let bin: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
  143. // println!("bin {}", bin);
  144. // Number of operations in CIGAR,
  145. self.reader.read_exact(&mut buffer_2)?;
  146. let n_cigar_op: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
  147. // println!("n_cigar_op {}", n_cigar_op);
  148. // Bitwise flags (= FLAG)^29
  149. self.reader.read_exact(&mut buffer_2)?;
  150. let flag: u16 = u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"));
  151. // println!("flag {}", flag);
  152. // Length of SEQ
  153. self.reader.read_exact(&mut buffer_4)?;
  154. let l_seq: u32 = u32::from_ne_bytes(buffer_4.try_into().unwrap());
  155. // println!("l_seq {}", l_seq);
  156. // Ref-ID of the next segment (−1 ≤ next refID < n ref)
  157. self.reader.read_exact(&mut buffer_4)?;
  158. // let next_ref_id: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
  159. // println!("next_ref_id {}", next_ref_id);
  160. // 0-based leftmost pos of the next segment (= PNEXT − 1)
  161. self.reader.read_exact(&mut buffer_4)?;
  162. // let next_pos: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
  163. // println!("next_pos {}", next_pos);
  164. // Template length (= TLEN)
  165. self.reader.read_exact(&mut buffer_4)?;
  166. // let tlen: i32 = i32::from_ne_bytes(buffer_4.try_into().unwrap());
  167. // println!("tlen {}", tlen);
  168. // Read name, NUL-terminated (QNAME with trailing ‘\0’)^30
  169. let mut buffer_read_name = vec![0u8; l_read_name as usize];
  170. self.reader.read_exact(&mut buffer_read_name)?;
  171. buffer_read_name.pop();
  172. let read_name = String::from_utf8(buffer_read_name).expect("Can't read the read name");
  173. // println!("read_name {}", read_name);
  174. // CIGAR: op len<<4|op. ‘MIDNSHP=X’→‘012345678’
  175. let l_buffer_cigar = (n_cigar_op as usize) * 4;
  176. let mut buffer_cigar = vec![0u8; l_buffer_cigar];
  177. self.reader.read_exact(&mut buffer_cigar)?;
  178. let cigar: Vec<(String, u32)> = buffer_cigar.chunks(4)
  179. .map(|c| {
  180. let u = u32::from_ne_bytes(c.try_into().unwrap());
  181. let op = u << 28 >> 28;
  182. let n = u >> 4;
  183. match op {
  184. 0 => ("M".to_string(), n),
  185. 1 => ("I".to_string(), n),
  186. 2 => ("D".to_string(), n),
  187. 3 => ("N".to_string(), n),
  188. 4 => ("S".to_string(), n),
  189. 5 => ("H".to_string(), n),
  190. 6 => ("P".to_string(), n),
  191. 7 => ("=".to_string(), n),
  192. 8 => ("X".to_string(), n),
  193. _ => panic!("Error parsing cigar")
  194. }
  195. })
  196. .collect();
  197. // println!("cigar {:?}", cigar);
  198. let l_buffer_seq = (l_seq as usize + 1) / 2;
  199. let mut buffer_seq = vec![0u8; l_buffer_seq];
  200. self.reader.read_exact(&mut buffer_seq)?;
  201. let mut sequence = BamSequence::new();
  202. for e in buffer_seq.into_iter() {
  203. let right = e << 4 >> 4;
  204. let left = e >> 4;
  205. sequence.push(&left);
  206. if right != 0 { sequence.push(&right) };
  207. }
  208. // println!("sequence {:?}", sequence);
  209. let mut phred = vec![0; l_seq as usize];
  210. self.reader.read_exact(&mut phred)?;
  211. let mut tags: Vec<(String, TagValue)> = Vec::new();
  212. loop {
  213. let mut tag_buffer = vec![0u8; 2];
  214. self.reader.read_exact(&mut tag_buffer)?;
  215. let tag = String::from_utf8(tag_buffer).unwrap();
  216. let mut tag_type = vec![0u8];
  217. self.reader.read_exact(&mut tag_type)?;
  218. let tag_v = match tag_type[0] {
  219. b'c' => {
  220. self.reader.read_exact(&mut buffer_1)?;
  221. TagValue::Int(buffer_1.clone()[0] as i8)
  222. },
  223. b'C' => {
  224. self.reader.read_exact(&mut buffer_1)?;
  225. TagValue::U(buffer_1.clone()[0])
  226. },
  227. b's' => {
  228. self.reader.read_exact(&mut buffer_2)?;
  229. TagValue::Int16(
  230. i16::from_ne_bytes(buffer_2.try_into().expect("Parse error"))
  231. )
  232. },
  233. b'S' => {
  234. self.reader.read_exact(&mut buffer_2)?;
  235. TagValue::U16(
  236. u16::from_ne_bytes(buffer_2.try_into().expect("Parse error"))
  237. )
  238. }
  239. b'i' => {
  240. self.reader.read_exact(&mut buffer_4)?;
  241. TagValue::Int32(
  242. i32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
  243. )
  244. },
  245. b'I' => {
  246. self.reader.read_exact(&mut buffer_4)?;
  247. TagValue::U32(
  248. u32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
  249. )
  250. },
  251. b'f' => {
  252. self.reader.read_exact(&mut buffer_4)?;
  253. TagValue::F32(
  254. f32::from_ne_bytes(buffer_4.try_into().expect("Parse error"))
  255. )
  256. },
  257. b'Z' => {
  258. let mut buf: Vec<u8> = Vec::new();
  259. self.reader.read_until(b'\0', &mut buf)?;
  260. buf.pop();
  261. TagValue::Str(String::from_utf8(buf).unwrap())
  262. }
  263. e => panic!("Error unknown tag type. {:?} tag: {:?}", e, tag)
  264. };
  265. tags.push((tag, tag_v));
  266. if self.reader.bgzf_pos() - pos_start >= block_size as u64 { break; }
  267. }
  268. Ok(BamRead {ref_id, pos, mapq, flag, read_name, cigar, sequence, phred, tags})
  269. }
  270. }
  271. impl Iterator for BamReader {
  272. type Item = BamRead;
  273. fn next(&mut self) -> Option<Self::Item> {
  274. match self.parse_read() {
  275. Ok(r) => Some(r),
  276. Err(err) => {
  277. println!("Error: {}", err);
  278. None
  279. }
  280. }
  281. }
  282. }
  283. #[cfg(test)]
  284. mod tests {
  285. use super::*;
  286. #[test]
  287. fn it_works() {
  288. let bam_path = "/Users/steimle/Documents/Programmes/sv-finder/betya.bam";
  289. let flags: Vec<u16> = vec![81, 161, 97, 145, 65, 129, 113, 177];
  290. let bam_reader = BamReader::new(bam_path).unwrap();
  291. let reads = bam_reader
  292. .filter(|br| flags.contains(&br.flag))
  293. .take(1000)
  294. .collect::<Vec<BamRead>>();
  295. assert_eq!(
  296. reads[0].read_name,
  297. "NB501645:337:HCCMVAFX2:3:11505:17842:2102_GATGGGACGG".to_string()
  298. );
  299. }
  300. }