main.rs 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. use duct::cmd;
  2. use faimm::IndexedFasta;
  3. fn main() {
  4. let ref_path = "/home/thomas/NGS/ref/hg19.fa";
  5. // let sequence = "
  6. // CATTATATTCTTTTTTACAGTGCCACGGGTATATTTAGTAAACATTGAAATGTGTGGTTG
  7. // GAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTGTA
  8. // GACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATATTA
  9. // TTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACGGC
  10. // TGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTAGGTCCAAGAACTACAACGGCT
  11. // GCTGCCGGCCGGGCCCGCTCCCTGGCCTACCTTGCACAAGAAGTCGACATCGTAGAAGGC
  12. // GGACAGAAGTGTGGTCGACATGTTTCTGGAA"
  13. // .replace("\n", "")
  14. // .replace(" ", "");
  15. // let sequence = "
  16. // CGTTATTCGTCGTGGCTCAAGCCCGGCCACGCCGCCCCAAGGGCTCCTCCCGACCTCCCG
  17. // GCCTGCCGCTCCGGCCACTGCGGGATCCAGAAACATGTCGACCACACTTCTGTCCGCCTT
  18. // CTACGATGTCGACTTCTTGTGCAAGGTAGGCCAGGGAGCGGGCCCGGCCGGCAGCAGCCG
  19. // TTGTAGTTCTTGGACCTAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGCCGT
  20. // GACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATAATA
  21. // TATGAGTTT"
  22. // .replace("\n", "")
  23. // .replace(" ", "");
  24. //2011N160703-HAOAA 1
  25. let sequence = "
  26. CCTCCCGCCCTGCCTTTCAGGTCGGGAAAGTCCCGGGGTTTGCAAAAGAGTGTCCGAGCG
  27. CCCTGGAGGCGGGGAGGGCGGCAAGGAGGGCGCCGGTGTCGCGGTTGAGTTTCTCCACTG
  28. CCGACCGCGGCCACGCTGCCCGGGGCTTCCCGGACAGGCTTCGCGCCGCCCACCTCGGCA
  29. GCCGGGGCGGA
  30. TGG
  31. CCACACAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGC
  32. CGTGACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATA
  33. ATATATGAGTT"
  34. .replace("\n", "")
  35. .replace(" ", "");
  36. //2011N160703-HAOAA 2
  37. // let sequence = "
  38. // TGGAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTG
  39. // TAGACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATAT
  40. // TATTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACG
  41. // GCTGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTGTGTGGCCATCCGCCCCGGC
  42. // TGCCGAGGTGGGCGGCGCGAAGCCTGTCCGGGAAGCCCCGGGCAGCGTGGCCGCGGTCGG
  43. // CAGTGGAGAAACTCAACCGCGACACC"
  44. // .replace("\n", "")
  45. // .replace(" ", "");
  46. let alns = Alignments::to_reference(&sequence, ref_path);
  47. alns.ranges.iter().for_each(|r| println!("{:?}", r));
  48. }
  49. pub fn bwa_aln(ref_path: &str, query_sequence: &str) -> Vec<Sam> {
  50. let stdout = cmd!("echo", "-e", format!("\">sequence_to_query\n{}\"", query_sequence))
  51. .pipe(cmd!("bwa", "mem", ref_path, "-"))
  52. .stdout_capture()
  53. .stderr_capture()
  54. .read()
  55. .unwrap();
  56. stdout.split("\n")
  57. .filter(|e| e.contains("sequence_to_query"))
  58. .map(|e| Sam::new_from_bwa_output(e))
  59. .into_iter()
  60. .map(|mut e| {
  61. // if the sequence is returned by bwa with an N at first position remove it
  62. if e.seq.len() != query_sequence.len() {
  63. if e.seq.chars().nth(0).unwrap() == 'N' {
  64. let (_, seq) = e.seq.split_at(1);
  65. e.seq = seq.to_string();
  66. let mut cig = e.cigar.0.get_mut(0).unwrap();
  67. cig.1 -= 1;
  68. }
  69. if e.seq.chars().last().unwrap() == 'N' {
  70. let (seq, _) = e.seq.split_at(e.seq.len());
  71. e.seq = seq.to_string();
  72. let cl = e.cigar.0.len();
  73. let mut cig = e.cigar.0.get_mut(cl - 1).unwrap();
  74. cig.1 -= 1;
  75. }
  76. }
  77. e
  78. })
  79. .collect()
  80. }
  81. #[derive(Debug, Clone)]
  82. pub struct Sam {
  83. pub qname : String,
  84. pub flag : u16,
  85. pub ref_name: String,
  86. pub pos : i32,
  87. pub mapq : i32,
  88. pub cigar : Cigar,
  89. pub rnext : String,
  90. pub pnext : String,
  91. pub tlen : String,
  92. pub seq : String
  93. }
  94. impl Sam {
  95. pub fn new_from_bwa_output(sam_string: &str) -> Sam {
  96. let mut split = sam_string.split("\t");
  97. Sam {
  98. qname : split.next().unwrap().to_string(),
  99. flag : split.next().unwrap().parse().unwrap(),
  100. ref_name: split.next().unwrap().to_string(),
  101. pos : split.next().unwrap().parse().unwrap(),
  102. mapq : split.next().unwrap().parse().unwrap(),
  103. cigar : Cigar::new_from_str(&split.next().unwrap().to_string()),
  104. rnext : split.next().unwrap().to_string(),
  105. pnext : split.next().unwrap().to_string(),
  106. tlen : split.next().unwrap().to_string(),
  107. seq : split.next().unwrap().to_string()
  108. }
  109. }
  110. pub fn is_rc(&self) -> bool {
  111. self.flag & 0x10 == 0x10
  112. }
  113. }
  114. #[derive(Debug, Clone)]
  115. pub struct Cigar(pub Vec<(String, u32)>);
  116. impl Cigar {
  117. pub fn new_from_str(cigar_str: &str) -> Cigar {
  118. let mut res: Vec<(String, u32)> = Vec::new();
  119. let mut num_acc = String::new();
  120. for c in cigar_str.split("") {
  121. if c != "" { match c.parse::<usize>() {
  122. Ok(_) => num_acc.push_str(c), // if a number added to the accumulator
  123. Err(_) => {
  124. let current_add = num_acc.parse::<u32>().unwrap();
  125. num_acc = "".to_string();
  126. res.push((c.to_string(), current_add));
  127. }
  128. }}
  129. }
  130. Cigar(res)
  131. }
  132. pub fn len(&self) -> u32 {
  133. self.0.iter()
  134. .map(|e| e.1)
  135. .reduce(|acc, e| acc + e)
  136. .unwrap()
  137. }
  138. pub fn expand(&self) -> String {
  139. let mut res = String::new();
  140. for (op, n) in self.0.iter() {
  141. let c = op.chars().next().unwrap();
  142. for _ in 0..(*n) {
  143. res.push(c);
  144. }
  145. }
  146. res
  147. }
  148. }
  149. pub struct Alignments {
  150. pub query_sequence : String,
  151. pub sam : Vec<Sam>,
  152. pub ranges : Vec<Vec<((i32, i32), (i32, i32))>>,
  153. }
  154. impl Alignments {
  155. pub fn to_reference(query_sequence: &str, ref_path: &str) -> Alignments {
  156. let res_sam = bwa_aln(ref_path, query_sequence);
  157. let mut res_ranges = Vec::new();
  158. for sam in &res_sam {
  159. res_ranges.push(ranges_from_cigar(sam.pos, sam.cigar.clone(), sam.is_rc()));
  160. }
  161. Alignments {
  162. query_sequence: query_sequence.to_string(),
  163. sam: res_sam,
  164. ranges: res_ranges
  165. }
  166. }
  167. }
  168. pub fn revcomp(dna: &str) -> String {
  169. let mut rdna: String = String::with_capacity(dna.len());
  170. for c in dna.chars().rev() {
  171. rdna.push(switch_base(c));
  172. }
  173. rdna
  174. }
  175. pub fn switch_base(c: char) -> char {
  176. match c {
  177. 'a' => 't',
  178. 'c' => 'g',
  179. 't' => 'a',
  180. 'g' => 'c',
  181. 'u' => 'a',
  182. 'A' => 'T',
  183. 'C' => 'G',
  184. 'T' => 'A',
  185. 'G' => 'C',
  186. 'U' => 'A',
  187. _ => 'N'
  188. }
  189. }
  190. /// return [start, stop] 1-based positions relative to reference and query
  191. pub fn ranges_from_cigar(ref_start: i32, cigar: Cigar, is_rc: bool) -> Vec<((i32, i32), (i32, i32))> {
  192. let cigar = if is_rc { cigar.expand().chars().rev().collect::<Vec<char>>() } else { cigar.expand().chars().collect::<Vec<char>>() };
  193. let mut ref_counter = 0;
  194. let mut query_counter = 0;
  195. let mut query_matches: Vec<i32> = Vec::new();
  196. let mut ref_matches: Vec<i32> = Vec::new();
  197. for op in cigar.iter() {
  198. match op {
  199. 'M' | '=' => {
  200. query_matches.push(query_counter);
  201. ref_matches.push(ref_counter);
  202. ref_counter += 1;
  203. query_counter += 1;
  204. },
  205. 'S' | 'H' | 'I' => {
  206. query_counter += 1;
  207. },
  208. 'D' | 'N' => {
  209. ref_counter += 1;
  210. },
  211. _ => panic!("Unknown cigar operand")
  212. }
  213. }
  214. let mut query_ranges = Vec::new();
  215. let mut acc_range = None;
  216. for qp in query_matches.into_iter() {
  217. if let Some((start, stop)) = acc_range {
  218. if stop == qp - 1 {
  219. acc_range = Some((start, qp));
  220. } else {
  221. query_ranges.push(acc_range.unwrap());
  222. acc_range = Some((qp, qp));
  223. }
  224. } else {
  225. acc_range = Some((qp, qp));
  226. }
  227. }
  228. query_ranges.push(acc_range.unwrap());
  229. query_ranges = query_ranges.iter().map(|e| (e.0 +1, e.1 +1)).collect();
  230. let mut ref_ranges = Vec::new();
  231. let mut acc_range = None;
  232. for qp in ref_matches.into_iter() {
  233. if let Some((start, stop)) = acc_range {
  234. if stop == qp - 1 {
  235. acc_range = Some((start, qp));
  236. } else {
  237. query_ranges.push(acc_range.unwrap());
  238. acc_range = Some((qp, qp));
  239. }
  240. } else {
  241. acc_range = Some((qp, qp));
  242. }
  243. }
  244. ref_ranges.push(acc_range.unwrap());
  245. ref_ranges = ref_ranges.iter().map(|(start, stop)| {
  246. if is_rc {
  247. (ref_start + stop, ref_start + start)
  248. } else {
  249. (ref_start + start, ref_start + stop)
  250. }
  251. }).collect();
  252. ref_ranges.into_iter().zip(query_ranges.into_iter()).collect()
  253. }
  254. pub fn format_seq(name: &str, sequence: &str, line_size: usize) -> String {
  255. // todo check fasta name compl
  256. let res = sequence.chars()
  257. .collect::<Vec<char>>()
  258. .chunks(line_size)
  259. .map(|c| c.iter().collect::<String>())
  260. .collect::<Vec<String>>()
  261. .join("");
  262. format!(">{name}\n{res}")
  263. }
  264. pub fn compose_seq(ref_path: &str, ranges: Vec<(String, i32, i32)>) -> String {
  265. let fa = IndexedFasta::from_file(ref_path).expect("Error opening fa");
  266. let mut sequence = String::new();
  267. for (contig, start, stop) in ranges {
  268. let chr_index = fa.fai().tid(&contig).expect("Cannot find chr in index");
  269. if start < stop {
  270. let v = fa.view(chr_index, start as usize,stop as usize).expect("Cannot get .fa view");
  271. sequence = format!("{}{}", sequence, v.to_string());
  272. } else {
  273. let v = fa.view(chr_index, stop as usize, start as usize).expect("Cannot get .fa view");
  274. sequence = format!("{}{}", sequence, revcomp(&v.to_string()));
  275. }
  276. }
  277. sequence
  278. }