lib.rs

// MIT License
// Copyright (c) [2022] [Thomas Steimlé]
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
use std::{ops::Range, collections::{HashMap, HashSet}};
use petgraph::{graph::{Graph, NodeIndex, EdgeIndex}, Undirected};
use kseq::parse_path;
use faimm::IndexedFasta;
use duct::cmd;
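
// Overlap graph of reads: nodes hold read sequences, edges hold the detected
// overlaps, and `offsets` records, for each node of a stretch, its stretch id
// and its offset relative to the stretch's leftmost read.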
pub struct SequencesGraph {
    sequences: HashMap<NodeIndex, String>,
    seq_hash : HashSet<String>,
    names : HashMap<NodeIndex, String>,
    overlaps : HashMap<EdgeIndex, Overlap>,
    offsets : HashMap<String, (NodeIndex, usize, i32)>, // node_index, stretch_id, offset
    graph : Graph<String, usize, Undirected>,
    min_overlapping: usize,
    max_consecutive: usize,
    max_diffs : usize,
}
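
// Overlap between two reads: the matching range in read `id_a` and the
// matching range in read `id_b`.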
#[derive(Debug)]
pub struct Overlap {
    id_a : NodeIndex,
    range_a: Range<usize>,
    id_b : NodeIndex,
    range_b: Range<usize>,
}
impl SequencesGraph {
    pub fn new (min_overlapping: usize, max_consecutive: usize, max_diffs: usize) -> SequencesGraph {
        SequencesGraph {
            sequences: HashMap::new(),
            seq_hash : HashSet::new(),
            names : HashMap::new(),
            overlaps : HashMap::new(),
            offsets : HashMap::new(),
            graph : Graph::new_undirected(),
            min_overlapping,
            max_consecutive,
            max_diffs
        }
    }
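
    // Loads read pairs from two FASTQ files, merges mates that overlap (otherwise
    // keeps R1 and the reverse complement of R2 as separate nodes), then builds
    // the overlap graph and assigns per-stretch offsets.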
    pub fn from_fq_pe (&mut self, r1_path: &str, r2_path: &str, min_overlapping_len_pe: usize) {
        let mut r1p = parse_path(r1_path).unwrap();
        let mut r2p = parse_path(r2_path).unwrap();
        let mut n_pairs = 0;
        while let (Some(r1), Some(r2)) = (r1p.iter_record().unwrap(), r2p.iter_record().unwrap()) {
            let a = r1.seq().as_bytes().to_vec();
            let b = r2.seq().as_bytes().to_vec();
            self.add_from_pe(a, b, r1.head().to_string(), r2.head().to_string(), min_overlapping_len_pe);
            n_pairs += 1;
        }
        println!("{n_pairs} pairs parsed.");
        self.draw_edges();
        println!("{} overlaps found.", self.overlaps.len());
        self.remove_single_nodes();
        println!("{} nodes to assemble.", self.names.len());
        self.find_leftmost_node();
    }
    pub fn add_node (&mut self, sequence: String, name: String) {
        let id = self.graph.add_node(sequence.clone());
        self.sequences.insert(id, sequence);
        self.names.insert(id, name);
    }
    pub fn add_edge (&mut self, id_a: &NodeIndex, id_b: &NodeIndex) {
        let a = self.sequences.get(id_a).unwrap().as_bytes().to_vec();
        let b = self.sequences.get(id_b).unwrap().as_bytes().to_vec();
        if let Some(r) = get_overlaps(&a, &b, self.min_overlapping, self.max_consecutive, self.max_diffs) { // normally the case
            self.overlaps.insert(
                self.graph.add_edge(id_a.clone(), id_b.clone(), r.0.len()),
                Overlap { id_a: *id_a, range_a: r.0, id_b: *id_b, range_b: r.1}
            );
        } else if let Some(r) = get_overlaps(&b, &a, self.min_overlapping, self.max_consecutive, self.max_diffs) {
            self.overlaps.insert(
                self.graph.add_edge(id_a.clone(), id_b.clone(), r.0.len()),
                Overlap { id_a: *id_a, range_a: r.1, id_b: *id_b, range_b: r.0}
            );
        }
    }
    pub fn add_from_pe (&mut self, a: Vec<u8>, b: Vec<u8>, a_name: String, b_name: String, min_overlapping_len_pe: usize) {
        let a_rc = revcomp(std::str::from_utf8(&a).unwrap()).as_bytes().to_vec();
        let b_rc = revcomp(std::str::from_utf8(&b).unwrap()).as_bytes().to_vec();
        let max_consecutive = self.max_consecutive;
        let max_diffs = self.max_diffs;
        let mut seq: Option<Vec<u8>> = None;
        if let Some(r) = get_overlaps(&a_rc, &b, min_overlapping_len_pe, max_consecutive, max_diffs) { // normally the case
            seq = Some(fuse_overlaps(&a_rc, &b, &r.0, &r.1));
        } else if let Some(r) = get_overlaps(&b, &a, min_overlapping_len_pe, max_consecutive, max_diffs) {
            seq = Some(fuse_overlaps(&b, &a, &r.0, &r.1));
        } else if let Some(r) = get_overlaps(&a, &b, min_overlapping_len_pe, max_consecutive, max_diffs) {
            seq = Some(fuse_overlaps(&a, &b, &r.0, &r.1));
        } else if let Some(r) = get_overlaps(&b, &a_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
            seq = Some(fuse_overlaps(&b, &a_rc, &r.0, &r.1));
        } else if let Some(r) = get_overlaps(&a, &b_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
            seq = Some(fuse_overlaps(&a, &b_rc, &r.0, &r.1));
        } else if let Some(r) = get_overlaps(&a_rc, &b_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
            seq = Some(fuse_overlaps(&a_rc, &b_rc, &r.0, &r.1));
        } else if let Some(r) = get_overlaps(&b_rc, &a_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
            seq = Some(fuse_overlaps(&b_rc, &a_rc, &r.0, &r.1));
        }
        if let Some(seq) = seq {
            let seq = String::from_utf8_lossy(&seq).to_string();
            if !self.seq_hash.contains(&seq) {
                self.add_node(seq.clone(), a_name);
                self.seq_hash.insert(seq);
            }
        } else {
            let seq_r1 = String::from_utf8_lossy(&a).to_string();
            if !self.seq_hash.contains(&seq_r1) {
                self.add_node(seq_r1.clone(), format!("{a_name}_R1"));
                self.seq_hash.insert(seq_r1);
            }
            let seq_r2 = String::from_utf8_lossy(&b_rc).to_string();
            if !self.seq_hash.contains(&seq_r2) {
                self.add_node(seq_r2.clone(), format!("{b_name}_R2"));
                self.seq_hash.insert(seq_r2);
            }
        }
    }
    pub fn draw_edges (&mut self) {
        let idx: Vec<NodeIndex> = self.graph.node_indices().collect();
        let n_nodes = idx.len();
        for x in 0..n_nodes {
            for y in 0..n_nodes {
                if y < x {
                    self.add_edge(&idx[x], &idx[y]);
                }
            }
        }
    }
    pub fn remove_single_nodes (&mut self) {
        let mut res: Vec<(NodeIndex, usize)> = Vec::new();
        self.names.iter().for_each(|(node_index, _)|{
            let mut neighbors = self.graph.neighbors(*node_index).detach();
            let mut node_neighbors: Vec<NodeIndex> = Vec::new();
            while let Some(neighbor_id) = neighbors.next_node(&self.graph) {
                node_neighbors.push(neighbor_id);
            }
            res.push((*node_index, node_neighbors.len()));
        });
        res.iter().for_each(|(node_id, n_neighbors)|{
            if *n_neighbors == 0 {
                self.graph.remove_node(*node_id);
                // println!("Removing {} which doesn't have any overlapping reads.", self.names.get(&node_id).unwrap());
                self.names.remove(node_id);
                self.sequences.remove(node_id);
            }
        });
    }
    pub fn find_leftmost_node (&mut self) {
        let mut stretch_id = 0;
        self.names.iter().for_each(|(node_index, _)|{
            let mut neighbors = self.graph.neighbors(node_index.clone()).detach();
            let mut left = false;
            let mut n_neigh = 0;
            while let Some(edge_index) = neighbors.next_edge(&self.graph) {
                n_neigh += 1;
                let ov = self.overlaps.get(&edge_index).unwrap();
                if ov.id_a == node_index.clone() && ov.range_a.start == 0 { left = true; break; }
                if ov.id_b == node_index.clone() && ov.range_b.start == 0 { left = true; break; }
            }
            if !left && n_neigh > 0 {
                let node_key = format!("{}_{stretch_id}", node_index.index());
                self.offsets.insert(node_key.clone(), (node_index.clone(), stretch_id, 0));
                stretch_id += 1;
            }
        });
        let mut visited: HashSet<String> = HashSet::new();
        loop {
            let mut new_visit = 0;
            let mut to_insert: Vec<(String, NodeIndex, usize, i32)> = Vec::new();
            for (node_key, (node_index, stretch_id, offset)) in self.offsets.iter() {
                if visited.contains(node_key) { continue; }
                new_visit += 1;
                let mut neighbors = self.graph.neighbors(*node_index).detach();
                while let Some(edge_index) = neighbors.next_edge(&self.graph) {
                    let overlaps = self.overlaps.get(&edge_index).unwrap();
                    let neighbor_id = if overlaps.id_a == *node_index { overlaps.id_b } else { overlaps.id_a };
                    let neighbor_key = format!("{}_{stretch_id}", neighbor_id.index());
                    if !self.offsets.contains_key(&neighbor_key) {
                        if overlaps.id_a == *node_index {
                            let new_offset = *offset + overlaps.range_a.start as i32 - overlaps.range_b.start as i32;
                            to_insert.push((neighbor_key, neighbor_id, *stretch_id, new_offset));
                        } else {
                            let new_offset = *offset + overlaps.range_b.start as i32 - overlaps.range_a.start as i32;
                            to_insert.push((neighbor_key, neighbor_id, *stretch_id, new_offset));
                        }
                    }
                }
                visited.insert(node_key.clone());
            }
            to_insert.iter().for_each(|(key, node_index, stretch_id, offset)| {
                self.offsets.insert(key.clone(), (*node_index, *stretch_id, *offset));
            });
            if new_visit == 0 { break; }
        }
    }
    pub fn process (&mut self) {
        self.draw_edges();
        self.remove_single_nodes();
        self.find_leftmost_node();
    }
    pub fn get_new_stretch_id (&self) -> usize {
        let mut v = self.offsets.iter()
            .map(|(_, (_, id, _))| *id).collect::<Vec<usize>>();
        v.dedup();
        v.sort_by(|a,b| b.cmp(a));
        if v.len() > 0 {
            v[0] + 1
        } else {
            0
        }
    }
    pub fn get_stretch_align (&self, stretch_id: usize) -> Vec<(NodeIndex, i32)> {
        let mut align: Vec<(NodeIndex, i32)> = Vec::new();
        self.offsets.iter().for_each(|(_, (node_index, node_stretch_id, offset))| {
            if *node_stretch_id == stretch_id {
                align.push((*node_index, *offset));
            }
        });
        align.sort_by(|a,b| a.1.cmp(&b.1));
        align
    }
    pub fn print_stretch (&self, stretch_id: usize) {
        let v = self.get_stretch_align(stretch_id);
        println!(">sequence_{}", stretch_id);
        if v.len() > 0 {
            let decal = -v[0].1;
            for (node_index, offset) in v {
                let seq = self.sequences.get(&node_index).unwrap();
                println!("{}{} - {:?}", " ".repeat((offset + decal) as usize).to_string(), seq, node_index);
            }
        }
        let cons = self.get_stretch_consensus(stretch_id);
        println!("{}", String::from_utf8_lossy(&cons) );
    }
    // Stack reads respecting their offsets
    // Return the consensus sequence
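    // For each column, the most frequent base is kept when it accounts for more
    // than half of the observed A/T/C/G calls, otherwise 'N' is emitted, e.g.
    // column [A, A, A, T] -> 'A' (3/4 > 0.5) but column [A, A, T, T] -> 'N' (2/4 = 0.5).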
    pub fn get_stretch_consensus (&self, stretch_id: usize) -> Vec<u8> {
        let v = self.get_stretch_align(stretch_id);
        let mut reads_stack: Vec<Vec<u8>> = Vec::with_capacity(v.len());
        let mut consensus_sequence: Vec<u8> = Vec::new();
        if v.len() > 0 {
            // first read position will be used as reference
            let first_gap = -v[0].1;
            let mut max_len = 0;
            // add each read to the stack with n offset empty space
            for (node_index, offset) in v {
                let seq = self.sequences.get(&node_index).unwrap();
                let seq = format!("{}{}", " ".repeat((offset + first_gap) as usize).to_string(), seq).as_bytes().to_vec();
                // keep track of the size of reads stack
                if max_len < seq.len() { max_len = seq.len(); }
                reads_stack.push(seq);
            }
            // for each column of the reads stack
            for i in 0..max_len {
                let mut acc: Vec<u8> = Vec::new();
                let atcg = vec![b'A', b'T', b'C', b'G'];
                let mut n_atcg = [0; 4];
                reads_stack.iter().for_each(|s|{
                    if s.len() > i {
                        acc.push(s[i].clone());
                        for nb in 0..4 {
                            if s[i] == atcg[nb] {
                                n_atcg[nb] += 1;
                            }
                        }
                    }
                });
                let n_base: usize = n_atcg.iter().sum();
                let max_base = n_atcg.iter().max().unwrap();
                let mut base = b'N';
                if *max_base as f32 / n_base as f32 > 0.5 {
                    n_atcg.iter().enumerate().for_each(|(pos, n)| if n == max_base { base = atcg[pos] });
                }
                consensus_sequence.push(base);
            }
        }
        consensus_sequence
    }
    pub fn get_fasta (&self) -> String {
        let mut fasta = String::new();
        for i in 0..self.get_new_stretch_id() {
            let cons = self.get_stretch_consensus(i);
            fasta.push_str(format!(">sequence_{}_length_{}\n", i + 1, cons.len()).as_str());
            let mut acc: Vec<u8> = Vec::new();
            cons.into_iter().for_each(|c| {
                acc.push(c);
                if acc.len() == 60 {
                    fasta.push_str(format!("{}\n", String::from_utf8_lossy(&acc)).as_str());
                    acc = Vec::new();
                }
            });
            if acc.len() > 0 {
                fasta.push_str(format!("{}\n", String::from_utf8_lossy(&acc)).as_str());
            }
        }
        fasta
    }
    pub fn make_neo_contig (&self, stretch_id: usize) -> NeoContig {
        let cons = String::from_utf8_lossy(&self.get_stretch_consensus(stretch_id)).to_string().trim().to_string();
        let stretch = self.get_stretch_align(stretch_id);
        let sequences: HashMap<NodeIndex, String> = stretch.iter().map(|(id,_)|{
            let (id, s) = self.sequences.get_key_value(id).unwrap();
            (*id, s.clone())
        }).collect();
        let sequences_names: HashMap<NodeIndex, String> = stretch.iter().map(|(id,_)|{
            let (id, s) = self.names.get_key_value(id).unwrap();
            (*id, s.clone())
        }).collect();
        NeoContig {
            sequence_id: stretch_id,
            name: format!("sequence_{stretch_id}"),
            contig: cons.clone(),
            alignments: Vec::new(),
            sequences,
            sequences_names,
            stretch,
        }
    }
}
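
// One parsed alignment line from the `bwa mem` output: the query and reference
// ranges covered by the aligned ('M') part, the reference slice it maps to, and
// the reference-side CIGAR expanded to one character per reference base.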
#[derive(Debug, Clone)]
pub struct Sam {
    pub query_name : String,
    pub query_range: Range<usize>,
    pub ref_name : String,
    pub ref_pos : i64,
    pub ref_range : Range<usize>,
    pub sequence : String,
    pub ref_sequence: String,
    pub ref_cigar : String,
}
impl Sam {
    pub fn new(query_name: String, flag: i32, ref_name: String, ref_pos: i64, cigar: String, sequence: String, fa: &Fasta) -> Self {
        let mut sequence = sequence;
        let (mut ref_range, mut query_range, ref_cigar) = matched_range(&cigar, &flag, &mut sequence);
        // SAM is 1-based
        // the range frames the matched part of the sequence
        // the last position is excluded in Rust ranges: [ref_range.start, ref_range.end)
        // in BAM the positions are inclusive: [start, end]
        if is_reverse(flag) {
            let len = ref_range.len();
            ref_range.end = (ref_pos as usize) - 1;
            ref_range.start = ref_pos as usize + len - 2;
            query_range.end = query_range.end + 1;
        } else {
            let len = ref_range.len();
            ref_range.start = ref_pos as usize;
            ref_range.end = (ref_pos as usize + len) - 1;
            query_range.end = query_range.end;
        }
        let ref_sequence = fa.get_sequence(&ref_name, ref_range.start.clone(), ref_range.end.clone());
        Sam {query_name, query_range, ref_name, ref_pos, ref_range, sequence, ref_sequence, ref_cigar}
    }
}
#[derive(Debug, Clone)]
pub struct NeoContig {
    pub sequence_id: usize,
    pub name: String,
    pub contig : String,
    pub alignments : Vec<Sam>,
    pub sequences: HashMap<NodeIndex, String>,
    pub sequences_names: HashMap<NodeIndex, String>,
    pub stretch : Vec<(NodeIndex, i32)>,
}
impl NeoContig {
    pub fn bwa_aln (&mut self, fasta_ref_path: &str, fa: &Fasta) {
        let sam_keys = vec!["QNAME", "FLAG", "RNAME",
            "POS", "MAPQ", "CIGAR", "RNEXT", "PNEXT", "TLEN", "SEQ",
            "QUAL", "ALN"];
        let stdout = cmd!("echo", "-e", format!("\">sequence_{}\n{}\"", self.name.to_string(), self.contig))
            .pipe(cmd!("bwa", "mem", fasta_ref_path, "-"))
            .stdout_capture()
            .stderr_capture()
            .read().unwrap();
        let mut alignments: Vec<Sam> = Vec::new();
        stdout.split("\n").for_each(|line|{
            // println!("{:?}", line);
            if line.len() > 8 {
                if &line[0..9] == "sequence_" {
                    let mut qname: Option<String> = None;
                    let mut flag: Option<i32> = None;
                    let mut rname: Option<String> = None;
                    let mut pos: Option<i64> = None;
                    let mut cigar: Option<String> = None;
                    let mut sequence: Option<String> = None;
                    for (i, value) in line.split("\t").enumerate() {
                        if sam_keys.len() <= i { break; }
                        match sam_keys[i] {
                            "QNAME" => qname = Some(value.to_string()),
                            "FLAG" => flag = value.parse::<i32>().ok(),
                            "RNAME" => rname = Some(value.to_string()),
                            "POS" => pos = value.parse::<i64>().ok(),
                            "CIGAR" => cigar = Some(value.to_string()),
                            "SEQ" => sequence = Some(value.to_string()),
                            _ => ()
                        }
                    }
                    if let (Some(qname), Some(flag), Some(rname), Some(pos), Some(cigar), Some(_)) = (qname, flag, rname, pos, cigar, sequence) {
                        alignments.push(Sam::new(qname, flag, rname, pos, cigar, self.contig.clone(), &fa));
                    }
                }
            }
        });
        alignments.sort_by(|a,b|{
            a.query_range.start.cmp(&b.query_range.start)
        });
        self.alignments = alignments;
    }
    pub fn make_aln (&self, print_sequences: Option<bool>) -> Vec<Vec<u8>> {
        println!("== {} ============", self.name);
        let contig_len = self.contig.len();
        let mut lines: Vec<Vec<u8>> = Vec::new();
        // ref 1
        if self.alignments.len() > 0 {
            let ref_spaces = " ".repeat(self.alignments[0].query_range.start).as_bytes().to_vec();
            // ref sequence
            let mut l = ref_spaces.clone();
            let ref_cigar = self.alignments[0].ref_cigar.as_bytes();
            let mut filtered_ref: Vec<u8> = self.alignments[0].ref_sequence
                .as_bytes()
                .iter().enumerate()
                .filter(|(i, _)| ref_cigar[*i] == b'M')
                .map(|(_, c)| *c)
                .collect();
            l.append(&mut filtered_ref);
            lines.push(l);
            // match pipes
            let mut match_pipes = self.alignments[0].ref_cigar
                .as_bytes().iter()
                .filter(|c| **c == b'M')
                .map(|_| b'|')
                .collect::<Vec<u8>>();
            let mut l = ref_spaces.clone();
            l.append(&mut match_pipes);
            let mut ll = format!("{} - {}:{}-{}{}",
                " ".repeat(contig_len - l.len()).to_string(),
                self.alignments[0].ref_name,
                self.alignments[0].ref_range.start,
                self.alignments[0].ref_range.end,
                if self.alignments[0].ref_range.start > self.alignments[0].ref_range.end { " (rc)" } else { "" }
            ).as_bytes().to_vec();
            l.append(&mut ll);
            lines.push(l);
        }
        // Contig
        let contig = format!("{} - de novo assembled contig (cov = {})",
            self.contig,
            self.sequences.len()
        ).as_bytes().to_vec();
        lines.push(contig.clone());
        // Sequences
        if let Some(show_seq) = print_sequences {
            if show_seq && self.stretch.len() > 0 {
                let decal = -self.stretch[0].1; // first has to be the leftmost
                for (node_index, offset) in self.stretch.iter() {
                    let seq = self.sequences.get(&node_index).unwrap();
                    let name = self.sequences_names.get(&node_index).unwrap();
                    let mut l = format!("{}{}", " ".repeat((offset + decal) as usize).to_string(), seq).as_bytes().to_vec();
                    let mut ll = format!("{} - {}", " ".repeat(contig_len - l.len()).to_string(), name).as_bytes().to_vec();
                    l.append(&mut ll);
                    lines.push(l);
                }
            }
            // Contig
            lines.push(contig);
        }
        // If more than one reference sequence
        if self.alignments.len() > 1 {
            let ref_spaces = " ".repeat(self.alignments[1].query_range.start).as_bytes().to_vec();
            // match pipes
            let mut match_pipes = self.alignments[1]
                .ref_cigar.as_bytes().iter()
                .filter(|c| **c == b'M')
                .map(|_| b'|')
                .collect::<Vec<u8>>();
            let mut l = ref_spaces.clone();
            l.append(&mut match_pipes);
            let mut ll = format!("{} - {}:{}-{}{}",
                " ".repeat(contig_len).to_string(),
                self.alignments[1].ref_name,
                self.alignments[1].ref_range.start,
                self.alignments[1].ref_range.end,
                if self.alignments[1].ref_range.start > self.alignments[1].ref_range.end { " (rc)" } else { "" }
            ).as_bytes().to_vec();
            l.append(&mut ll);
            lines.push(l);
            // ref sequence
            let mut l = ref_spaces.clone();
            let ref_cigar = self.alignments[1].ref_cigar.as_bytes();
            let mut filtered_ref: Vec<u8> = self.alignments[1].ref_sequence
                .as_bytes()
                .iter().enumerate()
                .filter(|(i, _)| ref_cigar[*i] == b'M')
                .map(|(_, c)| *c)
                .collect();
            l.append(&mut filtered_ref);
            lines.push(l);
        }
        lines
    }
    pub fn print_stretch(&self) {
        for line in self.make_aln(Some(false)) {
            println!("{}", String::from_utf8_lossy(&line));
        }
    }
    pub fn get_hgvs(&self, centromere_pos: Vec<(String, i64)>) -> Option<String> {
        fn get_hgvs_part(aln: Sam, centro_pos: i64) -> String {
            let mut part = "".to_string();
            if aln.ref_pos < centro_pos {
                part = format!(
                    "pter_{}{}",
                    aln.ref_pos,
                    if aln.ref_range.start < aln.ref_range.end { "" } else { "inv" },
                )
            } else {
                part = format!(
                    "{}_qter{}",
                    aln.ref_pos,
                    if aln.ref_range.start < aln.ref_range.end { "" } else { "inv" },
                )
            }
            part
        }
        // only if chimeric
        let mut res = None;
        if self.alignments.len() == 2 {
            let mut a_part = "".to_string();
            let mut b_part = "".to_string();
            centromere_pos.iter().for_each(|(contig, pos)| {
                if *contig == self.alignments[0].ref_name {
                    a_part = get_hgvs_part(self.alignments[0].clone(), *pos);
                }
                if *contig == self.alignments[1].ref_name {
                    b_part = get_hgvs_part(self.alignments[1].clone(), *pos);
                }
            });
            let mut added_nt = String::new();
            if self.alignments[0].query_range.end < self.alignments[1].query_range.start {
                added_nt = self.alignments[0].sequence[
                    self.alignments[0].query_range.end..(self.alignments[1].query_range.start)
                ].to_string();
                added_nt.push(';');
            }
            res = Some(format!(
                "{}:g.{}[{}{}:g.{}]",
                self.alignments[0].ref_name,
                a_part,
                added_nt,
                self.alignments[1].ref_name,
                b_part
            ));
        }
        res
    }
    pub fn get_fasta(&self) -> (String, String) {
        let fa_lines = self.contig.chars()
            .collect::<Vec<char>>()
            .chunks(60)
            .map(|c| c.iter().collect::<String>())
            .collect::<Vec<String>>()
            .join("\n");
        let mut names: Vec<String> = Vec::new();
        for s in &self.alignments {
            // contig:[pos-pos]
            names.push(format!("{}:{}-{}", s.ref_name,
                s.ref_range.start, s.ref_range.end - 1));
        }
        (names.join("_"), fa_lines)
    }
}
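
// Thin wrapper around `faimm::IndexedFasta` returning reference slices addressed
// with 1-based inclusive coordinates, reverse-complemented when start > end.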
pub struct Fasta {
    fa: IndexedFasta
}
impl Fasta {
    pub fn new (path: &str) -> Self {
        Fasta { fa: IndexedFasta::from_file(path).expect("Error opening fa") }
    }
    fn get_sequence (&self, contig: &str, start: usize, end: usize) -> String { // end is included
        let chr_index = self.fa.fai().tid(contig).expect("Cannot find chr in index");
        if (end as i64 - start as i64) < (0 as i64) {
            let fv = self.fa.view(chr_index, end, start + 1).expect("Cannot get .fa view");
            revcomp(&fv.to_string().to_uppercase())
        } else {
            let fv = self.fa.view(chr_index, start - 1, end).expect("Cannot get .fa view");
            fv.to_string().to_uppercase()
        }
    }
}
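
// SAM FLAG bit 0x10 marks an alignment on the reverse strand.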
fn is_reverse (flag: i32) -> bool {
    flag & 0x10 == 0x10
}
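
// Walks the CIGAR string and returns the reference and query ranges covered by
// 'M' operations, plus the CIGAR expanded to one character per reference base;
// leading and trailing 'N' bases of the query shorten the first and last operations.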
fn matched_range (cigar: &str, flag: &i32, matched_seq: &mut str) -> (Range<usize>, Range<usize>, String) {
    let mut n_head_to_rm = 0;
    for c in matched_seq.as_bytes().to_vec().iter() {
        if *c == b'N' { n_head_to_rm += 1 } else { break; }
    }
    let mut n_trail_to_rm = 0;
    for c in matched_seq.as_bytes().to_vec().iter().rev() {
        if *c == b'N' { n_trail_to_rm += 1 } else { break; }
    }
    let all_op = vec!["M", "I", "D", "N", "S", "=", "X", "H", "P"];
    let consumes_query = vec!["M", "I", "S", "=", "X", "H"];
    let consumes_ref = vec!["M", "D", "N", "=", "X"];
    let mut range_query:Range<usize> = Range::default();
    let mut range_ref:Range<usize> = Range::default();
    let mut num_acc = String::new();
    let mut query_pos = 0;
    let mut ref_pos = 0;
    let mut ref_cigar_string = "".to_string();
    let mut first_m = true;
    let n_op = cigar.split("").filter(|c| all_op.contains(c)).count();
    let mut curr_op = 1;
    for f in cigar.split("") {
        match f.parse::<usize>() {
            Ok(_) => num_acc.push_str(f), // if a number, add it to the accumulator
            Err(_) => { // if a letter
                if f != "" {
                    // parse the accumulator
                    let mut current_add = num_acc.parse::<usize>().unwrap();
                    num_acc = "".to_string();
                    if n_head_to_rm > 0 && curr_op == 1 {
                        current_add = current_add - n_head_to_rm;
                    }
                    if n_trail_to_rm > 0 && curr_op == n_op {
                        current_add = current_add - n_trail_to_rm;
                    }
                    curr_op += 1;
                    if f == "M" {
                        if first_m {
                            range_query.start = query_pos;
                            range_query.end = query_pos + current_add;
                            range_ref.start = ref_pos;
                            range_ref.end = ref_pos + current_add;
                            first_m = false;
                        } else {
                            range_query.end = query_pos + current_add;
                            range_ref.end = ref_pos + current_add;
                        }
                    }
                    if consumes_query.contains(&f) {
                        query_pos += current_add;
                    }
                    if consumes_ref.contains(&f) {
                        ref_pos += current_add;
                        let toadd = f.repeat(current_add);
                        ref_cigar_string.push_str(&toadd);
                    }
                }
            }
        }
    }
    if is_reverse(*flag) {
        let query_len = range_query.len();
        range_query.start = query_pos - range_query.end;
        range_query.end = range_query.start + query_len - 1;
    }
    assert_eq!(range_ref.len(), ref_cigar_string.len());
    (range_ref, range_query, ref_cigar_string)
}
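
// Reverse-complements a DNA string, e.g. revcomp("AACG") == "CGTT"; characters
// other than A/C/G/T/U (either case) become 'N' (see `switch_base`).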
pub fn revcomp(dna: &str) -> String {
    let mut rdna: String = String::with_capacity(dna.len());
    for c in dna.chars().rev() {
        rdna.push(switch_base(c));
    }
    rdna
}
fn switch_base(c: char) -> char {
    match c {
        'a' => 't',
        'c' => 'g',
        't' => 'a',
        'g' => 'c',
        'u' => 'a',
        'A' => 'T',
        'C' => 'G',
        'T' => 'A',
        'G' => 'C',
        'U' => 'A',
        _ => 'N'
    }
}
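
// Searches for the longest overlap in which a suffix of `b` matches a prefix of
// `a` (or one read is fully contained in the other), tolerating mismatches up to
// the `match_tol` limits; returns the matching ranges on `a` and `b`.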
pub fn get_overlaps (a: &Vec<u8>, b: &Vec<u8>, min_len: usize, max_consecutive: usize, max_diffs: usize) -> Option<(Range<usize>, Range<usize>)> {
    let (a_len, b_len) = (a.len(), b.len());
    let (smallest_len, biggest_len) = if a_len <= b_len { (a_len, b_len) } else { (b_len, a_len) };
    let mut res: Option<(Range<usize>, Range<usize>)> = None;
    for i in min_len..(smallest_len + (biggest_len - smallest_len)) {
        let range_a: Range<usize>;
        let range_b: Range<usize>;
        if i <= smallest_len {
            range_a = Range { start: 0, end: i };
            range_b = Range { start: b_len - i, end: b_len };
        } else {
            // check if inside
            if smallest_len == a_len {
                let after = i - smallest_len - 1;
                range_a = Range { start: 0, end: a_len };
                range_b = Range { start: after, end: a_len + after};
            } else {
                let after = i - smallest_len - 1;
                range_a = Range { start: after, end: b_len + after};
                range_b = Range { start: 0, end: b_len};
            }
        }
        if &a[range_a.clone()] == &b[range_b.clone()] {
            res = Some((range_a, range_b));
            continue;
        } else if match_tol(&a[range_a.clone()], &b[range_b.clone()], max_consecutive, max_diffs) {
            res = Some((range_a, range_b));
            continue;
        } else if res.is_some() {
            return res;
        }
    }
    res
}
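
// Fuses two overlapping reads into one sequence: the read whose overlap starts at
// position 0 is appended, minus the shared part, after the other read.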
pub fn fuse_overlaps (a: &Vec<u8>, b: &Vec<u8>, a_range: &Range<usize>, b_range: &Range<usize>) -> Vec<u8> {
    let mut res: Vec<u8> = Vec::new();
    if a_range.start == 0 {
        res = [b.clone(), a[a_range.end..a.len()].to_vec() ].concat().to_vec();
    } else if b_range.start == 0 {
        res = [a.clone(), b[b_range.end..b.len()].to_vec() ].concat().to_vec();
    }
    res
}
// Tolerant comparison: bounds the length of runs of consecutive mismatches
// (max_consecutive) and the total number of mismatches (max_diffs); the trailing
// bases of the compared slices must match.
fn match_tol (s1: &[u8], s2: &[u8], max_consecutive: usize, max_diffs: usize) -> bool {
    let max_consecutive = max_consecutive + 2;
    // let max_diffs: usize = 5;
    let mut diffs: Vec<usize> = Vec::new();
    for (a, b) in std::iter::zip(s1, s2) {
        diffs.push((a == b) as usize);
        let sum: usize = diffs.iter().rev().take(max_consecutive).sum();
        if sum == 0 && diffs.len() >= max_consecutive { return false }
    }
    let sum: usize = diffs.iter().rev().take(max_consecutive).sum();
    if sum == max_consecutive {
        diffs.len() - diffs.iter().sum::<usize>() <= max_diffs
    } else {
        false
    }
}
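
// A minimal test sketch for the standalone helpers above; it is illustrative only
// (the inputs and expected values are assumptions, not taken from an existing
// test suite of this crate).
#[cfg(test)]
mod sketch_tests {
    use super::*;

    #[test]
    fn revcomp_basics() {
        // "ACGT" is its own reverse complement, and revcomp is an involution on A/C/G/T.
        assert_eq!(revcomp("ACGT"), "ACGT");
        assert_eq!(revcomp(&revcomp("GATTACA")), "GATTACA");
    }

    #[test]
    fn overlap_then_fuse() {
        // The last 8 bases of `b` match the first 8 bases of `a`.
        let a = b"AAAATTTTGGGG".to_vec();
        let b = b"CCCCAAAATTTT".to_vec();
        // No tolerated mismatches: max_consecutive = 0, max_diffs = 0.
        let (ra, rb) = get_overlaps(&a, &b, 4, 0, 0).expect("overlap expected");
        assert_eq!(&a[ra.clone()], &b[rb.clone()]);
        // Fusing keeps `b` and appends the non-overlapping tail of `a`.
        let fused = fuse_overlaps(&a, &b, &ra, &rb);
        assert_eq!(fused, b"CCCCAAAATTTTGGGG".to_vec());
    }
}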