lib.rs 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. use std::{ops::Range, collections::{HashMap, HashSet}};
  2. use petgraph::{graph::{Graph, NodeIndex, EdgeIndex}, Undirected};
  3. use kseq::parse_path;
  4. use faimm::IndexedFasta;
  5. pub struct SequencesGraph {
  6. sequences: HashMap<NodeIndex, String>,
  7. seq_hash : HashSet<String>,
  8. names : HashMap<NodeIndex, String>,
  9. overlaps : HashMap<EdgeIndex, Overlap>,
  10. offsets : HashMap<String, (NodeIndex, usize, i32)>, // node_index, stretch_id, offset
  11. graph : Graph<String, usize, Undirected>,
  12. min_overlapping: usize,
  13. max_consecutive: usize,
  14. max_diffs : usize,
  15. }
  16. #[derive(Debug)]
  17. pub struct Overlap {
  18. id_a : NodeIndex,
  19. range_a: Range<usize>,
  20. id_b : NodeIndex,
  21. range_b: Range<usize>,
  22. }
  23. impl SequencesGraph {
  24. pub fn new (min_overlapping: usize, max_consecutive: usize, max_diffs: usize) -> SequencesGraph {
  25. SequencesGraph {
  26. sequences: HashMap::new(),
  27. seq_hash : HashSet::new(),
  28. names : HashMap::new(),
  29. overlaps : HashMap::new(),
  30. offsets : HashMap::new(),
  31. graph : Graph::new_undirected(),
  32. min_overlapping,
  33. max_consecutive,
  34. max_diffs
  35. }
  36. }
  37. pub fn from_fq_pe (&mut self, r1_path: &str, r2_path: &str, min_overlapping_len_pe: usize) {
  38. let mut r1p = parse_path(r1_path).unwrap();
  39. let mut r2p = parse_path(r2_path).unwrap();
  40. let mut n_pairs = 0;
  41. while let (Some(r1), Some(r2)) = (r1p.iter_record().unwrap(), r2p.iter_record().unwrap()) {
  42. let a = r1.seq().as_bytes().to_vec();
  43. let b = r2.seq().as_bytes().to_vec();
  44. self.add_from_pe(a, b, r1.head().to_string(), r2.head().to_string(), min_overlapping_len_pe);
  45. n_pairs += 1;
  46. }
  47. println!("{n_pairs}, pairs parsed.");
  48. self.draw_edges();
  49. println!("{}, overlaps fund.", self.overlaps.len());
  50. self.remove_single_nodes();
  51. println!("{}, nodes to assemble", self.names.len());
  52. self.find_leftmost_node();
  53. }
  54. pub fn add_from_pe (&mut self, a: Vec<u8>, b: Vec<u8>, a_name: String, b_name: String, min_overlapping_len_pe: usize) {
  55. let a_rc = revcomp(std::str::from_utf8(&a).unwrap()).as_bytes().to_vec();
  56. let b_rc = revcomp(std::str::from_utf8(&b).unwrap()).as_bytes().to_vec();
  57. let max_consecutive = self.max_consecutive;
  58. let max_diffs = self.max_diffs;
  59. let mut seq: Option<Vec<u8>> = None;
  60. if let Some(r) = get_overlaps(&a_rc, &b, min_overlapping_len_pe, max_consecutive, max_diffs) { // normaly always like that
  61. seq = Some(fuse_overlaps(&a_rc, &b, &r.0, &r.1));
  62. } else if let Some(r) = get_overlaps(&b, &a, min_overlapping_len_pe, max_consecutive, max_diffs) {
  63. seq = Some(fuse_overlaps(&b, &a, &r.0, &r.1));
  64. } else if let Some(r) = get_overlaps(&a, &b, min_overlapping_len_pe, max_consecutive, max_diffs) {
  65. seq = Some(fuse_overlaps(&b, &a, &r.0, &r.1));
  66. } else if let Some(r) = get_overlaps(&b, &a_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
  67. seq = Some(fuse_overlaps(&b, &a_rc, &r.0, &r.1));
  68. } else if let Some(r) = get_overlaps(&a, &b_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
  69. seq = Some(fuse_overlaps(&a, &b_rc, &r.0, &r.1));
  70. } else if let Some(r) = get_overlaps(&a_rc, &b_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
  71. seq = Some(fuse_overlaps(&a_rc, &b_rc, &r.0, &r.1));
  72. } else if let Some(r) = get_overlaps(&b_rc, &a_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
  73. seq = Some(fuse_overlaps(&b_rc, &a_rc, &r.0, &r.1));
  74. }
  75. if let Some(seq) = seq {
  76. let seq = String::from_utf8_lossy(&seq).to_string();
  77. if !self.seq_hash.contains(&seq) {
  78. self.add_node(seq.clone(), a_name);
  79. self.seq_hash.insert(seq);
  80. }
  81. } else {
  82. let seq_r1 = String::from_utf8_lossy(&a).to_string();
  83. if !self.seq_hash.contains(&seq_r1) {
  84. self.add_node(seq_r1.clone(), format!("{a_name}_R1"));
  85. self.seq_hash.insert(seq_r1);
  86. }
  87. let seq_r2 = String::from_utf8_lossy(&b_rc).to_string();
  88. if !self.seq_hash.contains(&seq_r2) {
  89. self.add_node(seq_r2.clone(), format!("{b_name}_R2"));
  90. self.seq_hash.insert(seq_r2);
  91. }
  92. }
  93. }
  94. pub fn add_node (&mut self, sequence: String, name: String) {
  95. let id = self.graph.add_node(sequence.clone());
  96. self.sequences.insert(id, sequence);
  97. self.names.insert(id, name);
  98. }
  99. pub fn add_edge (&mut self, id_a: &NodeIndex, id_b: &NodeIndex) {
  100. let a = self.sequences.get(id_a).unwrap().as_bytes().to_vec();
  101. let b = self.sequences.get(id_b).unwrap().as_bytes().to_vec();
  102. if let Some(r) = get_overlaps(&a, &b, self.min_overlapping, self.max_consecutive, self.max_diffs) { // normaly always like that
  103. self.overlaps.insert(
  104. self.graph.add_edge(id_a.clone(), id_b.clone(), r.0.len()),
  105. Overlap { id_a: *id_a, range_a: r.0, id_b: *id_b, range_b: r.1}
  106. );
  107. } else if let Some(r) = get_overlaps(&b, &a, self.min_overlapping, self.max_consecutive, self.max_diffs) {
  108. self.overlaps.insert(
  109. self.graph.add_edge(id_a.clone(), id_b.clone(), r.0.len()),
  110. Overlap { id_a: *id_a, range_a: r.1, id_b: *id_b, range_b: r.0}
  111. );
  112. }
  113. }
  114. pub fn draw_edges (&mut self) {
  115. let idx: Vec<NodeIndex> = self.graph.node_indices().collect();
  116. let n_nodes = idx.len();
  117. for x in 0..n_nodes {
  118. for y in 0..n_nodes {
  119. if y < x {
  120. self.add_edge(&idx[x], &idx[y]);
  121. }
  122. }
  123. }
  124. }
  125. pub fn remove_single_nodes (&mut self) {
  126. let mut res: Vec<(NodeIndex, usize)> = Vec::new();
  127. self.names.iter().for_each(|(node_index, _)|{
  128. let mut neighbors = self.graph.neighbors(*node_index).detach();
  129. let mut node_neighbors: Vec<NodeIndex> = Vec::new();
  130. while let Some(neighbor_id) = neighbors.next_node(&self.graph) {
  131. node_neighbors.push(neighbor_id);
  132. }
  133. res.push((*node_index, node_neighbors.len()));
  134. });
  135. res.iter().for_each(|(node_id, n_neighbors)|{
  136. if *n_neighbors == 0 {
  137. self.graph.remove_node(*node_id);
  138. println!("Removing {} which doesnt have any overlapping reads.", self.names.get(&node_id).unwrap());
  139. self.names.remove(node_id);
  140. self.sequences.remove(node_id);
  141. }
  142. });
  143. }
  144. pub fn find_leftmost_node (&mut self) {
  145. let mut stretch_id = 0;
  146. self.names.iter().for_each(|(node_index, _)|{
  147. let mut neighbors = self.graph.neighbors(node_index.clone()).detach();
  148. let mut left = false;
  149. let mut n_neigh = 0;
  150. while let Some(edge_index) = neighbors.next_edge(&self.graph) {
  151. n_neigh += 1;
  152. let ov = self.overlaps.get(&edge_index).unwrap();
  153. if ov.id_a == node_index.clone() && ov.range_a.start == 0 { left = true; break; }
  154. if ov.id_b == node_index.clone() && ov.range_b.start == 0 { left = true; break; }
  155. }
  156. if !left && n_neigh > 0 {
  157. let node_key = format!("{}_{stretch_id}", node_index.index());
  158. self.offsets.insert(node_key.clone(), (node_index.clone(), stretch_id, 0));
  159. stretch_id += 1;
  160. }
  161. });
  162. let mut visited: HashSet<String> = HashSet::new();
  163. loop {
  164. let mut new_visit = 0;
  165. let mut to_insert: Vec<(String, NodeIndex, usize, i32)> = Vec::new();
  166. for (node_key, (node_index, stretch_id, offset)) in self.offsets.iter() {
  167. if visited.contains(node_key) { continue; }
  168. new_visit += 1;
  169. let mut neighbors = self.graph.neighbors(*node_index).detach();
  170. while let Some(edge_index) = neighbors.next_edge(&self.graph) {
  171. let overlaps = self.overlaps.get(&edge_index).unwrap();
  172. let neighbor_id = if overlaps.id_a == *node_index { overlaps.id_b } else { overlaps.id_a };
  173. let neighbor_key = format!("{}_{stretch_id}", neighbor_id.index());
  174. if !self.offsets.contains_key(&neighbor_key) {
  175. if overlaps.id_a == *node_index {
  176. let new_offset = *offset + overlaps.range_a.start as i32 - overlaps.range_b.start as i32;
  177. to_insert.push((neighbor_key, neighbor_id, *stretch_id, new_offset));
  178. } else {
  179. let new_offset = *offset + overlaps.range_b.start as i32 - overlaps.range_a.start as i32;
  180. to_insert.push((neighbor_key, neighbor_id, *stretch_id, new_offset));
  181. }
  182. }
  183. }
  184. visited.insert(node_key.clone());
  185. }
  186. to_insert.iter().for_each(|(key, node_index, stretch_id, offset)| {
  187. self.offsets.insert(key.clone(), (*node_index, *stretch_id, *offset));
  188. });
  189. if new_visit == 0 { break; }
  190. }
  191. }
  192. pub fn get_new_stretch_id (&self) -> usize {
  193. let mut v = self.offsets.iter()
  194. .map(|(_, (_, id, _))| *id).collect::<Vec<usize>>();
  195. v.dedup();
  196. v.sort_by(|a,b| b.cmp(a));
  197. if v.len() > 0 {
  198. v[0] + 1
  199. } else {
  200. 0
  201. }
  202. }
  203. pub fn get_stretch_align (&self, stretch_id: usize) -> Vec<(NodeIndex, i32)> {
  204. let mut align: Vec<(NodeIndex, i32)> = Vec::new();
  205. self.offsets.iter().for_each(|(_, (node_index, node_stretch_id, offset))| {
  206. if *node_stretch_id == stretch_id {
  207. align.push((*node_index, *offset));
  208. }
  209. });
  210. align.sort_by(|a,b| a.1.cmp(&b.1));
  211. align
  212. }
  213. pub fn print_stretch (&self, stretch_id: usize) {
  214. let v = self.get_stretch_align(stretch_id);
  215. println!(">sequence_{}", stretch_id);
  216. if v.len() > 0 {
  217. let decal = -v[0].1;
  218. for (node_index, offset) in v {
  219. let seq = self.sequences.get(&node_index).unwrap();
  220. println!("{}{} - {:?}", " ".repeat((offset + decal) as usize).to_string(), seq, node_index);
  221. }
  222. }
  223. let cons = self.get_stretch_consensus(stretch_id);
  224. println!("{}", String::from_utf8_lossy(&cons) );
  225. }
  226. pub fn get_stretch_consensus (&self, stretch_id: usize) -> Vec<u8> {
  227. let v = self.get_stretch_align(stretch_id);
  228. let mut ov: Vec<Vec<u8>> = Vec::with_capacity(v.len());
  229. let mut res: Vec<u8> = Vec::new();
  230. if v.len() > 0 {
  231. let decal = -v[0].1;
  232. let mut max_len = 0;
  233. for (node_index, offset) in v {
  234. let seq = self.sequences.get(&node_index).unwrap();
  235. let seq = format!("{}{}", " ".repeat((offset + decal) as usize).to_string(), seq).as_bytes().to_vec();
  236. if max_len < seq.len() { max_len = seq.len(); }
  237. ov.push(seq);
  238. }
  239. for i in 0..max_len {
  240. let mut acc: Vec<u8> = Vec::new();
  241. let atcg = vec![b"A"[0], b"T"[0], b"C"[0], b"G"[0]];
  242. let mut n_atcg = [0; 4];
  243. ov.iter().for_each(|s|{
  244. if s.len() > i {
  245. acc.push(s[i].clone());
  246. for nb in 0..4 {
  247. if s[i] == atcg[nb] {
  248. n_atcg[nb] += 1;
  249. }
  250. }
  251. }
  252. });
  253. let n_base: usize = n_atcg.iter().sum();
  254. let max_base = n_atcg.iter().max().unwrap();
  255. let mut base = b"N"[0];
  256. if *max_base as f32 / n_base as f32 > 0.5 {
  257. n_atcg.iter().enumerate().for_each(|(pos, n)| if n == max_base { base = atcg[pos] });
  258. }
  259. res.push(base);
  260. }
  261. }
  262. res
  263. }
  264. pub fn get_fasta (&self) -> String {
  265. let mut fasta = String::new();
  266. for i in 0..self.get_new_stretch_id() {
  267. let cons = self.get_stretch_consensus(i);
  268. fasta.push_str(format!(">sequence_{}_length_{}\n", i + 1, cons.len()).as_str());
  269. let mut acc: Vec<u8> = Vec::new();
  270. cons.into_iter().for_each(|c| {
  271. acc.push(c);
  272. if acc.len() == 60 {
  273. fasta.push_str(format!("{}\n", String::from_utf8_lossy(&acc)).as_str());
  274. acc = Vec::new();
  275. }
  276. });
  277. if acc.len() > 0 {
  278. fasta.push_str(format!("{}\n", String::from_utf8_lossy(&acc)).as_str());
  279. }
  280. }
  281. fasta
  282. }
  283. pub fn make_neo_contig (&self, stretch_id: usize) -> NeoContig {
  284. let cons = String::from_utf8_lossy(&self.get_stretch_consensus(stretch_id)).to_string();
  285. let stretch = self.get_stretch_align(stretch_id);
  286. let sequences: HashMap<NodeIndex, String> = stretch.iter().map(|(id,_)|{
  287. let (id, s) = self.sequences.get_key_value(id).unwrap();
  288. (*id, s.clone())
  289. }).collect();
  290. let sequences_names: HashMap<NodeIndex, String> = stretch.iter().map(|(id,_)|{
  291. let (id, s) = self.names.get_key_value(id).unwrap();
  292. (*id, s.clone())
  293. }).collect();
  294. NeoContig {
  295. sequence_id: stretch_id,
  296. name: format!("sequence_{stretch_id}"),
  297. contig: cons.clone(),
  298. alignments: Vec::new(),
  299. sequences, sequences_names,
  300. stretch,
  301. }
  302. }
  303. }
  304. #[derive(Debug, Clone)]
  305. pub struct Sam {
  306. query_name : String,
  307. query_range: Range<usize>,
  308. ref_name : String,
  309. ref_pos : i64,
  310. ref_range : Range<usize>,
  311. sequence : String,
  312. ref_sequence: String,
  313. ref_cigar : String,
  314. }
  315. impl Sam {
  316. pub fn new(query_name: String, flag: i32, ref_name: String, ref_pos: i64, cigar: String, sequence: String, fa: &Fasta) -> Self {
  317. let mut sequence = sequence;
  318. let (mut ref_range, mut query_range, ref_cigar) = matched_range(&cigar, &flag, &mut sequence);
  319. if is_reverse(flag) {
  320. let len = ref_range.len();
  321. ref_range.end = ref_pos as usize;
  322. ref_range.start = ref_pos as usize + len ;
  323. } else {
  324. let len = ref_range.len();
  325. ref_range.start = ref_pos as usize;
  326. ref_range.end = ref_pos as usize + len;
  327. }
  328. let ref_sequence = fa.get_sequence(&ref_name, ref_range.start.clone(), ref_range.end.clone());
  329. Sam {query_name, query_range, ref_name, ref_pos, ref_range, sequence, ref_sequence, ref_cigar}
  330. }
  331. }
  332. #[derive(Debug, Clone)]
  333. pub struct NeoContig {
  334. sequence_id: usize,
  335. name: String,
  336. contig : String,
  337. alignments : Vec<Sam>,
  338. sequences: HashMap<NodeIndex, String>,
  339. sequences_names: HashMap<NodeIndex, String>,
  340. stretch : Vec<(NodeIndex, i32)>,
  341. }
  342. pub struct Fasta {
  343. fa: IndexedFasta
  344. }
  345. impl Fasta {
  346. pub fn new (path: &str) -> Self {
  347. Fasta { fa: IndexedFasta::from_file(path).expect("Error opening fa") }
  348. }
  349. fn get_sequence (&self, contig: &str, start: usize, end: usize) -> String { // end is included
  350. let start = start - 1;
  351. let end = end - 1;
  352. let chr_index = self.fa.fai().tid(contig).expect("Cannot find chr in index");
  353. if (end as i64 - start as i64) < (0 as i64) {
  354. let fv = self.fa.view(chr_index, end, start).expect("Cannot get .fa view");
  355. revcomp(&fv.to_string().to_uppercase())
  356. } else {
  357. let fv = self.fa.view(chr_index, start, end).expect("Cannot get .fa view");
  358. fv.to_string().to_uppercase()
  359. }
  360. }
  361. }
  362. fn is_reverse (flag: i32) -> bool {
  363. flag & 0x10 == 0x10
  364. }
  365. fn matched_range (cigar: &str, flag: &i32, matched_seq: &mut str) -> (Range<usize>, Range<usize>, String) {
  366. let mut n_head_to_rm = 0;
  367. for c in matched_seq.as_bytes().to_vec().iter() {
  368. if *c == b"N"[0] { n_head_to_rm += 1 } else { break; }
  369. }
  370. let mut n_trail_to_rm = 0;
  371. for c in matched_seq.as_bytes().to_vec().iter().rev() {
  372. if *c == b"N"[0] { n_trail_to_rm += 1 } else { break; }
  373. }
  374. let all_op = vec!["M", "I", "D", "N", "S", "=", "X", "H", "P"];
  375. let consumes_query = vec!["M", "I", "S", "=", "X", "H"];
  376. let consumes_ref = vec!["M", "D", "N", "=", "X"];
  377. let mut range_query:Range<usize> = Range::default();
  378. let mut range_ref:Range<usize> = Range::default();
  379. let mut num_acc = String::new();
  380. let mut query_pos = 0;
  381. let mut ref_pos = 0;
  382. let mut ref_cigar_string = "".to_string();
  383. let mut pos = 0;
  384. let mut first_m = true;
  385. let n_op = cigar.split("").filter(|c| all_op.contains(c)).count();
  386. let mut curr_op = 1;
  387. for f in cigar.split("") {
  388. match f.parse::<usize>() {
  389. Ok(_) => num_acc.push_str(f), // if a number added to the accumulator
  390. Err(_) => { // if a letter
  391. if f != "" {
  392. //parse the accumulator
  393. let mut current_add = num_acc.parse::<usize>().unwrap();
  394. num_acc = "".to_string();
  395. if n_head_to_rm > 0 && curr_op == 1 {
  396. current_add = current_add - n_head_to_rm;
  397. }
  398. if n_trail_to_rm > 0 && curr_op == n_op {
  399. current_add = current_add - n_trail_to_rm;
  400. }
  401. curr_op += 1;
  402. if f == "M" {
  403. if first_m {
  404. range_query.start = query_pos;
  405. range_query.end = query_pos + current_add;
  406. range_ref.start = ref_pos;
  407. range_ref.end = ref_pos + current_add;
  408. first_m = false;
  409. } else {
  410. range_query.end = query_pos + current_add;
  411. range_ref.end = ref_pos + current_add;
  412. }
  413. }
  414. if consumes_query.contains(&f) {
  415. query_pos += current_add;
  416. }
  417. if consumes_ref.contains(&f) {
  418. ref_pos += current_add;
  419. let toadd = f.repeat(current_add);
  420. ref_cigar_string.push_str(&toadd);
  421. }
  422. pos += current_add;
  423. }
  424. },
  425. }
  426. }
  427. if is_reverse(*flag) {
  428. let query_len = range_query.len();
  429. range_query.start = query_pos - range_query.end;
  430. range_query.end = range_query.start + query_len;
  431. }
  432. // println!("{}", matched_seq);
  433. // println!("{:?}", range_ref);
  434. // println!("{:?}", range_query);
  435. // println!("{:?}", ref_cigar_string.len());
  436. // println!("{:?}", range_ref.len());
  437. // println!("{}", ref_cigar_string);
  438. assert_eq!(range_ref.len(), ref_cigar_string.len());
  439. (range_ref, range_query, ref_cigar_string)
  440. }
  441. fn revcomp(dna: &str) -> String{
  442. let mut rdna: String = String::with_capacity(dna.len());
  443. for c in dna.chars().rev() {
  444. rdna.push(switch_base(c));
  445. }
  446. rdna
  447. }
  448. fn switch_base(c:char) -> char {
  449. match c {
  450. 'a' => 't' ,
  451. 'c' => 'g' ,
  452. 't' => 'a' ,
  453. 'g' => 'c' ,
  454. 'u' => 'a',
  455. 'A' => 'T' ,
  456. 'C' => 'G' ,
  457. 'T' => 'A' ,
  458. 'G' => 'C',
  459. 'U' => 'A',
  460. _ => 'N'
  461. }
  462. }
  463. fn get_overlaps (a: &Vec<u8>, b: &Vec<u8>, min_len: usize, max_consecutive: usize, max_diffs: usize) -> Option<(Range<usize>, Range<usize>)> {
  464. let (a_len, b_len) = (a.len(), b.len());
  465. let (smallest_len, biggest_len) = if a_len <= b_len { (a_len, b_len) } else { (b_len, a_len) };
  466. let mut res: Option<(Range<usize>, Range<usize>)> = None;
  467. for i in min_len..(smallest_len + (biggest_len - smallest_len)) {
  468. let range_a: Range<usize>;
  469. let range_b: Range<usize>;
  470. if i <= smallest_len {
  471. range_a = Range { start: 0, end: i };
  472. range_b = Range { start: b_len - i, end: b_len };
  473. } else {
  474. // check if inside
  475. if smallest_len == a_len {
  476. let after = i - smallest_len - 1;
  477. range_a = Range { start: 0, end: a_len };
  478. range_b = Range { start: after, end: a_len + after};
  479. } else {
  480. let after = i - smallest_len - 1;
  481. range_a = Range { start: after, end: b_len + after};
  482. range_b = Range { start: 0, end: b_len};
  483. }
  484. }
  485. if &a[range_a.clone()] == &b[range_b.clone()] {
  486. res = Some((range_a, range_b));
  487. continue;
  488. } else if match_tol(&a[range_a.clone()], &b[range_b.clone()], max_consecutive, max_diffs) {
  489. res = Some((range_a, range_b));
  490. continue;
  491. } else if res.is_some() {
  492. return res;
  493. }
  494. }
  495. res
  496. }
  497. fn fuse_overlaps (a: &Vec<u8>, b: &Vec<u8>, a_range: &Range<usize>, b_range: &Range<usize>) -> Vec<u8> {
  498. let mut res: Vec<u8> = Vec::new();
  499. if a_range.start == 0 {
  500. res = [b.clone(), a[a_range.end..a.len()].to_vec() ].concat().to_vec();
  501. } else if b_range.start == 0 {
  502. res = [a.clone(), b[b_range.end..b.len()].to_vec() ].concat().to_vec();
  503. }
  504. res
  505. }
  506. // max one consecutive diff and 5 diffs
  507. fn match_tol (s1: &[u8], s2: &[u8], max_consecutive: usize, max_diffs: usize) -> bool {
  508. let max_consecutive= max_consecutive + 2;
  509. // let max_diffs: usize = 5;
  510. let mut diffs: Vec<usize> = Vec::new();
  511. for (a, b) in std::iter::zip(s1, s2) {
  512. diffs.push((a == b) as usize);
  513. let sum: usize = diffs.iter().rev().take(max_consecutive).sum();
  514. if sum == 0 && diffs.len() >= max_consecutive { return false }
  515. }
  516. let sum: usize = diffs.iter().rev().take(max_consecutive).sum();
  517. if sum == max_consecutive {
  518. diffs.len() - diffs.iter().sum::<usize>() <= max_diffs
  519. } else {
  520. false
  521. }
  522. }
  523. #[cfg(test)]
  524. mod tests {
  525. use super::*;
  526. #[test]
  527. fn it_works() {
  528. // let result = add(2, 2);
  529. // assert_eq!(result, 4);
  530. }
  531. }