|
@@ -0,0 +1,607 @@
|
|
|
|
|
+use std::{time::Instant, ops::Range, collections::{HashMap, HashSet}};
|
|
|
|
|
+use petgraph::{graph::{Graph, NodeIndex, EdgeIndex}, Undirected};
|
|
|
|
|
+use kseq::parse_path;
|
|
|
|
|
+use faimm::IndexedFasta;
|
|
|
|
|
+
|
|
|
|
|
+struct SequencesGraph {
|
|
|
|
|
+ sequences: HashMap<NodeIndex, String>,
|
|
|
|
|
+ seq_hash : HashSet<String>,
|
|
|
|
|
+ names : HashMap<NodeIndex, String>,
|
|
|
|
|
+ overlaps : HashMap<EdgeIndex, Overlap>,
|
|
|
|
|
+ offsets : HashMap<String, (NodeIndex, usize, i32)>, // node_index, stretch_id, offset
|
|
|
|
|
+ graph : Graph<String, usize, Undirected>,
|
|
|
|
|
+ min_overlapping: usize,
|
|
|
|
|
+ max_consecutive: usize,
|
|
|
|
|
+ max_diffs : usize,
|
|
|
|
|
+
|
|
|
|
|
+}
|
|
|
|
|
+#[derive(Debug)]
|
|
|
|
|
+struct Overlap {
|
|
|
|
|
+ id_a : NodeIndex,
|
|
|
|
|
+ range_a: Range<usize>,
|
|
|
|
|
+ id_b : NodeIndex,
|
|
|
|
|
+ range_b: Range<usize>,
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+impl SequencesGraph {
|
|
|
|
|
+ pub fn new (min_overlapping: usize, max_consecutive: usize, max_diffs: usize) -> SequencesGraph {
|
|
|
|
|
+ SequencesGraph {
|
|
|
|
|
+ sequences: HashMap::new(),
|
|
|
|
|
+ seq_hash : HashSet::new(),
|
|
|
|
|
+ names : HashMap::new(),
|
|
|
|
|
+ overlaps : HashMap::new(),
|
|
|
|
|
+ offsets : HashMap::new(),
|
|
|
|
|
+ graph : Graph::new_undirected(),
|
|
|
|
|
+ min_overlapping,
|
|
|
|
|
+ max_consecutive,
|
|
|
|
|
+ max_diffs
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn from_fq_pe (&mut self, r1_path: &str, r2_path: &str, min_overlapping_len_pe: usize) {
|
|
|
|
|
+ let mut r1p = parse_path(r1_path).unwrap();
|
|
|
|
|
+ let mut r2p = parse_path(r2_path).unwrap();
|
|
|
|
|
+
|
|
|
|
|
+ let mut n_pairs = 0;
|
|
|
|
|
+ while let (Some(r1), Some(r2)) = (r1p.iter_record().unwrap(), r2p.iter_record().unwrap()) {
|
|
|
|
|
+ let a = r1.seq().as_bytes().to_vec();
|
|
|
|
|
+ let b = r2.seq().as_bytes().to_vec();
|
|
|
|
|
+ self.add_from_pe(a, b, r1.head().to_string(), r2.head().to_string(), min_overlapping_len_pe);
|
|
|
|
|
+ n_pairs += 1;
|
|
|
|
|
+ }
|
|
|
|
|
+ println!("{n_pairs}, pairs parsed.");
|
|
|
|
|
+ self.draw_edges();
|
|
|
|
|
+ println!("{}, overlaps fund.", self.overlaps.len());
|
|
|
|
|
+ self.remove_single_nodes();
|
|
|
|
|
+ println!("{}, nodes to assemble", self.names.len());
|
|
|
|
|
+ self.find_leftmost_node();
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn add_from_pe (&mut self, a: Vec<u8>, b: Vec<u8>, a_name: String, b_name: String, min_overlapping_len_pe: usize) {
|
|
|
|
|
+ let a_rc = revcomp(std::str::from_utf8(&a).unwrap()).as_bytes().to_vec();
|
|
|
|
|
+ let b_rc = revcomp(std::str::from_utf8(&b).unwrap()).as_bytes().to_vec();
|
|
|
|
|
+
|
|
|
|
|
+ let max_consecutive = self.max_consecutive;
|
|
|
|
|
+ let max_diffs = self.max_diffs;
|
|
|
|
|
+
|
|
|
|
|
+ let mut seq: Option<Vec<u8>> = None;
|
|
|
|
|
+ if let Some(r) = get_overlaps(&a_rc, &b, min_overlapping_len_pe, max_consecutive, max_diffs) { // normaly always like that
|
|
|
|
|
+ seq = Some(fuse_overlaps(&a_rc, &b, &r.0, &r.1));
|
|
|
|
|
+ } else if let Some(r) = get_overlaps(&b, &a, min_overlapping_len_pe, max_consecutive, max_diffs) {
|
|
|
|
|
+ seq = Some(fuse_overlaps(&b, &a, &r.0, &r.1));
|
|
|
|
|
+ } else if let Some(r) = get_overlaps(&a, &b, min_overlapping_len_pe, max_consecutive, max_diffs) {
|
|
|
|
|
+ seq = Some(fuse_overlaps(&b, &a, &r.0, &r.1));
|
|
|
|
|
+ } else if let Some(r) = get_overlaps(&b, &a_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
|
|
|
|
|
+ seq = Some(fuse_overlaps(&b, &a_rc, &r.0, &r.1));
|
|
|
|
|
+ } else if let Some(r) = get_overlaps(&a, &b_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
|
|
|
|
|
+ seq = Some(fuse_overlaps(&a, &b_rc, &r.0, &r.1));
|
|
|
|
|
+ } else if let Some(r) = get_overlaps(&a_rc, &b_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
|
|
|
|
|
+ seq = Some(fuse_overlaps(&a_rc, &b_rc, &r.0, &r.1));
|
|
|
|
|
+ } else if let Some(r) = get_overlaps(&b_rc, &a_rc, min_overlapping_len_pe, max_consecutive, max_diffs) {
|
|
|
|
|
+ seq = Some(fuse_overlaps(&b_rc, &a_rc, &r.0, &r.1));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if let Some(seq) = seq {
|
|
|
|
|
+ let seq = String::from_utf8_lossy(&seq).to_string();
|
|
|
|
|
+ if !self.seq_hash.contains(&seq) {
|
|
|
|
|
+ self.add_node(seq.clone(), a_name);
|
|
|
|
|
+ self.seq_hash.insert(seq);
|
|
|
|
|
+ }
|
|
|
|
|
+ } else {
|
|
|
|
|
+ let seq_r1 = String::from_utf8_lossy(&a).to_string();
|
|
|
|
|
+ if !self.seq_hash.contains(&seq_r1) {
|
|
|
|
|
+ self.add_node(seq_r1.clone(), format!("{a_name}_R1"));
|
|
|
|
|
+ self.seq_hash.insert(seq_r1);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let seq_r2 = String::from_utf8_lossy(&b_rc).to_string();
|
|
|
|
|
+ if !self.seq_hash.contains(&seq_r2) {
|
|
|
|
|
+ self.add_node(seq_r2.clone(), format!("{b_name}_R2"));
|
|
|
|
|
+ self.seq_hash.insert(seq_r2);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn add_node (&mut self, sequence: String, name: String) {
|
|
|
|
|
+ let id = self.graph.add_node(sequence.clone());
|
|
|
|
|
+ self.sequences.insert(id, sequence);
|
|
|
|
|
+ self.names.insert(id, name);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn add_edge (&mut self, id_a: &NodeIndex, id_b: &NodeIndex) {
|
|
|
|
|
+ let a = self.sequences.get(id_a).unwrap().as_bytes().to_vec();
|
|
|
|
|
+ let b = self.sequences.get(id_b).unwrap().as_bytes().to_vec();
|
|
|
|
|
+
|
|
|
|
|
+ if let Some(r) = get_overlaps(&a, &b, self.min_overlapping, self.max_consecutive, self.max_diffs) { // normaly always like that
|
|
|
|
|
+ self.overlaps.insert(
|
|
|
|
|
+ self.graph.add_edge(id_a.clone(), id_b.clone(), r.0.len()),
|
|
|
|
|
+ Overlap { id_a: *id_a, range_a: r.0, id_b: *id_b, range_b: r.1}
|
|
|
|
|
+ );
|
|
|
|
|
+ } else if let Some(r) = get_overlaps(&b, &a, self.min_overlapping, self.max_consecutive, self.max_diffs) {
|
|
|
|
|
+ self.overlaps.insert(
|
|
|
|
|
+ self.graph.add_edge(id_a.clone(), id_b.clone(), r.0.len()),
|
|
|
|
|
+ Overlap { id_a: *id_a, range_a: r.1, id_b: *id_b, range_b: r.0}
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn draw_edges (&mut self) {
|
|
|
|
|
+ let idx: Vec<NodeIndex> = self.graph.node_indices().collect();
|
|
|
|
|
+ let n_nodes = idx.len();
|
|
|
|
|
+ for x in 0..n_nodes {
|
|
|
|
|
+ for y in 0..n_nodes {
|
|
|
|
|
+ if y < x {
|
|
|
|
|
+ self.add_edge(&idx[x], &idx[y]);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn remove_single_nodes (&mut self) {
|
|
|
|
|
+ let mut res: Vec<(NodeIndex, usize)> = Vec::new();
|
|
|
|
|
+ self.names.iter().for_each(|(node_index, _)|{
|
|
|
|
|
+ let mut neighbors = self.graph.neighbors(*node_index).detach();
|
|
|
|
|
+ let mut node_neighbors: Vec<NodeIndex> = Vec::new();
|
|
|
|
|
+ while let Some(neighbor_id) = neighbors.next_node(&self.graph) {
|
|
|
|
|
+ node_neighbors.push(neighbor_id);
|
|
|
|
|
+ }
|
|
|
|
|
+ res.push((*node_index, node_neighbors.len()));
|
|
|
|
|
+ });
|
|
|
|
|
+
|
|
|
|
|
+ res.iter().for_each(|(node_id, n_neighbors)|{
|
|
|
|
|
+ if *n_neighbors == 0 {
|
|
|
|
|
+ self.graph.remove_node(*node_id);
|
|
|
|
|
+ println!("Removing {} which doesnt have any overlapping reads.", self.names.get(&node_id).unwrap());
|
|
|
|
|
+ self.names.remove(node_id);
|
|
|
|
|
+ self.sequences.remove(node_id);
|
|
|
|
|
+ }
|
|
|
|
|
+ });
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn find_leftmost_node (&mut self) {
|
|
|
|
|
+ let mut stretch_id = 0;
|
|
|
|
|
+ self.names.iter().for_each(|(node_index, _)|{
|
|
|
|
|
+ let mut neighbors = self.graph.neighbors(node_index.clone()).detach();
|
|
|
|
|
+
|
|
|
|
|
+ let mut left = false;
|
|
|
|
|
+ let mut n_neigh = 0;
|
|
|
|
|
+ while let Some(edge_index) = neighbors.next_edge(&self.graph) {
|
|
|
|
|
+ n_neigh += 1;
|
|
|
|
|
+ let ov = self.overlaps.get(&edge_index).unwrap();
|
|
|
|
|
+ if ov.id_a == node_index.clone() && ov.range_a.start == 0 { left = true; break; }
|
|
|
|
|
+ if ov.id_b == node_index.clone() && ov.range_b.start == 0 { left = true; break; }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if !left && n_neigh > 0 {
|
|
|
|
|
+ let node_key = format!("{}_{stretch_id}", node_index.index());
|
|
|
|
|
+ self.offsets.insert(node_key.clone(), (node_index.clone(), stretch_id, 0));
|
|
|
|
|
+ stretch_id += 1;
|
|
|
|
|
+ }
|
|
|
|
|
+ });
|
|
|
|
|
+
|
|
|
|
|
+ let mut visited: HashSet<String> = HashSet::new();
|
|
|
|
|
+ loop {
|
|
|
|
|
+ let mut new_visit = 0;
|
|
|
|
|
+ let mut to_insert: Vec<(String, NodeIndex, usize, i32)> = Vec::new();
|
|
|
|
|
+
|
|
|
|
|
+ for (node_key, (node_index, stretch_id, offset)) in self.offsets.iter() {
|
|
|
|
|
+ if visited.contains(node_key) { continue; }
|
|
|
|
|
+
|
|
|
|
|
+ new_visit += 1;
|
|
|
|
|
+
|
|
|
|
|
+ let mut neighbors = self.graph.neighbors(*node_index).detach();
|
|
|
|
|
+ while let Some(edge_index) = neighbors.next_edge(&self.graph) {
|
|
|
|
|
+ let overlaps = self.overlaps.get(&edge_index).unwrap();
|
|
|
|
|
+
|
|
|
|
|
+ let neighbor_id = if overlaps.id_a == *node_index { overlaps.id_b } else { overlaps.id_a };
|
|
|
|
|
+ let neighbor_key = format!("{}_{stretch_id}", neighbor_id.index());
|
|
|
|
|
+
|
|
|
|
|
+ if !self.offsets.contains_key(&neighbor_key) {
|
|
|
|
|
+ if overlaps.id_a == *node_index {
|
|
|
|
|
+ let new_offset = *offset + overlaps.range_a.start as i32 - overlaps.range_b.start as i32;
|
|
|
|
|
+ to_insert.push((neighbor_key, neighbor_id, *stretch_id, new_offset));
|
|
|
|
|
+ } else {
|
|
|
|
|
+ let new_offset = *offset + overlaps.range_b.start as i32 - overlaps.range_a.start as i32;
|
|
|
|
|
+ to_insert.push((neighbor_key, neighbor_id, *stretch_id, new_offset));
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ visited.insert(node_key.clone());
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ to_insert.iter().for_each(|(key, node_index, stretch_id, offset)| {
|
|
|
|
|
+ self.offsets.insert(key.clone(), (*node_index, *stretch_id, *offset));
|
|
|
|
|
+ });
|
|
|
|
|
+
|
|
|
|
|
+ if new_visit == 0 { break; }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn get_new_stretch_id (&self) -> usize {
|
|
|
|
|
+ let mut v = self.offsets.iter()
|
|
|
|
|
+ .map(|(_, (_, id, _))| *id).collect::<Vec<usize>>();
|
|
|
|
|
+ v.dedup();
|
|
|
|
|
+ v.sort_by(|a,b| b.cmp(a));
|
|
|
|
|
+ if v.len() > 0 {
|
|
|
|
|
+ v[0] + 1
|
|
|
|
|
+ } else {
|
|
|
|
|
+ 0
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn get_stretch_align (&self, stretch_id: usize) -> Vec<(NodeIndex, i32)> {
|
|
|
|
|
+ let mut align: Vec<(NodeIndex, i32)> = Vec::new();
|
|
|
|
|
+ self.offsets.iter().for_each(|(_, (node_index, node_stretch_id, offset))| {
|
|
|
|
|
+ if *node_stretch_id == stretch_id {
|
|
|
|
|
+ align.push((*node_index, *offset));
|
|
|
|
|
+ }
|
|
|
|
|
+ });
|
|
|
|
|
+ align.sort_by(|a,b| a.1.cmp(&b.1));
|
|
|
|
|
+ align
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn print_stretch (&self, stretch_id: usize) {
|
|
|
|
|
+ let v = self.get_stretch_align(stretch_id);
|
|
|
|
|
+ println!(">sequence_{}", stretch_id);
|
|
|
|
|
+ if v.len() > 0 {
|
|
|
|
|
+ let decal = -v[0].1;
|
|
|
|
|
+ for (node_index, offset) in v {
|
|
|
|
|
+ let seq = self.sequences.get(&node_index).unwrap();
|
|
|
|
|
+ println!("{}{} - {:?}", " ".repeat((offset + decal) as usize).to_string(), seq, node_index);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ let cons = self.get_stretch_consensus(stretch_id);
|
|
|
|
|
+ println!("{}", String::from_utf8_lossy(&cons) );
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn get_stretch_consensus (&self, stretch_id: usize) -> Vec<u8> {
|
|
|
|
|
+ let v = self.get_stretch_align(stretch_id);
|
|
|
|
|
+ let mut ov: Vec<Vec<u8>> = Vec::with_capacity(v.len());
|
|
|
|
|
+ let mut res: Vec<u8> = Vec::new();
|
|
|
|
|
+ if v.len() > 0 {
|
|
|
|
|
+ let decal = -v[0].1;
|
|
|
|
|
+ let mut max_len = 0;
|
|
|
|
|
+ for (node_index, offset) in v {
|
|
|
|
|
+ let seq = self.sequences.get(&node_index).unwrap();
|
|
|
|
|
+ let seq = format!("{}{}", " ".repeat((offset + decal) as usize).to_string(), seq).as_bytes().to_vec();
|
|
|
|
|
+ if max_len < seq.len() { max_len = seq.len(); }
|
|
|
|
|
+ ov.push(seq);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ for i in 0..max_len {
|
|
|
|
|
+ let mut acc: Vec<u8> = Vec::new();
|
|
|
|
|
+ let atcg = vec![b"A"[0], b"T"[0], b"C"[0], b"G"[0]];
|
|
|
|
|
+ let mut n_atcg = [0; 4];
|
|
|
|
|
+
|
|
|
|
|
+ ov.iter().for_each(|s|{
|
|
|
|
|
+ if s.len() > i {
|
|
|
|
|
+ acc.push(s[i].clone());
|
|
|
|
|
+ for nb in 0..4 {
|
|
|
|
|
+ if s[i] == atcg[nb] {
|
|
|
|
|
+ n_atcg[nb] += 1;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ });
|
|
|
|
|
+ let n_base: usize = n_atcg.iter().sum();
|
|
|
|
|
+ let max_base = n_atcg.iter().max().unwrap();
|
|
|
|
|
+ let mut base = b"N"[0];
|
|
|
|
|
+ if *max_base as f32 / n_base as f32 > 0.5 {
|
|
|
|
|
+ n_atcg.iter().enumerate().for_each(|(pos, n)| if n == max_base { base = atcg[pos] });
|
|
|
|
|
+ }
|
|
|
|
|
+ res.push(base);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ res
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn get_fasta (&self) -> String {
|
|
|
|
|
+ let mut fasta = String::new();
|
|
|
|
|
+
|
|
|
|
|
+ for i in 0..self.get_new_stretch_id() {
|
|
|
|
|
+ let cons = self.get_stretch_consensus(i);
|
|
|
|
|
+ fasta.push_str(format!(">sequence_{}_length_{}\n", i + 1, cons.len()).as_str());
|
|
|
|
|
+
|
|
|
|
|
+ let mut acc: Vec<u8> = Vec::new();
|
|
|
|
|
+ cons.into_iter().for_each(|c| {
|
|
|
|
|
+ acc.push(c);
|
|
|
|
|
+ if acc.len() == 60 {
|
|
|
|
|
+ fasta.push_str(format!("{}\n", String::from_utf8_lossy(&acc)).as_str());
|
|
|
|
|
+ acc = Vec::new();
|
|
|
|
|
+ }
|
|
|
|
|
+ });
|
|
|
|
|
+ if acc.len() > 0 {
|
|
|
|
|
+ fasta.push_str(format!("{}\n", String::from_utf8_lossy(&acc)).as_str());
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ fasta
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ pub fn make_neo_contig (&self, stretch_id: usize) -> NeoContig {
|
|
|
|
|
+ let cons = String::from_utf8_lossy(&self.get_stretch_consensus(stretch_id)).to_string();
|
|
|
|
|
+
|
|
|
|
|
+ let stretch = self.get_stretch_align(stretch_id);
|
|
|
|
|
+
|
|
|
|
|
+ let sequences: HashMap<NodeIndex, String> = stretch.iter().map(|(id,_)|{
|
|
|
|
|
+ let (id, s) = self.sequences.get_key_value(id).unwrap();
|
|
|
|
|
+ (*id, s.clone())
|
|
|
|
|
+ }).collect();
|
|
|
|
|
+
|
|
|
|
|
+ let sequences_names: HashMap<NodeIndex, String> = stretch.iter().map(|(id,_)|{
|
|
|
|
|
+ let (id, s) = self.names.get_key_value(id).unwrap();
|
|
|
|
|
+ (*id, s.clone())
|
|
|
|
|
+ }).collect();
|
|
|
|
|
+
|
|
|
|
|
+ NeoContig {
|
|
|
|
|
+ sequence_id: stretch_id,
|
|
|
|
|
+ name: format!("sequence_{stretch_id}"),
|
|
|
|
|
+ contig: cons.clone(),
|
|
|
|
|
+ alignments: Vec::new(),
|
|
|
|
|
+ sequences, sequences_names,
|
|
|
|
|
+ stretch,
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+#[derive(Debug, Clone)]
|
|
|
|
|
+struct Sam {
|
|
|
|
|
+ query_name : String,
|
|
|
|
|
+ query_range: Range<usize>,
|
|
|
|
|
+ ref_name : String,
|
|
|
|
|
+ ref_pos : i64,
|
|
|
|
|
+ ref_range : Range<usize>,
|
|
|
|
|
+ sequence : String,
|
|
|
|
|
+ ref_sequence: String,
|
|
|
|
|
+ ref_cigar : String,
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+impl Sam {
|
|
|
|
|
+ fn new(query_name: String, flag: i32, ref_name: String, ref_pos: i64, cigar: String, sequence: String, fa: &Fasta) -> Self {
|
|
|
|
|
+ let mut sequence = sequence;
|
|
|
|
|
+ let (mut ref_range, mut query_range, ref_cigar) = matched_range(&cigar, &flag, &mut sequence);
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ if is_reverse(flag) {
|
|
|
|
|
+ let len = ref_range.len();
|
|
|
|
|
+ ref_range.end = ref_pos as usize;
|
|
|
|
|
+ ref_range.start = ref_pos as usize + len ;
|
|
|
|
|
+ } else {
|
|
|
|
|
+ let len = ref_range.len();
|
|
|
|
|
+ ref_range.start = ref_pos as usize;
|
|
|
|
|
+ ref_range.end = ref_pos as usize + len;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let ref_sequence = fa.get_sequence(&ref_name, ref_range.start.clone(), ref_range.end.clone());
|
|
|
|
|
+
|
|
|
|
|
+ Sam {query_name, query_range, ref_name, ref_pos, ref_range, sequence, ref_sequence, ref_cigar}
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+#[derive(Debug, Clone)]
|
|
|
|
|
+struct NeoContig {
|
|
|
|
|
+ sequence_id: usize,
|
|
|
|
|
+ name: String,
|
|
|
|
|
+ contig : String,
|
|
|
|
|
+ alignments : Vec<Sam>,
|
|
|
|
|
+ sequences: HashMap<NodeIndex, String>,
|
|
|
|
|
+ sequences_names: HashMap<NodeIndex, String>,
|
|
|
|
|
+ stretch : Vec<(NodeIndex, i32)>,
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+struct Fasta {
|
|
|
|
|
+ fa: IndexedFasta
|
|
|
|
|
+}
|
|
|
|
|
+impl Fasta {
|
|
|
|
|
+ fn new (path: &str) -> Self {
|
|
|
|
|
+ Fasta { fa: IndexedFasta::from_file(path).expect("Error opening fa") }
|
|
|
|
|
+ }
|
|
|
|
|
+ fn get_sequence (&self, contig: &str, start: usize, end: usize) -> String { // end is included
|
|
|
|
|
+ let start = start - 1;
|
|
|
|
|
+ let end = end - 1;
|
|
|
|
|
+ let chr_index = self.fa.fai().tid(contig).expect("Cannot find chr in index");
|
|
|
|
|
+ if (end as i64 - start as i64) < (0 as i64) {
|
|
|
|
|
+ let fv = self.fa.view(chr_index, end, start).expect("Cannot get .fa view");
|
|
|
|
|
+ revcomp(&fv.to_string().to_uppercase())
|
|
|
|
|
+ } else {
|
|
|
|
|
+ let fv = self.fa.view(chr_index, start, end).expect("Cannot get .fa view");
|
|
|
|
|
+ fv.to_string().to_uppercase()
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn is_reverse (flag: i32) -> bool {
|
|
|
|
|
+ flag & 0x10 == 0x10
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn matched_range (cigar: &str, flag: &i32, matched_seq: &mut str) -> (Range<usize>, Range<usize>, String) {
|
|
|
|
|
+ let mut n_head_to_rm = 0;
|
|
|
|
|
+ for c in matched_seq.as_bytes().to_vec().iter() {
|
|
|
|
|
+ if *c == b"N"[0] { n_head_to_rm += 1 } else { break; }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let mut n_trail_to_rm = 0;
|
|
|
|
|
+ for c in matched_seq.as_bytes().to_vec().iter().rev() {
|
|
|
|
|
+ if *c == b"N"[0] { n_trail_to_rm += 1 } else { break; }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let all_op = vec!["M", "I", "D", "N", "S", "=", "X", "H", "P"];
|
|
|
|
|
+ let consumes_query = vec!["M", "I", "S", "=", "X", "H"];
|
|
|
|
|
+ let consumes_ref = vec!["M", "D", "N", "=", "X"];
|
|
|
|
|
+
|
|
|
|
|
+ let mut range_query:Range<usize> = Range::default();
|
|
|
|
|
+ let mut range_ref:Range<usize> = Range::default();
|
|
|
|
|
+
|
|
|
|
|
+ let mut num_acc = String::new();
|
|
|
|
|
+
|
|
|
|
|
+ let mut query_pos = 0;
|
|
|
|
|
+ let mut ref_pos = 0;
|
|
|
|
|
+ let mut ref_cigar_string = "".to_string();
|
|
|
|
|
+
|
|
|
|
|
+ let mut pos = 0;
|
|
|
|
|
+
|
|
|
|
|
+ let mut first_m = true;
|
|
|
|
|
+ let n_op = cigar.split("").filter(|c| all_op.contains(c)).count();
|
|
|
|
|
+ let mut curr_op = 1;
|
|
|
|
|
+ for f in cigar.split("") {
|
|
|
|
|
+ match f.parse::<usize>() {
|
|
|
|
|
+ Ok(_) => num_acc.push_str(f), // if a number added to the accumulator
|
|
|
|
|
+ Err(_) => { // if a letter
|
|
|
|
|
+ if f != "" {
|
|
|
|
|
+ //parse the accumulator
|
|
|
|
|
+ let mut current_add = num_acc.parse::<usize>().unwrap();
|
|
|
|
|
+ num_acc = "".to_string();
|
|
|
|
|
+
|
|
|
|
|
+ if n_head_to_rm > 0 && curr_op == 1 {
|
|
|
|
|
+ current_add = current_add - n_head_to_rm;
|
|
|
|
|
+ }
|
|
|
|
|
+ if n_trail_to_rm > 0 && curr_op == n_op {
|
|
|
|
|
+ current_add = current_add - n_trail_to_rm;
|
|
|
|
|
+ }
|
|
|
|
|
+ curr_op += 1;
|
|
|
|
|
+
|
|
|
|
|
+ if f == "M" {
|
|
|
|
|
+ if first_m {
|
|
|
|
|
+ range_query.start = query_pos;
|
|
|
|
|
+ range_query.end = query_pos + current_add;
|
|
|
|
|
+
|
|
|
|
|
+ range_ref.start = ref_pos;
|
|
|
|
|
+ range_ref.end = ref_pos + current_add;
|
|
|
|
|
+
|
|
|
|
|
+ first_m = false;
|
|
|
|
|
+ } else {
|
|
|
|
|
+ range_query.end = query_pos + current_add;
|
|
|
|
|
+ range_ref.end = ref_pos + current_add;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if consumes_query.contains(&f) {
|
|
|
|
|
+ query_pos += current_add;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if consumes_ref.contains(&f) {
|
|
|
|
|
+ ref_pos += current_add;
|
|
|
|
|
+ let toadd = f.repeat(current_add);
|
|
|
|
|
+ ref_cigar_string.push_str(&toadd);
|
|
|
|
|
+ }
|
|
|
|
|
+ pos += current_add;
|
|
|
|
|
+ }
|
|
|
|
|
+ },
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if is_reverse(*flag) {
|
|
|
|
|
+ let query_len = range_query.len();
|
|
|
|
|
+ range_query.start = query_pos - range_query.end;
|
|
|
|
|
+ range_query.end = range_query.start + query_len;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // println!("{}", matched_seq);
|
|
|
|
|
+ // println!("{:?}", range_ref);
|
|
|
|
|
+ // println!("{:?}", range_query);
|
|
|
|
|
+ // println!("{:?}", ref_cigar_string.len());
|
|
|
|
|
+ // println!("{:?}", range_ref.len());
|
|
|
|
|
+ // println!("{}", ref_cigar_string);
|
|
|
|
|
+ assert_eq!(range_ref.len(), ref_cigar_string.len());
|
|
|
|
|
+
|
|
|
|
|
+ (range_ref, range_query, ref_cigar_string)
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn revcomp(dna: &str) -> String{
|
|
|
|
|
+ let mut rdna: String = String::with_capacity(dna.len());
|
|
|
|
|
+ for c in dna.chars().rev() {
|
|
|
|
|
+ rdna.push(switch_base(c));
|
|
|
|
|
+ }
|
|
|
|
|
+ rdna
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn switch_base(c:char) -> char {
|
|
|
|
|
+ match c {
|
|
|
|
|
+ 'a' => 't' ,
|
|
|
|
|
+ 'c' => 'g' ,
|
|
|
|
|
+ 't' => 'a' ,
|
|
|
|
|
+ 'g' => 'c' ,
|
|
|
|
|
+ 'u' => 'a',
|
|
|
|
|
+ 'A' => 'T' ,
|
|
|
|
|
+ 'C' => 'G' ,
|
|
|
|
|
+ 'T' => 'A' ,
|
|
|
|
|
+ 'G' => 'C',
|
|
|
|
|
+ 'U' => 'A',
|
|
|
|
|
+ _ => 'N'
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn get_overlaps (a: &Vec<u8>, b: &Vec<u8>, min_len: usize, max_consecutive: usize, max_diffs: usize) -> Option<(Range<usize>, Range<usize>)> {
|
|
|
|
|
+ let (a_len, b_len) = (a.len(), b.len());
|
|
|
|
|
+ let (smallest_len, biggest_len) = if a_len <= b_len { (a_len, b_len) } else { (b_len, a_len) };
|
|
|
|
|
+
|
|
|
|
|
+ let mut res: Option<(Range<usize>, Range<usize>)> = None;
|
|
|
|
|
+ for i in min_len..(smallest_len + (biggest_len - smallest_len)) {
|
|
|
|
|
+ let range_a: Range<usize>;
|
|
|
|
|
+ let range_b: Range<usize>;
|
|
|
|
|
+ if i <= smallest_len {
|
|
|
|
|
+ range_a = Range { start: 0, end: i };
|
|
|
|
|
+ range_b = Range { start: b_len - i, end: b_len };
|
|
|
|
|
+ } else {
|
|
|
|
|
+ // check if inside
|
|
|
|
|
+ if smallest_len == a_len {
|
|
|
|
|
+ let after = i - smallest_len - 1;
|
|
|
|
|
+ range_a = Range { start: 0, end: a_len };
|
|
|
|
|
+ range_b = Range { start: after, end: a_len + after};
|
|
|
|
|
+ } else {
|
|
|
|
|
+ let after = i - smallest_len - 1;
|
|
|
|
|
+ range_a = Range { start: after, end: b_len + after};
|
|
|
|
|
+ range_b = Range { start: 0, end: b_len};
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if &a[range_a.clone()] == &b[range_b.clone()] {
|
|
|
|
|
+ res = Some((range_a, range_b));
|
|
|
|
|
+ continue;
|
|
|
|
|
+ } else if match_tol(&a[range_a.clone()], &b[range_b.clone()], max_consecutive, max_diffs) {
|
|
|
|
|
+ res = Some((range_a, range_b));
|
|
|
|
|
+ continue;
|
|
|
|
|
+ } else if res.is_some() {
|
|
|
|
|
+ return res;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ res
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn fuse_overlaps (a: &Vec<u8>, b: &Vec<u8>, a_range: &Range<usize>, b_range: &Range<usize>) -> Vec<u8> {
|
|
|
|
|
+ let mut res: Vec<u8> = Vec::new();
|
|
|
|
|
+ if a_range.start == 0 {
|
|
|
|
|
+ res = [b.clone(), a[a_range.end..a.len()].to_vec() ].concat().to_vec();
|
|
|
|
|
+ } else if b_range.start == 0 {
|
|
|
|
|
+ res = [a.clone(), b[b_range.end..b.len()].to_vec() ].concat().to_vec();
|
|
|
|
|
+ }
|
|
|
|
|
+ res
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+// max one consecutive diff and 5 diffs
|
|
|
|
|
+fn match_tol (s1: &[u8], s2: &[u8], max_consecutive: usize, max_diffs: usize) -> bool {
|
|
|
|
|
+ let max_consecutive= max_consecutive + 2;
|
|
|
|
|
+ // let max_diffs: usize = 5;
|
|
|
|
|
+ let mut diffs: Vec<usize> = Vec::new();
|
|
|
|
|
+
|
|
|
|
|
+ for (a, b) in std::iter::zip(s1, s2) {
|
|
|
|
|
+ diffs.push((a == b) as usize);
|
|
|
|
|
+ let sum: usize = diffs.iter().rev().take(max_consecutive).sum();
|
|
|
|
|
+ if sum == 0 && diffs.len() >= max_consecutive { return false }
|
|
|
|
|
+ }
|
|
|
|
|
+ let sum: usize = diffs.iter().rev().take(max_consecutive).sum();
|
|
|
|
|
+
|
|
|
|
|
+ if sum == max_consecutive {
|
|
|
|
|
+ diffs.len() - diffs.iter().sum::<usize>() <= max_diffs
|
|
|
|
|
+ } else {
|
|
|
|
|
+ false
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+#[cfg(test)]
|
|
|
|
|
+mod tests {
|
|
|
|
|
+ use super::*;
|
|
|
|
|
+
|
|
|
|
|
+ #[test]
|
|
|
|
|
+ fn it_works() {
|
|
|
|
|
+ // let result = add(2, 2);
|
|
|
|
|
+ // assert_eq!(result, 4);
|
|
|
|
|
+ }
|
|
|
|
|
+}
|