|
|
@@ -1,62 +1,61 @@
|
|
|
use duct::cmd;
|
|
|
+use faimm::IndexedFasta;
|
|
|
|
|
|
fn main() {
|
|
|
- let fasta_ref_path = "/home/thomas/NGS/ref/hg19.fa";
|
|
|
- let sequence = "
|
|
|
- CATTATATTCTTTTTTACAGTGCCACGGGTATATTTAGTAAACATTGAAATGTGTGGTTG
|
|
|
- GAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTGTA
|
|
|
- GACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATATTA
|
|
|
- TTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACGGC
|
|
|
- TGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTAGGTCCAAGAACTACAACGGCT
|
|
|
- GCTGCCGGCCGGGCCCGCTCCCTGGCCTACCTTGCACAAGAAGTCGACATCGTAGAAGGC
|
|
|
- GGACAGAAGTGTGGTCGACATGTTTCTGGAA"
|
|
|
- .replace("\n", "")
|
|
|
- .replace(" ", "");
|
|
|
+ let ref_path = "/home/thomas/NGS/ref/hg19.fa";
|
|
|
+ // let sequence = "
|
|
|
+ // CATTATATTCTTTTTTACAGTGCCACGGGTATATTTAGTAAACATTGAAATGTGTGGTTG
|
|
|
+ // GAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTGTA
|
|
|
+ // GACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATATTA
|
|
|
+ // TTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACGGC
|
|
|
+ // TGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTAGGTCCAAGAACTACAACGGCT
|
|
|
+ // GCTGCCGGCCGGGCCCGCTCCCTGGCCTACCTTGCACAAGAAGTCGACATCGTAGAAGGC
|
|
|
+ // GGACAGAAGTGTGGTCGACATGTTTCTGGAA"
|
|
|
+ // .replace("\n", "")
|
|
|
+ // .replace(" ", "");
|
|
|
|
|
|
+ // let sequence = "
|
|
|
+ // CGTTATTCGTCGTGGCTCAAGCCCGGCCACGCCGCCCCAAGGGCTCCTCCCGACCTCCCG
|
|
|
+ // GCCTGCCGCTCCGGCCACTGCGGGATCCAGAAACATGTCGACCACACTTCTGTCCGCCTT
|
|
|
+ // CTACGATGTCGACTTCTTGTGCAAGGTAGGCCAGGGAGCGGGCCCGGCCGGCAGCAGCCG
|
|
|
+ // TTGTAGTTCTTGGACCTAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGCCGT
|
|
|
+ // GACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATAATA
|
|
|
+ // TATGAGTTT"
|
|
|
+ // .replace("\n", "")
|
|
|
+ // .replace(" ", "");
|
|
|
+
|
|
|
+ //2011N160703-HAOAA 1
|
|
|
let sequence = "
|
|
|
- CATTATATTCTTTTTTACAGTGCCACGGGTATATTTAGTAAACATTGAAATGTGTGGTTG
|
|
|
- GAATTGTTCATCAGTGATATTTGTTTATGTTCTGAAGCTATTGCTGTA
|
|
|
- GACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATATTA
|
|
|
- TTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACGGC
|
|
|
- TGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTAGGTCCAAGAACTACAACGGCT
|
|
|
- GCTGCCGGCCGGGCCCGCTCCCTGGCCTACCTTGCACAAGAAGTCGACATCGTAGAAGGC
|
|
|
- GGACAGAAGTGTGGTCGACATGTTTCTGGAA"
|
|
|
+ CCTCCCGCCCTGCCTTTCAGGTCGGGAAAGTCCCGGGGTTTGCAAAAGAGTGTCCGAGCG
|
|
|
+ CCCTGGAGGCGGGGAGGGCGGCAAGGAGGGCGCCGGTGTCGCGGTTGAGTTTCTCCACTG
|
|
|
+ CCGACCGCGGCCACGCTGCCCGGGGCTTCCCGGACAGGCTTCGCGCCGCCCACCTCGGCA
|
|
|
+ GCCGGGGCGGA
|
|
|
+ TGG
|
|
|
+ CCACACAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGC
|
|
|
+ CGTGACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATA
|
|
|
+ ATATATGAGTT"
|
|
|
.replace("\n", "")
|
|
|
.replace(" ", "");
|
|
|
+ //2011N160703-HAOAA 2
|
|
|
+ // let sequence = "
|
|
|
+ // TGGAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTG
|
|
|
+ // TAGACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATAT
|
|
|
+ // TATTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACG
|
|
|
+ // GCTGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTGTGTGGCCATCCGCCCCGGC
|
|
|
+ // TGCCGAGGTGGGCGGCGCGAAGCCTGTCCGGGAAGCCCCGGGCAGCGTGGCCGCGGTCGG
|
|
|
+ // CAGTGGAGAAACTCAACCGCGACACC"
|
|
|
+ // .replace("\n", "")
|
|
|
+ // .replace(" ", "");
|
|
|
+
|
|
|
+ let alns = Alignments::to_reference(&sequence, ref_path);
|
|
|
|
|
|
- println!("sequence len: {}", sequence.len());
|
|
|
- println!(">sequence\n{}", sequence);
|
|
|
-
|
|
|
- let aln = bwa_aln(fasta_ref_path, &sequence);
|
|
|
-
|
|
|
- aln.iter()
|
|
|
- .for_each(|e| {
|
|
|
- println!("seq len: {:?}", e.seq.len());
|
|
|
- println!("cigar: {:?}", e.cigar.0);
|
|
|
- println!("reverse: {:?}", is_reverse(e.flag));
|
|
|
- println!("cigar len: {:?}", e.cigar.len());
|
|
|
- println!("pos: {}:{}", e.ref_name, e.pos);
|
|
|
- if is_reverse(e.flag) {
|
|
|
- // println!("{:?}", &e.seq);
|
|
|
- println!("rc {:?}", revcomp(&e.seq));
|
|
|
- println!("{:?}", e.cigar.expand().chars().rev().collect::<String>());
|
|
|
-
|
|
|
- // println!("{:?}", e.cigar.expand());
|
|
|
+ alns.ranges.iter().for_each(|r| println!("{:?}", r));
|
|
|
|
|
|
- } else {
|
|
|
- println!("{:?}", e.seq);
|
|
|
- println!("{:?}", e.cigar.expand());
|
|
|
- }
|
|
|
-
|
|
|
- println!("-------------------------------");
|
|
|
-
|
|
|
- });
|
|
|
- make_aln(&sequence, &aln);
|
|
|
}
|
|
|
|
|
|
-pub fn bwa_aln(fasta_ref_path: &str, sequence: &str) -> Vec<Sam> {
|
|
|
- let stdout = cmd!("echo", "-e", format!("\">sequence_to_query\n{}\"", sequence))
|
|
|
- .pipe(cmd!("bwa", "mem", fasta_ref_path, "-"))
|
|
|
+pub fn bwa_aln(ref_path: &str, query_sequence: &str) -> Vec<Sam> {
|
|
|
+ let stdout = cmd!("echo", "-e", format!("\">sequence_to_query\n{}\"", query_sequence))
|
|
|
+ .pipe(cmd!("bwa", "mem", ref_path, "-"))
|
|
|
.stdout_capture()
|
|
|
.stderr_capture()
|
|
|
.read()
|
|
|
@@ -68,13 +67,19 @@ pub fn bwa_aln(fasta_ref_path: &str, sequence: &str) -> Vec<Sam> {
|
|
|
.into_iter()
|
|
|
.map(|mut e| {
|
|
|
// if the sequence is returned by bwa with an N at first position remove it
|
|
|
- if e.seq.len() != sequence.len() {
|
|
|
+ if e.seq.len() != query_sequence.len() {
|
|
|
if e.seq.chars().nth(0).unwrap() == 'N' {
|
|
|
let (_, seq) = e.seq.split_at(1);
|
|
|
e.seq = seq.to_string();
|
|
|
let mut cig = e.cigar.0.get_mut(0).unwrap();
|
|
|
cig.1 -= 1;
|
|
|
- // e.pos += 1;
|
|
|
+ }
|
|
|
+ if e.seq.chars().last().unwrap() == 'N' {
|
|
|
+ let (seq, _) = e.seq.split_at(e.seq.len());
|
|
|
+ e.seq = seq.to_string();
|
|
|
+ let cl = e.cigar.0.len();
|
|
|
+ let mut cig = e.cigar.0.get_mut(cl - 1).unwrap();
|
|
|
+ cig.1 -= 1;
|
|
|
}
|
|
|
}
|
|
|
e
|
|
|
@@ -113,13 +118,13 @@ impl Sam {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- pub fn is_reverse(&self) -> bool {
|
|
|
+ pub fn is_rc(&self) -> bool {
|
|
|
self.flag & 0x10 == 0x10
|
|
|
}
|
|
|
}
|
|
|
|
|
|
#[derive(Debug, Clone)]
|
|
|
-pub struct Cigar(Vec<(String, u32)>);
|
|
|
+pub struct Cigar(pub Vec<(String, u32)>);
|
|
|
|
|
|
impl Cigar {
|
|
|
pub fn new_from_str(cigar_str: &str) -> Cigar {
|
|
|
@@ -131,7 +136,6 @@ impl Cigar {
|
|
|
Err(_) => {
|
|
|
let current_add = num_acc.parse::<u32>().unwrap();
|
|
|
num_acc = "".to_string();
|
|
|
-
|
|
|
res.push((c.to_string(), current_add));
|
|
|
}
|
|
|
}}
|
|
|
@@ -158,11 +162,30 @@ impl Cigar {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-fn is_reverse (flag: u16) -> bool {
|
|
|
- flag & 0x10 == 0x10
|
|
|
+pub struct Alignments {
|
|
|
+ pub query_sequence : String,
|
|
|
+ pub sam : Vec<Sam>,
|
|
|
+ pub ranges : Vec<Vec<((i32, i32), (i32, i32))>>,
|
|
|
+}
|
|
|
+
|
|
|
+impl Alignments {
|
|
|
+ pub fn to_reference(query_sequence: &str, ref_path: &str) -> Alignments {
|
|
|
+ let res_sam = bwa_aln(ref_path, query_sequence);
|
|
|
+
|
|
|
+ let mut res_ranges = Vec::new();
|
|
|
+ for sam in &res_sam {
|
|
|
+ res_ranges.push(ranges_from_cigar(sam.pos, sam.cigar.clone(), sam.is_rc()));
|
|
|
+ }
|
|
|
+
|
|
|
+ Alignments {
|
|
|
+ query_sequence: query_sequence.to_string(),
|
|
|
+ sam: res_sam,
|
|
|
+ ranges: res_ranges
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
-pub fn revcomp(dna: &str) -> String{
|
|
|
+pub fn revcomp(dna: &str) -> String {
|
|
|
let mut rdna: String = String::with_capacity(dna.len());
|
|
|
for c in dna.chars().rev() {
|
|
|
rdna.push(switch_base(c));
|
|
|
@@ -170,7 +193,7 @@ pub fn revcomp(dna: &str) -> String{
|
|
|
rdna
|
|
|
}
|
|
|
|
|
|
-fn switch_base(c:char) -> char {
|
|
|
+pub fn switch_base(c: char) -> char {
|
|
|
match c {
|
|
|
'a' => 't',
|
|
|
'c' => 'g',
|
|
|
@@ -186,62 +209,101 @@ fn switch_base(c:char) -> char {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-fn make_aln(seq_query: &str, alignements: &Vec<Sam>) {
|
|
|
- let n_line = 80;
|
|
|
- let consumes_query = vec![b'M', b'I', b'S', b'=', b'X', b'H'];
|
|
|
- // let consumes_ref = vec![b'M', b'D', b'N', b'=', b'X'];
|
|
|
- let consumes_ref = vec![b'M', b'D', b'I', b'N', b'=', b'X'];
|
|
|
- let no_bar_with = vec![b'I', b'H', b'S'];
|
|
|
-
|
|
|
- let mut stack: Vec<Vec<u8>> = Vec::new();
|
|
|
- stack.push(seq_query.as_bytes().to_vec());
|
|
|
-
|
|
|
- for alignement in alignements {
|
|
|
- let mut ref_seq = alignement.seq.clone().as_bytes().to_vec();
|
|
|
- let mut cigar = alignement.cigar.expand().as_bytes().to_vec();
|
|
|
- if alignement.is_reverse() {
|
|
|
- ref_seq = revcomp(&alignement.seq).as_bytes().to_vec();
|
|
|
- cigar = cigar.into_iter().rev().collect();
|
|
|
+/// return [start, stop] 1-based positions relative to reference and query
|
|
|
+pub fn ranges_from_cigar(ref_start: i32, cigar: Cigar, is_rc: bool) -> Vec<((i32, i32), (i32, i32))> {
|
|
|
+ let cigar = if is_rc { cigar.expand().chars().rev().collect::<Vec<char>>() } else { cigar.expand().chars().collect::<Vec<char>>() };
|
|
|
+ let mut ref_counter = 0;
|
|
|
+ let mut query_counter = 0;
|
|
|
+
|
|
|
+ let mut query_matches: Vec<i32> = Vec::new();
|
|
|
+ let mut ref_matches: Vec<i32> = Vec::new();
|
|
|
+ for op in cigar.iter() {
|
|
|
+ match op {
|
|
|
+ 'M' | '=' => {
|
|
|
+ query_matches.push(query_counter);
|
|
|
+ ref_matches.push(ref_counter);
|
|
|
+
|
|
|
+ ref_counter += 1;
|
|
|
+ query_counter += 1;
|
|
|
+ },
|
|
|
+ 'S' | 'H' | 'I' => {
|
|
|
+ query_counter += 1;
|
|
|
+ },
|
|
|
+ 'D' | 'N' => {
|
|
|
+ ref_counter += 1;
|
|
|
+ },
|
|
|
+ _ => panic!("Unknown cigar operand")
|
|
|
}
|
|
|
- let mut matches: Vec<u8> = Vec::new();
|
|
|
- let mut ref_matches: Vec<u8> = Vec::new();
|
|
|
- let mut ref_seq = ref_seq.iter();
|
|
|
- for pos in 0..cigar.len() {
|
|
|
- let op = *cigar.get(pos).unwrap();
|
|
|
- if op == b'M' {
|
|
|
- matches.push(b'|');
|
|
|
- } else {
|
|
|
- if no_bar_with.contains(&op) { matches.push(b' '); }
|
|
|
- }
|
|
|
- if !consumes_ref.contains(&op) {
|
|
|
- ref_matches.push(b' ');
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut query_ranges = Vec::new();
|
|
|
+ let mut acc_range = None;
|
|
|
+ for qp in query_matches.into_iter() {
|
|
|
+ if let Some((start, stop)) = acc_range {
|
|
|
+ if stop == qp - 1 {
|
|
|
+ acc_range = Some((start, qp));
|
|
|
} else {
|
|
|
- ref_matches.push(*ref_seq.next().unwrap());
|
|
|
+ query_ranges.push(acc_range.unwrap());
|
|
|
+ acc_range = Some((qp, qp));
|
|
|
}
|
|
|
- }
|
|
|
- stack.push(matches);
|
|
|
- stack.push(ref_matches);
|
|
|
+ } else {
|
|
|
+ acc_range = Some((qp, qp));
|
|
|
+ }
|
|
|
}
|
|
|
+ query_ranges.push(acc_range.unwrap());
|
|
|
+ query_ranges = query_ranges.iter().map(|e| (e.0 +1, e.1 +1)).collect();
|
|
|
|
|
|
- let max_len = stack.iter().map(|e| e.len()).max().unwrap();
|
|
|
-
|
|
|
- let mut lines: Vec<Vec<String>> = Vec::new();
|
|
|
- let mut acc_lines: Vec<String> = vec![String::new(); stack.len()];
|
|
|
- for p in 1..(max_len + 1) {
|
|
|
- let pos = p - 1;
|
|
|
- for (i, l) in stack.iter().enumerate() {
|
|
|
- if let Some(al) = l.get(pos) {
|
|
|
- acc_lines[i].push(*al as char);
|
|
|
+ let mut ref_ranges = Vec::new();
|
|
|
+ let mut acc_range = None;
|
|
|
+ for qp in ref_matches.into_iter() {
|
|
|
+ if let Some((start, stop)) = acc_range {
|
|
|
+ if stop == qp - 1 {
|
|
|
+ acc_range = Some((start, qp));
|
|
|
} else {
|
|
|
- acc_lines[i].push(' ');
|
|
|
+ query_ranges.push(acc_range.unwrap());
|
|
|
+ acc_range = Some((qp, qp));
|
|
|
}
|
|
|
+ } else {
|
|
|
+ acc_range = Some((qp, qp));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ ref_ranges.push(acc_range.unwrap());
|
|
|
+
|
|
|
+ ref_ranges = ref_ranges.iter().map(|(start, stop)| {
|
|
|
+ if is_rc {
|
|
|
+ (ref_start + stop, ref_start + start)
|
|
|
+ } else {
|
|
|
+ (ref_start + start, ref_start + stop)
|
|
|
}
|
|
|
+ }).collect();
|
|
|
+
|
|
|
+ ref_ranges.into_iter().zip(query_ranges.into_iter()).collect()
|
|
|
+}
|
|
|
|
|
|
- if p % n_line == 0 {
|
|
|
- lines.push(acc_lines.clone());
|
|
|
- acc_lines.iter().for_each(|e| println!("{}", e));
|
|
|
- acc_lines = vec![String::new(); stack.len()];
|
|
|
+pub fn format_seq(name: &str, sequence: &str, line_size: usize) -> String {
|
|
|
+ // todo check fasta name compl
|
|
|
+ let res = sequence.chars()
|
|
|
+ .collect::<Vec<char>>()
|
|
|
+ .chunks(line_size)
|
|
|
+ .map(|c| c.iter().collect::<String>())
|
|
|
+ .collect::<Vec<String>>()
|
|
|
+ .join("");
|
|
|
+ format!(">{name}\n{res}")
|
|
|
+}
|
|
|
+
|
|
|
+pub fn compose_seq(ref_path: &str, ranges: Vec<(String, i32, i32)>) -> String {
|
|
|
+ let fa = IndexedFasta::from_file(ref_path).expect("Error opening fa");
|
|
|
+
|
|
|
+ let mut sequence = String::new();
|
|
|
+ for (contig, start, stop) in ranges {
|
|
|
+ let chr_index = fa.fai().tid(&contig).expect("Cannot find chr in index");
|
|
|
+ if start < stop {
|
|
|
+ let v = fa.view(chr_index, start as usize,stop as usize).expect("Cannot get .fa view");
|
|
|
+ sequence = format!("{}{}", sequence, v.to_string());
|
|
|
+ } else {
|
|
|
+ let v = fa.view(chr_index, stop as usize, start as usize).expect("Cannot get .fa view");
|
|
|
+ sequence = format!("{}{}", sequence, revcomp(&v.to_string()));
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+ sequence
|
|
|
}
|