Thomas 2 年之前
父节点
当前提交
091a4b5006
共有 2 个文件被更改,包括 167 次插入104 次删除
  1. 1 0
      Cargo.toml
  2. 166 104
      src/main.rs

+ 1 - 0
Cargo.toml

@@ -7,3 +7,4 @@ edition = "2021"
 
 [dependencies]
 duct = "0.13.5"
+faimm = "0.3.0"

+ 166 - 104
src/main.rs

@@ -1,62 +1,61 @@
 use duct::cmd;
+use faimm::IndexedFasta;
 
 fn main() {
-    let fasta_ref_path = "/home/thomas/NGS/ref/hg19.fa";
-    let sequence = "
-    CATTATATTCTTTTTTACAGTGCCACGGGTATATTTAGTAAACATTGAAATGTGTGGTTG
-    GAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTGTA
-    GACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATATTA
-    TTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACGGC
-    TGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTAGGTCCAAGAACTACAACGGCT
-    GCTGCCGGCCGGGCCCGCTCCCTGGCCTACCTTGCACAAGAAGTCGACATCGTAGAAGGC
-    GGACAGAAGTGTGGTCGACATGTTTCTGGAA"
-    .replace("\n", "")
-    .replace(" ", "");
+    let ref_path = "/home/thomas/NGS/ref/hg19.fa";
+    // let sequence = "
+    // CATTATATTCTTTTTTACAGTGCCACGGGTATATTTAGTAAACATTGAAATGTGTGGTTG
+    // GAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTGTA
+    // GACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATATTA
+    // TTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACGGC
+    // TGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTAGGTCCAAGAACTACAACGGCT
+    // GCTGCCGGCCGGGCCCGCTCCCTGGCCTACCTTGCACAAGAAGTCGACATCGTAGAAGGC
+    // GGACAGAAGTGTGGTCGACATGTTTCTGGAA"
+    // .replace("\n", "")
+    // .replace(" ", "");
 
+    // let sequence = "
+    // CGTTATTCGTCGTGGCTCAAGCCCGGCCACGCCGCCCCAAGGGCTCCTCCCGACCTCCCG
+    // GCCTGCCGCTCCGGCCACTGCGGGATCCAGAAACATGTCGACCACACTTCTGTCCGCCTT
+    // CTACGATGTCGACTTCTTGTGCAAGGTAGGCCAGGGAGCGGGCCCGGCCGGCAGCAGCCG
+    // TTGTAGTTCTTGGACCTAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGCCGT
+    // GACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATAATA
+    // TATGAGTTT"
+    // .replace("\n", "")
+    // .replace(" ", "");
+
+    //2011N160703-HAOAA 1
     let sequence = "
-    CATTATATTCTTTTTTACAGTGCCACGGGTATATTTAGTAAACATTGAAATGTGTGGTTG
-    GAATTGTTCATCAGTGATATTTGTTTATGTTCTGAAGCTATTGCTGTA
-    GACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATATTA
-    TTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACGGC
-    TGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTAGGTCCAAGAACTACAACGGCT
-    GCTGCCGGCCGGGCCCGCTCCCTGGCCTACCTTGCACAAGAAGTCGACATCGTAGAAGGC
-    GGACAGAAGTGTGGTCGACATGTTTCTGGAA"
+    CCTCCCGCCCTGCCTTTCAGGTCGGGAAAGTCCCGGGGTTTGCAAAAGAGTGTCCGAGCG
+    CCCTGGAGGCGGGGAGGGCGGCAAGGAGGGCGCCGGTGTCGCGGTTGAGTTTCTCCACTG
+    CCGACCGCGGCCACGCTGCCCGGGGCTTCCCGGACAGGCTTCGCGCCGCCCACCTCGGCA
+    GCCGGGGCGGA
+    TGG
+    CCACACAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGC
+    CGTGACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATA
+    ATATATGAGTT"
     .replace("\n", "")
     .replace(" ", "");
+    //2011N160703-HAOAA 2
+    // let sequence = "
+    // TGGAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTG
+    // TAGACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATAT
+    // TATTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACG
+    // GCTGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTGTGTGGCCATCCGCCCCGGC
+    // TGCCGAGGTGGGCGGCGCGAAGCCTGTCCGGGAAGCCCCGGGCAGCGTGGCCGCGGTCGG
+    // CAGTGGAGAAACTCAACCGCGACACC"
+    // .replace("\n", "")
+    // .replace(" ", "");
+    
+    let alns = Alignments::to_reference(&sequence, ref_path);
 
-    println!("sequence len: {}", sequence.len());
-    println!(">sequence\n{}", sequence);
-
-    let aln = bwa_aln(fasta_ref_path, &sequence);
-
-    aln.iter()
-    .for_each(|e| {
-        println!("seq len: {:?}", e.seq.len());
-        println!("cigar: {:?}", e.cigar.0);
-        println!("reverse: {:?}", is_reverse(e.flag));
-        println!("cigar len: {:?}", e.cigar.len());
-        println!("pos: {}:{}", e.ref_name, e.pos);
-        if is_reverse(e.flag) {
-            // println!("{:?}", &e.seq);
-            println!("rc {:?}", revcomp(&e.seq));
-            println!("{:?}", e.cigar.expand().chars().rev().collect::<String>());
-
-            // println!("{:?}", e.cigar.expand());
+    alns.ranges.iter().for_each(|r| println!("{:?}", r));
 
-        } else {
-            println!("{:?}", e.seq);
-            println!("{:?}", e.cigar.expand());
-        }
-        
-        println!("-------------------------------");
-
-    });
-    make_aln(&sequence, &aln);
 }
 
-pub fn bwa_aln(fasta_ref_path: &str, sequence: &str) -> Vec<Sam> {
-    let stdout = cmd!("echo", "-e", format!("\">sequence_to_query\n{}\"", sequence))
-    .pipe(cmd!("bwa", "mem", fasta_ref_path, "-"))
+pub fn bwa_aln(ref_path: &str, query_sequence: &str) -> Vec<Sam> {
+    let stdout = cmd!("echo", "-e", format!("\">sequence_to_query\n{}\"", query_sequence))
+    .pipe(cmd!("bwa", "mem", ref_path, "-"))
     .stdout_capture()
     .stderr_capture()
     .read()
@@ -68,13 +67,19 @@ pub fn bwa_aln(fasta_ref_path: &str, sequence: &str) -> Vec<Sam> {
     .into_iter()
     .map(|mut e| {
         // if the sequence is returned by bwa with an N at first position remove it
-        if e.seq.len() != sequence.len() {
+        if e.seq.len() != query_sequence.len() {
             if e.seq.chars().nth(0).unwrap() == 'N' {
                 let (_, seq) = e.seq.split_at(1);
                 e.seq = seq.to_string();
                 let mut cig = e.cigar.0.get_mut(0).unwrap();
                 cig.1 -= 1;
-                // e.pos += 1;
+            }
+            if e.seq.chars().last().unwrap() == 'N' {
+                let (seq, _) = e.seq.split_at(e.seq.len());
+                e.seq = seq.to_string();
+                let cl = e.cigar.0.len();
+                let mut cig = e.cigar.0.get_mut(cl - 1).unwrap();
+                cig.1 -= 1;
             }
         }
         e
@@ -113,13 +118,13 @@ impl Sam {
         }
     }
 
-    pub fn is_reverse(&self) -> bool {
+    pub fn is_rc(&self) -> bool {
         self.flag & 0x10 == 0x10
     }
 }
 
 #[derive(Debug, Clone)]
-pub struct Cigar(Vec<(String, u32)>);
+pub struct Cigar(pub Vec<(String, u32)>);
 
 impl Cigar {
     pub fn new_from_str(cigar_str: &str) -> Cigar {
@@ -131,7 +136,6 @@ impl Cigar {
                 Err(_) => {
                     let current_add = num_acc.parse::<u32>().unwrap();
                     num_acc = "".to_string();
-
                     res.push((c.to_string(), current_add));
                 }
             }}
@@ -158,11 +162,30 @@ impl Cigar {
     }
 }
 
-fn is_reverse (flag: u16) -> bool {
-    flag & 0x10 == 0x10
+pub struct Alignments {
+    pub query_sequence : String,
+    pub sam            : Vec<Sam>,
+    pub ranges         : Vec<Vec<((i32, i32), (i32, i32))>>,
+}
+
+impl Alignments {
+    pub fn to_reference(query_sequence: &str, ref_path: &str) -> Alignments {
+        let res_sam = bwa_aln(ref_path, query_sequence);
+        
+        let mut res_ranges = Vec::new();
+        for sam in &res_sam {
+            res_ranges.push(ranges_from_cigar(sam.pos, sam.cigar.clone(), sam.is_rc()));
+        }
+        
+        Alignments {
+            query_sequence: query_sequence.to_string(),
+            sam: res_sam,
+            ranges: res_ranges
+        }
+    }
 }
 
-pub fn revcomp(dna: &str) -> String{
+pub fn revcomp(dna: &str) -> String {
     let mut rdna: String = String::with_capacity(dna.len()); 
     for c in dna.chars().rev() {
         rdna.push(switch_base(c));
@@ -170,7 +193,7 @@ pub fn revcomp(dna: &str) -> String{
     rdna
 }
 
-fn switch_base(c:char) -> char {
+pub fn switch_base(c: char) -> char {
     match c {
         'a' => 't',
         'c' => 'g',
@@ -186,62 +209,101 @@ fn switch_base(c:char) -> char {
     }
 }
 
-fn make_aln(seq_query: &str, alignements: &Vec<Sam>) {
-    let n_line = 80;
-    let consumes_query = vec![b'M', b'I', b'S', b'=', b'X', b'H'];
-    // let consumes_ref   = vec![b'M', b'D', b'N', b'=', b'X'];
-    let consumes_ref   = vec![b'M', b'D', b'I', b'N', b'=', b'X'];
-    let no_bar_with = vec![b'I', b'H', b'S'];
-
-    let mut stack: Vec<Vec<u8>> = Vec::new();
-    stack.push(seq_query.as_bytes().to_vec());
-
-    for alignement in alignements {
-        let mut ref_seq = alignement.seq.clone().as_bytes().to_vec();
-        let mut cigar  = alignement.cigar.expand().as_bytes().to_vec();
-        if alignement.is_reverse() {
-            ref_seq = revcomp(&alignement.seq).as_bytes().to_vec();
-            cigar   = cigar.into_iter().rev().collect();
+/// return [start, stop] 1-based positions relative to reference and query
+pub fn ranges_from_cigar(ref_start: i32, cigar: Cigar, is_rc: bool) -> Vec<((i32, i32), (i32, i32))> {
+    let cigar = if is_rc { cigar.expand().chars().rev().collect::<Vec<char>>() } else { cigar.expand().chars().collect::<Vec<char>>() };
+    let mut ref_counter = 0;
+    let mut query_counter = 0;
+    
+    let mut query_matches: Vec<i32> = Vec::new();
+    let mut ref_matches: Vec<i32> = Vec::new();
+    for op in cigar.iter() {
+        match op {
+            'M' | '=' => {
+                query_matches.push(query_counter);
+                ref_matches.push(ref_counter);
+                
+                ref_counter   += 1;
+                query_counter += 1;
+            },
+            'S' | 'H' | 'I' => {
+                query_counter += 1;
+            },
+            'D' | 'N' => {
+                ref_counter += 1;
+            },
+            _ => panic!("Unknown cigar operand")
         }
-        let mut matches:     Vec<u8> = Vec::new();
-        let mut ref_matches: Vec<u8> = Vec::new();
-        let mut ref_seq = ref_seq.iter();
-        for pos in 0..cigar.len() {
-            let op = *cigar.get(pos).unwrap();
-            if op == b'M' {
-                matches.push(b'|');
-            } else {
-                if no_bar_with.contains(&op) { matches.push(b' '); }
-            }
-            if !consumes_ref.contains(&op) {
-                ref_matches.push(b' ');
+    }
+    
+    let mut query_ranges = Vec::new();
+    let mut acc_range = None;
+    for qp in query_matches.into_iter() {
+        if let Some((start, stop)) = acc_range {
+            if stop == qp - 1 {
+                acc_range = Some((start, qp));
             } else {
-                ref_matches.push(*ref_seq.next().unwrap());
+                query_ranges.push(acc_range.unwrap());
+                acc_range = Some((qp, qp));
             }
-        }
-        stack.push(matches);
-        stack.push(ref_matches);
+        } else {
+            acc_range = Some((qp, qp));
+        }   
     }
+    query_ranges.push(acc_range.unwrap());
+    query_ranges = query_ranges.iter().map(|e| (e.0 +1, e.1 +1)).collect();
 
-    let max_len = stack.iter().map(|e| e.len()).max().unwrap();
-
-    let mut lines: Vec<Vec<String>> = Vec::new();
-    let mut acc_lines: Vec<String> = vec![String::new(); stack.len()];
-    for p in 1..(max_len + 1) {
-        let pos = p - 1;
-        for (i, l) in stack.iter().enumerate() {
-            if let Some(al) = l.get(pos) {
-                acc_lines[i].push(*al as char);
+    let mut ref_ranges = Vec::new();
+    let mut acc_range = None;
+    for qp in ref_matches.into_iter() {
+        if let Some((start, stop)) = acc_range {
+            if stop == qp - 1 {
+                acc_range = Some((start, qp));
             } else {
-                acc_lines[i].push(' ');
+                query_ranges.push(acc_range.unwrap());
+                acc_range = Some((qp, qp));
             }
+        } else {
+            acc_range = Some((qp, qp));
+        }   
+    }
+    ref_ranges.push(acc_range.unwrap());
+
+    ref_ranges = ref_ranges.iter().map(|(start, stop)| {
+        if is_rc {
+            (ref_start + stop, ref_start + start)
+        } else {
+            (ref_start + start, ref_start + stop)
         }
+    }).collect();
+
+    ref_ranges.into_iter().zip(query_ranges.into_iter()).collect()
+}
 
-        if p % n_line == 0 {
-            lines.push(acc_lines.clone());
-            acc_lines.iter().for_each(|e| println!("{}", e));
-            acc_lines = vec![String::new(); stack.len()];
+pub fn format_seq(name: &str, sequence: &str, line_size: usize) -> String {
+    // todo check fasta name compl
+    let res = sequence.chars()
+    .collect::<Vec<char>>()
+    .chunks(line_size)
+    .map(|c| c.iter().collect::<String>())
+    .collect::<Vec<String>>()
+    .join("");
+    format!(">{name}\n{res}")
+}
+
+pub fn compose_seq(ref_path: &str, ranges: Vec<(String, i32, i32)>) -> String {
+    let fa = IndexedFasta::from_file(ref_path).expect("Error opening fa");
+
+    let mut sequence = String::new();
+    for (contig, start, stop) in ranges {
+        let chr_index = fa.fai().tid(&contig).expect("Cannot find chr in index");
+        if start < stop {
+            let v = fa.view(chr_index, start as usize,stop as usize).expect("Cannot get .fa view");
+            sequence = format!("{}{}", sequence, v.to_string());
+        } else {
+            let v = fa.view(chr_index, stop as usize, start as usize).expect("Cannot get .fa view");
+            sequence = format!("{}{}", sequence, revcomp(&v.to_string()));
         }
     }
-
+    sequence
 }