| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309 |
- use duct::cmd;
- use faimm::IndexedFasta;
- fn main() {
- let ref_path = "/home/thomas/NGS/ref/hg19.fa";
- // let sequence = "
- // CATTATATTCTTTTTTACAGTGCCACGGGTATATTTAGTAAACATTGAAATGTGTGGTTG
- // GAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTGTA
- // GACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATATTA
- // TTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACGGC
- // TGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTAGGTCCAAGAACTACAACGGCT
- // GCTGCCGGCCGGGCCCGCTCCCTGGCCTACCTTGCACAAGAAGTCGACATCGTAGAAGGC
- // GGACAGAAGTGTGGTCGACATGTTTCTGGAA"
- // .replace("\n", "")
- // .replace(" ", "");
- // let sequence = "
- // CGTTATTCGTCGTGGCTCAAGCCCGGCCACGCCGCCCCAAGGGCTCCTCCCGACCTCCCG
- // GCCTGCCGCTCCGGCCACTGCGGGATCCAGAAACATGTCGACCACACTTCTGTCCGCCTT
- // CTACGATGTCGACTTCTTGTGCAAGGTAGGCCAGGGAGCGGGCCCGGCCGGCAGCAGCCG
- // TTGTAGTTCTTGGACCTAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGCCGT
- // GACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATAATA
- // TATGAGTTT"
- // .replace("\n", "")
- // .replace(" ", "");
- //2011N160703-HAOAA 1
- let sequence = "
- CCTCCCGCCCTGCCTTTCAGGTCGGGAAAGTCCCGGGGTTTGCAAAAGAGTGTCCGAGCG
- CCCTGGAGGCGGGGAGGGCGGCAAGGAGGGCGCCGGTGTCGCGGTTGAGTTTCTCCACTG
- CCGACCGCGGCCACGCTGCCCGGGGCTTCCCGGACAGGCTTCGCGCCGCCCACCTCGGCA
- GCCGGGGCGGA
- TGG
- CCACACAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGC
- CGTGACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATA
- ATATATGAGTT"
- .replace("\n", "")
- .replace(" ", "");
- //2011N160703-HAOAA 2
- // let sequence = "
- // TGGAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTG
- // TAGACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATAT
- // TATTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACG
- // GCTGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTGTGTGGCCATCCGCCCCGGC
- // TGCCGAGGTGGGCGGCGCGAAGCCTGTCCGGGAAGCCCCGGGCAGCGTGGCCGCGGTCGG
- // CAGTGGAGAAACTCAACCGCGACACC"
- // .replace("\n", "")
- // .replace(" ", "");
-
- let alns = Alignments::to_reference(&sequence, ref_path);
- alns.ranges.iter().for_each(|r| println!("{:?}", r));
- }
- pub fn bwa_aln(ref_path: &str, query_sequence: &str) -> Vec<Sam> {
- let stdout = cmd!("echo", "-e", format!("\">sequence_to_query\n{}\"", query_sequence))
- .pipe(cmd!("bwa", "mem", ref_path, "-"))
- .stdout_capture()
- .stderr_capture()
- .read()
- .unwrap();
-
- stdout.split("\n")
- .filter(|e| e.contains("sequence_to_query"))
- .map(|e| Sam::new_from_bwa_output(e))
- .into_iter()
- .map(|mut e| {
- // if the sequence is returned by bwa with an N at first position remove it
- if e.seq.len() != query_sequence.len() {
- if e.seq.chars().nth(0).unwrap() == 'N' {
- let (_, seq) = e.seq.split_at(1);
- e.seq = seq.to_string();
- let mut cig = e.cigar.0.get_mut(0).unwrap();
- cig.1 -= 1;
- }
- if e.seq.chars().last().unwrap() == 'N' {
- let (seq, _) = e.seq.split_at(e.seq.len());
- e.seq = seq.to_string();
- let cl = e.cigar.0.len();
- let mut cig = e.cigar.0.get_mut(cl - 1).unwrap();
- cig.1 -= 1;
- }
- }
- e
- })
- .collect()
- }
- #[derive(Debug, Clone)]
- pub struct Sam {
- pub qname : String,
- pub flag : u16,
- pub ref_name: String,
- pub pos : i32,
- pub mapq : i32,
- pub cigar : Cigar,
- pub rnext : String,
- pub pnext : String,
- pub tlen : String,
- pub seq : String
- }
- impl Sam {
- pub fn new_from_bwa_output(sam_string: &str) -> Sam {
- let mut split = sam_string.split("\t");
- Sam {
- qname : split.next().unwrap().to_string(),
- flag : split.next().unwrap().parse().unwrap(),
- ref_name: split.next().unwrap().to_string(),
- pos : split.next().unwrap().parse().unwrap(),
- mapq : split.next().unwrap().parse().unwrap(),
- cigar : Cigar::new_from_str(&split.next().unwrap().to_string()),
- rnext : split.next().unwrap().to_string(),
- pnext : split.next().unwrap().to_string(),
- tlen : split.next().unwrap().to_string(),
- seq : split.next().unwrap().to_string()
- }
- }
- pub fn is_rc(&self) -> bool {
- self.flag & 0x10 == 0x10
- }
- }
- #[derive(Debug, Clone)]
- pub struct Cigar(pub Vec<(String, u32)>);
- impl Cigar {
- pub fn new_from_str(cigar_str: &str) -> Cigar {
- let mut res: Vec<(String, u32)> = Vec::new();
- let mut num_acc = String::new();
- for c in cigar_str.split("") {
- if c != "" { match c.parse::<usize>() {
- Ok(_) => num_acc.push_str(c), // if a number added to the accumulator
- Err(_) => {
- let current_add = num_acc.parse::<u32>().unwrap();
- num_acc = "".to_string();
- res.push((c.to_string(), current_add));
- }
- }}
- }
- Cigar(res)
- }
- pub fn len(&self) -> u32 {
- self.0.iter()
- .map(|e| e.1)
- .reduce(|acc, e| acc + e)
- .unwrap()
- }
- pub fn expand(&self) -> String {
- let mut res = String::new();
- for (op, n) in self.0.iter() {
- let c = op.chars().next().unwrap();
- for _ in 0..(*n) {
- res.push(c);
- }
- }
- res
- }
- }
- pub struct Alignments {
- pub query_sequence : String,
- pub sam : Vec<Sam>,
- pub ranges : Vec<Vec<((i32, i32), (i32, i32))>>,
- }
- impl Alignments {
- pub fn to_reference(query_sequence: &str, ref_path: &str) -> Alignments {
- let res_sam = bwa_aln(ref_path, query_sequence);
-
- let mut res_ranges = Vec::new();
- for sam in &res_sam {
- res_ranges.push(ranges_from_cigar(sam.pos, sam.cigar.clone(), sam.is_rc()));
- }
-
- Alignments {
- query_sequence: query_sequence.to_string(),
- sam: res_sam,
- ranges: res_ranges
- }
- }
- }
- pub fn revcomp(dna: &str) -> String {
- let mut rdna: String = String::with_capacity(dna.len());
- for c in dna.chars().rev() {
- rdna.push(switch_base(c));
- }
- rdna
- }
- pub fn switch_base(c: char) -> char {
- match c {
- 'a' => 't',
- 'c' => 'g',
- 't' => 'a',
- 'g' => 'c',
- 'u' => 'a',
- 'A' => 'T',
- 'C' => 'G',
- 'T' => 'A',
- 'G' => 'C',
- 'U' => 'A',
- _ => 'N'
- }
- }
- /// return [start, stop] 1-based positions relative to reference and query
- pub fn ranges_from_cigar(ref_start: i32, cigar: Cigar, is_rc: bool) -> Vec<((i32, i32), (i32, i32))> {
- let cigar = if is_rc { cigar.expand().chars().rev().collect::<Vec<char>>() } else { cigar.expand().chars().collect::<Vec<char>>() };
- let mut ref_counter = 0;
- let mut query_counter = 0;
-
- let mut query_matches: Vec<i32> = Vec::new();
- let mut ref_matches: Vec<i32> = Vec::new();
- for op in cigar.iter() {
- match op {
- 'M' | '=' => {
- query_matches.push(query_counter);
- ref_matches.push(ref_counter);
-
- ref_counter += 1;
- query_counter += 1;
- },
- 'S' | 'H' | 'I' => {
- query_counter += 1;
- },
- 'D' | 'N' => {
- ref_counter += 1;
- },
- _ => panic!("Unknown cigar operand")
- }
- }
-
- let mut query_ranges = Vec::new();
- let mut acc_range = None;
- for qp in query_matches.into_iter() {
- if let Some((start, stop)) = acc_range {
- if stop == qp - 1 {
- acc_range = Some((start, qp));
- } else {
- query_ranges.push(acc_range.unwrap());
- acc_range = Some((qp, qp));
- }
- } else {
- acc_range = Some((qp, qp));
- }
- }
- query_ranges.push(acc_range.unwrap());
- query_ranges = query_ranges.iter().map(|e| (e.0 +1, e.1 +1)).collect();
- let mut ref_ranges = Vec::new();
- let mut acc_range = None;
- for qp in ref_matches.into_iter() {
- if let Some((start, stop)) = acc_range {
- if stop == qp - 1 {
- acc_range = Some((start, qp));
- } else {
- query_ranges.push(acc_range.unwrap());
- acc_range = Some((qp, qp));
- }
- } else {
- acc_range = Some((qp, qp));
- }
- }
- ref_ranges.push(acc_range.unwrap());
- ref_ranges = ref_ranges.iter().map(|(start, stop)| {
- if is_rc {
- (ref_start + stop, ref_start + start)
- } else {
- (ref_start + start, ref_start + stop)
- }
- }).collect();
- ref_ranges.into_iter().zip(query_ranges.into_iter()).collect()
- }
- pub fn format_seq(name: &str, sequence: &str, line_size: usize) -> String {
- // todo check fasta name compl
- let res = sequence.chars()
- .collect::<Vec<char>>()
- .chunks(line_size)
- .map(|c| c.iter().collect::<String>())
- .collect::<Vec<String>>()
- .join("");
- format!(">{name}\n{res}")
- }
- pub fn compose_seq(ref_path: &str, ranges: Vec<(String, i32, i32)>) -> String {
- let fa = IndexedFasta::from_file(ref_path).expect("Error opening fa");
- let mut sequence = String::new();
- for (contig, start, stop) in ranges {
- let chr_index = fa.fai().tid(&contig).expect("Cannot find chr in index");
- if start < stop {
- let v = fa.view(chr_index, start as usize,stop as usize).expect("Cannot get .fa view");
- sequence = format!("{}{}", sequence, v.to_string());
- } else {
- let v = fa.view(chr_index, stop as usize, start as usize).expect("Cannot get .fa view");
- sequence = format!("{}{}", sequence, revcomp(&v.to_string()));
- }
- }
- sequence
- }
|