use duct::cmd; use faimm::IndexedFasta; fn main() { let ref_path = "/home/thomas/NGS/ref/hg19.fa"; // let sequence = " // CATTATATTCTTTTTTACAGTGCCACGGGTATATTTAGTAAACATTGAAATGTGTGGTTG // GAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTGTA // GACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATATTA // TTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACGGC // TGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTAGGTCCAAGAACTACAACGGCT // GCTGCCGGCCGGGCCCGCTCCCTGGCCTACCTTGCACAAGAAGTCGACATCGTAGAAGGC // GGACAGAAGTGTGGTCGACATGTTTCTGGAA" // .replace("\n", "") // .replace(" ", ""); // let sequence = " // CGTTATTCGTCGTGGCTCAAGCCCGGCCACGCCGCCCCAAGGGCTCCTCCCGACCTCCCG // GCCTGCCGCTCCGGCCACTGCGGGATCCAGAAACATGTCGACCACACTTCTGTCCGCCTT // CTACGATGTCGACTTCTTGTGCAAGGTAGGCCAGGGAGCGGGCCCGGCCGGCAGCAGCCG // TTGTAGTTCTTGGACCTAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGCCGT // GACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATAATA // TATGAGTTT" // .replace("\n", "") // .replace(" ", ""); //2011N160703-HAOAA 1 let sequence = " CCTCCCGCCCTGCCTTTCAGGTCGGGAAAGTCCCGGGGTTTGCAAAAGAGTGTCCGAGCG CCCTGGAGGCGGGGAGGGCGGCAAGGAGGGCGCCGGTGTCGCGGTTGAGTTTCTCCACTG CCGACCGCGGCCACGCTGCCCGGGGCTTCCCGGACAGGCTTCGCGCCGCCCACCTCGGCA GCCGGGGCGGA TGG CCACACAGGTTGGAGTGCATTAAGCCTTTGTCCAAAAACACCCAGC CGTGACCCGCTATGTATGTCTCAGCATTGGGAAGAGTCCTCTGAGTGTCATGGGAAAATA ATATATGAGTT" .replace("\n", "") .replace(" ", ""); //2011N160703-HAOAA 2 // let sequence = " // TGGAATTGTTCATCACTGATGTTTTTGGTGATATTTGTTTATGTTCTGAAGCTATTGCTG // TAGACCAACATGGAGTAAACAGAAAATAATTGGGACACAGGGGCAATAAAACTCATATAT // TATTTTCCCATGACACTCAGAGGACTCTTCCCAATGCTGAGACATACATAGCGGGTCACG // GCTGGGTGTTTTTGGACAAAGGCTTAATGCACTCCAACCTGTGTGGCCATCCGCCCCGGC // TGCCGAGGTGGGCGGCGCGAAGCCTGTCCGGGAAGCCCCGGGCAGCGTGGCCGCGGTCGG // CAGTGGAGAAACTCAACCGCGACACC" // .replace("\n", "") // .replace(" ", ""); let alns = Alignments::to_reference(&sequence, ref_path); alns.ranges.iter().for_each(|r| println!("{:?}", r)); } pub fn bwa_aln(ref_path: &str, query_sequence: &str) -> Vec { let stdout = cmd!("echo", "-e", format!("\">sequence_to_query\n{}\"", query_sequence)) .pipe(cmd!("bwa", "mem", ref_path, "-")) .stdout_capture() .stderr_capture() .read() .unwrap(); stdout.split("\n") .filter(|e| e.contains("sequence_to_query")) .map(|e| Sam::new_from_bwa_output(e)) .into_iter() .map(|mut e| { // if the sequence is returned by bwa with an N at first position remove it if e.seq.len() != query_sequence.len() { if e.seq.chars().nth(0).unwrap() == 'N' { let (_, seq) = e.seq.split_at(1); e.seq = seq.to_string(); let mut cig = e.cigar.0.get_mut(0).unwrap(); cig.1 -= 1; } if e.seq.chars().last().unwrap() == 'N' { let (seq, _) = e.seq.split_at(e.seq.len()); e.seq = seq.to_string(); let cl = e.cigar.0.len(); let mut cig = e.cigar.0.get_mut(cl - 1).unwrap(); cig.1 -= 1; } } e }) .collect() } #[derive(Debug, Clone)] pub struct Sam { pub qname : String, pub flag : u16, pub ref_name: String, pub pos : i32, pub mapq : i32, pub cigar : Cigar, pub rnext : String, pub pnext : String, pub tlen : String, pub seq : String } impl Sam { pub fn new_from_bwa_output(sam_string: &str) -> Sam { let mut split = sam_string.split("\t"); Sam { qname : split.next().unwrap().to_string(), flag : split.next().unwrap().parse().unwrap(), ref_name: split.next().unwrap().to_string(), pos : split.next().unwrap().parse().unwrap(), mapq : split.next().unwrap().parse().unwrap(), cigar : Cigar::new_from_str(&split.next().unwrap().to_string()), rnext : split.next().unwrap().to_string(), pnext : split.next().unwrap().to_string(), tlen : split.next().unwrap().to_string(), seq : split.next().unwrap().to_string() } } pub fn is_rc(&self) -> bool { self.flag & 0x10 == 0x10 } } #[derive(Debug, Clone)] pub struct Cigar(pub Vec<(String, u32)>); impl Cigar { pub fn new_from_str(cigar_str: &str) -> Cigar { let mut res: Vec<(String, u32)> = Vec::new(); let mut num_acc = String::new(); for c in cigar_str.split("") { if c != "" { match c.parse::() { Ok(_) => num_acc.push_str(c), // if a number added to the accumulator Err(_) => { let current_add = num_acc.parse::().unwrap(); num_acc = "".to_string(); res.push((c.to_string(), current_add)); } }} } Cigar(res) } pub fn len(&self) -> u32 { self.0.iter() .map(|e| e.1) .reduce(|acc, e| acc + e) .unwrap() } pub fn expand(&self) -> String { let mut res = String::new(); for (op, n) in self.0.iter() { let c = op.chars().next().unwrap(); for _ in 0..(*n) { res.push(c); } } res } } pub struct Alignments { pub query_sequence : String, pub sam : Vec, pub ranges : Vec>, } impl Alignments { pub fn to_reference(query_sequence: &str, ref_path: &str) -> Alignments { let res_sam = bwa_aln(ref_path, query_sequence); let mut res_ranges = Vec::new(); for sam in &res_sam { res_ranges.push(ranges_from_cigar(sam.pos, sam.cigar.clone(), sam.is_rc())); } Alignments { query_sequence: query_sequence.to_string(), sam: res_sam, ranges: res_ranges } } } pub fn revcomp(dna: &str) -> String { let mut rdna: String = String::with_capacity(dna.len()); for c in dna.chars().rev() { rdna.push(switch_base(c)); } rdna } pub fn switch_base(c: char) -> char { match c { 'a' => 't', 'c' => 'g', 't' => 'a', 'g' => 'c', 'u' => 'a', 'A' => 'T', 'C' => 'G', 'T' => 'A', 'G' => 'C', 'U' => 'A', _ => 'N' } } /// return [start, stop] 1-based positions relative to reference and query pub fn ranges_from_cigar(ref_start: i32, cigar: Cigar, is_rc: bool) -> Vec<((i32, i32), (i32, i32))> { let cigar = if is_rc { cigar.expand().chars().rev().collect::>() } else { cigar.expand().chars().collect::>() }; let mut ref_counter = 0; let mut query_counter = 0; let mut query_matches: Vec = Vec::new(); let mut ref_matches: Vec = Vec::new(); for op in cigar.iter() { match op { 'M' | '=' => { query_matches.push(query_counter); ref_matches.push(ref_counter); ref_counter += 1; query_counter += 1; }, 'S' | 'H' | 'I' => { query_counter += 1; }, 'D' | 'N' => { ref_counter += 1; }, _ => panic!("Unknown cigar operand") } } let mut query_ranges = Vec::new(); let mut acc_range = None; for qp in query_matches.into_iter() { if let Some((start, stop)) = acc_range { if stop == qp - 1 { acc_range = Some((start, qp)); } else { query_ranges.push(acc_range.unwrap()); acc_range = Some((qp, qp)); } } else { acc_range = Some((qp, qp)); } } query_ranges.push(acc_range.unwrap()); query_ranges = query_ranges.iter().map(|e| (e.0 +1, e.1 +1)).collect(); let mut ref_ranges = Vec::new(); let mut acc_range = None; for qp in ref_matches.into_iter() { if let Some((start, stop)) = acc_range { if stop == qp - 1 { acc_range = Some((start, qp)); } else { query_ranges.push(acc_range.unwrap()); acc_range = Some((qp, qp)); } } else { acc_range = Some((qp, qp)); } } ref_ranges.push(acc_range.unwrap()); ref_ranges = ref_ranges.iter().map(|(start, stop)| { if is_rc { (ref_start + stop, ref_start + start) } else { (ref_start + start, ref_start + stop) } }).collect(); ref_ranges.into_iter().zip(query_ranges.into_iter()).collect() } pub fn format_seq(name: &str, sequence: &str, line_size: usize) -> String { // todo check fasta name compl let res = sequence.chars() .collect::>() .chunks(line_size) .map(|c| c.iter().collect::()) .collect::>() .join(""); format!(">{name}\n{res}") } pub fn compose_seq(ref_path: &str, ranges: Vec<(String, i32, i32)>) -> String { let fa = IndexedFasta::from_file(ref_path).expect("Error opening fa"); let mut sequence = String::new(); for (contig, start, stop) in ranges { let chr_index = fa.fai().tid(&contig).expect("Cannot find chr in index"); if start < stop { let v = fa.view(chr_index, start as usize,stop as usize).expect("Cannot get .fa view"); sequence = format!("{}{}", sequence, v.to_string()); } else { let v = fa.view(chr_index, stop as usize, start as usize).expect("Cannot get .fa view"); sequence = format!("{}{}", sequence, revcomp(&v.to_string())); } } sequence }