| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106 |
- use log::{info, warn};
- use std::{
- fs::{self, File}, io::{BufRead, BufReader}, path::Path, process::{Command, Stdio}
- };
- pub fn run_flye(fasta_path: &str, tmp_dir: &str, _size: &str) {
- info!("Running Flye for {fasta_path}");
- let flye_bin = "/data/tools/Flye/bin/flye";
- let mut cmd = Command::new(flye_bin)
- .arg("--threads")
- .arg("12")
- // .arg("--keep-haplotypes")
- // .arg("--meta")
- .arg("--min-overlap")
- .arg("1000")
- .arg("--out-dir")
- .arg(tmp_dir)
- .arg("--nano-hq")
- .arg(fasta_path)
- .stderr(Stdio::piped())
- .spawn()
- .expect("Flye failed to start");
- let stderr = cmd.stderr.take().unwrap();
- let reader = BufReader::new(stderr);
- reader
- .lines()
- .map_while(Result::ok)
- .filter(|line| line.contains("ERROR"))
- .for_each(|line| warn!("[FLYE] {line}"));
- cmd.wait().unwrap();
- cmd.kill().unwrap();
- }
- pub fn read_flye_coverage(path: &str, min_cov: i32, contig_name: &str) -> (usize, usize) {
- use std::io::Read;
- let mut reader = File::open(path).map(flate2::read::GzDecoder::new).unwrap();
- let mut buf = Vec::new();
- reader.read_to_end(&mut buf).unwrap();
- let mut line_acc = Vec::new();
- let mut start = None;
- let mut end = None;
- let mut last_end = 0;
- for b in buf.iter() {
- match b {
- b'\n' => {
- let s = String::from_utf8(line_acc.clone()).unwrap();
- line_acc.clear();
- if !s.starts_with(&format!("{contig_name}\t")) {
- continue;
- }
- let s = s.split('\t').collect::<Vec<&str>>();
- let cov: i32 = s.get(3).unwrap().parse().unwrap();
- if start.is_none() && cov >= min_cov {
- let st: i32 = s.get(1).unwrap().parse().unwrap();
- start = Some(st);
- } else if end.is_none() && start.is_some() && cov < min_cov {
- let en: i32 = s.get(1).unwrap().parse().unwrap();
- end = Some(en);
- break;
- }
- last_end = s.get(2).unwrap().parse().unwrap();
- }
- _ => line_acc.push(*b),
- }
- }
- (start.unwrap() as usize, end.unwrap_or(last_end) as usize)
- }
- pub fn assemble_flye(reads_path: &str) -> anyhow::Result<Option<Vec<String>>> {
- let min_cov = 2;
- let tmp_dir = format!("/tmp/ass_{}", uuid::Uuid::new_v4());
- fs::create_dir(tmp_dir.clone()).unwrap();
- run_flye(reads_path, &tmp_dir, "10M");
- let contigs_path = format!("{tmp_dir}/assembly.fasta");
- let mut res = None;
- if Path::new(&contigs_path).exists() {
- let assembly_path = format!("{tmp_dir}/assembly.fasta");
- let flye_cov_path = format!("{tmp_dir}/40-polishing/base_coverage.bed.gz");
- let mut reader = File::open(assembly_path)
- .map(BufReader::new)
- .map(noodles_fasta::Reader::new)?;
- let mut contigs = Vec::new();
- for result in reader.records() {
- let record = result?;
- let contig_name = String::from_utf8(record.name().to_vec()).unwrap();
- let (s, e) = read_flye_coverage(&flye_cov_path, min_cov, &contig_name);
- let seq = record.sequence().as_ref();
- let seq = String::from_utf8(seq.to_vec()).unwrap();
- let seq: String = seq[s..e].into();
- contigs.push(seq.clone());
- }
- res = Some(contigs);
- }
- fs::remove_dir_all(tmp_dir)?;
- Ok(res)
- }
|