|
@@ -0,0 +1,241 @@
|
|
|
|
|
+use log::{info, warn};
|
|
|
|
|
+use std::{
|
|
|
|
|
+ fs::{self, File},
|
|
|
|
|
+ io::{BufRead, BufReader, Read},
|
|
|
|
|
+ path::{Path, PathBuf},
|
|
|
|
|
+ process::{Command, Stdio},
|
|
|
|
|
+};
|
|
|
|
|
+use super::Assemble;
|
|
|
|
|
+use crate::{assembler::AssembleError, io::{
|
|
|
|
|
+ bam::{cp_mod_tags, read_bam},
|
|
|
|
|
+ fasta::{fai, records_to_fasta, write_fasta},
|
|
|
|
|
+}};
|
|
|
|
|
+
|
|
|
|
|
/// Settings for one Flye assembly run.
#[derive(Clone, Debug)]
pub struct FlyeConfig {
    /// Directory that receives the per-contig output files written by `save`.
    pub output_dir: PathBuf,

    /// Coverage threshold handed to `read_flye_coverage` when trimming contigs.
    pub min_cov: u16,
    /// Minimum number of reads.
    // NOTE(review): not read anywhere in the code visible in this file — confirm where it is used.
    pub min_reads: u32,
    /// Value passed to Flye's `--threads` option.
    pub threads: u16,
    /// Value passed to Flye's `--min-overlap` option.
    pub min_overlap: u32,
    /// Program that is spawned to run Flye.
    pub flye_bin: String,
}
|
|
|
|
|
+
|
|
|
|
|
+impl Default for FlyeConfig {
|
|
|
|
|
+ fn default() -> Self {
|
|
|
|
|
+ Self {
|
|
|
|
|
+ output_dir: PathBuf::new(),
|
|
|
|
|
+ min_cov: 2,
|
|
|
|
|
+ min_reads: 3,
|
|
|
|
|
+ threads: 12,
|
|
|
|
|
+ min_overlap: 1000,
|
|
|
|
|
+ flye_bin: "/data/tools/Flye/bin/flye".to_string(),
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+impl FlyeConfig {
|
|
|
|
|
+ pub fn new(output_dir: &str) -> Self {
|
|
|
|
|
+ FlyeConfig { output_dir: PathBuf::from(output_dir), ..Default::default() }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+#[derive(Debug)]
|
|
|
|
|
+pub struct Flye {
|
|
|
|
|
+ pub config: FlyeConfig,
|
|
|
|
|
+ pub input_id: String,
|
|
|
|
|
+
|
|
|
|
|
+ pub input_records: Vec<bam::Record>,
|
|
|
|
|
+ pub on_contig_bam: String,
|
|
|
|
|
+ pub contigs: Option<Vec<String>>,
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+impl Assemble for Flye {
|
|
|
|
|
+ fn init(
|
|
|
|
|
+ input_bam: &Path,
|
|
|
|
|
+ config: &super::AssembleConfig,
|
|
|
|
|
+ ) -> anyhow::Result<Self> {
|
|
|
|
|
+ match config {
|
|
|
|
|
+ super::AssembleConfig::Flye(config) => {
|
|
|
|
|
+ let input_id = input_bam.file_stem().unwrap().to_str().unwrap().to_string();
|
|
|
|
|
+ let input_records = read_bam(input_bam)?;
|
|
|
|
|
+
|
|
|
|
|
+ Ok(Self {
|
|
|
|
|
+ config: config.clone(),
|
|
|
|
|
+ input_id,
|
|
|
|
|
+ input_records,
|
|
|
|
|
+ on_contig_bam: String::new(),
|
|
|
|
|
+ contigs: None,
|
|
|
|
|
+ })
|
|
|
|
|
+ }
|
|
|
|
|
+ _ => Err(anyhow::anyhow!("Wrong config format for Flye.")),
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ fn assemble(mut self) -> anyhow::Result<Self> {
|
|
|
|
|
+ let min_cov = self.config.min_cov;
|
|
|
|
|
+
|
|
|
|
|
+ let tmp_dir = format!("/tmp/ass_{}", uuid::Uuid::new_v4());
|
|
|
|
|
+ info!("Creating tmp directory {tmp_dir}");
|
|
|
|
|
+ fs::create_dir(&tmp_dir).unwrap();
|
|
|
|
|
+
|
|
|
|
|
+ let input_fa = format!("{}/{}.fasta", tmp_dir, self.input_id);
|
|
|
|
|
+ if !Path::new(&input_fa).exists() {
|
|
|
|
|
+ records_to_fasta(&self.input_records, &input_fa)?;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let flye_tmp_dir = format!("{tmp_dir}/flye");
|
|
|
|
|
+ fs::create_dir(&flye_tmp_dir).unwrap();
|
|
|
|
|
+
|
|
|
|
|
+ run_flye(&input_fa, &flye_tmp_dir, &self.config);
|
|
|
|
|
+
|
|
|
|
|
+ let contigs_path = format!("{flye_tmp_dir}/assembly.fasta");
|
|
|
|
|
+
|
|
|
|
|
+ if Path::new(&contigs_path).exists() {
|
|
|
|
|
+ let assembly_path = format!("{flye_tmp_dir}/assembly.fasta");
|
|
|
|
|
+ let flye_cov_path = format!("{flye_tmp_dir}/40-polishing/base_coverage.bed.gz");
|
|
|
|
|
+
|
|
|
|
|
+ // Read the assembly fasta
|
|
|
|
|
+ let mut reader = File::open(assembly_path)
|
|
|
|
|
+ .map(BufReader::new)
|
|
|
|
|
+ .map(noodles_fasta::Reader::new)?;
|
|
|
|
|
+
|
|
|
|
|
+ let mut contigs = Vec::new();
|
|
|
|
|
+ for result in reader.records() {
|
|
|
|
|
+ let record = result?;
|
|
|
|
|
+ let contig_name = String::from_utf8(record.name().to_vec()).unwrap();
|
|
|
|
|
+ let (s, e) = read_flye_coverage(&flye_cov_path, min_cov.into(), &contig_name);
|
|
|
|
|
+ let seq = record.sequence().as_ref();
|
|
|
|
|
+ let seq = String::from_utf8(seq.to_vec()).unwrap();
|
|
|
|
|
+ let seq: String = seq[s..e].into();
|
|
|
|
|
+ contigs.push(seq);
|
|
|
|
|
+ }
|
|
|
|
|
+ self.contigs = Some(contigs);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ // warn!("No reads assembled for {}", self.input_id);
|
|
|
|
|
+ anyhow::bail!(AssembleError::NoContig(self.input_id));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // Cleaning
|
|
|
|
|
+ fs::remove_dir_all(tmp_dir)?;
|
|
|
|
|
+
|
|
|
|
|
+ Ok(self)
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ fn save(self) -> anyhow::Result<()> {
|
|
|
|
|
+ if self.contigs.is_none() {
|
|
|
|
|
+ anyhow::bail!(AssembleError::NoContig(self.input_id));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ for (i, contig) in self.contigs.unwrap().iter().enumerate() {
|
|
|
|
|
+ let suffixe = if i == 0 {
|
|
|
|
|
+ "".to_string()
|
|
|
|
|
+ } else {
|
|
|
|
|
+ format!("_{i}")
|
|
|
|
|
+ };
|
|
|
|
|
+
|
|
|
|
|
+ if !self.config.output_dir.exists() {
|
|
|
|
|
+ fs::create_dir_all(&self.config.output_dir)?;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let contig_id = format!("{}{suffixe}_flye", self.input_id);
|
|
|
|
|
+ let contig_fa = format!("{}/{contig_id}.fa", self.config.output_dir.display());
|
|
|
|
|
+
|
|
|
|
|
+ info!("Saving contig {contig_id} in {contig_fa}");
|
|
|
|
|
+ write_fasta(&contig_fa, &vec![(contig_id.clone(), contig.clone())]);
|
|
|
|
|
+ fai(&contig_fa)?;
|
|
|
|
|
+
|
|
|
|
|
+ // Writing bed file from best blastn results
|
|
|
|
|
+ // let bed_path = format!("{}/{contig_id}.bed", self.output_dir.display());
|
|
|
|
|
+ // let bed = pandora_lib_blastn::Blast::init(&contig_fa)?
|
|
|
|
|
+ // .run()?
|
|
|
|
|
+ // .keep_best()
|
|
|
|
|
+ // .to_bed()?;
|
|
|
|
|
+ // let mut f = File::create(bed_path)?;
|
|
|
|
|
+ // f.write_all(bed.as_bytes())?;
|
|
|
|
|
+ //
|
|
|
|
|
+ // Remaping input bam to contig
|
|
|
|
|
+ info!("Mapping input reads to {contig_id}");
|
|
|
|
|
+ let new_bam = format!("{}/{contig_id}.bam", self.config.output_dir.display());
|
|
|
|
|
+ duct::cmd!("bwa", "index", contig_fa.clone()).run()?;
|
|
|
|
|
+ let input_fa = format!("{}/{}.fasta", self.config.output_dir.display(), self.input_id);
|
|
|
|
|
+ let bwa = format!("bwa mem {contig_fa} {input_fa}");
|
|
|
|
|
+ let samtools = "samtools sort /dev/stdin";
|
|
|
|
|
+ let pipe = format!("{bwa} | {samtools} > {new_bam}");
|
|
|
|
|
+ duct::cmd!("bash", "-c", pipe).run()?;
|
|
|
|
|
+
|
|
|
|
|
+ // Copy modified base tags to new bam
|
|
|
|
|
+ cp_mod_tags(&self.input_records, &new_bam)?;
|
|
|
|
|
+
|
|
|
|
|
+ // Run modkit
|
|
|
|
|
+ let modkit_pileup = format!("{}/{contig_id}_mod.bed", self.config.output_dir.display());
|
|
|
|
|
+ duct::cmd!("modkit", "pileup", new_bam, modkit_pileup).run()?;
|
|
|
|
|
+ }
|
|
|
|
|
+ Ok(())
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+pub fn run_flye(fasta_path: &str, tmp_dir: &str, config: &FlyeConfig) {
|
|
|
|
|
+ info!("Running Flye for {fasta_path}");
|
|
|
|
|
+ let mut cmd = Command::new(&config.flye_bin)
|
|
|
|
|
+ .arg("--threads")
|
|
|
|
|
+ .arg(config.threads.to_string())
|
|
|
|
|
+ // .arg("--keep-haplotypes")
|
|
|
|
|
+ // .arg("--meta")
|
|
|
|
|
+ .arg("--min-overlap")
|
|
|
|
|
+ .arg(config.min_overlap.to_string())
|
|
|
|
|
+ .arg("--out-dir")
|
|
|
|
|
+ .arg(tmp_dir)
|
|
|
|
|
+ .arg("--nano-hq")
|
|
|
|
|
+ .arg(fasta_path)
|
|
|
|
|
+ .stderr(Stdio::piped())
|
|
|
|
|
+ .spawn()
|
|
|
|
|
+ .expect("Flye failed to start");
|
|
|
|
|
+
|
|
|
|
|
+ let stderr = cmd.stderr.take().unwrap();
|
|
|
|
|
+ let reader = BufReader::new(stderr);
|
|
|
|
|
+ reader
|
|
|
|
|
+ .lines()
|
|
|
|
|
+ .map_while(Result::ok)
|
|
|
|
|
+ .filter(|line| line.contains("ERROR"))
|
|
|
|
|
+ .for_each(|line| warn!("[FLYE] {line}"));
|
|
|
|
|
+
|
|
|
|
|
+ cmd.wait().unwrap();
|
|
|
|
|
+ cmd.kill().unwrap();
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+pub fn read_flye_coverage(path: &str, min_cov: i32, contig_name: &str) -> (usize, usize) {
|
|
|
|
|
+ let mut reader = File::open(path).map(flate2::read::GzDecoder::new).unwrap();
|
|
|
|
|
+ let mut buf = Vec::new();
|
|
|
|
|
+ reader.read_to_end(&mut buf).unwrap();
|
|
|
|
|
+
|
|
|
|
|
+ let mut line_acc = Vec::new();
|
|
|
|
|
+ let mut start = None;
|
|
|
|
|
+ let mut end = None;
|
|
|
|
|
+ let mut last_end = 0;
|
|
|
|
|
+ for b in buf.iter() {
|
|
|
|
|
+ match b {
|
|
|
|
|
+ b'\n' => {
|
|
|
|
|
+ let s = String::from_utf8(line_acc.clone()).unwrap();
|
|
|
|
|
+ line_acc.clear();
|
|
|
|
|
+ if !s.starts_with(&format!("{contig_name}\t")) {
|
|
|
|
|
+ continue;
|
|
|
|
|
+ }
|
|
|
|
|
+ let s = s.split('\t').collect::<Vec<&str>>();
|
|
|
|
|
+ let cov: i32 = s.get(3).unwrap().parse().unwrap();
|
|
|
|
|
+ if start.is_none() && cov >= min_cov {
|
|
|
|
|
+ let st: i32 = s.get(1).unwrap().parse().unwrap();
|
|
|
|
|
+ start = Some(st);
|
|
|
|
|
+ } else if end.is_none() && start.is_some() && cov < min_cov {
|
|
|
|
|
+ let en: i32 = s.get(1).unwrap().parse().unwrap();
|
|
|
|
|
+ end = Some(en);
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ last_end = s.get(2).unwrap().parse().unwrap();
|
|
|
|
|
+ }
|
|
|
|
|
+ _ => line_acc.push(*b),
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ (start.unwrap() as usize, end.unwrap_or(last_end) as usize)
|
|
|
|
|
+}
|