|
|
@@ -1,15 +1,19 @@
|
|
|
+use super::Assemble;
|
|
|
+use crate::{
|
|
|
+ assembler::{calculate_shannon_entropy, remove_bwa_indices, AssembleError},
|
|
|
+ io::{
|
|
|
+ bam::{cp_mod_tags, read_bam, run_modkit},
|
|
|
+ fasta::{fai, records_to_fasta, write_fasta},
|
|
|
+ },
|
|
|
+};
|
|
|
use log::{info, warn};
|
|
|
+use seq_io::fasta::Record;
|
|
|
use std::{
|
|
|
fs::{self, File},
|
|
|
io::{BufRead, BufReader, Read},
|
|
|
path::{Path, PathBuf},
|
|
|
process::{Command, Stdio},
|
|
|
};
|
|
|
-use super::Assemble;
|
|
|
-use crate::{assembler::AssembleError, io::{
|
|
|
- bam::{cp_mod_tags, read_bam},
|
|
|
- fasta::{fai, records_to_fasta, write_fasta},
|
|
|
-}};
|
|
|
|
|
|
#[derive(Debug, Clone)]
|
|
|
pub struct FlyeConfig {
|
|
|
@@ -17,6 +21,7 @@ pub struct FlyeConfig {
|
|
|
|
|
|
pub min_cov: u16,
|
|
|
pub min_reads: u32,
|
|
|
+ pub min_entropy: f64,
|
|
|
pub threads: u16,
|
|
|
pub min_overlap: u32,
|
|
|
pub flye_bin: String,
|
|
|
@@ -28,6 +33,8 @@ impl Default for FlyeConfig {
|
|
|
output_dir: PathBuf::new(),
|
|
|
min_cov: 2,
|
|
|
min_reads: 3,
|
|
|
+ min_entropy: 0.2,
|
|
|
+
|
|
|
threads: 12,
|
|
|
min_overlap: 1000,
|
|
|
flye_bin: "/data/tools/Flye/bin/flye".to_string(),
|
|
|
@@ -37,7 +44,10 @@ impl Default for FlyeConfig {
|
|
|
|
|
|
impl FlyeConfig {
|
|
|
pub fn new(output_dir: &str) -> Self {
|
|
|
- FlyeConfig { output_dir: PathBuf::from(output_dir), ..Default::default() }
|
|
|
+ FlyeConfig {
|
|
|
+ output_dir: PathBuf::from(output_dir),
|
|
|
+ ..Default::default()
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -52,10 +62,7 @@ pub struct Flye {
|
|
|
}
|
|
|
|
|
|
impl Assemble for Flye {
|
|
|
- fn init(
|
|
|
- input_bam: &Path,
|
|
|
- config: &super::AssembleConfig,
|
|
|
- ) -> anyhow::Result<Self> {
|
|
|
+ fn init(input_bam: &Path, config: &super::AssembleConfig) -> anyhow::Result<Self> {
|
|
|
match config {
|
|
|
super::AssembleConfig::Flye(config) => {
|
|
|
let input_id = input_bam.file_stem().unwrap().to_str().unwrap().to_string();
|
|
|
@@ -74,8 +81,6 @@ impl Assemble for Flye {
|
|
|
}
|
|
|
|
|
|
fn assemble(mut self) -> anyhow::Result<Self> {
|
|
|
- let min_cov = self.config.min_cov;
|
|
|
-
|
|
|
let tmp_dir = format!("/tmp/ass_{}", uuid::Uuid::new_v4());
|
|
|
info!("Creating tmp directory {tmp_dir}");
|
|
|
fs::create_dir(&tmp_dir).unwrap();
|
|
|
@@ -93,23 +98,16 @@ impl Assemble for Flye {
|
|
|
let contigs_path = format!("{flye_tmp_dir}/assembly.fasta");
|
|
|
|
|
|
if Path::new(&contigs_path).exists() {
|
|
|
- let assembly_path = format!("{flye_tmp_dir}/assembly.fasta");
|
|
|
- let flye_cov_path = format!("{flye_tmp_dir}/40-polishing/base_coverage.bed.gz");
|
|
|
-
|
|
|
- // Read the assembly fasta
|
|
|
- let mut reader = File::open(assembly_path)
|
|
|
- .map(BufReader::new)
|
|
|
- .map(noodles_fasta::Reader::new)?;
|
|
|
+ let mut reader = seq_io::fasta::Reader::from_path(&contigs_path)?;
|
|
|
|
|
|
let mut contigs = Vec::new();
|
|
|
for result in reader.records() {
|
|
|
let record = result?;
|
|
|
- let contig_name = String::from_utf8(record.name().to_vec()).unwrap();
|
|
|
- let (s, e) = read_flye_coverage(&flye_cov_path, min_cov.into(), &contig_name);
|
|
|
- let seq = record.sequence().as_ref();
|
|
|
- let seq = String::from_utf8(seq.to_vec()).unwrap();
|
|
|
- let seq: String = seq[s..e].into();
|
|
|
- contigs.push(seq);
|
|
|
+ let seq = record.seq();
|
|
|
+ let seq = String::from_utf8(seq.to_vec())?;
|
|
|
+ if calculate_shannon_entropy(&seq) >= self.config.min_entropy {
|
|
|
+ contigs.push(seq);
|
|
|
+ }
|
|
|
}
|
|
|
self.contigs = Some(contigs);
|
|
|
} else {
|
|
|
@@ -143,7 +141,7 @@ impl Assemble for Flye {
|
|
|
let contig_fa = format!("{}/{contig_id}.fa", self.config.output_dir.display());
|
|
|
|
|
|
info!("Saving contig {contig_id} in {contig_fa}");
|
|
|
- write_fasta(&contig_fa, &vec![(contig_id.clone(), contig.clone())]);
|
|
|
+ write_fasta(&contig_fa, &vec![(contig_id.clone(), contig.clone())])?;
|
|
|
fai(&contig_fa)?;
|
|
|
|
|
|
// Writing bed file from best blastn results
|
|
|
@@ -159,18 +157,25 @@ impl Assemble for Flye {
|
|
|
info!("Mapping input reads to {contig_id}");
|
|
|
let new_bam = format!("{}/{contig_id}.bam", self.config.output_dir.display());
|
|
|
duct::cmd!("bwa", "index", contig_fa.clone()).run()?;
|
|
|
- let input_fa = format!("{}/{}.fasta", self.config.output_dir.display(), self.input_id);
|
|
|
+ let input_fa = format!(
|
|
|
+ "{}/{}.fasta",
|
|
|
+ self.config.output_dir.display(),
|
|
|
+ self.input_id
|
|
|
+ );
|
|
|
let bwa = format!("bwa mem {contig_fa} {input_fa}");
|
|
|
let samtools = "samtools sort /dev/stdin";
|
|
|
let pipe = format!("{bwa} | {samtools} > {new_bam}");
|
|
|
duct::cmd!("bash", "-c", pipe).run()?;
|
|
|
|
|
|
+ // clean bwa indices
|
|
|
+ remove_bwa_indices(&contig_fa)?;
|
|
|
+
|
|
|
// Copy modified base tags to new bam
|
|
|
cp_mod_tags(&self.input_records, &new_bam)?;
|
|
|
|
|
|
// Run modkit
|
|
|
let modkit_pileup = format!("{}/{contig_id}_mod.bed", self.config.output_dir.display());
|
|
|
- duct::cmd!("modkit", "pileup", new_bam, modkit_pileup).run()?;
|
|
|
+ run_modkit(&new_bam, &contig_fa, &modkit_pileup)?;
|
|
|
}
|
|
|
Ok(())
|
|
|
}
|