Thomas 1 year ago
parent
commit
7009d43f08
3 changed files with 86 additions and 67 deletions
  1. 1 0
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 84 67
      src/commands/dorado.rs

+ 1 - 0
Cargo.lock

@@ -2028,6 +2028,7 @@ dependencies = [
  "test-log",
  "tracing",
  "tracing-test",
+ "uuid",
 ]
 
 [[package]]

+ 1 - 0
Cargo.toml

@@ -28,3 +28,4 @@ nix = { version = "0.29.0", features = ["term", "process" ] }
 expectrl = "0.7.1"
 ptyprocess = "0.4.1"
 duct = "0.13.7"
+uuid = { version = "1.10.0", features = ["v4"] }

+ 84 - 67
src/commands/dorado.rs

@@ -1,12 +1,13 @@
 use std::{
     fs,
     io::{Read, Write},
-    path::Path,
+    path::{Path, PathBuf},
     time::SystemTime,
 };
 
 use duct::cmd;
 use log::{info, warn};
+use uuid::Uuid;
 
 pub trait Run {
     fn run(self) -> anyhow::Result<()>;
@@ -26,6 +27,9 @@ pub struct DoradoConfig {
 
 pub struct Dorado {
     config: DoradoConfig,
+    case_dir: String,
+    time_dir: String,
+    bam: String,
     start_time: SystemTime,
     end_time: SystemTime,
     is_done: bool,
@@ -34,40 +38,46 @@ pub struct Dorado {
 
 impl Dorado {
     pub fn init(config: DoradoConfig) -> anyhow::Result<Self> {
+        let data_dir = "/data/longreads_basic_pipe";
+        let case_dir = format!("{}/{}", data_dir, config.name);
+        let time_dir = format!("{}/{}", case_dir, config.time);
+        let bam = format!("{}/{}_{}_hs1.bam", time_dir, config.name, config.time);
+
         Ok(Self {
             config,
             start_time: SystemTime::now(),
             end_time: SystemTime::now(),
             is_done: false,
             log: Vec::new(),
+            case_dir,
+            time_dir,
+            bam,
         })
     }
-    fn create_reference_mmi(&self, ref_mmi: &str, ref_fa: &str) -> anyhow::Result<()> {
-        if !std::path::Path::new(ref_mmi).exists() {
-            cmd!("minimap2", "-x", "map-ont", "-d", ref_mmi, ref_fa).run()?;
+    fn create_reference_mmi(&self) -> anyhow::Result<()> {
+        if !std::path::Path::new(&self.config.ref_mmi).exists() {
+            cmd!("minimap2", "-x", "map-ont", "-d", &self.config.ref_mmi, &self.config.ref_fa).run()?;
         }
         Ok(())
     }
 
-    fn create_directories(&self, case_dir: &str, time_dir: &str) -> anyhow::Result<()> {
-        if !std::path::Path::new(case_dir).exists() {
-            fs::create_dir(case_dir)?;
+    fn create_directories(&self) -> anyhow::Result<()> {
+        if !std::path::Path::new(&self.case_dir).exists() {
+            fs::create_dir(&self.case_dir)?;
         }
-        if !std::path::Path::new(time_dir).exists() {
-            fs::create_dir(time_dir)?;
+        if !std::path::Path::new(&self.time_dir).exists() {
+            fs::create_dir(&self.time_dir)?;
         }
         Ok(())
     }
 
-    fn run_dorado_and_samtools(
-        &mut self,
-        dorado_bin: &str,
-        pod_dir: &str,
-        ref_mmi: &str,
-        bam: &str,
-        samtools_view_threads: u16,
-        samtools_sort_threads: u16,
-    ) -> anyhow::Result<()> {
+    fn basecall_align(&mut self, dorado_bin: &str) -> anyhow::Result<()> {
+        let pod_dir = &self.config.pod_dir;
+        let ref_mmi = &self.config.ref_mmi;
+        let bam = &self.bam;
+        let samtools_view_threads = self.config.samtools_view_threads;
+        let samtools_sort_threads = self.config.samtools_sort_threads;
+
         let dorado = format!(
             "{dorado_bin} basecaller sup,5mC_5hmC {pod_dir} --trim all --reference {ref_mmi}"
         );
@@ -107,10 +117,13 @@ impl Dorado {
         Ok(())
     }
 
-    fn run_cramino(&self, bam: &str, time_dir: &str, name: &str, time: &str) -> anyhow::Result<()> {
-        let cramino_out = format!("{}/{}_{}_hs1_cramino.txt", time_dir, name, time);
+    fn run_cramino(&self) -> anyhow::Result<()> {
+        let cramino_out = format!(
+            "{}/{}_{}_hs1_cramino.txt",
+            self.time_dir, self.config.name, self.config.time
+        );
         if !Path::new(&cramino_out).exists() {
-            info!("Quality control of BAM: {}", bam);
+            info!("Quality control of BAM: {}", self.bam);
             let output = duct::cmd!(
                 "cramino",
                 "-t",
@@ -118,7 +131,7 @@ impl Dorado {
                 "--hist",
                 "--checksum",
                 "--karyotype",
-                bam
+                &self.bam
             )
             .stdout_capture()
             .unchecked()
@@ -129,11 +142,14 @@ impl Dorado {
         Ok(())
     }
 
-    fn run_modkit(&self, bam: &str, time_dir: &str, name: &str, time: &str) -> anyhow::Result<()> {
-        let mod_summary = format!("{}/{}_{}_5mC_5hmC_summary.txt", time_dir, name, time);
+    fn run_modkit(&self) -> anyhow::Result<()> {
+        let mod_summary = format!(
+            "{}/{}_{}_5mC_5hmC_summary.txt",
+            self.time_dir, self.config.name, self.config.time
+        );
         if !Path::new(&mod_summary).exists() {
-            info!("Generating base modification summary for BAM: {}", bam);
-            let output = cmd!("modkit", "summary", "-t", "50", bam)
+            info!("Generating base modification summary for BAM: {}", self.bam);
+            let output = cmd!("modkit", "summary", "-t", "50", &self.bam)
                 .stdout_capture()
                 .unchecked()
                 .run()?;
@@ -143,14 +159,12 @@ impl Dorado {
         Ok(())
     }
 
-    fn create_fastq(
-        &self,
-        bam: &str,
-        case_dir: &str,
-        name: &str,
-        time: &str,
-    ) -> anyhow::Result<()> {
-        let fastq = format!("{}/{}/{}/{}_{}.fastq.gz", case_dir, name, time, name, time);
+    fn create_fastq(&self) -> anyhow::Result<()> {
+        let bam = &self.bam;
+        let fastq = format!(
+            "{}/{}/{}/{}_{}.fastq.gz",
+            self.case_dir, self.config.name, self.config.time, self.config.name, self.config.time
+        );
         if !std::path::Path::new(&fastq).exists() {
             let samtools = format!("samtools fastq -@ 150 {bam}");
             let crabz = format!("crabz -f bgzf - -o {fastq}");
@@ -161,6 +175,21 @@ impl Dorado {
         }
         Ok(())
     }
+
+    fn merge_bam(&self, bam: &Path) -> anyhow::Result<()> {
+        let into = PathBuf::from(&self.bam);
+        let dir = into.parent().unwrap();
+        let original_file = into.file_name().unwrap().to_string_lossy().to_string();
+        let original_i = dir.join(format!("{original_file}.bai"));
+        let tmp_original_file = format!("{}.bam", Uuid::new_v4());
+        let tmp_original = dir.join(tmp_original_file.clone());
+        let tmp_original_i = dir.join(format!("{tmp_original_file}.bai"));
+        fs::rename(bam, tmp_original)?;
+        fs::rename(original_i, tmp_original_i)?;
+
+        cmd!("samtools merge -@ 160 -h {bam} {into} {bam} {tmp_original}").run()?;
+        Ok(())
+    }
 }
 
 impl Run for Dorado {
@@ -170,42 +199,30 @@ impl Run for Dorado {
 
         info!("Running Dorado with params: {:#?}", self.config);
 
-        let DoradoConfig {
-            name,
-            time,
-            pod_dir,
-            ref_fa,
-            ref_mmi,
-            dorado_threads: _,
-            samtools_view_threads,
-            samtools_sort_threads,
-        } = self.config.clone();
-
-        let data_dir = "/data/longreads_basic_pipe";
         let dorado_bin = "/data/tools/dorado-0.7.2-linux-x64/bin/dorado";
-        let case_dir = format!("{}/{}", data_dir, name);
-        let time_dir = format!("{}/{}", case_dir, time);
-        let bam = format!("{}/{}_{}_hs1.bam", time_dir, name, time);
-
-        self.create_reference_mmi(&ref_mmi, &ref_fa)?;
-        self.create_directories(&case_dir, &time_dir)?;
-
-        info!("Reading {} pod5 from: {}", time, pod_dir);
-
-        if !std::path::Path::new(&bam).exists() {
-            self.run_dorado_and_samtools(
-                dorado_bin,
-                &pod_dir,
-                &ref_mmi,
-                &bam,
-                samtools_view_threads,
-                samtools_sort_threads,
-            )?;
+
+        self.create_reference_mmi()?;
+        self.create_directories()?;
+
+        info!("Reading {} pod5 from: {}", self.config.time, self.config.pod_dir);
+        let bam_path = std::path::Path::new(&self.bam);
+
+        if !bam_path.exists() {
+            self.basecall_align(dorado_bin)?;
+        } else {
+            let new_bam_path = bam_path
+                .parent()
+                .unwrap()
+                .join(format!("{}.bam", Uuid::new_v4()));
+            warn!("Creating new bam {}", new_bam_path.display());
+
+            self.basecall_align(dorado_bin)?;
+            self.merge_bam(&new_bam_path)?;
         }
 
-        self.run_cramino(&bam, &time_dir, &name, &time)?;
-        self.run_modkit(&bam, &time_dir, &name, &time)?;
-        self.create_fastq(&bam, &case_dir, &name, &time)?;
+        self.run_cramino()?;
+        self.run_modkit()?;
+        self.create_fastq()?;
 
         let end_time = std::time::SystemTime::now();
         self.end_time = end_time;