use std::{ fs::{self}, path::{Path, PathBuf}, }; use anyhow::Context; use log::{debug, info}; use crate::{ collection::pod5::Pod5, commands::{Command, SlurmParams}, config::Config, slurm_helpers::max_gpu_per_node, }; /// Run Dorado basecalling on a directory of POD5 files. /// /// This command: /// - Validates that the POD5 directory exists /// - Locates the first `.pod5` file to extract the sequencing kit (dont mix pod5 from different /// runs into the same dir) /// - Builds a Dorado `basecaller` command line using the configured arguments /// /// The resulting BAM is written to `output_bam`. #[derive(Debug)] pub struct DoradoBasecall { /// Path to the Dorado executable. dorado: PathBuf, /// Directory containing `.pod5` reads. pod5_dir: PathBuf, /// Output BAM file produced by Dorado. output_bam: PathBuf, /// Sequencing kit extracted from the POD5 file metadata. sequencing_kit: String, /// Additional basecalling arguments from configuration. dorado_basecall_arg: String, } impl DoradoBasecall { /// Build a `DoradoBasecall` command from configuration and input/output paths. /// /// # Parameters /// - `config`: global configuration providing Dorado binary path and args /// - `pod5_dir`: directory containing POD5 files /// - `output_bam`: destination BAM path pub fn from_config( config: &Config, pod5_dir: impl AsRef, output_bam: impl AsRef, ) -> Self { Self { dorado: (&config.align.dorado_bin).into(), pod5_dir: pod5_dir.as_ref().into(), output_bam: output_bam.as_ref().into(), sequencing_kit: String::new(), dorado_basecall_arg: config.align.dorado_basecall_arg.clone(), } } } impl Command for DoradoBasecall { /// Validate input directory, ensure no output overwrite, and extract sequencing kit. fn init(&mut self) -> anyhow::Result<()> { if !self.pod5_dir.exists() || !self.pod5_dir.is_dir() { anyhow::bail!( "The pod5 dir is not accessible.\n{}", self.pod5_dir.display() ); } if self.output_bam.exists() { anyhow::bail!("The output BAM file already exists."); } let pod_path = fs::read_dir(&self.pod5_dir) .context(format!( "Failed to read pod5 dir: {}", self.pod5_dir.display() ))? .filter_map(Result::ok) // keep only Ok entries .map(|e| e.path()) .find(|p| p.extension().and_then(|e| e.to_str()) == Some("pod5")) .context("No .pod5 file found")?; self.sequencing_kit = Pod5::from_path(&pod_path)?.sequencing_kit.to_uppercase(); Ok(()) } /// Build the Dorado basecaller command. fn cmd(&self) -> String { let dorado_bin = &self.dorado; let pod_dir = &self.pod5_dir; let bam = &self.output_bam; let dorado_arg = &self.dorado_basecall_arg; let sequencing_kit = &self.sequencing_kit; // Error while trying to trim with a non barcode kit !!! // Should find the right way.... let sequencing_kit = "SQK-NBD114-24"; let trim = if sequencing_kit == "SQK-LSK114" { "adapters" } else { "all" }; format!( "{} basecaller --kit-name {sequencing_kit} --trim {} --emit-moves {dorado_arg} --models-directory /home/t_steimle/ref/dorado_models {} > {}", dorado_bin.display(), trim, pod_dir.display(), bam.display() ) } } /// Slurm execution parameters for `DoradoBasecall`. /// /// This configuration launches Dorado on a GPU partition with: /// - 10 CPU threads (`--cpus-per-task=10`) /// - 60 GB memory /// - 4× H100 GPUs /// /// # Performance Notes /// /// Dorado basecalling throughput increases with CPU count when using GPUs. /// Example measurements (Samples/s): /// /// │ CPUs │ Throughput (Samples/s) │ /// │------│-----------------------------│ /// │ 4 │ 5.36×10⁷ │ /// │ 5 │ 6.16×10⁷ │ /// │ 6 │ 6.87×10⁷ │ /// │ 7 │ 7.23×10⁷ │ /// │ 8 │ 7.67×10⁷ │ /// │ 10 │ 8.40×10⁷ │ /// │ 15 │ 8.78×10⁷ │ /// /// Throughput gains diminish beyond ~10 CPUs when the GPU becomes the bottleneck. /// The runner uses **10 CPUs** by default as a balanced configuration. /// /// # Resulting Slurm Command /// /// Equivalent to: /// /// ```text /// srun \ /// --job-name=dorado_basecall \ /// --cpus-per-task=10 \ /// --mem=60G \ /// --partition=gpgpuq \ /// --gres=gpu:h100:4 \ /// bash -c "" /// ``` fn dorado_slurm_params() -> super::SlurmParams { let (gpu, n) = if let (Some(h100_av), Some(a100_av)) = (max_gpu_per_node("h100"), max_gpu_per_node("a100")) { debug!("Available H100: {h100_av} and A100: {a100_av}"); let (gpu, n) = if h100_av >= a100_av { ("h100", h100_av) } else { ("a100", a100_av) }; let n = n.clamp(1, 4); (gpu, n) } else { panic!("Are you running slurm with a100 and h100 GPU ?"); }; info!("Running Dorado Basecaller with: {gpu} x {n}"); super::SlurmParams { job_name: Some("dorado_basecall".into()), cpus_per_task: Some(10), mem: Some("60G".into()), partition: Some("gpgpuq".into()), gres: Some(format!("gpu:{gpu}:{n}")), } } impl super::SlurmRunner for DoradoBasecall { fn slurm_args(&self) -> Vec { dorado_slurm_params().to_args() } } impl super::LocalRunner for DoradoBasecall {} impl super::SbatchRunner for DoradoBasecall { fn slurm_params(&self) -> SlurmParams { dorado_slurm_params() } } /// Run Dorado alignment using a reference FASTA and an input BAM. /// /// This command: /// - Validates input/output paths /// - Invokes Dorado `aligner` /// - Produces an aligned BAM at `output` #[derive(Debug)] pub struct DoradoAlign { /// Path to the Dorado executable. pub dorado: PathBuf, /// Reference FASTA used for alignment. pub reference: PathBuf, /// Input BAM to align. pub input: PathBuf, /// Output aligned BAM. pub output: PathBuf, /// Number of threads for the Dorado aligner. pub threads: u8, /// Slurm params pub slurm_params: SlurmParams, } impl DoradoAlign { /// Build a `DoradoAlign` command from configuration and input/output paths. /// /// # Parameters /// - `config`: global configuration /// - `input`: input BAM /// - `output`: aligned BAM pub fn from_config(config: &Config, input: impl AsRef, output: impl AsRef) -> Self { let threads = config.align.dorado_aligner_threads; Self { dorado: (&config.align.dorado_bin).into(), reference: (&config.align.ref_fa).into(), input: input.as_ref().into(), output: output.as_ref().into(), threads, slurm_params: SlurmParams { job_name: Some("dorado_align".into()), cpus_per_task: Some(threads.into()), mem: Some("30G".into()), partition: Some("mediumq".into()), gres: None, }, } } } impl super::Command for DoradoAlign { /// Validate input BAM and ensure the output does not already exist. fn init(&mut self) -> anyhow::Result<()> { if !self.input.exists() { anyhow::bail!( "The input BAM file is not accessible.\n{}", self.input.display() ); } if self.output.exists() { anyhow::bail!( "The output BAM file already exists.\n{}", self.output.display() ); } Ok(()) } /// Build the Dorado aligner command. fn cmd(&self) -> String { format!( "{} aligner --threads {} --allow-sec-supp --mm2-opts '--secondary yes' {} {} > {}", self.dorado.display(), self.threads, self.reference.display(), self.input.display(), self.output.display() ) } } impl super::LocalRunner for DoradoAlign {} impl super::LocalBatchRunner for DoradoAlign {} impl super::SlurmRunner for DoradoAlign { /// Default Slurm parameters for running the Dorado aligner. fn slurm_args(&self) -> Vec { self.slurm_params.to_args() } } impl super::SbatchRunner for DoradoAlign { /// Default Slurm parameters for running the Dorado aligner. fn slurm_params(&self) -> SlurmParams { self.slurm_params.clone() } } #[cfg(test)] mod tests { use super::*; use crate::TEST_DIR; use log::info; use crate::{commands::SlurmRunner, config::Config, helpers::test_init}; #[test] fn dorado_basecall() -> anyhow::Result<()> { test_init(); let config = Config::default(); let tmp_dir = config.tmp_dir.clone(); let mut dca = DoradoBasecall::from_config( &config, format!("/mnt/beegfs02/scratch/t_steimle/data/runs/20260206_1540_P2I-00461-A_PBI80774_67adb0bc/pod5"), format!("/mnt/beegfs02/scratch/t_steimle/data/36122_unaligned.bam"), ); info!("Basecalling"); let out = SlurmRunner::exec(&mut dca)?; println!("{out:#?}"); Ok(()) } #[test] fn dorado_align() -> anyhow::Result<()> { test_init(); let config = Config::default(); let mut dca = DoradoAlign::from_config( &config, format!("/mnt/beegfs02/scratch/t_steimle/data/36122_unaligned.bam"), format!("/mnt/beegfs02/scratch/t_steimle/data/36122_aligned.bam"), ); info!("Basecalling"); let _out = SlurmRunner::exec(&mut dca)?; Ok(()) } }