| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339 |
- use std::{
- fs::{self},
- path::{Path, PathBuf},
- };
- use anyhow::Context;
- use log::{debug, info};
- use crate::{
- collection::pod5::Pod5,
- commands::{Command, SlurmParams},
- config::Config,
- slurm_helpers::max_gpu_per_node,
- };
- /// Run Dorado basecalling on a directory of POD5 files.
- ///
- /// This command:
- /// - Validates that the POD5 directory exists
- /// - Locates the first `.pod5` file to extract the sequencing kit (dont mix pod5 from different
- /// runs into the same dir)
- /// - Builds a Dorado `basecaller` command line using the configured arguments
- ///
- /// The resulting BAM is written to `output_bam`.
- #[derive(Debug)]
- pub struct DoradoBasecall {
- /// Path to the Dorado executable.
- dorado: PathBuf,
- /// Directory containing `.pod5` reads.
- pod5_dir: PathBuf,
- /// Output BAM file produced by Dorado.
- output_bam: PathBuf,
- /// Sequencing kit extracted from the POD5 file metadata.
- sequencing_kit: String,
- /// Additional basecalling arguments from configuration.
- dorado_basecall_arg: String,
- }
- impl DoradoBasecall {
- /// Build a `DoradoBasecall` command from configuration and input/output paths.
- ///
- /// # Parameters
- /// - `config`: global configuration providing Dorado binary path and args
- /// - `pod5_dir`: directory containing POD5 files
- /// - `output_bam`: destination BAM path
- pub fn from_config(
- config: &Config,
- pod5_dir: impl AsRef<Path>,
- output_bam: impl AsRef<Path>,
- ) -> Self {
- Self {
- dorado: (&config.align.dorado_bin).into(),
- pod5_dir: pod5_dir.as_ref().into(),
- output_bam: output_bam.as_ref().into(),
- sequencing_kit: String::new(),
- dorado_basecall_arg: config.align.dorado_basecall_arg.clone(),
- }
- }
- }
- impl Command for DoradoBasecall {
- /// Validate input directory, ensure no output overwrite, and extract sequencing kit.
- fn init(&mut self) -> anyhow::Result<()> {
- if !self.pod5_dir.exists() || !self.pod5_dir.is_dir() {
- anyhow::bail!(
- "The pod5 dir is not accessible.\n{}",
- self.pod5_dir.display()
- );
- }
- if self.output_bam.exists() {
- anyhow::bail!("The output BAM file already exists.");
- }
- let pod_path = fs::read_dir(&self.pod5_dir)
- .context(format!(
- "Failed to read pod5 dir: {}",
- self.pod5_dir.display()
- ))?
- .filter_map(Result::ok) // keep only Ok entries
- .map(|e| e.path())
- .find(|p| p.extension().and_then(|e| e.to_str()) == Some("pod5"))
- .context("No .pod5 file found")?;
- self.sequencing_kit = Pod5::from_path(&pod_path)?.sequencing_kit.to_uppercase();
- Ok(())
- }
- /// Build the Dorado basecaller command.
- fn cmd(&self) -> String {
- let dorado_bin = &self.dorado;
- let pod_dir = &self.pod5_dir;
- let bam = &self.output_bam;
- let dorado_arg = &self.dorado_basecall_arg;
- let sequencing_kit = &self.sequencing_kit;
- // Error while trying to trim with a non barcode kit !!!
- // Should find the right way....
- let sequencing_kit = "SQK-NBD114-24";
- let trim = if sequencing_kit == "SQK-LSK114" {
- "adapters"
- } else {
- "all"
- };
- format!(
- "{} basecaller --kit-name {sequencing_kit} --trim {} --emit-moves {dorado_arg} --models-directory /home/t_steimle/ref/dorado_models {} > {}",
- dorado_bin.display(),
- trim,
- pod_dir.display(),
- bam.display()
- )
- }
- }
- /// Slurm execution parameters for `DoradoBasecall`.
- ///
- /// This configuration launches Dorado on a GPU partition with:
- /// - 10 CPU threads (`--cpus-per-task=10`)
- /// - 60 GB memory
- /// - 4× H100 GPUs
- ///
- /// # Performance Notes
- ///
- /// Dorado basecalling throughput increases with CPU count when using GPUs.
- /// Example measurements (Samples/s):
- ///
- /// │ CPUs │ Throughput (Samples/s) │
- /// │------│-----------------------------│
- /// │ 4 │ 5.36×10⁷ │
- /// │ 5 │ 6.16×10⁷ │
- /// │ 6 │ 6.87×10⁷ │
- /// │ 7 │ 7.23×10⁷ │
- /// │ 8 │ 7.67×10⁷ │
- /// │ 10 │ 8.40×10⁷ │
- /// │ 15 │ 8.78×10⁷ │
- ///
- /// Throughput gains diminish beyond ~10 CPUs when the GPU becomes the bottleneck.
- /// The runner uses **10 CPUs** by default as a balanced configuration.
- ///
- /// # Resulting Slurm Command
- ///
- /// Equivalent to:
- ///
- /// ```text
- /// srun \
- /// --job-name=dorado_basecall \
- /// --cpus-per-task=10 \
- /// --mem=60G \
- /// --partition=gpgpuq \
- /// --gres=gpu:h100:4 \
- /// bash -c "<dorado command…>"
- /// ```
- fn dorado_slurm_params() -> super::SlurmParams {
- let (gpu, n) = if let (Some(h100_av), Some(a100_av)) =
- (max_gpu_per_node("h100"), max_gpu_per_node("a100"))
- {
- debug!("Available H100: {h100_av} and A100: {a100_av}");
- let (gpu, n) = if h100_av >= a100_av {
- ("h100", h100_av)
- } else {
- ("a100", a100_av)
- };
- let n = n.clamp(1, 4);
- (gpu, n)
- } else {
- panic!("Are you running slurm with a100 and h100 GPU ?");
- };
- info!("Running Dorado Basecaller with: {gpu} x {n}");
- super::SlurmParams {
- job_name: Some("dorado_basecall".into()),
- cpus_per_task: Some(10),
- mem: Some("60G".into()),
- partition: Some("gpgpuq".into()),
- gres: Some(format!("gpu:{gpu}:{n}")),
- }
- }
- impl super::SlurmRunner for DoradoBasecall {
- fn slurm_args(&self) -> Vec<String> {
- dorado_slurm_params().to_args()
- }
- }
- impl super::LocalRunner for DoradoBasecall {}
- impl super::SbatchRunner for DoradoBasecall {
- fn slurm_params(&self) -> SlurmParams {
- dorado_slurm_params()
- }
- }
- /// Run Dorado alignment using a reference FASTA and an input BAM.
- ///
- /// This command:
- /// - Validates input/output paths
- /// - Invokes Dorado `aligner`
- /// - Produces an aligned BAM at `output`
- #[derive(Debug)]
- pub struct DoradoAlign {
- /// Path to the Dorado executable.
- pub dorado: PathBuf,
- /// Reference FASTA used for alignment.
- pub reference: PathBuf,
- /// Input BAM to align.
- pub input: PathBuf,
- /// Output aligned BAM.
- pub output: PathBuf,
- /// Number of threads for the Dorado aligner.
- pub threads: u8,
- /// Slurm params
- pub slurm_params: SlurmParams,
- }
- impl DoradoAlign {
- /// Build a `DoradoAlign` command from configuration and input/output paths.
- ///
- /// # Parameters
- /// - `config`: global configuration
- /// - `input`: input BAM
- /// - `output`: aligned BAM
- pub fn from_config(config: &Config, input: impl AsRef<Path>, output: impl AsRef<Path>) -> Self {
- let threads = config.align.dorado_aligner_threads;
- Self {
- dorado: (&config.align.dorado_bin).into(),
- reference: (&config.align.ref_fa).into(),
- input: input.as_ref().into(),
- output: output.as_ref().into(),
- threads,
- slurm_params: SlurmParams {
- job_name: Some("dorado_align".into()),
- cpus_per_task: Some(threads.into()),
- mem: Some("30G".into()),
- partition: Some("mediumq".into()),
- gres: None,
- },
- }
- }
- }
- impl super::Command for DoradoAlign {
- /// Validate input BAM and ensure the output does not already exist.
- fn init(&mut self) -> anyhow::Result<()> {
- if !self.input.exists() {
- anyhow::bail!(
- "The input BAM file is not accessible.\n{}",
- self.input.display()
- );
- }
- if self.output.exists() {
- anyhow::bail!(
- "The output BAM file already exists.\n{}",
- self.output.display()
- );
- }
- Ok(())
- }
- /// Build the Dorado aligner command.
- fn cmd(&self) -> String {
- format!(
- "{} aligner --threads {} --allow-sec-supp --mm2-opts '--secondary yes' {} {} > {}",
- self.dorado.display(),
- self.threads,
- self.reference.display(),
- self.input.display(),
- self.output.display()
- )
- }
- }
- impl super::LocalRunner for DoradoAlign {}
- impl super::LocalBatchRunner for DoradoAlign {}
- impl super::SlurmRunner for DoradoAlign {
- /// Default Slurm parameters for running the Dorado aligner.
- fn slurm_args(&self) -> Vec<String> {
- self.slurm_params.to_args()
- }
- }
- impl super::SbatchRunner for DoradoAlign {
- /// Default Slurm parameters for running the Dorado aligner.
- fn slurm_params(&self) -> SlurmParams {
- self.slurm_params.clone()
- }
- }
- #[cfg(test)]
- mod tests {
- use super::*;
- use crate::TEST_DIR;
- use log::info;
- use crate::{commands::SlurmRunner, config::Config, helpers::test_init};
- #[test]
- fn dorado_basecall() -> anyhow::Result<()> {
- test_init();
- let config = Config::default();
- let tmp_dir = config.tmp_dir.clone();
- let mut dca = DoradoBasecall::from_config(
- &config,
- format!("/mnt/beegfs02/scratch/t_steimle/data/runs/20260206_1540_P2I-00461-A_PBI80774_67adb0bc/pod5"),
- format!("/mnt/beegfs02/scratch/t_steimle/data/36122_unaligned.bam"),
- );
- info!("Basecalling");
- let out = SlurmRunner::exec(&mut dca)?;
- println!("{out:#?}");
- Ok(())
- }
- #[test]
- fn dorado_align() -> anyhow::Result<()> {
- test_init();
- let config = Config::default();
- let mut dca = DoradoAlign::from_config(
- &config,
- format!("/mnt/beegfs02/scratch/t_steimle/data/36122_unaligned.bam"),
- format!("/mnt/beegfs02/scratch/t_steimle/data/36122_aligned.bam"),
- );
- info!("Basecalling");
- let _out = SlurmRunner::exec(&mut dca)?;
- Ok(())
- }
- }
|