|
|
@@ -11,27 +11,52 @@ use uuid::Uuid;
|
|
|
|
|
|
use crate::{
|
|
|
collection::{bam::bam_composition, flowcells::FlowCell, pod5::FlowCellCase},
|
|
|
- commands::Command,
|
|
|
+ commands::{Command, SlurmParams},
|
|
|
config::Config,
|
|
|
helpers::find_unique_file,
|
|
|
io::pod5_infos::Pod5Info,
|
|
|
+ slurm_helpers::max_gpu_per_node,
|
|
|
};
|
|
|
|
|
|
+/// Run Dorado basecalling on a directory of POD5 files.
|
|
|
+///
|
|
|
+/// This command:
|
|
|
+/// - Validates that the POD5 directory exists
|
|
|
+/// - Locates the first `.pod5` file to extract the sequencing kit (do not mix POD5 files from different
|
|
|
+/// runs into the same dir)
|
|
|
+/// - Builds a Dorado `basecaller` command line using the configured arguments
|
|
|
+///
|
|
|
+/// The resulting BAM is written to `output_bam`.
|
|
|
#[derive(Debug)]
|
|
|
pub struct DoradoBasecall {
|
|
|
+ /// Path to the Dorado executable.
|
|
|
dorado: PathBuf,
|
|
|
- output_bam: PathBuf,
|
|
|
+ /// Directory containing `.pod5` reads.
|
|
|
pod5_dir: PathBuf,
|
|
|
+ /// Output BAM file produced by Dorado.
|
|
|
+ output_bam: PathBuf,
|
|
|
+ /// Sequencing kit extracted from the POD5 file metadata.
|
|
|
sequencing_kit: String,
|
|
|
+ /// Additional basecalling arguments from configuration.
|
|
|
dorado_basecall_arg: String,
|
|
|
}
|
|
|
|
|
|
impl DoradoBasecall {
|
|
|
- pub fn from_config(config: &Config, pod5_dir: PathBuf, output_bam: PathBuf) -> Self {
|
|
|
+ /// Build a `DoradoBasecall` command from configuration and input/output paths.
|
|
|
+ ///
|
|
|
+ /// # Parameters
|
|
|
+ /// - `config`: global configuration providing Dorado binary path and args
|
|
|
+ /// - `pod5_dir`: directory containing POD5 files
|
|
|
+ /// - `output_bam`: destination BAM path
|
|
|
+ pub fn from_config(
|
|
|
+ config: &Config,
|
|
|
+ pod5_dir: impl AsRef<Path>,
|
|
|
+ output_bam: impl AsRef<Path>,
|
|
|
+ ) -> Self {
|
|
|
Self {
|
|
|
dorado: (&config.align.dorado_bin).into(),
|
|
|
- output_bam,
|
|
|
- pod5_dir,
|
|
|
+ pod5_dir: pod5_dir.as_ref().into(),
|
|
|
+ output_bam: output_bam.as_ref().into(),
|
|
|
sequencing_kit: String::new(),
|
|
|
dorado_basecall_arg: config.align.dorado_basecall_arg.clone(),
|
|
|
}
|
|
|
@@ -39,6 +64,7 @@ impl DoradoBasecall {
|
|
|
}
|
|
|
|
|
|
impl Command for DoradoBasecall {
|
|
|
+ /// Validate input directory, ensure no output overwrite, and extract sequencing kit.
|
|
|
fn init(&mut self) -> anyhow::Result<()> {
|
|
|
if !self.pod5_dir.exists() || !self.pod5_dir.is_dir() {
|
|
|
anyhow::bail!(
|
|
|
@@ -68,6 +94,7 @@ impl Command for DoradoBasecall {
|
|
|
Ok(())
|
|
|
}
|
|
|
|
|
|
+ /// Build the Dorado basecaller command.
|
|
|
fn cmd(&self) -> String {
|
|
|
let dorado_bin = &self.dorado;
|
|
|
let pod_dir = &self.pod5_dir;
|
|
|
@@ -81,44 +108,123 @@ impl Command for DoradoBasecall {
|
|
|
pod_dir.display(),
|
|
|
bam.display()
|
|
|
)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/// Slurm execution parameters for `DoradoBasecall`.
|
|
|
+///
|
|
|
+/// This configuration launches Dorado on a GPU partition with:
|
|
|
+/// - 10 CPU threads (`--cpus-per-task=10`)
|
|
|
+/// - 60 GB memory
|
|
|
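+/// - GPUs of whichever type (`h100` or `a100`) `max_gpu_per_node` reports the larger count for (ties go to `h100`), requesting at least 2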
|
|
|
+///
|
|
|
+/// # Performance Notes
|
|
|
+///
|
|
|
+/// Dorado basecalling throughput increases with CPU count when using GPUs.
|
|
|
+/// Example measurements (Samples/s):
|
|
|
+///
|
|
|
+/// │ CPUs │ Throughput (Samples/s) │
|
|
|
+/// │------│-----------------------------│
|
|
|
+/// │ 4 │ 5.36×10⁷ │
|
|
|
+/// │ 5 │ 6.16×10⁷ │
|
|
|
+/// │ 6 │ 6.87×10⁷ │
|
|
|
+/// │ 7 │ 7.23×10⁷ │
|
|
|
+/// │ 8 │ 7.67×10⁷ │
|
|
|
+/// │ 10 │ 8.40×10⁷ │
|
|
|
+/// │ 15 │ 8.78×10⁷ │
|
|
|
+///
|
|
|
+/// Throughput gains diminish beyond ~10 CPUs when the GPU becomes the bottleneck.
|
|
|
+/// The runner uses **10 CPUs** by default as a balanced configuration.
|
|
|
+///
|
|
|
+/// # Resulting Slurm Command
|
|
|
+///
|
|
|
+/// Equivalent to:
|
|
|
+///
|
|
|
+/// ```text
|
|
|
+/// srun \
|
|
|
+/// --job-name=dorado_basecall \
|
|
|
+/// --cpus-per-task=10 \
|
|
|
+/// --mem=60G \
|
|
|
+/// --partition=gpgpuq \
|
|
|
+///   --gres=gpu:<h100|a100>:<n> \
|
|
|
+/// bash -c "<dorado command…>"
|
|
|
+/// ```
|
|
|
+impl super::SlurmRunner for DoradoBasecall {
|
|
|
+ fn slurm_args(&self) -> Vec<String> {
|
|
|
+ let (gpu, n) = if let (Some(h100_av), Some(a100_av)) = (max_gpu_per_node("h100"), max_gpu_per_node("a100"))
|
|
|
+ {
|
|
|
+ let (gpu, n) = if h100_av >= a100_av {
|
|
|
+ ("h100", h100_av)
|
|
|
+ } else {
|
|
|
+ ("a100", a100_av)
|
|
|
+ };
|
|
|
|
|
|
- // let samtools_view = format!(
|
|
|
- // "{} view -h -@ {samtools_view_threads} -b /dev/stdin",
|
|
|
- // samtools.display()
|
|
|
- // );
|
|
|
- // let samtools_sort = format!(
|
|
|
- // "{} sort -@ {samtools_sort_threads} /dev/stdin -o {}",
|
|
|
- // samtools.display(),
|
|
|
- // bam.display()
|
|
|
- // );
|
|
|
- //
|
|
|
- // // format!("{dorado} | {samtools_view} | {samtools_sort}")
|
|
|
- // format!("{dorado} | {samtools_view} > {}", bam.display())
|
|
|
+ let n = n.max(2);
|
|
|
+ (gpu, n)
|
|
|
+ } else {
|
|
|
+ panic!("Are you running slurm with a100 and h100 GPU ?");
|
|
|
+ };
|
|
|
+ super::SlurmParams {
|
|
|
+ job_name: Some("dorado_basecall".into()),
|
|
|
+ cpus_per_task: Some(10),
|
|
|
+ mem: Some("60G".into()),
|
|
|
+ partition: Some("gpgpuq".into()),
|
|
|
+ gres: Some(format!("gpu:{gpu}:{n}")),
|
|
|
+ }
|
|
|
+ .to_args()
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/// Run Dorado alignment using a reference FASTA and an input BAM.
|
|
|
+///
|
|
|
+/// This command:
|
|
|
+/// - Validates input/output paths
|
|
|
+/// - Invokes Dorado `aligner`
|
|
|
+/// - Produces an aligned BAM at `output`
|
|
|
#[derive(Debug)]
|
|
|
pub struct DoradoAlign {
|
|
|
+ /// Path to the Dorado executable.
|
|
|
pub dorado: PathBuf,
|
|
|
+ /// Reference FASTA used for alignment.
|
|
|
pub reference: PathBuf,
|
|
|
+ /// Input BAM to align.
|
|
|
pub input: PathBuf,
|
|
|
+ /// Output aligned BAM.
|
|
|
pub output: PathBuf,
|
|
|
+ /// Number of threads for the Dorado aligner.
|
|
|
pub threads: u8,
|
|
|
+    /// Slurm parameters used when submitting the aligner job.
|
|
|
+ pub slurm_params: SlurmParams,
|
|
|
}
|
|
|
|
|
|
impl DoradoAlign {
|
|
|
- pub fn from_config(config: &Config, input: PathBuf, output: PathBuf) -> Self {
|
|
|
+ /// Build a `DoradoAlign` command from configuration and input/output paths.
|
|
|
+ ///
|
|
|
+ /// # Parameters
|
|
|
+ /// - `config`: global configuration
|
|
|
+ /// - `input`: input BAM
|
|
|
+ /// - `output`: aligned BAM
|
|
|
+ pub fn from_config(config: &Config, input: impl AsRef<Path>, output: impl AsRef<Path>) -> Self {
|
|
|
+ let threads = config.align.dorado_aligner_threads;
|
|
|
Self {
|
|
|
dorado: (&config.align.dorado_bin).into(),
|
|
|
reference: (&config.align.ref_fa).into(),
|
|
|
- input,
|
|
|
- output,
|
|
|
- threads: config.align.dorado_aligner_threads,
|
|
|
+ input: input.as_ref().into(),
|
|
|
+ output: output.as_ref().into(),
|
|
|
+ threads,
|
|
|
+ slurm_params: SlurmParams {
|
|
|
+ job_name: Some("dorado_align".into()),
|
|
|
+ cpus_per_task: Some(threads.into()),
|
|
|
+ mem: Some("60G".into()),
|
|
|
+ partition: Some("gpgpuq".into()),
|
|
|
+ gres: None,
|
|
|
+ },
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
impl super::Command for DoradoAlign {
|
|
|
+ /// Validate input BAM and ensure the output does not already exist.
|
|
|
fn init(&mut self) -> anyhow::Result<()> {
|
|
|
if !self.input.exists() {
|
|
|
anyhow::bail!(
|
|
|
@@ -136,6 +242,8 @@ impl super::Command for DoradoAlign {
|
|
|
|
|
|
Ok(())
|
|
|
}
|
|
|
+
|
|
|
+ /// Build the Dorado aligner command.
|
|
|
fn cmd(&self) -> String {
|
|
|
format!(
|
|
|
"{} aligner --threads {} --allow-sec-supp --mm2-opts '--secondary yes' {} {} > {}",
|
|
|
@@ -149,48 +257,27 @@ impl super::Command for DoradoAlign {
|
|
|
}
|
|
|
|
|
|
impl super::SlurmRunner for DoradoAlign {
|
|
|
+    /// Slurm command-line arguments built from this command's `slurm_params`.
|
|
|
fn slurm_args(&self) -> Vec<String> {
|
|
|
- super::SlurmParams {
|
|
|
- job_name: Some("dorado_align".into()),
|
|
|
- cpus_per_task: Some(self.threads.into()),
|
|
|
- mem: Some("60G".into()),
|
|
|
- partition: Some("gpgpuq".into()),
|
|
|
- gres: None,
|
|
|
- }
|
|
|
- .to_args()
|
|
|
+ self.slurm_params.to_args()
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-// Running with Slurm: srun --job-name=dorado_basecall --cpus-per-task=X --mem=60G --partition=gpgpuq --gres=gpu:h100:4 bash -c /mnt/beegfs02/scratch/t_steimle/tools/dorado-latest-linux-x64/bin/dorado basecaller --kit-name SQK-NBD114-24 -x 'cuda:all' sup,5mC_5hmC /mnt/beegfs02/scratch/t_steimle/test_data/inputs/pod5/A --trim all --emit-moves > /mnt/beegfs02/scratch/t_steimle/test_data/outputs/aligned_5.bam
|
|
|
-// 04 cpu Basecalled @ Samples/s: 5.359770e+07
|
|
|
-// 05 cpu Basecalled @ Samples/s: 6.155305e+07
|
|
|
-// 06 cpu Basecalled @ Samples/s: 6.870292e+07
|
|
|
-// 07 cpu Basecalled @ Samples/s: 7.230430e+07
|
|
|
-// 08 cpu Basecalled @ Samples/s: 7.669054e+07
|
|
|
-// 10 cpu Basecalled @ Samples/s: 8.398348e+07
|
|
|
-// 15 cpu Basecalled @ Samples/s: 8.776285e+07
|
|
|
-impl super::SlurmRunner for DoradoBasecall {
|
|
|
- fn slurm_args(&self) -> Vec<String> {
|
|
|
- super::SlurmParams {
|
|
|
- job_name: Some("dorado_basecall".into()),
|
|
|
- cpus_per_task: Some(10),
|
|
|
- mem: Some("60G".into()),
|
|
|
- partition: Some("gpgpuq".into()),
|
|
|
- gres: Some("gpu:h100:4".into()),
|
|
|
- }
|
|
|
- .to_args()
|
|
|
+impl super::SbatchRunner for DoradoAlign {
|
|
|
+    /// Return a copy of this command's `slurm_params`.
|
|
|
+ fn slurm_params(&self) -> SlurmParams {
|
|
|
+ self.slurm_params.clone()
|
|
|
}
|
|
|
}
|
|
|
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
use super::*;
|
|
|
- use log::info;
|
|
|
use crate::TEST_DIR;
|
|
|
+ use log::info;
|
|
|
|
|
|
use crate::{commands::SlurmRunner, config::Config, helpers::test_init};
|
|
|
|
|
|
-
|
|
|
#[test]
|
|
|
fn dorado_basecall() -> anyhow::Result<()> {
|
|
|
test_init();
|
|
|
@@ -199,8 +286,8 @@ mod tests {
|
|
|
|
|
|
let mut dca = DoradoBasecall::from_config(
|
|
|
&config,
|
|
|
- format!("{}/inputs/pod5/A", TEST_DIR.as_str()).into(),
|
|
|
- format!("{}/outputs/unaligned_10.bam", TEST_DIR.as_str()).into(),
|
|
|
+ format!("{}/inputs/pod5/A", TEST_DIR.as_str()),
|
|
|
+ format!("{}/outputs/unaligned_10.bam", TEST_DIR.as_str()),
|
|
|
);
|
|
|
|
|
|
info!("Basecalling");
|
|
|
@@ -219,8 +306,8 @@ mod tests {
|
|
|
|
|
|
let mut dca = DoradoAlign::from_config(
|
|
|
&config,
|
|
|
- format!("{}/outputs/unaligned_10.bam", TEST_DIR.as_str()).into(),
|
|
|
- format!("{}/outputs/10_hs1_sorted.bam", TEST_DIR.as_str()).into(),
|
|
|
+ format!("{}/outputs/unaligned_10.bam", TEST_DIR.as_str()),
|
|
|
+ format!("{}/outputs/10_hs1_sorted.bam", TEST_DIR.as_str()),
|
|
|
);
|
|
|
|
|
|
info!("Basecalling");
|
|
|
@@ -228,7 +315,6 @@ mod tests {
|
|
|
|
|
|
Ok(())
|
|
|
}
|
|
|
-
|
|
|
}
|
|
|
|
|
|
#[derive(Debug, Clone)]
|