dorado.rs 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. use std::{
  2. fs::{self},
  3. path::{Path, PathBuf},
  4. };
  5. use anyhow::Context;
  6. use log::{debug, info};
  7. use crate::{
  8. collection::pod5::Pod5,
  9. commands::{Command, SlurmParams},
  10. config::Config,
  11. slurm_helpers::max_gpu_per_node,
  12. };
  13. /// Run Dorado basecalling on a directory of POD5 files.
  14. ///
  15. /// This command:
  16. /// - Validates that the POD5 directory exists
  17. /// - Locates the first `.pod5` file to extract the sequencing kit (dont mix pod5 from different
  18. /// runs into the same dir)
  19. /// - Builds a Dorado `basecaller` command line using the configured arguments
  20. ///
  21. /// The resulting BAM is written to `output_bam`.
  22. #[derive(Debug)]
  23. pub struct DoradoBasecall {
  24. /// Path to the Dorado executable.
  25. dorado: PathBuf,
  26. /// Directory containing `.pod5` reads.
  27. pod5_dir: PathBuf,
  28. /// Output BAM file produced by Dorado.
  29. output_bam: PathBuf,
  30. /// Sequencing kit extracted from the POD5 file metadata.
  31. sequencing_kit: String,
  32. /// Additional basecalling arguments from configuration.
  33. dorado_basecall_arg: String,
  34. }
  35. impl DoradoBasecall {
  36. /// Build a `DoradoBasecall` command from configuration and input/output paths.
  37. ///
  38. /// # Parameters
  39. /// - `config`: global configuration providing Dorado binary path and args
  40. /// - `pod5_dir`: directory containing POD5 files
  41. /// - `output_bam`: destination BAM path
  42. pub fn from_config(
  43. config: &Config,
  44. pod5_dir: impl AsRef<Path>,
  45. output_bam: impl AsRef<Path>,
  46. ) -> Self {
  47. Self {
  48. dorado: (&config.align.dorado_bin).into(),
  49. pod5_dir: pod5_dir.as_ref().into(),
  50. output_bam: output_bam.as_ref().into(),
  51. sequencing_kit: String::new(),
  52. dorado_basecall_arg: config.align.dorado_basecall_arg.clone(),
  53. }
  54. }
  55. }
  56. impl Command for DoradoBasecall {
  57. /// Validate input directory, ensure no output overwrite, and extract sequencing kit.
  58. fn init(&mut self) -> anyhow::Result<()> {
  59. if !self.pod5_dir.exists() || !self.pod5_dir.is_dir() {
  60. anyhow::bail!(
  61. "The pod5 dir is not accessible.\n{}",
  62. self.pod5_dir.display()
  63. );
  64. }
  65. if self.output_bam.exists() {
  66. anyhow::bail!("The output BAM file already exists.");
  67. }
  68. let pod_path = fs::read_dir(&self.pod5_dir)
  69. .context(format!(
  70. "Failed to read pod5 dir: {}",
  71. self.pod5_dir.display()
  72. ))?
  73. .filter_map(Result::ok) // keep only Ok entries
  74. .map(|e| e.path())
  75. .find(|p| p.extension().and_then(|e| e.to_str()) == Some("pod5"))
  76. .context("No .pod5 file found")?;
  77. self.sequencing_kit = Pod5::from_path(&pod_path)?.sequencing_kit.to_uppercase();
  78. Ok(())
  79. }
  80. /// Build the Dorado basecaller command.
  81. fn cmd(&self) -> String {
  82. let dorado_bin = &self.dorado;
  83. let pod_dir = &self.pod5_dir;
  84. let bam = &self.output_bam;
  85. let dorado_arg = &self.dorado_basecall_arg;
  86. let sequencing_kit = &self.sequencing_kit;
  87. // Error while trying to trim with a non barcode kit !!!
  88. // Should find the right way....
  89. let sequencing_kit = "SQK-NBD114-24";
  90. let trim = if sequencing_kit == "SQK-LSK114" {
  91. "adapters"
  92. } else {
  93. "all"
  94. };
  95. format!(
  96. "{} basecaller --kit-name {sequencing_kit} --trim {} --emit-moves {dorado_arg} --models-directory /home/t_steimle/ref/dorado_models {} > {}",
  97. dorado_bin.display(),
  98. trim,
  99. pod_dir.display(),
  100. bam.display()
  101. )
  102. }
  103. }
  104. /// Slurm execution parameters for `DoradoBasecall`.
  105. ///
  106. /// This configuration launches Dorado on a GPU partition with:
  107. /// - 10 CPU threads (`--cpus-per-task=10`)
  108. /// - 60 GB memory
  109. /// - 4× H100 GPUs
  110. ///
  111. /// # Performance Notes
  112. ///
  113. /// Dorado basecalling throughput increases with CPU count when using GPUs.
  114. /// Example measurements (Samples/s):
  115. ///
  116. /// │ CPUs │ Throughput (Samples/s) │
  117. /// │------│-----------------------------│
  118. /// │ 4 │ 5.36×10⁷ │
  119. /// │ 5 │ 6.16×10⁷ │
  120. /// │ 6 │ 6.87×10⁷ │
  121. /// │ 7 │ 7.23×10⁷ │
  122. /// │ 8 │ 7.67×10⁷ │
  123. /// │ 10 │ 8.40×10⁷ │
  124. /// │ 15 │ 8.78×10⁷ │
  125. ///
  126. /// Throughput gains diminish beyond ~10 CPUs when the GPU becomes the bottleneck.
  127. /// The runner uses **10 CPUs** by default as a balanced configuration.
  128. ///
  129. /// # Resulting Slurm Command
  130. ///
  131. /// Equivalent to:
  132. ///
  133. /// ```text
  134. /// srun \
  135. /// --job-name=dorado_basecall \
  136. /// --cpus-per-task=10 \
  137. /// --mem=60G \
  138. /// --partition=gpgpuq \
  139. /// --gres=gpu:h100:4 \
  140. /// bash -c "<dorado command…>"
  141. /// ```
  142. fn dorado_slurm_params() -> super::SlurmParams {
  143. let (gpu, n) = if let (Some(h100_av), Some(a100_av)) =
  144. (max_gpu_per_node("h100"), max_gpu_per_node("a100"))
  145. {
  146. debug!("Available H100: {h100_av} and A100: {a100_av}");
  147. let (gpu, n) = if h100_av >= a100_av {
  148. ("h100", h100_av)
  149. } else {
  150. ("a100", a100_av)
  151. };
  152. let n = n.clamp(1, 4);
  153. (gpu, n)
  154. } else {
  155. panic!("Are you running slurm with a100 and h100 GPU ?");
  156. };
  157. info!("Running Dorado Basecaller with: {gpu} x {n}");
  158. super::SlurmParams {
  159. job_name: Some("dorado_basecall".into()),
  160. cpus_per_task: Some(10),
  161. mem: Some("60G".into()),
  162. partition: Some("gpgpuq".into()),
  163. gres: Some(format!("gpu:{gpu}:{n}")),
  164. }
  165. }
  166. impl super::SlurmRunner for DoradoBasecall {
  167. fn slurm_args(&self) -> Vec<String> {
  168. dorado_slurm_params().to_args()
  169. }
  170. }
  171. impl super::LocalRunner for DoradoBasecall {}
  172. impl super::SbatchRunner for DoradoBasecall {
  173. fn slurm_params(&self) -> SlurmParams {
  174. dorado_slurm_params()
  175. }
  176. }
  177. /// Run Dorado alignment using a reference FASTA and an input BAM.
  178. ///
  179. /// This command:
  180. /// - Validates input/output paths
  181. /// - Invokes Dorado `aligner`
  182. /// - Produces an aligned BAM at `output`
  183. #[derive(Debug)]
  184. pub struct DoradoAlign {
  185. /// Path to the Dorado executable.
  186. pub dorado: PathBuf,
  187. /// Reference FASTA used for alignment.
  188. pub reference: PathBuf,
  189. /// Input BAM to align.
  190. pub input: PathBuf,
  191. /// Output aligned BAM.
  192. pub output: PathBuf,
  193. /// Number of threads for the Dorado aligner.
  194. pub threads: u8,
  195. /// Slurm params
  196. pub slurm_params: SlurmParams,
  197. }
  198. impl DoradoAlign {
  199. /// Build a `DoradoAlign` command from configuration and input/output paths.
  200. ///
  201. /// # Parameters
  202. /// - `config`: global configuration
  203. /// - `input`: input BAM
  204. /// - `output`: aligned BAM
  205. pub fn from_config(config: &Config, input: impl AsRef<Path>, output: impl AsRef<Path>) -> Self {
  206. let threads = config.align.dorado_aligner_threads;
  207. Self {
  208. dorado: (&config.align.dorado_bin).into(),
  209. reference: (&config.align.ref_fa).into(),
  210. input: input.as_ref().into(),
  211. output: output.as_ref().into(),
  212. threads,
  213. slurm_params: SlurmParams {
  214. job_name: Some("dorado_align".into()),
  215. cpus_per_task: Some(threads.into()),
  216. mem: Some("30G".into()),
  217. partition: Some("mediumq".into()),
  218. gres: None,
  219. },
  220. }
  221. }
  222. }
  223. impl super::Command for DoradoAlign {
  224. /// Validate input BAM and ensure the output does not already exist.
  225. fn init(&mut self) -> anyhow::Result<()> {
  226. if !self.input.exists() {
  227. anyhow::bail!(
  228. "The input BAM file is not accessible.\n{}",
  229. self.input.display()
  230. );
  231. }
  232. if self.output.exists() {
  233. anyhow::bail!(
  234. "The output BAM file already exists.\n{}",
  235. self.output.display()
  236. );
  237. }
  238. Ok(())
  239. }
  240. /// Build the Dorado aligner command.
  241. fn cmd(&self) -> String {
  242. format!(
  243. "{} aligner --threads {} --allow-sec-supp --mm2-opts '--secondary yes' {} {} > {}",
  244. self.dorado.display(),
  245. self.threads,
  246. self.reference.display(),
  247. self.input.display(),
  248. self.output.display()
  249. )
  250. }
  251. }
  252. impl super::LocalRunner for DoradoAlign {}
  253. impl super::LocalBatchRunner for DoradoAlign {}
  254. impl super::SlurmRunner for DoradoAlign {
  255. /// Default Slurm parameters for running the Dorado aligner.
  256. fn slurm_args(&self) -> Vec<String> {
  257. self.slurm_params.to_args()
  258. }
  259. }
  260. impl super::SbatchRunner for DoradoAlign {
  261. /// Default Slurm parameters for running the Dorado aligner.
  262. fn slurm_params(&self) -> SlurmParams {
  263. self.slurm_params.clone()
  264. }
  265. }
  266. #[cfg(test)]
  267. mod tests {
  268. use super::*;
  269. use crate::TEST_DIR;
  270. use log::info;
  271. use crate::{commands::SlurmRunner, config::Config, helpers::test_init};
  272. #[test]
  273. fn dorado_basecall() -> anyhow::Result<()> {
  274. test_init();
  275. let config = Config::default();
  276. let tmp_dir = config.tmp_dir.clone();
  277. let mut dca = DoradoBasecall::from_config(
  278. &config,
  279. format!("/mnt/beegfs02/scratch/t_steimle/data/runs/20260206_1540_P2I-00461-A_PBI80774_67adb0bc/pod5"),
  280. format!("/mnt/beegfs02/scratch/t_steimle/data/36122_unaligned.bam"),
  281. );
  282. info!("Basecalling");
  283. let out = SlurmRunner::exec(&mut dca)?;
  284. println!("{out:#?}");
  285. Ok(())
  286. }
  287. #[test]
  288. fn dorado_align() -> anyhow::Result<()> {
  289. test_init();
  290. let config = Config::default();
  291. let mut dca = DoradoAlign::from_config(
  292. &config,
  293. format!("/mnt/beegfs02/scratch/t_steimle/data/36122_unaligned.bam"),
  294. format!("/mnt/beegfs02/scratch/t_steimle/data/36122_aligned.bam"),
  295. );
  296. info!("Basecalling");
  297. let _out = SlurmRunner::exec(&mut dca)?;
  298. Ok(())
  299. }
  300. }