mod.rs 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105
  1. pub mod bcftools;
  2. pub mod dorado;
  3. // pub mod longphase;
  4. pub mod longphase;
  5. pub mod modkit;
  6. pub mod samtools;
  7. // pub mod wakhan;
  8. use std::{
  9. fmt::{self, Display, Formatter},
  10. fs::{self, File},
  11. io::{self, BufReader, Read, Seek, SeekFrom, Write},
  12. path::{Path, PathBuf},
  13. process::{Command as ProcessCommand, Stdio},
  14. sync::{mpsc, Arc, Mutex},
  15. thread,
  16. time::Duration,
  17. };
  18. /// A helper trait for commands that should be run through Slurm.
  19. ///
  20. /// Types implementing `SlurmRunner` must also implement [`Command`].
  21. /// The provided [`run`](SlurmRunner::run) method:
  22. ///
  23. /// 1. Calls [`Command::init`].
  24. /// 2. Runs `srun <slurm_args...> bash -lc "<cmd()>”`.
  25. /// 3. On success, calls [`Command::clean_up`].
  26. pub trait Command {
  27. fn init(&mut self) -> anyhow::Result<()> {
  28. Ok(())
  29. }
  30. fn cmd(&self) -> String;
  31. fn clean_up(&self) -> anyhow::Result<()> {
  32. Ok(())
  33. }
  34. }
  35. /// Local runner: execute the command directly on the machine.
  36. ///
  37. /// Usage:
  38. /// ```ignore
  39. /// my_command.run()?;
  40. /// ```
  41. pub trait LocalRunner: Command {
  42. /// The shell binary used to run commands.
  43. /// Override if you want to use `sh` or something else.
  44. fn shell(&self) -> &str {
  45. "bash"
  46. }
  47. /// Run locally with:
  48. /// <shell> -c "<cmd()>"
  49. fn exec(&mut self) -> anyhow::Result<CapturedOutput> {
  50. self.init()?;
  51. let mut cmd = ProcessCommand::new(self.shell());
  52. cmd.arg("-c")
  53. .arg(self.cmd())
  54. .stdout(Stdio::piped())
  55. .stderr(Stdio::piped());
  56. info!(
  57. "Running: {} {}",
  58. cmd.get_program().to_string_lossy(),
  59. cmd.get_args()
  60. .map(|arg| arg.to_string_lossy())
  61. .collect::<Vec<_>>()
  62. .join(" ")
  63. );
  64. let mut child = cmd.spawn().context("failed to spawn local command")?;
  65. let stdout = child.stdout.take().context("failed to capture stdout")?;
  66. let stderr = child.stderr.take().context("failed to capture stderr")?;
  67. let stderr_handle = thread::spawn(move || {
  68. let mut reader = BufReader::new(stderr);
  69. let mut buf = Vec::new();
  70. let mut line = Vec::new();
  71. let mut byte = [0u8; 1];
  72. loop {
  73. match reader.read(&mut byte) {
  74. Ok(0) => {
  75. if !line.is_empty() {
  76. let text = String::from_utf8_lossy(&line);
  77. eprint!("{}", text);
  78. buf.extend_from_slice(&line);
  79. }
  80. break;
  81. }
  82. Ok(_) => {
  83. line.push(byte[0]);
  84. if byte[0] == b'\n' {
  85. let text = String::from_utf8_lossy(&line);
  86. eprint!("{}", text);
  87. buf.extend_from_slice(&line);
  88. line.clear();
  89. }
  90. }
  91. Err(_) => break,
  92. }
  93. }
  94. String::from_utf8_lossy(&buf).to_string()
  95. });
  96. // --- stdout in a separate thread ---
  97. let stdout_handle = thread::spawn(move || {
  98. let mut reader = BufReader::new(stdout);
  99. let mut buf = Vec::new();
  100. let mut line = Vec::new();
  101. let mut byte = [0u8; 1];
  102. loop {
  103. match reader.read(&mut byte) {
  104. Ok(0) => {
  105. if !line.is_empty() {
  106. let text = String::from_utf8_lossy(&line);
  107. print!("{}", text);
  108. buf.extend_from_slice(&line);
  109. }
  110. break;
  111. }
  112. Ok(_) => {
  113. line.push(byte[0]);
  114. if byte[0] == b'\n' {
  115. let text = String::from_utf8_lossy(&line);
  116. print!("{}", text);
  117. buf.extend_from_slice(&line);
  118. line.clear();
  119. }
  120. }
  121. Err(_) => break,
  122. }
  123. }
  124. String::from_utf8_lossy(&buf).to_string()
  125. });
  126. // wait for process
  127. let status = child.wait().context("failed to wait on local command")?;
  128. // wait for stderr thread and collect stderr
  129. let captured_stdout = stdout_handle.join().unwrap_or_default();
  130. let captured_stderr = stderr_handle.join().unwrap_or_default();
  131. if !status.success() {
  132. anyhow::bail!("local command failed with status: {status}");
  133. }
  134. self.clean_up()?;
  135. Ok(CapturedOutput {
  136. stdout: captured_stdout,
  137. stderr: captured_stderr,
  138. slurm_epilog: None,
  139. })
  140. }
  141. }
  142. use anyhow::Context;
  143. use log::info;
  144. use rayon::iter::{IntoParallelIterator, ParallelIterator};
  145. use serde::{Deserialize, Serialize};
  146. use uuid::Uuid;
  147. /// Captured process output while it was being streamed live.
  148. #[derive(Debug, Default, Serialize, Deserialize)]
  149. pub struct CapturedOutput {
  150. pub stdout: String,
  151. pub stderr: String,
  152. pub slurm_epilog: Option<SlurmEpilog>,
  153. }
  154. impl CapturedOutput {
  155. /// Constructs a unique filename based on the base `path`, appending
  156. /// a truncated UUID v4 segment and the ".json" suffix.
  157. /// Saves the struct as JSON to the constructed path (creates parent dirs if needed).
  158. pub fn save_to_file(&self, base_path: impl AsRef<Path>) -> anyhow::Result<PathBuf> {
  159. let base_path = base_path.as_ref();
  160. let binding = Uuid::new_v4().to_string();
  161. let unique_id = binding.split('-').next().unwrap_or("default");
  162. let filename = format!(
  163. "{}_{}.json",
  164. base_path
  165. .file_name()
  166. .unwrap_or(base_path.as_os_str())
  167. .to_string_lossy(),
  168. unique_id
  169. );
  170. let mut full_path = PathBuf::from(base_path.parent().unwrap_or_else(|| Path::new("")));
  171. full_path.push(filename);
  172. if let Some(p) = full_path.parent() {
  173. fs::create_dir_all(p)
  174. .with_context(|| format!("Failed to create directory {}", p.display()))?;
  175. }
  176. let json = serde_json::to_string_pretty(self)?; // Using pretty for readability, remove _pretty if not needed
  177. fs::write(&full_path, json)
  178. .with_context(|| format!("Failed to write file {}", full_path.display()))?;
  179. Ok(full_path)
  180. }
  181. }
  182. impl Display for CapturedOutput {
  183. fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
  184. // 1. Write STDOUT
  185. writeln!(f, "--- STANDARD OUTPUT ---")?;
  186. if self.stdout.is_empty() {
  187. writeln!(f, "[No standard output was captured.]")?;
  188. } else {
  189. writeln!(f, "{}", self.stdout)?;
  190. }
  191. // 2. Write STDERR
  192. writeln!(f, "\n--- STANDARD ERROR ---")?;
  193. if self.stderr.is_empty() {
  194. writeln!(f, "[No standard error was captured.]")?;
  195. } else {
  196. writeln!(f, "{}", self.stderr)?;
  197. }
  198. // 3. Write SlurmEpilog if present
  199. if let Some(epilog) = &self.slurm_epilog {
  200. // Using a separate line break before the epilog for clarity
  201. writeln!(f, "\n{}", epilog)?;
  202. }
  203. Ok(())
  204. }
  205. }
  206. /// Super-trait adding a Slurm `run()` method on top of [`Command`].
  207. pub trait SlurmRunner: Command {
  208. /// Slurm launcher binary. Defaults to `srun`.
  209. ///
  210. /// Override if you want to use `sbatch` or another wrapper.
  211. fn slurm_bin(&self) -> &str {
  212. "srun"
  213. }
  214. /// Additional Slurm arguments (e.g. `--time=1:00:00`, `--cpus-per-task=4`).
  215. ///
  216. /// Default is no extra arguments.
  217. fn slurm_args(&self) -> Vec<String> {
  218. Vec::new()
  219. }
  220. /// Run the command through Slurm.
  221. ///
  222. /// The effective shell command is:
  223. ///
  224. /// ```text
  225. /// <slurm_bin> <slurm_args...> bash -lc "<cmd()>"
  226. /// ```
  227. fn exec(&mut self) -> anyhow::Result<CapturedOutput> {
  228. self.init()?;
  229. let mut cmd = ProcessCommand::new(self.slurm_bin());
  230. cmd.args(self.slurm_args())
  231. .arg("bash")
  232. .arg("-c")
  233. .arg(self.cmd())
  234. .stdout(Stdio::piped())
  235. .stderr(Stdio::piped());
  236. info!(
  237. "Running with Slurm: {} {}",
  238. cmd.get_program().to_string_lossy(),
  239. cmd.get_args()
  240. .map(|arg| arg.to_string_lossy())
  241. .collect::<Vec<_>>()
  242. .join(" ")
  243. );
  244. let mut child = cmd.spawn().context("failed to spawn slurm job")?;
  245. let stdout = child.stdout.take().context("failed to capture stdout")?;
  246. let stderr = child.stderr.take().context("failed to capture stderr")?;
  247. let stderr_handle = thread::spawn(move || {
  248. let mut reader = BufReader::new(stderr);
  249. let mut buf = Vec::new();
  250. let mut chunk = [0u8; 8192];
  251. let mut out = io::stderr();
  252. loop {
  253. match reader.read(&mut chunk) {
  254. Ok(0) => break, // EOF
  255. Ok(n) => {
  256. // keep a copy
  257. buf.extend_from_slice(&chunk[..n]);
  258. // forward to our stderr (preserves colors, \r, etc.)
  259. let _ = out.write_all(&chunk[..n]);
  260. let _ = out.flush();
  261. }
  262. Err(_) => break,
  263. }
  264. }
  265. String::from_utf8_lossy(&buf).to_string()
  266. });
  267. // --- stdout in a separate thread ---
  268. let stdout_handle = thread::spawn(move || {
  269. let mut reader = BufReader::new(stdout);
  270. let mut buf = Vec::new();
  271. let mut chunk = [0u8; 8192];
  272. let mut out = io::stdout();
  273. loop {
  274. match reader.read(&mut chunk) {
  275. Ok(0) => break, // EOF
  276. Ok(n) => {
  277. // capture
  278. buf.extend_from_slice(&chunk[..n]);
  279. // forward
  280. let _ = out.write_all(&chunk[..n]);
  281. let _ = out.flush();
  282. }
  283. Err(_) => break,
  284. }
  285. }
  286. String::from_utf8_lossy(&buf).to_string()
  287. });
  288. // wait for process
  289. let status = child.wait().context("failed to wait on local command")?;
  290. // wait for both threads and collect output
  291. let stdout = stdout_handle.join().unwrap_or_default();
  292. let stderr = stderr_handle.join().unwrap_or_default();
  293. if !status.success() {
  294. anyhow::bail!("slurm job failed with status: {status}");
  295. }
  296. self.clean_up()?;
  297. Ok(CapturedOutput {
  298. stdout,
  299. stderr,
  300. slurm_epilog: None,
  301. })
  302. }
  303. }
  /// Holds optional Slurm parameters that can be converted to CLI args.
  ///
  /// Example:
  /// ```ignore
  /// let params = SlurmParams {
  ///     job_name: Some("dorado_basecall".into()),
  ///     cpus_per_task: Some(47),
  ///     mem: Some("60G".into()),
  ///     partition: Some("gpgpuq".into()),
  ///     gres: Some("gpu:h100:4".into()),
  /// };
  ///
  /// let args = params.to_args();
  /// ```
  ///
  /// Produces:
  /// ```text
  /// --job-name=dorado_basecall
  /// --cpus-per-task=47
  /// --mem=60G
  /// --partition=gpgpuq
  /// --gres=gpu:h100:4
  /// ```
  #[derive(Debug, Clone, Default)]
  pub struct SlurmParams {
      /// Value for `--job-name`.
      pub job_name: Option<String>,
      /// Value for `--cpus-per-task`.
      pub cpus_per_task: Option<u32>,
      /// Value for `--mem` (e.g. `60G`).
      pub mem: Option<String>,
      /// Value for `--partition`.
      pub partition: Option<String>,
      /// Value for `--gres` (e.g. `gpu:h100:4`).
      pub gres: Option<String>,
  }

  impl SlurmParams {
      /// Convert all non-empty fields into Slurm CLI arguments.
      ///
      /// Fields that are `None` are skipped, and so are text fields that are
      /// empty or whitespace-only, so a dangling `--flag=` is never emitted.
      /// Arguments come out in the fixed order job name, CPUs per task, memory,
      /// partition, gres.
      pub fn to_args(&self) -> Vec<String> {
          // Appends `--<flag>=<value>` unless the value is unset or blank.
          fn push_text(args: &mut Vec<String>, flag: &str, value: Option<&str>) {
              if let Some(v) = value {
                  if !v.trim().is_empty() {
                      args.push(format!("--{}={}", flag, v));
                  }
              }
          }

          let mut args = Vec::new();
          push_text(&mut args, "job-name", self.job_name.as_deref());
          if let Some(cpus) = self.cpus_per_task {
              args.push(format!("--cpus-per-task={}", cpus));
          }
          push_text(&mut args, "mem", self.mem.as_deref());
          push_text(&mut args, "partition", self.partition.as_deref());
          push_text(&mut args, "gres", self.gres.as_deref());
          args
      }
  }
  357. pub trait SbatchRunner: Command {
  358. fn sbatch_bin(&self) -> &str {
  359. "sbatch"
  360. }
  361. fn squeue_bin(&self) -> &str {
  362. "squeue"
  363. }
  364. fn slurm_params(&self) -> SlurmParams {
  365. SlurmParams::default()
  366. }
  367. fn sbatch_extra_args(&self) -> Vec<String> {
  368. Vec::new()
  369. }
  370. /// Submit via sbatch, tail slurm-<jobid>.out "live", wait for job to finish,
  371. /// then return captured stdout.
  372. fn exec(&mut self) -> anyhow::Result<CapturedOutput> {
  373. self.init()?;
  374. // 1. sbatch --parsable --output=slurm-%j.out --wrap "<cmd>"
  375. let mut args = self.slurm_params().to_args();
  376. args.extend(self.sbatch_extra_args());
  377. args.push("--parsable".to_string());
  378. args.push("--output=slurm-%j.out".to_string());
  379. args.push("--wrap".to_string());
  380. args.push(self.cmd());
  381. let output = ProcessCommand::new(self.sbatch_bin())
  382. .args(&args)
  383. .output()
  384. .context("failed to spawn sbatch")?;
  385. let sbatch_stdout = String::from_utf8_lossy(&output.stdout).to_string();
  386. let sbatch_stderr = String::from_utf8_lossy(&output.stderr).to_string();
  387. if !output.status.success() {
  388. anyhow::bail!(
  389. "sbatch failed with status: {}\nstdout:\n{}\nstderr:\n{}",
  390. output.status,
  391. sbatch_stdout,
  392. sbatch_stderr
  393. );
  394. }
  395. // --parsable usually returns "<jobid>" or "<jobid>;..."
  396. let job_id = sbatch_stdout
  397. .trim()
  398. .split(';')
  399. .next()
  400. .unwrap_or("")
  401. .to_string();
  402. info!("Running jobid: {job_id}: {}", args.join(" "));
  403. if job_id.is_empty() {
  404. anyhow::bail!("failed to parse job id from sbatch output: {sbatch_stdout}");
  405. }
  406. let out_file = format!("slurm-{job_id}.out");
  407. // 2. Tail thread: emulate `tail -f slurm-<jobid>.out`
  408. let (stop_tx, stop_rx) = mpsc::channel::<()>();
  409. let out_path = out_file.clone();
  410. let captured_stdout = Arc::new(Mutex::new(String::new()));
  411. let captured_stdout_clone = Arc::clone(&captured_stdout);
  412. let job_id_clone = job_id.clone();
  413. let tail_handle = thread::spawn(move || -> anyhow::Result<()> {
  414. let mut pos: u64 = 0;
  415. let mut file_opened = false;
  416. loop {
  417. // stop signal?
  418. if stop_rx.try_recv().is_ok() {
  419. // Do one final read before stopping
  420. if let Ok(mut f) = File::open(&out_path) {
  421. if f.seek(SeekFrom::Start(pos)).is_ok() {
  422. let mut remaining = Vec::new();
  423. if f.read_to_end(&mut remaining).is_ok() && !remaining.is_empty() {
  424. let s = String::from_utf8_lossy(&remaining);
  425. let cleaned = clean_output(&s);
  426. for line in cleaned.lines() {
  427. let line = line.trim();
  428. if !line.is_empty() && !is_progress_line(line) {
  429. info!("{}: {}", job_id_clone, line);
  430. }
  431. }
  432. if let Ok(mut c) = captured_stdout_clone.lock() {
  433. c.push_str(&s);
  434. }
  435. }
  436. }
  437. }
  438. break;
  439. }
  440. match File::open(&out_path) {
  441. Ok(mut f) => {
  442. if !file_opened {
  443. file_opened = true;
  444. }
  445. // seek to last position and read new bytes
  446. if let Err(e) = f.seek(SeekFrom::Start(pos)) {
  447. anyhow::bail!("failed to seek in output file: {}", e);
  448. }
  449. let mut buf = [0u8; 8192];
  450. loop {
  451. match f.read(&mut buf) {
  452. Ok(0) => break, // EOF for now
  453. Ok(n) => {
  454. pos += n as u64;
  455. let bytes = &buf[..n];
  456. // forward to terminal
  457. let s = String::from_utf8_lossy(bytes);
  458. let cleaned = clean_output(&s);
  459. // Log non-progress lines
  460. for line in cleaned.lines() {
  461. let line = line.trim();
  462. if !line.is_empty() && !is_progress_line(line) {
  463. info!("{}: {}", job_id_clone, line);
  464. }
  465. }
  466. io::stdout().flush().ok();
  467. // capture
  468. if let Ok(mut c) = captured_stdout_clone.lock() {
  469. c.push_str(&s);
  470. }
  471. }
  472. Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
  473. Err(e) => {
  474. anyhow::bail!("error reading output file: {}", e);
  475. }
  476. }
  477. }
  478. }
  479. Err(e) if e.kind() == io::ErrorKind::NotFound => {
  480. // file not yet created; this is expected initially
  481. }
  482. Err(e) => {
  483. // Only bail if we've already opened the file once
  484. // (unexpected disappearance or permission issues)
  485. if file_opened {
  486. anyhow::bail!("error opening output file: {}", e);
  487. }
  488. }
  489. }
  490. thread::sleep(Duration::from_millis(500));
  491. }
  492. Ok(())
  493. });
  494. // 3. Poll squeue until job is no longer in the queue
  495. loop {
  496. let sq_out = ProcessCommand::new(self.squeue_bin())
  497. .args(["-j", &job_id])
  498. .output()
  499. .context("failed to run squeue")?;
  500. let out_str = String::from_utf8_lossy(&sq_out.stdout);
  501. // squeue prints a header line + job line if job exists
  502. // If job is gone, usually only the header OR nothing
  503. let has_job = out_str.lines().skip(1).any(|l| !l.trim().is_empty());
  504. if !has_job {
  505. break;
  506. }
  507. thread::sleep(Duration::from_secs(1));
  508. }
  509. // Wait for output file to stop growing (SLURM epilog may still be writing)
  510. let mut stable_count = 0;
  511. let mut last_size = 0u64;
  512. while stable_count < 2 {
  513. thread::sleep(Duration::from_millis(300));
  514. if let Ok(metadata) = std::fs::metadata(&out_file) {
  515. let current_size = metadata.len();
  516. if current_size == last_size {
  517. stable_count += 1;
  518. } else {
  519. stable_count = 0;
  520. last_size = current_size;
  521. }
  522. } else {
  523. // File doesn't exist yet, keep waiting
  524. stable_count = 0;
  525. }
  526. }
  527. // 4. Stop tail thread and join
  528. let _ = stop_tx.send(());
  529. match tail_handle.join() {
  530. Ok(Ok(())) => {}
  531. Ok(Err(e)) => return Err(e.context("tail thread failed")),
  532. Err(_) => anyhow::bail!("tail thread panicked"),
  533. }
  534. self.clean_up()?;
  535. let stdout = match Arc::try_unwrap(captured_stdout) {
  536. Ok(mutex) => mutex.into_inner().unwrap_or_default(),
  537. Err(arc) => arc.lock().unwrap().clone(),
  538. };
  539. // Remove the SLURM output file
  540. let _ = std::fs::remove_file(&out_file);
  541. let (stdout, slurm_epilog) = split_output(&stdout);
  542. Ok(CapturedOutput {
  543. stdout,
  544. stderr: String::new(), // all job output is in the .out file
  545. slurm_epilog,
  546. })
  547. }
  548. }
  /// Runs a command with the runner selected by the configuration.
  ///
  /// `run!(cfg, cmd)` evaluates to the result of `SbatchRunner::exec(cmd)` when
  /// `cfg.slurm_runner` is true and of `LocalRunner::exec(cmd)` otherwise, so
  /// `cmd` must be a `&mut` reference to a type implementing both traits.
  /// A trailing comma after `cmd` is accepted.
  #[macro_export]
  macro_rules! run {
      ($cfg:expr, $cmd:expr $(,)?) => {{
          if $cfg.slurm_runner {
              $crate::commands::SbatchRunner::exec($cmd)
          } else {
              $crate::commands::LocalRunner::exec($cmd)
          }
      }};
  }
  /// Clean output by handling carriage returns properly.
  ///
  /// Within each `\n`-separated line only the text after the last `\r` that is
  /// followed by more text is kept, simulating terminal behavior where `\r`
  /// moves the cursor to the start of the line and later text overwrites it.
  ///
  /// Trailing `\r` characters are dropped first: a bare carriage return at the
  /// end of a line (CRLF line endings, or a progress update that ends in `\r`)
  /// does not erase what is already on screen, so the line must not come out
  /// empty. The number of lines is preserved.
  fn clean_output(s: &str) -> String {
      s.split('\n')
          .map(|line| {
              line.trim_end_matches('\r')
                  .rsplit('\r')
                  .next()
                  .unwrap_or("")
          })
          .collect::<Vec<_>>()
          .join("\n")
  }
  /// Tells whether `line` is a progress indicator that should be kept out of the logs.
  ///
  /// A progress line starts with `>` and then either continues with
  /// ` Processing` or mentions `records` anywhere (which includes the
  /// `> Output records written:` counter).
  fn is_progress_line(line: &str) -> bool {
      match line.strip_prefix('>') {
          Some(rest) => rest.starts_with(" Processing") || rest.contains("records"),
          None => false,
      }
  }
  578. use std::collections::HashMap;
  579. use crate::config::Config;
  580. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
  581. pub struct SlurmEpilog {
  582. pub job_id: String,
  583. pub cluster: String,
  584. pub user: String,
  585. pub group: String,
  586. pub nodelist: String,
  587. pub cores: u32,
  588. pub job_started_at: String,
  589. pub job_ended_at: String,
  590. pub wall_clock_time: String,
  591. pub cpu_utilized: String,
  592. pub cpu_efficiency: String,
  593. pub memory_utilized: String,
  594. pub memory_efficiency: String,
  595. }
  596. impl SlurmEpilog {
  597. pub fn parse(output: &str) -> Option<Self> {
  598. let epilog_start = output.find("SLURM EPILOG")?;
  599. let epilog_section = &output[epilog_start..];
  600. let mut fields: HashMap<String, String> = HashMap::new();
  601. for line in epilog_section.lines() {
  602. if let Some((key, value)) = Self::parse_line(line) {
  603. fields.insert(key.to_lowercase(), value);
  604. }
  605. }
  606. let get =
  607. |keys: &[&str]| -> Option<String> { keys.iter().find_map(|k| fields.get(*k).cloned()) };
  608. let (user, group) = get(&["user/group"])
  609. .and_then(|s| {
  610. let mut parts = s.splitn(2, '/');
  611. Some((parts.next()?.to_string(), parts.next()?.to_string()))
  612. })
  613. .unwrap_or_default();
  614. Some(SlurmEpilog {
  615. job_id: get(&["job id", "jobid"]).unwrap_or_default(),
  616. cluster: get(&["cluster"]).unwrap_or_default(),
  617. user,
  618. group,
  619. nodelist: get(&["nodelist", "node list"]).unwrap_or_default(),
  620. cores: get(&["cores per node", "cores per task", "cores"])
  621. .and_then(|v| v.parse().ok())
  622. .unwrap_or(0),
  623. job_started_at: get(&["job started at", "start time"]).unwrap_or_default(),
  624. job_ended_at: get(&["job ended at", "end time"]).unwrap_or_default(),
  625. wall_clock_time: get(&["job wall-clock time", "wall-clock time", "elapsed"])
  626. .unwrap_or_default(),
  627. cpu_utilized: get(&["cpu utilized", "cpu time"]).unwrap_or_default(),
  628. cpu_efficiency: get(&["cpu efficiency"]).unwrap_or_default(),
  629. memory_utilized: get(&["memory utilized", "mem utilized"]).unwrap_or_default(),
  630. memory_efficiency: get(&["memory efficiency", "mem efficiency"]).unwrap_or_default(),
  631. })
  632. }
  633. fn parse_line(line: &str) -> Option<(String, String)> {
  634. // Strip box-drawing chars and whitespace
  635. let line = line.trim_matches(|c: char| c == '|' || c == '-' || c.is_whitespace());
  636. if line.is_empty() || line.contains("SLURM EPILOG") {
  637. return None;
  638. }
  639. let (key, value) = line.split_once(':')?;
  640. let key = key.trim().to_lowercase();
  641. let value = value.trim().to_string();
  642. if key.is_empty() || value.is_empty() {
  643. return None;
  644. }
  645. Some((key, value))
  646. }
  647. }
  648. // impl SlurmEpilog {
  649. // /// Parse SLURM epilog from output string
  650. // pub fn parse(output: &str) -> Option<Self> {
  651. // // Find the epilog section
  652. // let epilog_start = output.find("SLURM EPILOG")?;
  653. // let epilog_section = &output[epilog_start..];
  654. //
  655. // // Parse key-value pairs
  656. // let mut fields: HashMap<String, String> = HashMap::new();
  657. //
  658. // for line in epilog_section.lines() {
  659. // if let Some((key, value)) = Self::parse_line(line) {
  660. // fields.insert(key, value);
  661. // }
  662. // }
  663. //
  664. // // Extract user/group
  665. // let (user, group) = fields.get("User/Group").and_then(|s| {
  666. // let parts: Vec<&str> = s.split('/').collect();
  667. // if parts.len() == 2 {
  668. // Some((parts[0].to_string(), parts[1].to_string()))
  669. // } else {
  670. // None
  671. // }
  672. // })?;
  673. //
  674. // Some(SlurmEpilog {
  675. // job_id: fields.get("Job ID")?.clone(),
  676. // cluster: fields.get("Cluster")?.clone(),
  677. // user,
  678. // group,
  679. // nodelist: fields.get("Nodelist")?.clone(),
  680. // cores: fields.get("Cores")?.parse().ok()?,
  681. // job_started_at: fields.get("Job started at")?.clone(),
  682. // job_ended_at: fields.get("Job ended at")?.clone(),
  683. // wall_clock_time: fields.get("Job Wall-clock time")?.clone(),
  684. // cpu_utilized: fields.get("CPU Utilized")?.clone(),
  685. // cpu_efficiency: fields.get("CPU Efficiency")?.clone(),
  686. // memory_utilized: fields.get("Memory Utilized")?.clone(),
  687. // memory_efficiency: fields.get("Memory Efficiency")?.clone(),
  688. // })
  689. // }
  690. //
  691. // fn parse_line(line: &str) -> Option<(String, String)> {
  692. // // Skip lines that are just decorations or headers
  693. // if line.contains("---") || line.contains("SLURM EPILOG") || line.trim().is_empty() {
  694. // return None;
  695. // }
  696. //
  697. // // Look for "Key: Value" pattern
  698. // let parts: Vec<&str> = line.splitn(2, ':').collect();
  699. // if parts.len() == 2 {
  700. // let key = parts[0].trim().to_string();
  701. // let value = parts[1].trim().to_string();
  702. // if !key.is_empty() && !value.is_empty() {
  703. // return Some((key, value));
  704. // }
  705. // }
  706. //
  707. // None
  708. // }
  709. // }
  710. impl Display for SlurmEpilog {
  711. fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
  712. writeln!(f, "--- SLURM JOB EPILOG (ID: {}) ---", self.job_id)?;
  713. writeln!(f, " Cluster: {}", self.cluster)?;
  714. writeln!(f, " User: {} (Group: {})", self.user, self.group)?;
  715. writeln!(f, " Nodelist: {}", self.nodelist)?;
  716. writeln!(f, " Cores Used: {}", self.cores)?;
  717. writeln!(f)?;
  718. writeln!(f, " --- Timestamps ---")?;
  719. writeln!(f, " Started At: {}", self.job_started_at)?;
  720. writeln!(f, " Ended At: {}", self.job_ended_at)?;
  721. writeln!(f, " Wall Time: {}", self.wall_clock_time)?;
  722. writeln!(f)?;
  723. writeln!(f, " --- Resource Utilization ---")?;
  724. writeln!(f, " CPU Utilized: {}", self.cpu_utilized)?;
  725. writeln!(f, " CPU Efficiency: {}", self.cpu_efficiency)?;
  726. writeln!(f, " Memory Utilized: {}", self.memory_utilized)?;
  727. writeln!(f, " Memory Efficiency:{}", self.memory_efficiency)?;
  728. writeln!(f, "------------------------------------")
  729. }
  730. }
  731. /// Split output into job output and epilog
  732. pub fn split_output(full_output: &str) -> (String, Option<SlurmEpilog>) {
  733. if let Some(epilog_start) = full_output.find("----------------------------------------------") {
  734. // Check if this is actually the epilog header
  735. if full_output[epilog_start..].contains("SLURM EPILOG") {
  736. let job_output = full_output[..epilog_start].trim_end().to_string();
  737. let epilog = SlurmEpilog::parse(full_output);
  738. return (job_output, epilog);
  739. }
  740. }
  741. // No epilog found, return full output
  742. (full_output.to_string(), None)
  743. }
  744. /// Run multiple SbatchRunner jobs in parallel.
  745. ///
  746. /// Each job:
  747. /// - calls `init()`
  748. /// - runs `sbatch --wait ... --wrap "<cmd>"`
  749. /// - streams its output to this process' stdout/stderr
  750. /// - returns `CapturedOutput`
  751. ///
  752. /// NOTE:
  753. /// - Outputs from different jobs may interleave on the terminal.
  754. /// - Slurm still decides scheduling order (queue vs run).
  755. pub fn run_many_sbatch<T>(jobs: Vec<T>, config: &Config) -> anyhow::Result<Vec<CapturedOutput>>
  756. where
  757. T: SbatchRunner + Send + 'static,
  758. {
  759. // let mut handles = Vec::with_capacity(jobs.len());
  760. //
  761. // for mut job in jobs.drain(..) {
  762. // let handle = thread::spawn(move || -> anyhow::Result<CapturedOutput> {
  763. // SbatchRunner::exec(&mut job)
  764. // });
  765. // handles.push(handle);
  766. // }
  767. //
  768. // let mut results = Vec::with_capacity(handles.len());
  769. // for h in handles {
  770. // // propagate the first error you hit
  771. // let res = h.join().expect("thread panicked")?;
  772. // results.push(res);
  773. // }
  774. // Rule for the IGR cluster because the admins dont want to implement proper rules so we are
  775. let max_parallel = config.slurm_max_par;
  776. let pool = rayon::ThreadPoolBuilder::new()
  777. .num_threads(max_parallel.into())
  778. .build()
  779. .context("failed to build rayon thread pool")?;
  780. pool.install(|| {
  781. jobs.into_par_iter()
  782. .map(|mut job| SbatchRunner::exec(&mut job))
  783. .collect::<anyhow::Result<Vec<_>>>()
  784. })
  785. }
  786. /// Local batch runner: execute the command directly on the machine,
  787. /// mimicking SbatchRunner behavior (output to file, async monitoring).
  788. pub trait LocalBatchRunner: Command {
  789. fn shell(&self) -> &str {
  790. "bash"
  791. }
  792. /// Output file pattern. Use `{}` as placeholder for job ID (UUID).
  793. fn output_pattern(&self) -> &str {
  794. "local-{}.out"
  795. }
  796. fn run(&mut self) -> anyhow::Result<CapturedOutput> {
  797. self.init()?;
  798. let job_id = Uuid::new_v4()
  799. .to_string()
  800. .split('-')
  801. .next()
  802. .unwrap_or("local")
  803. .to_string();
  804. let out_file = self.output_pattern().replace("{}", &job_id);
  805. info!("Running local batch job: {job_id}");
  806. let out_file_handle = File::create(&out_file)
  807. .with_context(|| format!("failed to create output file: {out_file}"))?;
  808. let out_file_write = out_file_handle
  809. .try_clone()
  810. .context("failed to clone file handle for stderr")?;
  811. let mut child = ProcessCommand::new(self.shell())
  812. .arg("-c")
  813. .arg(self.cmd())
  814. .stdout(Stdio::from(out_file_handle))
  815. .stderr(Stdio::from(out_file_write))
  816. .spawn()
  817. .context("failed to spawn local batch command")?;
  818. let (stop_tx, stop_rx) = mpsc::channel::<()>();
  819. let out_path = out_file.clone();
  820. let captured_stdout = Arc::new(Mutex::new(String::new()));
  821. let captured_stdout_clone = Arc::clone(&captured_stdout);
  822. let job_id_clone = job_id.clone();
  823. let tail_handle = thread::spawn(move || -> anyhow::Result<()> {
  824. let mut pos: u64 = 0;
  825. loop {
  826. if stop_rx.try_recv().is_ok() {
  827. if let Ok(mut f) = File::open(&out_path) {
  828. if f.seek(SeekFrom::Start(pos)).is_ok() {
  829. let mut remaining = Vec::new();
  830. if f.read_to_end(&mut remaining).is_ok() && !remaining.is_empty() {
  831. let s = String::from_utf8_lossy(&remaining);
  832. let cleaned = clean_output(&s);
  833. for line in cleaned.lines() {
  834. let line = line.trim();
  835. if !line.is_empty() && !is_progress_line(line) {
  836. info!("{}: {}", job_id_clone, line);
  837. }
  838. }
  839. if let Ok(mut c) = captured_stdout_clone.lock() {
  840. c.push_str(&s);
  841. }
  842. }
  843. }
  844. }
  845. break;
  846. }
  847. if let Ok(mut f) = File::open(&out_path) {
  848. if f.seek(SeekFrom::Start(pos)).is_ok() {
  849. let mut buf = [0u8; 8192];
  850. loop {
  851. match f.read(&mut buf) {
  852. Ok(0) => break,
  853. Ok(n) => {
  854. pos += n as u64;
  855. let s = String::from_utf8_lossy(&buf[..n]);
  856. let cleaned = clean_output(&s);
  857. for line in cleaned.lines() {
  858. let line = line.trim();
  859. if !line.is_empty() && !is_progress_line(line) {
  860. info!("{}: {}", job_id_clone, line);
  861. }
  862. }
  863. if let Ok(mut c) = captured_stdout_clone.lock() {
  864. c.push_str(&s);
  865. }
  866. }
  867. Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
  868. Err(_) => break,
  869. }
  870. }
  871. }
  872. }
  873. thread::sleep(Duration::from_millis(100));
  874. }
  875. Ok(())
  876. });
  877. let status = child
  878. .wait()
  879. .context("failed to wait on local batch command")?;
  880. let _ = stop_tx.send(());
  881. match tail_handle.join() {
  882. Ok(Ok(())) => {}
  883. Ok(Err(e)) => return Err(e.context("tail thread failed")),
  884. Err(_) => anyhow::bail!("tail thread panicked"),
  885. }
  886. if !status.success() {
  887. anyhow::bail!("local batch command failed with status: {status}");
  888. }
  889. self.clean_up()?;
  890. let stdout = match Arc::try_unwrap(captured_stdout) {
  891. Ok(mutex) => mutex.into_inner().unwrap_or_default(),
  892. Err(arc) => arc.lock().unwrap_or_else(|e| e.into_inner()).clone(),
  893. };
  894. let _ = std::fs::remove_file(&out_file);
  895. Ok(CapturedOutput {
  896. stdout,
  897. stderr: String::new(),
  898. slurm_epilog: None,
  899. })
  900. }
  901. }
  902. pub fn run_many_local_batch<T>(jobs: Vec<T>) -> anyhow::Result<Vec<CapturedOutput>>
  903. where
  904. T: LocalBatchRunner + Send,
  905. {
  906. // Set thread pool size based on max_concurrent
  907. // let pool = rayon::ThreadPoolBuilder::new()
  908. // .num_threads(threads)
  909. // .build()
  910. // .context("failed to build thread pool")?;
  911. // pool.install(|| {
  912. jobs.into_iter()
  913. .map(|mut job| LocalBatchRunner::run(&mut job))
  914. .collect()
  915. // })
  916. }
  /// Dispatch a batch of jobs either to SLURM or to the local machine,
  /// depending on the `slurm_runner` flag of the configuration.
  ///
  /// * `$cfg` — the configuration; its `slurm_runner` field is read, and on
  ///   the SLURM path the same expression is passed on to `run_many_sbatch`.
  ///   The expression is therefore expanded twice on that path, so pass
  ///   something free of side effects (a plain reference or variable).
  /// * `$jobs` — the collection of jobs, expanded exactly once in whichever
  ///   branch is taken.
  ///
  /// Evaluates to the result of `run_many_sbatch` or `run_many_local_batch`.
  ///
  /// Usage:
  /// ```ignore
  /// let results = run_many!(cfg, jobs)?;
  ///
  /// ```
  #[macro_export]
  macro_rules! run_many {
      ($cfg:expr, $jobs:expr) => {{
          let use_slurm: bool = $cfg.slurm_runner;
          if use_slurm {
              $crate::commands::run_many_sbatch($jobs, $cfg)
          } else {
              $crate::commands::run_many_local_batch($jobs)
          }
      }};
  }
  #[cfg(test)]
  mod tests {
      use super::{Command, SbatchRunner, SlurmParams};

      /// Minimal job used to exercise `SbatchRunner::exec`: it only echoes a
      /// fixed marker string.
      struct EchoJob;

      impl Command for EchoJob {
          fn cmd(&self) -> String {
              String::from("echo 'hello from sbatch'")
          }
      }

      impl SbatchRunner for EchoJob {
          /// Smallest resource request: one CPU, 1G of memory, no GRES, on the
          /// `gpgpuq` partition.
          fn slurm_params(&self) -> SlurmParams {
              SlurmParams {
                  job_name: Some("test-sbatch".into()),
                  partition: Some("gpgpuq".into()),
                  cpus_per_task: Some(1),
                  mem: Some("1G".into()),
                  gres: None,
              }
          }
      }

      /// Runs the echo job through `SbatchRunner::exec` and checks that the
      /// marker shows up in the captured stdout.
      ///
      /// NOTE(review): presumably this submits a real job, so it needs a
      /// reachable SLURM setup with a `gpgpuq` partition — confirm before
      /// running it in CI.
      #[test]
      fn sbatch_test_run() -> anyhow::Result<()> {
          let mut job = EchoJob;
          let captured = <EchoJob as SbatchRunner>::exec(&mut job)?;
          println!("{captured:#?}");
          assert!(captured.stdout.contains("hello from sbatch"));
          Ok(())
      }
  }