mod.rs 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040
  1. pub mod bcftools;
  2. pub mod dorado;
  3. // pub mod longphase;
  4. pub mod modkit;
  5. pub mod samtools;
  6. pub mod longphase;
  7. // pub mod wakhan;
  8. use std::{
  9. fmt::{self, Display, Formatter},
  10. fs::{self, File},
  11. io::{self, BufReader, Read, Seek, SeekFrom, Write},
  12. path::{Path, PathBuf},
  13. process::{Command as ProcessCommand, Stdio},
  14. sync::{mpsc, Arc, Mutex},
  15. thread,
  16. time::Duration,
  17. };
  18. /// A helper trait for commands that should be run through Slurm.
  19. ///
  20. /// Types implementing `SlurmRunner` must also implement [`Command`].
  21. /// The provided [`run`](SlurmRunner::run) method:
  22. ///
  23. /// 1. Calls [`Command::init`].
  24. /// 2. Runs `srun <slurm_args...> bash -lc "<cmd()>”`.
  25. /// 3. On success, calls [`Command::clean_up`].
  26. pub trait Command {
  27. fn init(&mut self) -> anyhow::Result<()> {
  28. Ok(())
  29. }
  30. fn cmd(&self) -> String;
  31. fn clean_up(&self) -> anyhow::Result<()> {
  32. Ok(())
  33. }
  34. }
  35. /// Local runner: execute the command directly on the machine.
  36. ///
  37. /// Usage:
  38. /// ```ignore
  39. /// my_command.run()?;
  40. /// ```
  41. pub trait LocalRunner: Command {
  42. /// The shell binary used to run commands.
  43. /// Override if you want to use `sh` or something else.
  44. fn shell(&self) -> &str {
  45. "bash"
  46. }
  47. /// Run locally with:
  48. /// <shell> -c "<cmd()>"
  49. fn exec(&mut self) -> anyhow::Result<CapturedOutput> {
  50. self.init()?;
  51. let mut cmd = ProcessCommand::new(self.shell());
  52. cmd.arg("-c")
  53. .arg(self.cmd())
  54. .stdout(Stdio::piped())
  55. .stderr(Stdio::piped());
  56. info!(
  57. "Running: {} {}",
  58. cmd.get_program().to_string_lossy(),
  59. cmd.get_args()
  60. .map(|arg| arg.to_string_lossy())
  61. .collect::<Vec<_>>()
  62. .join(" ")
  63. );
  64. let mut child = cmd.spawn().context("failed to spawn local command")?;
  65. let stdout = child.stdout.take().context("failed to capture stdout")?;
  66. let stderr = child.stderr.take().context("failed to capture stderr")?;
  67. let stderr_handle = thread::spawn(move || {
  68. let mut reader = BufReader::new(stderr);
  69. let mut buf = Vec::new();
  70. let mut line = Vec::new();
  71. let mut byte = [0u8; 1];
  72. loop {
  73. match reader.read(&mut byte) {
  74. Ok(0) => {
  75. if !line.is_empty() {
  76. let text = String::from_utf8_lossy(&line);
  77. eprint!("{}", text);
  78. buf.extend_from_slice(&line);
  79. }
  80. break;
  81. }
  82. Ok(_) => {
  83. line.push(byte[0]);
  84. if byte[0] == b'\n' {
  85. let text = String::from_utf8_lossy(&line);
  86. eprint!("{}", text);
  87. buf.extend_from_slice(&line);
  88. line.clear();
  89. }
  90. }
  91. Err(_) => break,
  92. }
  93. }
  94. String::from_utf8_lossy(&buf).to_string()
  95. });
  96. // --- stdout in a separate thread ---
  97. let stdout_handle = thread::spawn(move || {
  98. let mut reader = BufReader::new(stdout);
  99. let mut buf = Vec::new();
  100. let mut line = Vec::new();
  101. let mut byte = [0u8; 1];
  102. loop {
  103. match reader.read(&mut byte) {
  104. Ok(0) => {
  105. if !line.is_empty() {
  106. let text = String::from_utf8_lossy(&line);
  107. print!("{}", text);
  108. buf.extend_from_slice(&line);
  109. }
  110. break;
  111. }
  112. Ok(_) => {
  113. line.push(byte[0]);
  114. if byte[0] == b'\n' {
  115. let text = String::from_utf8_lossy(&line);
  116. print!("{}", text);
  117. buf.extend_from_slice(&line);
  118. line.clear();
  119. }
  120. }
  121. Err(_) => break,
  122. }
  123. }
  124. String::from_utf8_lossy(&buf).to_string()
  125. });
  126. // wait for process
  127. let status = child.wait().context("failed to wait on local command")?;
  128. // wait for stderr thread and collect stderr
  129. let captured_stdout = stdout_handle.join().unwrap_or_default();
  130. let captured_stderr = stderr_handle.join().unwrap_or_default();
  131. if !status.success() {
  132. anyhow::bail!("local command failed with status: {status}");
  133. }
  134. self.clean_up()?;
  135. Ok(CapturedOutput {
  136. stdout: captured_stdout,
  137. stderr: captured_stderr,
  138. slurm_epilog: None,
  139. })
  140. }
  141. }
  142. use anyhow::Context;
  143. use log::info;
  144. use rayon::iter::{IntoParallelIterator, ParallelIterator};
  145. use serde::{Deserialize, Serialize};
  146. use uuid::Uuid;
  147. /// Captured process output while it was being streamed live.
  148. #[derive(Debug, Default, Serialize, Deserialize)]
  149. pub struct CapturedOutput {
  150. pub stdout: String,
  151. pub stderr: String,
  152. pub slurm_epilog: Option<SlurmEpilog>,
  153. }
  154. impl CapturedOutput {
  155. /// Constructs a unique filename based on the base `path`, appending
  156. /// a truncated UUID v4 segment and the ".json" suffix.
  157. /// Saves the struct as JSON to the constructed path (creates parent dirs if needed).
  158. pub fn save_to_file(&self, base_path: impl AsRef<Path>) -> anyhow::Result<PathBuf> {
  159. let base_path = base_path.as_ref();
  160. let binding = Uuid::new_v4().to_string();
  161. let unique_id = binding.split('-').next().unwrap_or("default");
  162. let filename = format!(
  163. "{}_{}.json",
  164. base_path
  165. .file_name()
  166. .unwrap_or(base_path.as_os_str())
  167. .to_string_lossy(),
  168. unique_id
  169. );
  170. let mut full_path = PathBuf::from(base_path.parent().unwrap_or_else(|| Path::new("")));
  171. full_path.push(filename);
  172. if let Some(p) = full_path.parent() {
  173. fs::create_dir_all(p)
  174. .with_context(|| format!("Failed to create directory {}", p.display()))?;
  175. }
  176. let json = serde_json::to_string_pretty(self)?; // Using pretty for readability, remove _pretty if not needed
  177. fs::write(&full_path, json)
  178. .with_context(|| format!("Failed to write file {}", full_path.display()))?;
  179. Ok(full_path)
  180. }
  181. }
  182. impl Display for CapturedOutput {
  183. fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
  184. // 1. Write STDOUT
  185. writeln!(f, "--- STANDARD OUTPUT ---")?;
  186. if self.stdout.is_empty() {
  187. writeln!(f, "[No standard output was captured.]")?;
  188. } else {
  189. writeln!(f, "{}", self.stdout)?;
  190. }
  191. // 2. Write STDERR
  192. writeln!(f, "\n--- STANDARD ERROR ---")?;
  193. if self.stderr.is_empty() {
  194. writeln!(f, "[No standard error was captured.]")?;
  195. } else {
  196. writeln!(f, "{}", self.stderr)?;
  197. }
  198. // 3. Write SlurmEpilog if present
  199. if let Some(epilog) = &self.slurm_epilog {
  200. // Using a separate line break before the epilog for clarity
  201. writeln!(f, "\n{}", epilog)?;
  202. }
  203. Ok(())
  204. }
  205. }
  206. /// Super-trait adding a Slurm `run()` method on top of [`Command`].
  207. pub trait SlurmRunner: Command {
  208. /// Slurm launcher binary. Defaults to `srun`.
  209. ///
  210. /// Override if you want to use `sbatch` or another wrapper.
  211. fn slurm_bin(&self) -> &str {
  212. "srun"
  213. }
  214. /// Additional Slurm arguments (e.g. `--time=1:00:00`, `--cpus-per-task=4`).
  215. ///
  216. /// Default is no extra arguments.
  217. fn slurm_args(&self) -> Vec<String> {
  218. Vec::new()
  219. }
  220. /// Run the command through Slurm.
  221. ///
  222. /// The effective shell command is:
  223. ///
  224. /// ```text
  225. /// <slurm_bin> <slurm_args...> bash -lc "<cmd()>"
  226. /// ```
  227. fn exec(&mut self) -> anyhow::Result<CapturedOutput> {
  228. self.init()?;
  229. let mut cmd = ProcessCommand::new(self.slurm_bin());
  230. cmd.args(self.slurm_args())
  231. .arg("bash")
  232. .arg("-c")
  233. .arg(self.cmd())
  234. .stdout(Stdio::piped())
  235. .stderr(Stdio::piped());
  236. info!(
  237. "Running with Slurm: {} {}",
  238. cmd.get_program().to_string_lossy(),
  239. cmd.get_args()
  240. .map(|arg| arg.to_string_lossy())
  241. .collect::<Vec<_>>()
  242. .join(" ")
  243. );
  244. let mut child = cmd.spawn().context("failed to spawn slurm job")?;
  245. let stdout = child.stdout.take().context("failed to capture stdout")?;
  246. let stderr = child.stderr.take().context("failed to capture stderr")?;
  247. let stderr_handle = thread::spawn(move || {
  248. let mut reader = BufReader::new(stderr);
  249. let mut buf = Vec::new();
  250. let mut chunk = [0u8; 8192];
  251. let mut out = io::stderr();
  252. loop {
  253. match reader.read(&mut chunk) {
  254. Ok(0) => break, // EOF
  255. Ok(n) => {
  256. // keep a copy
  257. buf.extend_from_slice(&chunk[..n]);
  258. // forward to our stderr (preserves colors, \r, etc.)
  259. let _ = out.write_all(&chunk[..n]);
  260. let _ = out.flush();
  261. }
  262. Err(_) => break,
  263. }
  264. }
  265. String::from_utf8_lossy(&buf).to_string()
  266. });
  267. // --- stdout in a separate thread ---
  268. let stdout_handle = thread::spawn(move || {
  269. let mut reader = BufReader::new(stdout);
  270. let mut buf = Vec::new();
  271. let mut chunk = [0u8; 8192];
  272. let mut out = io::stdout();
  273. loop {
  274. match reader.read(&mut chunk) {
  275. Ok(0) => break, // EOF
  276. Ok(n) => {
  277. // capture
  278. buf.extend_from_slice(&chunk[..n]);
  279. // forward
  280. let _ = out.write_all(&chunk[..n]);
  281. let _ = out.flush();
  282. }
  283. Err(_) => break,
  284. }
  285. }
  286. String::from_utf8_lossy(&buf).to_string()
  287. });
  288. // wait for process
  289. let status = child.wait().context("failed to wait on local command")?;
  290. // wait for both threads and collect output
  291. let stdout = stdout_handle.join().unwrap_or_default();
  292. let stderr = stderr_handle.join().unwrap_or_default();
  293. if !status.success() {
  294. anyhow::bail!("slurm job failed with status: {status}");
  295. }
  296. self.clean_up()?;
  297. Ok(CapturedOutput {
  298. stdout,
  299. stderr,
  300. slurm_epilog: None,
  301. })
  302. }
  303. }
  304. /// Holds optional Slurm parameters that can be converted to CLI args.
  305. ///
  306. /// Example:
  307. /// ```rust
  308. /// let params = SlurmParams {
  309. /// job_name: Some("dorado_basecall".into()),
  310. /// cpus_per_task: Some(47),
  311. /// mem: Some("60G".into()),
  312. /// partition: Some("gpgpuq".into()),
  313. /// gres: Some("gpu:h100:4".into()),
  314. /// };
  315. ///
  316. /// let args = params.to_args();
  317. /// ```
  318. ///
  319. /// Produces:
  320. /// ```text
  321. /// --job-name=dorado_basecall
  322. /// --cpus-per-task=47
  323. /// --mem=60G
  324. /// --partition=gpgpuq
  325. /// --gres=gpu:h100:4
  326. /// ```
  327. #[derive(Debug, Clone, Default)]
  328. pub struct SlurmParams {
  329. pub job_name: Option<String>,
  330. pub cpus_per_task: Option<u32>,
  331. pub mem: Option<String>,
  332. pub partition: Option<String>,
  333. pub gres: Option<String>,
  334. }
  335. impl SlurmParams {
  336. /// Convert all non-empty fields into Slurm CLI arguments.
  337. pub fn to_args(&self) -> Vec<String> {
  338. let mut args = Vec::new();
  339. if let Some(v) = &self.job_name {
  340. args.push(format!("--job-name={v}"));
  341. }
  342. if let Some(v) = self.cpus_per_task {
  343. args.push(format!("--cpus-per-task={v}"));
  344. }
  345. if let Some(v) = &self.mem {
  346. args.push(format!("--mem={v}"));
  347. }
  348. if let Some(v) = &self.partition {
  349. args.push(format!("--partition={v}"));
  350. }
  351. if let Some(v) = &self.gres {
  352. args.push(format!("--gres={v}"));
  353. }
  354. args
  355. }
  356. }
  357. pub trait SbatchRunner: Command {
  358. fn sbatch_bin(&self) -> &str {
  359. "sbatch"
  360. }
  361. fn squeue_bin(&self) -> &str {
  362. "squeue"
  363. }
  364. fn slurm_params(&self) -> SlurmParams {
  365. SlurmParams::default()
  366. }
  367. fn sbatch_extra_args(&self) -> Vec<String> {
  368. Vec::new()
  369. }
  370. /// Submit via sbatch, tail slurm-<jobid>.out "live", wait for job to finish,
  371. /// then return captured stdout.
  372. fn exec(&mut self) -> anyhow::Result<CapturedOutput> {
  373. self.init()?;
  374. // 1. sbatch --parsable --output=slurm-%j.out --wrap "<cmd>"
  375. let mut args = self.slurm_params().to_args();
  376. args.extend(self.sbatch_extra_args());
  377. args.push("--parsable".to_string());
  378. args.push("--output=slurm-%j.out".to_string());
  379. args.push("--wrap".to_string());
  380. args.push(self.cmd());
  381. let output = ProcessCommand::new(self.sbatch_bin())
  382. .args(&args)
  383. .output()
  384. .context("failed to spawn sbatch")?;
  385. let sbatch_stdout = String::from_utf8_lossy(&output.stdout).to_string();
  386. let sbatch_stderr = String::from_utf8_lossy(&output.stderr).to_string();
  387. if !output.status.success() {
  388. anyhow::bail!(
  389. "sbatch failed with status: {}\nstdout:\n{}\nstderr:\n{}",
  390. output.status,
  391. sbatch_stdout,
  392. sbatch_stderr
  393. );
  394. }
  395. // --parsable usually returns "<jobid>" or "<jobid>;..."
  396. let job_id = sbatch_stdout
  397. .trim()
  398. .split(';')
  399. .next()
  400. .unwrap_or("")
  401. .to_string();
  402. info!("Running jobid: {job_id}: {}", args.join(" "));
  403. if job_id.is_empty() {
  404. anyhow::bail!("failed to parse job id from sbatch output: {sbatch_stdout}");
  405. }
  406. let out_file = format!("slurm-{job_id}.out");
  407. // 2. Tail thread: emulate `tail -f slurm-<jobid>.out`
  408. let (stop_tx, stop_rx) = mpsc::channel::<()>();
  409. let out_path = out_file.clone();
  410. let captured_stdout = Arc::new(Mutex::new(String::new()));
  411. let captured_stdout_clone = Arc::clone(&captured_stdout);
  412. let job_id_clone = job_id.clone();
  413. let tail_handle = thread::spawn(move || -> anyhow::Result<()> {
  414. let mut pos: u64 = 0;
  415. let mut file_opened = false;
  416. loop {
  417. // stop signal?
  418. if stop_rx.try_recv().is_ok() {
  419. // Do one final read before stopping
  420. if let Ok(mut f) = File::open(&out_path) {
  421. if f.seek(SeekFrom::Start(pos)).is_ok() {
  422. let mut remaining = Vec::new();
  423. if f.read_to_end(&mut remaining).is_ok() && !remaining.is_empty() {
  424. let s = String::from_utf8_lossy(&remaining);
  425. let cleaned = clean_output(&s);
  426. for line in cleaned.lines() {
  427. let line = line.trim();
  428. if !line.is_empty() && !is_progress_line(line) {
  429. info!("{}: {}", job_id_clone, line);
  430. }
  431. }
  432. if let Ok(mut c) = captured_stdout_clone.lock() {
  433. c.push_str(&s);
  434. }
  435. }
  436. }
  437. }
  438. break;
  439. }
  440. match File::open(&out_path) {
  441. Ok(mut f) => {
  442. if !file_opened {
  443. file_opened = true;
  444. }
  445. // seek to last position and read new bytes
  446. if let Err(e) = f.seek(SeekFrom::Start(pos)) {
  447. anyhow::bail!("failed to seek in output file: {}", e);
  448. }
  449. let mut buf = [0u8; 8192];
  450. loop {
  451. match f.read(&mut buf) {
  452. Ok(0) => break, // EOF for now
  453. Ok(n) => {
  454. pos += n as u64;
  455. let bytes = &buf[..n];
  456. // forward to terminal
  457. let s = String::from_utf8_lossy(bytes);
  458. let cleaned = clean_output(&s);
  459. // Log non-progress lines
  460. for line in cleaned.lines() {
  461. let line = line.trim();
  462. if !line.is_empty() && !is_progress_line(line) {
  463. info!("{}: {}", job_id_clone, line);
  464. }
  465. }
  466. io::stdout().flush().ok();
  467. // capture
  468. if let Ok(mut c) = captured_stdout_clone.lock() {
  469. c.push_str(&s);
  470. }
  471. }
  472. Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
  473. Err(e) => {
  474. anyhow::bail!("error reading output file: {}", e);
  475. }
  476. }
  477. }
  478. }
  479. Err(e) if e.kind() == io::ErrorKind::NotFound => {
  480. // file not yet created; this is expected initially
  481. }
  482. Err(e) => {
  483. // Only bail if we've already opened the file once
  484. // (unexpected disappearance or permission issues)
  485. if file_opened {
  486. anyhow::bail!("error opening output file: {}", e);
  487. }
  488. }
  489. }
  490. thread::sleep(Duration::from_millis(500));
  491. }
  492. Ok(())
  493. });
  494. // 3. Poll squeue until job is no longer in the queue
  495. loop {
  496. let sq_out = ProcessCommand::new(self.squeue_bin())
  497. .args(["-j", &job_id])
  498. .output()
  499. .context("failed to run squeue")?;
  500. let out_str = String::from_utf8_lossy(&sq_out.stdout);
  501. // squeue prints a header line + job line if job exists
  502. // If job is gone, usually only the header OR nothing
  503. let has_job = out_str.lines().skip(1).any(|l| !l.trim().is_empty());
  504. if !has_job {
  505. break;
  506. }
  507. thread::sleep(Duration::from_secs(1));
  508. }
  509. // Wait for output file to stop growing (SLURM epilog may still be writing)
  510. let mut stable_count = 0;
  511. let mut last_size = 0u64;
  512. while stable_count < 2 {
  513. thread::sleep(Duration::from_millis(300));
  514. if let Ok(metadata) = std::fs::metadata(&out_file) {
  515. let current_size = metadata.len();
  516. if current_size == last_size {
  517. stable_count += 1;
  518. } else {
  519. stable_count = 0;
  520. last_size = current_size;
  521. }
  522. } else {
  523. // File doesn't exist yet, keep waiting
  524. stable_count = 0;
  525. }
  526. }
  527. // 4. Stop tail thread and join
  528. let _ = stop_tx.send(());
  529. match tail_handle.join() {
  530. Ok(Ok(())) => {}
  531. Ok(Err(e)) => return Err(e.context("tail thread failed")),
  532. Err(_) => anyhow::bail!("tail thread panicked"),
  533. }
  534. self.clean_up()?;
  535. let stdout = match Arc::try_unwrap(captured_stdout) {
  536. Ok(mutex) => mutex.into_inner().unwrap_or_default(),
  537. Err(arc) => arc.lock().unwrap().clone(),
  538. };
  539. // Remove the SLURM output file
  540. let _ = std::fs::remove_file(&out_file);
  541. let (stdout, slurm_epilog) = split_output(&stdout);
  542. Ok(CapturedOutput {
  543. stdout,
  544. stderr: String::new(), // all job output is in the .out file
  545. slurm_epilog,
  546. })
  547. }
  548. }
  549. #[macro_export]
  550. macro_rules! run {
  551. ($cfg:expr, $cmd:expr) => {{
  552. if $cfg.slurm_runner {
  553. $crate::commands::SbatchRunner::exec($cmd)
  554. } else {
  555. $crate::commands::LocalRunner::exec($cmd)
  556. }
  557. }};
  558. }
  559. /// Clean output by handling carriage returns properly.
  560. ///
  561. /// When a line contains \r (carriage return), only the text after the last \r
  562. /// is kept, simulating terminal behavior where \r moves cursor to start of line.
  563. fn clean_output(s: &str) -> String {
  564. s.split('\n')
  565. .map(|line| {
  566. // For lines with \r, keep only the last segment (what's visible after all \r overwrites)
  567. line.split('\r').next_back().unwrap_or("")
  568. })
  569. .collect::<Vec<_>>()
  570. .join("\n")
  571. }
  572. /// Check if a line is a progress indicator that should be filtered from logs.
  573. fn is_progress_line(line: &str) -> bool {
  574. line.starts_with("> Output records written:")
  575. || line.starts_with("> Processing")
  576. || (line.starts_with('>') && line.contains("records"))
  577. }
  578. use std::collections::HashMap;
  579. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
  580. pub struct SlurmEpilog {
  581. pub job_id: String,
  582. pub cluster: String,
  583. pub user: String,
  584. pub group: String,
  585. pub nodelist: String,
  586. pub cores: u32,
  587. pub job_started_at: String,
  588. pub job_ended_at: String,
  589. pub wall_clock_time: String,
  590. pub cpu_utilized: String,
  591. pub cpu_efficiency: String,
  592. pub memory_utilized: String,
  593. pub memory_efficiency: String,
  594. }
  595. impl SlurmEpilog {
  596. /// Parse SLURM epilog from output string
  597. pub fn parse(output: &str) -> Option<Self> {
  598. // Find the epilog section
  599. let epilog_start = output.find("SLURM EPILOG")?;
  600. let epilog_section = &output[epilog_start..];
  601. // Parse key-value pairs
  602. let mut fields: HashMap<String, String> = HashMap::new();
  603. for line in epilog_section.lines() {
  604. if let Some((key, value)) = Self::parse_line(line) {
  605. fields.insert(key, value);
  606. }
  607. }
  608. // Extract user/group
  609. let (user, group) = fields.get("User/Group").and_then(|s| {
  610. let parts: Vec<&str> = s.split('/').collect();
  611. if parts.len() == 2 {
  612. Some((parts[0].to_string(), parts[1].to_string()))
  613. } else {
  614. None
  615. }
  616. })?;
  617. Some(SlurmEpilog {
  618. job_id: fields.get("Job ID")?.clone(),
  619. cluster: fields.get("Cluster")?.clone(),
  620. user,
  621. group,
  622. nodelist: fields.get("Nodelist")?.clone(),
  623. cores: fields.get("Cores")?.parse().ok()?,
  624. job_started_at: fields.get("Job started at")?.clone(),
  625. job_ended_at: fields.get("Job ended at")?.clone(),
  626. wall_clock_time: fields.get("Job Wall-clock time")?.clone(),
  627. cpu_utilized: fields.get("CPU Utilized")?.clone(),
  628. cpu_efficiency: fields.get("CPU Efficiency")?.clone(),
  629. memory_utilized: fields.get("Memory Utilized")?.clone(),
  630. memory_efficiency: fields.get("Memory Efficiency")?.clone(),
  631. })
  632. }
  633. fn parse_line(line: &str) -> Option<(String, String)> {
  634. // Skip lines that are just decorations or headers
  635. if line.contains("---") || line.contains("SLURM EPILOG") || line.trim().is_empty() {
  636. return None;
  637. }
  638. // Look for "Key: Value" pattern
  639. let parts: Vec<&str> = line.splitn(2, ':').collect();
  640. if parts.len() == 2 {
  641. let key = parts[0].trim().to_string();
  642. let value = parts[1].trim().to_string();
  643. if !key.is_empty() && !value.is_empty() {
  644. return Some((key, value));
  645. }
  646. }
  647. None
  648. }
  649. }
  650. impl Display for SlurmEpilog {
  651. fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
  652. writeln!(f, "--- SLURM JOB EPILOG (ID: {}) ---", self.job_id)?;
  653. writeln!(f, " Cluster: {}", self.cluster)?;
  654. writeln!(f, " User: {} (Group: {})", self.user, self.group)?;
  655. writeln!(f, " Nodelist: {}", self.nodelist)?;
  656. writeln!(f, " Cores Used: {}", self.cores)?;
  657. writeln!(f)?;
  658. writeln!(f, " --- Timestamps ---")?;
  659. writeln!(f, " Started At: {}", self.job_started_at)?;
  660. writeln!(f, " Ended At: {}", self.job_ended_at)?;
  661. writeln!(f, " Wall Time: {}", self.wall_clock_time)?;
  662. writeln!(f)?;
  663. writeln!(f, " --- Resource Utilization ---")?;
  664. writeln!(f, " CPU Utilized: {}", self.cpu_utilized)?;
  665. writeln!(f, " CPU Efficiency: {}", self.cpu_efficiency)?;
  666. writeln!(f, " Memory Utilized: {}", self.memory_utilized)?;
  667. writeln!(f, " Memory Efficiency:{}", self.memory_efficiency)?;
  668. writeln!(f, "------------------------------------")
  669. }
  670. }
  671. /// Split output into job output and epilog
  672. pub fn split_output(full_output: &str) -> (String, Option<SlurmEpilog>) {
  673. if let Some(epilog_start) = full_output.find("----------------------------------------------") {
  674. // Check if this is actually the epilog header
  675. if full_output[epilog_start..].contains("SLURM EPILOG") {
  676. let job_output = full_output[..epilog_start].trim_end().to_string();
  677. let epilog = SlurmEpilog::parse(full_output);
  678. return (job_output, epilog);
  679. }
  680. }
  681. // No epilog found, return full output
  682. (full_output.to_string(), None)
  683. }
  684. /// Run multiple SbatchRunner jobs in parallel.
  685. ///
  686. /// Each job:
  687. /// - calls `init()`
  688. /// - runs `sbatch --wait ... --wrap "<cmd>"`
  689. /// - streams its output to this process' stdout/stderr
  690. /// - returns `CapturedOutput`
  691. ///
  692. /// NOTE:
  693. /// - Outputs from different jobs may interleave on the terminal.
  694. /// - Slurm still decides scheduling order (queue vs run).
  695. pub fn run_many_sbatch<T>(jobs: Vec<T>) -> anyhow::Result<Vec<CapturedOutput>>
  696. where
  697. T: SbatchRunner + Send + 'static,
  698. {
  699. // let mut handles = Vec::with_capacity(jobs.len());
  700. //
  701. // for mut job in jobs.drain(..) {
  702. // let handle = thread::spawn(move || -> anyhow::Result<CapturedOutput> {
  703. // SbatchRunner::exec(&mut job)
  704. // });
  705. // handles.push(handle);
  706. // }
  707. //
  708. // let mut results = Vec::with_capacity(handles.len());
  709. // for h in handles {
  710. // // propagate the first error you hit
  711. // let res = h.join().expect("thread panicked")?;
  712. // results.push(res);
  713. // }
  714. // Rule for the IGR cluster because the admins dont want to implement propoper rules so we are
  715. let max_parallel = 22;
  716. let pool = rayon::ThreadPoolBuilder::new()
  717. .num_threads(max_parallel)
  718. .build()
  719. .context("failed to build rayon thread pool")?;
  720. pool.install(|| {
  721. jobs.into_par_iter()
  722. .map(|mut job| SbatchRunner::exec(&mut job))
  723. .collect::<anyhow::Result<Vec<_>>>()
  724. })
  725. }
  726. /// Local batch runner: execute the command directly on the machine,
  727. /// mimicking SbatchRunner behavior (output to file, async monitoring).
  728. pub trait LocalBatchRunner: Command {
  729. fn shell(&self) -> &str {
  730. "bash"
  731. }
  732. /// Output file pattern. Use `{}` as placeholder for job ID (UUID).
  733. fn output_pattern(&self) -> &str {
  734. "local-{}.out"
  735. }
  736. fn run(&mut self) -> anyhow::Result<CapturedOutput> {
  737. self.init()?;
  738. let job_id = Uuid::new_v4()
  739. .to_string()
  740. .split('-')
  741. .next()
  742. .unwrap_or("local")
  743. .to_string();
  744. let out_file = self.output_pattern().replace("{}", &job_id);
  745. info!("Running local batch job: {job_id}");
  746. let out_file_handle =
  747. File::create(&out_file).with_context(|| format!("failed to create output file: {out_file}"))?;
  748. let out_file_write = out_file_handle
  749. .try_clone()
  750. .context("failed to clone file handle for stderr")?;
  751. let mut child = ProcessCommand::new(self.shell())
  752. .arg("-c")
  753. .arg(self.cmd())
  754. .stdout(Stdio::from(out_file_handle))
  755. .stderr(Stdio::from(out_file_write))
  756. .spawn()
  757. .context("failed to spawn local batch command")?;
  758. let (stop_tx, stop_rx) = mpsc::channel::<()>();
  759. let out_path = out_file.clone();
  760. let captured_stdout = Arc::new(Mutex::new(String::new()));
  761. let captured_stdout_clone = Arc::clone(&captured_stdout);
  762. let job_id_clone = job_id.clone();
  763. let tail_handle = thread::spawn(move || -> anyhow::Result<()> {
  764. let mut pos: u64 = 0;
  765. loop {
  766. if stop_rx.try_recv().is_ok() {
  767. if let Ok(mut f) = File::open(&out_path) {
  768. if f.seek(SeekFrom::Start(pos)).is_ok() {
  769. let mut remaining = Vec::new();
  770. if f.read_to_end(&mut remaining).is_ok() && !remaining.is_empty() {
  771. let s = String::from_utf8_lossy(&remaining);
  772. let cleaned = clean_output(&s);
  773. for line in cleaned.lines() {
  774. let line = line.trim();
  775. if !line.is_empty() && !is_progress_line(line) {
  776. info!("{}: {}", job_id_clone, line);
  777. }
  778. }
  779. if let Ok(mut c) = captured_stdout_clone.lock() {
  780. c.push_str(&s);
  781. }
  782. }
  783. }
  784. }
  785. break;
  786. }
  787. if let Ok(mut f) = File::open(&out_path) {
  788. if f.seek(SeekFrom::Start(pos)).is_ok() {
  789. let mut buf = [0u8; 8192];
  790. loop {
  791. match f.read(&mut buf) {
  792. Ok(0) => break,
  793. Ok(n) => {
  794. pos += n as u64;
  795. let s = String::from_utf8_lossy(&buf[..n]);
  796. let cleaned = clean_output(&s);
  797. for line in cleaned.lines() {
  798. let line = line.trim();
  799. if !line.is_empty() && !is_progress_line(line) {
  800. info!("{}: {}", job_id_clone, line);
  801. }
  802. }
  803. if let Ok(mut c) = captured_stdout_clone.lock() {
  804. c.push_str(&s);
  805. }
  806. }
  807. Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
  808. Err(_) => break,
  809. }
  810. }
  811. }
  812. }
  813. thread::sleep(Duration::from_millis(100));
  814. }
  815. Ok(())
  816. });
  817. let status = child.wait().context("failed to wait on local batch command")?;
  818. let _ = stop_tx.send(());
  819. match tail_handle.join() {
  820. Ok(Ok(())) => {}
  821. Ok(Err(e)) => return Err(e.context("tail thread failed")),
  822. Err(_) => anyhow::bail!("tail thread panicked"),
  823. }
  824. if !status.success() {
  825. anyhow::bail!("local batch command failed with status: {status}");
  826. }
  827. self.clean_up()?;
  828. let stdout = match Arc::try_unwrap(captured_stdout) {
  829. Ok(mutex) => mutex.into_inner().unwrap_or_default(),
  830. Err(arc) => arc.lock().unwrap_or_else(|e| e.into_inner()).clone(),
  831. };
  832. let _ = std::fs::remove_file(&out_file);
  833. Ok(CapturedOutput {
  834. stdout,
  835. stderr: String::new(),
  836. slurm_epilog: None,
  837. })
  838. }
  839. }
  840. pub fn run_many_local_batch<T>(jobs: Vec<T>) -> anyhow::Result<Vec<CapturedOutput>>
  841. where
  842. T: LocalBatchRunner + Send,
  843. {
  844. // Set thread pool size based on max_concurrent
  845. // let pool = rayon::ThreadPoolBuilder::new()
  846. // .num_threads(threads)
  847. // .build()
  848. // .context("failed to build thread pool")?;
  849. // pool.install(|| {
  850. jobs.into_iter()
  851. .map(|mut job| LocalBatchRunner::run(&mut job))
  852. .collect()
  853. // })
  854. }
  855. /// Macro to run multiple batch commands in parallel, either via SLURM or locally.
  856. ///
  857. /// Usage:
  858. /// ```ignore
  859. /// let results = run_many!(cfg, jobs)?;
  860. ///
  861. /// ```
  862. #[macro_export]
  863. macro_rules! run_many {
  864. ($cfg:expr, $jobs:expr) => {{
  865. if $cfg.slurm_runner {
  866. $crate::commands::run_many_sbatch($jobs)
  867. } else {
  868. $crate::commands::run_many_local_batch($jobs)
  869. }
  870. }}
  871. }
  872. #[cfg(test)]
  873. mod tests {
  874. use super::{Command, SbatchRunner, SlurmParams};
  875. struct TestRun;
  876. impl Command for TestRun {
  877. fn cmd(&self) -> String {
  878. "echo 'hello from sbatch'".to_string()
  879. }
  880. }
  881. impl SbatchRunner for TestRun {
  882. fn slurm_params(&self) -> SlurmParams {
  883. SlurmParams {
  884. job_name: Some("test-sbatch".into()),
  885. partition: Some("gpgpuq".into()),
  886. cpus_per_task: Some(1),
  887. mem: Some("1G".into()),
  888. gres: None,
  889. }
  890. }
  891. }
  892. #[test]
  893. fn sbatch_test_run() -> anyhow::Result<()> {
  894. let mut t = TestRun;
  895. let out = SbatchRunner::exec(&mut t)?;
  896. println!("{out:#?}");
  897. assert!(out.stdout.contains("hello from sbatch"));
  898. Ok(())
  899. }
  900. }