helpers.rs 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214
  1. use anyhow::Context;
  2. use bitcode::{Decode, Encode};
  3. use glob::glob;
  4. use log::{debug, error, warn};
  5. use rust_htslib::bam::Read;
  6. use rustc_hash::FxHashMap;
  7. use serde::{Deserialize, Serialize};
  8. use uuid::Uuid;
  9. use std::{
  10. cmp::Ordering,
  11. collections::{HashMap, HashSet},
  12. fmt, fs,
  13. iter::Sum,
  14. ops::{Add, Div},
  15. path::{Path, PathBuf},
  16. sync::atomic::AtomicBool,
  17. };
  18. pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String> {
  19. let mut matching_files = Vec::new();
  20. for entry in
  21. fs::read_dir(dir_path).with_context(|| format!("Failed to read directory: {}", dir_path))?
  22. {
  23. let entry = entry.with_context(|| "Failed to read directory entry")?;
  24. let path = entry.path();
  25. if path.is_file()
  26. && path
  27. .file_name()
  28. .and_then(|name| name.to_str())
  29. .map(|name| name.ends_with(suffix))
  30. .unwrap_or(false)
  31. {
  32. matching_files.push(path);
  33. }
  34. }
  35. match matching_files.len() {
  36. 0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix))
  37. .with_context(|| format!("In directory: {}", dir_path)),
  38. 1 => Ok(matching_files[0].to_string_lossy().into_owned()),
  39. _ => Err(anyhow::anyhow!(
  40. "Multiple files found ending with '{}'",
  41. suffix
  42. ))
  43. .with_context(|| format!("In directory: {}", dir_path)),
  44. }
  45. }
  46. pub fn path_prefix(out: &str) -> anyhow::Result<String> {
  47. let out_path = Path::new(out);
  48. let out_dir = out_path
  49. .parent()
  50. .ok_or_else(|| anyhow::anyhow!("Can't parse the dir of {}", out_path.display()))?;
  51. let name = out_path
  52. .file_name()
  53. .and_then(|name| name.to_str())
  54. .ok_or_else(|| anyhow::anyhow!("Can't parse the file name of {}", out_path.display()))?;
  55. let stem = name
  56. .split_once('.')
  57. .map(|(stem, _)| stem)
  58. .ok_or_else(|| anyhow::anyhow!("Can't parse the file stem of {}", name))?;
  59. Ok(format!("{}/{stem}", out_dir.display()))
  60. }
  61. pub fn force_or_not(_path: &str, _force: bool) -> anyhow::Result<()> {
  62. // let path = Path::new(path);
  63. // let mut output_exists = path.exists();
  64. // let dir = path
  65. // .parent()
  66. // .context(format!("Can't parse the parent dir of {}", path.display()))?;
  67. //
  68. // if force && output_exists {
  69. // fs::remove_dir_all(dir)?;
  70. // fs::create_dir_all(dir)?;
  71. // output_exists = false;
  72. // }
  73. //
  74. // if output_exists {
  75. // info!("{} already exists.", path.display())
  76. // }
  77. Ok(())
  78. }
  79. /// Builds Singularity bind flags from input/output paths.
  80. ///
  81. /// - Converts file paths to their parent directory.
  82. /// - Skips non-existing paths to avoid Singularity errors.
  83. /// - Canonicalizes and deduplicates directories.
  84. pub fn singularity_bind_flags<I, P>(paths: I) -> String
  85. where
  86. I: IntoIterator<Item = P>,
  87. P: AsRef<Path>,
  88. {
  89. let mut seen = HashSet::new();
  90. let mut binds = Vec::new();
  91. let mut push_dir = |p: &Path| {
  92. if p.as_os_str().is_empty() || !p.exists() {
  93. return;
  94. }
  95. let dir = p.canonicalize().unwrap_or_else(|_| p.to_path_buf());
  96. if seen.insert(dir.clone()) {
  97. binds.push(format!(
  98. "--bind {src}:{dst}",
  99. src = dir.display(),
  100. dst = dir.display()
  101. ));
  102. }
  103. };
  104. for path_str in paths {
  105. let path = path_str.as_ref();
  106. if path.is_dir() {
  107. push_dir(path);
  108. } else if let Some(parent) = path.parent() {
  109. push_dir(parent);
  110. }
  111. }
  112. binds.join(" ")
  113. }
  114. use rayon::prelude::*;
  115. use std::cmp::Ord;
  116. pub struct VectorIntersection<T> {
  117. pub common: Vec<T>,
  118. pub only_in_first: Vec<T>,
  119. pub only_in_second: Vec<T>,
  120. }
  121. impl<T> fmt::Display for VectorIntersection<T> {
  122. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  123. let total_items = self.common.len() + self.only_in_first.len() + self.only_in_second.len();
  124. writeln!(f, "Total items: {}", total_items)?;
  125. writeln!(
  126. f,
  127. "Common: {} ({:.2}%)",
  128. self.common.len(),
  129. percentage(self.common.len(), total_items)
  130. )?;
  131. writeln!(
  132. f,
  133. "Only in first: {} ({:.2}%)",
  134. self.only_in_first.len(),
  135. percentage(self.only_in_first.len(), total_items)
  136. )?;
  137. writeln!(
  138. f,
  139. "Only in second: {} ({:.2}%)",
  140. self.only_in_second.len(),
  141. percentage(self.only_in_second.len(), total_items)
  142. )
  143. }
  144. }
  145. fn percentage(part: usize, total: usize) -> f64 {
  146. if total == 0 {
  147. 0.0
  148. } else {
  149. (part as f64 / total as f64) * 100.0
  150. }
  151. }
  152. impl<T> Default for VectorIntersection<T> {
  153. fn default() -> Self {
  154. Self {
  155. common: Vec::new(),
  156. only_in_first: Vec::new(),
  157. only_in_second: Vec::new(),
  158. }
  159. }
  160. }
  161. impl<T: Ord + Clone> VectorIntersection<T> {
  162. fn merge(&mut self, other: &mut Self) {
  163. self.common.append(&mut other.common);
  164. self.only_in_first.append(&mut other.only_in_first);
  165. self.only_in_second.append(&mut other.only_in_second);
  166. }
  167. }
  168. fn intersection<T: Ord + Clone>(vec1: &[T], vec2: &[T]) -> VectorIntersection<T> {
  169. let mut result = VectorIntersection::default();
  170. let mut i = 0;
  171. let mut j = 0;
  172. while i < vec1.len() && j < vec2.len() {
  173. match vec1[i].cmp(&vec2[j]) {
  174. Ordering::Less => {
  175. result.only_in_first.push(vec1[i].clone());
  176. i += 1;
  177. }
  178. Ordering::Greater => {
  179. result.only_in_second.push(vec2[j].clone());
  180. j += 1;
  181. }
  182. Ordering::Equal => {
  183. let val = &vec1[i];
  184. let mut count1 = 1;
  185. let mut count2 = 1;
  186. // Count occurrences in vec1
  187. while i + 1 < vec1.len() && &vec1[i + 1] == val {
  188. i += 1;
  189. count1 += 1;
  190. }
  191. // Count occurrences in vec2
  192. while j + 1 < vec2.len() && &vec2[j + 1] == val {
  193. j += 1;
  194. count2 += 1;
  195. }
  196. // Add to common
  197. result
  198. .common
  199. .extend(std::iter::repeat_n(val.clone(), count1.min(count2)));
  200. // Add excess to only_in_first or only_in_second
  201. match count1.cmp(&count2) {
  202. Ordering::Greater => {
  203. result
  204. .only_in_first
  205. .extend(std::iter::repeat_n(val.clone(), count1 - count2));
  206. }
  207. Ordering::Less => {
  208. result
  209. .only_in_second
  210. .extend(std::iter::repeat_n(val.clone(), count2 - count1));
  211. }
  212. Ordering::Equal => {
  213. // No excess elements, do nothing
  214. }
  215. }
  216. i += 1;
  217. j += 1;
  218. }
  219. }
  220. }
  221. result.only_in_first.extend(vec1[i..].iter().cloned());
  222. result.only_in_second.extend(vec2[j..].iter().cloned());
  223. result
  224. }
  225. pub fn par_intersection<T: Ord + Send + Sync + Clone>(
  226. vec1: &[T],
  227. vec2: &[T],
  228. ) -> VectorIntersection<T> {
  229. let chunk_size = (vec1.len() / rayon::current_num_threads()).max(1);
  230. vec1.par_chunks(chunk_size)
  231. .map(|chunk| {
  232. let start = vec2.partition_point(|x| x < &chunk[0]);
  233. let end = vec2.partition_point(|x| x <= &chunk[chunk.len() - 1]);
  234. // Ensure start is not greater than end
  235. if start <= end {
  236. intersection(chunk, &vec2[start..end])
  237. } else {
  238. // If start > end, there's no intersection for this chunk
  239. VectorIntersection::default()
  240. }
  241. })
  242. .reduce(VectorIntersection::default, |mut acc, mut x| {
  243. acc.merge(&mut x);
  244. acc
  245. })
  246. }
  247. pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
  248. let m = dna_sequence.len() as f64;
  249. // Early return for empty sequences
  250. if m == 0.0 {
  251. return 0.0;
  252. }
  253. // Count occurrences of each base
  254. let mut bases = HashMap::<char, usize>::new();
  255. for base in dna_sequence.chars() {
  256. *bases.entry(base).or_insert(0) += 1;
  257. }
  258. // Calculate Shannon entropy
  259. let shannon_entropy_value: f64 = bases
  260. .values()
  261. .map(|&n_i| {
  262. let p_i = n_i as f64 / m;
  263. if p_i > 0.0 {
  264. -p_i * p_i.log2()
  265. } else {
  266. 0.0 // Avoid log2(0)
  267. }
  268. })
  269. .sum();
  270. shannon_entropy_value
  271. }
  272. pub fn mean<T>(values: &[T]) -> f64
  273. where
  274. T: Copy + Add<Output = T> + Div<Output = T> + Sum + Into<f64>,
  275. {
  276. let count = values.len();
  277. if count == 0 {
  278. return 0.0;
  279. }
  280. let sum: T = values.iter().copied().sum();
  281. sum.into() / count as f64
  282. }
  283. pub fn bin_data<V>(data: Vec<V>, bin_size: V) -> Vec<(V, usize)>
  284. where
  285. V: Copy + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
  286. {
  287. // Sort the data
  288. let mut sorted_data = data.clone();
  289. sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
  290. // Initialize bins
  291. let mut bins: Vec<(V, usize)> = Vec::new();
  292. let mut current_bin_start = sorted_data[0];
  293. let mut count = 0;
  294. for &value in &sorted_data {
  295. if value < current_bin_start + bin_size {
  296. count += 1;
  297. } else {
  298. bins.push((current_bin_start, count));
  299. current_bin_start += bin_size;
  300. count = 1;
  301. }
  302. }
  303. // Push the last bin
  304. bins.push((current_bin_start, count));
  305. bins
  306. }
  307. // fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> {
  308. // if data.is_empty() || num_bins == 0 {
  309. // return vec![];
  310. // }
  311. //
  312. // let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize;
  313. //
  314. // (0..num_bins)
  315. // .map(|i| {
  316. // let start = i * bin_size;
  317. // let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()`
  318. //
  319. // // If `start` is out of bounds, return (0, 0)
  320. // if start >= data.len() {
  321. // return (0, 0);
  322. // }
  323. //
  324. // let bin = &data[start..end];
  325. //
  326. // let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum();
  327. // let count = bin.len() as u128;
  328. // let mean_x = (sum_x / count) as u32; // Rounded down to nearest u32
  329. //
  330. // let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum();
  331. //
  332. // (mean_x, sum_n)
  333. // })
  334. // .collect()
  335. // }
  336. //
  337. pub fn app_storage_dir() -> anyhow::Result<PathBuf> {
  338. let app_name = env!("CARGO_PKG_NAME");
  339. let app_dir = dirs::data_dir()
  340. .context("Failed to get data directory")?
  341. .join(app_name);
  342. if !app_dir.exists() {
  343. fs::create_dir_all(&app_dir).context("Failed to create application directory")?;
  344. }
  345. Ok(app_dir)
  346. }
  347. use blake3::Hasher as Blake3Hasher;
  348. use std::hash::{BuildHasher, Hasher};
  349. pub struct Blake3Hash(Blake3Hasher);
  350. impl Hasher for Blake3Hash {
  351. fn finish(&self) -> u64 {
  352. let hash = self.0.finalize();
  353. u64::from_le_bytes(hash.as_bytes()[..8].try_into().unwrap())
  354. }
  355. fn write(&mut self, bytes: &[u8]) {
  356. self.0.update(bytes);
  357. }
  358. }
  359. /// Default Hasher
  360. #[derive(Default, Clone)]
  361. pub struct Blake3BuildHasher;
  362. impl BuildHasher for Blake3BuildHasher {
  363. type Hasher = Blake3Hash;
  364. fn build_hasher(&self) -> Self::Hasher {
  365. Blake3Hash(Blake3Hasher::new())
  366. }
  367. }
  368. // Custom 128-bit hash type
  369. #[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug, Encode, Decode)]
  370. pub struct Hash128([u8; 16]);
  371. impl std::hash::Hash for Hash128 {
  372. fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
  373. state.write(&self.0);
  374. }
  375. }
  376. impl Hash128 {
  377. pub fn new(bytes: [u8; 16]) -> Self {
  378. Hash128(bytes)
  379. }
  380. pub fn to_bytes(&self) -> [u8; 16] {
  381. self.0
  382. }
  383. }
  384. pub fn get_dir_size(path: &Path) -> std::io::Result<u64> {
  385. let mut total_size = 0;
  386. if path.is_dir() {
  387. for entry in fs::read_dir(path)? {
  388. let entry = entry?;
  389. let path = entry.path();
  390. if path.is_file() {
  391. total_size += path.metadata()?.len();
  392. } else if path.is_dir() {
  393. total_size += get_dir_size(&path)?;
  394. }
  395. }
  396. }
  397. Ok(total_size)
  398. }
  399. /// Finds all files matching the given glob pattern.
  400. ///
  401. /// # Arguments
  402. ///
  403. /// * `pattern` - A glob pattern string (e.g., `"data/**/*.txt"`).
  404. ///
  405. /// # Returns
  406. ///
  407. /// * `Ok(Vec<PathBuf>)` with all successfully matched file paths.
  408. /// * `Err` if the glob pattern is invalid or any matched path fails to resolve.
  409. ///
  410. /// # Errors
  411. ///
  412. /// Returns an error if:
  413. /// - The glob pattern is invalid.
  414. /// - A file path matching the pattern cannot be resolved.
  415. ///
  416. /// # Examples
  417. ///
  418. /// ```rust
  419. /// let files = find_files("src/**/*.rs")?;
  420. /// for file in files {
  421. /// println!("{:?}", file);
  422. /// }
  423. /// ```
  424. pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
  425. let mut result = Vec::new();
  426. let entries = glob(pattern).with_context(|| format!("Invalid glob pattern: '{}'", pattern))?;
  427. for entry in entries {
  428. let path =
  429. entry.with_context(|| format!("Failed to resolve path for pattern '{}'", pattern))?;
  430. result.push(path);
  431. }
  432. Ok(result)
  433. }
  434. // fn system_time_to_utc(system_time: SystemTime) -> Option<DateTime<Utc>> {
  435. // system_time
  436. // .duration_since(UNIX_EPOCH)
  437. // .ok()
  438. // .map(|duration| Utc.timestamp(duration.as_secs() as i64, duration.subsec_nanos()))
  439. // }
  440. /// List all files in a directory that have the given suffix (file extension).
  441. ///
  442. /// # Arguments
  443. /// * `dir` – Directory to scan.
  444. /// * `suffix` – File extension to match (without the dot).
  445. ///
  446. /// # Returns
  447. /// A vector of `PathBuf` entries whose extension matches `suffix`.
  448. ///
  449. /// # Errors
  450. /// Returns any I/O error from `read_dir` or directory traversal.
  451. pub fn list_files_with_ext(dir: impl AsRef<Path>, ext: &str) -> std::io::Result<Vec<PathBuf>> {
  452. let mut out = Vec::new();
  453. for entry in fs::read_dir(dir)? {
  454. let entry = entry?;
  455. let path = entry.path();
  456. if path.is_file() {
  457. if let Some(file_ext) = path.extension().and_then(|e| e.to_str()) {
  458. if file_ext == ext {
  459. out.push(path);
  460. }
  461. }
  462. }
  463. }
  464. Ok(out)
  465. }
  466. pub fn list_directories(dir_path: PathBuf) -> std::io::Result<Vec<String>> {
  467. let mut directories = Vec::new();
  468. for entry in fs::read_dir(dir_path)? {
  469. let entry = entry?;
  470. let path = entry.path();
  471. // Check if the path is a directory
  472. if path.is_dir() {
  473. if let Some(dir_name) = path.file_name().and_then(|name| name.to_str()) {
  474. directories.push(dir_name.to_string());
  475. }
  476. }
  477. }
  478. Ok(directories)
  479. }
  480. pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
  481. fn walk(dir: &Path, out: &mut Vec<PathBuf>) {
  482. if let Ok(entries) = fs::read_dir(dir) {
  483. for entry in entries.flatten() {
  484. let path = entry.path();
  485. if path.is_dir() {
  486. walk(&path, out);
  487. } else if path.is_file() {
  488. out.push(path);
  489. }
  490. }
  491. }
  492. }
  493. let mut files = Vec::new();
  494. walk(root.as_ref(), &mut files);
  495. files
  496. }
  497. /// Checks whether the modification time of `file1` is older than `file2`.
  498. ///
  499. /// If `rm` is `true` and `file1` is older, attempts to remove the directory containing `file1`.
  500. ///
  501. /// # Arguments
  502. ///
  503. /// * `file1` - Path to the first file.
  504. /// * `file2` - Path to the second file.
  505. /// * `rm` - If true, and `file1` is older, attempts to remove its parent directory.
  506. ///
  507. /// # Returns
  508. ///
  509. /// * `Ok(true)` if `file1` is older than `file2`.
  510. /// * `Ok(false)` otherwise.
  511. ///
  512. /// # Errors
  513. ///
  514. /// Returns an [`anyhow::Error`] if:
  515. /// - Either `file1` or `file2` does not exist.
  516. /// - File metadata cannot be read.
  517. /// - File modification times cannot be retrieved.
  518. /// - (if `rm == true`) Directory removal fails (if uncommented).
  519. ///
  520. pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
  521. debug!("is_file_older {file1} {file2}, {rm:?}");
  522. let mtime1 = fs::metadata(file1)
  523. .with_context(|| format!("Failed to read metadata for '{}'", file1))?
  524. .modified()
  525. .with_context(|| format!("Failed to get modified time for '{}'", file1))?;
  526. let mtime2 = fs::metadata(file2)
  527. .with_context(|| format!("Failed to read metadata for '{}'", file2))?
  528. .modified()
  529. .with_context(|| format!("Failed to get modified time for '{}'", file2))?;
  530. if mtime1 < mtime2 && rm {
  531. if let Some(file1_dir) = Path::new(file1).parent() {
  532. warn!("Removing old directory: {}", file1_dir.display());
  533. fs::remove_dir_all(file1_dir).map_err(|e| {
  534. warn!("Failed to remove {}: {}", file1_dir.display(), e);
  535. e
  536. })?;
  537. }
  538. }
  539. Ok(mtime1 < mtime2)
  540. }
  541. pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
  542. debug!("Trying to remove: {dir}");
  543. // match fs::remove_dir_all(dir) {
  544. // Ok(_) => {}
  545. // Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
  546. // Err(e) => {
  547. // anyhow::bail!("Failed to remove directory '{}': {}", dir, e);
  548. // }
  549. // };
  550. Ok(())
  551. }
  552. /// Searches a directory for the first file matching a given name pattern.
  553. ///
  554. /// This function looks for a file in the given directory whose filename
  555. /// starts with the provided `starts_with` prefix and ends with the provided
  556. /// `ends_with` suffix. It returns the full path to the first match found.
  557. ///
  558. /// # Arguments
  559. ///
  560. /// * `dir` - A reference to the directory path where the search will occur.
  561. /// * `starts_with` - The required prefix of the filename (e.g., `"throughput_"`).
  562. /// * `ends_with` - The required suffix of the filename (e.g., `".csv"`).
  563. ///
  564. /// # Returns
  565. ///
  566. /// * `Some(PathBuf)` - Path to the first file matching the pattern.
  567. /// * `None` - If no matching file is found or the directory can't be read.
  568. ///
  569. /// # Example
  570. ///
  571. /// ```rust
  572. /// let dir = std::path::Path::new("/path/to/data");
  573. /// if let Some(path) = find_matching_file(dir, "throughput_", ".csv") {
  574. /// println!("Found file: {}", path.display());
  575. /// } else {
  576. /// eprintln!("No matching file found.");
  577. /// }
  578. /// ```
  579. pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
  580. fs::read_dir(dir)
  581. .ok()?
  582. .filter_map(Result::ok)
  583. .map(|entry| entry.path())
  584. .find(|path| {
  585. path.is_file()
  586. && path
  587. .file_name()
  588. .and_then(|name| name.to_str())
  589. .map(|name| name.starts_with(starts_with) && name.ends_with(ends_with))
  590. .unwrap_or(false)
  591. })
  592. }
  593. /// Searches for the first file in the given directory whose file name
  594. /// satisfies the provided condition.
  595. ///
  596. /// # Arguments
  597. ///
  598. /// * `dir` - Path to the directory to search.
  599. /// * `condition` - A closure that takes a file name (`&str`) and returns `true`
  600. /// if the file matches the desired condition.
  601. ///
  602. /// # Returns
  603. ///
  604. /// An `Option<PathBuf>` containing the path to the first matching file,
  605. /// or `None` if no file matches or the directory can't be read.
  606. ///
  607. /// # Examples
  608. ///
  609. /// ```
  610. /// use std::path::Path;
  611. /// let result = find_file(Path::new("."), |name| name.ends_with(".rs"));
  612. /// ```
  613. pub fn find_file<F>(dir: &Path, condition: F) -> Option<PathBuf>
  614. where
  615. F: Fn(&str) -> bool,
  616. {
  617. fs::read_dir(dir).ok()?.find_map(|entry| {
  618. let path = entry.ok()?.path();
  619. if path.is_file()
  620. && path
  621. .file_name()
  622. .and_then(|name| name.to_str())
  623. .map(&condition)
  624. .unwrap_or(false)
  625. {
  626. Some(path)
  627. } else {
  628. None
  629. }
  630. })
  631. }
  632. /// Represents a string made of either:
  633. /// - one character repeated N times,
  634. /// - a two-character block repeated N times,
  635. /// - or no valid repetition pattern.
  636. #[derive(Debug, PartialEq)]
  637. pub enum Repeat {
  638. None,
  639. RepOne(char, usize),
  640. RepTwo(String, usize),
  641. }
  642. pub fn detect_repetition(s: &str) -> Repeat {
  643. let len = s.chars().count();
  644. if len == 0 {
  645. return Repeat::None;
  646. }
  647. // Check for single-char repetition
  648. let mut chars = s.chars();
  649. let first = chars.next().unwrap();
  650. if s.chars().all(|c| c == first) {
  651. return Repeat::RepOne(first, len);
  652. }
  653. // Check for two-char block repetition
  654. if len.is_multiple_of(2) {
  655. let mut iter = s.chars();
  656. let a = iter.next().unwrap();
  657. let b = iter.next().unwrap();
  658. let mut count = 1;
  659. while let (Some(c), Some(d)) = (iter.next(), iter.next()) {
  660. if c != a || d != b {
  661. return Repeat::None;
  662. }
  663. count += 1;
  664. }
  665. return Repeat::RepTwo(format!("{}{}", a, b), count);
  666. }
  667. Repeat::None
  668. }
  669. pub fn test_init() {
  670. let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug"))
  671. .try_init();
  672. }
  673. use regex::Regex;
  674. /// Extracts the numeric barcode suffix from a filename.
  675. ///
  676. /// # Examples
  677. ///
  678. /// ```
  679. /// let f = "sample_SQK-NBD114-24_barcode07.bam";
  680. /// assert_eq!(extract_barcode(f), Some(7));
  681. /// ```
  682. ///
  683. /// Returns `None` if no `barcodeNN` pattern is found
  684. /// or if the number cannot be parsed.
  685. pub fn extract_barcode(name: &str) -> Option<u32> {
  686. let re = Regex::new(r"barcode(\d+)").unwrap();
  687. re.captures(name)
  688. .and_then(|c| c.get(1))
  689. .and_then(|m| m.as_str().parse::<u32>().ok())
  690. }
  691. pub fn human_size(bytes: u64) -> String {
  692. const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"];
  693. let mut size = bytes as f64;
  694. let mut unit = 0;
  695. while size >= 1024.0 && unit < UNITS.len() - 1 {
  696. size /= 1024.0;
  697. unit += 1;
  698. }
  699. if unit == 0 {
  700. format!("{} {}", bytes, UNITS[unit])
  701. } else {
  702. format!("{:.2} {}", size, UNITS[unit])
  703. }
  704. }
  705. // A RAII (Resource Acquisition Is Initialization) guard for cleaning up temporary directories.
  706. pub struct TempDirGuard {
  707. path: PathBuf,
  708. }
  709. impl TempDirGuard {
  710. pub fn new(path: PathBuf) -> Self {
  711. Self { path }
  712. }
  713. }
  714. // The core cleanup logic: executed automatically when the TempDirGuard is dropped.
  715. impl Drop for TempDirGuard {
  716. fn drop(&mut self) {
  717. if self.path.exists() {
  718. // Log the attempt. Using eprintln/warn! ensures the error is noted,
  719. // but the program does not panic.
  720. if let Err(e) = fs::remove_dir_all(&self.path) {
  721. error!(
  722. "Failed to remove temporary directory '{}': {}",
  723. self.path.display(),
  724. e
  725. );
  726. }
  727. }
  728. }
  729. }
  730. /// Guard that tracks temporary filesystem paths and cleans them up on drop
  731. /// unless explicitly disarmed.
  732. ///
  733. /// This is intended for pipeline-style code where intermediate outputs
  734. /// are produced by downstream tools. The guard only tracks paths; it does
  735. /// **not** create files itself.
  736. ///
  737. /// If the pipeline completes successfully, call [`disarm`] to prevent
  738. /// cleanup. Otherwise, all tracked paths are removed on drop on a
  739. /// best-effort basis.
  740. ///
  741. /// # Example
  742. ///
  743. /// ```no_run
  744. /// use std::path::PathBuf;
  745. ///
  746. /// let tmp_dir = PathBuf::from("/tmp");
  747. /// let mut guard = TempFileGuard::new();
  748. ///
  749. /// let path1 = guard.temp_path(".tmp", &tmp_dir);
  750. /// let path2 = guard.temp_path(".tmp", &tmp_dir);
  751. ///
  752. /// // Produce outputs at `path1` and `path2`
  753. ///
  754. /// // Mark success
  755. /// guard.disarm();
  756. /// ```
  757. ///
  758. /// If `disarm()` is not called, both paths are cleaned up automatically
  759. /// when the guard is dropped.
  760. pub struct TempFileGuard {
  761. files: Vec<PathBuf>,
  762. disarmed: AtomicBool,
  763. }
  764. impl Default for TempFileGuard {
  765. fn default() -> Self {
  766. Self::new()
  767. }
  768. }
  769. impl TempFileGuard {
  770. pub fn new() -> Self {
  771. Self {
  772. files: Vec::new(),
  773. disarmed: AtomicBool::new(false),
  774. }
  775. }
  776. /// Registers a temporary file for cleanup.
  777. pub fn track(&mut self, path: PathBuf) {
  778. self.files.push(path);
  779. }
  780. /// Registers multiple temporary files for cleanup.
  781. pub fn track_many(&mut self, paths: impl IntoIterator<Item = PathBuf>) {
  782. self.files.extend(paths);
  783. }
  784. /// Disarms the guard, preventing cleanup on drop.
  785. /// Call this when the pipeline succeeds.
  786. pub fn disarm(&self) {
  787. self.disarmed
  788. .store(true, std::sync::atomic::Ordering::SeqCst);
  789. }
  790. /// Manually clean up all tracked files.
  791. pub fn cleanup(&mut self) {
  792. for file in &self.files {
  793. if file.exists() {
  794. if let Err(e) = remove_bam_with_index(file) {
  795. warn!("Failed to clean up temp file {}: {}", file.display(), e);
  796. }
  797. }
  798. }
  799. self.files.clear();
  800. }
  801. pub fn clear(&mut self) {
  802. self.files.clear();
  803. }
  804. /// Generates a unique temporary path inside `tmp_dir` and registers it
  805. /// for cleanup.
  806. ///
  807. /// The returned path is derived from a UUIDv4 and **no file is created**.
  808. /// The caller is responsible for creating or writing to the path.
  809. ///
  810. /// # Example
  811. ///
  812. /// ```no_run
  813. /// use std::path::PathBuf;
  814. ///
  815. /// let tmp_dir = PathBuf::from("/tmp");
  816. /// let mut guard = TempFileGuard::new();
  817. ///
  818. /// let path = guard.temp_path(".tmp", &tmp_dir);
  819. /// // create or write to `path`
  820. /// ```
  821. pub fn tmp_path(&mut self, suffix: &str, tmp_dir: impl AsRef<Path>) -> PathBuf {
  822. let name = format!("pandora-tmp-{}{}", Uuid::new_v4(), suffix);
  823. let tmp_dir = tmp_dir.as_ref();
  824. let path = tmp_dir.join(name);
  825. self.track(path.clone());
  826. path
  827. }
  828. }
  829. impl Drop for TempFileGuard {
  830. fn drop(&mut self) {
  831. if !self.disarmed.load(std::sync::atomic::Ordering::SeqCst) {
  832. warn!(
  833. "Pipeline failed or panicked, cleaning up {} temporary files",
  834. self.files.len()
  835. );
  836. self.cleanup();
  837. }
  838. }
  839. }
  840. // pub fn temp_file_path(suffix: &str, config: &Config) -> std::io::Result<PathBuf> {
  841. // let p = config.tmp_dir;
  842. // let temp_path = tempfile::Builder::new()
  843. // .prefix("pandora-temp-")
  844. // .suffix(suffix)
  845. // .rand_bytes(5)
  846. // .tempfile()?
  847. // .into_temp_path();
  848. //
  849. // Ok(temp_path.to_path_buf())
  850. // }
  851. /// Extracts genome sizes from BAM header.
  852. pub fn get_genome_sizes(
  853. header: &rust_htslib::bam::Header,
  854. ) -> anyhow::Result<FxHashMap<String, u64>> {
  855. let mut sizes = FxHashMap::default();
  856. for (_, records) in header.to_hashmap() {
  857. for record in records {
  858. if let (Some(sn), Some(ln)) = (record.get("SN"), record.get("LN")) {
  859. if let Ok(len) = ln.parse::<u64>() {
  860. sizes.insert(sn.clone(), len);
  861. }
  862. }
  863. }
  864. }
  865. Ok(sizes)
  866. }
  867. pub fn bam_contigs<P: AsRef<std::path::Path>>(bam: P) -> anyhow::Result<Vec<String>> {
  868. let reader = rust_htslib::bam::Reader::from_path(&bam)?;
  869. let header = rust_htslib::bam::Header::from_template(reader.header());
  870. Ok(get_genome_sizes(&header)?.into_keys().collect())
  871. }
  872. /// Split genome into ~n_parts equal-sized chunks (by number of bases),
  873. /// returning for each chunk a list of regions in `ctg:start-end` form.
  874. /// Split genome into ~n_parts equal-sized chunks (by bases).
  875. /// Each chunk is a single region `ctg:start-end` (no cross-contig regions).
  876. ///
  877. /// Note: the number of regions returned will be ≈ n_parts (may differ by 1–2).
  878. pub fn split_genome_into_n_regions(
  879. genome_sizes: &FxHashMap<String, u64>,
  880. n_parts: usize,
  881. ) -> Vec<String> {
  882. if n_parts == 0 || genome_sizes.is_empty() {
  883. return Vec::new();
  884. }
  885. // Deterministic contig order
  886. let mut contigs: Vec<(String, u64)> = genome_sizes
  887. .iter()
  888. .map(|(ctg, len)| (ctg.clone(), *len))
  889. .collect();
  890. contigs.sort_by(|a, b| a.0.cmp(&b.0));
  891. let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum();
  892. if total_bases == 0 {
  893. return Vec::new();
  894. }
  895. let target_chunk_size: u64 = total_bases.div_ceil(n_parts as u64); // ceil
  896. let mut regions = Vec::new();
  897. for (ctg, len) in contigs {
  898. let mut remaining = len;
  899. let mut start: u64 = 1;
  900. while remaining > 0 {
  901. let take = remaining.min(target_chunk_size);
  902. let end = start + take - 1;
  903. regions.push(format!("{ctg}:{start}-{end}"));
  904. remaining -= take;
  905. start = end + 1;
  906. }
  907. }
  908. regions
  909. }
  910. /// Split genome into exactly `n_parts` chunks with (almost) equal total size.
  911. /// Each chunk is a *list* of regions `ctg:start-end`. A chunk may span several
  912. /// contigs, but each region is within a single contig.
  913. ///
  914. /// The sizes of the parts will differ by at most 1 base.
  915. pub fn split_genome_into_n_regions_exact(
  916. genome_sizes: &FxHashMap<String, u64>,
  917. n_parts: usize,
  918. ) -> Vec<Vec<String>> {
  919. if n_parts == 0 || genome_sizes.is_empty() {
  920. return Vec::new();
  921. }
  922. // Deterministic contig order
  923. let mut contigs: Vec<(String, u64)> = genome_sizes
  924. .iter()
  925. .map(|(ctg, len)| (ctg.clone(), *len))
  926. .collect();
  927. contigs.sort_by(|a, b| a.0.cmp(&b.0));
  928. let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum();
  929. if total_bases == 0 {
  930. return Vec::new();
  931. }
  932. // We cannot have more non-empty parts than bases if we use 1-based inclusive coordinates.
  933. let parts = n_parts.min(total_bases as usize);
  934. // Distribute bases as evenly as possible:
  935. // first `remainder` parts get (base_per_part + 1), the rest get base_per_part.
  936. let base_per_part = total_bases / parts as u64;
  937. let remainder = total_bases % parts as u64;
  938. let target_sizes: Vec<u64> = (0..parts)
  939. .map(|i| {
  940. if i < remainder as usize {
  941. base_per_part + 1
  942. } else {
  943. base_per_part
  944. }
  945. })
  946. .collect();
  947. let mut chunks: Vec<Vec<String>> = Vec::with_capacity(parts);
  948. let mut part_idx = 0;
  949. let mut remaining_in_part = target_sizes[0];
  950. let mut ctg_iter = contigs.into_iter();
  951. while let Some((ctg, mut len)) = ctg_iter.next() {
  952. let mut start: u64 = 1;
  953. while len > 0 && part_idx < parts {
  954. let take = len.min(remaining_in_part);
  955. let end = start + take - 1;
  956. if chunks.len() <= part_idx {
  957. chunks.push(Vec::new());
  958. }
  959. chunks[part_idx].push(format!("{ctg}:{start}-{end}"));
  960. len -= take;
  961. start = end + 1;
  962. remaining_in_part -= take;
  963. if remaining_in_part == 0 {
  964. part_idx += 1;
  965. if part_idx == parts {
  966. break;
  967. }
  968. remaining_in_part = target_sizes[part_idx];
  969. }
  970. }
  971. }
  972. // Sanity: we should have exactly `parts` chunks, all non-empty
  973. debug_assert_eq!(chunks.len(), parts);
  974. chunks
  975. }
  976. /// Removes a BAM file and its associated index file (.bai).
  977. pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> {
  978. // Remove BAM
  979. if bam.exists() {
  980. fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?;
  981. }
  982. // Remove index (.bam.bai or .bai)
  983. let bai = bam.with_extension("bam.bai");
  984. if bai.exists() {
  985. fs::remove_file(&bai).ok(); // Don't fail if index doesn't exist
  986. }
  987. let bai_alt = bam.with_extension("bai");
  988. if bai_alt.exists() {
  989. fs::remove_file(&bai_alt).ok();
  990. }
  991. Ok(())
  992. }
  993. /// Convert a numeric value to an RGB triplet using a diverging palette.
  994. ///
  995. /// Values are clipped to `[-clip, +clip]` and mapped as:
  996. /// - negative → blue
  997. /// - zero → white
  998. /// - positive → red
  999. ///
  1000. /// # Arguments
  1001. /// * `v` – value to color (e.g. delta methylation)
  1002. /// * `clip` – symmetric clipping bound (must be > 0)
  1003. ///
  1004. /// # Returns
  1005. /// `(r, g, b)` as 8-bit RGB components.
  1006. ///
  1007. /// # Panics
  1008. /// Panics if `clip <= 0.0`.
  1009. pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
  1010. assert!(clip > 0.0, "clip must be > 0");
  1011. let x = v.clamp(-clip, clip);
  1012. let t = (x + clip) / (2.0 * clip); // [-clip,+clip] → [0,1]
  1013. if t < 0.5 {
  1014. // blue → white
  1015. let u = t / 0.5;
  1016. let r = (255.0 * u).round() as u8;
  1017. let g = (255.0 * u).round() as u8;
  1018. (r, g, 255)
  1019. } else {
  1020. // white → red
  1021. let u = (t - 0.5) / 0.5;
  1022. let g = (255.0 * (1.0 - u)).round() as u8;
  1023. let b = (255.0 * (1.0 - u)).round() as u8;
  1024. (255, g, b)
  1025. }
  1026. }
  1027. pub fn revcomp(s: &str) -> String {
  1028. fn comp(b: u8) -> u8 {
  1029. match b {
  1030. b'A' => b'T',
  1031. b'C' => b'G',
  1032. b'G' => b'C',
  1033. b'T' => b'A',
  1034. b'N' => b'N',
  1035. _ => b'N',
  1036. }
  1037. }
  1038. let v: Vec<u8> = s.as_bytes().iter().rev().map(|&b| comp(b.to_ascii_uppercase())).collect();
  1039. String::from_utf8(v).unwrap()
  1040. }
  1041. #[cfg(test)]
  1042. mod tests {
  1043. use log::info;
  1044. use rust_htslib::bam::{self, Read};
  1045. use crate::helpers::test_init;
  1046. use super::*;
  1047. #[test]
  1048. fn split_g() -> anyhow::Result<()> {
  1049. test_init();
  1050. let n_parts = 4;
  1051. let reader = bam::Reader::from_path(
  1052. "/mnt/beegfs02/scratch/t_steimle/data/wgs/34528/norm/34528_norm_hs1.bam",
  1053. )
  1054. .with_context(|| format!("Failed to open BAM"))?;
  1055. let header = bam::Header::from_template(reader.header());
  1056. let genome_sizes = get_genome_sizes(&header)?;
  1057. // Split genome into regions
  1058. let regions = split_genome_into_n_regions_exact(&genome_sizes, n_parts);
  1059. info!("{:#?}", regions);
  1060. Ok(())
  1061. }
  1062. }