helpers.rs 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628
  1. use anyhow::Context;
  2. use bitcode::{Decode, Encode};
  3. use glob::glob;
  4. use log::{debug, warn};
  5. use serde::{Deserialize, Serialize};
  6. use std::{
  7. cmp::Ordering,
  8. collections::HashMap,
  9. fmt, fs,
  10. iter::Sum,
  11. ops::{Add, Div},
  12. path::{Path, PathBuf},
  13. };
  14. pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String> {
  15. let mut matching_files = Vec::new();
  16. for entry in
  17. fs::read_dir(dir_path).with_context(|| format!("Failed to read directory: {}", dir_path))?
  18. {
  19. let entry = entry.with_context(|| "Failed to read directory entry")?;
  20. let path = entry.path();
  21. if path.is_file()
  22. && path
  23. .file_name()
  24. .and_then(|name| name.to_str())
  25. .map(|name| name.ends_with(suffix))
  26. .unwrap_or(false)
  27. {
  28. matching_files.push(path);
  29. }
  30. }
  31. match matching_files.len() {
  32. 0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix))
  33. .with_context(|| format!("In directory: {}", dir_path)),
  34. 1 => Ok(matching_files[0].to_string_lossy().into_owned()),
  35. _ => Err(anyhow::anyhow!(
  36. "Multiple files found ending with '{}'",
  37. suffix
  38. ))
  39. .with_context(|| format!("In directory: {}", dir_path)),
  40. }
  41. }
  42. pub fn path_prefix(out: &str) -> anyhow::Result<String> {
  43. let out_path = Path::new(&out);
  44. let out_dir = out_path
  45. .parent()
  46. .ok_or_else(|| anyhow::anyhow!("Can't parse the dir of {}", out_path.display()))?;
  47. let name = out_path
  48. .file_name()
  49. .and_then(|name| name.to_str())
  50. .ok_or_else(|| anyhow::anyhow!("Can't parse the file name of {}", out_path.display()))?;
  51. let stem = name
  52. .split_once('.')
  53. .map(|(stem, _)| stem)
  54. .ok_or_else(|| anyhow::anyhow!("Can't parse the file stem of {}", name))?;
  55. Ok(format!("{}/{stem}", out_dir.display()))
  56. }
  57. pub fn force_or_not(_path: &str, _force: bool) -> anyhow::Result<()> {
  58. // let path = Path::new(path);
  59. // let mut output_exists = path.exists();
  60. // let dir = path
  61. // .parent()
  62. // .context(format!("Can't parse the parent dir of {}", path.display()))?;
  63. //
  64. // if force && output_exists {
  65. // fs::remove_dir_all(dir)?;
  66. // fs::create_dir_all(dir)?;
  67. // output_exists = false;
  68. // }
  69. //
  70. // if output_exists {
  71. // info!("{} already exists.", path.display())
  72. // }
  73. Ok(())
  74. }
  75. use rayon::prelude::*;
  76. use std::cmp::Ord;
  /// Result of comparing two sequences element by element.
  ///
  /// Produced by `intersection` / `par_intersection` in this file.
  pub struct VectorIntersection<T> {
      /// Items present in both inputs.
      pub common: Vec<T>,
      /// Items found only in the first input.
      pub only_in_first: Vec<T>,
      /// Items found only in the second input.
      pub only_in_second: Vec<T>,
  }
  82. impl<T> fmt::Display for VectorIntersection<T> {
  83. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  84. let total_items = self.common.len() + self.only_in_first.len() + self.only_in_second.len();
  85. writeln!(f, "Total items: {}", total_items)?;
  86. writeln!(
  87. f,
  88. "Common: {} ({:.2}%)",
  89. self.common.len(),
  90. percentage(self.common.len(), total_items)
  91. )?;
  92. writeln!(
  93. f,
  94. "Only in first: {} ({:.2}%)",
  95. self.only_in_first.len(),
  96. percentage(self.only_in_first.len(), total_items)
  97. )?;
  98. writeln!(
  99. f,
  100. "Only in second: {} ({:.2}%)",
  101. self.only_in_second.len(),
  102. percentage(self.only_in_second.len(), total_items)
  103. )
  104. }
  105. }
  /// Share of `part` in `total`, expressed in percent.
  ///
  /// Returns `0.0` when `total` is zero so callers never divide by zero.
  fn percentage(part: usize, total: usize) -> f64 {
      match total {
          0 => 0.0,
          _ => (part as f64 / total as f64) * 100.0,
      }
  }
  113. impl<T> Default for VectorIntersection<T> {
  114. fn default() -> Self {
  115. Self {
  116. common: Vec::new(),
  117. only_in_first: Vec::new(),
  118. only_in_second: Vec::new(),
  119. }
  120. }
  121. }
  122. impl<T: Ord + Clone> VectorIntersection<T> {
  123. fn merge(&mut self, other: &mut Self) {
  124. self.common.append(&mut other.common);
  125. self.only_in_first.append(&mut other.only_in_first);
  126. self.only_in_second.append(&mut other.only_in_second);
  127. }
  128. }
  129. fn intersection<T: Ord + Clone>(vec1: &[T], vec2: &[T]) -> VectorIntersection<T> {
  130. let mut result = VectorIntersection::default();
  131. let mut i = 0;
  132. let mut j = 0;
  133. while i < vec1.len() && j < vec2.len() {
  134. match vec1[i].cmp(&vec2[j]) {
  135. Ordering::Less => {
  136. result.only_in_first.push(vec1[i].clone());
  137. i += 1;
  138. }
  139. Ordering::Greater => {
  140. result.only_in_second.push(vec2[j].clone());
  141. j += 1;
  142. }
  143. Ordering::Equal => {
  144. let val = &vec1[i];
  145. let mut count1 = 1;
  146. let mut count2 = 1;
  147. // Count occurrences in vec1
  148. while i + 1 < vec1.len() && &vec1[i + 1] == val {
  149. i += 1;
  150. count1 += 1;
  151. }
  152. // Count occurrences in vec2
  153. while j + 1 < vec2.len() && &vec2[j + 1] == val {
  154. j += 1;
  155. count2 += 1;
  156. }
  157. // Add to common
  158. result
  159. .common
  160. .extend(std::iter::repeat(val.clone()).take(count1.min(count2)));
  161. // Add excess to only_in_first or only_in_second
  162. match count1.cmp(&count2) {
  163. Ordering::Greater => {
  164. result
  165. .only_in_first
  166. .extend(std::iter::repeat(val.clone()).take(count1 - count2));
  167. }
  168. Ordering::Less => {
  169. result
  170. .only_in_second
  171. .extend(std::iter::repeat(val.clone()).take(count2 - count1));
  172. }
  173. Ordering::Equal => {
  174. // No excess elements, do nothing
  175. }
  176. }
  177. i += 1;
  178. j += 1;
  179. }
  180. }
  181. }
  182. result.only_in_first.extend(vec1[i..].iter().cloned());
  183. result.only_in_second.extend(vec2[j..].iter().cloned());
  184. result
  185. }
  186. pub fn par_intersection<T: Ord + Send + Sync + Clone>(
  187. vec1: &[T],
  188. vec2: &[T],
  189. ) -> VectorIntersection<T> {
  190. let chunk_size = (vec1.len() / rayon::current_num_threads()).max(1);
  191. vec1.par_chunks(chunk_size)
  192. .map(|chunk| {
  193. let start = vec2.partition_point(|x| x < &chunk[0]);
  194. let end = vec2.partition_point(|x| x <= &chunk[chunk.len() - 1]);
  195. // Ensure start is not greater than end
  196. if start <= end {
  197. intersection(chunk, &vec2[start..end])
  198. } else {
  199. // If start > end, there's no intersection for this chunk
  200. VectorIntersection::default()
  201. }
  202. })
  203. .reduce(VectorIntersection::default, |mut acc, mut x| {
  204. acc.merge(&mut x);
  205. acc
  206. })
  207. }
  208. pub fn temp_file_path(suffix: &str) -> std::io::Result<PathBuf> {
  209. let temp_path = tempfile::Builder::new()
  210. .prefix("pandora-temp-")
  211. .suffix(suffix)
  212. .rand_bytes(5)
  213. .tempfile()?
  214. .into_temp_path();
  215. Ok(temp_path.to_path_buf())
  216. }
  /// Shannon entropy, in bits per symbol, of the characters in `dna_sequence`.
  ///
  /// Every distinct `char` is treated as its own symbol, so the value ranges
  /// from `0.0` (one repeated symbol, or an empty string) up to `log2(k)` for
  /// `k` equally frequent symbols — e.g. `2.0` for a uniform mix of
  /// `A`/`C`/`G`/`T`.
  ///
  /// Frequencies are computed against the number of characters (not bytes),
  /// so non-ASCII input is handled consistently.
  pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
      // Count occurrences of each symbol and the total in a single pass.
      let mut counts = HashMap::<char, usize>::new();
      let mut total = 0usize;
      for base in dna_sequence.chars() {
          *counts.entry(base).or_insert(0) += 1;
          total += 1;
      }

      // Early return for empty sequences.
      if total == 0 {
          return 0.0;
      }
      let m = total as f64;

      // HashMap iteration order is randomised per process; summing in sorted
      // order keeps floating-point rounding identical from run to run.
      let mut tallies: Vec<usize> = counts.into_values().collect();
      tallies.sort_unstable();

      tallies
          .into_iter()
          .map(|n_i| {
              // n_i >= 1, so p_i > 0 and log2 is always defined.
              let p_i = n_i as f64 / m;
              -p_i * p_i.log2()
          })
          .sum()
  }
  /// Arithmetic mean of `values` as an `f64`; `0.0` for an empty slice.
  ///
  /// Each element is converted to `f64` *before* being added up, so narrow
  /// integer types (e.g. `u8`, `i16`) cannot overflow while summing.
  ///
  /// The trait bounds are wider than the body needs; they are kept as-is so
  /// existing callers continue to compile unchanged.
  pub fn mean<T>(values: &[T]) -> f64
  where
      T: Copy + Add<Output = T> + Div<Output = T> + Sum + Into<f64>,
  {
      let count = values.len();
      if count == 0 {
          return 0.0;
      }
      let sum: f64 = values.iter().map(|&v| Into::<f64>::into(v)).sum();
      sum / count as f64
  }
  /// Groups `data` into consecutive bins of width `bin_size`.
  ///
  /// The first bin starts at the smallest value; each following bin starts
  /// `bin_size` after the previous one. The result lists every bin from the
  /// first to the one holding the largest value as `(bin start, count)`,
  /// including bins with a count of zero, so each value is counted in the bin
  /// whose half-open range `[start, start + bin_size)` actually contains it.
  ///
  /// Returns an empty vector for empty input.
  ///
  /// If `bin_size` does not move the bin edge forward (zero, negative or NaN
  /// width) all remaining values are counted in the current bin rather than
  /// looping forever.
  ///
  /// # Panics
  ///
  /// Panics if two values cannot be ordered (e.g. a NaN among floats).
  pub fn bin_data<V>(data: Vec<V>, bin_size: V) -> Vec<(V, usize)>
  where
      V: Copy + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
  {
      // `data` is owned, so it can be sorted in place without a copy.
      let mut sorted_data = data;
      sorted_data.sort_by(|a, b| {
          a.partial_cmp(b)
              .expect("bin_data: values must be comparable (no NaN)")
      });

      let mut current_bin_start = match sorted_data.first() {
          Some(&smallest) => smallest,
          None => return Vec::new(),
      };

      let mut bins: Vec<(V, usize)> = Vec::new();
      let mut count = 0usize;
      for &value in &sorted_data {
          // Close bins (possibly empty ones) until `value` fits the current one.
          loop {
              let upper_edge = current_bin_start + bin_size;
              let advances = upper_edge > current_bin_start;
              if value < upper_edge || !advances {
                  break;
              }
              bins.push((current_bin_start, count));
              current_bin_start += bin_size;
              count = 0;
          }
          count += 1;
      }
      // Push the last bin
      bins.push((current_bin_start, count));
      bins
  }
  277. // fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> {
  278. // if data.is_empty() || num_bins == 0 {
  279. // return vec![];
  280. // }
  281. //
  282. // let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize;
  283. //
  284. // (0..num_bins)
  285. // .map(|i| {
  286. // let start = i * bin_size;
  287. // let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()`
  288. //
  289. // // If `start` is out of bounds, return (0, 0)
  290. // if start >= data.len() {
  291. // return (0, 0);
  292. // }
  293. //
  294. // let bin = &data[start..end];
  295. //
  296. // let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum();
  297. // let count = bin.len() as u128;
  298. // let mean_x = (sum_x / count) as u32; // Rounded down to nearest u32
  299. //
  300. // let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum();
  301. //
  302. // (mean_x, sum_n)
  303. // })
  304. // .collect()
  305. // }
  306. //
  307. pub fn app_storage_dir() -> anyhow::Result<PathBuf> {
  308. let app_name = env!("CARGO_PKG_NAME");
  309. let app_dir = dirs::data_dir()
  310. .context("Failed to get data directory")?
  311. .join(app_name);
  312. if !app_dir.exists() {
  313. fs::create_dir_all(&app_dir).context("Failed to create application directory")?;
  314. }
  315. Ok(app_dir)
  316. }
  317. use blake3::Hasher as Blake3Hasher;
  318. use std::hash::{BuildHasher, Hasher};
  319. pub struct Blake3Hash(Blake3Hasher);
  320. impl Hasher for Blake3Hash {
  321. fn finish(&self) -> u64 {
  322. let hash = self.0.finalize();
  323. u64::from_le_bytes(hash.as_bytes()[..8].try_into().unwrap())
  324. }
  325. fn write(&mut self, bytes: &[u8]) {
  326. self.0.update(bytes);
  327. }
  328. }
  329. /// Default Hasher
  330. #[derive(Default, Clone)]
  331. pub struct Blake3BuildHasher;
  332. impl BuildHasher for Blake3BuildHasher {
  333. type Hasher = Blake3Hash;
  334. fn build_hasher(&self) -> Self::Hasher {
  335. Blake3Hash(Blake3Hasher::new())
  336. }
  337. }
  338. // Custom 128-bit hash type
  339. #[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug, Encode, Decode)]
  340. pub struct Hash128([u8; 16]);
  341. impl std::hash::Hash for Hash128 {
  342. fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
  343. state.write(&self.0);
  344. }
  345. }
  346. impl Hash128 {
  347. pub fn new(bytes: [u8; 16]) -> Self {
  348. Hash128(bytes)
  349. }
  350. pub fn to_bytes(&self) -> [u8; 16] {
  351. self.0
  352. }
  353. }
  /// Total size in bytes of all files below `path`, descending into
  /// sub-directories recursively.
  ///
  /// Returns `0` when `path` is not a directory (including when it does not
  /// exist). Entries that are neither a file nor a directory contribute
  /// nothing.
  ///
  /// # Errors
  ///
  /// Propagates the first I/O error hit while listing a directory or reading
  /// a file's metadata.
  pub fn get_dir_size(path: &Path) -> std::io::Result<u64> {
      if !path.is_dir() {
          return Ok(0);
      }

      let mut bytes = 0u64;
      for item in fs::read_dir(path)? {
          let child = item?.path();
          bytes += if child.is_file() {
              child.metadata()?.len()
          } else if child.is_dir() {
              get_dir_size(&child)?
          } else {
              0
          };
      }
      Ok(bytes)
  }
  369. /// Finds all files matching the given glob pattern.
  370. ///
  371. /// # Arguments
  372. ///
  373. /// * `pattern` - A glob pattern string (e.g., `"data/**/*.txt"`).
  374. ///
  375. /// # Returns
  376. ///
  377. /// * `Ok(Vec<PathBuf>)` with all successfully matched file paths.
  378. /// * `Err` if the glob pattern is invalid or any matched path fails to resolve.
  379. ///
  380. /// # Errors
  381. ///
  382. /// Returns an error if:
  383. /// - The glob pattern is invalid.
  384. /// - A file path matching the pattern cannot be resolved.
  385. ///
  386. /// # Examples
  387. ///
  388. /// ```rust
  389. /// let files = find_files("src/**/*.rs")?;
  390. /// for file in files {
  391. /// println!("{:?}", file);
  392. /// }
  393. /// ```
  394. pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
  395. let mut result = Vec::new();
  396. let entries = glob(pattern)
  397. .with_context(|| format!("Invalid glob pattern: '{}'", pattern))?;
  398. for entry in entries {
  399. let path = entry.with_context(|| format!("Failed to resolve path for pattern '{}'", pattern))?;
  400. result.push(path);
  401. }
  402. Ok(result)
  403. }
  404. // fn system_time_to_utc(system_time: SystemTime) -> Option<DateTime<Utc>> {
  405. // system_time
  406. // .duration_since(UNIX_EPOCH)
  407. // .ok()
  408. // .map(|duration| Utc.timestamp(duration.as_secs() as i64, duration.subsec_nanos()))
  409. // }
  /// Names of the sub-directories directly inside `dir_path`.
  ///
  /// Only the final path component of each directory is returned, in the
  /// order `read_dir` yields the entries. Directories whose name is not valid
  /// UTF-8 are skipped.
  ///
  /// # Errors
  ///
  /// Returns the I/O error if `dir_path` cannot be listed or an entry cannot
  /// be read.
  pub fn list_directories(dir_path: &str) -> std::io::Result<Vec<String>> {
      fs::read_dir(dir_path)?
          .map(|item| item.map(|entry| entry.path()))
          .filter_map(|candidate| match candidate {
              // Keep errors in the stream so `collect` stops at the first one.
              Err(err) => Some(Err(err)),
              Ok(path) if path.is_dir() => path
                  .file_name()
                  .and_then(|name| name.to_str())
                  .map(|name| Ok(name.to_string())),
              Ok(_) => None,
          })
          .collect()
  }
  424. /// Checks whether the modification time of `file1` is older than `file2`.
  425. ///
  426. /// If `rm` is `true` and `file1` is older, attempts to remove the directory containing `file1`.
  427. ///
  428. /// # Arguments
  429. ///
  430. /// * `file1` - Path to the first file.
  431. /// * `file2` - Path to the second file.
  432. /// * `rm` - If true, and `file1` is older, attempts to remove its parent directory.
  433. ///
  434. /// # Returns
  435. ///
  436. /// * `Ok(true)` if `file1` is older than `file2`.
  437. /// * `Ok(false)` otherwise.
  438. ///
  439. /// # Errors
  440. ///
  441. /// Returns an [`anyhow::Error`] if:
  442. /// - Either `file1` or `file2` does not exist.
  443. /// - File metadata cannot be read.
  444. /// - File modification times cannot be retrieved.
  445. /// - (if `rm == true`) Directory removal fails (if uncommented).
  446. ///
  447. pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
  448. let mtime1 = fs::metadata(file1)
  449. .with_context(|| format!("Failed to read metadata for '{}'", file1))?
  450. .modified()
  451. .with_context(|| format!("Failed to get modified time for '{}'", file1))?;
  452. let mtime2 = fs::metadata(file2)
  453. .with_context(|| format!("Failed to read metadata for '{}'", file2))?
  454. .modified()
  455. .with_context(|| format!("Failed to get modified time for '{}'", file2))?;
  456. if mtime1 < mtime2 && rm {
  457. if let Some(file1_dir) = Path::new(file1).parent() {
  458. warn!("Removing old directory: {}", file1_dir.display());
  459. // fs::remove_dir(file1_dir)?;
  460. }
  461. }
  462. Ok(mtime1 < mtime2)
  463. }
  464. pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
  465. debug!("Trying to remove: {dir}");
  466. // match fs::remove_dir_all(dir) {
  467. // Ok(_) => {}
  468. // Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
  469. // Err(e) => {
  470. // anyhow::bail!("Failed to remove directory '{}': {}", dir, e);
  471. // }
  472. // };
  473. Ok(())
  474. }
  /// Finds the first regular file in `dir` whose name has the given prefix
  /// and suffix.
  ///
  /// Only the direct children of `dir` are examined. "First" means first in
  /// the order `read_dir` yields entries. Entries that cannot be read, or
  /// whose name is not valid UTF-8, are skipped.
  ///
  /// # Arguments
  ///
  /// * `dir` - Directory to search.
  /// * `starts_with` - Required prefix of the file name (e.g., `"throughput_"`).
  /// * `ends_with` - Required suffix of the file name (e.g., `".csv"`).
  ///
  /// # Returns
  ///
  /// * `Some(PathBuf)` - Full path of the first matching file.
  /// * `None` - If nothing matches or the directory can't be read.
  ///
  /// # Example
  ///
  /// ```ignore
  /// let dir = std::path::Path::new("/path/to/data");
  /// if let Some(path) = find_matching_file(dir, "throughput_", ".csv") {
  ///     println!("Found file: {}", path.display());
  /// } else {
  ///     eprintln!("No matching file found.");
  /// }
  /// ```
  pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
      let listing = fs::read_dir(dir).ok()?;

      for candidate in listing.flatten().map(|entry| entry.path()) {
          if !candidate.is_file() {
              continue;
          }
          let name_fits = match candidate.file_name().and_then(|name| name.to_str()) {
              Some(name) => name.starts_with(starts_with) && name.ends_with(ends_with),
              None => false,
          };
          if name_fits {
              return Some(candidate);
          }
      }
      None
  }
  /// Returns the first regular file in `dir` whose file name satisfies
  /// `condition`.
  ///
  /// Only the direct children of `dir` are examined, in the order `read_dir`
  /// yields them. Entries that cannot be read, or whose name is not valid
  /// UTF-8, are skipped without calling `condition`.
  ///
  /// # Arguments
  ///
  /// * `dir` - Path to the directory to search.
  /// * `condition` - A closure that takes a file name (`&str`) and returns
  ///   `true` if the file is the one being looked for.
  ///
  /// # Returns
  ///
  /// The path of the first matching file, or `None` if no file matches or the
  /// directory can't be read.
  ///
  /// # Examples
  ///
  /// ```ignore
  /// use std::path::Path;
  /// let result = find_file(Path::new("."), |name| name.ends_with(".rs"));
  /// ```
  pub fn find_file<F>(dir: &Path, condition: F) -> Option<PathBuf>
  where
      F: Fn(&str) -> bool,
  {
      let listing = fs::read_dir(dir).ok()?;

      for item in listing {
          let candidate = match item {
              Ok(entry) => entry.path(),
              Err(_) => continue,
          };
          if !candidate.is_file() {
              continue;
          }
          let accepted = match candidate.file_name().and_then(|name| name.to_str()) {
              Some(name) => condition(name),
              None => false,
          };
          if accepted {
              return Some(candidate);
          }
      }
      None
  }
  554. }