helpers.rs 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398
  1. use anyhow::Context;
  2. use bitcode::{Decode, Encode};
  3. use glob::glob;
  4. use log::{debug, error, warn};
  5. use rust_htslib::bam::Read;
  6. use rustc_hash::FxHashMap;
  7. use serde::{Deserialize, Serialize};
  8. use std::{
  9. cmp::Ordering,
  10. collections::{HashMap, HashSet},
  11. fmt, fs,
  12. iter::Sum,
  13. ops::{Add, Div},
  14. path::{Path, PathBuf},
  15. sync::{atomic::AtomicBool, OnceLock},
  16. };
  17. use uuid::Uuid;
  18. pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String> {
  19. let mut matching_files = Vec::new();
  20. for entry in
  21. fs::read_dir(dir_path).with_context(|| format!("Failed to read directory: {}", dir_path))?
  22. {
  23. let entry = entry.with_context(|| "Failed to read directory entry")?;
  24. let path = entry.path();
  25. if path.is_file()
  26. && path
  27. .file_name()
  28. .and_then(|name| name.to_str())
  29. .map(|name| name.ends_with(suffix))
  30. .unwrap_or(false)
  31. {
  32. matching_files.push(path);
  33. }
  34. }
  35. match matching_files.len() {
  36. 0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix))
  37. .with_context(|| format!("In directory: {}", dir_path)),
  38. 1 => Ok(matching_files[0].to_string_lossy().into_owned()),
  39. _ => {
  40. matching_files.sort();
  41. let matches = matching_files
  42. .iter()
  43. .map(|p| p.display().to_string())
  44. .collect::<Vec<_>>()
  45. .join(", ");
  46. Err(anyhow::anyhow!(
  47. "Multiple files found ending with '{}': {}",
  48. suffix,
  49. matches
  50. ))
  51. .with_context(|| format!("In directory: {}", dir_path))
  52. }
  53. }
  54. }
  55. /// Return the output prefix by removing all suffixes from the first dot in the filename.
  56. ///
  57. /// This is intentionally stricter than [`Path::file_stem`]: filenames such as
  58. /// `sample.v1.vcf.gz` become `sample`, not `sample.v1.vcf`. This matches tools
  59. /// that expect a prefix rather than a single-extension stem.
  60. ///
  61. /// # Examples
  62. ///
  63. /// ```text
  64. /// /tmp/sample.vcf.gz -> /tmp/sample
  65. /// /tmp/sample.v1.vcf.gz -> /tmp/sample
  66. /// ```
  67. pub fn path_prefix(out: &str) -> anyhow::Result<String> {
  68. let out_path = Path::new(out);
  69. let out_dir = out_path
  70. .parent()
  71. .ok_or_else(|| anyhow::anyhow!("Can't parse the dir of {}", out_path.display()))?;
  72. let name = out_path
  73. .file_name()
  74. .and_then(|name| name.to_str())
  75. .ok_or_else(|| anyhow::anyhow!("Can't parse the file name of {}", out_path.display()))?;
  76. let stem = name
  77. .split_once('.')
  78. .map(|(stem, _)| stem)
  79. .ok_or_else(|| anyhow::anyhow!("Can't parse the file stem of {}", name))?;
  80. Ok(format!("{}/{stem}", out_dir.display()))
  81. }
  82. pub fn force_or_not(_path: &str, _force: bool) -> anyhow::Result<()> {
  83. // let path = Path::new(path);
  84. // let mut output_exists = path.exists();
  85. // let dir = path
  86. // .parent()
  87. // .context(format!("Can't parse the parent dir of {}", path.display()))?;
  88. //
  89. // if force && output_exists {
  90. // fs::remove_dir_all(dir)?;
  91. // fs::create_dir_all(dir)?;
  92. // output_exists = false;
  93. // }
  94. //
  95. // if output_exists {
  96. // info!("{} already exists.", path.display())
  97. // }
  98. Ok(())
  99. }
  100. /// Builds Singularity bind flags from input/output paths.
  101. ///
  102. /// - Converts file paths to their parent directory.
  103. /// - Skips non-existing paths to avoid Singularity errors.
  104. /// - Canonicalizes and deduplicates directories.
  105. pub fn singularity_bind_flags<I, P>(paths: I) -> String
  106. where
  107. I: IntoIterator<Item = P>,
  108. P: AsRef<Path>,
  109. {
  110. let mut seen = HashSet::new();
  111. let mut binds = Vec::new();
  112. let mut push_dir = |p: &Path| {
  113. if p.as_os_str().is_empty() || !p.exists() {
  114. return;
  115. }
  116. let dir = p.canonicalize().unwrap_or_else(|_| p.to_path_buf());
  117. if seen.insert(dir.clone()) {
  118. binds.push(format!(
  119. "--bind {src}:{dst}",
  120. src = dir.display(),
  121. dst = dir.display()
  122. ));
  123. }
  124. };
  125. for path_str in paths {
  126. let path = path_str.as_ref();
  127. if path.is_dir() {
  128. push_dir(path);
  129. } else if let Some(parent) = path.parent() {
  130. push_dir(parent);
  131. }
  132. }
  133. binds.join(" ")
  134. }
  135. use std::cmp::Ord;
  136. pub struct VectorIntersection<T> {
  137. pub common: Vec<T>,
  138. pub only_in_first: Vec<T>,
  139. pub only_in_second: Vec<T>,
  140. }
  141. impl<T> fmt::Display for VectorIntersection<T> {
  142. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  143. let total_items = self.common.len() + self.only_in_first.len() + self.only_in_second.len();
  144. writeln!(f, "Total items: {}", total_items)?;
  145. writeln!(
  146. f,
  147. "Common: {} ({:.2}%)",
  148. self.common.len(),
  149. percentage(self.common.len(), total_items)
  150. )?;
  151. writeln!(
  152. f,
  153. "Only in first: {} ({:.2}%)",
  154. self.only_in_first.len(),
  155. percentage(self.only_in_first.len(), total_items)
  156. )?;
  157. writeln!(
  158. f,
  159. "Only in second: {} ({:.2}%)",
  160. self.only_in_second.len(),
  161. percentage(self.only_in_second.len(), total_items)
  162. )
  163. }
  164. }
  165. fn percentage(part: usize, total: usize) -> f64 {
  166. if total == 0 {
  167. 0.0
  168. } else {
  169. (part as f64 / total as f64) * 100.0
  170. }
  171. }
  172. impl<T> Default for VectorIntersection<T> {
  173. fn default() -> Self {
  174. Self {
  175. common: Vec::new(),
  176. only_in_first: Vec::new(),
  177. only_in_second: Vec::new(),
  178. }
  179. }
  180. }
  181. impl<T: Ord + Clone> VectorIntersection<T> {
  182. fn merge(&mut self, other: &mut Self) {
  183. self.common.append(&mut other.common);
  184. self.only_in_first.append(&mut other.only_in_first);
  185. self.only_in_second.append(&mut other.only_in_second);
  186. }
  187. }
  188. fn intersection<T: Ord + Clone>(vec1: &[T], vec2: &[T]) -> VectorIntersection<T> {
  189. let mut result = VectorIntersection::default();
  190. let mut i = 0;
  191. let mut j = 0;
  192. while i < vec1.len() && j < vec2.len() {
  193. match vec1[i].cmp(&vec2[j]) {
  194. Ordering::Less => {
  195. result.only_in_first.push(vec1[i].clone());
  196. i += 1;
  197. }
  198. Ordering::Greater => {
  199. result.only_in_second.push(vec2[j].clone());
  200. j += 1;
  201. }
  202. Ordering::Equal => {
  203. let val = &vec1[i];
  204. let mut count1 = 1;
  205. let mut count2 = 1;
  206. // Count occurrences in vec1
  207. while i + 1 < vec1.len() && &vec1[i + 1] == val {
  208. i += 1;
  209. count1 += 1;
  210. }
  211. // Count occurrences in vec2
  212. while j + 1 < vec2.len() && &vec2[j + 1] == val {
  213. j += 1;
  214. count2 += 1;
  215. }
  216. // Add to common
  217. result
  218. .common
  219. .extend(std::iter::repeat_n(val.clone(), count1.min(count2)));
  220. // Add excess to only_in_first or only_in_second
  221. match count1.cmp(&count2) {
  222. Ordering::Greater => {
  223. result
  224. .only_in_first
  225. .extend(std::iter::repeat_n(val.clone(), count1 - count2));
  226. }
  227. Ordering::Less => {
  228. result
  229. .only_in_second
  230. .extend(std::iter::repeat_n(val.clone(), count2 - count1));
  231. }
  232. Ordering::Equal => {
  233. // No excess elements, do nothing
  234. }
  235. }
  236. i += 1;
  237. j += 1;
  238. }
  239. }
  240. }
  241. result.only_in_first.extend(vec1[i..].iter().cloned());
  242. result.only_in_second.extend(vec2[j..].iter().cloned());
  243. result
  244. }
  245. pub fn par_intersection<T: Ord + Send + Sync + Clone>(
  246. vec1: &[T],
  247. vec2: &[T],
  248. ) -> VectorIntersection<T> {
  249. par_intersection_by_value(vec1, vec2)
  250. }
  251. fn par_intersection_by_value<T: Ord + Send + Sync + Clone>(
  252. vec1: &[T],
  253. vec2: &[T],
  254. ) -> VectorIntersection<T> {
  255. const SEQUENTIAL_THRESHOLD: usize = 4096;
  256. if vec1.len() + vec2.len() <= SEQUENTIAL_THRESHOLD {
  257. return intersection(vec1, vec2);
  258. }
  259. if vec1.is_empty() {
  260. return VectorIntersection {
  261. common: Vec::new(),
  262. only_in_first: Vec::new(),
  263. only_in_second: vec2.to_vec(),
  264. };
  265. }
  266. if vec2.is_empty() {
  267. return VectorIntersection {
  268. common: Vec::new(),
  269. only_in_first: vec1.to_vec(),
  270. only_in_second: Vec::new(),
  271. };
  272. }
  273. let pivot = &vec1[vec1.len() / 2];
  274. let v1_left = vec1.partition_point(|x| x < pivot);
  275. let v1_right = vec1.partition_point(|x| x <= pivot);
  276. let v2_left = vec2.partition_point(|x| x < pivot);
  277. let v2_right = vec2.partition_point(|x| x <= pivot);
  278. let (mut left, mut right) = rayon::join(
  279. || par_intersection_by_value(&vec1[..v1_left], &vec2[..v2_left]),
  280. || par_intersection_by_value(&vec1[v1_right..], &vec2[v2_right..]),
  281. );
  282. let count1 = v1_right - v1_left;
  283. let count2 = v2_right - v2_left;
  284. let mut middle = VectorIntersection::default();
  285. middle
  286. .common
  287. .extend(std::iter::repeat_n(pivot.clone(), count1.min(count2)));
  288. match count1.cmp(&count2) {
  289. Ordering::Greater => middle
  290. .only_in_first
  291. .extend(std::iter::repeat_n(pivot.clone(), count1 - count2)),
  292. Ordering::Less => middle
  293. .only_in_second
  294. .extend(std::iter::repeat_n(pivot.clone(), count2 - count1)),
  295. Ordering::Equal => {}
  296. }
  297. left.merge(&mut middle);
  298. left.merge(&mut right);
  299. left
  300. }
  301. pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
  302. let m = dna_sequence.len() as f64;
  303. // Early return for empty sequences
  304. if m == 0.0 {
  305. return 0.0;
  306. }
  307. // Count occurrences of each base
  308. let mut bases = HashMap::<char, usize>::new();
  309. for base in dna_sequence.chars() {
  310. *bases.entry(base).or_insert(0) += 1;
  311. }
  312. // Calculate Shannon entropy
  313. let shannon_entropy_value: f64 = bases
  314. .values()
  315. .map(|&n_i| {
  316. let p_i = n_i as f64 / m;
  317. if p_i > 0.0 {
  318. -p_i * p_i.log2()
  319. } else {
  320. 0.0 // Avoid log2(0)
  321. }
  322. })
  323. .sum();
  324. shannon_entropy_value
  325. }
  326. pub fn mean<T>(values: &[T]) -> f64
  327. where
  328. T: Copy + Add<Output = T> + Div<Output = T> + Sum + Into<f64>,
  329. {
  330. let count = values.len();
  331. if count == 0 {
  332. return 0.0;
  333. }
  334. let sum: T = values.iter().copied().sum();
  335. sum.into() / count as f64
  336. }
  337. pub fn bin_data<V>(data: Vec<V>, bin_size: V) -> Vec<(V, usize)>
  338. where
  339. V: Copy + Default + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
  340. {
  341. if data.is_empty() || bin_size.partial_cmp(&V::default()) != Some(Ordering::Greater) {
  342. return Vec::new();
  343. }
  344. // Sort comparable data points. For floats, this drops NaN values.
  345. let mut sorted_data: Vec<V> = data
  346. .into_iter()
  347. .filter(|v| v.partial_cmp(v) == Some(Ordering::Equal))
  348. .collect();
  349. if sorted_data.is_empty() {
  350. return Vec::new();
  351. }
  352. sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
  353. // Initialize bins
  354. let mut bins: Vec<(V, usize)> = Vec::new();
  355. let mut current_bin_start = sorted_data[0];
  356. let mut count = 0;
  357. for &value in &sorted_data {
  358. if value < current_bin_start + bin_size {
  359. count += 1;
  360. } else {
  361. bins.push((current_bin_start, count));
  362. current_bin_start += bin_size;
  363. while value >= current_bin_start + bin_size {
  364. current_bin_start += bin_size;
  365. }
  366. count = 1;
  367. }
  368. }
  369. // Push the last bin
  370. bins.push((current_bin_start, count));
  371. bins
  372. }
  373. // fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> {
  374. // if data.is_empty() || num_bins == 0 {
  375. // return vec![];
  376. // }
  377. //
  378. // let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize;
  379. //
  380. // (0..num_bins)
  381. // .map(|i| {
  382. // let start = i * bin_size;
  383. // let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()`
  384. //
  385. // // If `start` is out of bounds, return (0, 0)
  386. // if start >= data.len() {
  387. // return (0, 0);
  388. // }
  389. //
  390. // let bin = &data[start..end];
  391. //
  392. // let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum();
  393. // let count = bin.len() as u128;
  394. // let mean_x = (sum_x / count) as u32; // Rounded down to nearest u32
  395. //
  396. // let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum();
  397. //
  398. // (mean_x, sum_n)
  399. // })
  400. // .collect()
  401. // }
  402. //
  403. pub fn app_storage_dir() -> anyhow::Result<PathBuf> {
  404. let app_name = env!("CARGO_PKG_NAME");
  405. let app_dir = dirs::data_dir()
  406. .context("Failed to get data directory")?
  407. .join(app_name);
  408. if !app_dir.exists() {
  409. fs::create_dir_all(&app_dir).context("Failed to create application directory")?;
  410. }
  411. Ok(app_dir)
  412. }
  413. use blake3::Hasher as Blake3Hasher;
  414. use std::hash::{BuildHasher, Hasher};
  415. pub struct Blake3Hash(Blake3Hasher);
  416. impl Hasher for Blake3Hash {
  417. fn finish(&self) -> u64 {
  418. let hash = self.0.finalize();
  419. u64::from_le_bytes(hash.as_bytes()[..8].try_into().unwrap())
  420. }
  421. fn write(&mut self, bytes: &[u8]) {
  422. self.0.update(bytes);
  423. }
  424. }
  425. /// Default Hasher
  426. #[derive(Default, Clone)]
  427. pub struct Blake3BuildHasher;
  428. impl BuildHasher for Blake3BuildHasher {
  429. type Hasher = Blake3Hash;
  430. fn build_hasher(&self) -> Self::Hasher {
  431. Blake3Hash(Blake3Hasher::new())
  432. }
  433. }
  434. // Custom 128-bit hash type
  435. #[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug, Encode, Decode)]
  436. pub struct Hash128([u8; 16]);
  437. impl std::hash::Hash for Hash128 {
  438. fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
  439. state.write(&self.0);
  440. }
  441. }
  442. impl Hash128 {
  443. pub fn new(bytes: [u8; 16]) -> Self {
  444. Hash128(bytes)
  445. }
  446. pub fn to_bytes(&self) -> [u8; 16] {
  447. self.0
  448. }
  449. }
  450. pub fn get_dir_size(path: &Path) -> std::io::Result<u64> {
  451. let mut total_size = 0;
  452. if path.is_dir() {
  453. for entry in fs::read_dir(path)? {
  454. let entry = entry?;
  455. let path = entry.path();
  456. if path.is_file() {
  457. total_size += path.metadata()?.len();
  458. } else if path.is_dir() {
  459. total_size += get_dir_size(&path)?;
  460. }
  461. }
  462. }
  463. Ok(total_size)
  464. }
  465. /// Finds all files matching the given glob pattern.
  466. ///
  467. /// # Arguments
  468. ///
  469. /// * `pattern` - A glob pattern string (e.g., `"data/**/*.txt"`).
  470. ///
  471. /// # Returns
  472. ///
  473. /// * `Ok(Vec<PathBuf>)` with all successfully matched file paths.
  474. /// * `Err` if the glob pattern is invalid or any matched path fails to resolve.
  475. ///
  476. /// # Errors
  477. ///
  478. /// Returns an error if:
  479. /// - The glob pattern is invalid.
  480. /// - A file path matching the pattern cannot be resolved.
  481. ///
  482. /// # Examples
  483. ///
  484. /// ```rust
  485. /// let files = find_files("src/**/*.rs")?;
  486. /// for file in files {
  487. /// println!("{:?}", file);
  488. /// }
  489. /// ```
  490. pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
  491. let mut result = Vec::new();
  492. let entries = glob(pattern).with_context(|| format!("Invalid glob pattern: '{}'", pattern))?;
  493. for entry in entries {
  494. let path =
  495. entry.with_context(|| format!("Failed to resolve path for pattern '{}'", pattern))?;
  496. result.push(path);
  497. }
  498. result.sort();
  499. Ok(result)
  500. }
  501. // fn system_time_to_utc(system_time: SystemTime) -> Option<DateTime<Utc>> {
  502. // system_time
  503. // .duration_since(UNIX_EPOCH)
  504. // .ok()
  505. // .map(|duration| Utc.timestamp(duration.as_secs() as i64, duration.subsec_nanos()))
  506. // }
  507. /// List all files in a directory that have the given suffix (file extension).
  508. ///
  509. /// # Arguments
  510. /// * `dir` – Directory to scan.
  511. /// * `suffix` – File extension to match (without the dot).
  512. ///
  513. /// # Returns
  514. /// A vector of `PathBuf` entries whose extension matches `suffix`.
  515. ///
  516. /// # Errors
  517. /// Returns any I/O error from `read_dir` or directory traversal.
  518. pub fn list_files_with_ext(dir: impl AsRef<Path>, ext: &str) -> std::io::Result<Vec<PathBuf>> {
  519. let mut out = Vec::new();
  520. for entry in fs::read_dir(dir)? {
  521. let entry = entry?;
  522. let path = entry.path();
  523. if path.is_file() {
  524. if let Some(file_ext) = path.extension().and_then(|e| e.to_str()) {
  525. if file_ext == ext {
  526. out.push(path);
  527. }
  528. }
  529. }
  530. }
  531. out.sort();
  532. Ok(out)
  533. }
  534. pub fn list_directories(dir_path: PathBuf) -> std::io::Result<Vec<String>> {
  535. let mut directories = Vec::new();
  536. for entry in fs::read_dir(dir_path)? {
  537. let entry = entry?;
  538. let path = entry.path();
  539. // Check if the path is a directory
  540. if path.is_dir() {
  541. if let Some(dir_name) = path.file_name().and_then(|name| name.to_str()) {
  542. directories.push(dir_name.to_string());
  543. }
  544. }
  545. }
  546. directories.sort();
  547. Ok(directories)
  548. }
  549. pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
  550. fn walk(dir: &Path, out: &mut Vec<PathBuf>) {
  551. let entries = match fs::read_dir(dir) {
  552. Ok(entries) => entries,
  553. Err(e) => {
  554. warn!("Cannot read directory {}: {}", dir.display(), e);
  555. return;
  556. }
  557. };
  558. for entry in entries {
  559. let entry = match entry {
  560. Ok(entry) => entry,
  561. Err(e) => {
  562. warn!("Cannot read directory entry in {}: {}", dir.display(), e);
  563. continue;
  564. }
  565. };
  566. let path = entry.path();
  567. if path.is_dir() {
  568. walk(&path, out);
  569. } else if path.is_file() {
  570. out.push(path);
  571. }
  572. }
  573. }
  574. let mut files = Vec::new();
  575. walk(root.as_ref(), &mut files);
  576. files.sort();
  577. files
  578. }
  579. /// Checks whether the modification time of `file1` is older than `file2`.
  580. ///
  581. /// If `rm` is `true` and `file1` is older, attempts to remove the directory containing `file1`.
  582. ///
  583. /// # Arguments
  584. ///
  585. /// * `file1` - Path to the first file.
  586. /// * `file2` - Path to the second file.
  587. /// * `rm` - If true, and `file1` is older, attempts to remove its parent directory.
  588. ///
  589. /// # Returns
  590. ///
  591. /// * `Ok(true)` if `file1` is older than `file2`.
  592. /// * `Ok(false)` otherwise.
  593. ///
  594. /// # Errors
  595. ///
  596. /// Returns an [`anyhow::Error`] if:
  597. /// - Either `file1` or `file2` does not exist.
  598. /// - File metadata cannot be read.
  599. /// - File modification times cannot be retrieved.
  600. /// - Directory removal fails when `rm == true` and `file1` is older.
  601. ///
  602. pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
  603. debug!("is_file_older {file1} {file2}, {rm:?}");
  604. let mtime1 = fs::metadata(file1)
  605. .with_context(|| format!("Failed to read metadata for '{}'", file1))?
  606. .modified()
  607. .with_context(|| format!("Failed to get modified time for '{}'", file1))?;
  608. let mtime2 = fs::metadata(file2)
  609. .with_context(|| format!("Failed to read metadata for '{}'", file2))?
  610. .modified()
  611. .with_context(|| format!("Failed to get modified time for '{}'", file2))?;
  612. if mtime1 < mtime2 && rm {
  613. if let Some(file1_dir) = Path::new(file1).parent() {
  614. warn!("Removing old directory: {}", file1_dir.display());
  615. fs::remove_dir_all(file1_dir).map_err(|e| {
  616. warn!("Failed to remove {}: {}", file1_dir.display(), e);
  617. e
  618. })?;
  619. }
  620. }
  621. Ok(mtime1 < mtime2)
  622. }
  623. pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
  624. debug!("Trying to remove: {dir}");
  625. match fs::remove_dir_all(dir) {
  626. Ok(_) => {}
  627. Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
  628. Err(e) => anyhow::bail!("Failed to remove directory '{}': {}", dir, e),
  629. }
  630. Ok(())
  631. }
  632. /// Searches a directory for the first file matching a given name pattern.
  633. ///
  634. /// This function looks for a file in the given directory whose filename
  635. /// starts with the provided `starts_with` prefix and ends with the provided
  636. /// `ends_with` suffix. It returns the full path to the first match found.
  637. ///
  638. /// # Arguments
  639. ///
  640. /// * `dir` - A reference to the directory path where the search will occur.
  641. /// * `starts_with` - The required prefix of the filename (e.g., `"throughput_"`).
  642. /// * `ends_with` - The required suffix of the filename (e.g., `".csv"`).
  643. ///
  644. /// # Returns
  645. ///
  646. /// * `Some(PathBuf)` - Path to the first file matching the pattern.
  647. /// * `None` - If no matching file is found or the directory can't be read.
  648. ///
  649. /// # Example
  650. ///
  651. /// ```rust
  652. /// let dir = std::path::Path::new("/path/to/data");
  653. /// if let Some(path) = find_matching_file(dir, "throughput_", ".csv") {
  654. /// println!("Found file: {}", path.display());
  655. /// } else {
  656. /// eprintln!("No matching file found.");
  657. /// }
  658. /// ```
  659. pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
  660. let mut matches: Vec<_> = fs::read_dir(dir)
  661. .ok()?
  662. .filter_map(Result::ok)
  663. .map(|entry| entry.path())
  664. .filter(|path| {
  665. path.is_file()
  666. && path
  667. .file_name()
  668. .and_then(|name| name.to_str())
  669. .map(|name| name.starts_with(starts_with) && name.ends_with(ends_with))
  670. .unwrap_or(false)
  671. })
  672. .collect();
  673. matches.sort();
  674. matches.into_iter().next()
  675. }
  676. /// Searches for the first file in the given directory whose file name
  677. /// satisfies the provided condition.
  678. ///
  679. /// # Arguments
  680. ///
  681. /// * `dir` - Path to the directory to search.
  682. /// * `condition` - A closure that takes a file name (`&str`) and returns `true`
  683. /// if the file matches the desired condition.
  684. ///
  685. /// # Returns
  686. ///
  687. /// An `Option<PathBuf>` containing the path to the first matching file,
  688. /// or `None` if no file matches or the directory can't be read.
  689. ///
  690. /// # Examples
  691. ///
  692. /// ```
  693. /// use std::path::Path;
  694. /// let result = find_file(Path::new("."), |name| name.ends_with(".rs"));
  695. /// ```
  696. pub fn find_file<F>(dir: &Path, condition: F) -> Option<PathBuf>
  697. where
  698. F: Fn(&str) -> bool,
  699. {
  700. let mut matches: Vec<_> = fs::read_dir(dir)
  701. .ok()?
  702. .filter_map(Result::ok)
  703. .map(|entry| entry.path())
  704. .filter(|path| {
  705. path.is_file()
  706. && path
  707. .file_name()
  708. .and_then(|name| name.to_str())
  709. .map(&condition)
  710. .unwrap_or(false)
  711. })
  712. .collect();
  713. matches.sort();
  714. matches.into_iter().next()
  715. }
  716. /// Represents a string made of either:
  717. /// - one character repeated N times,
  718. /// - a two-character block repeated N times,
  719. /// - or no valid repetition pattern.
  720. #[derive(Debug, PartialEq)]
  721. pub enum Repeat {
  722. None,
  723. RepOne(char, usize),
  724. RepTwo(String, usize),
  725. }
  726. pub fn detect_repetition(s: &str) -> Repeat {
  727. let len = s.chars().count();
  728. if len == 0 {
  729. return Repeat::None;
  730. }
  731. // Check for single-char repetition
  732. let mut chars = s.chars();
  733. let first = chars.next().unwrap();
  734. if s.chars().all(|c| c == first) {
  735. return Repeat::RepOne(first, len);
  736. }
  737. // Check for two-char block repetition
  738. if len.is_multiple_of(2) {
  739. let mut iter = s.chars();
  740. let a = iter.next().unwrap();
  741. let b = iter.next().unwrap();
  742. let mut count = 1;
  743. while let (Some(c), Some(d)) = (iter.next(), iter.next()) {
  744. if c != a || d != b {
  745. return Repeat::None;
  746. }
  747. count += 1;
  748. }
  749. return Repeat::RepTwo(format!("{}{}", a, b), count);
  750. }
  751. Repeat::None
  752. }
  753. pub fn test_init() {
  754. let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug"))
  755. .try_init();
  756. }
  757. use regex::Regex;
  758. /// Extracts the numeric barcode suffix from a filename.
  759. ///
  760. /// # Examples
  761. ///
  762. /// ```
  763. /// let f = "sample_SQK-NBD114-24_barcode07.bam";
  764. /// assert_eq!(extract_barcode(f), Some(7));
  765. /// ```
  766. ///
  767. /// Returns `None` if no `barcodeNN` pattern is found
  768. /// or if the number cannot be parsed.
  769. pub fn extract_barcode(name: &str) -> Option<u32> {
  770. static BARCODE_RE: OnceLock<Regex> = OnceLock::new();
  771. let re = BARCODE_RE.get_or_init(|| Regex::new(r"barcode(\d+)").unwrap());
  772. re.captures(name)
  773. .and_then(|c| c.get(1))
  774. .and_then(|m| m.as_str().parse::<u32>().ok())
  775. }
  776. pub fn human_size(bytes: u64) -> String {
  777. const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"];
  778. let mut size = bytes as f64;
  779. let mut unit = 0;
  780. while size >= 1024.0 && unit < UNITS.len() - 1 {
  781. size /= 1024.0;
  782. unit += 1;
  783. }
  784. if unit == 0 {
  785. format!("{} {}", bytes, UNITS[unit])
  786. } else {
  787. format!("{:.2} {}", size, UNITS[unit])
  788. }
  789. }
  790. /// RAII guard that removes a temporary directory when dropped.
  791. pub struct TempDirGuard {
  792. path: PathBuf,
  793. }
  794. impl TempDirGuard {
  795. pub fn new(path: PathBuf) -> Self {
  796. Self { path }
  797. }
  798. }
  799. impl Drop for TempDirGuard {
  800. fn drop(&mut self) {
  801. if self.path.exists() {
  802. // Log the attempt. Using eprintln/warn! ensures the error is noted,
  803. // but the program does not panic.
  804. if let Err(e) = fs::remove_dir_all(&self.path) {
  805. error!(
  806. "Failed to remove temporary directory '{}': {}",
  807. self.path.display(),
  808. e
  809. );
  810. }
  811. }
  812. }
  813. }
  814. /// Removes a BAM file and its associated index file (`.bam.bai` or `.bai`).
  815. pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> {
  816. if bam.exists() {
  817. fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?;
  818. }
  819. let bai = bam.with_extension("bam.bai");
  820. if bai.exists() {
  821. fs::remove_file(&bai).ok();
  822. }
  823. let bai_alt = bam.with_extension("bai");
  824. if bai_alt.exists() {
  825. fs::remove_file(&bai_alt).ok();
  826. }
  827. Ok(())
  828. }
  829. /// Guard that tracks temporary filesystem paths and cleans them up on drop
  830. /// unless explicitly disarmed.
  831. ///
  832. /// This is intended for pipeline-style code where intermediate outputs
  833. /// are produced by downstream tools. The guard only tracks paths; it does
  834. /// **not** create files itself.
  835. ///
  836. /// If the pipeline completes successfully, call [`disarm`] to prevent
  837. /// cleanup. Otherwise, all tracked paths are removed on drop on a
  838. /// best-effort basis.
  839. ///
  840. /// # Example
  841. ///
  842. /// ```no_run
  843. /// use std::path::PathBuf;
  844. ///
  845. /// let tmp_dir = PathBuf::from("/tmp");
  846. /// let mut guard = TempFileGuard::new();
  847. ///
  848. /// let path1 = guard.temp_path(".tmp", &tmp_dir);
  849. /// let path2 = guard.temp_path(".tmp", &tmp_dir);
  850. ///
  851. /// // Produce outputs at `path1` and `path2`
  852. ///
  853. /// // Mark success
  854. /// guard.disarm();
  855. /// ```
  856. ///
  857. /// If `disarm()` is not called, both paths are cleaned up automatically
  858. /// when the guard is dropped.
  859. pub struct TempFileGuard {
  860. files: Vec<PathBuf>,
  861. disarmed: AtomicBool,
  862. }
  863. impl Default for TempFileGuard {
  864. fn default() -> Self {
  865. Self::new()
  866. }
  867. }
  868. impl TempFileGuard {
  869. pub fn new() -> Self {
  870. Self {
  871. files: Vec::new(),
  872. disarmed: AtomicBool::new(false),
  873. }
  874. }
  875. /// Registers a temporary file for cleanup.
  876. pub fn track(&mut self, path: PathBuf) {
  877. self.files.push(path);
  878. }
  879. /// Registers multiple temporary files for cleanup.
  880. pub fn track_many(&mut self, paths: impl IntoIterator<Item = PathBuf>) {
  881. self.files.extend(paths);
  882. }
  883. /// Disarms the guard, preventing cleanup on drop.
  884. /// Call this when the pipeline succeeds.
  885. pub fn disarm(&self) {
  886. self.disarmed
  887. .store(true, std::sync::atomic::Ordering::SeqCst);
  888. }
  889. /// Manually clean up all tracked files.
  890. pub fn cleanup(&mut self) {
  891. for file in &self.files {
  892. if let Err(e) = remove_tracked_path(file) {
  893. warn!("Failed to clean up temp file {}: {}", file.display(), e);
  894. }
  895. }
  896. self.files.clear();
  897. }
  898. pub fn clear(&mut self) {
  899. self.files.clear();
  900. }
  901. /// Generates a unique temporary path inside `tmp_dir` and registers it
  902. /// for cleanup.
  903. ///
  904. /// The returned path is derived from a UUIDv4 and **no file is created**.
  905. /// The caller is responsible for creating or writing to the path.
  906. ///
  907. /// # Example
  908. ///
  909. /// ```no_run
  910. /// use std::path::PathBuf;
  911. ///
  912. /// let tmp_dir = PathBuf::from("/tmp");
  913. /// let mut guard = TempFileGuard::new();
  914. ///
  915. /// let path = guard.temp_path(".tmp", &tmp_dir);
  916. /// // create or write to `path`
  917. /// ```
  918. pub fn tmp_path(&mut self, suffix: &str, tmp_dir: impl AsRef<Path>) -> PathBuf {
  919. let name = format!("pandora-tmp-{}{}", Uuid::new_v4(), suffix);
  920. let tmp_dir = tmp_dir.as_ref();
  921. let path = tmp_dir.join(name);
  922. self.track(path.clone());
  923. path
  924. }
  925. }
  926. fn remove_tracked_path(path: &Path) -> anyhow::Result<()> {
  927. if path.is_dir() {
  928. fs::remove_dir_all(path)
  929. .with_context(|| format!("Failed to remove temp directory: {}", path.display()))?;
  930. } else if path.exists() {
  931. fs::remove_file(path)
  932. .with_context(|| format!("Failed to remove temp file: {}", path.display()))?;
  933. }
  934. if path.extension().and_then(|e| e.to_str()) == Some("bam") {
  935. remove_existing_file(path.with_extension("bam.bai"));
  936. remove_existing_file(path.with_extension("bai"));
  937. }
  938. Ok(())
  939. }
  940. fn remove_existing_file(path: PathBuf) {
  941. if path.exists() {
  942. let _ = fs::remove_file(path);
  943. }
  944. }
  945. impl Drop for TempFileGuard {
  946. fn drop(&mut self) {
  947. if !self.disarmed.load(std::sync::atomic::Ordering::SeqCst) {
  948. warn!(
  949. "Pipeline failed or panicked, cleaning up {} temporary files",
  950. self.files.len()
  951. );
  952. self.cleanup();
  953. }
  954. }
  955. }
  956. // pub fn temp_file_path(suffix: &str, config: &Config) -> std::io::Result<PathBuf> {
  957. // let p = config.tmp_dir;
  958. // let temp_path = tempfile::Builder::new()
  959. // .prefix("pandora-temp-")
  960. // .suffix(suffix)
  961. // .rand_bytes(5)
  962. // .tempfile()?
  963. // .into_temp_path();
  964. //
  965. // Ok(temp_path.to_path_buf())
  966. // }
  967. /// Extracts genome sizes from BAM header.
  968. /// Extract contig names and lengths from a BAM header as a hash map.
  969. ///
  970. /// This delegates parsing to [`crate::io::bam::get_genome_sizes`]. Because the
  971. /// return type is a hash map, iteration order is not BAM header order; use
  972. /// [`get_genome_sizes_in_header_order`] when order matters.
  973. pub fn get_genome_sizes(
  974. header: &rust_htslib::bam::Header,
  975. ) -> anyhow::Result<FxHashMap<String, u64>> {
  976. Ok(crate::io::bam::get_genome_sizes(header)?
  977. .into_iter()
  978. .collect())
  979. }
  980. /// Extract contig names and lengths from a BAM header in header order.
  981. ///
  982. /// Use this when downstream region generation must follow the BAM/reference
  983. /// contig order. [`get_genome_sizes`] returns a hash map and does not preserve
  984. /// that order.
  985. pub fn get_genome_sizes_in_header_order(
  986. header: &rust_htslib::bam::HeaderView,
  987. ) -> anyhow::Result<Vec<(String, u64)>> {
  988. (0..header.target_count())
  989. .map(|tid| {
  990. let name = String::from_utf8_lossy(header.tid2name(tid)).into_owned();
  991. let len = header
  992. .target_len(tid)
  993. .with_context(|| format!("BAM header target {name} is missing length"))?;
  994. Ok((name, len))
  995. })
  996. .collect()
  997. }
  998. pub fn bam_contigs<P: AsRef<std::path::Path>>(bam: P) -> anyhow::Result<Vec<String>> {
  999. let reader = rust_htslib::bam::Reader::from_path(&bam)?;
  1000. Ok(reader
  1001. .header()
  1002. .target_names()
  1003. .iter()
  1004. .map(|name| String::from_utf8_lossy(name).into_owned())
  1005. .collect())
  1006. }
  1007. /// Split genome into ~n_parts equal-sized chunks (by number of bases),
  1008. /// returning for each chunk a list of regions in `ctg:start-end` form.
  1009. /// Split genome into ~n_parts equal-sized chunks (by bases).
  1010. /// Each chunk is a single region `ctg:start-end` (no cross-contig regions).
  1011. ///
  1012. /// Note: the number of regions returned will be ≈ n_parts (may differ by 1–2).
  1013. pub fn split_genome_into_n_regions(
  1014. genome_sizes: &FxHashMap<String, u64>,
  1015. n_parts: usize,
  1016. ) -> Vec<String> {
  1017. if n_parts == 0 || genome_sizes.is_empty() {
  1018. return Vec::new();
  1019. }
  1020. // Deterministic contig order
  1021. let mut contigs: Vec<(String, u64)> = genome_sizes
  1022. .iter()
  1023. .map(|(ctg, len)| (ctg.clone(), *len))
  1024. .collect();
  1025. contigs.sort_by(|a, b| a.0.cmp(&b.0));
  1026. let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum();
  1027. if total_bases == 0 {
  1028. return Vec::new();
  1029. }
  1030. let target_chunk_size: u64 = total_bases.div_ceil(n_parts as u64); // ceil
  1031. let mut regions = Vec::new();
  1032. for (ctg, len) in contigs {
  1033. let mut remaining = len;
  1034. let mut start: u64 = 1;
  1035. while remaining > 0 {
  1036. let take = remaining.min(target_chunk_size);
  1037. let end = start + take - 1;
  1038. regions.push(format!("{ctg}:{start}-{end}"));
  1039. remaining -= take;
  1040. start = end + 1;
  1041. }
  1042. }
  1043. regions
  1044. }
  1045. /// Split genome into exactly `n_parts` chunks with (almost) equal total size.
  1046. /// Each chunk is a *list* of regions `ctg:start-end`. A chunk may span several
  1047. /// contigs, but each region is within a single contig.
  1048. ///
  1049. /// The sizes of the parts will differ by at most 1 base.
  1050. pub fn split_genome_into_n_regions_exact(
  1051. genome_sizes: &FxHashMap<String, u64>,
  1052. n_parts: usize,
  1053. ) -> Vec<Vec<String>> {
  1054. let mut contigs: Vec<(String, u64)> = genome_sizes
  1055. .iter()
  1056. .map(|(ctg, len)| (ctg.clone(), *len))
  1057. .collect();
  1058. contigs.sort_by(|a, b| a.0.cmp(&b.0));
  1059. split_ordered_genome_into_n_regions_exact(&contigs, n_parts)
  1060. }
  1061. pub fn split_ordered_genome_into_n_regions_exact(
  1062. genome_sizes: &[(String, u64)],
  1063. n_parts: usize,
  1064. ) -> Vec<Vec<String>> {
  1065. if n_parts == 0 || genome_sizes.is_empty() {
  1066. return Vec::new();
  1067. }
  1068. let total_bases: u64 = genome_sizes.iter().map(|(_, len)| *len).sum();
  1069. if total_bases == 0 {
  1070. return Vec::new();
  1071. }
  1072. // We cannot have more non-empty parts than bases if we use 1-based inclusive coordinates.
  1073. let parts = n_parts.min(total_bases as usize);
  1074. // Distribute bases as evenly as possible:
  1075. // first `remainder` parts get (base_per_part + 1), the rest get base_per_part.
  1076. let base_per_part = total_bases / parts as u64;
  1077. let remainder = total_bases % parts as u64;
  1078. let target_sizes: Vec<u64> = (0..parts)
  1079. .map(|i| {
  1080. if i < remainder as usize {
  1081. base_per_part + 1
  1082. } else {
  1083. base_per_part
  1084. }
  1085. })
  1086. .collect();
  1087. let mut chunks: Vec<Vec<String>> = Vec::with_capacity(parts);
  1088. let mut part_idx = 0;
  1089. let mut remaining_in_part = target_sizes[0];
  1090. for (ctg, mut len) in genome_sizes.iter().cloned() {
  1091. let mut start: u64 = 1;
  1092. while len > 0 && part_idx < parts {
  1093. let take = len.min(remaining_in_part);
  1094. let end = start + take - 1;
  1095. if chunks.len() <= part_idx {
  1096. chunks.push(Vec::new());
  1097. }
  1098. chunks[part_idx].push(format!("{ctg}:{start}-{end}"));
  1099. len -= take;
  1100. start = end + 1;
  1101. remaining_in_part -= take;
  1102. if remaining_in_part == 0 {
  1103. part_idx += 1;
  1104. if part_idx == parts {
  1105. break;
  1106. }
  1107. remaining_in_part = target_sizes[part_idx];
  1108. }
  1109. }
  1110. }
  1111. // Sanity: we should have exactly `parts` chunks, all non-empty
  1112. debug_assert_eq!(chunks.len(), parts);
  1113. chunks
  1114. }
  1115. /// Convert a numeric value to an RGB triplet using a diverging palette.
  1116. ///
  1117. /// Values are clipped to `[-clip, +clip]` and mapped as:
  1118. /// - negative → blue
  1119. /// - zero → white
  1120. /// - positive → red
  1121. ///
  1122. /// # Arguments
  1123. /// * `v` – value to color (e.g. delta methylation)
  1124. /// * `clip` – symmetric clipping bound (must be > 0)
  1125. ///
  1126. /// # Returns
  1127. /// `(r, g, b)` as 8-bit RGB components.
  1128. ///
  1129. /// # Panics
  1130. /// Panics if `clip <= 0.0`.
  1131. pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
  1132. assert!(clip > 0.0, "clip must be > 0");
  1133. let x = v.clamp(-clip, clip);
  1134. let t = (x + clip) / (2.0 * clip); // [-clip,+clip] → [0,1]
  1135. if t < 0.5 {
  1136. // blue → white
  1137. let u = t / 0.5;
  1138. let r = (255.0 * u).round() as u8;
  1139. let g = (255.0 * u).round() as u8;
  1140. (r, g, 255)
  1141. } else {
  1142. // white → red
  1143. let u = (t - 0.5) / 0.5;
  1144. let g = (255.0 * (1.0 - u)).round() as u8;
  1145. let b = (255.0 * (1.0 - u)).round() as u8;
  1146. (255, g, b)
  1147. }
  1148. }
  1149. pub fn revcomp(s: &str) -> String {
  1150. fn comp(b: u8) -> u8 {
  1151. match b {
  1152. b'A' => b'T',
  1153. b'C' => b'G',
  1154. b'G' => b'C',
  1155. b'T' => b'A',
  1156. b'N' => b'N',
  1157. _ => b'N',
  1158. }
  1159. }
  1160. let v: Vec<u8> = s
  1161. .as_bytes()
  1162. .iter()
  1163. .rev()
  1164. .map(|&b| comp(b.to_ascii_uppercase()))
  1165. .collect();
  1166. String::from_utf8(v).unwrap()
  1167. }
  1168. #[cfg(test)]
  1169. mod tests {
  1170. use log::info;
  1171. use rust_htslib::bam::{self, Read};
  1172. use crate::helpers::test_init;
  1173. use super::*;
  1174. #[test]
  1175. fn par_intersection_keeps_values_only_in_second_outside_first_range() {
  1176. let res = par_intersection(&[10], &[1]);
  1177. assert_eq!(res.common, Vec::<i32>::new());
  1178. assert_eq!(res.only_in_first, vec![10]);
  1179. assert_eq!(res.only_in_second, vec![1]);
  1180. }
  1181. #[test]
  1182. fn par_intersection_handles_duplicate_counts() {
  1183. let res = par_intersection(&[1, 2, 2, 3], &[0, 2, 2, 2, 4]);
  1184. assert_eq!(res.common, vec![2, 2]);
  1185. assert_eq!(res.only_in_first, vec![1, 3]);
  1186. assert_eq!(res.only_in_second, vec![0, 2, 4]);
  1187. }
  1188. #[test]
  1189. fn bin_data_handles_empty_and_invalid_bin_size() {
  1190. assert_eq!(bin_data::<f64>(Vec::new(), 0.1), Vec::<(f64, usize)>::new());
  1191. assert_eq!(bin_data(vec![1.0, 2.0], 0.0), Vec::<(f64, usize)>::new());
  1192. assert_eq!(
  1193. bin_data(vec![1.0, 2.0], f64::NAN),
  1194. Vec::<(f64, usize)>::new()
  1195. );
  1196. assert_eq!(bin_data(vec![f64::NAN], 0.1), Vec::<(f64, usize)>::new());
  1197. }
  1198. #[test]
  1199. fn bin_data_keeps_sparse_values_in_aligned_bins() {
  1200. assert_eq!(bin_data(vec![0.0, 1.0], 0.25), vec![(0.0, 1), (1.0, 1)]);
  1201. }
  1202. #[test]
  1203. fn split_ordered_genome_regions_preserves_input_contig_order() {
  1204. let genome = vec![("chr2".to_string(), 2), ("chr1".to_string(), 2)];
  1205. assert_eq!(
  1206. split_ordered_genome_into_n_regions_exact(&genome, 2),
  1207. vec![vec!["chr2:1-2".to_string()], vec!["chr1:1-2".to_string()]]
  1208. );
  1209. }
  1210. #[test]
  1211. fn split_g() -> anyhow::Result<()> {
  1212. test_init();
  1213. let n_parts = 4;
  1214. let reader = bam::Reader::from_path(
  1215. "/mnt/beegfs02/scratch/t_steimle/data/wgs/34528/norm/34528_norm_hs1.bam",
  1216. )
  1217. .with_context(|| format!("Failed to open BAM"))?;
  1218. let header = bam::Header::from_template(reader.header());
  1219. let genome_sizes = get_genome_sizes(&header)?;
  1220. // Split genome into regions
  1221. let regions = split_genome_into_n_regions_exact(&genome_sizes, n_parts);
  1222. info!("{:#?}", regions);
  1223. Ok(())
  1224. }
  1225. }