use anyhow::Context; use bitcode::{Decode, Encode}; use glob::glob; use log::{debug, warn}; use serde::{Deserialize, Serialize}; use std::{ cmp::Ordering, collections::HashMap, fmt, fs, iter::Sum, ops::{Add, Div}, path::{Path, PathBuf}, }; pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result { let mut matching_files = Vec::new(); for entry in fs::read_dir(dir_path).with_context(|| format!("Failed to read directory: {}", dir_path))? { let entry = entry.with_context(|| "Failed to read directory entry")?; let path = entry.path(); if path.is_file() && path .file_name() .and_then(|name| name.to_str()) .map(|name| name.ends_with(suffix)) .unwrap_or(false) { matching_files.push(path); } } match matching_files.len() { 0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix)) .with_context(|| format!("In directory: {}", dir_path)), 1 => Ok(matching_files[0].to_string_lossy().into_owned()), _ => Err(anyhow::anyhow!( "Multiple files found ending with '{}'", suffix )) .with_context(|| format!("In directory: {}", dir_path)), } } pub fn path_prefix(out: &str) -> anyhow::Result { let out_path = Path::new(&out); let out_dir = out_path .parent() .ok_or_else(|| anyhow::anyhow!("Can't parse the dir of {}", out_path.display()))?; let name = out_path .file_name() .and_then(|name| name.to_str()) .ok_or_else(|| anyhow::anyhow!("Can't parse the file name of {}", out_path.display()))?; let stem = name .split_once('.') .map(|(stem, _)| stem) .ok_or_else(|| anyhow::anyhow!("Can't parse the file stem of {}", name))?; Ok(format!("{}/{stem}", out_dir.display())) } pub fn force_or_not(_path: &str, _force: bool) -> anyhow::Result<()> { // let path = Path::new(path); // let mut output_exists = path.exists(); // let dir = path // .parent() // .context(format!("Can't parse the parent dir of {}", path.display()))?; // // if force && output_exists { // fs::remove_dir_all(dir)?; // fs::create_dir_all(dir)?; // output_exists = false; // } // // if output_exists { // 
info!("{} already exists.", path.display()) // } Ok(()) } use rayon::prelude::*; use std::cmp::Ord; pub struct VectorIntersection { pub common: Vec, pub only_in_first: Vec, pub only_in_second: Vec, } impl fmt::Display for VectorIntersection { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let total_items = self.common.len() + self.only_in_first.len() + self.only_in_second.len(); writeln!(f, "Total items: {}", total_items)?; writeln!( f, "Common: {} ({:.2}%)", self.common.len(), percentage(self.common.len(), total_items) )?; writeln!( f, "Only in first: {} ({:.2}%)", self.only_in_first.len(), percentage(self.only_in_first.len(), total_items) )?; writeln!( f, "Only in second: {} ({:.2}%)", self.only_in_second.len(), percentage(self.only_in_second.len(), total_items) ) } } fn percentage(part: usize, total: usize) -> f64 { if total == 0 { 0.0 } else { (part as f64 / total as f64) * 100.0 } } impl Default for VectorIntersection { fn default() -> Self { Self { common: Vec::new(), only_in_first: Vec::new(), only_in_second: Vec::new(), } } } impl VectorIntersection { fn merge(&mut self, other: &mut Self) { self.common.append(&mut other.common); self.only_in_first.append(&mut other.only_in_first); self.only_in_second.append(&mut other.only_in_second); } } fn intersection(vec1: &[T], vec2: &[T]) -> VectorIntersection { let mut result = VectorIntersection::default(); let mut i = 0; let mut j = 0; while i < vec1.len() && j < vec2.len() { match vec1[i].cmp(&vec2[j]) { Ordering::Less => { result.only_in_first.push(vec1[i].clone()); i += 1; } Ordering::Greater => { result.only_in_second.push(vec2[j].clone()); j += 1; } Ordering::Equal => { let val = &vec1[i]; let mut count1 = 1; let mut count2 = 1; // Count occurrences in vec1 while i + 1 < vec1.len() && &vec1[i + 1] == val { i += 1; count1 += 1; } // Count occurrences in vec2 while j + 1 < vec2.len() && &vec2[j + 1] == val { j += 1; count2 += 1; } // Add to common result .common 
.extend(std::iter::repeat(val.clone()).take(count1.min(count2))); // Add excess to only_in_first or only_in_second match count1.cmp(&count2) { Ordering::Greater => { result .only_in_first .extend(std::iter::repeat(val.clone()).take(count1 - count2)); } Ordering::Less => { result .only_in_second .extend(std::iter::repeat(val.clone()).take(count2 - count1)); } Ordering::Equal => { // No excess elements, do nothing } } i += 1; j += 1; } } } result.only_in_first.extend(vec1[i..].iter().cloned()); result.only_in_second.extend(vec2[j..].iter().cloned()); result } pub fn par_intersection( vec1: &[T], vec2: &[T], ) -> VectorIntersection { let chunk_size = (vec1.len() / rayon::current_num_threads()).max(1); vec1.par_chunks(chunk_size) .map(|chunk| { let start = vec2.partition_point(|x| x < &chunk[0]); let end = vec2.partition_point(|x| x <= &chunk[chunk.len() - 1]); // Ensure start is not greater than end if start <= end { intersection(chunk, &vec2[start..end]) } else { // If start > end, there's no intersection for this chunk VectorIntersection::default() } }) .reduce(VectorIntersection::default, |mut acc, mut x| { acc.merge(&mut x); acc }) } pub fn temp_file_path(suffix: &str) -> std::io::Result { let temp_path = tempfile::Builder::new() .prefix("pandora-temp-") .suffix(suffix) .rand_bytes(5) .tempfile()? 
.into_temp_path(); Ok(temp_path.to_path_buf()) } pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 { let m = dna_sequence.len() as f64; // Early return for empty sequences if m == 0.0 { return 0.0; } // Count occurrences of each base let mut bases = HashMap::::new(); for base in dna_sequence.chars() { *bases.entry(base).or_insert(0) += 1; } // Calculate Shannon entropy let shannon_entropy_value: f64 = bases .values() .map(|&n_i| { let p_i = n_i as f64 / m; if p_i > 0.0 { -p_i * p_i.log2() } else { 0.0 // Avoid log2(0) } }) .sum(); shannon_entropy_value } pub fn mean(values: &[T]) -> f64 where T: Copy + Add + Div + Sum + Into, { let count = values.len(); if count == 0 { return 0.0; } let sum: T = values.iter().copied().sum(); sum.into() / count as f64 } pub fn bin_data(data: Vec, bin_size: V) -> Vec<(V, usize)> where V: Copy + PartialOrd + std::ops::AddAssign + std::ops::Add, { // Sort the data let mut sorted_data = data.clone(); sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap()); // Initialize bins let mut bins: Vec<(V, usize)> = Vec::new(); let mut current_bin_start = sorted_data[0]; let mut count = 0; for &value in &sorted_data { if value < current_bin_start + bin_size { count += 1; } else { bins.push((current_bin_start, count)); current_bin_start += bin_size; count = 1; } } // Push the last bin bins.push((current_bin_start, count)); bins } // fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> { // if data.is_empty() || num_bins == 0 { // return vec![]; // } // // let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize; // // (0..num_bins) // .map(|i| { // let start = i * bin_size; // let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()` // // // If `start` is out of bounds, return (0, 0) // if start >= data.len() { // return (0, 0); // } // // let bin = &data[start..end]; // // let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum(); // let count = bin.len() as u128; // let 
mean_x = (sum_x / count) as u32; // Rounded down to nearest u32 // // let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum(); // // (mean_x, sum_n) // }) // .collect() // } // pub fn app_storage_dir() -> anyhow::Result { let app_name = env!("CARGO_PKG_NAME"); let app_dir = dirs::data_dir() .context("Failed to get data directory")? .join(app_name); if !app_dir.exists() { fs::create_dir_all(&app_dir).context("Failed to create application directory")?; } Ok(app_dir) } use blake3::Hasher as Blake3Hasher; use std::hash::{BuildHasher, Hasher}; pub struct Blake3Hash(Blake3Hasher); impl Hasher for Blake3Hash { fn finish(&self) -> u64 { let hash = self.0.finalize(); u64::from_le_bytes(hash.as_bytes()[..8].try_into().unwrap()) } fn write(&mut self, bytes: &[u8]) { self.0.update(bytes); } } /// Default Hasher #[derive(Default, Clone)] pub struct Blake3BuildHasher; impl BuildHasher for Blake3BuildHasher { type Hasher = Blake3Hash; fn build_hasher(&self) -> Self::Hasher { Blake3Hash(Blake3Hasher::new()) } } // Custom 128-bit hash type #[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug, Encode, Decode)] pub struct Hash128([u8; 16]); impl std::hash::Hash for Hash128 { fn hash(&self, state: &mut H) { state.write(&self.0); } } impl Hash128 { pub fn new(bytes: [u8; 16]) -> Self { Hash128(bytes) } pub fn to_bytes(&self) -> [u8; 16] { self.0 } } pub fn get_dir_size(path: &Path) -> std::io::Result { let mut total_size = 0; if path.is_dir() { for entry in fs::read_dir(path)? { let entry = entry?; let path = entry.path(); if path.is_file() { total_size += path.metadata()?.len(); } else if path.is_dir() { total_size += get_dir_size(&path)?; } } } Ok(total_size) } /// Finds all files matching the given glob pattern. /// /// # Arguments /// /// * `pattern` - A glob pattern string (e.g., `"data/**/*.txt"`). /// /// # Returns /// /// * `Ok(Vec)` with all successfully matched file paths. /// * `Err` if the glob pattern is invalid or any matched path fails to resolve. 
///
/// # Errors
///
/// Returns an error if:
/// - The glob pattern is invalid.
/// - A file path matching the pattern cannot be resolved.
///
/// # Examples
///
/// ```rust,ignore
/// let files = find_files("src/**/*.rs")?;
/// for file in files {
///     println!("{:?}", file);
/// }
/// ```
pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
    glob(pattern)
        .with_context(|| format!("Invalid glob pattern: '{}'", pattern))?
        .map(|entry| {
            entry.with_context(|| format!("Failed to resolve path for pattern '{}'", pattern))
        })
        // Collecting into `Result` stops at the first entry that failed to resolve.
        .collect()
}

// fn system_time_to_utc(system_time: SystemTime) -> Option<DateTime<Utc>> {
//     system_time
//         .duration_since(UNIX_EPOCH)
//         .ok()
//         .map(|duration| Utc.timestamp(duration.as_secs() as i64, duration.subsec_nanos()))
// }

/// Lists the names of the sub-directories directly inside `dir_path`.
///
/// Only the final path component of each directory is returned, in the order the
/// operating system reports them. Directories whose name is not valid UTF-8 are
/// skipped.
///
/// # Errors
///
/// Returns the I/O error raised while reading the directory or one of its entries.
pub fn list_directories(dir_path: &str) -> std::io::Result<Vec<String>> {
    let mut directories = Vec::new();

    for entry in fs::read_dir(dir_path)? {
        let path = entry?.path();
        if !path.is_dir() {
            continue;
        }
        if let Some(dir_name) = path.file_name().and_then(|name| name.to_str()) {
            directories.push(dir_name.to_string());
        }
    }

    Ok(directories)
}

/// Checks whether the modification time of `file1` is older than `file2`.
///
/// If `rm` is `true` and `file1` is older, a warning announcing the removal of the
/// directory containing `file1` is logged. The removal itself is currently commented
/// out, so nothing is deleted.
///
/// # Arguments
///
/// * `file1` - Path to the first file.
/// * `file2` - Path to the second file.
/// * `rm` - If true, and `file1` is older, logs the (currently disabled) removal of
///   its parent directory.
///
/// # Returns
///
/// * `Ok(true)` if `file1` is older than `file2`.
/// * `Ok(false)` otherwise.
///
/// # Errors
///
/// Returns an [`anyhow::Error`] if:
/// - Either `file1` or `file2` does not exist.
/// - File metadata cannot be read.
/// - File modification times cannot be retrieved.
/// - (if `rm == true`) Directory removal fails (if uncommented).
///
pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
    /// Reads the modification time of `path`, attaching the path to any error.
    fn modified_time(path: &str) -> anyhow::Result<std::time::SystemTime> {
        fs::metadata(path)
            .with_context(|| format!("Failed to read metadata for '{}'", path))?
            .modified()
            .with_context(|| format!("Failed to get modified time for '{}'", path))
    }

    // `file1` is inspected first so its errors take precedence, as before.
    let mtime1 = modified_time(file1)?;
    let mtime2 = modified_time(file2)?;
    let is_older = mtime1 < mtime2;

    if is_older && rm {
        if let Some(file1_dir) = Path::new(file1).parent() {
            warn!("Removing old directory: {}", file1_dir.display());
            // fs::remove_dir(file1_dir)?;
        }
    }

    Ok(is_older)
}

/// Currently only logs `dir` at debug level and returns `Ok(())`.
///
/// The actual removal (kept below, commented out) deleted the directory tree and
/// treated a missing directory as success.
pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
    debug!("Trying to remove: {dir}");
    // match fs::remove_dir_all(dir) {
    //     Ok(_) => {}
    //     Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
    //     Err(e) => {
    //         anyhow::bail!("Failed to remove directory '{}': {}", dir, e);
    //     }
    // };
    Ok(())
}

/// Searches a directory for the first file matching a given name pattern.
///
/// This function looks for a file in the given directory whose filename
/// starts with the provided `starts_with` prefix and ends with the provided
/// `ends_with` suffix. It returns the full path to the first match found.
///
/// # Arguments
///
/// * `dir` - A reference to the directory path where the search will occur.
/// * `starts_with` - The required prefix of the filename (e.g., `"throughput_"`).
/// * `ends_with` - The required suffix of the filename (e.g., `".csv"`).
///
/// # Returns
///
/// * `Some(PathBuf)` - Path to the first file matching the pattern.
/// * `None` - If no matching file is found or the directory can't be read.
///
/// # Example
///
/// ```rust,ignore
/// let dir = std::path::Path::new("/path/to/data");
/// if let Some(path) = find_matching_file(dir, "throughput_", ".csv") {
///     println!("Found file: {}", path.display());
/// } else {
///     eprintln!("No matching file found.");
/// }
/// ```
pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
    // Same scan as `find_file`, specialised to a prefix/suffix test on the file name.
    find_file(dir, |name| {
        name.starts_with(starts_with) && name.ends_with(ends_with)
    })
}

/// Searches for the first file in the given directory whose file name
/// satisfies the provided condition.
///
/// Only regular files are considered; directories are skipped even if their name
/// matches. Entries that cannot be read, or whose name is not valid UTF-8, are
/// skipped as well. "First" refers to the order in which the operating system lists
/// the directory.
///
/// # Arguments
///
/// * `dir` - Path to the directory to search.
/// * `condition` - A closure that takes a file name (`&str`) and returns `true`
///   if the file matches the desired condition.
///
/// # Returns
///
/// An `Option<PathBuf>` containing the path to the first matching file,
/// or `None` if no file matches or the directory can't be read.
///
/// # Examples
///
/// ```rust,ignore
/// use std::path::Path;
/// let result = find_file(Path::new("."), |name| name.ends_with(".rs"));
/// ```
pub fn find_file<F>(dir: &Path, condition: F) -> Option<PathBuf>
where
    F: Fn(&str) -> bool,
{
    fs::read_dir(dir).ok()?.find_map(|entry| {
        // An unreadable entry yields `None` here, which makes `find_map` move on.
        let path = entry.ok()?.path();
        let name_matches = path
            .file_name()
            .and_then(|name| name.to_str())
            .map(&condition)
            .unwrap_or(false);

        if path.is_file() && name_matches {
            Some(path)
        } else {
            None
        }
    })
}