use anyhow::Context; use bitcode::{Decode, Encode}; use glob::glob; use log::{debug, error, warn}; use rust_htslib::bam::Read; use rustc_hash::FxHashMap; use serde::{Deserialize, Serialize}; use std::{ cmp::Ordering, collections::{HashMap, HashSet}, fmt, fs, iter::Sum, ops::{Add, Div}, path::{Path, PathBuf}, sync::{atomic::AtomicBool, OnceLock}, }; use uuid::Uuid; pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result { let mut matching_files = Vec::new(); for entry in fs::read_dir(dir_path).with_context(|| format!("Failed to read directory: {}", dir_path))? { let entry = entry.with_context(|| "Failed to read directory entry")?; let path = entry.path(); if path.is_file() && path .file_name() .and_then(|name| name.to_str()) .map(|name| name.ends_with(suffix)) .unwrap_or(false) { matching_files.push(path); } } match matching_files.len() { 0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix)) .with_context(|| format!("In directory: {}", dir_path)), 1 => Ok(matching_files[0].to_string_lossy().into_owned()), _ => { matching_files.sort(); let matches = matching_files .iter() .map(|p| p.display().to_string()) .collect::>() .join(", "); Err(anyhow::anyhow!( "Multiple files found ending with '{}': {}", suffix, matches )) .with_context(|| format!("In directory: {}", dir_path)) } } } /// Return the output prefix by removing all suffixes from the first dot in the filename. /// /// This is intentionally stricter than [`Path::file_stem`]: filenames such as /// `sample.v1.vcf.gz` become `sample`, not `sample.v1.vcf`. This matches tools /// that expect a prefix rather than a single-extension stem. /// /// # Examples /// /// ```text /// /tmp/sample.vcf.gz -> /tmp/sample /// /tmp/sample.v1.vcf.gz -> /tmp/sample /// ``` pub fn path_prefix(out: &str) -> anyhow::Result { let out_path = Path::new(out); let out_dir = out_path .parent() .ok_or_else(|| anyhow::anyhow!("Can't parse the dir of {}", out_path.display()))?; let name = out_path .file_name() .and_then(|name| name.to_str()) .ok_or_else(|| anyhow::anyhow!("Can't parse the file name of {}", out_path.display()))?; let stem = name .split_once('.') .map(|(stem, _)| stem) .ok_or_else(|| anyhow::anyhow!("Can't parse the file stem of {}", name))?; Ok(format!("{}/{stem}", out_dir.display())) } pub fn force_or_not(_path: &str, _force: bool) -> anyhow::Result<()> { // let path = Path::new(path); // let mut output_exists = path.exists(); // let dir = path // .parent() // .context(format!("Can't parse the parent dir of {}", path.display()))?; // // if force && output_exists { // fs::remove_dir_all(dir)?; // fs::create_dir_all(dir)?; // output_exists = false; // } // // if output_exists { // info!("{} already exists.", path.display()) // } Ok(()) } /// Builds Singularity bind flags from input/output paths. /// /// - Converts file paths to their parent directory. /// - Skips non-existing paths to avoid Singularity errors. /// - Canonicalizes and deduplicates directories. pub fn singularity_bind_flags(paths: I) -> String where I: IntoIterator, P: AsRef, { let mut seen = HashSet::new(); let mut binds = Vec::new(); let mut push_dir = |p: &Path| { if p.as_os_str().is_empty() || !p.exists() { return; } let dir = p.canonicalize().unwrap_or_else(|_| p.to_path_buf()); if seen.insert(dir.clone()) { binds.push(format!( "--bind {src}:{dst}", src = dir.display(), dst = dir.display() )); } }; for path_str in paths { let path = path_str.as_ref(); if path.is_dir() { push_dir(path); } else if let Some(parent) = path.parent() { push_dir(parent); } } binds.join(" ") } use std::cmp::Ord; pub struct VectorIntersection { pub common: Vec, pub only_in_first: Vec, pub only_in_second: Vec, } impl fmt::Display for VectorIntersection { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let total_items = self.common.len() + self.only_in_first.len() + self.only_in_second.len(); writeln!(f, "Total items: {}", total_items)?; writeln!( f, "Common: {} ({:.2}%)", self.common.len(), percentage(self.common.len(), total_items) )?; writeln!( f, "Only in first: {} ({:.2}%)", self.only_in_first.len(), percentage(self.only_in_first.len(), total_items) )?; writeln!( f, "Only in second: {} ({:.2}%)", self.only_in_second.len(), percentage(self.only_in_second.len(), total_items) ) } } fn percentage(part: usize, total: usize) -> f64 { if total == 0 { 0.0 } else { (part as f64 / total as f64) * 100.0 } } impl Default for VectorIntersection { fn default() -> Self { Self { common: Vec::new(), only_in_first: Vec::new(), only_in_second: Vec::new(), } } } impl VectorIntersection { fn merge(&mut self, other: &mut Self) { self.common.append(&mut other.common); self.only_in_first.append(&mut other.only_in_first); self.only_in_second.append(&mut other.only_in_second); } } fn intersection(vec1: &[T], vec2: &[T]) -> VectorIntersection { let mut result = VectorIntersection::default(); let mut i = 0; let mut j = 0; while i < vec1.len() && j < vec2.len() { match vec1[i].cmp(&vec2[j]) { Ordering::Less => { result.only_in_first.push(vec1[i].clone()); i += 1; } Ordering::Greater => { result.only_in_second.push(vec2[j].clone()); j += 1; } Ordering::Equal => { let val = &vec1[i]; let mut count1 = 1; let mut count2 = 1; // Count occurrences in vec1 while i + 1 < vec1.len() && &vec1[i + 1] == val { i += 1; count1 += 1; } // Count occurrences in vec2 while j + 1 < vec2.len() && &vec2[j + 1] == val { j += 1; count2 += 1; } // Add to common result .common .extend(std::iter::repeat_n(val.clone(), count1.min(count2))); // Add excess to only_in_first or only_in_second match count1.cmp(&count2) { Ordering::Greater => { result .only_in_first .extend(std::iter::repeat_n(val.clone(), count1 - count2)); } Ordering::Less => { result .only_in_second .extend(std::iter::repeat_n(val.clone(), count2 - count1)); } Ordering::Equal => { // No excess elements, do nothing } } i += 1; j += 1; } } } result.only_in_first.extend(vec1[i..].iter().cloned()); result.only_in_second.extend(vec2[j..].iter().cloned()); result } pub fn par_intersection( vec1: &[T], vec2: &[T], ) -> VectorIntersection { par_intersection_by_value(vec1, vec2) } fn par_intersection_by_value( vec1: &[T], vec2: &[T], ) -> VectorIntersection { const SEQUENTIAL_THRESHOLD: usize = 4096; if vec1.len() + vec2.len() <= SEQUENTIAL_THRESHOLD { return intersection(vec1, vec2); } if vec1.is_empty() { return VectorIntersection { common: Vec::new(), only_in_first: Vec::new(), only_in_second: vec2.to_vec(), }; } if vec2.is_empty() { return VectorIntersection { common: Vec::new(), only_in_first: vec1.to_vec(), only_in_second: Vec::new(), }; } let pivot = &vec1[vec1.len() / 2]; let v1_left = vec1.partition_point(|x| x < pivot); let v1_right = vec1.partition_point(|x| x <= pivot); let v2_left = vec2.partition_point(|x| x < pivot); let v2_right = vec2.partition_point(|x| x <= pivot); let (mut left, mut right) = rayon::join( || par_intersection_by_value(&vec1[..v1_left], &vec2[..v2_left]), || par_intersection_by_value(&vec1[v1_right..], &vec2[v2_right..]), ); let count1 = v1_right - v1_left; let count2 = v2_right - v2_left; let mut middle = VectorIntersection::default(); middle .common .extend(std::iter::repeat_n(pivot.clone(), count1.min(count2))); match count1.cmp(&count2) { Ordering::Greater => middle .only_in_first .extend(std::iter::repeat_n(pivot.clone(), count1 - count2)), Ordering::Less => middle .only_in_second .extend(std::iter::repeat_n(pivot.clone(), count2 - count1)), Ordering::Equal => {} } left.merge(&mut middle); left.merge(&mut right); left } pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 { let m = dna_sequence.len() as f64; // Early return for empty sequences if m == 0.0 { return 0.0; } // Count occurrences of each base let mut bases = HashMap::::new(); for base in dna_sequence.chars() { *bases.entry(base).or_insert(0) += 1; } // Calculate Shannon entropy let shannon_entropy_value: f64 = bases .values() .map(|&n_i| { let p_i = n_i as f64 / m; if p_i > 0.0 { -p_i * p_i.log2() } else { 0.0 // Avoid log2(0) } }) .sum(); shannon_entropy_value } pub fn mean(values: &[T]) -> f64 where T: Copy + Add + Div + Sum + Into, { let count = values.len(); if count == 0 { return 0.0; } let sum: T = values.iter().copied().sum(); sum.into() / count as f64 } pub fn bin_data(data: Vec, bin_size: V) -> Vec<(V, usize)> where V: Copy + Default + PartialOrd + std::ops::AddAssign + std::ops::Add, { if data.is_empty() || bin_size.partial_cmp(&V::default()) != Some(Ordering::Greater) { return Vec::new(); } // Sort comparable data points. For floats, this drops NaN values. let mut sorted_data: Vec = data .into_iter() .filter(|v| v.partial_cmp(v) == Some(Ordering::Equal)) .collect(); if sorted_data.is_empty() { return Vec::new(); } sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal)); // Initialize bins let mut bins: Vec<(V, usize)> = Vec::new(); let mut current_bin_start = sorted_data[0]; let mut count = 0; for &value in &sorted_data { if value < current_bin_start + bin_size { count += 1; } else { bins.push((current_bin_start, count)); current_bin_start += bin_size; while value >= current_bin_start + bin_size { current_bin_start += bin_size; } count = 1; } } // Push the last bin bins.push((current_bin_start, count)); bins } // fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> { // if data.is_empty() || num_bins == 0 { // return vec![]; // } // // let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize; // // (0..num_bins) // .map(|i| { // let start = i * bin_size; // let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()` // // // If `start` is out of bounds, return (0, 0) // if start >= data.len() { // return (0, 0); // } // // let bin = &data[start..end]; // // let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum(); // let count = bin.len() as u128; // let mean_x = (sum_x / count) as u32; // Rounded down to nearest u32 // // let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum(); // // (mean_x, sum_n) // }) // .collect() // } // pub fn app_storage_dir() -> anyhow::Result { let app_name = env!("CARGO_PKG_NAME"); let app_dir = dirs::data_dir() .context("Failed to get data directory")? .join(app_name); if !app_dir.exists() { fs::create_dir_all(&app_dir).context("Failed to create application directory")?; } Ok(app_dir) } use blake3::Hasher as Blake3Hasher; use std::hash::{BuildHasher, Hasher}; pub struct Blake3Hash(Blake3Hasher); impl Hasher for Blake3Hash { fn finish(&self) -> u64 { let hash = self.0.finalize(); u64::from_le_bytes(hash.as_bytes()[..8].try_into().unwrap()) } fn write(&mut self, bytes: &[u8]) { self.0.update(bytes); } } /// Default Hasher #[derive(Default, Clone)] pub struct Blake3BuildHasher; impl BuildHasher for Blake3BuildHasher { type Hasher = Blake3Hash; fn build_hasher(&self) -> Self::Hasher { Blake3Hash(Blake3Hasher::new()) } } // Custom 128-bit hash type #[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug, Encode, Decode)] pub struct Hash128([u8; 16]); impl std::hash::Hash for Hash128 { fn hash(&self, state: &mut H) { state.write(&self.0); } } impl Hash128 { pub fn new(bytes: [u8; 16]) -> Self { Hash128(bytes) } pub fn to_bytes(&self) -> [u8; 16] { self.0 } } pub fn get_dir_size(path: &Path) -> std::io::Result { let mut total_size = 0; if path.is_dir() { for entry in fs::read_dir(path)? { let entry = entry?; let path = entry.path(); if path.is_file() { total_size += path.metadata()?.len(); } else if path.is_dir() { total_size += get_dir_size(&path)?; } } } Ok(total_size) } /// Finds all files matching the given glob pattern. /// /// # Arguments /// /// * `pattern` - A glob pattern string (e.g., `"data/**/*.txt"`). /// /// # Returns /// /// * `Ok(Vec)` with all successfully matched file paths. /// * `Err` if the glob pattern is invalid or any matched path fails to resolve. /// /// # Errors /// /// Returns an error if: /// - The glob pattern is invalid. /// - A file path matching the pattern cannot be resolved. /// /// # Examples /// /// ```rust /// let files = find_files("src/**/*.rs")?; /// for file in files { /// println!("{:?}", file); /// } /// ``` pub fn find_files(pattern: &str) -> anyhow::Result> { let mut result = Vec::new(); let entries = glob(pattern).with_context(|| format!("Invalid glob pattern: '{}'", pattern))?; for entry in entries { let path = entry.with_context(|| format!("Failed to resolve path for pattern '{}'", pattern))?; result.push(path); } result.sort(); Ok(result) } // fn system_time_to_utc(system_time: SystemTime) -> Option> { // system_time // .duration_since(UNIX_EPOCH) // .ok() // .map(|duration| Utc.timestamp(duration.as_secs() as i64, duration.subsec_nanos())) // } /// List all files in a directory that have the given suffix (file extension). /// /// # Arguments /// * `dir` – Directory to scan. /// * `suffix` – File extension to match (without the dot). /// /// # Returns /// A vector of `PathBuf` entries whose extension matches `suffix`. /// /// # Errors /// Returns any I/O error from `read_dir` or directory traversal. pub fn list_files_with_ext(dir: impl AsRef, ext: &str) -> std::io::Result> { let mut out = Vec::new(); for entry in fs::read_dir(dir)? { let entry = entry?; let path = entry.path(); if path.is_file() { if let Some(file_ext) = path.extension().and_then(|e| e.to_str()) { if file_ext == ext { out.push(path); } } } } out.sort(); Ok(out) } pub fn list_directories(dir_path: PathBuf) -> std::io::Result> { let mut directories = Vec::new(); for entry in fs::read_dir(dir_path)? { let entry = entry?; let path = entry.path(); // Check if the path is a directory if path.is_dir() { if let Some(dir_name) = path.file_name().and_then(|name| name.to_str()) { directories.push(dir_name.to_string()); } } } directories.sort(); Ok(directories) } pub fn list_files_recursive(root: impl AsRef) -> Vec { fn walk(dir: &Path, out: &mut Vec) { let entries = match fs::read_dir(dir) { Ok(entries) => entries, Err(e) => { warn!("Cannot read directory {}: {}", dir.display(), e); return; } }; for entry in entries { let entry = match entry { Ok(entry) => entry, Err(e) => { warn!("Cannot read directory entry in {}: {}", dir.display(), e); continue; } }; let path = entry.path(); if path.is_dir() { walk(&path, out); } else if path.is_file() { out.push(path); } } } let mut files = Vec::new(); walk(root.as_ref(), &mut files); files.sort(); files } /// Checks whether the modification time of `file1` is older than `file2`. /// /// If `rm` is `true` and `file1` is older, attempts to remove the directory containing `file1`. /// /// # Arguments /// /// * `file1` - Path to the first file. /// * `file2` - Path to the second file. /// * `rm` - If true, and `file1` is older, attempts to remove its parent directory. /// /// # Returns /// /// * `Ok(true)` if `file1` is older than `file2`. /// * `Ok(false)` otherwise. /// /// # Errors /// /// Returns an [`anyhow::Error`] if: /// - Either `file1` or `file2` does not exist. /// - File metadata cannot be read. /// - File modification times cannot be retrieved. /// - Directory removal fails when `rm == true` and `file1` is older. /// pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result { debug!("is_file_older {file1} {file2}, {rm:?}"); let mtime1 = fs::metadata(file1) .with_context(|| format!("Failed to read metadata for '{}'", file1))? .modified() .with_context(|| format!("Failed to get modified time for '{}'", file1))?; let mtime2 = fs::metadata(file2) .with_context(|| format!("Failed to read metadata for '{}'", file2))? .modified() .with_context(|| format!("Failed to get modified time for '{}'", file2))?; if mtime1 < mtime2 && rm { if let Some(file1_dir) = Path::new(file1).parent() { warn!("Removing old directory: {}", file1_dir.display()); fs::remove_dir_all(file1_dir).map_err(|e| { warn!("Failed to remove {}: {}", file1_dir.display(), e); e })?; } } Ok(mtime1 < mtime2) } pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> { debug!("Trying to remove: {dir}"); match fs::remove_dir_all(dir) { Ok(_) => {} Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} Err(e) => anyhow::bail!("Failed to remove directory '{}': {}", dir, e), } Ok(()) } /// Searches a directory for the first file matching a given name pattern. /// /// This function looks for a file in the given directory whose filename /// starts with the provided `starts_with` prefix and ends with the provided /// `ends_with` suffix. It returns the full path to the first match found. /// /// # Arguments /// /// * `dir` - A reference to the directory path where the search will occur. /// * `starts_with` - The required prefix of the filename (e.g., `"throughput_"`). /// * `ends_with` - The required suffix of the filename (e.g., `".csv"`). /// /// # Returns /// /// * `Some(PathBuf)` - Path to the first file matching the pattern. /// * `None` - If no matching file is found or the directory can't be read. /// /// # Example /// /// ```rust /// let dir = std::path::Path::new("/path/to/data"); /// if let Some(path) = find_matching_file(dir, "throughput_", ".csv") { /// println!("Found file: {}", path.display()); /// } else { /// eprintln!("No matching file found."); /// } /// ``` pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option { let mut matches: Vec<_> = fs::read_dir(dir) .ok()? .filter_map(Result::ok) .map(|entry| entry.path()) .filter(|path| { path.is_file() && path .file_name() .and_then(|name| name.to_str()) .map(|name| name.starts_with(starts_with) && name.ends_with(ends_with)) .unwrap_or(false) }) .collect(); matches.sort(); matches.into_iter().next() } /// Searches for the first file in the given directory whose file name /// satisfies the provided condition. /// /// # Arguments /// /// * `dir` - Path to the directory to search. /// * `condition` - A closure that takes a file name (`&str`) and returns `true` /// if the file matches the desired condition. /// /// # Returns /// /// An `Option` containing the path to the first matching file, /// or `None` if no file matches or the directory can't be read. /// /// # Examples /// /// ``` /// use std::path::Path; /// let result = find_file(Path::new("."), |name| name.ends_with(".rs")); /// ``` pub fn find_file(dir: &Path, condition: F) -> Option where F: Fn(&str) -> bool, { let mut matches: Vec<_> = fs::read_dir(dir) .ok()? .filter_map(Result::ok) .map(|entry| entry.path()) .filter(|path| { path.is_file() && path .file_name() .and_then(|name| name.to_str()) .map(&condition) .unwrap_or(false) }) .collect(); matches.sort(); matches.into_iter().next() } /// Represents a string made of either: /// - one character repeated N times, /// - a two-character block repeated N times, /// - or no valid repetition pattern. #[derive(Debug, PartialEq)] pub enum Repeat { None, RepOne(char, usize), RepTwo(String, usize), } pub fn detect_repetition(s: &str) -> Repeat { let len = s.chars().count(); if len == 0 { return Repeat::None; } // Check for single-char repetition let mut chars = s.chars(); let first = chars.next().unwrap(); if s.chars().all(|c| c == first) { return Repeat::RepOne(first, len); } // Check for two-char block repetition if len.is_multiple_of(2) { let mut iter = s.chars(); let a = iter.next().unwrap(); let b = iter.next().unwrap(); let mut count = 1; while let (Some(c), Some(d)) = (iter.next(), iter.next()) { if c != a || d != b { return Repeat::None; } count += 1; } return Repeat::RepTwo(format!("{}{}", a, b), count); } Repeat::None } pub fn test_init() { let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug")) .try_init(); } use regex::Regex; /// Extracts the numeric barcode suffix from a filename. /// /// # Examples /// /// ``` /// let f = "sample_SQK-NBD114-24_barcode07.bam"; /// assert_eq!(extract_barcode(f), Some(7)); /// ``` /// /// Returns `None` if no `barcodeNN` pattern is found /// or if the number cannot be parsed. pub fn extract_barcode(name: &str) -> Option { static BARCODE_RE: OnceLock = OnceLock::new(); let re = BARCODE_RE.get_or_init(|| Regex::new(r"barcode(\d+)").unwrap()); re.captures(name) .and_then(|c| c.get(1)) .and_then(|m| m.as_str().parse::().ok()) } pub fn human_size(bytes: u64) -> String { const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"]; let mut size = bytes as f64; let mut unit = 0; while size >= 1024.0 && unit < UNITS.len() - 1 { size /= 1024.0; unit += 1; } if unit == 0 { format!("{} {}", bytes, UNITS[unit]) } else { format!("{:.2} {}", size, UNITS[unit]) } } /// RAII guard that removes a temporary directory when dropped. pub struct TempDirGuard { path: PathBuf, } impl TempDirGuard { pub fn new(path: PathBuf) -> Self { Self { path } } } impl Drop for TempDirGuard { fn drop(&mut self) { if self.path.exists() { // Log the attempt. Using eprintln/warn! ensures the error is noted, // but the program does not panic. if let Err(e) = fs::remove_dir_all(&self.path) { error!( "Failed to remove temporary directory '{}': {}", self.path.display(), e ); } } } } /// Removes a BAM file and its associated index file (`.bam.bai` or `.bai`). pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> { if bam.exists() { fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?; } let bai = bam.with_extension("bam.bai"); if bai.exists() { fs::remove_file(&bai).ok(); } let bai_alt = bam.with_extension("bai"); if bai_alt.exists() { fs::remove_file(&bai_alt).ok(); } Ok(()) } /// Guard that tracks temporary filesystem paths and cleans them up on drop /// unless explicitly disarmed. /// /// This is intended for pipeline-style code where intermediate outputs /// are produced by downstream tools. The guard only tracks paths; it does /// **not** create files itself. /// /// If the pipeline completes successfully, call [`disarm`] to prevent /// cleanup. Otherwise, all tracked paths are removed on drop on a /// best-effort basis. /// /// # Example /// /// ```no_run /// use std::path::PathBuf; /// /// let tmp_dir = PathBuf::from("/tmp"); /// let mut guard = TempFileGuard::new(); /// /// let path1 = guard.temp_path(".tmp", &tmp_dir); /// let path2 = guard.temp_path(".tmp", &tmp_dir); /// /// // Produce outputs at `path1` and `path2` /// /// // Mark success /// guard.disarm(); /// ``` /// /// If `disarm()` is not called, both paths are cleaned up automatically /// when the guard is dropped. pub struct TempFileGuard { files: Vec, disarmed: AtomicBool, } impl Default for TempFileGuard { fn default() -> Self { Self::new() } } impl TempFileGuard { pub fn new() -> Self { Self { files: Vec::new(), disarmed: AtomicBool::new(false), } } /// Registers a temporary file for cleanup. pub fn track(&mut self, path: PathBuf) { self.files.push(path); } /// Registers multiple temporary files for cleanup. pub fn track_many(&mut self, paths: impl IntoIterator) { self.files.extend(paths); } /// Disarms the guard, preventing cleanup on drop. /// Call this when the pipeline succeeds. pub fn disarm(&self) { self.disarmed .store(true, std::sync::atomic::Ordering::SeqCst); } /// Manually clean up all tracked files. pub fn cleanup(&mut self) { for file in &self.files { if let Err(e) = remove_tracked_path(file) { warn!("Failed to clean up temp file {}: {}", file.display(), e); } } self.files.clear(); } pub fn clear(&mut self) { self.files.clear(); } /// Generates a unique temporary path inside `tmp_dir` and registers it /// for cleanup. /// /// The returned path is derived from a UUIDv4 and **no file is created**. /// The caller is responsible for creating or writing to the path. /// /// # Example /// /// ```no_run /// use std::path::PathBuf; /// /// let tmp_dir = PathBuf::from("/tmp"); /// let mut guard = TempFileGuard::new(); /// /// let path = guard.temp_path(".tmp", &tmp_dir); /// // create or write to `path` /// ``` pub fn tmp_path(&mut self, suffix: &str, tmp_dir: impl AsRef) -> PathBuf { let name = format!("pandora-tmp-{}{}", Uuid::new_v4(), suffix); let tmp_dir = tmp_dir.as_ref(); let path = tmp_dir.join(name); self.track(path.clone()); path } } fn remove_tracked_path(path: &Path) -> anyhow::Result<()> { if path.is_dir() { fs::remove_dir_all(path) .with_context(|| format!("Failed to remove temp directory: {}", path.display()))?; } else if path.exists() { fs::remove_file(path) .with_context(|| format!("Failed to remove temp file: {}", path.display()))?; } if path.extension().and_then(|e| e.to_str()) == Some("bam") { remove_existing_file(path.with_extension("bam.bai")); remove_existing_file(path.with_extension("bai")); } Ok(()) } fn remove_existing_file(path: PathBuf) { if path.exists() { let _ = fs::remove_file(path); } } impl Drop for TempFileGuard { fn drop(&mut self) { if !self.disarmed.load(std::sync::atomic::Ordering::SeqCst) { warn!( "Pipeline failed or panicked, cleaning up {} temporary files", self.files.len() ); self.cleanup(); } } } // pub fn temp_file_path(suffix: &str, config: &Config) -> std::io::Result { // let p = config.tmp_dir; // let temp_path = tempfile::Builder::new() // .prefix("pandora-temp-") // .suffix(suffix) // .rand_bytes(5) // .tempfile()? // .into_temp_path(); // // Ok(temp_path.to_path_buf()) // } /// Extracts genome sizes from BAM header. /// Extract contig names and lengths from a BAM header as a hash map. /// /// This delegates parsing to [`crate::io::bam::get_genome_sizes`]. Because the /// return type is a hash map, iteration order is not BAM header order; use /// [`get_genome_sizes_in_header_order`] when order matters. pub fn get_genome_sizes( header: &rust_htslib::bam::Header, ) -> anyhow::Result> { Ok(crate::io::bam::get_genome_sizes(header)? .into_iter() .collect()) } /// Extract contig names and lengths from a BAM header in header order. /// /// Use this when downstream region generation must follow the BAM/reference /// contig order. [`get_genome_sizes`] returns a hash map and does not preserve /// that order. pub fn get_genome_sizes_in_header_order( header: &rust_htslib::bam::HeaderView, ) -> anyhow::Result> { (0..header.target_count()) .map(|tid| { let name = String::from_utf8_lossy(header.tid2name(tid)).into_owned(); let len = header .target_len(tid) .with_context(|| format!("BAM header target {name} is missing length"))?; Ok((name, len)) }) .collect() } pub fn bam_contigs>(bam: P) -> anyhow::Result> { let reader = rust_htslib::bam::Reader::from_path(&bam)?; Ok(reader .header() .target_names() .iter() .map(|name| String::from_utf8_lossy(name).into_owned()) .collect()) } /// Split genome into ~n_parts equal-sized chunks (by number of bases), /// returning for each chunk a list of regions in `ctg:start-end` form. /// Split genome into ~n_parts equal-sized chunks (by bases). /// Each chunk is a single region `ctg:start-end` (no cross-contig regions). /// /// Note: the number of regions returned will be ≈ n_parts (may differ by 1–2). pub fn split_genome_into_n_regions( genome_sizes: &FxHashMap, n_parts: usize, ) -> Vec { if n_parts == 0 || genome_sizes.is_empty() { return Vec::new(); } // Deterministic contig order let mut contigs: Vec<(String, u64)> = genome_sizes .iter() .map(|(ctg, len)| (ctg.clone(), *len)) .collect(); contigs.sort_by(|a, b| a.0.cmp(&b.0)); let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum(); if total_bases == 0 { return Vec::new(); } let target_chunk_size: u64 = total_bases.div_ceil(n_parts as u64); // ceil let mut regions = Vec::new(); for (ctg, len) in contigs { let mut remaining = len; let mut start: u64 = 1; while remaining > 0 { let take = remaining.min(target_chunk_size); let end = start + take - 1; regions.push(format!("{ctg}:{start}-{end}")); remaining -= take; start = end + 1; } } regions } /// Split genome into exactly `n_parts` chunks with (almost) equal total size. /// Each chunk is a *list* of regions `ctg:start-end`. A chunk may span several /// contigs, but each region is within a single contig. /// /// The sizes of the parts will differ by at most 1 base. pub fn split_genome_into_n_regions_exact( genome_sizes: &FxHashMap, n_parts: usize, ) -> Vec> { let mut contigs: Vec<(String, u64)> = genome_sizes .iter() .map(|(ctg, len)| (ctg.clone(), *len)) .collect(); contigs.sort_by(|a, b| a.0.cmp(&b.0)); split_ordered_genome_into_n_regions_exact(&contigs, n_parts) } pub fn split_ordered_genome_into_n_regions_exact( genome_sizes: &[(String, u64)], n_parts: usize, ) -> Vec> { if n_parts == 0 || genome_sizes.is_empty() { return Vec::new(); } let total_bases: u64 = genome_sizes.iter().map(|(_, len)| *len).sum(); if total_bases == 0 { return Vec::new(); } // We cannot have more non-empty parts than bases if we use 1-based inclusive coordinates. let parts = n_parts.min(total_bases as usize); // Distribute bases as evenly as possible: // first `remainder` parts get (base_per_part + 1), the rest get base_per_part. let base_per_part = total_bases / parts as u64; let remainder = total_bases % parts as u64; let target_sizes: Vec = (0..parts) .map(|i| { if i < remainder as usize { base_per_part + 1 } else { base_per_part } }) .collect(); let mut chunks: Vec> = Vec::with_capacity(parts); let mut part_idx = 0; let mut remaining_in_part = target_sizes[0]; for (ctg, mut len) in genome_sizes.iter().cloned() { let mut start: u64 = 1; while len > 0 && part_idx < parts { let take = len.min(remaining_in_part); let end = start + take - 1; if chunks.len() <= part_idx { chunks.push(Vec::new()); } chunks[part_idx].push(format!("{ctg}:{start}-{end}")); len -= take; start = end + 1; remaining_in_part -= take; if remaining_in_part == 0 { part_idx += 1; if part_idx == parts { break; } remaining_in_part = target_sizes[part_idx]; } } } // Sanity: we should have exactly `parts` chunks, all non-empty debug_assert_eq!(chunks.len(), parts); chunks } /// Convert a numeric value to an RGB triplet using a diverging palette. /// /// Values are clipped to `[-clip, +clip]` and mapped as: /// - negative → blue /// - zero → white /// - positive → red /// /// # Arguments /// * `v` – value to color (e.g. delta methylation) /// * `clip` – symmetric clipping bound (must be > 0) /// /// # Returns /// `(r, g, b)` as 8-bit RGB components. /// /// # Panics /// Panics if `clip <= 0.0`. pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) { assert!(clip > 0.0, "clip must be > 0"); let x = v.clamp(-clip, clip); let t = (x + clip) / (2.0 * clip); // [-clip,+clip] → [0,1] if t < 0.5 { // blue → white let u = t / 0.5; let r = (255.0 * u).round() as u8; let g = (255.0 * u).round() as u8; (r, g, 255) } else { // white → red let u = (t - 0.5) / 0.5; let g = (255.0 * (1.0 - u)).round() as u8; let b = (255.0 * (1.0 - u)).round() as u8; (255, g, b) } } pub fn revcomp(s: &str) -> String { fn comp(b: u8) -> u8 { match b { b'A' => b'T', b'C' => b'G', b'G' => b'C', b'T' => b'A', b'N' => b'N', _ => b'N', } } let v: Vec = s .as_bytes() .iter() .rev() .map(|&b| comp(b.to_ascii_uppercase())) .collect(); String::from_utf8(v).unwrap() } #[cfg(test)] mod tests { use log::info; use rust_htslib::bam::{self, Read}; use crate::helpers::test_init; use super::*; #[test] fn par_intersection_keeps_values_only_in_second_outside_first_range() { let res = par_intersection(&[10], &[1]); assert_eq!(res.common, Vec::::new()); assert_eq!(res.only_in_first, vec![10]); assert_eq!(res.only_in_second, vec![1]); } #[test] fn par_intersection_handles_duplicate_counts() { let res = par_intersection(&[1, 2, 2, 3], &[0, 2, 2, 2, 4]); assert_eq!(res.common, vec![2, 2]); assert_eq!(res.only_in_first, vec![1, 3]); assert_eq!(res.only_in_second, vec![0, 2, 4]); } #[test] fn bin_data_handles_empty_and_invalid_bin_size() { assert_eq!(bin_data::(Vec::new(), 0.1), Vec::<(f64, usize)>::new()); assert_eq!(bin_data(vec![1.0, 2.0], 0.0), Vec::<(f64, usize)>::new()); assert_eq!( bin_data(vec![1.0, 2.0], f64::NAN), Vec::<(f64, usize)>::new() ); assert_eq!(bin_data(vec![f64::NAN], 0.1), Vec::<(f64, usize)>::new()); } #[test] fn bin_data_keeps_sparse_values_in_aligned_bins() { assert_eq!(bin_data(vec![0.0, 1.0], 0.25), vec![(0.0, 1), (1.0, 1)]); } #[test] fn split_ordered_genome_regions_preserves_input_contig_order() { let genome = vec![("chr2".to_string(), 2), ("chr1".to_string(), 2)]; assert_eq!( split_ordered_genome_into_n_regions_exact(&genome, 2), vec![vec!["chr2:1-2".to_string()], vec!["chr1:1-2".to_string()]] ); } #[test] fn split_g() -> anyhow::Result<()> { test_init(); let n_parts = 4; let reader = bam::Reader::from_path( "/mnt/beegfs02/scratch/t_steimle/data/wgs/34528/norm/34528_norm_hs1.bam", ) .with_context(|| format!("Failed to open BAM"))?; let header = bam::Header::from_template(reader.header()); let genome_sizes = get_genome_sizes(&header)?; // Split genome into regions let regions = split_genome_into_n_regions_exact(&genome_sizes, n_parts); info!("{:#?}", regions); Ok(()) } }