use anyhow::Context; use bitcode::{Decode, Encode}; use glob::glob; use log::{debug, error, warn}; use rust_htslib::bam::Read; use rustc_hash::FxHashMap; use serde::{Deserialize, Serialize}; use uuid::Uuid; use std::{ cmp::Ordering, collections::{HashMap, HashSet}, fmt, fs, iter::Sum, ops::{Add, Div}, path::{Path, PathBuf}, sync::atomic::AtomicBool, }; pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result { let mut matching_files = Vec::new(); for entry in fs::read_dir(dir_path).with_context(|| format!("Failed to read directory: {}", dir_path))? { let entry = entry.with_context(|| "Failed to read directory entry")?; let path = entry.path(); if path.is_file() && path .file_name() .and_then(|name| name.to_str()) .map(|name| name.ends_with(suffix)) .unwrap_or(false) { matching_files.push(path); } } match matching_files.len() { 0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix)) .with_context(|| format!("In directory: {}", dir_path)), 1 => Ok(matching_files[0].to_string_lossy().into_owned()), _ => Err(anyhow::anyhow!( "Multiple files found ending with '{}'", suffix )) .with_context(|| format!("In directory: {}", dir_path)), } } pub fn path_prefix(out: &str) -> anyhow::Result { let out_path = Path::new(out); let out_dir = out_path .parent() .ok_or_else(|| anyhow::anyhow!("Can't parse the dir of {}", out_path.display()))?; let name = out_path .file_name() .and_then(|name| name.to_str()) .ok_or_else(|| anyhow::anyhow!("Can't parse the file name of {}", out_path.display()))?; let stem = name .split_once('.') .map(|(stem, _)| stem) .ok_or_else(|| anyhow::anyhow!("Can't parse the file stem of {}", name))?; Ok(format!("{}/{stem}", out_dir.display())) } pub fn force_or_not(_path: &str, _force: bool) -> anyhow::Result<()> { // let path = Path::new(path); // let mut output_exists = path.exists(); // let dir = path // .parent() // .context(format!("Can't parse the parent dir of {}", path.display()))?; // // if force && output_exists { // 
fs::remove_dir_all(dir)?; // fs::create_dir_all(dir)?; // output_exists = false; // } // // if output_exists { // info!("{} already exists.", path.display()) // } Ok(()) } /// Builds Singularity bind flags from input/output paths. /// /// - Converts file paths to their parent directory. /// - Skips non-existing paths to avoid Singularity errors. /// - Canonicalizes and deduplicates directories. pub fn singularity_bind_flags(paths: I) -> String where I: IntoIterator, P: AsRef, { let mut seen = HashSet::new(); let mut binds = Vec::new(); let mut push_dir = |p: &Path| { if p.as_os_str().is_empty() || !p.exists() { return; } let dir = p.canonicalize().unwrap_or_else(|_| p.to_path_buf()); if seen.insert(dir.clone()) { binds.push(format!( "--bind {src}:{dst}", src = dir.display(), dst = dir.display() )); } }; for path_str in paths { let path = path_str.as_ref(); if path.is_dir() { push_dir(path); } else if let Some(parent) = path.parent() { push_dir(parent); } } binds.join(" ") } use rayon::prelude::*; use std::cmp::Ord; pub struct VectorIntersection { pub common: Vec, pub only_in_first: Vec, pub only_in_second: Vec, } impl fmt::Display for VectorIntersection { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let total_items = self.common.len() + self.only_in_first.len() + self.only_in_second.len(); writeln!(f, "Total items: {}", total_items)?; writeln!( f, "Common: {} ({:.2}%)", self.common.len(), percentage(self.common.len(), total_items) )?; writeln!( f, "Only in first: {} ({:.2}%)", self.only_in_first.len(), percentage(self.only_in_first.len(), total_items) )?; writeln!( f, "Only in second: {} ({:.2}%)", self.only_in_second.len(), percentage(self.only_in_second.len(), total_items) ) } } fn percentage(part: usize, total: usize) -> f64 { if total == 0 { 0.0 } else { (part as f64 / total as f64) * 100.0 } } impl Default for VectorIntersection { fn default() -> Self { Self { common: Vec::new(), only_in_first: Vec::new(), only_in_second: Vec::new(), } } } impl 
VectorIntersection { fn merge(&mut self, other: &mut Self) { self.common.append(&mut other.common); self.only_in_first.append(&mut other.only_in_first); self.only_in_second.append(&mut other.only_in_second); } } fn intersection(vec1: &[T], vec2: &[T]) -> VectorIntersection { let mut result = VectorIntersection::default(); let mut i = 0; let mut j = 0; while i < vec1.len() && j < vec2.len() { match vec1[i].cmp(&vec2[j]) { Ordering::Less => { result.only_in_first.push(vec1[i].clone()); i += 1; } Ordering::Greater => { result.only_in_second.push(vec2[j].clone()); j += 1; } Ordering::Equal => { let val = &vec1[i]; let mut count1 = 1; let mut count2 = 1; // Count occurrences in vec1 while i + 1 < vec1.len() && &vec1[i + 1] == val { i += 1; count1 += 1; } // Count occurrences in vec2 while j + 1 < vec2.len() && &vec2[j + 1] == val { j += 1; count2 += 1; } // Add to common result .common .extend(std::iter::repeat_n(val.clone(), count1.min(count2))); // Add excess to only_in_first or only_in_second match count1.cmp(&count2) { Ordering::Greater => { result .only_in_first .extend(std::iter::repeat_n(val.clone(), count1 - count2)); } Ordering::Less => { result .only_in_second .extend(std::iter::repeat_n(val.clone(), count2 - count1)); } Ordering::Equal => { // No excess elements, do nothing } } i += 1; j += 1; } } } result.only_in_first.extend(vec1[i..].iter().cloned()); result.only_in_second.extend(vec2[j..].iter().cloned()); result } pub fn par_intersection( vec1: &[T], vec2: &[T], ) -> VectorIntersection { let chunk_size = (vec1.len() / rayon::current_num_threads()).max(1); vec1.par_chunks(chunk_size) .map(|chunk| { let start = vec2.partition_point(|x| x < &chunk[0]); let end = vec2.partition_point(|x| x <= &chunk[chunk.len() - 1]); // Ensure start is not greater than end if start <= end { intersection(chunk, &vec2[start..end]) } else { // If start > end, there's no intersection for this chunk VectorIntersection::default() } }) .reduce(VectorIntersection::default, |mut 
acc, mut x| { acc.merge(&mut x); acc }) } pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 { let m = dna_sequence.len() as f64; // Early return for empty sequences if m == 0.0 { return 0.0; } // Count occurrences of each base let mut bases = HashMap::::new(); for base in dna_sequence.chars() { *bases.entry(base).or_insert(0) += 1; } // Calculate Shannon entropy let shannon_entropy_value: f64 = bases .values() .map(|&n_i| { let p_i = n_i as f64 / m; if p_i > 0.0 { -p_i * p_i.log2() } else { 0.0 // Avoid log2(0) } }) .sum(); shannon_entropy_value } pub fn mean(values: &[T]) -> f64 where T: Copy + Add + Div + Sum + Into, { let count = values.len(); if count == 0 { return 0.0; } let sum: T = values.iter().copied().sum(); sum.into() / count as f64 } pub fn bin_data(data: Vec, bin_size: V) -> Vec<(V, usize)> where V: Copy + PartialOrd + std::ops::AddAssign + std::ops::Add, { // Sort the data let mut sorted_data = data.clone(); sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap()); // Initialize bins let mut bins: Vec<(V, usize)> = Vec::new(); let mut current_bin_start = sorted_data[0]; let mut count = 0; for &value in &sorted_data { if value < current_bin_start + bin_size { count += 1; } else { bins.push((current_bin_start, count)); current_bin_start += bin_size; count = 1; } } // Push the last bin bins.push((current_bin_start, count)); bins } // fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> { // if data.is_empty() || num_bins == 0 { // return vec![]; // } // // let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize; // // (0..num_bins) // .map(|i| { // let start = i * bin_size; // let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()` // // // If `start` is out of bounds, return (0, 0) // if start >= data.len() { // return (0, 0); // } // // let bin = &data[start..end]; // // let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum(); // let count = bin.len() as u128; // let mean_x = 
(sum_x / count) as u32; // Rounded down to nearest u32
//
//             let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum();
//
//             (mean_x, sum_n)
//         })
//         .collect()
// }
//

/// Returns (creating it if needed) the per-application data directory,
/// e.g. `~/.local/share/<crate-name>` on Linux.
///
/// # Errors
///
/// Fails if the platform data directory cannot be determined or the
/// application directory cannot be created.
pub fn app_storage_dir() -> anyhow::Result<PathBuf> {
    let app_name = env!("CARGO_PKG_NAME");
    let app_dir = dirs::data_dir()
        .context("Failed to get data directory")?
        .join(app_name);

    if !app_dir.exists() {
        fs::create_dir_all(&app_dir).context("Failed to create application directory")?;
    }

    Ok(app_dir)
}

use blake3::Hasher as Blake3Hasher;
use std::hash::{BuildHasher, Hasher};

/// `std::hash::Hasher` adapter over BLAKE3; `finish` truncates the 256-bit
/// digest to its first 8 little-endian bytes.
pub struct Blake3Hash(Blake3Hasher);

impl Hasher for Blake3Hash {
    fn finish(&self) -> u64 {
        let hash = self.0.finalize();
        // First 8 bytes of the digest, interpreted little-endian.
        u64::from_le_bytes(hash.as_bytes()[..8].try_into().unwrap())
    }

    fn write(&mut self, bytes: &[u8]) {
        self.0.update(bytes);
    }
}

/// Default Hasher
#[derive(Default, Clone)]
pub struct Blake3BuildHasher;

impl BuildHasher for Blake3BuildHasher {
    type Hasher = Blake3Hash;

    fn build_hasher(&self) -> Self::Hasher {
        Blake3Hash(Blake3Hasher::new())
    }
}

// Custom 128-bit hash type
#[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug, Encode, Decode)]
pub struct Hash128([u8; 16]);

impl std::hash::Hash for Hash128 {
    fn hash<H: Hasher>(&self, state: &mut H) {
        state.write(&self.0);
    }
}

impl Hash128 {
    pub fn new(bytes: [u8; 16]) -> Self {
        Hash128(bytes)
    }

    pub fn to_bytes(&self) -> [u8; 16] {
        self.0
    }
}

/// Recursively sums the sizes (in bytes) of all regular files under `path`.
/// Returns 0 if `path` is not a directory.
pub fn get_dir_size(path: &Path) -> std::io::Result<u64> {
    let mut total_size = 0;
    if path.is_dir() {
        for entry in fs::read_dir(path)? {
            let entry = entry?;
            let path = entry.path();
            if path.is_file() {
                total_size += path.metadata()?.len();
            } else if path.is_dir() {
                total_size += get_dir_size(&path)?;
            }
        }
    }
    Ok(total_size)
}

/// Finds all files matching the given glob pattern.
///
/// # Arguments
///
/// * `pattern` - A glob pattern string (e.g., `"data/**/*.txt"`).
///
/// # Returns
///
/// * `Ok(Vec<PathBuf>)` with all successfully matched file paths.
/// * `Err` if the glob pattern is invalid or any matched path fails to resolve.
///
/// # Errors
///
/// Returns an error if:
/// - The glob pattern is invalid.
/// - A file path matching the pattern cannot be resolved.
///
/// # Examples
///
/// ```rust,no_run
/// let files = find_files("src/**/*.rs")?;
/// for file in files {
///     println!("{:?}", file);
/// }
/// ```
pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
    let mut result = Vec::new();
    let entries = glob(pattern).with_context(|| format!("Invalid glob pattern: '{}'", pattern))?;
    for entry in entries {
        let path =
            entry.with_context(|| format!("Failed to resolve path for pattern '{}'", pattern))?;
        result.push(path);
    }
    Ok(result)
}

// fn system_time_to_utc(system_time: SystemTime) -> Option<DateTime<Utc>> {
//     system_time
//         .duration_since(UNIX_EPOCH)
//         .ok()
//         .map(|duration| Utc.timestamp(duration.as_secs() as i64, duration.subsec_nanos()))
// }

/// List all files in a directory that have the given file extension.
///
/// # Arguments
/// * `dir` – Directory to scan.
/// * `ext` – File extension to match (without the dot).
///
/// # Returns
/// A vector of `PathBuf` entries whose extension matches `ext`.
///
/// # Errors
/// Returns any I/O error from `read_dir` or directory traversal.
pub fn list_files_with_ext(dir: impl AsRef<Path>, ext: &str) -> std::io::Result<Vec<PathBuf>> {
    let mut out = Vec::new();
    for entry in fs::read_dir(dir)? {
        let entry = entry?;
        let path = entry.path();
        if path.is_file() {
            if let Some(file_ext) = path.extension().and_then(|e| e.to_str()) {
                if file_ext == ext {
                    out.push(path);
                }
            }
        }
    }
    Ok(out)
}

/// Returns the names (not full paths) of all immediate subdirectories of
/// `dir_path`. Entries whose names are not valid UTF-8 are skipped.
pub fn list_directories(dir_path: PathBuf) -> std::io::Result<Vec<String>> {
    let mut directories = Vec::new();
    for entry in fs::read_dir(dir_path)? {
        let entry = entry?;
        let path = entry.path();
        // Check if the path is a directory
        if path.is_dir() {
            if let Some(dir_name) = path.file_name().and_then(|name| name.to_str()) {
                directories.push(dir_name.to_string());
            }
        }
    }
    Ok(directories)
}

/// Recursively lists all regular files under `root`.
/// I/O errors during traversal are silently ignored (best-effort walk).
pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
    fn walk(dir: &Path, out: &mut Vec<PathBuf>) {
        if let Ok(entries) = fs::read_dir(dir) {
            for entry in entries.flatten() {
                let path = entry.path();
                if path.is_dir() {
                    walk(&path, out);
                } else if path.is_file() {
                    out.push(path);
                }
            }
        }
    }

    let mut files = Vec::new();
    walk(root.as_ref(), &mut files);
    files
}

/// Checks whether the modification time of `file1` is older than `file2`.
///
/// If `rm` is `true` and `file1` is older, attempts to remove the directory containing `file1`.
///
/// # Arguments
///
/// * `file1` - Path to the first file.
/// * `file2` - Path to the second file.
/// * `rm` - If true, and `file1` is older, attempts to remove its parent directory.
///
/// # Returns
///
/// * `Ok(true)` if `file1` is older than `file2`.
/// * `Ok(false)` otherwise.
///
/// # Errors
///
/// Returns an [`anyhow::Error`] if:
/// - Either `file1` or `file2` does not exist.
/// - File metadata cannot be read.
/// - File modification times cannot be retrieved.
/// - (if `rm == true`) Directory removal fails.
pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
    debug!("is_file_older {file1} {file2}, {rm:?}");
    let mtime1 = fs::metadata(file1)
        .with_context(|| format!("Failed to read metadata for '{}'", file1))?
        .modified()
        .with_context(|| format!("Failed to get modified time for '{}'", file1))?;
    let mtime2 = fs::metadata(file2)
        .with_context(|| format!("Failed to read metadata for '{}'", file2))?
        .modified()
        .with_context(|| format!("Failed to get modified time for '{}'", file2))?;

    if mtime1 < mtime2 && rm {
        if let Some(file1_dir) = Path::new(file1).parent() {
            warn!("Removing old directory: {}", file1_dir.display());
            fs::remove_dir_all(file1_dir).map_err(|e| {
                warn!("Failed to remove {}: {}", file1_dir.display(), e);
                e
            })?;
        }
    }

    Ok(mtime1 < mtime2)
}

/// Currently a no-op: the actual removal is disabled (kept commented out
/// below) and only the attempt is logged.
pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
    debug!("Trying to remove: {dir}");
    // match fs::remove_dir_all(dir) {
    //     Ok(_) => {}
    //     Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
    //     Err(e) => {
    //         anyhow::bail!("Failed to remove directory '{}': {}", dir, e);
    //     }
    // };
    Ok(())
}

/// Searches a directory for the first file matching a given name pattern.
///
/// This function looks for a file in the given directory whose filename
/// starts with the provided `starts_with` prefix and ends with the provided
/// `ends_with` suffix. It returns the full path to the first match found.
///
/// # Arguments
///
/// * `dir` - A reference to the directory path where the search will occur.
/// * `starts_with` - The required prefix of the filename (e.g., `"throughput_"`).
/// * `ends_with` - The required suffix of the filename (e.g., `".csv"`).
///
/// # Returns
///
/// * `Some(PathBuf)` - Path to the first file matching the pattern.
/// * `None` - If no matching file is found or the directory can't be read.
///
/// # Example
///
/// ```rust,no_run
/// let dir = std::path::Path::new("/path/to/data");
/// if let Some(path) = find_matching_file(dir, "throughput_", ".csv") {
///     println!("Found file: {}", path.display());
/// } else {
///     eprintln!("No matching file found.");
/// }
/// ```
pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
    fs::read_dir(dir)
        .ok()?
        .filter_map(Result::ok)
        .map(|entry| entry.path())
        .find(|path| {
            path.is_file()
                && path
                    .file_name()
                    .and_then(|name| name.to_str())
                    .map(|name| name.starts_with(starts_with) && name.ends_with(ends_with))
                    .unwrap_or(false)
        })
}

/// Searches for the first file in the given directory whose file name
/// satisfies the provided condition.
///
/// # Arguments
///
/// * `dir` - Path to the directory to search.
/// * `condition` - A closure that takes a file name (`&str`) and returns `true`
///   if the file matches the desired condition.
///
/// # Returns
///
/// An `Option<PathBuf>` containing the path to the first matching file,
/// or `None` if no file matches or the directory can't be read.
///
/// # Examples
///
/// ```no_run
/// use std::path::Path;
/// let result = find_file(Path::new("."), |name| name.ends_with(".rs"));
/// ```
pub fn find_file<F>(dir: &Path, condition: F) -> Option<PathBuf>
where
    F: Fn(&str) -> bool,
{
    fs::read_dir(dir).ok()?.find_map(|entry| {
        let path = entry.ok()?.path();
        if path.is_file()
            && path
                .file_name()
                .and_then(|name| name.to_str())
                .map(&condition)
                .unwrap_or(false)
        {
            Some(path)
        } else {
            None
        }
    })
}

/// Represents a string made of either:
/// - one character repeated N times,
/// - a two-character block repeated N times,
/// - or no valid repetition pattern.
#[derive(Debug, PartialEq)] pub enum Repeat { None, RepOne(char, usize), RepTwo(String, usize), } pub fn detect_repetition(s: &str) -> Repeat { let len = s.chars().count(); if len == 0 { return Repeat::None; } // Check for single-char repetition let mut chars = s.chars(); let first = chars.next().unwrap(); if s.chars().all(|c| c == first) { return Repeat::RepOne(first, len); } // Check for two-char block repetition if len.is_multiple_of(2) { let mut iter = s.chars(); let a = iter.next().unwrap(); let b = iter.next().unwrap(); let mut count = 1; while let (Some(c), Some(d)) = (iter.next(), iter.next()) { if c != a || d != b { return Repeat::None; } count += 1; } return Repeat::RepTwo(format!("{}{}", a, b), count); } Repeat::None } pub fn test_init() { let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug")) .try_init(); } use regex::Regex; /// Extracts the numeric barcode suffix from a filename. /// /// # Examples /// /// ``` /// let f = "sample_SQK-NBD114-24_barcode07.bam"; /// assert_eq!(extract_barcode(f), Some(7)); /// ``` /// /// Returns `None` if no `barcodeNN` pattern is found /// or if the number cannot be parsed. pub fn extract_barcode(name: &str) -> Option { let re = Regex::new(r"barcode(\d+)").unwrap(); re.captures(name) .and_then(|c| c.get(1)) .and_then(|m| m.as_str().parse::().ok()) } pub fn human_size(bytes: u64) -> String { const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"]; let mut size = bytes as f64; let mut unit = 0; while size >= 1024.0 && unit < UNITS.len() - 1 { size /= 1024.0; unit += 1; } if unit == 0 { format!("{} {}", bytes, UNITS[unit]) } else { format!("{:.2} {}", size, UNITS[unit]) } } // A RAII (Resource Acquisition Is Initialization) guard for cleaning up temporary directories. pub struct TempDirGuard { path: PathBuf, } impl TempDirGuard { pub fn new(path: PathBuf) -> Self { Self { path } } } // The core cleanup logic: executed automatically when the TempDirGuard is dropped. 
impl Drop for TempDirGuard { fn drop(&mut self) { if self.path.exists() { // Log the attempt. Using eprintln/warn! ensures the error is noted, // but the program does not panic. if let Err(e) = fs::remove_dir_all(&self.path) { error!( "Failed to remove temporary directory '{}': {}", self.path.display(), e ); } } } } /// Guard that tracks temporary filesystem paths and cleans them up on drop /// unless explicitly disarmed. /// /// This is intended for pipeline-style code where intermediate outputs /// are produced by downstream tools. The guard only tracks paths; it does /// **not** create files itself. /// /// If the pipeline completes successfully, call [`disarm`] to prevent /// cleanup. Otherwise, all tracked paths are removed on drop on a /// best-effort basis. /// /// # Example /// /// ```no_run /// use std::path::PathBuf; /// /// let tmp_dir = PathBuf::from("/tmp"); /// let mut guard = TempFileGuard::new(); /// /// let path1 = guard.temp_path(".tmp", &tmp_dir); /// let path2 = guard.temp_path(".tmp", &tmp_dir); /// /// // Produce outputs at `path1` and `path2` /// /// // Mark success /// guard.disarm(); /// ``` /// /// If `disarm()` is not called, both paths are cleaned up automatically /// when the guard is dropped. pub struct TempFileGuard { files: Vec, disarmed: AtomicBool, } impl Default for TempFileGuard { fn default() -> Self { Self::new() } } impl TempFileGuard { pub fn new() -> Self { Self { files: Vec::new(), disarmed: AtomicBool::new(false), } } /// Registers a temporary file for cleanup. pub fn track(&mut self, path: PathBuf) { self.files.push(path); } /// Registers multiple temporary files for cleanup. pub fn track_many(&mut self, paths: impl IntoIterator) { self.files.extend(paths); } /// Disarms the guard, preventing cleanup on drop. /// Call this when the pipeline succeeds. pub fn disarm(&self) { self.disarmed .store(true, std::sync::atomic::Ordering::SeqCst); } /// Manually clean up all tracked files. 
pub fn cleanup(&mut self) { for file in &self.files { if file.exists() { if let Err(e) = remove_bam_with_index(file) { warn!("Failed to clean up temp file {}: {}", file.display(), e); } } } self.files.clear(); } pub fn clear(&mut self) { self.files.clear(); } /// Generates a unique temporary path inside `tmp_dir` and registers it /// for cleanup. /// /// The returned path is derived from a UUIDv4 and **no file is created**. /// The caller is responsible for creating or writing to the path. /// /// # Example /// /// ```no_run /// use std::path::PathBuf; /// /// let tmp_dir = PathBuf::from("/tmp"); /// let mut guard = TempFileGuard::new(); /// /// let path = guard.temp_path(".tmp", &tmp_dir); /// // create or write to `path` /// ``` pub fn tmp_path(&mut self, suffix: &str, tmp_dir: impl AsRef) -> PathBuf { let name = format!("pandora-tmp-{}{}", Uuid::new_v4(), suffix); let tmp_dir = tmp_dir.as_ref(); let path = tmp_dir.join(name); self.track(path.clone()); path } } impl Drop for TempFileGuard { fn drop(&mut self) { if !self.disarmed.load(std::sync::atomic::Ordering::SeqCst) { warn!( "Pipeline failed or panicked, cleaning up {} temporary files", self.files.len() ); self.cleanup(); } } } // pub fn temp_file_path(suffix: &str, config: &Config) -> std::io::Result { // let p = config.tmp_dir; // let temp_path = tempfile::Builder::new() // .prefix("pandora-temp-") // .suffix(suffix) // .rand_bytes(5) // .tempfile()? // .into_temp_path(); // // Ok(temp_path.to_path_buf()) // } /// Extracts genome sizes from BAM header. 
pub fn get_genome_sizes( header: &rust_htslib::bam::Header, ) -> anyhow::Result> { let mut sizes = FxHashMap::default(); for (_, records) in header.to_hashmap() { for record in records { if let (Some(sn), Some(ln)) = (record.get("SN"), record.get("LN")) { if let Ok(len) = ln.parse::() { sizes.insert(sn.clone(), len); } } } } Ok(sizes) } pub fn bam_contigs>(bam: P) -> anyhow::Result> { let reader = rust_htslib::bam::Reader::from_path(&bam)?; let header = rust_htslib::bam::Header::from_template(reader.header()); Ok(get_genome_sizes(&header)?.into_keys().collect()) } /// Split genome into ~n_parts equal-sized chunks (by number of bases), /// returning for each chunk a list of regions in `ctg:start-end` form. /// Split genome into ~n_parts equal-sized chunks (by bases). /// Each chunk is a single region `ctg:start-end` (no cross-contig regions). /// /// Note: the number of regions returned will be ≈ n_parts (may differ by 1–2). pub fn split_genome_into_n_regions( genome_sizes: &FxHashMap, n_parts: usize, ) -> Vec { if n_parts == 0 || genome_sizes.is_empty() { return Vec::new(); } // Deterministic contig order let mut contigs: Vec<(String, u64)> = genome_sizes .iter() .map(|(ctg, len)| (ctg.clone(), *len)) .collect(); contigs.sort_by(|a, b| a.0.cmp(&b.0)); let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum(); if total_bases == 0 { return Vec::new(); } let target_chunk_size: u64 = total_bases.div_ceil(n_parts as u64); // ceil let mut regions = Vec::new(); for (ctg, len) in contigs { let mut remaining = len; let mut start: u64 = 1; while remaining > 0 { let take = remaining.min(target_chunk_size); let end = start + take - 1; regions.push(format!("{ctg}:{start}-{end}")); remaining -= take; start = end + 1; } } regions } /// Split genome into exactly `n_parts` chunks with (almost) equal total size. /// Each chunk is a *list* of regions `ctg:start-end`. A chunk may span several /// contigs, but each region is within a single contig. 
/// /// The sizes of the parts will differ by at most 1 base. pub fn split_genome_into_n_regions_exact( genome_sizes: &FxHashMap, n_parts: usize, ) -> Vec> { if n_parts == 0 || genome_sizes.is_empty() { return Vec::new(); } // Deterministic contig order let mut contigs: Vec<(String, u64)> = genome_sizes .iter() .map(|(ctg, len)| (ctg.clone(), *len)) .collect(); contigs.sort_by(|a, b| a.0.cmp(&b.0)); let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum(); if total_bases == 0 { return Vec::new(); } // We cannot have more non-empty parts than bases if we use 1-based inclusive coordinates. let parts = n_parts.min(total_bases as usize); // Distribute bases as evenly as possible: // first `remainder` parts get (base_per_part + 1), the rest get base_per_part. let base_per_part = total_bases / parts as u64; let remainder = total_bases % parts as u64; let target_sizes: Vec = (0..parts) .map(|i| { if i < remainder as usize { base_per_part + 1 } else { base_per_part } }) .collect(); let mut chunks: Vec> = Vec::with_capacity(parts); let mut part_idx = 0; let mut remaining_in_part = target_sizes[0]; let mut ctg_iter = contigs.into_iter(); while let Some((ctg, mut len)) = ctg_iter.next() { let mut start: u64 = 1; while len > 0 && part_idx < parts { let take = len.min(remaining_in_part); let end = start + take - 1; if chunks.len() <= part_idx { chunks.push(Vec::new()); } chunks[part_idx].push(format!("{ctg}:{start}-{end}")); len -= take; start = end + 1; remaining_in_part -= take; if remaining_in_part == 0 { part_idx += 1; if part_idx == parts { break; } remaining_in_part = target_sizes[part_idx]; } } } // Sanity: we should have exactly `parts` chunks, all non-empty debug_assert_eq!(chunks.len(), parts); chunks } /// Removes a BAM file and its associated index file (.bai). 
pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> { // Remove BAM if bam.exists() { fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?; } // Remove index (.bam.bai or .bai) let bai = bam.with_extension("bam.bai"); if bai.exists() { fs::remove_file(&bai).ok(); // Don't fail if index doesn't exist } let bai_alt = bam.with_extension("bai"); if bai_alt.exists() { fs::remove_file(&bai_alt).ok(); } Ok(()) } /// Convert a numeric value to an RGB triplet using a diverging palette. /// /// Values are clipped to `[-clip, +clip]` and mapped as: /// - negative → blue /// - zero → white /// - positive → red /// /// # Arguments /// * `v` – value to color (e.g. delta methylation) /// * `clip` – symmetric clipping bound (must be > 0) /// /// # Returns /// `(r, g, b)` as 8-bit RGB components. /// /// # Panics /// Panics if `clip <= 0.0`. pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) { assert!(clip > 0.0, "clip must be > 0"); let x = v.clamp(-clip, clip); let t = (x + clip) / (2.0 * clip); // [-clip,+clip] → [0,1] if t < 0.5 { // blue → white let u = t / 0.5; let r = (255.0 * u).round() as u8; let g = (255.0 * u).round() as u8; (r, g, 255) } else { // white → red let u = (t - 0.5) / 0.5; let g = (255.0 * (1.0 - u)).round() as u8; let b = (255.0 * (1.0 - u)).round() as u8; (255, g, b) } } pub fn revcomp(s: &str) -> String { fn comp(b: u8) -> u8 { match b { b'A' => b'T', b'C' => b'G', b'G' => b'C', b'T' => b'A', b'N' => b'N', _ => b'N', } } let v: Vec = s.as_bytes().iter().rev().map(|&b| comp(b.to_ascii_uppercase())).collect(); String::from_utf8(v).unwrap() } #[cfg(test)] mod tests { use log::info; use rust_htslib::bam::{self, Read}; use crate::helpers::test_init; use super::*; #[test] fn split_g() -> anyhow::Result<()> { test_init(); let n_parts = 4; let reader = bam::Reader::from_path( "/mnt/beegfs02/scratch/t_steimle/data/wgs/34528/norm/34528_norm_hs1.bam", ) .with_context(|| format!("Failed to open BAM"))?; let header = 
bam::Header::from_template(reader.header()); let genome_sizes = get_genome_sizes(&header)?; // Split genome into regions let regions = split_genome_into_n_regions_exact(&genome_sizes, n_parts); info!("{:#?}", regions); Ok(()) } }