| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398 |
- use anyhow::Context;
- use bitcode::{Decode, Encode};
- use glob::glob;
- use log::{debug, error, warn};
- use rust_htslib::bam::Read;
- use rustc_hash::FxHashMap;
- use serde::{Deserialize, Serialize};
- use std::{
- cmp::Ordering,
- collections::{HashMap, HashSet},
- fmt, fs,
- iter::Sum,
- ops::{Add, Div},
- path::{Path, PathBuf},
- sync::{atomic::AtomicBool, OnceLock},
- };
- use uuid::Uuid;
- pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String> {
- let mut matching_files = Vec::new();
- for entry in
- fs::read_dir(dir_path).with_context(|| format!("Failed to read directory: {}", dir_path))?
- {
- let entry = entry.with_context(|| "Failed to read directory entry")?;
- let path = entry.path();
- if path.is_file()
- && path
- .file_name()
- .and_then(|name| name.to_str())
- .map(|name| name.ends_with(suffix))
- .unwrap_or(false)
- {
- matching_files.push(path);
- }
- }
- match matching_files.len() {
- 0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix))
- .with_context(|| format!("In directory: {}", dir_path)),
- 1 => Ok(matching_files[0].to_string_lossy().into_owned()),
- _ => {
- matching_files.sort();
- let matches = matching_files
- .iter()
- .map(|p| p.display().to_string())
- .collect::<Vec<_>>()
- .join(", ");
- Err(anyhow::anyhow!(
- "Multiple files found ending with '{}': {}",
- suffix,
- matches
- ))
- .with_context(|| format!("In directory: {}", dir_path))
- }
- }
- }
- /// Return the output prefix by removing all suffixes from the first dot in the filename.
- ///
- /// This is intentionally stricter than [`Path::file_stem`]: filenames such as
- /// `sample.v1.vcf.gz` become `sample`, not `sample.v1.vcf`. This matches tools
- /// that expect a prefix rather than a single-extension stem.
- ///
- /// # Examples
- ///
- /// ```text
- /// /tmp/sample.vcf.gz -> /tmp/sample
- /// /tmp/sample.v1.vcf.gz -> /tmp/sample
- /// ```
- pub fn path_prefix(out: &str) -> anyhow::Result<String> {
- let out_path = Path::new(out);
- let out_dir = out_path
- .parent()
- .ok_or_else(|| anyhow::anyhow!("Can't parse the dir of {}", out_path.display()))?;
- let name = out_path
- .file_name()
- .and_then(|name| name.to_str())
- .ok_or_else(|| anyhow::anyhow!("Can't parse the file name of {}", out_path.display()))?;
- let stem = name
- .split_once('.')
- .map(|(stem, _)| stem)
- .ok_or_else(|| anyhow::anyhow!("Can't parse the file stem of {}", name))?;
- Ok(format!("{}/{stem}", out_dir.display()))
- }
- pub fn force_or_not(_path: &str, _force: bool) -> anyhow::Result<()> {
- // let path = Path::new(path);
- // let mut output_exists = path.exists();
- // let dir = path
- // .parent()
- // .context(format!("Can't parse the parent dir of {}", path.display()))?;
- //
- // if force && output_exists {
- // fs::remove_dir_all(dir)?;
- // fs::create_dir_all(dir)?;
- // output_exists = false;
- // }
- //
- // if output_exists {
- // info!("{} already exists.", path.display())
- // }
- Ok(())
- }
- /// Builds Singularity bind flags from input/output paths.
- ///
- /// - Converts file paths to their parent directory.
- /// - Skips non-existing paths to avoid Singularity errors.
- /// - Canonicalizes and deduplicates directories.
- pub fn singularity_bind_flags<I, P>(paths: I) -> String
- where
- I: IntoIterator<Item = P>,
- P: AsRef<Path>,
- {
- let mut seen = HashSet::new();
- let mut binds = Vec::new();
- let mut push_dir = |p: &Path| {
- if p.as_os_str().is_empty() || !p.exists() {
- return;
- }
- let dir = p.canonicalize().unwrap_or_else(|_| p.to_path_buf());
- if seen.insert(dir.clone()) {
- binds.push(format!(
- "--bind {src}:{dst}",
- src = dir.display(),
- dst = dir.display()
- ));
- }
- };
- for path_str in paths {
- let path = path_str.as_ref();
- if path.is_dir() {
- push_dir(path);
- } else if let Some(parent) = path.parent() {
- push_dir(parent);
- }
- }
- binds.join(" ")
- }
- use std::cmp::Ord;
- pub struct VectorIntersection<T> {
- pub common: Vec<T>,
- pub only_in_first: Vec<T>,
- pub only_in_second: Vec<T>,
- }
- impl<T> fmt::Display for VectorIntersection<T> {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- let total_items = self.common.len() + self.only_in_first.len() + self.only_in_second.len();
- writeln!(f, "Total items: {}", total_items)?;
- writeln!(
- f,
- "Common: {} ({:.2}%)",
- self.common.len(),
- percentage(self.common.len(), total_items)
- )?;
- writeln!(
- f,
- "Only in first: {} ({:.2}%)",
- self.only_in_first.len(),
- percentage(self.only_in_first.len(), total_items)
- )?;
- writeln!(
- f,
- "Only in second: {} ({:.2}%)",
- self.only_in_second.len(),
- percentage(self.only_in_second.len(), total_items)
- )
- }
- }
- fn percentage(part: usize, total: usize) -> f64 {
- if total == 0 {
- 0.0
- } else {
- (part as f64 / total as f64) * 100.0
- }
- }
- impl<T> Default for VectorIntersection<T> {
- fn default() -> Self {
- Self {
- common: Vec::new(),
- only_in_first: Vec::new(),
- only_in_second: Vec::new(),
- }
- }
- }
- impl<T: Ord + Clone> VectorIntersection<T> {
- fn merge(&mut self, other: &mut Self) {
- self.common.append(&mut other.common);
- self.only_in_first.append(&mut other.only_in_first);
- self.only_in_second.append(&mut other.only_in_second);
- }
- }
- fn intersection<T: Ord + Clone>(vec1: &[T], vec2: &[T]) -> VectorIntersection<T> {
- let mut result = VectorIntersection::default();
- let mut i = 0;
- let mut j = 0;
- while i < vec1.len() && j < vec2.len() {
- match vec1[i].cmp(&vec2[j]) {
- Ordering::Less => {
- result.only_in_first.push(vec1[i].clone());
- i += 1;
- }
- Ordering::Greater => {
- result.only_in_second.push(vec2[j].clone());
- j += 1;
- }
- Ordering::Equal => {
- let val = &vec1[i];
- let mut count1 = 1;
- let mut count2 = 1;
- // Count occurrences in vec1
- while i + 1 < vec1.len() && &vec1[i + 1] == val {
- i += 1;
- count1 += 1;
- }
- // Count occurrences in vec2
- while j + 1 < vec2.len() && &vec2[j + 1] == val {
- j += 1;
- count2 += 1;
- }
- // Add to common
- result
- .common
- .extend(std::iter::repeat_n(val.clone(), count1.min(count2)));
- // Add excess to only_in_first or only_in_second
- match count1.cmp(&count2) {
- Ordering::Greater => {
- result
- .only_in_first
- .extend(std::iter::repeat_n(val.clone(), count1 - count2));
- }
- Ordering::Less => {
- result
- .only_in_second
- .extend(std::iter::repeat_n(val.clone(), count2 - count1));
- }
- Ordering::Equal => {
- // No excess elements, do nothing
- }
- }
- i += 1;
- j += 1;
- }
- }
- }
- result.only_in_first.extend(vec1[i..].iter().cloned());
- result.only_in_second.extend(vec2[j..].iter().cloned());
- result
- }
- pub fn par_intersection<T: Ord + Send + Sync + Clone>(
- vec1: &[T],
- vec2: &[T],
- ) -> VectorIntersection<T> {
- par_intersection_by_value(vec1, vec2)
- }
- fn par_intersection_by_value<T: Ord + Send + Sync + Clone>(
- vec1: &[T],
- vec2: &[T],
- ) -> VectorIntersection<T> {
- const SEQUENTIAL_THRESHOLD: usize = 4096;
- if vec1.len() + vec2.len() <= SEQUENTIAL_THRESHOLD {
- return intersection(vec1, vec2);
- }
- if vec1.is_empty() {
- return VectorIntersection {
- common: Vec::new(),
- only_in_first: Vec::new(),
- only_in_second: vec2.to_vec(),
- };
- }
- if vec2.is_empty() {
- return VectorIntersection {
- common: Vec::new(),
- only_in_first: vec1.to_vec(),
- only_in_second: Vec::new(),
- };
- }
- let pivot = &vec1[vec1.len() / 2];
- let v1_left = vec1.partition_point(|x| x < pivot);
- let v1_right = vec1.partition_point(|x| x <= pivot);
- let v2_left = vec2.partition_point(|x| x < pivot);
- let v2_right = vec2.partition_point(|x| x <= pivot);
- let (mut left, mut right) = rayon::join(
- || par_intersection_by_value(&vec1[..v1_left], &vec2[..v2_left]),
- || par_intersection_by_value(&vec1[v1_right..], &vec2[v2_right..]),
- );
- let count1 = v1_right - v1_left;
- let count2 = v2_right - v2_left;
- let mut middle = VectorIntersection::default();
- middle
- .common
- .extend(std::iter::repeat_n(pivot.clone(), count1.min(count2)));
- match count1.cmp(&count2) {
- Ordering::Greater => middle
- .only_in_first
- .extend(std::iter::repeat_n(pivot.clone(), count1 - count2)),
- Ordering::Less => middle
- .only_in_second
- .extend(std::iter::repeat_n(pivot.clone(), count2 - count1)),
- Ordering::Equal => {}
- }
- left.merge(&mut middle);
- left.merge(&mut right);
- left
- }
- pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
- let m = dna_sequence.len() as f64;
- // Early return for empty sequences
- if m == 0.0 {
- return 0.0;
- }
- // Count occurrences of each base
- let mut bases = HashMap::<char, usize>::new();
- for base in dna_sequence.chars() {
- *bases.entry(base).or_insert(0) += 1;
- }
- // Calculate Shannon entropy
- let shannon_entropy_value: f64 = bases
- .values()
- .map(|&n_i| {
- let p_i = n_i as f64 / m;
- if p_i > 0.0 {
- -p_i * p_i.log2()
- } else {
- 0.0 // Avoid log2(0)
- }
- })
- .sum();
- shannon_entropy_value
- }
- pub fn mean<T>(values: &[T]) -> f64
- where
- T: Copy + Add<Output = T> + Div<Output = T> + Sum + Into<f64>,
- {
- let count = values.len();
- if count == 0 {
- return 0.0;
- }
- let sum: T = values.iter().copied().sum();
- sum.into() / count as f64
- }
- pub fn bin_data<V>(data: Vec<V>, bin_size: V) -> Vec<(V, usize)>
- where
- V: Copy + Default + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
- {
- if data.is_empty() || bin_size.partial_cmp(&V::default()) != Some(Ordering::Greater) {
- return Vec::new();
- }
- // Sort comparable data points. For floats, this drops NaN values.
- let mut sorted_data: Vec<V> = data
- .into_iter()
- .filter(|v| v.partial_cmp(v) == Some(Ordering::Equal))
- .collect();
- if sorted_data.is_empty() {
- return Vec::new();
- }
- sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
- // Initialize bins
- let mut bins: Vec<(V, usize)> = Vec::new();
- let mut current_bin_start = sorted_data[0];
- let mut count = 0;
- for &value in &sorted_data {
- if value < current_bin_start + bin_size {
- count += 1;
- } else {
- bins.push((current_bin_start, count));
- current_bin_start += bin_size;
- while value >= current_bin_start + bin_size {
- current_bin_start += bin_size;
- }
- count = 1;
- }
- }
- // Push the last bin
- bins.push((current_bin_start, count));
- bins
- }
- // fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> {
- // if data.is_empty() || num_bins == 0 {
- // return vec![];
- // }
- //
- // let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize;
- //
- // (0..num_bins)
- // .map(|i| {
- // let start = i * bin_size;
- // let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()`
- //
- // // If `start` is out of bounds, return (0, 0)
- // if start >= data.len() {
- // return (0, 0);
- // }
- //
- // let bin = &data[start..end];
- //
- // let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum();
- // let count = bin.len() as u128;
- // let mean_x = (sum_x / count) as u32; // Rounded down to nearest u32
- //
- // let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum();
- //
- // (mean_x, sum_n)
- // })
- // .collect()
- // }
- //
- pub fn app_storage_dir() -> anyhow::Result<PathBuf> {
- let app_name = env!("CARGO_PKG_NAME");
- let app_dir = dirs::data_dir()
- .context("Failed to get data directory")?
- .join(app_name);
- if !app_dir.exists() {
- fs::create_dir_all(&app_dir).context("Failed to create application directory")?;
- }
- Ok(app_dir)
- }
- use blake3::Hasher as Blake3Hasher;
- use std::hash::{BuildHasher, Hasher};
- pub struct Blake3Hash(Blake3Hasher);
- impl Hasher for Blake3Hash {
- fn finish(&self) -> u64 {
- let hash = self.0.finalize();
- u64::from_le_bytes(hash.as_bytes()[..8].try_into().unwrap())
- }
- fn write(&mut self, bytes: &[u8]) {
- self.0.update(bytes);
- }
- }
- /// Default Hasher
- #[derive(Default, Clone)]
- pub struct Blake3BuildHasher;
- impl BuildHasher for Blake3BuildHasher {
- type Hasher = Blake3Hash;
- fn build_hasher(&self) -> Self::Hasher {
- Blake3Hash(Blake3Hasher::new())
- }
- }
- // Custom 128-bit hash type
- #[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug, Encode, Decode)]
- pub struct Hash128([u8; 16]);
- impl std::hash::Hash for Hash128 {
- fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
- state.write(&self.0);
- }
- }
- impl Hash128 {
- pub fn new(bytes: [u8; 16]) -> Self {
- Hash128(bytes)
- }
- pub fn to_bytes(&self) -> [u8; 16] {
- self.0
- }
- }
- pub fn get_dir_size(path: &Path) -> std::io::Result<u64> {
- let mut total_size = 0;
- if path.is_dir() {
- for entry in fs::read_dir(path)? {
- let entry = entry?;
- let path = entry.path();
- if path.is_file() {
- total_size += path.metadata()?.len();
- } else if path.is_dir() {
- total_size += get_dir_size(&path)?;
- }
- }
- }
- Ok(total_size)
- }
- /// Finds all files matching the given glob pattern.
- ///
- /// # Arguments
- ///
- /// * `pattern` - A glob pattern string (e.g., `"data/**/*.txt"`).
- ///
- /// # Returns
- ///
- /// * `Ok(Vec<PathBuf>)` with all successfully matched file paths.
- /// * `Err` if the glob pattern is invalid or any matched path fails to resolve.
- ///
- /// # Errors
- ///
- /// Returns an error if:
- /// - The glob pattern is invalid.
- /// - A file path matching the pattern cannot be resolved.
- ///
- /// # Examples
- ///
- /// ```rust
- /// let files = find_files("src/**/*.rs")?;
- /// for file in files {
- /// println!("{:?}", file);
- /// }
- /// ```
- pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
- let mut result = Vec::new();
- let entries = glob(pattern).with_context(|| format!("Invalid glob pattern: '{}'", pattern))?;
- for entry in entries {
- let path =
- entry.with_context(|| format!("Failed to resolve path for pattern '{}'", pattern))?;
- result.push(path);
- }
- result.sort();
- Ok(result)
- }
- // fn system_time_to_utc(system_time: SystemTime) -> Option<DateTime<Utc>> {
- // system_time
- // .duration_since(UNIX_EPOCH)
- // .ok()
- // .map(|duration| Utc.timestamp(duration.as_secs() as i64, duration.subsec_nanos()))
- // }
- /// List all files in a directory that have the given suffix (file extension).
- ///
- /// # Arguments
- /// * `dir` – Directory to scan.
- /// * `suffix` – File extension to match (without the dot).
- ///
- /// # Returns
- /// A vector of `PathBuf` entries whose extension matches `suffix`.
- ///
- /// # Errors
- /// Returns any I/O error from `read_dir` or directory traversal.
- pub fn list_files_with_ext(dir: impl AsRef<Path>, ext: &str) -> std::io::Result<Vec<PathBuf>> {
- let mut out = Vec::new();
- for entry in fs::read_dir(dir)? {
- let entry = entry?;
- let path = entry.path();
- if path.is_file() {
- if let Some(file_ext) = path.extension().and_then(|e| e.to_str()) {
- if file_ext == ext {
- out.push(path);
- }
- }
- }
- }
- out.sort();
- Ok(out)
- }
- pub fn list_directories(dir_path: PathBuf) -> std::io::Result<Vec<String>> {
- let mut directories = Vec::new();
- for entry in fs::read_dir(dir_path)? {
- let entry = entry?;
- let path = entry.path();
- // Check if the path is a directory
- if path.is_dir() {
- if let Some(dir_name) = path.file_name().and_then(|name| name.to_str()) {
- directories.push(dir_name.to_string());
- }
- }
- }
- directories.sort();
- Ok(directories)
- }
- pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
- fn walk(dir: &Path, out: &mut Vec<PathBuf>) {
- let entries = match fs::read_dir(dir) {
- Ok(entries) => entries,
- Err(e) => {
- warn!("Cannot read directory {}: {}", dir.display(), e);
- return;
- }
- };
- for entry in entries {
- let entry = match entry {
- Ok(entry) => entry,
- Err(e) => {
- warn!("Cannot read directory entry in {}: {}", dir.display(), e);
- continue;
- }
- };
- let path = entry.path();
- if path.is_dir() {
- walk(&path, out);
- } else if path.is_file() {
- out.push(path);
- }
- }
- }
- let mut files = Vec::new();
- walk(root.as_ref(), &mut files);
- files.sort();
- files
- }
- /// Checks whether the modification time of `file1` is older than `file2`.
- ///
- /// If `rm` is `true` and `file1` is older, attempts to remove the directory containing `file1`.
- ///
- /// # Arguments
- ///
- /// * `file1` - Path to the first file.
- /// * `file2` - Path to the second file.
- /// * `rm` - If true, and `file1` is older, attempts to remove its parent directory.
- ///
- /// # Returns
- ///
- /// * `Ok(true)` if `file1` is older than `file2`.
- /// * `Ok(false)` otherwise.
- ///
- /// # Errors
- ///
- /// Returns an [`anyhow::Error`] if:
- /// - Either `file1` or `file2` does not exist.
- /// - File metadata cannot be read.
- /// - File modification times cannot be retrieved.
- /// - Directory removal fails when `rm == true` and `file1` is older.
- ///
- pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
- debug!("is_file_older {file1} {file2}, {rm:?}");
- let mtime1 = fs::metadata(file1)
- .with_context(|| format!("Failed to read metadata for '{}'", file1))?
- .modified()
- .with_context(|| format!("Failed to get modified time for '{}'", file1))?;
- let mtime2 = fs::metadata(file2)
- .with_context(|| format!("Failed to read metadata for '{}'", file2))?
- .modified()
- .with_context(|| format!("Failed to get modified time for '{}'", file2))?;
- if mtime1 < mtime2 && rm {
- if let Some(file1_dir) = Path::new(file1).parent() {
- warn!("Removing old directory: {}", file1_dir.display());
- fs::remove_dir_all(file1_dir).map_err(|e| {
- warn!("Failed to remove {}: {}", file1_dir.display(), e);
- e
- })?;
- }
- }
- Ok(mtime1 < mtime2)
- }
- pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
- debug!("Trying to remove: {dir}");
- match fs::remove_dir_all(dir) {
- Ok(_) => {}
- Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
- Err(e) => anyhow::bail!("Failed to remove directory '{}': {}", dir, e),
- }
- Ok(())
- }
- /// Searches a directory for the first file matching a given name pattern.
- ///
- /// This function looks for a file in the given directory whose filename
- /// starts with the provided `starts_with` prefix and ends with the provided
- /// `ends_with` suffix. It returns the full path to the first match found.
- ///
- /// # Arguments
- ///
- /// * `dir` - A reference to the directory path where the search will occur.
- /// * `starts_with` - The required prefix of the filename (e.g., `"throughput_"`).
- /// * `ends_with` - The required suffix of the filename (e.g., `".csv"`).
- ///
- /// # Returns
- ///
- /// * `Some(PathBuf)` - Path to the first file matching the pattern.
- /// * `None` - If no matching file is found or the directory can't be read.
- ///
- /// # Example
- ///
- /// ```rust
- /// let dir = std::path::Path::new("/path/to/data");
- /// if let Some(path) = find_matching_file(dir, "throughput_", ".csv") {
- /// println!("Found file: {}", path.display());
- /// } else {
- /// eprintln!("No matching file found.");
- /// }
- /// ```
- pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
- let mut matches: Vec<_> = fs::read_dir(dir)
- .ok()?
- .filter_map(Result::ok)
- .map(|entry| entry.path())
- .filter(|path| {
- path.is_file()
- && path
- .file_name()
- .and_then(|name| name.to_str())
- .map(|name| name.starts_with(starts_with) && name.ends_with(ends_with))
- .unwrap_or(false)
- })
- .collect();
- matches.sort();
- matches.into_iter().next()
- }
- /// Searches for the first file in the given directory whose file name
- /// satisfies the provided condition.
- ///
- /// # Arguments
- ///
- /// * `dir` - Path to the directory to search.
- /// * `condition` - A closure that takes a file name (`&str`) and returns `true`
- /// if the file matches the desired condition.
- ///
- /// # Returns
- ///
- /// An `Option<PathBuf>` containing the path to the first matching file,
- /// or `None` if no file matches or the directory can't be read.
- ///
- /// # Examples
- ///
- /// ```
- /// use std::path::Path;
- /// let result = find_file(Path::new("."), |name| name.ends_with(".rs"));
- /// ```
- pub fn find_file<F>(dir: &Path, condition: F) -> Option<PathBuf>
- where
- F: Fn(&str) -> bool,
- {
- let mut matches: Vec<_> = fs::read_dir(dir)
- .ok()?
- .filter_map(Result::ok)
- .map(|entry| entry.path())
- .filter(|path| {
- path.is_file()
- && path
- .file_name()
- .and_then(|name| name.to_str())
- .map(&condition)
- .unwrap_or(false)
- })
- .collect();
- matches.sort();
- matches.into_iter().next()
- }
- /// Represents a string made of either:
- /// - one character repeated N times,
- /// - a two-character block repeated N times,
- /// - or no valid repetition pattern.
- #[derive(Debug, PartialEq)]
- pub enum Repeat {
- None,
- RepOne(char, usize),
- RepTwo(String, usize),
- }
- pub fn detect_repetition(s: &str) -> Repeat {
- let len = s.chars().count();
- if len == 0 {
- return Repeat::None;
- }
- // Check for single-char repetition
- let mut chars = s.chars();
- let first = chars.next().unwrap();
- if s.chars().all(|c| c == first) {
- return Repeat::RepOne(first, len);
- }
- // Check for two-char block repetition
- if len.is_multiple_of(2) {
- let mut iter = s.chars();
- let a = iter.next().unwrap();
- let b = iter.next().unwrap();
- let mut count = 1;
- while let (Some(c), Some(d)) = (iter.next(), iter.next()) {
- if c != a || d != b {
- return Repeat::None;
- }
- count += 1;
- }
- return Repeat::RepTwo(format!("{}{}", a, b), count);
- }
- Repeat::None
- }
- pub fn test_init() {
- let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug"))
- .try_init();
- }
- use regex::Regex;
- /// Extracts the numeric barcode suffix from a filename.
- ///
- /// # Examples
- ///
- /// ```
- /// let f = "sample_SQK-NBD114-24_barcode07.bam";
- /// assert_eq!(extract_barcode(f), Some(7));
- /// ```
- ///
- /// Returns `None` if no `barcodeNN` pattern is found
- /// or if the number cannot be parsed.
- pub fn extract_barcode(name: &str) -> Option<u32> {
- static BARCODE_RE: OnceLock<Regex> = OnceLock::new();
- let re = BARCODE_RE.get_or_init(|| Regex::new(r"barcode(\d+)").unwrap());
- re.captures(name)
- .and_then(|c| c.get(1))
- .and_then(|m| m.as_str().parse::<u32>().ok())
- }
- pub fn human_size(bytes: u64) -> String {
- const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"];
- let mut size = bytes as f64;
- let mut unit = 0;
- while size >= 1024.0 && unit < UNITS.len() - 1 {
- size /= 1024.0;
- unit += 1;
- }
- if unit == 0 {
- format!("{} {}", bytes, UNITS[unit])
- } else {
- format!("{:.2} {}", size, UNITS[unit])
- }
- }
- /// RAII guard that removes a temporary directory when dropped.
- pub struct TempDirGuard {
- path: PathBuf,
- }
- impl TempDirGuard {
- pub fn new(path: PathBuf) -> Self {
- Self { path }
- }
- }
- impl Drop for TempDirGuard {
- fn drop(&mut self) {
- if self.path.exists() {
- // Log the attempt. Using eprintln/warn! ensures the error is noted,
- // but the program does not panic.
- if let Err(e) = fs::remove_dir_all(&self.path) {
- error!(
- "Failed to remove temporary directory '{}': {}",
- self.path.display(),
- e
- );
- }
- }
- }
- }
- /// Removes a BAM file and its associated index file (`.bam.bai` or `.bai`).
- pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> {
- if bam.exists() {
- fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?;
- }
- let bai = bam.with_extension("bam.bai");
- if bai.exists() {
- fs::remove_file(&bai).ok();
- }
- let bai_alt = bam.with_extension("bai");
- if bai_alt.exists() {
- fs::remove_file(&bai_alt).ok();
- }
- Ok(())
- }
- /// Guard that tracks temporary filesystem paths and cleans them up on drop
- /// unless explicitly disarmed.
- ///
- /// This is intended for pipeline-style code where intermediate outputs
- /// are produced by downstream tools. The guard only tracks paths; it does
- /// **not** create files itself.
- ///
- /// If the pipeline completes successfully, call [`disarm`] to prevent
- /// cleanup. Otherwise, all tracked paths are removed on drop on a
- /// best-effort basis.
- ///
- /// # Example
- ///
- /// ```no_run
- /// use std::path::PathBuf;
- ///
- /// let tmp_dir = PathBuf::from("/tmp");
- /// let mut guard = TempFileGuard::new();
- ///
- /// let path1 = guard.temp_path(".tmp", &tmp_dir);
- /// let path2 = guard.temp_path(".tmp", &tmp_dir);
- ///
- /// // Produce outputs at `path1` and `path2`
- ///
- /// // Mark success
- /// guard.disarm();
- /// ```
- ///
- /// If `disarm()` is not called, both paths are cleaned up automatically
- /// when the guard is dropped.
- pub struct TempFileGuard {
- files: Vec<PathBuf>,
- disarmed: AtomicBool,
- }
- impl Default for TempFileGuard {
- fn default() -> Self {
- Self::new()
- }
- }
- impl TempFileGuard {
- pub fn new() -> Self {
- Self {
- files: Vec::new(),
- disarmed: AtomicBool::new(false),
- }
- }
- /// Registers a temporary file for cleanup.
- pub fn track(&mut self, path: PathBuf) {
- self.files.push(path);
- }
- /// Registers multiple temporary files for cleanup.
- pub fn track_many(&mut self, paths: impl IntoIterator<Item = PathBuf>) {
- self.files.extend(paths);
- }
- /// Disarms the guard, preventing cleanup on drop.
- /// Call this when the pipeline succeeds.
- pub fn disarm(&self) {
- self.disarmed
- .store(true, std::sync::atomic::Ordering::SeqCst);
- }
- /// Manually clean up all tracked files.
- pub fn cleanup(&mut self) {
- for file in &self.files {
- if let Err(e) = remove_tracked_path(file) {
- warn!("Failed to clean up temp file {}: {}", file.display(), e);
- }
- }
- self.files.clear();
- }
- pub fn clear(&mut self) {
- self.files.clear();
- }
- /// Generates a unique temporary path inside `tmp_dir` and registers it
- /// for cleanup.
- ///
- /// The returned path is derived from a UUIDv4 and **no file is created**.
- /// The caller is responsible for creating or writing to the path.
- ///
- /// # Example
- ///
- /// ```no_run
- /// use std::path::PathBuf;
- ///
- /// let tmp_dir = PathBuf::from("/tmp");
- /// let mut guard = TempFileGuard::new();
- ///
- /// let path = guard.temp_path(".tmp", &tmp_dir);
- /// // create or write to `path`
- /// ```
- pub fn tmp_path(&mut self, suffix: &str, tmp_dir: impl AsRef<Path>) -> PathBuf {
- let name = format!("pandora-tmp-{}{}", Uuid::new_v4(), suffix);
- let tmp_dir = tmp_dir.as_ref();
- let path = tmp_dir.join(name);
- self.track(path.clone());
- path
- }
- }
- fn remove_tracked_path(path: &Path) -> anyhow::Result<()> {
- if path.is_dir() {
- fs::remove_dir_all(path)
- .with_context(|| format!("Failed to remove temp directory: {}", path.display()))?;
- } else if path.exists() {
- fs::remove_file(path)
- .with_context(|| format!("Failed to remove temp file: {}", path.display()))?;
- }
- if path.extension().and_then(|e| e.to_str()) == Some("bam") {
- remove_existing_file(path.with_extension("bam.bai"));
- remove_existing_file(path.with_extension("bai"));
- }
- Ok(())
- }
- fn remove_existing_file(path: PathBuf) {
- if path.exists() {
- let _ = fs::remove_file(path);
- }
- }
- impl Drop for TempFileGuard {
- fn drop(&mut self) {
- if !self.disarmed.load(std::sync::atomic::Ordering::SeqCst) {
- warn!(
- "Pipeline failed or panicked, cleaning up {} temporary files",
- self.files.len()
- );
- self.cleanup();
- }
- }
- }
- // pub fn temp_file_path(suffix: &str, config: &Config) -> std::io::Result<PathBuf> {
- // let p = config.tmp_dir;
- // let temp_path = tempfile::Builder::new()
- // .prefix("pandora-temp-")
- // .suffix(suffix)
- // .rand_bytes(5)
- // .tempfile()?
- // .into_temp_path();
- //
- // Ok(temp_path.to_path_buf())
- // }
- /// Extracts genome sizes from BAM header.
- /// Extract contig names and lengths from a BAM header as a hash map.
- ///
- /// This delegates parsing to [`crate::io::bam::get_genome_sizes`]. Because the
- /// return type is a hash map, iteration order is not BAM header order; use
- /// [`get_genome_sizes_in_header_order`] when order matters.
- pub fn get_genome_sizes(
- header: &rust_htslib::bam::Header,
- ) -> anyhow::Result<FxHashMap<String, u64>> {
- Ok(crate::io::bam::get_genome_sizes(header)?
- .into_iter()
- .collect())
- }
- /// Extract contig names and lengths from a BAM header in header order.
- ///
- /// Use this when downstream region generation must follow the BAM/reference
- /// contig order. [`get_genome_sizes`] returns a hash map and does not preserve
- /// that order.
- pub fn get_genome_sizes_in_header_order(
- header: &rust_htslib::bam::HeaderView,
- ) -> anyhow::Result<Vec<(String, u64)>> {
- (0..header.target_count())
- .map(|tid| {
- let name = String::from_utf8_lossy(header.tid2name(tid)).into_owned();
- let len = header
- .target_len(tid)
- .with_context(|| format!("BAM header target {name} is missing length"))?;
- Ok((name, len))
- })
- .collect()
- }
- pub fn bam_contigs<P: AsRef<std::path::Path>>(bam: P) -> anyhow::Result<Vec<String>> {
- let reader = rust_htslib::bam::Reader::from_path(&bam)?;
- Ok(reader
- .header()
- .target_names()
- .iter()
- .map(|name| String::from_utf8_lossy(name).into_owned())
- .collect())
- }
- /// Split genome into ~n_parts equal-sized chunks (by number of bases),
- /// returning for each chunk a list of regions in `ctg:start-end` form.
- /// Split genome into ~n_parts equal-sized chunks (by bases).
- /// Each chunk is a single region `ctg:start-end` (no cross-contig regions).
- ///
- /// Note: the number of regions returned will be ≈ n_parts (may differ by 1–2).
- pub fn split_genome_into_n_regions(
- genome_sizes: &FxHashMap<String, u64>,
- n_parts: usize,
- ) -> Vec<String> {
- if n_parts == 0 || genome_sizes.is_empty() {
- return Vec::new();
- }
- // Deterministic contig order
- let mut contigs: Vec<(String, u64)> = genome_sizes
- .iter()
- .map(|(ctg, len)| (ctg.clone(), *len))
- .collect();
- contigs.sort_by(|a, b| a.0.cmp(&b.0));
- let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum();
- if total_bases == 0 {
- return Vec::new();
- }
- let target_chunk_size: u64 = total_bases.div_ceil(n_parts as u64); // ceil
- let mut regions = Vec::new();
- for (ctg, len) in contigs {
- let mut remaining = len;
- let mut start: u64 = 1;
- while remaining > 0 {
- let take = remaining.min(target_chunk_size);
- let end = start + take - 1;
- regions.push(format!("{ctg}:{start}-{end}"));
- remaining -= take;
- start = end + 1;
- }
- }
- regions
- }
- /// Split genome into exactly `n_parts` chunks with (almost) equal total size.
- /// Each chunk is a *list* of regions `ctg:start-end`. A chunk may span several
- /// contigs, but each region is within a single contig.
- ///
- /// The sizes of the parts will differ by at most 1 base.
- pub fn split_genome_into_n_regions_exact(
- genome_sizes: &FxHashMap<String, u64>,
- n_parts: usize,
- ) -> Vec<Vec<String>> {
- let mut contigs: Vec<(String, u64)> = genome_sizes
- .iter()
- .map(|(ctg, len)| (ctg.clone(), *len))
- .collect();
- contigs.sort_by(|a, b| a.0.cmp(&b.0));
- split_ordered_genome_into_n_regions_exact(&contigs, n_parts)
- }
- pub fn split_ordered_genome_into_n_regions_exact(
- genome_sizes: &[(String, u64)],
- n_parts: usize,
- ) -> Vec<Vec<String>> {
- if n_parts == 0 || genome_sizes.is_empty() {
- return Vec::new();
- }
- let total_bases: u64 = genome_sizes.iter().map(|(_, len)| *len).sum();
- if total_bases == 0 {
- return Vec::new();
- }
- // We cannot have more non-empty parts than bases if we use 1-based inclusive coordinates.
- let parts = n_parts.min(total_bases as usize);
- // Distribute bases as evenly as possible:
- // first `remainder` parts get (base_per_part + 1), the rest get base_per_part.
- let base_per_part = total_bases / parts as u64;
- let remainder = total_bases % parts as u64;
- let target_sizes: Vec<u64> = (0..parts)
- .map(|i| {
- if i < remainder as usize {
- base_per_part + 1
- } else {
- base_per_part
- }
- })
- .collect();
- let mut chunks: Vec<Vec<String>> = Vec::with_capacity(parts);
- let mut part_idx = 0;
- let mut remaining_in_part = target_sizes[0];
- for (ctg, mut len) in genome_sizes.iter().cloned() {
- let mut start: u64 = 1;
- while len > 0 && part_idx < parts {
- let take = len.min(remaining_in_part);
- let end = start + take - 1;
- if chunks.len() <= part_idx {
- chunks.push(Vec::new());
- }
- chunks[part_idx].push(format!("{ctg}:{start}-{end}"));
- len -= take;
- start = end + 1;
- remaining_in_part -= take;
- if remaining_in_part == 0 {
- part_idx += 1;
- if part_idx == parts {
- break;
- }
- remaining_in_part = target_sizes[part_idx];
- }
- }
- }
- // Sanity: we should have exactly `parts` chunks, all non-empty
- debug_assert_eq!(chunks.len(), parts);
- chunks
- }
- /// Convert a numeric value to an RGB triplet using a diverging palette.
- ///
- /// Values are clipped to `[-clip, +clip]` and mapped as:
- /// - negative → blue
- /// - zero → white
- /// - positive → red
- ///
- /// # Arguments
- /// * `v` – value to color (e.g. delta methylation)
- /// * `clip` – symmetric clipping bound (must be > 0)
- ///
- /// # Returns
- /// `(r, g, b)` as 8-bit RGB components.
- ///
- /// # Panics
- /// Panics if `clip <= 0.0`.
- pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
- assert!(clip > 0.0, "clip must be > 0");
- let x = v.clamp(-clip, clip);
- let t = (x + clip) / (2.0 * clip); // [-clip,+clip] → [0,1]
- if t < 0.5 {
- // blue → white
- let u = t / 0.5;
- let r = (255.0 * u).round() as u8;
- let g = (255.0 * u).round() as u8;
- (r, g, 255)
- } else {
- // white → red
- let u = (t - 0.5) / 0.5;
- let g = (255.0 * (1.0 - u)).round() as u8;
- let b = (255.0 * (1.0 - u)).round() as u8;
- (255, g, b)
- }
- }
- pub fn revcomp(s: &str) -> String {
- fn comp(b: u8) -> u8 {
- match b {
- b'A' => b'T',
- b'C' => b'G',
- b'G' => b'C',
- b'T' => b'A',
- b'N' => b'N',
- _ => b'N',
- }
- }
- let v: Vec<u8> = s
- .as_bytes()
- .iter()
- .rev()
- .map(|&b| comp(b.to_ascii_uppercase()))
- .collect();
- String::from_utf8(v).unwrap()
- }
- #[cfg(test)]
- mod tests {
- use log::info;
- use rust_htslib::bam::{self, Read};
- use crate::helpers::test_init;
- use super::*;
- #[test]
- fn par_intersection_keeps_values_only_in_second_outside_first_range() {
- let res = par_intersection(&[10], &[1]);
- assert_eq!(res.common, Vec::<i32>::new());
- assert_eq!(res.only_in_first, vec![10]);
- assert_eq!(res.only_in_second, vec![1]);
- }
- #[test]
- fn par_intersection_handles_duplicate_counts() {
- let res = par_intersection(&[1, 2, 2, 3], &[0, 2, 2, 2, 4]);
- assert_eq!(res.common, vec![2, 2]);
- assert_eq!(res.only_in_first, vec![1, 3]);
- assert_eq!(res.only_in_second, vec![0, 2, 4]);
- }
- #[test]
- fn bin_data_handles_empty_and_invalid_bin_size() {
- assert_eq!(bin_data::<f64>(Vec::new(), 0.1), Vec::<(f64, usize)>::new());
- assert_eq!(bin_data(vec![1.0, 2.0], 0.0), Vec::<(f64, usize)>::new());
- assert_eq!(
- bin_data(vec![1.0, 2.0], f64::NAN),
- Vec::<(f64, usize)>::new()
- );
- assert_eq!(bin_data(vec![f64::NAN], 0.1), Vec::<(f64, usize)>::new());
- }
- #[test]
- fn bin_data_keeps_sparse_values_in_aligned_bins() {
- assert_eq!(bin_data(vec![0.0, 1.0], 0.25), vec![(0.0, 1), (1.0, 1)]);
- }
- #[test]
- fn split_ordered_genome_regions_preserves_input_contig_order() {
- let genome = vec![("chr2".to_string(), 2), ("chr1".to_string(), 2)];
- assert_eq!(
- split_ordered_genome_into_n_regions_exact(&genome, 2),
- vec![vec!["chr2:1-2".to_string()], vec!["chr1:1-2".to_string()]]
- );
- }
- #[test]
- fn split_g() -> anyhow::Result<()> {
- test_init();
- let n_parts = 4;
- let reader = bam::Reader::from_path(
- "/mnt/beegfs02/scratch/t_steimle/data/wgs/34528/norm/34528_norm_hs1.bam",
- )
- .with_context(|| format!("Failed to open BAM"))?;
- let header = bam::Header::from_template(reader.header());
- let genome_sizes = get_genome_sizes(&header)?;
- // Split genome into regions
- let regions = split_genome_into_n_regions_exact(&genome_sizes, n_parts);
- info!("{:#?}", regions);
- Ok(())
- }
- }
|