Thomas
/
pandora_lib_promethion


			
				
					
						
						
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214
							use anyhow::Context;
use bitcode::{Decode, Encode};
use glob::glob;
use log::{debug, error, warn};
use rust_htslib::bam::Read;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use std::{
    cmp::Ordering,
    collections::{HashMap, HashSet},
    fmt, fs,
    iter::Sum,
    ops::{Add, Div},
    path::{Path, PathBuf},
    sync::atomic::AtomicBool,
};

pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String> {
    let mut matching_files = Vec::new();

    for entry in
        fs::read_dir(dir_path).with_context(|| format!("Failed to read directory: {}", dir_path))?
    {
        let entry = entry.with_context(|| "Failed to read directory entry")?;
        let path = entry.path();

        if path.is_file()
            && path
                .file_name()
                .and_then(|name| name.to_str())
                .map(|name| name.ends_with(suffix))
                .unwrap_or(false)
        {
            matching_files.push(path);
        }
    }

    match matching_files.len() {
        0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix))
            .with_context(|| format!("In directory: {}", dir_path)),
        1 => Ok(matching_files[0].to_string_lossy().into_owned()),
        _ => Err(anyhow::anyhow!(
            "Multiple files found ending with '{}'",
            suffix
        ))
        .with_context(|| format!("In directory: {}", dir_path)),
    }
}

pub fn path_prefix(out: &str) -> anyhow::Result<String> {
    let out_path = Path::new(out);

    let out_dir = out_path
        .parent()
        .ok_or_else(|| anyhow::anyhow!("Can't parse the dir of {}", out_path.display()))?;

    let name = out_path
        .file_name()
        .and_then(|name| name.to_str())
        .ok_or_else(|| anyhow::anyhow!("Can't parse the file name of {}", out_path.display()))?;

    let stem = name
        .split_once('.')
        .map(|(stem, _)| stem)
        .ok_or_else(|| anyhow::anyhow!("Can't parse the file stem of {}", name))?;

    Ok(format!("{}/{stem}", out_dir.display()))
}

pub fn force_or_not(_path: &str, _force: bool) -> anyhow::Result<()> {
    // let path = Path::new(path);
    // let mut output_exists = path.exists();
    // let dir = path
    //     .parent()
    //     .context(format!("Can't parse the parent dir of {}", path.display()))?;
    //
    // if force && output_exists {
    //     fs::remove_dir_all(dir)?;
    //     fs::create_dir_all(dir)?;
    //     output_exists = false;
    // }
    //
    // if output_exists {
    //     info!("{} already exists.", path.display())
    // }
    Ok(())
}

/// Builds Singularity bind flags from input/output paths.
///
/// - Converts file paths to their parent directory.
/// - Skips non-existing paths to avoid Singularity errors.
/// - Canonicalizes and deduplicates directories.
pub fn singularity_bind_flags<I, P>(paths: I) -> String
where
    I: IntoIterator<Item = P>,
    P: AsRef<Path>,
{
    let mut seen = HashSet::new();
    let mut binds = Vec::new();

    let mut push_dir = |p: &Path| {
        if p.as_os_str().is_empty() || !p.exists() {
            return;
        }
        let dir = p.canonicalize().unwrap_or_else(|_| p.to_path_buf());
        if seen.insert(dir.clone()) {
            binds.push(format!(
                "--bind {src}:{dst}",
                src = dir.display(),
                dst = dir.display()
            ));
        }
    };

    for path_str in paths {
        let path = path_str.as_ref();
        if path.is_dir() {
            push_dir(path);
        } else if let Some(parent) = path.parent() {
            push_dir(parent);
        }
    }

    binds.join(" ")
}

use rayon::prelude::*;
use std::cmp::Ord;

pub struct VectorIntersection<T> {
    pub common: Vec<T>,
    pub only_in_first: Vec<T>,
    pub only_in_second: Vec<T>,
}

impl<T> fmt::Display for VectorIntersection<T> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let total_items = self.common.len() + self.only_in_first.len() + self.only_in_second.len();

        writeln!(f, "Total items: {}", total_items)?;
        writeln!(
            f,
            "Common: {} ({:.2}%)",
            self.common.len(),
            percentage(self.common.len(), total_items)
        )?;
        writeln!(
            f,
            "Only in first: {} ({:.2}%)",
            self.only_in_first.len(),
            percentage(self.only_in_first.len(), total_items)
        )?;
        writeln!(
            f,
            "Only in second: {} ({:.2}%)",
            self.only_in_second.len(),
            percentage(self.only_in_second.len(), total_items)
        )
    }
}

fn percentage(part: usize, total: usize) -> f64 {
    if total == 0 {
        0.0
    } else {
        (part as f64 / total as f64) * 100.0
    }
}

impl<T> Default for VectorIntersection<T> {
    fn default() -> Self {
        Self {
            common: Vec::new(),
            only_in_first: Vec::new(),
            only_in_second: Vec::new(),
        }
    }
}

impl<T: Ord + Clone> VectorIntersection<T> {
    fn merge(&mut self, other: &mut Self) {
        self.common.append(&mut other.common);
        self.only_in_first.append(&mut other.only_in_first);
        self.only_in_second.append(&mut other.only_in_second);
    }
}

fn intersection<T: Ord + Clone>(vec1: &[T], vec2: &[T]) -> VectorIntersection<T> {
    let mut result = VectorIntersection::default();
    let mut i = 0;
    let mut j = 0;

    while i < vec1.len() && j < vec2.len() {
        match vec1[i].cmp(&vec2[j]) {
            Ordering::Less => {
                result.only_in_first.push(vec1[i].clone());
                i += 1;
            }
            Ordering::Greater => {
                result.only_in_second.push(vec2[j].clone());
                j += 1;
            }
            Ordering::Equal => {
                let val = &vec1[i];
                let mut count1 = 1;
                let mut count2 = 1;

                // Count occurrences in vec1
                while i + 1 < vec1.len() && &vec1[i + 1] == val {
                    i += 1;
                    count1 += 1;
                }

                // Count occurrences in vec2
                while j + 1 < vec2.len() && &vec2[j + 1] == val {
                    j += 1;
                    count2 += 1;
                }

                // Add to common
                result
                    .common
                    .extend(std::iter::repeat_n(val.clone(), count1.min(count2)));

                // Add excess to only_in_first or only_in_second
                match count1.cmp(&count2) {
                    Ordering::Greater => {
                        result
                            .only_in_first
                            .extend(std::iter::repeat_n(val.clone(), count1 - count2));
                    }
                    Ordering::Less => {
                        result
                            .only_in_second
                            .extend(std::iter::repeat_n(val.clone(), count2 - count1));
                    }
                    Ordering::Equal => {
                        // No excess elements, do nothing
                    }
                }
                i += 1;
                j += 1;
            }
        }
    }

    result.only_in_first.extend(vec1[i..].iter().cloned());
    result.only_in_second.extend(vec2[j..].iter().cloned());

    result
}

pub fn par_intersection<T: Ord + Send + Sync + Clone>(
    vec1: &[T],
    vec2: &[T],
) -> VectorIntersection<T> {
    let chunk_size = (vec1.len() / rayon::current_num_threads()).max(1);

    vec1.par_chunks(chunk_size)
        .map(|chunk| {
            let start = vec2.partition_point(|x| x < &chunk[0]);
            let end = vec2.partition_point(|x| x <= &chunk[chunk.len() - 1]);

            // Ensure start is not greater than end
            if start <= end {
                intersection(chunk, &vec2[start..end])
            } else {
                // If start > end, there's no intersection for this chunk
                VectorIntersection::default()
            }
        })
        .reduce(VectorIntersection::default, |mut acc, mut x| {
            acc.merge(&mut x);
            acc
        })
}

pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
    let m = dna_sequence.len() as f64;

    // Early return for empty sequences
    if m == 0.0 {
        return 0.0;
    }

    // Count occurrences of each base
    let mut bases = HashMap::<char, usize>::new();
    for base in dna_sequence.chars() {
        *bases.entry(base).or_insert(0) += 1;
    }

    // Calculate Shannon entropy
    let shannon_entropy_value: f64 = bases
        .values()
        .map(|&n_i| {
            let p_i = n_i as f64 / m;
            if p_i > 0.0 {
                -p_i * p_i.log2()
            } else {
                0.0 // Avoid log2(0)
            }
        })
        .sum();

    shannon_entropy_value
}

pub fn mean<T>(values: &[T]) -> f64
where
    T: Copy + Add<Output = T> + Div<Output = T> + Sum + Into<f64>,
{
    let count = values.len();

    if count == 0 {
        return 0.0;
    }

    let sum: T = values.iter().copied().sum();
    sum.into() / count as f64
}

pub fn bin_data<V>(data: Vec<V>, bin_size: V) -> Vec<(V, usize)>
where
    V: Copy + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
{
    // Sort the data
    let mut sorted_data = data.clone();
    sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());

    // Initialize bins
    let mut bins: Vec<(V, usize)> = Vec::new();
    let mut current_bin_start = sorted_data[0];
    let mut count = 0;

    for &value in &sorted_data {
        if value < current_bin_start + bin_size {
            count += 1;
        } else {
            bins.push((current_bin_start, count));
            current_bin_start += bin_size;
            count = 1;
        }
    }

    // Push the last bin
    bins.push((current_bin_start, count));

    bins
}

// fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> {
//     if data.is_empty() || num_bins == 0 {
//         return vec![];
//     }
//
//     let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize;
//
//     (0..num_bins)
//         .map(|i| {
//             let start = i * bin_size;
//             let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()`
//
//             // If `start` is out of bounds, return (0, 0)
//             if start >= data.len() {
//                 return (0, 0);
//             }
//
//             let bin = &data[start..end];
//
//             let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum();
//             let count = bin.len() as u128;
//             let mean_x = (sum_x / count) as u32; // Rounded down to nearest u32
//
//             let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum();
//
//             (mean_x, sum_n)
//         })
//         .collect()
// }
//

pub fn app_storage_dir() -> anyhow::Result<PathBuf> {
    let app_name = env!("CARGO_PKG_NAME");
    let app_dir = dirs::data_dir()
        .context("Failed to get data directory")?
        .join(app_name);

    if !app_dir.exists() {
        fs::create_dir_all(&app_dir).context("Failed to create application directory")?;
    }

    Ok(app_dir)
}
use blake3::Hasher as Blake3Hasher;
use std::hash::{BuildHasher, Hasher};

pub struct Blake3Hash(Blake3Hasher);

impl Hasher for Blake3Hash {
    fn finish(&self) -> u64 {
        let hash = self.0.finalize();
        u64::from_le_bytes(hash.as_bytes()[..8].try_into().unwrap())
    }

    fn write(&mut self, bytes: &[u8]) {
        self.0.update(bytes);
    }
}

/// Default Hasher
#[derive(Default, Clone)]
pub struct Blake3BuildHasher;

impl BuildHasher for Blake3BuildHasher {
    type Hasher = Blake3Hash;

    fn build_hasher(&self) -> Self::Hasher {
        Blake3Hash(Blake3Hasher::new())
    }
}
// Custom 128-bit hash type
#[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug, Encode, Decode)]
pub struct Hash128([u8; 16]);

impl std::hash::Hash for Hash128 {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        state.write(&self.0);
    }
}

impl Hash128 {
    pub fn new(bytes: [u8; 16]) -> Self {
        Hash128(bytes)
    }
    pub fn to_bytes(&self) -> [u8; 16] {
        self.0
    }
}

pub fn get_dir_size(path: &Path) -> std::io::Result<u64> {
    let mut total_size = 0;
    if path.is_dir() {
        for entry in fs::read_dir(path)? {
            let entry = entry?;
            let path = entry.path();
            if path.is_file() {
                total_size += path.metadata()?.len();
            } else if path.is_dir() {
                total_size += get_dir_size(&path)?;
            }
        }
    }
    Ok(total_size)
}

/// Finds all files matching the given glob pattern.
///
/// # Arguments
///
/// * `pattern` - A glob pattern string (e.g., `"data/**/*.txt"`).
///
/// # Returns
///
/// * `Ok(Vec<PathBuf>)` with all successfully matched file paths.
/// * `Err` if the glob pattern is invalid or any matched path fails to resolve.
///
/// # Errors
///
/// Returns an error if:
/// - The glob pattern is invalid.
/// - A file path matching the pattern cannot be resolved.
///
/// # Examples
///
/// ```rust
/// let files = find_files("src/**/*.rs")?;
/// for file in files {
///     println!("{:?}", file);
/// }
/// ```
pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
    let mut result = Vec::new();

    let entries = glob(pattern).with_context(|| format!("Invalid glob pattern: '{}'", pattern))?;

    for entry in entries {
        let path =
            entry.with_context(|| format!("Failed to resolve path for pattern '{}'", pattern))?;
        result.push(path);
    }

    Ok(result)
}

// fn system_time_to_utc(system_time: SystemTime) -> Option<DateTime<Utc>> {
//     system_time
//         .duration_since(UNIX_EPOCH)
//         .ok()
//         .map(|duration| Utc.timestamp(duration.as_secs() as i64, duration.subsec_nanos()))
// }

/// List all files in a directory that have the given suffix (file extension).
///
/// # Arguments
/// * `dir` – Directory to scan.
/// * `suffix` – File extension to match (without the dot).
///
/// # Returns
/// A vector of `PathBuf` entries whose extension matches `suffix`.
///
/// # Errors
/// Returns any I/O error from `read_dir` or directory traversal.
pub fn list_files_with_ext(dir: impl AsRef<Path>, ext: &str) -> std::io::Result<Vec<PathBuf>> {
    let mut out = Vec::new();

    for entry in fs::read_dir(dir)? {
        let entry = entry?;
        let path = entry.path();

        if path.is_file() {
            if let Some(file_ext) = path.extension().and_then(|e| e.to_str()) {
                if file_ext == ext {
                    out.push(path);
                }
            }
        }
    }

    Ok(out)
}

pub fn list_directories(dir_path: PathBuf) -> std::io::Result<Vec<String>> {
    let mut directories = Vec::new();

    for entry in fs::read_dir(dir_path)? {
        let entry = entry?;
        let path = entry.path();

        // Check if the path is a directory
        if path.is_dir() {
            if let Some(dir_name) = path.file_name().and_then(|name| name.to_str()) {
                directories.push(dir_name.to_string());
            }
        }
    }

    Ok(directories)
}

pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
    fn walk(dir: &Path, out: &mut Vec<PathBuf>) {
        if let Ok(entries) = fs::read_dir(dir) {
            for entry in entries.flatten() {
                let path = entry.path();
                if path.is_dir() {
                    walk(&path, out);
                } else if path.is_file() {
                    out.push(path);
                }
            }
        }
    }

    let mut files = Vec::new();
    walk(root.as_ref(), &mut files);
    files
}

/// Checks whether the modification time of `file1` is older than `file2`.
///
/// If `rm` is `true` and `file1` is older, attempts to remove the directory containing `file1`.
///
/// # Arguments
///
/// * `file1` - Path to the first file.
/// * `file2` - Path to the second file.
/// * `rm` - If true, and `file1` is older, attempts to remove its parent directory.
///
/// # Returns
///
/// * `Ok(true)` if `file1` is older than `file2`.
/// * `Ok(false)` otherwise.
///
/// # Errors
///
/// Returns an [`anyhow::Error`] if:
/// - Either `file1` or `file2` does not exist.
/// - File metadata cannot be read.
/// - File modification times cannot be retrieved.
/// - (if `rm == true`) Directory removal fails (if uncommented).
///
pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
    debug!("is_file_older {file1} {file2}, {rm:?}");
    let mtime1 = fs::metadata(file1)
        .with_context(|| format!("Failed to read metadata for '{}'", file1))?
        .modified()
        .with_context(|| format!("Failed to get modified time for '{}'", file1))?;

    let mtime2 = fs::metadata(file2)
        .with_context(|| format!("Failed to read metadata for '{}'", file2))?
        .modified()
        .with_context(|| format!("Failed to get modified time for '{}'", file2))?;

    if mtime1 < mtime2 && rm {
        if let Some(file1_dir) = Path::new(file1).parent() {
            
            warn!("Removing old directory: {}", file1_dir.display());
            fs::remove_dir_all(file1_dir).map_err(|e| {
                warn!("Failed to remove {}: {}", file1_dir.display(), e);
                e
            })?;
        }
    }

    Ok(mtime1 < mtime2)
}

pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
    debug!("Trying to remove: {dir}");
    // match fs::remove_dir_all(dir) {
    //     Ok(_) => {}
    //     Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
    //     Err(e) => {
    //         anyhow::bail!("Failed to remove directory '{}': {}", dir, e);
    //     }
    // };
    Ok(())
}

/// Searches a directory for the first file matching a given name pattern.
///
/// This function looks for a file in the given directory whose filename
/// starts with the provided `starts_with` prefix and ends with the provided
/// `ends_with` suffix. It returns the full path to the first match found.
///
/// # Arguments
///
/// * `dir` - A reference to the directory path where the search will occur.
/// * `starts_with` - The required prefix of the filename (e.g., `"throughput_"`).
/// * `ends_with` - The required suffix of the filename (e.g., `".csv"`).
///
/// # Returns
///
/// * `Some(PathBuf)` - Path to the first file matching the pattern.
/// * `None` - If no matching file is found or the directory can't be read.
///
/// # Example
///
/// ```rust
/// let dir = std::path::Path::new("/path/to/data");
/// if let Some(path) = find_matching_file(dir, "throughput_", ".csv") {
///     println!("Found file: {}", path.display());
/// } else {
///     eprintln!("No matching file found.");
/// }
/// ```
pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
    fs::read_dir(dir)
        .ok()?
        .filter_map(Result::ok)
        .map(|entry| entry.path())
        .find(|path| {
            path.is_file()
                && path
                    .file_name()
                    .and_then(|name| name.to_str())
                    .map(|name| name.starts_with(starts_with) && name.ends_with(ends_with))
                    .unwrap_or(false)
        })
}

/// Searches for the first file in the given directory whose file name
/// satisfies the provided condition.
///
/// # Arguments
///
/// * `dir` - Path to the directory to search.
/// * `condition` - A closure that takes a file name (`&str`) and returns `true`
///   if the file matches the desired condition.
///
/// # Returns
///
/// An `Option<PathBuf>` containing the path to the first matching file,
/// or `None` if no file matches or the directory can't be read.
///
/// # Examples
///
/// ```
/// use std::path::Path;
/// let result = find_file(Path::new("."), |name| name.ends_with(".rs"));
/// ```
pub fn find_file<F>(dir: &Path, condition: F) -> Option<PathBuf>
where
    F: Fn(&str) -> bool,
{
    fs::read_dir(dir).ok()?.find_map(|entry| {
        let path = entry.ok()?.path();

        if path.is_file()
            && path
                .file_name()
                .and_then(|name| name.to_str())
                .map(&condition)
                .unwrap_or(false)
        {
            Some(path)
        } else {
            None
        }
    })
}

/// Represents a string made of either:
/// - one character repeated N times,
/// - a two-character block repeated N times,
/// - or no valid repetition pattern.
#[derive(Debug, PartialEq)]
pub enum Repeat {
    None,
    RepOne(char, usize),
    RepTwo(String, usize),
}

pub fn detect_repetition(s: &str) -> Repeat {
    let len = s.chars().count();
    if len == 0 {
        return Repeat::None;
    }

    // Check for single-char repetition
    let mut chars = s.chars();
    let first = chars.next().unwrap();
    if s.chars().all(|c| c == first) {
        return Repeat::RepOne(first, len);
    }

    // Check for two-char block repetition
    if len.is_multiple_of(2) {
        let mut iter = s.chars();
        let a = iter.next().unwrap();
        let b = iter.next().unwrap();
        let mut count = 1;

        while let (Some(c), Some(d)) = (iter.next(), iter.next()) {
            if c != a || d != b {
                return Repeat::None;
            }
            count += 1;
        }

        return Repeat::RepTwo(format!("{}{}", a, b), count);
    }

    Repeat::None
}

pub fn test_init() {
    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug"))
        .try_init();
}

use regex::Regex;

/// Extracts the numeric barcode suffix from a filename.
///
/// # Examples
///
/// ```
/// let f = "sample_SQK-NBD114-24_barcode07.bam";
/// assert_eq!(extract_barcode(f), Some(7));
/// ```
///
/// Returns `None` if no `barcodeNN` pattern is found
/// or if the number cannot be parsed.
pub fn extract_barcode(name: &str) -> Option<u32> {
    let re = Regex::new(r"barcode(\d+)").unwrap();
    re.captures(name)
        .and_then(|c| c.get(1))
        .and_then(|m| m.as_str().parse::<u32>().ok())
}

pub fn human_size(bytes: u64) -> String {
    const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"];

    let mut size = bytes as f64;
    let mut unit = 0;

    while size >= 1024.0 && unit < UNITS.len() - 1 {
        size /= 1024.0;
        unit += 1;
    }

    if unit == 0 {
        format!("{} {}", bytes, UNITS[unit])
    } else {
        format!("{:.2} {}", size, UNITS[unit])
    }
}
// A RAII (Resource Acquisition Is Initialization) guard for cleaning up temporary directories.
pub struct TempDirGuard {
    path: PathBuf,
}

impl TempDirGuard {
    pub fn new(path: PathBuf) -> Self {
        Self { path }
    }
}

// The core cleanup logic: executed automatically when the TempDirGuard is dropped.
impl Drop for TempDirGuard {
    fn drop(&mut self) {
        if self.path.exists() {
            // Log the attempt. Using eprintln/warn! ensures the error is noted,
            // but the program does not panic.
            if let Err(e) = fs::remove_dir_all(&self.path) {
                error!(
                    "Failed to remove temporary directory '{}': {}",
                    self.path.display(),
                    e
                );
            }
        }
    }
}

/// Guard that tracks temporary filesystem paths and cleans them up on drop
/// unless explicitly disarmed.
///
/// This is intended for pipeline-style code where intermediate outputs
/// are produced by downstream tools. The guard only tracks paths; it does
/// **not** create files itself.
///
/// If the pipeline completes successfully, call [`disarm`] to prevent
/// cleanup. Otherwise, all tracked paths are removed on drop on a
/// best-effort basis.
///
/// # Example
///
/// ```no_run
/// use std::path::PathBuf;
///
/// let tmp_dir = PathBuf::from("/tmp");
/// let mut guard = TempFileGuard::new();
///
/// let path1 = guard.temp_path(".tmp", &tmp_dir);
/// let path2 = guard.temp_path(".tmp", &tmp_dir);
///
/// // Produce outputs at `path1` and `path2`
///
/// // Mark success
/// guard.disarm();
/// ```
///
/// If `disarm()` is not called, both paths are cleaned up automatically
/// when the guard is dropped.
pub struct TempFileGuard {
    files: Vec<PathBuf>,
    disarmed: AtomicBool,
}

impl Default for TempFileGuard {
    fn default() -> Self {
        Self::new()
    }
}

impl TempFileGuard {
    pub fn new() -> Self {
        Self {
            files: Vec::new(),
            disarmed: AtomicBool::new(false),
        }
    }

    /// Registers a temporary file for cleanup.
    pub fn track(&mut self, path: PathBuf) {
        self.files.push(path);
    }

    /// Registers multiple temporary files for cleanup.
    pub fn track_many(&mut self, paths: impl IntoIterator<Item = PathBuf>) {
        self.files.extend(paths);
    }

    /// Disarms the guard, preventing cleanup on drop.
    /// Call this when the pipeline succeeds.
    pub fn disarm(&self) {
        self.disarmed
            .store(true, std::sync::atomic::Ordering::SeqCst);
    }

    /// Manually clean up all tracked files.
    pub fn cleanup(&mut self) {
        for file in &self.files {
            if file.exists() {
                if let Err(e) = remove_bam_with_index(file) {
                    warn!("Failed to clean up temp file {}: {}", file.display(), e);
                }
            }
        }
        self.files.clear();
    }

    pub fn clear(&mut self) {
        self.files.clear();
    }

    /// Generates a unique temporary path inside `tmp_dir` and registers it
    /// for cleanup.
    ///
    /// The returned path is derived from a UUIDv4 and **no file is created**.
    /// The caller is responsible for creating or writing to the path.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use std::path::PathBuf;
    ///
    /// let tmp_dir = PathBuf::from("/tmp");
    /// let mut guard = TempFileGuard::new();
    ///
    /// let path = guard.temp_path(".tmp", &tmp_dir);
    /// // create or write to `path`
    /// ```
    pub fn tmp_path(&mut self, suffix: &str, tmp_dir: impl AsRef<Path>) -> PathBuf {
        let name = format!("pandora-tmp-{}{}", Uuid::new_v4(), suffix);

        let tmp_dir = tmp_dir.as_ref();
        let path = tmp_dir.join(name);
        self.track(path.clone());
        path
    }
}

impl Drop for TempFileGuard {
    fn drop(&mut self) {
        if !self.disarmed.load(std::sync::atomic::Ordering::SeqCst) {
            warn!(
                "Pipeline failed or panicked, cleaning up {} temporary files",
                self.files.len()
            );
            self.cleanup();
        }
    }
}

// pub fn temp_file_path(suffix: &str, config: &Config) -> std::io::Result<PathBuf> {
//     let p = config.tmp_dir;
//     let temp_path = tempfile::Builder::new()
//         .prefix("pandora-temp-")
//         .suffix(suffix)
//         .rand_bytes(5)
//         .tempfile()?
//         .into_temp_path();
//
//     Ok(temp_path.to_path_buf())
// }

/// Extracts genome sizes from BAM header.
pub fn get_genome_sizes(
    header: &rust_htslib::bam::Header,
) -> anyhow::Result<FxHashMap<String, u64>> {
    let mut sizes = FxHashMap::default();

    for (_, records) in header.to_hashmap() {
        for record in records {
            if let (Some(sn), Some(ln)) = (record.get("SN"), record.get("LN")) {
                if let Ok(len) = ln.parse::<u64>() {
                    sizes.insert(sn.clone(), len);
                }
            }
        }
    }

    Ok(sizes)
}

pub fn bam_contigs<P: AsRef<std::path::Path>>(bam: P) -> anyhow::Result<Vec<String>> {
    let reader = rust_htslib::bam::Reader::from_path(&bam)?;
    let header = rust_htslib::bam::Header::from_template(reader.header());

    Ok(get_genome_sizes(&header)?.into_keys().collect())
}

/// Split genome into ~n_parts equal-sized chunks (by number of bases),
/// returning for each chunk a list of regions in `ctg:start-end` form.
/// Split genome into ~n_parts equal-sized chunks (by bases).
/// Each chunk is a single region `ctg:start-end` (no cross-contig regions).
///
/// Note: the number of regions returned will be ≈ n_parts (may differ by 1–2).
pub fn split_genome_into_n_regions(
    genome_sizes: &FxHashMap<String, u64>,
    n_parts: usize,
) -> Vec<String> {
    if n_parts == 0 || genome_sizes.is_empty() {
        return Vec::new();
    }

    // Deterministic contig order
    let mut contigs: Vec<(String, u64)> = genome_sizes
        .iter()
        .map(|(ctg, len)| (ctg.clone(), *len))
        .collect();
    contigs.sort_by(|a, b| a.0.cmp(&b.0));

    let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum();
    if total_bases == 0 {
        return Vec::new();
    }

    let target_chunk_size: u64 = total_bases.div_ceil(n_parts as u64); // ceil

    let mut regions = Vec::new();

    for (ctg, len) in contigs {
        let mut remaining = len;
        let mut start: u64 = 1;

        while remaining > 0 {
            let take = remaining.min(target_chunk_size);
            let end = start + take - 1;
            regions.push(format!("{ctg}:{start}-{end}"));

            remaining -= take;
            start = end + 1;
        }
    }

    regions
}

/// Split genome into exactly `n_parts` chunks with (almost) equal total size.
/// Each chunk is a *list* of regions `ctg:start-end`. A chunk may span several
/// contigs, but each region is within a single contig.
///
/// The sizes of the parts will differ by at most 1 base.
pub fn split_genome_into_n_regions_exact(
    genome_sizes: &FxHashMap<String, u64>,
    n_parts: usize,
) -> Vec<Vec<String>> {
    if n_parts == 0 || genome_sizes.is_empty() {
        return Vec::new();
    }

    // Deterministic contig order
    let mut contigs: Vec<(String, u64)> = genome_sizes
        .iter()
        .map(|(ctg, len)| (ctg.clone(), *len))
        .collect();
    contigs.sort_by(|a, b| a.0.cmp(&b.0));

    let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum();
    if total_bases == 0 {
        return Vec::new();
    }

    // We cannot have more non-empty parts than bases if we use 1-based inclusive coordinates.
    let parts = n_parts.min(total_bases as usize);

    // Distribute bases as evenly as possible:
    // first `remainder` parts get (base_per_part + 1), the rest get base_per_part.
    let base_per_part = total_bases / parts as u64;
    let remainder = total_bases % parts as u64;

    let target_sizes: Vec<u64> = (0..parts)
        .map(|i| {
            if i < remainder as usize {
                base_per_part + 1
            } else {
                base_per_part
            }
        })
        .collect();

    let mut chunks: Vec<Vec<String>> = Vec::with_capacity(parts);

    let mut part_idx = 0;
    let mut remaining_in_part = target_sizes[0];

    let mut ctg_iter = contigs.into_iter();
    while let Some((ctg, mut len)) = ctg_iter.next() {
        let mut start: u64 = 1;

        while len > 0 && part_idx < parts {
            let take = len.min(remaining_in_part);
            let end = start + take - 1;

            if chunks.len() <= part_idx {
                chunks.push(Vec::new());
            }
            chunks[part_idx].push(format!("{ctg}:{start}-{end}"));

            len -= take;
            start = end + 1;
            remaining_in_part -= take;

            if remaining_in_part == 0 {
                part_idx += 1;
                if part_idx == parts {
                    break;
                }
                remaining_in_part = target_sizes[part_idx];
            }
        }
    }

    // Sanity: we should have exactly `parts` chunks, all non-empty
    debug_assert_eq!(chunks.len(), parts);
    chunks
}

/// Removes a BAM file and its associated index file (.bai).
pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> {
    // Remove BAM
    if bam.exists() {
        fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?;
    }

    // Remove index (.bam.bai or .bai)
    let bai = bam.with_extension("bam.bai");
    if bai.exists() {
        fs::remove_file(&bai).ok(); // Don't fail if index doesn't exist
    }

    let bai_alt = bam.with_extension("bai");
    if bai_alt.exists() {
        fs::remove_file(&bai_alt).ok();
    }

    Ok(())
}

/// Convert a numeric value to an RGB triplet using a diverging palette.
///
/// Values are clipped to `[-clip, +clip]` and mapped as:
/// - negative → blue
/// - zero → white
/// - positive → red
///
/// # Arguments
/// * `v` – value to color (e.g. delta methylation)
/// * `clip` – symmetric clipping bound (must be > 0)
///
/// # Returns
/// `(r, g, b)` as 8-bit RGB components.
///
/// # Panics
/// Panics if `clip <= 0.0`.
pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
    assert!(clip > 0.0, "clip must be > 0");

    let x = v.clamp(-clip, clip);
    let t = (x + clip) / (2.0 * clip); // [-clip,+clip] → [0,1]

    if t < 0.5 {
        // blue → white
        let u = t / 0.5;
        let r = (255.0 * u).round() as u8;
        let g = (255.0 * u).round() as u8;
        (r, g, 255)
    } else {
        // white → red
        let u = (t - 0.5) / 0.5;
        let g = (255.0 * (1.0 - u)).round() as u8;
        let b = (255.0 * (1.0 - u)).round() as u8;
        (255, g, b)
    }
}

pub fn revcomp(s: &str) -> String {
    fn comp(b: u8) -> u8 {
        match b {
            b'A' => b'T',
            b'C' => b'G',
            b'G' => b'C',
            b'T' => b'A',
            b'N' => b'N',
            _ => b'N',
        }
    }
    let v: Vec<u8> = s.as_bytes().iter().rev().map(|&b| comp(b.to_ascii_uppercase())).collect();
    String::from_utf8(v).unwrap()
}

#[cfg(test)]
mod tests {
    use log::info;
    use rust_htslib::bam::{self, Read};

    use crate::helpers::test_init;

    use super::*;

    #[test]
    fn split_g() -> anyhow::Result<()> {
        test_init();
        let n_parts = 4;
        let reader = bam::Reader::from_path(
            "/mnt/beegfs02/scratch/t_steimle/data/wgs/34528/norm/34528_norm_hs1.bam",
        )
        .with_context(|| format!("Failed to open BAM"))?;
        let header = bam::Header::from_template(reader.header());
        let genome_sizes = get_genome_sizes(&header)?;

        // Split genome into regions
        let regions = split_genome_into_n_regions_exact(&genome_sizes, n_parts);

        info!("{:#?}", regions);
        Ok(())
    }
}