|
@@ -5,7 +5,6 @@ use log::{debug, error, warn};
|
|
|
use rust_htslib::bam::Read;
|
|
use rust_htslib::bam::Read;
|
|
|
use rustc_hash::FxHashMap;
|
|
use rustc_hash::FxHashMap;
|
|
|
use serde::{Deserialize, Serialize};
|
|
use serde::{Deserialize, Serialize};
|
|
|
-use uuid::Uuid;
|
|
|
|
|
use std::{
|
|
use std::{
|
|
|
cmp::Ordering,
|
|
cmp::Ordering,
|
|
|
collections::{HashMap, HashSet},
|
|
collections::{HashMap, HashSet},
|
|
@@ -13,8 +12,9 @@ use std::{
|
|
|
iter::Sum,
|
|
iter::Sum,
|
|
|
ops::{Add, Div},
|
|
ops::{Add, Div},
|
|
|
path::{Path, PathBuf},
|
|
path::{Path, PathBuf},
|
|
|
- sync::atomic::AtomicBool,
|
|
|
|
|
|
|
+ sync::{atomic::AtomicBool, OnceLock},
|
|
|
};
|
|
};
|
|
|
|
|
+use uuid::Uuid;
|
|
|
|
|
|
|
|
pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String> {
|
|
pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String> {
|
|
|
let mut matching_files = Vec::new();
|
|
let mut matching_files = Vec::new();
|
|
@@ -40,14 +40,35 @@ pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String>
|
|
|
0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix))
|
|
0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix))
|
|
|
.with_context(|| format!("In directory: {}", dir_path)),
|
|
.with_context(|| format!("In directory: {}", dir_path)),
|
|
|
1 => Ok(matching_files[0].to_string_lossy().into_owned()),
|
|
1 => Ok(matching_files[0].to_string_lossy().into_owned()),
|
|
|
- _ => Err(anyhow::anyhow!(
|
|
|
|
|
- "Multiple files found ending with '{}'",
|
|
|
|
|
- suffix
|
|
|
|
|
- ))
|
|
|
|
|
- .with_context(|| format!("In directory: {}", dir_path)),
|
|
|
|
|
|
|
+ _ => {
|
|
|
|
|
+ matching_files.sort();
|
|
|
|
|
+ let matches = matching_files
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .map(|p| p.display().to_string())
|
|
|
|
|
+ .collect::<Vec<_>>()
|
|
|
|
|
+ .join(", ");
|
|
|
|
|
+ Err(anyhow::anyhow!(
|
|
|
|
|
+ "Multiple files found ending with '{}': {}",
|
|
|
|
|
+ suffix,
|
|
|
|
|
+ matches
|
|
|
|
|
+ ))
|
|
|
|
|
+ .with_context(|| format!("In directory: {}", dir_path))
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/// Return the output prefix by removing all suffixes from the first dot in the filename.
|
|
|
|
|
+///
|
|
|
|
|
+/// This is intentionally stricter than [`Path::file_stem`]: filenames such as
|
|
|
|
|
+/// `sample.v1.vcf.gz` become `sample`, not `sample.v1.vcf`. This matches tools
|
|
|
|
|
+/// that expect a prefix rather than a single-extension stem.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Examples
|
|
|
|
|
+///
|
|
|
|
|
+/// ```text
|
|
|
|
|
+/// /tmp/sample.vcf.gz -> /tmp/sample
|
|
|
|
|
+/// /tmp/sample.v1.vcf.gz -> /tmp/sample
|
|
|
|
|
+/// ```
|
|
|
pub fn path_prefix(out: &str) -> anyhow::Result<String> {
|
|
pub fn path_prefix(out: &str) -> anyhow::Result<String> {
|
|
|
let out_path = Path::new(out);
|
|
let out_path = Path::new(out);
|
|
|
|
|
|
|
@@ -126,7 +147,6 @@ where
|
|
|
binds.join(" ")
|
|
binds.join(" ")
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-use rayon::prelude::*;
|
|
|
|
|
use std::cmp::Ord;
|
|
use std::cmp::Ord;
|
|
|
|
|
|
|
|
pub struct VectorIntersection<T> {
|
|
pub struct VectorIntersection<T> {
|
|
@@ -256,25 +276,65 @@ pub fn par_intersection<T: Ord + Send + Sync + Clone>(
|
|
|
vec1: &[T],
|
|
vec1: &[T],
|
|
|
vec2: &[T],
|
|
vec2: &[T],
|
|
|
) -> VectorIntersection<T> {
|
|
) -> VectorIntersection<T> {
|
|
|
- let chunk_size = (vec1.len() / rayon::current_num_threads()).max(1);
|
|
|
|
|
|
|
+ par_intersection_by_value(vec1, vec2)
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
- vec1.par_chunks(chunk_size)
|
|
|
|
|
- .map(|chunk| {
|
|
|
|
|
- let start = vec2.partition_point(|x| x < &chunk[0]);
|
|
|
|
|
- let end = vec2.partition_point(|x| x <= &chunk[chunk.len() - 1]);
|
|
|
|
|
|
|
+fn par_intersection_by_value<T: Ord + Send + Sync + Clone>(
|
|
|
|
|
+ vec1: &[T],
|
|
|
|
|
+ vec2: &[T],
|
|
|
|
|
+) -> VectorIntersection<T> {
|
|
|
|
|
+ const SEQUENTIAL_THRESHOLD: usize = 4096;
|
|
|
|
|
|
|
|
- // Ensure start is not greater than end
|
|
|
|
|
- if start <= end {
|
|
|
|
|
- intersection(chunk, &vec2[start..end])
|
|
|
|
|
- } else {
|
|
|
|
|
- // If start > end, there's no intersection for this chunk
|
|
|
|
|
- VectorIntersection::default()
|
|
|
|
|
- }
|
|
|
|
|
- })
|
|
|
|
|
- .reduce(VectorIntersection::default, |mut acc, mut x| {
|
|
|
|
|
- acc.merge(&mut x);
|
|
|
|
|
- acc
|
|
|
|
|
- })
|
|
|
|
|
|
|
+ if vec1.len() + vec2.len() <= SEQUENTIAL_THRESHOLD {
|
|
|
|
|
+ return intersection(vec1, vec2);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if vec1.is_empty() {
|
|
|
|
|
+ return VectorIntersection {
|
|
|
|
|
+ common: Vec::new(),
|
|
|
|
|
+ only_in_first: Vec::new(),
|
|
|
|
|
+ only_in_second: vec2.to_vec(),
|
|
|
|
|
+ };
|
|
|
|
|
+ }
|
|
|
|
|
+ if vec2.is_empty() {
|
|
|
|
|
+ return VectorIntersection {
|
|
|
|
|
+ common: Vec::new(),
|
|
|
|
|
+ only_in_first: vec1.to_vec(),
|
|
|
|
|
+ only_in_second: Vec::new(),
|
|
|
|
|
+ };
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let pivot = &vec1[vec1.len() / 2];
|
|
|
|
|
+ let v1_left = vec1.partition_point(|x| x < pivot);
|
|
|
|
|
+ let v1_right = vec1.partition_point(|x| x <= pivot);
|
|
|
|
|
+ let v2_left = vec2.partition_point(|x| x < pivot);
|
|
|
|
|
+ let v2_right = vec2.partition_point(|x| x <= pivot);
|
|
|
|
|
+
|
|
|
|
|
+ let (mut left, mut right) = rayon::join(
|
|
|
|
|
+ || par_intersection_by_value(&vec1[..v1_left], &vec2[..v2_left]),
|
|
|
|
|
+ || par_intersection_by_value(&vec1[v1_right..], &vec2[v2_right..]),
|
|
|
|
|
+ );
|
|
|
|
|
+
|
|
|
|
|
+ let count1 = v1_right - v1_left;
|
|
|
|
|
+ let count2 = v2_right - v2_left;
|
|
|
|
|
+
|
|
|
|
|
+ let mut middle = VectorIntersection::default();
|
|
|
|
|
+ middle
|
|
|
|
|
+ .common
|
|
|
|
|
+ .extend(std::iter::repeat_n(pivot.clone(), count1.min(count2)));
|
|
|
|
|
+ match count1.cmp(&count2) {
|
|
|
|
|
+ Ordering::Greater => middle
|
|
|
|
|
+ .only_in_first
|
|
|
|
|
+ .extend(std::iter::repeat_n(pivot.clone(), count1 - count2)),
|
|
|
|
|
+ Ordering::Less => middle
|
|
|
|
|
+ .only_in_second
|
|
|
|
|
+ .extend(std::iter::repeat_n(pivot.clone(), count2 - count1)),
|
|
|
|
|
+ Ordering::Equal => {}
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ left.merge(&mut middle);
|
|
|
|
|
+ left.merge(&mut right);
|
|
|
|
|
+ left
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
|
|
pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
|
|
@@ -323,11 +383,21 @@ where
|
|
|
|
|
|
|
|
pub fn bin_data<V>(data: Vec<V>, bin_size: V) -> Vec<(V, usize)>
|
|
pub fn bin_data<V>(data: Vec<V>, bin_size: V) -> Vec<(V, usize)>
|
|
|
where
|
|
where
|
|
|
- V: Copy + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
|
|
|
|
|
|
|
+ V: Copy + Default + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
|
|
|
{
|
|
{
|
|
|
- // Sort the data
|
|
|
|
|
- let mut sorted_data = data.clone();
|
|
|
|
|
- sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
|
|
|
|
|
+ if data.is_empty() || bin_size.partial_cmp(&V::default()) != Some(Ordering::Greater) {
|
|
|
|
|
+ return Vec::new();
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // Sort comparable data points. For floats, this drops NaN values.
|
|
|
|
|
+ let mut sorted_data: Vec<V> = data
|
|
|
|
|
+ .into_iter()
|
|
|
|
|
+ .filter(|v| v.partial_cmp(v) == Some(Ordering::Equal))
|
|
|
|
|
+ .collect();
|
|
|
|
|
+ if sorted_data.is_empty() {
|
|
|
|
|
+ return Vec::new();
|
|
|
|
|
+ }
|
|
|
|
|
+ sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
|
|
|
|
|
|
|
|
// Initialize bins
|
|
// Initialize bins
|
|
|
let mut bins: Vec<(V, usize)> = Vec::new();
|
|
let mut bins: Vec<(V, usize)> = Vec::new();
|
|
@@ -340,6 +410,9 @@ where
|
|
|
} else {
|
|
} else {
|
|
|
bins.push((current_bin_start, count));
|
|
bins.push((current_bin_start, count));
|
|
|
current_bin_start += bin_size;
|
|
current_bin_start += bin_size;
|
|
|
|
|
+ while value >= current_bin_start + bin_size {
|
|
|
|
|
+ current_bin_start += bin_size;
|
|
|
|
|
+ }
|
|
|
count = 1;
|
|
count = 1;
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -491,6 +564,7 @@ pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
|
|
|
result.push(path);
|
|
result.push(path);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ result.sort();
|
|
|
Ok(result)
|
|
Ok(result)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -528,6 +602,7 @@ pub fn list_files_with_ext(dir: impl AsRef<Path>, ext: &str) -> std::io::Result<
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ out.sort();
|
|
|
Ok(out)
|
|
Ok(out)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -546,25 +621,40 @@ pub fn list_directories(dir_path: PathBuf) -> std::io::Result<Vec<String>> {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ directories.sort();
|
|
|
Ok(directories)
|
|
Ok(directories)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
|
|
pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
|
|
|
fn walk(dir: &Path, out: &mut Vec<PathBuf>) {
|
|
fn walk(dir: &Path, out: &mut Vec<PathBuf>) {
|
|
|
- if let Ok(entries) = fs::read_dir(dir) {
|
|
|
|
|
- for entry in entries.flatten() {
|
|
|
|
|
- let path = entry.path();
|
|
|
|
|
- if path.is_dir() {
|
|
|
|
|
- walk(&path, out);
|
|
|
|
|
- } else if path.is_file() {
|
|
|
|
|
- out.push(path);
|
|
|
|
|
|
|
+ let entries = match fs::read_dir(dir) {
|
|
|
|
|
+ Ok(entries) => entries,
|
|
|
|
|
+ Err(e) => {
|
|
|
|
|
+ warn!("Cannot read directory {}: {}", dir.display(), e);
|
|
|
|
|
+ return;
|
|
|
|
|
+ }
|
|
|
|
|
+ };
|
|
|
|
|
+
|
|
|
|
|
+ for entry in entries {
|
|
|
|
|
+ let entry = match entry {
|
|
|
|
|
+ Ok(entry) => entry,
|
|
|
|
|
+ Err(e) => {
|
|
|
|
|
+ warn!("Cannot read directory entry in {}: {}", dir.display(), e);
|
|
|
|
|
+ continue;
|
|
|
}
|
|
}
|
|
|
|
|
+ };
|
|
|
|
|
+ let path = entry.path();
|
|
|
|
|
+ if path.is_dir() {
|
|
|
|
|
+ walk(&path, out);
|
|
|
|
|
+ } else if path.is_file() {
|
|
|
|
|
+ out.push(path);
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
let mut files = Vec::new();
|
|
let mut files = Vec::new();
|
|
|
walk(root.as_ref(), &mut files);
|
|
walk(root.as_ref(), &mut files);
|
|
|
|
|
+ files.sort();
|
|
|
files
|
|
files
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -589,7 +679,7 @@ pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
|
|
|
/// - Either `file1` or `file2` does not exist.
|
|
/// - Either `file1` or `file2` does not exist.
|
|
|
/// - File metadata cannot be read.
|
|
/// - File metadata cannot be read.
|
|
|
/// - File modification times cannot be retrieved.
|
|
/// - File modification times cannot be retrieved.
|
|
|
-/// - (if `rm == true`) Directory removal fails (if uncommented).
|
|
|
|
|
|
|
+/// - Directory removal fails when `rm == true` and `file1` is older.
|
|
|
///
|
|
///
|
|
|
pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
|
|
pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
|
|
|
debug!("is_file_older {file1} {file2}, {rm:?}");
|
|
debug!("is_file_older {file1} {file2}, {rm:?}");
|
|
@@ -605,7 +695,6 @@ pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool>
|
|
|
|
|
|
|
|
if mtime1 < mtime2 && rm {
|
|
if mtime1 < mtime2 && rm {
|
|
|
if let Some(file1_dir) = Path::new(file1).parent() {
|
|
if let Some(file1_dir) = Path::new(file1).parent() {
|
|
|
-
|
|
|
|
|
warn!("Removing old directory: {}", file1_dir.display());
|
|
warn!("Removing old directory: {}", file1_dir.display());
|
|
|
fs::remove_dir_all(file1_dir).map_err(|e| {
|
|
fs::remove_dir_all(file1_dir).map_err(|e| {
|
|
|
warn!("Failed to remove {}: {}", file1_dir.display(), e);
|
|
warn!("Failed to remove {}: {}", file1_dir.display(), e);
|
|
@@ -619,13 +708,11 @@ pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool>
|
|
|
|
|
|
|
|
pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
|
|
pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
|
|
|
debug!("Trying to remove: {dir}");
|
|
debug!("Trying to remove: {dir}");
|
|
|
- // match fs::remove_dir_all(dir) {
|
|
|
|
|
- // Ok(_) => {}
|
|
|
|
|
- // Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
|
|
|
|
- // Err(e) => {
|
|
|
|
|
- // anyhow::bail!("Failed to remove directory '{}': {}", dir, e);
|
|
|
|
|
- // }
|
|
|
|
|
- // };
|
|
|
|
|
|
|
+ match fs::remove_dir_all(dir) {
|
|
|
|
|
+ Ok(_) => {}
|
|
|
|
|
+ Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
|
|
|
|
+ Err(e) => anyhow::bail!("Failed to remove directory '{}': {}", dir, e),
|
|
|
|
|
+ }
|
|
|
Ok(())
|
|
Ok(())
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -657,11 +744,11 @@ pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
|
|
|
/// }
|
|
/// }
|
|
|
/// ```
|
|
/// ```
|
|
|
pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
|
|
pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
|
|
|
- fs::read_dir(dir)
|
|
|
|
|
|
|
+ let mut matches: Vec<_> = fs::read_dir(dir)
|
|
|
.ok()?
|
|
.ok()?
|
|
|
.filter_map(Result::ok)
|
|
.filter_map(Result::ok)
|
|
|
.map(|entry| entry.path())
|
|
.map(|entry| entry.path())
|
|
|
- .find(|path| {
|
|
|
|
|
|
|
+ .filter(|path| {
|
|
|
path.is_file()
|
|
path.is_file()
|
|
|
&& path
|
|
&& path
|
|
|
.file_name()
|
|
.file_name()
|
|
@@ -669,6 +756,10 @@ pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Opt
|
|
|
.map(|name| name.starts_with(starts_with) && name.ends_with(ends_with))
|
|
.map(|name| name.starts_with(starts_with) && name.ends_with(ends_with))
|
|
|
.unwrap_or(false)
|
|
.unwrap_or(false)
|
|
|
})
|
|
})
|
|
|
|
|
+ .collect();
|
|
|
|
|
+
|
|
|
|
|
+ matches.sort();
|
|
|
|
|
+ matches.into_iter().next()
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/// Searches for the first file in the given directory whose file name
|
|
/// Searches for the first file in the given directory whose file name
|
|
@@ -695,21 +786,22 @@ pub fn find_file<F>(dir: &Path, condition: F) -> Option<PathBuf>
|
|
|
where
|
|
where
|
|
|
F: Fn(&str) -> bool,
|
|
F: Fn(&str) -> bool,
|
|
|
{
|
|
{
|
|
|
- fs::read_dir(dir).ok()?.find_map(|entry| {
|
|
|
|
|
- let path = entry.ok()?.path();
|
|
|
|
|
|
|
+ let mut matches: Vec<_> = fs::read_dir(dir)
|
|
|
|
|
+ .ok()?
|
|
|
|
|
+ .filter_map(Result::ok)
|
|
|
|
|
+ .map(|entry| entry.path())
|
|
|
|
|
+ .filter(|path| {
|
|
|
|
|
+ path.is_file()
|
|
|
|
|
+ && path
|
|
|
|
|
+ .file_name()
|
|
|
|
|
+ .and_then(|name| name.to_str())
|
|
|
|
|
+ .map(&condition)
|
|
|
|
|
+ .unwrap_or(false)
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect();
|
|
|
|
|
|
|
|
- if path.is_file()
|
|
|
|
|
- && path
|
|
|
|
|
- .file_name()
|
|
|
|
|
- .and_then(|name| name.to_str())
|
|
|
|
|
- .map(&condition)
|
|
|
|
|
- .unwrap_or(false)
|
|
|
|
|
- {
|
|
|
|
|
- Some(path)
|
|
|
|
|
- } else {
|
|
|
|
|
- None
|
|
|
|
|
- }
|
|
|
|
|
- })
|
|
|
|
|
|
|
+ matches.sort();
|
|
|
|
|
+ matches.into_iter().next()
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/// Represents a string made of either:
|
|
/// Represents a string made of either:
|
|
@@ -775,7 +867,9 @@ use regex::Regex;
|
|
|
/// Returns `None` if no `barcodeNN` pattern is found
|
|
/// Returns `None` if no `barcodeNN` pattern is found
|
|
|
/// or if the number cannot be parsed.
|
|
/// or if the number cannot be parsed.
|
|
|
pub fn extract_barcode(name: &str) -> Option<u32> {
|
|
pub fn extract_barcode(name: &str) -> Option<u32> {
|
|
|
- let re = Regex::new(r"barcode(\d+)").unwrap();
|
|
|
|
|
|
|
+ static BARCODE_RE: OnceLock<Regex> = OnceLock::new();
|
|
|
|
|
+
|
|
|
|
|
+ let re = BARCODE_RE.get_or_init(|| Regex::new(r"barcode(\d+)").unwrap());
|
|
|
re.captures(name)
|
|
re.captures(name)
|
|
|
.and_then(|c| c.get(1))
|
|
.and_then(|c| c.get(1))
|
|
|
.and_then(|m| m.as_str().parse::<u32>().ok())
|
|
.and_then(|m| m.as_str().parse::<u32>().ok())
|
|
@@ -798,7 +892,8 @@ pub fn human_size(bytes: u64) -> String {
|
|
|
format!("{:.2} {}", size, UNITS[unit])
|
|
format!("{:.2} {}", size, UNITS[unit])
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
-// A RAII (Resource Acquisition Is Initialization) guard for cleaning up temporary directories.
|
|
|
|
|
|
|
+
|
|
|
|
|
+/// RAII guard that removes a temporary directory when dropped.
|
|
|
pub struct TempDirGuard {
|
|
pub struct TempDirGuard {
|
|
|
path: PathBuf,
|
|
path: PathBuf,
|
|
|
}
|
|
}
|
|
@@ -809,7 +904,6 @@ impl TempDirGuard {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-// The core cleanup logic: executed automatically when the TempDirGuard is dropped.
|
|
|
|
|
impl Drop for TempDirGuard {
|
|
impl Drop for TempDirGuard {
|
|
|
fn drop(&mut self) {
|
|
fn drop(&mut self) {
|
|
|
if self.path.exists() {
|
|
if self.path.exists() {
|
|
@@ -826,6 +920,25 @@ impl Drop for TempDirGuard {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/// Removes a BAM file and its associated index file (`.bam.bai` or `.bai`).
|
|
|
|
|
+pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> {
|
|
|
|
|
+ if bam.exists() {
|
|
|
|
|
+ fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let bai = bam.with_extension("bam.bai");
|
|
|
|
|
+ if bai.exists() {
|
|
|
|
|
+ fs::remove_file(&bai).ok();
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let bai_alt = bam.with_extension("bai");
|
|
|
|
|
+ if bai_alt.exists() {
|
|
|
|
|
+ fs::remove_file(&bai_alt).ok();
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ Ok(())
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
/// Guard that tracks temporary filesystem paths and cleans them up on drop
|
|
/// Guard that tracks temporary filesystem paths and cleans them up on drop
|
|
|
/// unless explicitly disarmed.
|
|
/// unless explicitly disarmed.
|
|
|
///
|
|
///
|
|
@@ -895,10 +1008,8 @@ impl TempFileGuard {
|
|
|
/// Manually clean up all tracked files.
|
|
/// Manually clean up all tracked files.
|
|
|
pub fn cleanup(&mut self) {
|
|
pub fn cleanup(&mut self) {
|
|
|
for file in &self.files {
|
|
for file in &self.files {
|
|
|
- if file.exists() {
|
|
|
|
|
- if let Err(e) = remove_bam_with_index(file) {
|
|
|
|
|
- warn!("Failed to clean up temp file {}: {}", file.display(), e);
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ if let Err(e) = remove_tracked_path(file) {
|
|
|
|
|
+ warn!("Failed to clean up temp file {}: {}", file.display(), e);
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
self.files.clear();
|
|
self.files.clear();
|
|
@@ -935,6 +1046,29 @@ impl TempFileGuard {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+fn remove_tracked_path(path: &Path) -> anyhow::Result<()> {
|
|
|
|
|
+ if path.is_dir() {
|
|
|
|
|
+ fs::remove_dir_all(path)
|
|
|
|
|
+ .with_context(|| format!("Failed to remove temp directory: {}", path.display()))?;
|
|
|
|
|
+ } else if path.exists() {
|
|
|
|
|
+ fs::remove_file(path)
|
|
|
|
|
+ .with_context(|| format!("Failed to remove temp file: {}", path.display()))?;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if path.extension().and_then(|e| e.to_str()) == Some("bam") {
|
|
|
|
|
+ remove_existing_file(path.with_extension("bam.bai"));
|
|
|
|
|
+ remove_existing_file(path.with_extension("bai"));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ Ok(())
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+fn remove_existing_file(path: PathBuf) {
|
|
|
|
|
+ if path.exists() {
|
|
|
|
|
+ let _ = fs::remove_file(path);
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
impl Drop for TempFileGuard {
|
|
impl Drop for TempFileGuard {
|
|
|
fn drop(&mut self) {
|
|
fn drop(&mut self) {
|
|
|
if !self.disarmed.load(std::sync::atomic::Ordering::SeqCst) {
|
|
if !self.disarmed.load(std::sync::atomic::Ordering::SeqCst) {
|
|
@@ -960,29 +1094,47 @@ impl Drop for TempFileGuard {
|
|
|
// }
|
|
// }
|
|
|
|
|
|
|
|
/// Extracts genome sizes from BAM header.
|
|
/// Extracts genome sizes from BAM header.
|
|
|
|
|
+/// Extract contig names and lengths from a BAM header as a hash map.
|
|
|
|
|
+///
|
|
|
|
|
+/// This delegates parsing to [`crate::io::bam::get_genome_sizes`]. Because the
|
|
|
|
|
+/// return type is a hash map, iteration order is not BAM header order; use
|
|
|
|
|
+/// [`get_genome_sizes_in_header_order`] when order matters.
|
|
|
pub fn get_genome_sizes(
|
|
pub fn get_genome_sizes(
|
|
|
header: &rust_htslib::bam::Header,
|
|
header: &rust_htslib::bam::Header,
|
|
|
) -> anyhow::Result<FxHashMap<String, u64>> {
|
|
) -> anyhow::Result<FxHashMap<String, u64>> {
|
|
|
- let mut sizes = FxHashMap::default();
|
|
|
|
|
-
|
|
|
|
|
- for (_, records) in header.to_hashmap() {
|
|
|
|
|
- for record in records {
|
|
|
|
|
- if let (Some(sn), Some(ln)) = (record.get("SN"), record.get("LN")) {
|
|
|
|
|
- if let Ok(len) = ln.parse::<u64>() {
|
|
|
|
|
- sizes.insert(sn.clone(), len);
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ Ok(crate::io::bam::get_genome_sizes(header)?
|
|
|
|
|
+ .into_iter()
|
|
|
|
|
+ .collect())
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
- Ok(sizes)
|
|
|
|
|
|
|
+/// Extract contig names and lengths from a BAM header in header order.
|
|
|
|
|
+///
|
|
|
|
|
+/// Use this when downstream region generation must follow the BAM/reference
|
|
|
|
|
+/// contig order. [`get_genome_sizes`] returns a hash map and does not preserve
|
|
|
|
|
+/// that order.
|
|
|
|
|
+pub fn get_genome_sizes_in_header_order(
|
|
|
|
|
+ header: &rust_htslib::bam::HeaderView,
|
|
|
|
|
+) -> anyhow::Result<Vec<(String, u64)>> {
|
|
|
|
|
+ (0..header.target_count())
|
|
|
|
|
+ .map(|tid| {
|
|
|
|
|
+ let name = String::from_utf8_lossy(header.tid2name(tid)).into_owned();
|
|
|
|
|
+ let len = header
|
|
|
|
|
+ .target_len(tid)
|
|
|
|
|
+ .with_context(|| format!("BAM header target {name} is missing length"))?;
|
|
|
|
|
+
|
|
|
|
|
+ Ok((name, len))
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect()
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
pub fn bam_contigs<P: AsRef<std::path::Path>>(bam: P) -> anyhow::Result<Vec<String>> {
|
|
pub fn bam_contigs<P: AsRef<std::path::Path>>(bam: P) -> anyhow::Result<Vec<String>> {
|
|
|
let reader = rust_htslib::bam::Reader::from_path(&bam)?;
|
|
let reader = rust_htslib::bam::Reader::from_path(&bam)?;
|
|
|
- let header = rust_htslib::bam::Header::from_template(reader.header());
|
|
|
|
|
-
|
|
|
|
|
- Ok(get_genome_sizes(&header)?.into_keys().collect())
|
|
|
|
|
|
|
+ Ok(reader
|
|
|
|
|
+ .header()
|
|
|
|
|
+ .target_names()
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .map(|name| String::from_utf8_lossy(name).into_owned())
|
|
|
|
|
+ .collect())
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/// Split genome into ~n_parts equal-sized chunks (by number of bases),
|
|
/// Split genome into ~n_parts equal-sized chunks (by number of bases),
|
|
@@ -1041,18 +1193,23 @@ pub fn split_genome_into_n_regions_exact(
|
|
|
genome_sizes: &FxHashMap<String, u64>,
|
|
genome_sizes: &FxHashMap<String, u64>,
|
|
|
n_parts: usize,
|
|
n_parts: usize,
|
|
|
) -> Vec<Vec<String>> {
|
|
) -> Vec<Vec<String>> {
|
|
|
- if n_parts == 0 || genome_sizes.is_empty() {
|
|
|
|
|
- return Vec::new();
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // Deterministic contig order
|
|
|
|
|
let mut contigs: Vec<(String, u64)> = genome_sizes
|
|
let mut contigs: Vec<(String, u64)> = genome_sizes
|
|
|
.iter()
|
|
.iter()
|
|
|
.map(|(ctg, len)| (ctg.clone(), *len))
|
|
.map(|(ctg, len)| (ctg.clone(), *len))
|
|
|
.collect();
|
|
.collect();
|
|
|
contigs.sort_by(|a, b| a.0.cmp(&b.0));
|
|
contigs.sort_by(|a, b| a.0.cmp(&b.0));
|
|
|
|
|
+ split_ordered_genome_into_n_regions_exact(&contigs, n_parts)
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
- let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum();
|
|
|
|
|
|
|
+pub fn split_ordered_genome_into_n_regions_exact(
|
|
|
|
|
+ genome_sizes: &[(String, u64)],
|
|
|
|
|
+ n_parts: usize,
|
|
|
|
|
+) -> Vec<Vec<String>> {
|
|
|
|
|
+ if n_parts == 0 || genome_sizes.is_empty() {
|
|
|
|
|
+ return Vec::new();
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let total_bases: u64 = genome_sizes.iter().map(|(_, len)| *len).sum();
|
|
|
if total_bases == 0 {
|
|
if total_bases == 0 {
|
|
|
return Vec::new();
|
|
return Vec::new();
|
|
|
}
|
|
}
|
|
@@ -1080,8 +1237,7 @@ pub fn split_genome_into_n_regions_exact(
|
|
|
let mut part_idx = 0;
|
|
let mut part_idx = 0;
|
|
|
let mut remaining_in_part = target_sizes[0];
|
|
let mut remaining_in_part = target_sizes[0];
|
|
|
|
|
|
|
|
- let mut ctg_iter = contigs.into_iter();
|
|
|
|
|
- while let Some((ctg, mut len)) = ctg_iter.next() {
|
|
|
|
|
|
|
+ for (ctg, mut len) in genome_sizes.iter().cloned() {
|
|
|
let mut start: u64 = 1;
|
|
let mut start: u64 = 1;
|
|
|
|
|
|
|
|
while len > 0 && part_idx < parts {
|
|
while len > 0 && part_idx < parts {
|
|
@@ -1112,27 +1268,6 @@ pub fn split_genome_into_n_regions_exact(
|
|
|
chunks
|
|
chunks
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Removes a BAM file and its associated index file (.bai).
|
|
|
|
|
-pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> {
|
|
|
|
|
- // Remove BAM
|
|
|
|
|
- if bam.exists() {
|
|
|
|
|
- fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // Remove index (.bam.bai or .bai)
|
|
|
|
|
- let bai = bam.with_extension("bam.bai");
|
|
|
|
|
- if bai.exists() {
|
|
|
|
|
- fs::remove_file(&bai).ok(); // Don't fail if index doesn't exist
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- let bai_alt = bam.with_extension("bai");
|
|
|
|
|
- if bai_alt.exists() {
|
|
|
|
|
- fs::remove_file(&bai_alt).ok();
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- Ok(())
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
/// Convert a numeric value to an RGB triplet using a diverging palette.
|
|
/// Convert a numeric value to an RGB triplet using a diverging palette.
|
|
|
///
|
|
///
|
|
|
/// Values are clipped to `[-clip, +clip]` and mapped as:
|
|
/// Values are clipped to `[-clip, +clip]` and mapped as:
|
|
@@ -1181,7 +1316,12 @@ pub fn revcomp(s: &str) -> String {
|
|
|
_ => b'N',
|
|
_ => b'N',
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
- let v: Vec<u8> = s.as_bytes().iter().rev().map(|&b| comp(b.to_ascii_uppercase())).collect();
|
|
|
|
|
|
|
+ let v: Vec<u8> = s
|
|
|
|
|
+ .as_bytes()
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .rev()
|
|
|
|
|
+ .map(|&b| comp(b.to_ascii_uppercase()))
|
|
|
|
|
+ .collect();
|
|
|
String::from_utf8(v).unwrap()
|
|
String::from_utf8(v).unwrap()
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -1194,6 +1334,50 @@ mod tests {
|
|
|
|
|
|
|
|
use super::*;
|
|
use super::*;
|
|
|
|
|
|
|
|
|
|
+ #[test]
|
|
|
|
|
+ fn par_intersection_keeps_values_only_in_second_outside_first_range() {
|
|
|
|
|
+ let res = par_intersection(&[10], &[1]);
|
|
|
|
|
+
|
|
|
|
|
+ assert_eq!(res.common, Vec::<i32>::new());
|
|
|
|
|
+ assert_eq!(res.only_in_first, vec![10]);
|
|
|
|
|
+ assert_eq!(res.only_in_second, vec![1]);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ #[test]
|
|
|
|
|
+ fn par_intersection_handles_duplicate_counts() {
|
|
|
|
|
+ let res = par_intersection(&[1, 2, 2, 3], &[0, 2, 2, 2, 4]);
|
|
|
|
|
+
|
|
|
|
|
+ assert_eq!(res.common, vec![2, 2]);
|
|
|
|
|
+ assert_eq!(res.only_in_first, vec![1, 3]);
|
|
|
|
|
+ assert_eq!(res.only_in_second, vec![0, 2, 4]);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ #[test]
|
|
|
|
|
+ fn bin_data_handles_empty_and_invalid_bin_size() {
|
|
|
|
|
+ assert_eq!(bin_data::<f64>(Vec::new(), 0.1), Vec::<(f64, usize)>::new());
|
|
|
|
|
+ assert_eq!(bin_data(vec![1.0, 2.0], 0.0), Vec::<(f64, usize)>::new());
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ bin_data(vec![1.0, 2.0], f64::NAN),
|
|
|
|
|
+ Vec::<(f64, usize)>::new()
|
|
|
|
|
+ );
|
|
|
|
|
+ assert_eq!(bin_data(vec![f64::NAN], 0.1), Vec::<(f64, usize)>::new());
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ #[test]
|
|
|
|
|
+ fn bin_data_keeps_sparse_values_in_aligned_bins() {
|
|
|
|
|
+ assert_eq!(bin_data(vec![0.0, 1.0], 0.25), vec![(0.0, 1), (1.0, 1)]);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ #[test]
|
|
|
|
|
+ fn split_ordered_genome_regions_preserves_input_contig_order() {
|
|
|
|
|
+ let genome = vec![("chr2".to_string(), 2), ("chr1".to_string(), 2)];
|
|
|
|
|
+
|
|
|
|
|
+ assert_eq!(
|
|
|
|
|
+ split_ordered_genome_into_n_regions_exact(&genome, 2),
|
|
|
|
|
+ vec![vec!["chr2:1-2".to_string()], vec!["chr1:1-2".to_string()]]
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
#[test]
|
|
#[test]
|
|
|
fn split_g() -> anyhow::Result<()> {
|
|
fn split_g() -> anyhow::Result<()> {
|
|
|
test_init();
|
|
test_init();
|