Thomas 1 сар өмнө
parent
commit
e8875b045d

+ 5 - 9
src/callers/clairs.rs

@@ -177,8 +177,8 @@ use crate::{
     },
     config::Config,
     helpers::{
-        get_genome_sizes, is_file_older, list_files_recursive, remove_dir_if_exists,
-        singularity_bind_flags, split_genome_into_n_regions_exact,
+        get_genome_sizes_in_header_order, is_file_older, list_files_recursive,
+        remove_dir_if_exists, singularity_bind_flags, split_ordered_genome_into_n_regions_exact,
     },
     io::vcf::read_vcf,
     locker::SampleLock,
@@ -880,9 +880,8 @@ pub fn run_clairs_chunked(id: &str, config: &Config, n_parts: usize) -> anyhow::
     let normal_bam = config.normal_bam(id);
     let reader = bam::Reader::from_path(&normal_bam)
         .with_context(|| format!("Failed to open BAM: {normal_bam}"))?;
-    let header = bam::Header::from_template(reader.header());
-    let genome_sizes = get_genome_sizes(&header)?;
-    let regions = split_genome_into_n_regions_exact(&genome_sizes, n_parts)
+    let genome_sizes = get_genome_sizes_in_header_order(reader.header())?;
+    let regions = split_ordered_genome_into_n_regions_exact(&genome_sizes, n_parts)
         .into_iter()
         .flatten()
         .collect::<Vec<String>>();
@@ -970,10 +969,7 @@ pub fn run_clairs_chunked(id: &str, config: &Config, n_parts: usize) -> anyhow::
             }
         }
 
-        info!(
-            "Cleaned up {} part directories for {}",
-            actual_n_parts, id
-        );
+        info!("Cleaned up {} part directories for {}", actual_n_parts, id);
     }
 
     info!(

+ 6 - 8
src/callers/deep_somatic.rs

@@ -104,8 +104,8 @@ use crate::{
     },
     config::Config,
     helpers::{
-        get_genome_sizes, is_file_older, remove_dir_if_exists, singularity_bind_flags,
-        split_genome_into_n_regions_exact,
+        get_genome_sizes_in_header_order, is_file_older, remove_dir_if_exists,
+        singularity_bind_flags, split_ordered_genome_into_n_regions_exact,
     },
     io::vcf::read_vcf,
     locker::SampleLock,
@@ -498,9 +498,8 @@ impl Version for DeepSomatic {
 }
 
 fn parse_deepsomatic_version(output: &str) -> anyhow::Result<String> {
-    let deepsomatic_re =
-        Regex::new(r"(?mi)\bdeepsomatic(?:\s+version)?\s*[:=]?\s*([^\s]+)")
-            .expect("DeepSomatic version regex is valid");
+    let deepsomatic_re = Regex::new(r"(?mi)\bdeepsomatic(?:\s+version)?\s*[:=]?\s*([^\s]+)")
+        .expect("DeepSomatic version regex is valid");
     if let Some(caps) = deepsomatic_re.captures(output) {
         return Ok(caps
             .get(1)
@@ -622,10 +621,9 @@ pub fn run_deepsomatic_chunked(id: &str, config: &Config, n_parts: usize) -> any
     let tumor_bam = config.tumoral_bam(id);
     let reader = bam::Reader::from_path(&tumor_bam)
         .with_context(|| format!("Failed to open BAM: {tumor_bam}"))?;
-    let header = bam::Header::from_template(reader.header());
-    let genome_sizes = get_genome_sizes(&header)?;
+    let genome_sizes = get_genome_sizes_in_header_order(reader.header())?;
 
-    let region_chunks = split_genome_into_n_regions_exact(&genome_sizes, n_parts)
+    let region_chunks = split_ordered_genome_into_n_regions_exact(&genome_sizes, n_parts)
         .into_iter()
         .map(|v| v.join(" "))
         .collect::<Vec<String>>();

+ 4 - 5
src/callers/deep_variant.rs

@@ -106,8 +106,8 @@ use crate::{
     },
     config::Config,
     helpers::{
-        get_genome_sizes, is_file_older, remove_dir_if_exists, singularity_bind_flags,
-        split_genome_into_n_regions_exact,
+        get_genome_sizes_in_header_order, is_file_older, remove_dir_if_exists,
+        singularity_bind_flags, split_ordered_genome_into_n_regions_exact,
     },
     io::vcf::read_vcf,
     locker::SampleLock,
@@ -815,10 +815,9 @@ pub fn run_deepvariant_chunked(
     let bam_path = config.solo_bam(id, time_point);
     let reader = bam::Reader::from_path(&bam_path)
         .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
-    let header = bam::Header::from_template(reader.header());
-    let genome_sizes = get_genome_sizes(&header)?;
+    let genome_sizes = get_genome_sizes_in_header_order(reader.header())?;
 
-    let region_chunks = split_genome_into_n_regions_exact(&genome_sizes, n_parts)
+    let region_chunks = split_ordered_genome_into_n_regions_exact(&genome_sizes, n_parts)
         .into_iter()
         .map(|v| v.join(" "))
         .collect::<Vec<String>>();

+ 297 - 113
src/helpers.rs

@@ -5,7 +5,6 @@ use log::{debug, error, warn};
 use rust_htslib::bam::Read;
 use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
-use uuid::Uuid;
 use std::{
     cmp::Ordering,
     collections::{HashMap, HashSet},
@@ -13,8 +12,9 @@ use std::{
     iter::Sum,
     ops::{Add, Div},
     path::{Path, PathBuf},
-    sync::atomic::AtomicBool,
+    sync::{atomic::AtomicBool, OnceLock},
 };
+use uuid::Uuid;
 
 pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String> {
     let mut matching_files = Vec::new();
@@ -40,14 +40,35 @@ pub fn find_unique_file(dir_path: &str, suffix: &str) -> anyhow::Result<String>
         0 => Err(anyhow::anyhow!("No file found ending with '{}'", suffix))
             .with_context(|| format!("In directory: {}", dir_path)),
         1 => Ok(matching_files[0].to_string_lossy().into_owned()),
-        _ => Err(anyhow::anyhow!(
-            "Multiple files found ending with '{}'",
-            suffix
-        ))
-        .with_context(|| format!("In directory: {}", dir_path)),
+        _ => {
+            matching_files.sort();
+            let matches = matching_files
+                .iter()
+                .map(|p| p.display().to_string())
+                .collect::<Vec<_>>()
+                .join(", ");
+            Err(anyhow::anyhow!(
+                "Multiple files found ending with '{}': {}",
+                suffix,
+                matches
+            ))
+            .with_context(|| format!("In directory: {}", dir_path))
+        }
     }
 }
 
+/// Return the output prefix by removing all suffixes from the first dot in the filename.
+///
+/// This is intentionally stricter than [`Path::file_stem`]: filenames such as
+/// `sample.v1.vcf.gz` become `sample`, not `sample.v1.vcf`. This matches tools
+/// that expect a prefix rather than a single-extension stem.
+///
+/// # Examples
+///
+/// ```text
+/// /tmp/sample.vcf.gz    -> /tmp/sample
+/// /tmp/sample.v1.vcf.gz -> /tmp/sample
+/// ```
 pub fn path_prefix(out: &str) -> anyhow::Result<String> {
     let out_path = Path::new(out);
 
@@ -126,7 +147,6 @@ where
     binds.join(" ")
 }
 
-use rayon::prelude::*;
 use std::cmp::Ord;
 
 pub struct VectorIntersection<T> {
@@ -256,25 +276,65 @@ pub fn par_intersection<T: Ord + Send + Sync + Clone>(
     vec1: &[T],
     vec2: &[T],
 ) -> VectorIntersection<T> {
-    let chunk_size = (vec1.len() / rayon::current_num_threads()).max(1);
+    par_intersection_by_value(vec1, vec2)
+}
 
-    vec1.par_chunks(chunk_size)
-        .map(|chunk| {
-            let start = vec2.partition_point(|x| x < &chunk[0]);
-            let end = vec2.partition_point(|x| x <= &chunk[chunk.len() - 1]);
+fn par_intersection_by_value<T: Ord + Send + Sync + Clone>(
+    vec1: &[T],
+    vec2: &[T],
+) -> VectorIntersection<T> {
+    const SEQUENTIAL_THRESHOLD: usize = 4096;
 
-            // Ensure start is not greater than end
-            if start <= end {
-                intersection(chunk, &vec2[start..end])
-            } else {
-                // If start > end, there's no intersection for this chunk
-                VectorIntersection::default()
-            }
-        })
-        .reduce(VectorIntersection::default, |mut acc, mut x| {
-            acc.merge(&mut x);
-            acc
-        })
+    if vec1.len() + vec2.len() <= SEQUENTIAL_THRESHOLD {
+        return intersection(vec1, vec2);
+    }
+
+    if vec1.is_empty() {
+        return VectorIntersection {
+            common: Vec::new(),
+            only_in_first: Vec::new(),
+            only_in_second: vec2.to_vec(),
+        };
+    }
+    if vec2.is_empty() {
+        return VectorIntersection {
+            common: Vec::new(),
+            only_in_first: vec1.to_vec(),
+            only_in_second: Vec::new(),
+        };
+    }
+
+    let pivot = &vec1[vec1.len() / 2];
+    let v1_left = vec1.partition_point(|x| x < pivot);
+    let v1_right = vec1.partition_point(|x| x <= pivot);
+    let v2_left = vec2.partition_point(|x| x < pivot);
+    let v2_right = vec2.partition_point(|x| x <= pivot);
+
+    let (mut left, mut right) = rayon::join(
+        || par_intersection_by_value(&vec1[..v1_left], &vec2[..v2_left]),
+        || par_intersection_by_value(&vec1[v1_right..], &vec2[v2_right..]),
+    );
+
+    let count1 = v1_right - v1_left;
+    let count2 = v2_right - v2_left;
+
+    let mut middle = VectorIntersection::default();
+    middle
+        .common
+        .extend(std::iter::repeat_n(pivot.clone(), count1.min(count2)));
+    match count1.cmp(&count2) {
+        Ordering::Greater => middle
+            .only_in_first
+            .extend(std::iter::repeat_n(pivot.clone(), count1 - count2)),
+        Ordering::Less => middle
+            .only_in_second
+            .extend(std::iter::repeat_n(pivot.clone(), count2 - count1)),
+        Ordering::Equal => {}
+    }
+
+    left.merge(&mut middle);
+    left.merge(&mut right);
+    left
 }
 
 pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
@@ -323,11 +383,21 @@ where
 
 pub fn bin_data<V>(data: Vec<V>, bin_size: V) -> Vec<(V, usize)>
 where
-    V: Copy + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
+    V: Copy + Default + PartialOrd + std::ops::AddAssign + std::ops::Add<Output = V>,
 {
-    // Sort the data
-    let mut sorted_data = data.clone();
-    sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    if data.is_empty() || bin_size.partial_cmp(&V::default()) != Some(Ordering::Greater) {
+        return Vec::new();
+    }
+
+    // Sort comparable data points. For floats, this drops NaN values.
+    let mut sorted_data: Vec<V> = data
+        .into_iter()
+        .filter(|v| v.partial_cmp(v) == Some(Ordering::Equal))
+        .collect();
+    if sorted_data.is_empty() {
+        return Vec::new();
+    }
+    sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
 
     // Initialize bins
     let mut bins: Vec<(V, usize)> = Vec::new();
@@ -340,6 +410,9 @@ where
         } else {
             bins.push((current_bin_start, count));
             current_bin_start += bin_size;
+            while value >= current_bin_start + bin_size {
+                current_bin_start += bin_size;
+            }
             count = 1;
         }
     }
@@ -491,6 +564,7 @@ pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
         result.push(path);
     }
 
+    result.sort();
     Ok(result)
 }
 
@@ -528,6 +602,7 @@ pub fn list_files_with_ext(dir: impl AsRef<Path>, ext: &str) -> std::io::Result<
         }
     }
 
+    out.sort();
     Ok(out)
 }
 
@@ -546,25 +621,40 @@ pub fn list_directories(dir_path: PathBuf) -> std::io::Result<Vec<String>> {
         }
     }
 
+    directories.sort();
     Ok(directories)
 }
 
 pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
     fn walk(dir: &Path, out: &mut Vec<PathBuf>) {
-        if let Ok(entries) = fs::read_dir(dir) {
-            for entry in entries.flatten() {
-                let path = entry.path();
-                if path.is_dir() {
-                    walk(&path, out);
-                } else if path.is_file() {
-                    out.push(path);
+        let entries = match fs::read_dir(dir) {
+            Ok(entries) => entries,
+            Err(e) => {
+                warn!("Cannot read directory {}: {}", dir.display(), e);
+                return;
+            }
+        };
+
+        for entry in entries {
+            let entry = match entry {
+                Ok(entry) => entry,
+                Err(e) => {
+                    warn!("Cannot read directory entry in {}: {}", dir.display(), e);
+                    continue;
                 }
+            };
+            let path = entry.path();
+            if path.is_dir() {
+                walk(&path, out);
+            } else if path.is_file() {
+                out.push(path);
             }
         }
     }
 
     let mut files = Vec::new();
     walk(root.as_ref(), &mut files);
+    files.sort();
     files
 }
 
@@ -589,7 +679,7 @@ pub fn list_files_recursive(root: impl AsRef<Path>) -> Vec<PathBuf> {
 /// - Either `file1` or `file2` does not exist.
 /// - File metadata cannot be read.
 /// - File modification times cannot be retrieved.
-/// - (if `rm == true`) Directory removal fails (if uncommented).
+/// - Directory removal fails when `rm == true` and `file1` is older.
 ///
 pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
     debug!("is_file_older {file1} {file2}, {rm:?}");
@@ -605,7 +695,6 @@ pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool>
 
     if mtime1 < mtime2 && rm {
         if let Some(file1_dir) = Path::new(file1).parent() {
-            
             warn!("Removing old directory: {}", file1_dir.display());
             fs::remove_dir_all(file1_dir).map_err(|e| {
                 warn!("Failed to remove {}: {}", file1_dir.display(), e);
@@ -619,13 +708,11 @@ pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool>
 
 pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
     debug!("Trying to remove: {dir}");
-    // match fs::remove_dir_all(dir) {
-    //     Ok(_) => {}
-    //     Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
-    //     Err(e) => {
-    //         anyhow::bail!("Failed to remove directory '{}': {}", dir, e);
-    //     }
-    // };
+    match fs::remove_dir_all(dir) {
+        Ok(_) => {}
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+        Err(e) => anyhow::bail!("Failed to remove directory '{}': {}", dir, e),
+    }
     Ok(())
 }
 
@@ -657,11 +744,11 @@ pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
 /// }
 /// ```
 pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Option<PathBuf> {
-    fs::read_dir(dir)
+    let mut matches: Vec<_> = fs::read_dir(dir)
         .ok()?
         .filter_map(Result::ok)
         .map(|entry| entry.path())
-        .find(|path| {
+        .filter(|path| {
             path.is_file()
                 && path
                     .file_name()
@@ -669,6 +756,10 @@ pub fn find_matching_file(dir: &Path, starts_with: &str, ends_with: &str) -> Opt
                     .map(|name| name.starts_with(starts_with) && name.ends_with(ends_with))
                     .unwrap_or(false)
         })
+        .collect();
+
+    matches.sort();
+    matches.into_iter().next()
 }
 
 /// Searches for the first file in the given directory whose file name
@@ -695,21 +786,22 @@ pub fn find_file<F>(dir: &Path, condition: F) -> Option<PathBuf>
 where
     F: Fn(&str) -> bool,
 {
-    fs::read_dir(dir).ok()?.find_map(|entry| {
-        let path = entry.ok()?.path();
+    let mut matches: Vec<_> = fs::read_dir(dir)
+        .ok()?
+        .filter_map(Result::ok)
+        .map(|entry| entry.path())
+        .filter(|path| {
+            path.is_file()
+                && path
+                    .file_name()
+                    .and_then(|name| name.to_str())
+                    .map(&condition)
+                    .unwrap_or(false)
+        })
+        .collect();
 
-        if path.is_file()
-            && path
-                .file_name()
-                .and_then(|name| name.to_str())
-                .map(&condition)
-                .unwrap_or(false)
-        {
-            Some(path)
-        } else {
-            None
-        }
-    })
+    matches.sort();
+    matches.into_iter().next()
 }
 
 /// Represents a string made of either:
@@ -775,7 +867,9 @@ use regex::Regex;
 /// Returns `None` if no `barcodeNN` pattern is found
 /// or if the number cannot be parsed.
 pub fn extract_barcode(name: &str) -> Option<u32> {
-    let re = Regex::new(r"barcode(\d+)").unwrap();
+    static BARCODE_RE: OnceLock<Regex> = OnceLock::new();
+
+    let re = BARCODE_RE.get_or_init(|| Regex::new(r"barcode(\d+)").unwrap());
     re.captures(name)
         .and_then(|c| c.get(1))
         .and_then(|m| m.as_str().parse::<u32>().ok())
@@ -798,7 +892,8 @@ pub fn human_size(bytes: u64) -> String {
         format!("{:.2} {}", size, UNITS[unit])
     }
 }
-// A RAII (Resource Acquisition Is Initialization) guard for cleaning up temporary directories.
+
+/// RAII guard that removes a temporary directory when dropped.
 pub struct TempDirGuard {
     path: PathBuf,
 }
@@ -809,7 +904,6 @@ impl TempDirGuard {
     }
 }
 
-// The core cleanup logic: executed automatically when the TempDirGuard is dropped.
 impl Drop for TempDirGuard {
     fn drop(&mut self) {
         if self.path.exists() {
@@ -826,6 +920,25 @@ impl Drop for TempDirGuard {
     }
 }
 
+/// Removes a BAM file and its associated index file (`.bam.bai` or `.bai`).
+pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> {
+    if bam.exists() {
+        fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?;
+    }
+
+    let bai = bam.with_extension("bam.bai");
+    if bai.exists() {
+        fs::remove_file(&bai).ok();
+    }
+
+    let bai_alt = bam.with_extension("bai");
+    if bai_alt.exists() {
+        fs::remove_file(&bai_alt).ok();
+    }
+
+    Ok(())
+}
+
 /// Guard that tracks temporary filesystem paths and cleans them up on drop
 /// unless explicitly disarmed.
 ///
@@ -895,10 +1008,8 @@ impl TempFileGuard {
     /// Manually clean up all tracked files.
     pub fn cleanup(&mut self) {
         for file in &self.files {
-            if file.exists() {
-                if let Err(e) = remove_bam_with_index(file) {
-                    warn!("Failed to clean up temp file {}: {}", file.display(), e);
-                }
+            if let Err(e) = remove_tracked_path(file) {
+                warn!("Failed to clean up temp file {}: {}", file.display(), e);
             }
         }
         self.files.clear();
@@ -935,6 +1046,29 @@ impl TempFileGuard {
     }
 }
 
+fn remove_tracked_path(path: &Path) -> anyhow::Result<()> {
+    if path.is_dir() {
+        fs::remove_dir_all(path)
+            .with_context(|| format!("Failed to remove temp directory: {}", path.display()))?;
+    } else if path.exists() {
+        fs::remove_file(path)
+            .with_context(|| format!("Failed to remove temp file: {}", path.display()))?;
+    }
+
+    if path.extension().and_then(|e| e.to_str()) == Some("bam") {
+        remove_existing_file(path.with_extension("bam.bai"));
+        remove_existing_file(path.with_extension("bai"));
+    }
+
+    Ok(())
+}
+
+fn remove_existing_file(path: PathBuf) {
+    if path.exists() {
+        let _ = fs::remove_file(path);
+    }
+}
+
 impl Drop for TempFileGuard {
     fn drop(&mut self) {
         if !self.disarmed.load(std::sync::atomic::Ordering::SeqCst) {
@@ -960,29 +1094,47 @@ impl Drop for TempFileGuard {
 // }
 
 /// Extracts genome sizes from BAM header.
+/// Extract contig names and lengths from a BAM header as a hash map.
+///
+/// This delegates parsing to [`crate::io::bam::get_genome_sizes`]. Because the
+/// return type is a hash map, iteration order is not BAM header order; use
+/// [`get_genome_sizes_in_header_order`] when order matters.
 pub fn get_genome_sizes(
     header: &rust_htslib::bam::Header,
 ) -> anyhow::Result<FxHashMap<String, u64>> {
-    let mut sizes = FxHashMap::default();
-
-    for (_, records) in header.to_hashmap() {
-        for record in records {
-            if let (Some(sn), Some(ln)) = (record.get("SN"), record.get("LN")) {
-                if let Ok(len) = ln.parse::<u64>() {
-                    sizes.insert(sn.clone(), len);
-                }
-            }
-        }
-    }
+    Ok(crate::io::bam::get_genome_sizes(header)?
+        .into_iter()
+        .collect())
+}
 
-    Ok(sizes)
+/// Extract contig names and lengths from a BAM header in header order.
+///
+/// Use this when downstream region generation must follow the BAM/reference
+/// contig order. [`get_genome_sizes`] returns a hash map and does not preserve
+/// that order.
+pub fn get_genome_sizes_in_header_order(
+    header: &rust_htslib::bam::HeaderView,
+) -> anyhow::Result<Vec<(String, u64)>> {
+    (0..header.target_count())
+        .map(|tid| {
+            let name = String::from_utf8_lossy(header.tid2name(tid)).into_owned();
+            let len = header
+                .target_len(tid)
+                .with_context(|| format!("BAM header target {name} is missing length"))?;
+
+            Ok((name, len))
+        })
+        .collect()
 }
 
 pub fn bam_contigs<P: AsRef<std::path::Path>>(bam: P) -> anyhow::Result<Vec<String>> {
     let reader = rust_htslib::bam::Reader::from_path(&bam)?;
-    let header = rust_htslib::bam::Header::from_template(reader.header());
-
-    Ok(get_genome_sizes(&header)?.into_keys().collect())
+    Ok(reader
+        .header()
+        .target_names()
+        .iter()
+        .map(|name| String::from_utf8_lossy(name).into_owned())
+        .collect())
 }
 
 /// Split genome into ~n_parts equal-sized chunks (by number of bases),
@@ -1041,18 +1193,23 @@ pub fn split_genome_into_n_regions_exact(
     genome_sizes: &FxHashMap<String, u64>,
     n_parts: usize,
 ) -> Vec<Vec<String>> {
-    if n_parts == 0 || genome_sizes.is_empty() {
-        return Vec::new();
-    }
-
-    // Deterministic contig order
     let mut contigs: Vec<(String, u64)> = genome_sizes
         .iter()
         .map(|(ctg, len)| (ctg.clone(), *len))
         .collect();
     contigs.sort_by(|a, b| a.0.cmp(&b.0));
+    split_ordered_genome_into_n_regions_exact(&contigs, n_parts)
+}
 
-    let total_bases: u64 = contigs.iter().map(|(_, len)| *len).sum();
+pub fn split_ordered_genome_into_n_regions_exact(
+    genome_sizes: &[(String, u64)],
+    n_parts: usize,
+) -> Vec<Vec<String>> {
+    if n_parts == 0 || genome_sizes.is_empty() {
+        return Vec::new();
+    }
+
+    let total_bases: u64 = genome_sizes.iter().map(|(_, len)| *len).sum();
     if total_bases == 0 {
         return Vec::new();
     }
@@ -1080,8 +1237,7 @@ pub fn split_genome_into_n_regions_exact(
     let mut part_idx = 0;
     let mut remaining_in_part = target_sizes[0];
 
-    let mut ctg_iter = contigs.into_iter();
-    while let Some((ctg, mut len)) = ctg_iter.next() {
+    for (ctg, mut len) in genome_sizes.iter().cloned() {
         let mut start: u64 = 1;
 
         while len > 0 && part_idx < parts {
@@ -1112,27 +1268,6 @@ pub fn split_genome_into_n_regions_exact(
     chunks
 }
 
-/// Removes a BAM file and its associated index file (.bai).
-pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> {
-    // Remove BAM
-    if bam.exists() {
-        fs::remove_file(bam).with_context(|| format!("Failed to remove BAM: {}", bam.display()))?;
-    }
-
-    // Remove index (.bam.bai or .bai)
-    let bai = bam.with_extension("bam.bai");
-    if bai.exists() {
-        fs::remove_file(&bai).ok(); // Don't fail if index doesn't exist
-    }
-
-    let bai_alt = bam.with_extension("bai");
-    if bai_alt.exists() {
-        fs::remove_file(&bai_alt).ok();
-    }
-
-    Ok(())
-}
-
 /// Convert a numeric value to an RGB triplet using a diverging palette.
 ///
 /// Values are clipped to `[-clip, +clip]` and mapped as:
@@ -1181,7 +1316,12 @@ pub fn revcomp(s: &str) -> String {
             _ => b'N',
         }
     }
-    let v: Vec<u8> = s.as_bytes().iter().rev().map(|&b| comp(b.to_ascii_uppercase())).collect();
+    let v: Vec<u8> = s
+        .as_bytes()
+        .iter()
+        .rev()
+        .map(|&b| comp(b.to_ascii_uppercase()))
+        .collect();
     String::from_utf8(v).unwrap()
 }
 
@@ -1194,6 +1334,50 @@ mod tests {
 
     use super::*;
 
+    #[test]
+    fn par_intersection_keeps_values_only_in_second_outside_first_range() {
+        let res = par_intersection(&[10], &[1]);
+
+        assert_eq!(res.common, Vec::<i32>::new());
+        assert_eq!(res.only_in_first, vec![10]);
+        assert_eq!(res.only_in_second, vec![1]);
+    }
+
+    #[test]
+    fn par_intersection_handles_duplicate_counts() {
+        let res = par_intersection(&[1, 2, 2, 3], &[0, 2, 2, 2, 4]);
+
+        assert_eq!(res.common, vec![2, 2]);
+        assert_eq!(res.only_in_first, vec![1, 3]);
+        assert_eq!(res.only_in_second, vec![0, 2, 4]);
+    }
+
+    #[test]
+    fn bin_data_handles_empty_and_invalid_bin_size() {
+        assert_eq!(bin_data::<f64>(Vec::new(), 0.1), Vec::<(f64, usize)>::new());
+        assert_eq!(bin_data(vec![1.0, 2.0], 0.0), Vec::<(f64, usize)>::new());
+        assert_eq!(
+            bin_data(vec![1.0, 2.0], f64::NAN),
+            Vec::<(f64, usize)>::new()
+        );
+        assert_eq!(bin_data(vec![f64::NAN], 0.1), Vec::<(f64, usize)>::new());
+    }
+
+    #[test]
+    fn bin_data_keeps_sparse_values_in_aligned_bins() {
+        assert_eq!(bin_data(vec![0.0, 1.0], 0.25), vec![(0.0, 1), (1.0, 1)]);
+    }
+
+    #[test]
+    fn split_ordered_genome_regions_preserves_input_contig_order() {
+        let genome = vec![("chr2".to_string(), 2), ("chr1".to_string(), 2)];
+
+        assert_eq!(
+            split_ordered_genome_into_n_regions_exact(&genome, 2),
+            vec![vec!["chr2:1-2".to_string()], vec!["chr1:1-2".to_string()]]
+        );
+    }
+
     #[test]
     fn split_g() -> anyhow::Result<()> {
         test_init();

+ 44 - 25
src/io/bam.rs

@@ -1,12 +1,17 @@
 use std::io::Write;
 use std::{
-    collections::{HashMap, HashSet}, fs::File, path::Path
+    collections::{HashMap, HashSet},
+    fs::File,
+    path::Path,
 };
 
 use anyhow::Context;
 use log::{info, warn};
 use rust_htslib::bam::{
-    self, FetchDefinition, IndexedReader, Read, Record, ext::BamRecordExtensions, record::{Aux, Cigar}
+    self,
+    ext::BamRecordExtensions,
+    record::{Aux, Cigar},
+    FetchDefinition, IndexedReader, Read, Record,
 };
 
 use crate::{commands::samtools::SamtoolsReheader, config::Config, runners::Run};
@@ -66,7 +71,10 @@ pub fn parse_sa_entries(sa: &str) -> Vec<SaEntry<'_>> {
 ///
 /// `pos` is 0-based. Useful when only fetch coordinates are needed.
 pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
-    parse_sa_entries(sa).into_iter().map(|e| (e.chr, e.pos)).collect()
+    parse_sa_entries(sa)
+        .into_iter()
+        .map(|e| (e.chr, e.pos))
+        .collect()
 }
 
 /// Resolve a supplementary alignment to its primary record via the SA tag.
@@ -103,7 +111,10 @@ pub fn primary_record(bam: &mut IndexedReader, record: Record) -> anyhow::Result
 
             for result in bam.records() {
                 let candidate = result.context("Invalid BAM record")?;
-                if candidate.qname() == qname && !candidate.is_supplementary() && !candidate.is_secondary() {
+                if candidate.qname() == qname
+                    && !candidate.is_supplementary()
+                    && !candidate.is_secondary()
+                {
                     return Ok(candidate);
                 }
             }
@@ -177,7 +188,9 @@ pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Rec
                 all_positions.extend(positions);
 
                 match String::from_utf8(qname.to_vec()) {
-                    Ok(qname_str) => { all_qnames_to_fetch.insert(qname_str); }
+                    Ok(qname_str) => {
+                        all_qnames_to_fetch.insert(qname_str);
+                    }
                     Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
                 }
             }
@@ -274,9 +287,9 @@ pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<Has
             if key == "SQ" {
                 genome.insert(
                     record["SN"].to_string(),
-                    record["LN"]
-                        .parse::<u64>()
-                        .with_context(|| format!("Failed to parse LN for contig {}", record["SN"]))?,
+                    record["LN"].parse::<u64>().with_context(|| {
+                        format!("Failed to parse LN for contig {}", record["SN"])
+                    })?,
                 );
             }
         }
@@ -551,16 +564,10 @@ fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
         return (0, 0);
     }
 
-    // For minus strand, the "first" op on the read is the last in the CIGAR
-    let first_op = if strand == '+' {
-        &ops[0]
+    let query_beg = if strand == '+' {
+        terminal_soft_clip(ops.iter())
     } else {
-        ops.last().unwrap()
-    };
-
-    let query_beg = match first_op {
-        Cigar::SoftClip(len) => *len,
-        _ => 0,
+        terminal_soft_clip(ops.iter().rev())
     };
 
     // Alignment length on the read (consumes query bases)
@@ -575,6 +582,15 @@ fn alignment_query_coords(ops: &[Cigar], strand: char) -> (u32, u32) {
     (query_beg, query_beg + aln_len)
 }
 
+fn terminal_soft_clip<'a>(mut ops: impl Iterator<Item = &'a Cigar>) -> u32 {
+    ops.find_map(|op| match op {
+        Cigar::HardClip(_) | Cigar::Pad(_) => None,
+        Cigar::SoftClip(len) => Some(*len),
+        _ => Some(0),
+    })
+    .unwrap_or(0)
+}
+
 /// Compute the reference-consuming length of a CIGAR.
 ///
 /// Sums lengths of operations that advance the reference coordinate:
@@ -1012,10 +1028,7 @@ pub enum QnameFilter {
 ///
 /// Returns an error if the input cannot be read, the output cannot be written, or
 /// if UTF-8 conversion of a reference name or query name fails.
-pub fn bam_to_aligned_bed<P: AsRef<Path>>(
-    input_bam: P,
-    output_bed: P,
-) -> anyhow::Result<()> {
+pub fn bam_to_aligned_bed<P: AsRef<Path>>(input_bam: P, output_bed: P) -> anyhow::Result<()> {
     let mut bam = bam::Reader::from_path(input_bam)?;
     let header = bam.header().to_owned();
 
@@ -1046,10 +1059,7 @@ pub fn bam_to_aligned_bed<P: AsRef<Path>>(
         if end > start {
             let qname = std::str::from_utf8(record.qname())?;
 
-            writeln!(
-                writer,
-                "{chrom}\t{start}\t{end}\t{qname}"
-            )?;
+            writeln!(writer, "{chrom}\t{start}\t{end}\t{qname}")?;
         }
     }
 
@@ -1069,4 +1079,13 @@ mod tests {
         read_sm_tag_or_inject(&config.tumoral_bam("CHALO"), "CHALO_diag", &config)?;
         Ok(())
     }
+
+    #[test]
+    fn alignment_query_coords_counts_soft_clip_inside_hard_clip() {
+        let ops = parse_cigar_str("5H10S90M").unwrap();
+        assert_eq!(alignment_query_coords(&ops, '+'), (10, 100));
+
+        let ops = parse_cigar_str("90M10S5H").unwrap();
+        assert_eq!(alignment_query_coords(&ops, '-'), (10, 100));
+    }
 }

+ 99 - 33
src/io/bed.rs

@@ -18,19 +18,28 @@
 //! | [`parse_centromere_intervals`] | Extract centromeric intervals from a UCSC cytoband file |
 
 use std::{
-    io::{BufRead, BufReader}, path::Path, str::FromStr, sync::Arc
+    io::{BufRead, BufReader},
+    path::Path,
+    str::FromStr,
+    sync::Arc,
 };
 
 use anyhow::Context;
 use log::warn;
 use rayon::prelude::*;
 
-use crate::{annotation::Annotation, io::writers::{BgzTabixWriter, IndexFormat}, positions::{GenomePosition, GenomeRange, GetGenomeRange, extract_contig_indices, find_contig_indices, overlaps_par}, variant::variant_collection::Variants};
+use crate::{
+    annotation::Annotation,
+    io::writers::{BgzTabixWriter, IndexFormat},
+    positions::{
+        extract_contig_indices, find_contig_indices, overlaps_par, GenomePosition, GenomeRange,
+        GetGenomeRange,
+    },
+    variant::variant_collection::Variants,
+};
 
 use super::readers::{fetch_tabix_lines_with, get_reader};
 
-use std::io::Write;
-
 /// One parsed row from a BED file (up to BED6).
 ///
 /// Coordinates in `range` are **0-based, half-open** as stored in the BED file.
@@ -93,6 +102,29 @@ impl GetGenomeRange for BedRow {
     }
 }
 
+fn is_bed_metadata_line(line: &str) -> bool {
+    let line = line.trim_start();
+    line.is_empty()
+        || line.starts_with('#')
+        || line.starts_with("track")
+        || line.starts_with("browser")
+}
+
+fn bed_row_overlaps_region(row: &BedRow, region: &GenomeRange) -> bool {
+    if row.range.contig != region.contig {
+        return false;
+    }
+
+    let row_start = row.range.range.start;
+    let row_end = row.range.range.end;
+
+    if row_start == row_end {
+        region.range.contains(&row_start)
+    } else {
+        row_end > region.range.start && row_start < region.range.end
+    }
+}
+
 /// Load a BED file into memory as a vector of [`BedRow`]s.
 ///
 /// Skips blank lines and lines starting with `#`, `track`, or `browser`
@@ -116,18 +148,17 @@ pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
 
     let mut res = Vec::new();
     for (i, line) in reader.lines().enumerate() {
+        let line_no = i + 1;
         match line {
             Ok(line) => {
-                if line.is_empty()
-                    || line.starts_with('#')
-                    || line.starts_with("track")
-                    || line.starts_with("browser")
-                {
+                if is_bed_metadata_line(&line) {
                     continue;
                 }
-                res.push(line.parse().context(format!("Can't parse {line}"))?);
+                res.push(line.parse().with_context(|| {
+                    format!("failed parsing BED record at {path}:{line_no}: {line}")
+                })?);
             }
-            Err(e) => warn!("Can't read line {i}: {e}"),
+            Err(e) => warn!("Can't read {path}:{line_no}: {e}"),
         }
     }
 
@@ -192,10 +223,7 @@ pub fn annotate_with_bed(
 ///
 /// Matching BED rows in per-contig order. Cross-contig order is unspecified (parallel
 /// execution).
-pub fn bedrow_overlaps_par(
-    rows: &[BedRow],
-    queries: &[&GenomeRange],
-) -> Vec<BedRow>
+pub fn bedrow_overlaps_par(rows: &[BedRow], queries: &[&GenomeRange]) -> Vec<BedRow>
 where
     BedRow: Clone + Send + Sync,
 {
@@ -234,7 +262,6 @@ where
         .collect()
 }
 
-
 /// Per-contig indexed BED structure for fast gene-name lookup.
 ///
 /// Rows are sorted by `(contig, start, end)` at construction time and stored in
@@ -251,7 +278,8 @@ impl GenesBedIndex {
     /// Rows are sorted by `(contig, start, end)` internally.
     pub fn new(mut rows: Vec<BedRow>) -> Self {
         rows.sort_unstable_by(|a, b| {
-            a.range.contig
+            a.range
+                .contig
                 .cmp(&b.range.contig)
                 .then_with(|| a.range.range.start.cmp(&b.range.range.start))
                 .then_with(|| a.range.range.end.cmp(&b.range.range.end))
@@ -282,7 +310,11 @@ impl GenesBedIndex {
     ///
     /// Names of overlapping genes in start-sorted order.
     pub fn query_genes(&self, contig: u8, start: u32, end: u32) -> Vec<String> {
-        let (s, e) = if start <= end { (start, end) } else { (end, start) };
+        let (s, e) = if start <= end {
+            (start, end)
+        } else {
+            (end, start)
+        };
         let rows = &self.by_contig[contig as usize];
 
         rows.iter()
@@ -332,17 +364,24 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
             break;
         }
 
-        if line.starts_with('#') || line.trim().is_empty() {
+        if is_bed_metadata_line(&line) {
             writer.write_header(line.as_bytes())?;
             continue;
         }
 
         let mut it = line.split('\t');
         let rname = it.next().context("BED: missing contig")?;
-        let start0: u32 = it.next().context("BED: missing start")?
-            .parse().context("BED: invalid start")?;
-        let end0: u32 = it.next().context("BED: missing end")?
-            .trim().parse().context("BED: invalid end")?;
+        let start0: u32 = it
+            .next()
+            .context("BED: missing start")?
+            .parse()
+            .context("BED: invalid start")?;
+        let end0: u32 = it
+            .next()
+            .context("BED: missing end")?
+            .trim()
+            .parse()
+            .context("BED: invalid end")?;
 
         writer.write_bed_record(line.as_bytes(), rname, start0, end0)?;
     }
@@ -377,10 +416,11 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
 pub fn fetch_bed(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<BedRow>> {
     let mut rows = Vec::new();
     fetch_tabix_lines_with(bgz_path, region, |line| {
-        let row: BedRow = line.parse()
+        let row: BedRow = line
+            .parse()
             .with_context(|| format!("failed to parse BED line: {line}"))?;
         // Secondary overlap filter: discard tabix-bin false positives.
-        if row.range.range.end > region.range.start && row.range.range.start < region.range.end {
+        if bed_row_overlaps_region(&row, region) {
             rows.push(row);
         }
         Ok(())
@@ -409,26 +449,30 @@ pub fn fetch_bed(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<Bed
 /// Returns an error if the file cannot be opened, a line has fewer than 5 columns,
 /// or a coordinate field cannot be parsed as `u64`.
 pub fn parse_centromere_intervals(path: &str) -> anyhow::Result<Vec<(String, u64, u64)>> {
-    let reader = BufReader::new(get_reader(path)
-        .with_context(|| format!("Cannot open cytobands BED: {path}"))?);
+    let reader = BufReader::new(
+        get_reader(path).with_context(|| format!("Cannot open cytobands BED: {path}"))?,
+    );
 
     let mut intervals = Vec::new();
-    for line in reader.lines() {
-        let line = line?;
-        if line.starts_with('#') || line.is_empty() {
+    for (i, line) in reader.lines().enumerate() {
+        let line_no = i + 1;
+        let line = line.with_context(|| format!("failed reading {path}:{line_no}"))?;
+        if is_bed_metadata_line(&line) {
             continue;
         }
         let f: Vec<&str> = line.split('\t').collect();
         anyhow::ensure!(
             f.len() >= 5,
-            "Expected 5 columns in cytoband BED, got {}: {line}",
+            "Expected 5 columns in cytoband BED at {path}:{line_no}, got {}: {line}",
             f.len()
         );
         if f[4].trim() == "acen" {
             intervals.push((
                 f[0].to_string(),
-                f[1].parse::<u64>().with_context(|| format!("start: {}", f[1]))?,
-                f[2].parse::<u64>().with_context(|| format!("end: {}", f[2]))?,
+                f[1].parse::<u64>()
+                    .with_context(|| format!("bad cytoband start at {path}:{line_no}: {}", f[1]))?,
+                f[2].parse::<u64>()
+                    .with_context(|| format!("bad cytoband end at {path}:{line_no}: {}", f[2]))?,
             ));
         }
     }
@@ -448,4 +492,26 @@ mod tests {
         convert_bgz_with_tabix(a, true)?;
         Ok(())
     }
+
+    #[test]
+    fn zero_width_bed_row_overlaps_containing_base_only() {
+        let row: BedRow = "chr1\t10\t10\tins".parse().unwrap();
+
+        assert!(!bed_row_overlaps_region(
+            &row,
+            &GenomeRange::new("chr1", 9, 10)
+        ));
+        assert!(bed_row_overlaps_region(
+            &row,
+            &GenomeRange::new("chr1", 10, 11)
+        ));
+        assert!(!bed_row_overlaps_region(
+            &row,
+            &GenomeRange::new("chr1", 11, 12)
+        ));
+        assert!(!bed_row_overlaps_region(
+            &row,
+            &GenomeRange::new("chr2", 10, 11)
+        ));
+    }
 }

+ 15 - 7
src/io/dict.rs

@@ -17,26 +17,34 @@ use crate::io::tsv::TsvLine;
 pub fn read_dict(path: &str) -> anyhow::Result<Vec<(String, u32)>> {
     debug!("Parsing {path}.");
 
-    let mut reader = BufReader::new(File::open(path).with_context(|| format!("cannot open dict: {path}"))?);
+    let mut reader =
+        BufReader::new(File::open(path).with_context(|| format!("cannot open dict: {path}"))?);
     let mut line = TsvLine::new();
+    let mut line_no = 0usize;
     let mut res = Vec::new();
 
-    while line.read(&mut reader)? {
+    while line
+        .read(&mut reader)
+        .with_context(|| format!("failed reading dict line after {path}:{line_no}"))?
+    {
+        line_no += 1;
         let fields = line.split_fields();
         if fields.first().copied() != Some("@SQ") {
             continue;
         }
 
-        let sn = fields.iter()
+        let sn = fields
+            .iter()
             .find_map(|f| f.strip_prefix("SN:"))
-            .context("Missing SN: in @SQ line")?
+            .with_context(|| format!("Missing SN: in @SQ line at {path}:{line_no}"))?
             .to_string();
 
-        let ln: u32 = fields.iter()
+        let ln: u32 = fields
+            .iter()
             .find_map(|f| f.strip_prefix("LN:"))
-            .context("Missing LN: in @SQ line")?
+            .with_context(|| format!("Missing LN: in @SQ line at {path}:{line_no}"))?
             .parse()
-            .context("Invalid LN: value")?;
+            .with_context(|| format!("Invalid LN: value at {path}:{line_no}"))?;
 
         res.push((sn, ln));
     }

+ 33 - 9
src/io/fasta.rs

@@ -5,7 +5,11 @@
 //! transparently inside each function.
 
 use std::io::{BufRead, Write};
-use std::{fs::File, io::BufReader, path::{Path, PathBuf}};
+use std::{
+    fs::File,
+    io::BufReader,
+    path::{Path, PathBuf},
+};
 
 use anyhow::Context;
 use noodles_fasta::io::{BufReader as NoodlesBufReader, IndexedReader};
@@ -15,7 +19,9 @@ use noodles_fasta::io::{BufReader as NoodlesBufReader, IndexedReader};
 /// # Errors
 ///
 /// Returns an error if the FASTA or its index cannot be opened.
-pub fn open_indexed_fasta(reference: &Path) -> anyhow::Result<IndexedReader<NoodlesBufReader<File>>> {
+pub fn open_indexed_fasta(
+    reference: &Path,
+) -> anyhow::Result<IndexedReader<NoodlesBufReader<File>>> {
     noodles_fasta::io::indexed_reader::Builder::default()
         .build_from_path(reference)
         .map_err(|e| anyhow::anyhow!("Failed to open indexed FASTA {}: {e}", reference.display()))
@@ -37,9 +43,13 @@ pub fn sequence_at(
     pos0: usize,
     len: usize,
 ) -> anyhow::Result<String> {
+    anyhow::ensure!(len > 0, "FASTA window length must be > 0");
+
     let position = pos0 + 1; // 0-based → 1-based
     let start = position.saturating_sub(len / 2).max(1);
-    let end = start + len - 1;
+    let end = start
+        .checked_add(len - 1)
+        .context("FASTA window end coordinate overflow")?;
 
     let start = noodles_core::Position::try_from(start)?;
     let end = noodles_core::Position::try_from(end)?;
@@ -65,11 +75,16 @@ pub fn sequence_at(
 pub fn sequence_range(
     fasta_reader: &mut IndexedReader<noodles_fasta::io::BufReader<File>>,
     contig: &str,
-    start0: usize,          // 0-based inclusive
-    end0_inclusive: usize,  // 0-based inclusive
+    start0: usize,         // 0-based inclusive
+    end0_inclusive: usize, // 0-based inclusive
 ) -> anyhow::Result<String> {
+    anyhow::ensure!(
+        start0 <= end0_inclusive,
+        "FASTA range start ({start0}) is after end ({end0_inclusive})"
+    );
+
     let start = noodles_core::Position::try_from(start0 + 1)?;
-    let end   = noodles_core::Position::try_from(end0_inclusive + 1)?;
+    let end = noodles_core::Position::try_from(end0_inclusive + 1)?;
     let interval = noodles_core::region::interval::Interval::from(start..=end);
 
     let r = noodles_core::region::Region::new(contig.to_string(), interval);
@@ -79,7 +94,7 @@ pub fn sequence_range(
 
 /// A single-contig FASTA file produced by [`split_fasta`].
 pub struct ContigFasta {
-    pub name:       String,
+    pub name: String,
     pub fasta_path: PathBuf,
 }
 
@@ -113,7 +128,13 @@ pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFas
                 .next()
                 .unwrap_or("contig")
                 .chars()
-                .map(|c| if c.is_alphanumeric() || c == '_' || c == '-' { c } else { '_' })
+                .map(|c| {
+                    if c.is_alphanumeric() || c == '_' || c == '-' {
+                        c
+                    } else {
+                        '_'
+                    }
+                })
                 .collect();
 
             let path = out_dir.join(format!("{safe_name}.fasta"));
@@ -121,7 +142,10 @@ pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFas
                 .with_context(|| format!("Cannot create contig FASTA: {}", path.display()))?;
             writeln!(w, ">{name}")?;
 
-            contigs.push(ContigFasta { name: safe_name, fasta_path: path });
+            contigs.push(ContigFasta {
+                name: safe_name,
+                fasta_path: path,
+            });
             current_writer = Some(w);
         } else if let Some(ref mut w) = current_writer {
             writeln!(w, "{line}")?;

+ 5 - 2
src/io/liftover.rs

@@ -75,8 +75,11 @@ pub fn build_machine_from_chain<P: AsRef<Path>>(
     // truncating the chain file (which would corrupt all downstream liftover results).
     let normalized: Vec<u8> = reader
         .lines()
-        .map(|result| {
-            let line = result.context("I/O error reading chain file")?;
+        .enumerate()
+        .map(|(i, result)| {
+            let line_no = i + 1;
+            let line =
+                result.with_context(|| format!("I/O error reading chain file {p}:{line_no}"))?;
             let out = if line.starts_with("chain") || line.is_empty() {
                 line
             } else {

+ 90 - 49
src/io/modkit.rs

@@ -35,7 +35,7 @@ use std::{
 };
 
 use crate::{
-    helpers::TempFileGuard,
+    helpers::{diverging_rgb, TempFileGuard},
     io::{bed::read_bed, readers::fetch_tabix_lines_with},
     positions::GenomeRange,
 };
@@ -113,11 +113,20 @@ pub struct GeneActivity {
 /// - anything else → gene key = full name, kind [`RegionKind::Other`]
 pub fn parse_region_name(name: &str) -> ParsedName {
     if let Some(g) = name.strip_suffix("_prom") {
-        ParsedName { gene_key: g.to_string(), kind: RegionKind::Promoter }
+        ParsedName {
+            gene_key: g.to_string(),
+            kind: RegionKind::Promoter,
+        }
     } else if let Some(g) = name.strip_suffix("_body") {
-        ParsedName { gene_key: g.to_string(), kind: RegionKind::GeneBody }
+        ParsedName {
+            gene_key: g.to_string(),
+            kind: RegionKind::GeneBody,
+        }
     } else {
-        ParsedName { gene_key: name.to_string(), kind: RegionKind::Other }
+        ParsedName {
+            gene_key: name.to_string(),
+            kind: RegionKind::Other,
+        }
     }
 }
 
@@ -136,8 +145,8 @@ pub fn read_bed4(path: &str) -> Result<Vec<BedRegion>> {
         .map(|row| BedRegion {
             chrom: row.range.contig(),
             start: row.range.range.start as u64,
-            end:   row.range.range.end   as u64,
-            name:  row.name.unwrap_or_else(|| ".".to_string()),
+            end: row.range.range.end as u64,
+            name: row.name.unwrap_or_else(|| ".".to_string()),
         })
         .collect())
 }
@@ -164,12 +173,18 @@ pub fn region_fraction_modified(
     let mut sum_mod = 0u64;
 
     for s in &chrom_sites[i..] {
-        if s.start >= end { break; }
+        if s.start >= end {
+            break;
+        }
         sum_cov = sum_cov.saturating_add(s.nvalid_cov);
         sum_mod = sum_mod.saturating_add(s.nmod);
     }
 
-    if sum_cov == 0 { None } else { Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod)) }
+    if sum_cov == 0 {
+        None
+    } else {
+        Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))
+    }
 }
 
 /// Fetch methylation fraction for `region` directly from a Tabix-indexed pileup.
@@ -200,18 +215,26 @@ pub fn region_fraction_from_tabix(
 
     fetch_tabix_lines_with(pileup_bgz, region, |line| {
         let f: Vec<&str> = line.split('\t').collect();
-        let nvalid_cov: u64 = f.get(9)
-            .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10)"))?
-            .parse().context("Bad Nvalid_cov")?;
-        let nmod: u64 = f.get(11)
-            .ok_or_else(|| anyhow!("Missing Nmod (col 12)"))?
-            .parse().context("Bad Nmod")?;
+        let nvalid_cov: u64 = f
+            .get(9)
+            .ok_or_else(|| anyhow!("Missing Nvalid_cov (col 10) in {pileup_bgz}: {line}"))?
+            .parse()
+            .with_context(|| format!("Bad Nvalid_cov in {pileup_bgz}: {line}"))?;
+        let nmod: u64 = f
+            .get(11)
+            .ok_or_else(|| anyhow!("Missing Nmod (col 12) in {pileup_bgz}: {line}"))?
+            .parse()
+            .with_context(|| format!("Bad Nmod in {pileup_bgz}: {line}"))?;
         sum_cov = sum_cov.saturating_add(nvalid_cov);
         sum_mod = sum_mod.saturating_add(nmod);
         Ok(())
     })?;
 
-    if sum_cov == 0 { Ok(None) } else { Ok(Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))) }
+    if sum_cov == 0 {
+        Ok(None)
+    } else {
+        Ok(Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod)))
+    }
 }
 
 // ─── Activity computation ─────────────────────────────────────────────────────
@@ -242,9 +265,17 @@ pub fn compute_gene_activity(
     min_sum_cov: u64,
 ) -> Result<Vec<GeneActivity>> {
     #[derive(Clone)]
-    struct Part { chrom: String, start: u64, end: u64, frac: f64 }
+    struct Part {
+        chrom: String,
+        start: u64,
+        end: u64,
+        frac: f64,
+    }
     #[derive(Default)]
-    struct Pair { prom: Option<Part>, body: Option<Part> }
+    struct Pair {
+        prom: Option<Part>,
+        body: Option<Part>,
+    }
 
     let mut by_gene: HashMap<String, Pair> = HashMap::new();
 
@@ -252,12 +283,17 @@ pub fn compute_gene_activity(
         if r.end <= r.start {
             return Err(anyhow!(
                 "Invalid region end<=start {}:{}-{} ({})",
-                r.chrom, r.start, r.end, r.name
+                r.chrom,
+                r.start,
+                r.end,
+                r.name
             ));
         }
 
         let ParsedName { gene_key, kind } = parse_region_name(&r.name);
-        if kind == RegionKind::Other { continue; }
+        if kind == RegionKind::Other {
+            continue;
+        }
 
         let region = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32);
 
@@ -266,14 +302,21 @@ pub fn compute_gene_activity(
             None => continue,
         };
 
-        if sum_cov < min_sum_cov || frac <= 0.0 { continue; }
+        if sum_cov < min_sum_cov || frac <= 0.0 {
+            continue;
+        }
 
-        let part = Part { chrom: r.chrom.clone(), start: r.start, end: r.end, frac };
+        let part = Part {
+            chrom: r.chrom.clone(),
+            start: r.start,
+            end: r.end,
+            frac,
+        };
         let entry = by_gene.entry(gene_key).or_default();
         match kind {
             RegionKind::Promoter => entry.prom = Some(part),
             RegionKind::GeneBody => entry.body = Some(part),
-            RegionKind::Other    => {}
+            RegionKind::Other => {}
         }
     }
 
@@ -303,23 +346,6 @@ pub fn compute_gene_activity(
 
 // ─── Colour / score helpers ───────────────────────────────────────────────────
 
-/// Diverging blue→white→red palette for IGV `itemRgb`.
-///
-/// `v` is clipped to `[-clip, +clip]` and mapped to:
-/// `-clip` → `(0, 0, 255)`, `0` → `(255, 255, 255)`, `+clip` → `(255, 0, 0)`.
-pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
-    debug_assert!(clip > 0.0, "clip must be > 0");
-    let t = (v.clamp(-clip, clip) + clip) / (2.0 * clip);
-    if t < 0.5 {
-        let u = t / 0.5;
-        ((255.0 * u).round() as u8, (255.0 * u).round() as u8, 255)
-    } else {
-        let u = (t - 0.5) / 0.5;
-        let c = (255.0 * (1.0 - u)).round() as u8;
-        (255, c, c)
-    }
-}
-
 /// Map a log2-ratio to a BED `score` field (0–1000).
 ///
 /// `v` is clipped to `[-clip, +clip]` then linearly scaled:
@@ -369,15 +395,22 @@ pub fn write_gene_activity_bed9(
         writeln!(
             w,
             "{}\t{}\t{}\t{}={:.4}\t{}\t.\t{}\t{}\t{},{},{}",
-            a.chrom, a.start, a.end,
-            a.gene_key, a.score,
+            a.chrom,
+            a.start,
+            a.end,
+            a.gene_key,
+            a.score,
             bed_score_from_log2_ratio(a.score, clip),
-            a.start, a.end,
-            rr, gg, bb,
+            a.start,
+            a.end,
+            rr,
+            gg,
+            bb,
         )?;
     }
 
-    w.flush().with_context(|| format!("failed to flush: {}", tmp_path.display()))?;
+    w.flush()
+        .with_context(|| format!("failed to flush: {}", tmp_path.display()))?;
     drop(w);
 
     fs::rename(&tmp_path, path)
@@ -420,9 +453,9 @@ pub fn pileup_to_activity_bed9(
 
 #[cfg(test)]
 mod tests {
-    use log::info;
     use super::*;
     use crate::{config::Config, helpers::test_init};
+    use log::info;
 
     #[test]
     fn modkit_activity_igv() -> anyhow::Result<()> {
@@ -434,15 +467,23 @@ mod tests {
         for id in ["DUMCO", "CHAHA"] {
             let path = format!(
                 "{}/{}_{}_modkit_pileup.bed.gz",
-                config.tumoral_dir(id), id, config.tumoral_name
+                config.tumoral_dir(id),
+                id,
+                config.tumoral_name
             );
             let out = format!(
                 "{}/{}_{}_modkit_activity.bed",
-                config.tumoral_dir(id), id, config.tumoral_name
+                config.tumoral_dir(id),
+                id,
+                config.tumoral_name
             );
             let n = pileup_to_activity_bed9(
-                &path, regions_bed4_path, 100, &out,
-                &format!("{id} Gene Activity"), 2.0,
+                &path,
+                regions_bed4_path,
+                100,
+                &out,
+                &format!("{id} Gene Activity"),
+                2.0,
             )?;
             info!("{id}: {n} gene activities written");
         }

+ 15 - 9
src/io/readers.rs

@@ -140,9 +140,7 @@ pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
 
     debug!("Compressing {input_path} → {output_path}");
 
-    let tmp_dir = Path::new(input_path)
-        .parent()
-        .unwrap_or(Path::new("."));
+    let tmp_dir = Path::new(input_path).parent().unwrap_or(Path::new("."));
 
     let mut guard = TempFileGuard::new();
     let tmp_path = guard.tmp_path(".gz", tmp_dir);
@@ -158,7 +156,9 @@ pub fn compress_to_bgzip(input_path: &str) -> anyhow::Result<String> {
         let n = reader
             .read(&mut buffer)
             .with_context(|| format!("failed reading {input_path}"))?;
-        if n == 0 { break; }
+        if n == 0 {
+            break;
+        }
         writer
             .write_all(&buffer[..n])
             .with_context(|| format!("failed writing {tmp_str}"))?;
@@ -211,6 +211,10 @@ pub fn fetch_tabix_lines_with<F>(
 where
     F: FnMut(&str) -> anyhow::Result<()>,
 {
+    if region.range.is_empty() {
+        return Ok(());
+    }
+
     let tbi_path = format!("{bgz_path}.tbi");
 
     let index = tabix::fs::read(&tbi_path)
@@ -229,10 +233,10 @@ where
     };
 
     // 0-based half-open [start, end) → 1-based inclusive [start+1, end]
-    let interval_start = Position::try_from(region.range.start as usize + 1)
-        .context("region start overflow")?;
-    let interval_end = Position::try_from(region.range.end as usize)
-        .context("region end overflow")?;
+    let interval_start =
+        Position::try_from(region.range.start as usize + 1).context("region start overflow")?;
+    let interval_end =
+        Position::try_from(region.range.end as usize).context("region end overflow")?;
 
     let interval = Interval::from(interval_start..=interval_end);
     let chunks = index
@@ -258,7 +262,9 @@ where
                 break;
             }
             buf.clear();
-            let n = reader.read_line(&mut buf).context("BGZF read_line failed")?;
+            let n = reader
+                .read_line(&mut buf)
+                .context("BGZF read_line failed")?;
             if n == 0 {
                 break;
             }

+ 43 - 21
src/io/vcf.rs

@@ -27,7 +27,10 @@ use crate::{
     variant::vcf_variant::VcfVariant,
 };
 
-use super::{dict::read_dict, readers::{fetch_tabix_lines_with, get_reader}};
+use super::{
+    dict::read_dict,
+    readers::{fetch_tabix_lines_with, get_reader},
+};
 
 /// Load a BGZF-compressed or plain VCF file into memory.
 ///
@@ -50,14 +53,17 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
 
     let mut res = Vec::new();
     for (i, line) in reader.lines().enumerate() {
+        let line_no = i + 1;
         match line {
             Ok(line) => {
                 if line.starts_with('#') || line.is_empty() {
                     continue;
                 }
-                res.push(line.parse().context(format!("Can't parse {line}"))?);
+                res.push(line.parse().with_context(|| {
+                    format!("failed parsing VCF record at {path}:{line_no}: {line}")
+                })?);
             }
-            Err(e) => warn!("Can't read line {i}: {e}"),
+            Err(e) => warn!("Can't read {path}:{line_no}: {e}"),
         }
     }
 
@@ -107,12 +113,20 @@ pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
     writer.finish()
 }
 
-/// Fetch variants overlapping `region` from a Tabix-indexed BGZF VCF file.
+/// Fetch variants whose POS falls inside `region` from a Tabix-indexed BGZF VCF file.
 ///
 /// Delegates BGZF seeking and line extraction to [`fetch_tabix_lines`], then
-/// parses each line as a [`VcfVariant`]. No secondary position filter is applied
-/// beyond what Tabix provides — VCF variants are point features (POS = both
-/// tabix start and end), so false positives from the bin query are rare.
+/// parses each line as a [`VcfVariant`] and applies a secondary POS filter to
+/// discard any false positives from the Tabix bin query.
+///
+/// TODO: this is POS-based, not true interval-overlap fetching. Deletions and
+/// symbolic SVs with `INFO/END` can overlap a region even when their POS is
+/// outside it. Supporting that correctly should be done alongside explicit VCF
+/// span parsing, not inside this wrapper.
+///
+/// TODO: VCF allows `POS=0` as a telomere sentinel. Parsing preserves that as
+/// internal position `0`, but tabix region queries use 1-based coordinates and
+/// this wrapper does not currently perform a separate sentinel lookup.
 ///
 /// # Coordinate system
 ///
@@ -127,7 +141,7 @@ pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
 ///
 /// # Returns
 ///
-/// Parsed [`VcfVariant`]s overlapping `region`, in file order.
+/// Parsed [`VcfVariant`]s whose POS is inside `region`, in file order.
 ///
 /// # Errors
 ///
@@ -136,9 +150,12 @@ pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
 pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<VcfVariant>> {
     let mut variants = Vec::new();
     fetch_tabix_lines_with(bgz_path, region, |line| {
-        let v: VcfVariant = line.parse()
+        let v: VcfVariant = line
+            .parse()
             .with_context(|| format!("failed to parse VCF line: {line}"))?;
-        variants.push(v);
+        if v.position.contig == region.contig && region.range.contains(&v.position.position) {
+            variants.push(v);
+        }
         Ok(())
     })?;
     Ok(variants)
@@ -174,19 +191,21 @@ impl std::str::FromStr for VCFRow {
     fn from_str(s: &str) -> anyhow::Result<Self> {
         let f: Vec<&str> = s.split('\t').collect();
         let get = |i: usize, name: &str| -> anyhow::Result<&str> {
-            f.get(i).copied().ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})"))
+            f.get(i)
+                .copied()
+                .ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})"))
         };
         Ok(Self {
-            chr:       get(0, "chr")?.to_string(),
-            pos:       get(1, "pos")?.parse().context("bad pos")?,
-            id:        get(2, "id")?.to_string(),
+            chr: get(0, "chr")?.to_string(),
+            pos: get(1, "pos")?.parse().context("bad pos")?,
+            id: get(2, "id")?.to_string(),
             reference: get(3, "reference")?.to_string(),
-            alt:       get(4, "alt")?.to_string(),
-            qual:      get(5, "qual")?.to_string(),
-            filter:    get(6, "filter")?.to_string(),
-            info:      get(7, "info")?.to_string(),
-            format:    get(8, "format")?.to_string(),
-            value:     get(9, "value")?.to_string(),
+            alt: get(4, "alt")?.to_string(),
+            qual: get(5, "qual")?.to_string(),
+            filter: get(6, "filter")?.to_string(),
+            info: get(7, "info")?.to_string(),
+            format: get(8, "format")?.to_string(),
+            value: get(9, "value")?.to_string(),
         })
     }
 }
@@ -220,7 +239,10 @@ pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
     );
     header.push("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">".to_string());
     header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
-    header.push(format!("##Pandora_lib_version={}", env!("CARGO_PKG_VERSION")));
+    header.push(format!(
+        "##Pandora_lib_version={}",
+        env!("CARGO_PKG_VERSION")
+    ));
 
     read_dict(dict)?
         .into_iter()

+ 93 - 25
src/io/writers.rs

@@ -33,10 +33,7 @@ use crate::{helpers::TempFileGuard, io::readers::get_reader};
 ///
 /// Returns an error if the path does not end in `.gz`, removal fails when
 /// `force` is set, or the file cannot be created.
-pub fn get_gz_writer(
-    path: &str,
-    force: bool,
-) -> anyhow::Result<bgzf::io::Writer<BufWriter<File>>> {
+pub fn get_gz_writer(path: &str, force: bool) -> anyhow::Result<bgzf::io::Writer<BufWriter<File>>> {
     if !path.ends_with(".gz") {
         anyhow::bail!("file should end with .gz: {path}");
     }
@@ -184,8 +181,9 @@ pub enum IndexFormat {
 ///
 /// Writes data lines to a BGZF-compressed file while simultaneously building
 /// a Tabix index. On [`finish`](BgzTabixWriter::finish), both the `.gz` and
-/// `.tbi` files are written atomically using a UUID temp file and [`TempFileGuard`],
-/// so a failed write never leaves a corrupt output behind.
+/// `.tbi` files are written through UUID temp files registered with
+/// [`TempFileGuard`], so a failed write cleans up temporary outputs instead of
+/// leaving corrupt partial temp files behind.
 ///
 /// # Coordinate systems
 ///
@@ -193,11 +191,14 @@ pub enum IndexFormat {
 /// `[start, end)`. Use [`write_record`](BgzTabixWriter::write_record) with
 /// pre-converted 1-based [`Position`] values for any format, or the convenience
 /// method [`write_bed_record`](BgzTabixWriter::write_bed_record) which performs
-/// the BED  Tabix conversion automatically:
+/// the BED -> Tabix conversion automatically:
 ///
 /// ```text
 /// tabix_start = bed_start + 1        (0-based inclusive → 1-based inclusive)
 /// tabix_end   = bed_end              (0-based exclusive = 1-based inclusive numerically)
+///
+/// For zero-width BED insertion features (start == end), the original BED line
+/// is preserved and indexed using a one-base proxy: [start + 1, start + 1].
 /// ```
 ///
 /// # Example
@@ -284,7 +285,9 @@ impl BgzTabixWriter {
     ///
     /// Returns an error if the write fails.
     pub fn write_header(&mut self, line: &[u8]) -> anyhow::Result<()> {
-        self.writer.write_all(line).context("failed writing header line")?;
+        self.writer
+            .write_all(line)
+            .context("failed writing header line")?;
         Ok(())
     }
 
@@ -343,7 +346,9 @@ impl BgzTabixWriter {
         self.prev_start = Some(start);
 
         let chunk_start = self.writer.virtual_position();
-        self.writer.write_all(line).context("failed writing record")?;
+        self.writer
+            .write_all(line)
+            .context("failed writing record")?;
         let chunk_end = self.writer.virtual_position();
 
         self.indexer
@@ -359,6 +364,9 @@ impl BgzTabixWriter {
     /// - `tabix_start = start0 + 1`  (0-based inclusive → 1-based inclusive)
     /// - `tabix_end   = end0`        (0-based exclusive = 1-based inclusive numerically)
     ///
+    /// Zero-width BED insertion features (`start0 == end0`) are indexed using
+    /// a one-base proxy at `start0 + 1`. The written BED line remains unchanged.
+    ///
     /// # Arguments
     ///
     /// * `line` - Raw bytes of the BED line (including the trailing newline)
@@ -376,48 +384,58 @@ impl BgzTabixWriter {
         start0: u32,
         end0: u32,
     ) -> anyhow::Result<()> {
-        let start = Position::try_from(start0 as usize + 1)
-            .context("BED: start coordinate overflow to tabix Position")?;
-        let end = Position::try_from(end0 as usize)
-            .context("BED: end must be >= 1 for tabix indexing")?;
+        let (start, end) = bed_to_tabix_interval(start0, end0)?;
         self.write_record(line, rname, start, end)
     }
 
-    /// Finalise the BGZF file, write the `.tbi` index, and atomically rename
-    /// the temp file to the output path.
+    /// Finalise the BGZF file, write the `.tbi` index, and rename both temp
+    /// files to their output paths.
     ///
-    /// The [`TempFileGuard`] is disarmed on success; on failure it removes the
-    /// temp file automatically.
+    /// The index is written to a [`TempFileGuard`]-tracked temp path first. This
+    /// avoids leaving a stale final `.tbi` behind if BGZF finalisation or rename
+    /// fails before the `.gz` is in place.
     ///
     /// # Errors
     ///
     /// Returns an error if BGZF finalisation, index writing, or the rename fails.
-    /// Finalise the BGZF file, write the index, and atomically rename the temp
-    /// file to the output path.
+    /// Finalise the BGZF file, write the index, and rename temp files to the
+    /// output paths.
     ///
     /// The index extension matches the format: `.tbi` for [`IndexFormat::Tbi`],
     /// `.csi` for [`IndexFormat::Csi`] (not yet implemented).
     ///
-    /// The [`TempFileGuard`] is disarmed on success; on failure it removes the
-    /// temp file automatically.
+    /// The [`TempFileGuard`] is disarmed on success; on failure it removes any
+    /// remaining temp files automatically.
     ///
     /// # Errors
     ///
     /// Returns an error if BGZF finalisation, index writing, or the rename fails.
     pub fn finish(self) -> anyhow::Result<()> {
-        let Self { writer, indexer, format, tmp_path, output_path, mut guard, .. } = self;
+        let Self {
+            writer,
+            indexer,
+            format,
+            tmp_path,
+            output_path,
+            mut guard,
+            ..
+        } = self;
 
         let tmp_str = tmp_path.to_string_lossy().to_string();
+        let tmp_dir = tmp_path.parent().unwrap_or(Path::new(".")).to_path_buf();
 
         finalize_bgzf_file(writer, &tmp_str)?;
 
         let index = indexer.build();
+        let tbi_path = format!("{output_path}.tbi");
+        let tmp_tbi_path = guard.tmp_path(".tbi", &tmp_dir);
+        let tmp_tbi_str = tmp_tbi_path.to_string_lossy().to_string();
 
         match format {
             IndexFormat::Tbi => {
-                let tbi_path = format!("{output_path}.tbi");
-                tabix::fs::write(&tbi_path, &index)
-                    .with_context(|| format!("failed writing tabix index: {tbi_path}"))?;
+                tabix::fs::write(&tmp_tbi_path, &index).with_context(|| {
+                    format!("failed writing temporary tabix index: {tmp_tbi_str}")
+                })?;
             }
             IndexFormat::Csi => {
                 // Unreachable: new() rejects Csi before any writing begins.
@@ -429,20 +447,70 @@ impl BgzTabixWriter {
             fs::remove_file(&output_path)
                 .with_context(|| format!("failed to remove existing output: {output_path}"))?;
         }
+        if Path::new(&tbi_path).exists() {
+            fs::remove_file(&tbi_path)
+                .with_context(|| format!("failed to remove existing tabix index: {tbi_path}"))?;
+        }
 
         fs::rename(&tmp_path, &output_path)
             .with_context(|| format!("failed to rename {tmp_str} → {output_path}"))?;
+        if let Err(e) = fs::rename(&tmp_tbi_path, &tbi_path) {
+            let _ = fs::remove_file(&output_path);
+            return Err(e).with_context(|| format!("failed to rename {tmp_tbi_str} → {tbi_path}"));
+        }
 
         guard.disarm();
         Ok(())
     }
 }
 
+/// Convert BED 0-based half-open coordinates into 1-based inclusive Tabix positions.
+///
+/// Zero-width BED features (`start0 == end0`) are valid insertion features in
+/// UCSC BED. They cannot be represented as a zero-length 1-based inclusive
+/// interval, so they are indexed using a one-base proxy at `start0 + 1` while
+/// preserving the original BED line in the BGZF file.
+pub fn bed_to_tabix_interval(start0: u32, end0: u32) -> anyhow::Result<(Position, Position)> {
+    anyhow::ensure!(
+        end0 >= start0,
+        "BED: end ({end0}) is before start ({start0})"
+    );
+
+    let start = Position::try_from(start0 as usize + 1)
+        .context("BED: start coordinate overflow to tabix Position")?;
+    let end1 = if end0 == start0 {
+        start0 as usize + 1
+    } else {
+        end0 as usize
+    };
+    let end = Position::try_from(end1).context("BED: end coordinate overflow to tabix Position")?;
+
+    Ok((start, end))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::helpers::test_init;
 
+    #[test]
+    fn bed_to_tabix_interval_supports_zero_width_features() -> anyhow::Result<()> {
+        let (start, end) = bed_to_tabix_interval(0, 0)?;
+        assert_eq!(usize::from(start), 1);
+        assert_eq!(usize::from(end), 1);
+
+        let (start, end) = bed_to_tabix_interval(10, 10)?;
+        assert_eq!(usize::from(start), 11);
+        assert_eq!(usize::from(end), 11);
+
+        let (start, end) = bed_to_tabix_interval(10, 20)?;
+        assert_eq!(usize::from(start), 11);
+        assert_eq!(usize::from(end), 20);
+
+        assert!(bed_to_tabix_interval(20, 10).is_err());
+        Ok(())
+    }
+
     #[test]
     fn bgz_convert() -> anyhow::Result<()> {
         test_init();