2 Комити a0f6db44b0 ... bf12d05d90

Аутор SHA1 Порука Датум
  Thomas bf12d05d90 longcallD пре 1 месец
  Thomas 3ea0a26d87 multiallelic normalization (just for INFO FREQ of dbsnp) for fetch_vcf, prefer normalized vcf... пре 1 месец

+ 11 - 0
pandora-config.example.toml

@@ -515,6 +515,17 @@ wtdbg2_threads = 8
 # Memory for SLURM. wtdbg2 is lightweight — 16G is ample for local assembly.
 wtdbg2_slurm_mem = "16G"
 
+#######################################
+# longcallD
+#######################################
+# Template for the longcallD output directory (solo and normal/tumor runs).
+#
+# Required placeholders: `{result_dir}`, `{id}`, `{time}`.
+longcalld_output_dir = "{result_dir}/{id}/{time}/longcallD"
+longcalld_bin = "/home/t_steimle/somatic_pipe_tools/longcallD-v0.0.10_x64-linux/longcallD"
+longcalld_threads = 10
+longcalld_slurm_mem = "40G"
+
 #######################################
 # Marlin
 #######################################

+ 6 - 17
src/callers/deep_variant.rs

@@ -387,22 +387,6 @@ impl JobCommand for DeepVariant {
 impl LocalRunner for DeepVariant {}
 impl LocalBatchRunner for DeepVariant {}
 
-impl SlurmRunner for DeepVariant {
-    fn slurm_args(&self) -> Vec<String> {
-        let batch_id = self.part_index.map(|i| format!("_{i}")).unwrap_or_default();
-        SlurmParams {
-            job_name: Some(format!(
-                "deepvariant_{}_{}{}",
-                self.id, self.time_point, batch_id
-            )),
-            cpus_per_task: Some(self.config.deepvariant_threads.into()),
-            mem: Some("40G".into()),
-            partition: Some("shortq".into()),
-            gres: None,
-        }
-        .to_args()
-    }
-}
 impl SbatchRunner for DeepVariant {
     fn slurm_params(&self) -> SlurmParams {
         let batch_id = self.part_index.map(|i| format!("_{i}")).unwrap_or_default();
@@ -419,6 +403,12 @@ impl SbatchRunner for DeepVariant {
     }
 }
 
+impl SlurmRunner for DeepVariant {
+    fn slurm_args(&self) -> Vec<String> {
+        self.slurm_params().to_args()
+    }
+}
+
 impl DeepVariant {
     /// Returns the per-part output directory.
     ///
@@ -823,7 +813,6 @@ pub fn run_deepvariant_chunked(
         .collect::<Vec<String>>();
 
     let actual_n_parts = region_chunks.len();
-
     info!(
         "Running DeepVariant in {} parallel parts for {} {}",
         actual_n_parts, id, time_point

+ 457 - 0
src/callers/longcallD.rs

@@ -0,0 +1,457 @@
+use std::{
+    fmt, fs,
+    path::{Path, PathBuf},
+};
+
+use anyhow::Context;
+use log::info;
+use rust_htslib::bam::{self, Read};
+use uuid::Uuid;
+
+use crate::{
+    commands::{
+        bcftools::{BcftoolsConcat, BcftoolsKeepPass},
+        samtools::{SamtoolsIndex, SamtoolsMergeMany},
+        Command as JobCommand, LocalBatchRunner, LocalRunner, SbatchRunner, SlurmParams,
+        SlurmRunner,
+    },
+    config::Config,
+    helpers::{get_genome_sizes_in_header_order, split_ordered_genome_into_n_regions_exact},
+    locker::SampleLock,
+    pipes::{InitializeSolo, ShouldRun},
+    run, run_many,
+    runners::Run,
+};
+
+#[derive(Debug, Clone)]
+pub struct LongcallD {
+    /// Sample identifier
+    pub case_id: String,
+    pub time_point: String,
+    /// Global pipeline configuration
+    pub bin: String,
+    pub reference: String,
+    pub threads: u8,
+    pub slurm_mem: String,
+    /// Directory for log file storage
+    pub log_dir: String,
+    pub output_dir: String,
+    pub final_vcf: String,
+    /// Optional genomic region restriction (format: "chr:start-end")
+    pub regions: String,
+    /// Optional part index for chunked parallel runs (1-indexed)
+    pub part_index: Option<usize>,
+    pub config: Config,
+}
+
+impl fmt::Display for LongcallD {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "🧬 LongcallD Solo")?;
+        writeln!(f, "   Case ID    : {}", self.case_id)?;
+        writeln!(f, "   Time point : {}", self.time_point)?;
+        writeln!(f, "   Regions    : {}", self.regions)?;
+        writeln!(f, "   Log dir    : {}", self.log_dir)?;
+        writeln!(
+            f,
+            "   Part       : {}",
+            self.part_index
+                .map_or("full".into(), |n| format!("part{n}"))
+        )
+    }
+}
+
+impl InitializeSolo for LongcallD {
+    fn initialize(id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
+        let case_id = id.to_string();
+        let time_point = time.to_string();
+
+        info!("Initializing LongcallD for {case_id} {time}");
+        let log_dir = format!("{}/{}/log/longcallD", config.result_dir, &id);
+
+        let output_dir = config.longcalld_output_dir(&case_id, &time_point);
+
+        let final_vcf = config.longcalld_solo_passed_vcf(&case_id, &time_point);
+
+        Ok(Self {
+            case_id,
+            time_point,
+            bin: config.longcalld_bin.clone(),
+            reference: config.reference.clone(),
+            threads: config.longcalld_threads,
+            slurm_mem: config.longcalld_slurm_mem.clone(),
+            log_dir,
+            output_dir,
+            final_vcf,
+            regions: String::default(),
+            part_index: None,
+            config: config.clone(),
+        })
+    }
+}
+
+impl ShouldRun for LongcallD {
+    fn should_run(&self) -> bool {
+        // should inspect outputs
+        true
+    }
+}
+
+impl LongcallD {
+    /// Returns the per-part output directory.
+    ///
+    /// For partitioned runs, creates an isolated subdirectory to prevent
+    /// intermediate file collisions between parallel jobs.
+    fn part_output_dir(&self) -> String {
+        match self.part_index {
+            Some(idx) => format!("{}/part{idx}", self.output_dir),
+            None => self.output_dir.clone(),
+        }
+    }
+
+    /// Returns the output VCF path for this run.
+    ///
+    /// For partitioned runs, the file is placed in a part-specific subdirectory.
+    fn output_vcf_path(&self) -> String {
+        let output_dir = self.part_output_dir();
+
+        format!(
+            "{output_dir}/{}_{}_longcallD.vcf.gz",
+            self.case_id, self.time_point
+        )
+    }
+
+    fn output_bam_path(&self) -> String {
+        let output_dir = self.part_output_dir();
+
+        format!(
+            "{output_dir}/{}_{}_longcallD.bam",
+            self.case_id, self.time_point
+        )
+    }
+
+    /// Returns the PASS-filtered VCF path.
+    ///
+    /// - For partitioned runs: part-specific path for intermediate PASS VCF
+    /// - For single runs or final output: canonical path from config
+    ///
+    /// The final merged PASS VCF always uses the canonical path regardless
+    /// of whether it was produced by a single run or merged from parts.
+    fn passed_vcf_path(&self) -> String {
+        match self.part_index {
+            Some(_) => {
+                // Part runs: intermediate PASS VCF in part-specific directory
+                self.output_vcf_path().replace(".vcf.gz", ".pass.vcf.gz")
+            }
+            None => {
+                // Single run or final merged output: canonical path
+                self.final_vcf.clone()
+            }
+        }
+    }
+
+    /// Filters variants to keep only PASS entries.
+    fn filter_pass(&self) -> anyhow::Result<()> {
+        let output_vcf = self.output_vcf_path();
+        let vcf_passed = self.passed_vcf_path();
+
+        info!(
+            "Filtering PASS variants for {} {} (part: {:?})",
+            self.case_id, self.time_point, self.part_index
+        );
+
+        let mut cmd = BcftoolsKeepPass::from_config(&self.config, output_vcf, vcf_passed);
+        let report = run!(&self.config, &mut cmd)?;
+
+        report
+            .save_to_file(format!("{}/bcftools_pass_", self.log_dir))
+            .context("Can't save bcftools PASS logs")?;
+
+        Ok(())
+    }
+}
+
+impl JobCommand for LongcallD {
+    /// Initializes output and log directories for this run.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if directory creation fails.
+    fn init(&mut self) -> anyhow::Result<()> {
+        // Part-specific output dir
+        let output_dir = self.part_output_dir();
+
+        fs::create_dir_all(&output_dir).context(format!("Failed to create dir: {output_dir}"))?;
+
+        // log dir
+        fs::create_dir_all(&self.log_dir)
+            .context(format!("Failed to create dir: {}", self.log_dir))?;
+
+        Ok(())
+    }
+
+    //longcallD call -t16 ref.fa ont.bam --ont --refine-aln -b ont_phased_refined.bam > ont.vcf # output phased & refined ONT reads (BAM tag: HP & PS)
+    /// Generates the longcallD command string for execution.
+    fn cmd(&self) -> String {
+        format!(
+            "{bin} call \
+            -t{threads} \
+            {reference} {bam} {regions} --ont \
+            --refine-aln \
+            -b {out_bam} > {out_vcf}",
+            bin = self.bin,
+            threads = self.threads,
+            reference = self.reference,
+            regions = self.regions,
+            bam = self.config.solo_bam(&self.case_id, &self.time_point),
+            out_bam = self.output_bam_path(),
+            out_vcf = self.output_vcf_path()
+        )
+    }
+}
+
+impl Run for LongcallD {
+    fn run(&mut self) -> anyhow::Result<()> {
+        let lock_dir = format!("{}/locks", self.config.result_dir);
+        let _lock = SampleLock::acquire(
+            &lock_dir,
+            &format!("{}-{}", self.case_id, self.time_point),
+            "longcalld",
+        )
+        .with_context(|| format!("Cannot start longcallD chunked for {}", self.case_id))?;
+
+        if !self.should_run() {
+            info!("longcallD is up-to-data.");
+            return Ok(());
+        }
+
+        if self.config.slurm_runner {
+            run_chunked(&self.case_id, &self.time_point, &self.config, 30)
+        } else {
+            run!(&self.config, self)?;
+            self.filter_pass()
+        }
+    }
+}
+
+fn merge_parts(base: &LongcallD, n_parts: usize) -> anyhow::Result<()> {
+    let mut part_pass_paths: Vec<PathBuf> = Vec::with_capacity(n_parts);
+
+    for i in 1..=n_parts {
+        let mut dv = base.clone();
+        dv.part_index = Some(i);
+        let part_pass = dv.passed_vcf_path();
+
+        anyhow::ensure!(
+            Path::new(&part_pass).exists(),
+            "Missing part {i} PASS VCF: {part_pass}"
+        );
+
+        part_pass_paths.push(PathBuf::from(part_pass));
+    }
+
+    let final_passed_vcf = base
+        .config
+        .longcalld_solo_passed_vcf(&base.case_id, &base.time_point);
+    let rand = Uuid::new_v4();
+    let final_tmp = format!("{final_passed_vcf}_{rand}.tmp");
+    let final_tmp_csi = format!("{final_passed_vcf}_{rand}.tmp.csi");
+
+    if let Some(parent) = Path::new(&final_passed_vcf).parent() {
+        fs::create_dir_all(parent)?;
+    }
+
+    info!(
+        "Concatenating {} part VCFs into {}",
+        n_parts, final_passed_vcf
+    );
+
+    let mut concat = BcftoolsConcat::from_config(&base.config, part_pass_paths, &final_tmp);
+    run!(&base.config, &mut concat)
+        .context("Failed to run bcftools concat for longcallD parts")?
+        .save_to_file(format!("{}/bcftools_concat", base.log_dir))?;
+
+    fs::rename(&final_tmp, &final_passed_vcf)
+        .context("Failed to rename merged longcallD PASS VCF")?;
+
+    fs::rename(&final_tmp_csi, format!("{final_passed_vcf}.csi"))
+        .context("Failed to rename merged longcallD PASS VCF CSI")?;
+
+    info!(
+        "Successfully merged {} parts into {}",
+        n_parts, final_passed_vcf
+    );
+
+    Ok(())
+}
+
+fn run_chunked(
+    case_id: &str,
+    time_point: &str,
+    config: &Config,
+    n_parts: usize,
+) -> anyhow::Result<()> {
+    anyhow::ensure!(n_parts > 0, "n_parts must be > 0");
+
+    let base = LongcallD::initialize(case_id, time_point, config)?;
+
+    let bam_path = config.solo_bam(case_id, time_point);
+    let reader = bam::Reader::from_path(&bam_path)
+        .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
+    let genome_sizes = get_genome_sizes_in_header_order(reader.header())?;
+
+    let region_chunks = split_ordered_genome_into_n_regions_exact(&genome_sizes, n_parts)
+        .into_iter()
+        .map(|v| v.join(" "))
+        .collect::<Vec<String>>();
+
+    let actual_n_parts = region_chunks.len();
+    info!(
+        "Running longcallD in {} parallel parts for {} {}",
+        actual_n_parts, case_id, time_point
+    );
+
+    let mut jobs = Vec::with_capacity(actual_n_parts);
+    for (i, regions) in region_chunks.into_iter().enumerate() {
+        let mut job = base.clone();
+        job.regions = regions;
+        job.part_index = Some(i + 1);
+        job.log_dir = format!("{}/part{}", base.log_dir, i + 1);
+        info!("Planned longcallD job:\n{job}");
+        jobs.push(job);
+    }
+
+    // Run all longcallD jobs
+    let outputs = run_many!(config, jobs.clone())?;
+    for output in outputs.iter() {
+        output.save_to_file(format!("{}/longcallD_", base.log_dir))?;
+    }
+
+    // Filter PASS variants for each part
+    info!(
+        "Filtering PASS variants for all {} parts in parallel",
+        actual_n_parts
+    );
+    let filter_jobs: Vec<_> = jobs
+        .iter()
+        .map(|job| {
+            BcftoolsKeepPass::from_config(&job.config, job.output_vcf_path(), job.passed_vcf_path())
+        })
+        .collect();
+    run_many!(config, filter_jobs)?
+        .iter()
+        .enumerate()
+        .try_for_each(|(i, output)| {
+            output
+                .save_to_file(format!("{}/bcftools_pass_part{}_", base.log_dir, i + 1))
+                .map(|_| ())
+        })?;
+
+    // Merge PASS VCFs
+    merge_parts(&base, actual_n_parts)?;
+
+    // Index per-part BAMs before merging
+    info!("Indexing {} part BAMs in parallel", actual_n_parts);
+    let index_jobs: Vec<_> = jobs
+        .iter()
+        .map(|job| SamtoolsIndex::from_config(&job.config, &job.output_bam_path()))
+        .collect();
+    run_many!(config, index_jobs)?
+        .iter()
+        .enumerate()
+        .try_for_each(|(i, output)| {
+            output
+                .save_to_file(format!("{}/samtools_index_part{}_", base.log_dir, i + 1))
+                .map(|_| ())
+        })?;
+
+    // Merge part BAMs into final BAM
+    let final_bam = PathBuf::from(
+        base.config
+            .longcalld_solo_bam(&base.case_id, &base.time_point),
+    );
+    let part_bams: Vec<PathBuf> = jobs
+        .iter()
+        .map(|job| PathBuf::from(job.output_bam_path()))
+        .collect();
+
+    info!(
+        "Merging {} part BAMs into {}",
+        actual_n_parts,
+        final_bam.display()
+    );
+    run!(
+        config,
+        &mut SamtoolsMergeMany::from_config(final_bam.clone(), part_bams, config)
+    )
+    .context("Failed to merge longcallD part BAMs")?
+    .save_to_file(format!("{}/samtools_merge_", base.log_dir))?;
+
+    // Index final merged BAM
+    run!(
+        config,
+        &mut SamtoolsIndex::from_config(config, final_bam.to_str().unwrap())
+    )
+    .context("Failed to index merged longcallD BAM")?
+    .save_to_file(format!("{}/samtools_index_final_", base.log_dir))?;
+
+    info!(
+        "LongcallD completed for {} {}: {} parts merged",
+        case_id, time_point, actual_n_parts
+    );
+
+    Ok(())
+}
+
+impl LocalRunner for LongcallD {}
+impl LocalBatchRunner for LongcallD {}
+
+impl SbatchRunner for LongcallD {
+    fn slurm_params(&self) -> SlurmParams {
+        let batch_id = self.part_index.map(|i| format!("_{i}")).unwrap_or_default();
+        SlurmParams {
+            job_name: Some(format!(
+                "longcallD_{}_{}{}",
+                self.case_id, self.time_point, batch_id
+            )),
+            cpus_per_task: Some(self.threads.into()),
+            mem: Some(self.slurm_mem.clone()),
+            partition: Some("shortq".into()),
+            gres: None,
+        }
+    }
+}
+
+impl SlurmRunner for LongcallD {
+    fn slurm_args(&self) -> Vec<String> {
+        self.slurm_params().to_args()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::helpers::test_init;
+
+    use super::*;
+
+    // #[test]
+    // fn deepvariant_version() -> anyhow::Result<()> {
+    //     test_init();
+    //     let vl = DeepVariant::version(&Config::default())?;
+    //     info!("DeepVariant local version: {vl}");
+    //     let vs = DeepVariant::version_slurm(&Config::default())?;
+    //     info!("DeepVariant slurm version: {vs}");
+    //     assert_eq!(vl, vs);
+    //     Ok(())
+    // }
+
+    #[test]
+    fn longcalld_run() -> anyhow::Result<()> {
+        test_init();
+        let config = Config::default();
+
+        for id in ["CML2518"] {
+            crate::runners::Run::run(&mut LongcallD::initialize(id, "diag", &config)?)?;
+        }
+        Ok(())
+    }
+}

+ 1 - 0
src/callers/mod.rs

@@ -157,6 +157,7 @@ pub mod nanomonsv;
 pub mod savana;
 pub mod severus;
 pub mod straglr;
+pub mod longcallD;
 
 /// Runs all somatic variant callers sequentially for comprehensive multi-caller analysis.
 ///

+ 39 - 0
src/config.rs

@@ -545,6 +545,15 @@ pub struct Config {
     pub wtdbg2_bin: String,
     pub wtdbg2_threads: u8,
     pub wtdbg2_slurm_mem: String,
+
+    // === longcallD configuration ===
+    /// Template for the longcallD output directory (solo and normal/tumor runs).
+    ///
+    /// Placeholders: `{result_dir}`, `{id}`, `{time}`.
+    pub longcalld_output_dir: String,
+    pub longcalld_bin: String,
+    pub longcalld_threads: u8,
+    pub longcalld_slurm_mem: String,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -1227,6 +1236,36 @@ impl Config {
             .replace("{id}", id)
             .replace("{time}", time)
     }
+
+    /// longcallD output directory (`{result_dir}`, `{id}`, `{time}`).
+    pub fn longcalld_output_dir(&self, id: &str, time: &str) -> String {
+        self.longcalld_output_dir
+            .replace("{result_dir}", &self.result_dir)
+            .replace("{id}", id)
+            .replace("{time}", time)
+    }
+
+    /// longcallD solo *PASSED* VCF for `<id>, <time>`.
+    pub fn longcalld_solo_passed_vcf(&self, id: &str, time: &str) -> String {
+        format!(
+            "{}/{}_{}_longcallD_PASSED.vcf.gz",
+            self.longcalld_output_dir(id, time),
+            id,
+            time
+        )
+    }
+
+    pub fn longcalld_solo_bam(&self, id: &str, time: &str) -> String {
+        format!(
+            "{}/{}_{}_{}_{}_realn.bam",
+            self.solo_dir(id, time),
+            id,
+            time,
+            self.reference_name,
+            self.haplotagged_bam_tag_name
+        )
+    }
+
 }
 
 impl Default for Config {

+ 42 - 0
src/io/fasta.rs

@@ -154,3 +154,45 @@ pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFas
 
     Ok(contigs)
 }
+
+/// Read a single-contig FASTA produced by split_fasta.
+/// Skips the header line, uppercases, replaces non-ACGT with N.
+/// Pure std — no noodles needed since we wrote the file ourselves.
+pub fn read_single_contig_fasta(path: &Path) -> anyhow::Result<Vec<u8>> {
+    use std::io::BufRead;
+    let f = std::fs::File::open(path)?;
+    let reader = std::io::BufReader::with_capacity(8 * 1024 * 1024, f);
+    let mut seq = Vec::new();
+    for line in reader.lines() {
+        let line = line?;
+        if line.starts_with('>') { continue; }
+        let sanitized = line.bytes().map(|b| match b.to_ascii_uppercase() {
+            b'A' | b'C' | b'G' | b'T' => b.to_ascii_uppercase(),
+            _ => b'N',
+        });
+        seq.extend(sanitized);
+    }
+    anyhow::ensure!(!seq.is_empty(), "empty contig in {}", path.display());
+    Ok(seq)
+}
+
+pub struct FaiEntry {
+    pub name: String,
+    pub len:  usize,
+}
+
+pub fn read_fai(path: &Path) -> anyhow::Result<Vec<FaiEntry>> {
+    use std::io::BufRead;
+    let f = std::fs::File::open(path)?;
+    std::io::BufReader::new(f)
+        .lines()
+        .map(|l| {
+            let l = l?;
+            let mut cols = l.split('\t');
+            let name = cols.next().context("fai: missing name")?.to_owned();
+            let len  = cols.next().context("fai: missing len")?.parse::<usize>()
+                           .context("fai: bad len")?;
+            Ok(FaiEntry { name, len })
+        })
+        .collect()
+}

+ 2 - 2
src/io/vcf.rs

@@ -154,7 +154,7 @@ pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<Vcf
             .parse()
             .with_context(|| format!("failed to parse VCF line: {line}"))?;
         if v.position.contig == region.contig && region.range.contains(&v.position.position) {
-            for split in from_multiallelic(v) {
+            for split in from_multiallelic(v)? {
                 variants.push(split);
             }
         }
@@ -263,7 +263,7 @@ mod tests {
     #[test]
     fn vcf_dbsnp() -> anyhow::Result<()> {
         let config = Config::default();
-        let v = fetch_vcf(&config.db_snp, &GenomeRange::new("chr1", 5656, 5657))?;
+        let v = fetch_vcf(&config.db_snp, &GenomeRange::new("chr1", 5656, 5660))?;
         println!("{v:#?}");
         Ok(())
     }

+ 1 - 0
src/lib.rs

@@ -150,6 +150,7 @@ pub mod runners;
 pub mod scan;
 pub mod slurm_helpers;
 pub mod variant;
+pub mod uniqueness;
 
 #[macro_use]
 extern crate lazy_static;

+ 156 - 0
src/uniqueness/cache.rs

@@ -0,0 +1,156 @@
+use std::fs::{self, File};
+use std::io::{self, BufReader, BufWriter, Read, Write};
+use std::path::Path;
+use std::time::SystemTime;
+
+/// Magic + version guard
+const MAGIC: &[u8; 8] = b"UNIQIDX1";
+const VERSION: u32 = 1;
+
+/// Identifies a cache entry unambiguously.
+/// Derive from FASTA path + contig name + genome length + content hash.
+pub struct CacheKey {
+    pub contig: String,
+    pub genome_len: usize,
+    /// Blake3 hex hash of the raw genome bytes (first 16 hex chars sufficient)
+    pub hash: String,
+}
+
+impl CacheKey {
+    /// Compute from raw genome bytes + contig name.
+    pub fn from_genome(contig: &str, genome: &[u8]) -> Self {
+        // Blake3 is fast (~3 GB/s), ideal for large genomes.
+        // Use `blake3` crate or fall back to a simple FNV for no-dep builds.
+        let hash = blake3_hex_short(genome);
+        Self {
+            contig: contig.to_owned(),
+            genome_len: genome.len(),
+            hash,
+        }
+    }
+
+    /// Construit la clé depuis le .fai (noodles) — ne lit pas le FASTA.
+    /// `fai_entry` contient déjà la longueur du contig.
+    pub fn from_fai(contig: &str, fai_len: usize, fasta_path: &Path) -> io::Result<Self> {
+        let meta = std::fs::metadata(fasta_path)?;
+        let mtime = meta
+            .modified()?
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .map(|d| d.as_secs())
+            .unwrap_or(0);
+        let fsize = meta.len();
+
+        // Hash = mtime + fsize — suffisant pour invalider sur modification
+        let hash = format!("{mtime:016x}{fsize:016x}");
+
+        Ok(Self {
+            contig: contig.to_owned(),
+            genome_len: fai_len,
+            hash,
+        })
+    }
+
+    pub fn file_name(&self) -> String {
+        format!(
+            "{}__{}__{}.uniqidx",
+            self.contig, self.genome_len, self.hash
+        )
+    }
+}
+
+/// Binary format (little-endian):
+///
+/// [0..8]   magic   = b"UNIQIDX1"
+/// [8..12]  version = 1u32
+/// [12..20] n       = genome_len as u64
+/// [20..]   min_unique_len: n × u64  (usize::MAX encoded as u64::MAX)
+pub fn save_to_cache(path: &Path, min_unique_len: &[usize]) -> io::Result<()> {
+    let tmp = path.with_extension("uniqidx.tmp");
+    {
+        let f = File::create(&tmp)?;
+        let mut w = BufWriter::with_capacity(8 * 1024 * 1024, f);
+
+        w.write_all(MAGIC)?;
+        w.write_all(&VERSION.to_le_bytes())?;
+        w.write_all(&(min_unique_len.len() as u64).to_le_bytes())?;
+
+        // Bulk-write as u64 array — avoids per-element call overhead
+        let buf: Vec<u8> = min_unique_len
+            .iter()
+            .flat_map(|&v| {
+                let encoded: u64 = if v == usize::MAX { u64::MAX } else { v as u64 };
+                encoded.to_le_bytes()
+            })
+            .collect();
+        w.write_all(&buf)?;
+        w.flush()?;
+    }
+    // Atomic rename: avoids corrupt cache on crash mid-write
+    fs::rename(&tmp, path)?;
+    Ok(())
+}
+
+pub fn load_from_cache(path: &Path, expected_len: usize) -> io::Result<Vec<usize>> {
+    let f = File::open(path)?;
+    let mut r = BufReader::with_capacity(8 * 1024 * 1024, f);
+
+    // Magic
+    let mut magic = [0u8; 8];
+    r.read_exact(&mut magic)?;
+    if &magic != MAGIC {
+        return Err(io::Error::new(io::ErrorKind::InvalidData, "bad magic"));
+    }
+
+    // Version
+    let mut vbuf = [0u8; 4];
+    r.read_exact(&mut vbuf)?;
+    if u32::from_le_bytes(vbuf) != VERSION {
+        return Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            "version mismatch",
+        ));
+    }
+
+    // Length
+    let mut nbuf = [0u8; 8];
+    r.read_exact(&mut nbuf)?;
+    let n = u64::from_le_bytes(nbuf) as usize;
+    if n != expected_len {
+        return Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            format!("length mismatch: cache has {n}, genome is {expected_len}"),
+        ));
+    }
+
+    // Payload: read all at once then cast
+    let byte_len = n * 8;
+    let mut buf = vec![0u8; byte_len];
+    r.read_exact(&mut buf)?;
+
+    let result: Vec<usize> = buf
+        .chunks_exact(8)
+        .map(|c| {
+            let v = u64::from_le_bytes(c.try_into().unwrap());
+            if v == u64::MAX {
+                usize::MAX
+            } else {
+                v as usize
+            }
+        })
+        .collect();
+
+    Ok(result)
+}
+
+// ── Minimal Blake3-like hash (FNV-1a 64bit, no deps) ─────────────────────────
+// Replace with `blake3` crate for collision resistance on production.
+fn blake3_hex_short(data: &[u8]) -> String {
+    const FNV_OFFSET: u64 = 14695981039346656037;
+    const FNV_PRIME: u64 = 1099511628211;
+    let mut h = FNV_OFFSET;
+    for &b in data {
+        h ^= b as u64;
+        h = h.wrapping_mul(FNV_PRIME);
+    }
+    format!("{h:016x}")
+}

+ 132 - 0
src/uniqueness/genome_index.rs

@@ -0,0 +1,132 @@
+// src/uniqueness/genome_index.rs
+
+use std::collections::HashMap;
+use std::path::Path;
+use log::{info, warn};
+
+use anyhow::Context;
+
+use super::{AnchorBias, CacheKey, UniqueInterval, UniquenessIndex};
+use crate::{
+    io::fasta::{read_fai, read_single_contig_fasta, split_fasta, FaiEntry},
+    uniqueness::cache::load_from_cache,
+}; // ton module existant
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct GenomicPos {
+    pub contig: String,
+    pub pos: usize, // 0-based
+}
+
+pub struct GenomeIndex {
+    contigs: HashMap<String, UniquenessIndex>,
+}
+
+impl GenomeIndex {
+    /// Build or load from cache.
+    ///
+    /// Uses your existing `split_fasta` to stream contig by contig into
+    /// a temp dir, builds/loads a UniquenessIndex per contig, then drops
+    /// the split files.
+    ///
+    /// `filter`: if Some, only process these contig names.
+    pub fn build(
+        fasta_path: impl AsRef<Path>,
+        cache_dir: impl AsRef<Path>,
+        filter: Option<&[&str]>,
+    ) -> anyhow::Result<Self> {
+        let fasta_path = fasta_path.as_ref();
+        let cache_dir = cache_dir.as_ref();
+        std::fs::create_dir_all(cache_dir)?;
+
+        let fai_path = fasta_path.with_extension("fa.fai");
+        let fai = read_fai(&fai_path)
+            .with_context(|| format!("cannot read FAI {}", fai_path.display()))?;
+
+        // ── Premier passage : quels contigs ont besoin d'être (re)buildés ? ──
+        let mut need_build: Vec<&FaiEntry> = Vec::new();
+        let mut cached: HashMap<String, UniquenessIndex> = HashMap::new();
+
+        for entry in &fai {
+            if let Some(allowed) = filter {
+                if !allowed.contains(&entry.name.as_str()) {
+                    continue;
+                }
+            }
+
+            let key = CacheKey::from_fai(&entry.name, entry.len, fasta_path)?;
+            let cache_path = cache_dir.join(key.file_name());
+
+            if cache_path.exists() {
+                match load_from_cache(&cache_path, entry.len) {
+                    Ok(min_unique_len) => {
+                        info!("[genome_index] cache hit  : {}", entry.name);
+                        cached.insert(
+                            entry.name.clone(),
+                            UniquenessIndex::from_raw(min_unique_len),
+                        );
+                    }
+                    Err(e) => {
+                        warn!("[genome_index] cache miss : {} ({e})", entry.name);
+                        need_build.push(entry);
+                    }
+                }
+            } else {
+                warn!("[genome_index] cache miss : {}", entry.name);
+                need_build.push(entry);
+            }
+        }
+
+        if need_build.is_empty() {
+            info!("[genome_index] all contigs loaded from cache, FASTA not read.");
+            return Ok(Self { contigs: cached });
+        }
+
+        let need_names: Vec<&str> = need_build.iter().map(|e| e.name.as_str()).collect();
+        info!(
+            "[genome_index] building {} contig(s): {:?}",
+            need_names.len(),
+            need_names
+        );
+
+        let split_dir = cache_dir.join("_split_tmp");
+        let contig_fastas = split_fasta(fasta_path, &split_dir)?;
+
+        for cf in &contig_fastas {
+            if !need_names.contains(&cf.name.as_str()) {
+                continue;
+            }
+
+            let entry = need_build.iter().find(|e| e.name == cf.name).unwrap();
+            let seq = read_single_contig_fasta(&cf.fasta_path)?;
+
+            info!(
+                "[genome_index] building SA : {} ({:.1} Mbp)",
+                cf.name,
+                seq.len() as f64 / 1e6
+            );
+
+            let key = CacheKey::from_fai(&cf.name, entry.len, fasta_path)?;
+            let idx = UniquenessIndex::build_with_cache(&seq, &key, cache_dir)
+                .with_context(|| format!("build_with_cache failed for {}", cf.name))?;
+            cached.insert(cf.name.clone(), idx);
+        }
+
+        let _ = std::fs::remove_dir_all(&split_dir);
+        Ok(Self { contigs: cached })
+    }
+
+    pub fn query(&self, pos: &GenomicPos, bias: AnchorBias) -> Option<UniqueInterval> {
+        self.contigs
+            .get(&pos.contig)?
+            .minimal_unique_interval_containing(pos.pos, bias)
+    }
+
+    pub fn contig_names(&self) -> impl Iterator<Item = &str> {
+        self.contigs.keys().map(String::as_str)
+    }
+
+    pub fn contig_len(&self, contig: &str) -> Option<usize> {
+        self.contigs.get(contig).map(|idx| idx.genome_len())
+    }
+}

+ 583 - 0
src/uniqueness/mod.rs

@@ -0,0 +1,583 @@
+use std::fs;
+use std::io::{self, BufReader, BufWriter, Read, Write};
+use std::path::{Path, PathBuf};
+use log::{info, warn};
+
+mod cache;
+mod genome_index;
+mod suffix;
+
+pub use cache::CacheKey;
+
+use cache::{load_from_cache, save_to_cache};
+use suffix::{build_lcp_kasai, build_suffix_array};
+
+/// Pure computation: genome bytes → min_unique_len vector.
+/// Separated so GenomeIndex can call it without going through build_with_cache.
+fn build_min_unique_len(genome: &[u8]) -> Vec<usize> {
+    let n   = genome.len();
+    let sa  = build_suffix_array(genome);
+    let lcp = build_lcp_kasai(genome, &sa);
+
+    let mut rank = vec![0usize; n];
+    for (i, &s) in sa.iter().enumerate() { rank[s] = i; }
+
+    let mut mul = vec![usize::MAX; n];
+    for p in 0..n {
+        let r     = rank[p];
+        let left  = if r > 0     { lcp[r]     } else { 0 };
+        let right = if r + 1 < n { lcp[r + 1] } else { 0 };
+        let k     = left.max(right) + 1;
+        mul[p]    = if p + k <= n { k } else { usize::MAX };
+    }
+    mul
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct UniqueInterval {
+    pub start: usize,
+    pub end: usize,
+}
+
+impl UniqueInterval {
+    pub fn len(&self) -> usize {
+        self.end - self.start
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum AnchorBias {
+    Left,
+    Right,
+    Auto,
+}
+
+pub struct UniquenessIndex {
+    min_unique_len: Vec<usize>,
+    genome_len: usize,
+}
+
+impl UniquenessIndex {
+    /// Build from genome bytes, using a disk cache keyed on `cache_key`.
+    /// Cache lives in `cache_dir` (created if absent).
+    /// If the cache file exists and is valid, loading is O(n) read — no recomputation.
+    pub fn build_with_cache(genome: &[u8], key: &CacheKey, cache_dir: &Path) -> io::Result<Self> {
+        let cache_path = cache_dir.join(key.file_name());
+        if cache_path.exists() {
+            match load_from_cache(&cache_path, genome.len()) {
+                Ok(mul) => return Ok(Self::from_raw(mul)),
+                Err(e)  => warn!("[cache] invalid ({e}), rebuilding..."),
+            }
+        }
+        let mul = build_min_unique_len(genome);
+        save_to_cache(&cache_path, &mul)?;
+        Ok(Self::from_raw(mul))
+    }
+
+    /// Construct directly from a precomputed `min_unique_len` vector.
+    /// Used when loading from disk cache — skips SA/LCP computation entirely.
+    pub fn from_raw(min_unique_len: Vec<usize>) -> Self {
+        let genome_len = min_unique_len.len();
+        Self {
+            min_unique_len,
+            genome_len,
+        }
+    }
+
+    /// Build without cache (e.g. for tests or one-shot use).
+    pub fn build(genome: &[u8]) -> Self {
+        Self::from_raw(build_min_unique_len(genome))
+    }
+
+    pub fn minimal_unique_interval_containing(
+        &self,
+        pos: usize,
+        bias: AnchorBias,
+    ) -> Option<UniqueInterval> {
+        assert!(pos < self.genome_len);
+
+        match bias {
+            AnchorBias::Left => {
+                let k = self.min_unique_len[pos];
+                if k == usize::MAX {
+                    return None;
+                }
+                Some(UniqueInterval {
+                    start: pos,
+                    end: pos + k,
+                })
+            }
+
+            AnchorBias::Right => {
+                for a in (0..=pos).rev() {
+                    let k = self.min_unique_len[a];
+                    if k == usize::MAX {
+                        continue;
+                    }
+                    if a + k > pos {
+                        return Some(UniqueInterval {
+                            start: a,
+                            end: a + k,
+                        });
+                    }
+                    // a + k <= pos and a is decreasing: gap can only grow, bail
+                    if pos - a >= self.genome_len {
+                        break;
+                    }
+                }
+                None
+            }
+
+            AnchorBias::Auto => {
+                let window = 500;
+                let mut best: Option<UniqueInterval> = None;
+                for a in pos.saturating_sub(window)..=pos {
+                    let k = self.min_unique_len[a];
+                    if k == usize::MAX || a + k <= pos {
+                        continue;
+                    }
+                    let c = UniqueInterval {
+                        start: a,
+                        end: a + k,
+                    };
+                    best = Some(match best {
+                        None => c,
+                        Some(b) => {
+                            if c.len() < b.len() {
+                                c
+                            } else {
+                                b
+                            }
+                        }
+                    });
+                }
+                best
+            }
+        }
+    }
+
+    pub fn genome_len(&self) -> usize {
+        self.genome_len
+    }
+}
+
+// tests/uniqueness.rs
+
+#[cfg(test)]
+mod tests {
+    use crate::uniqueness::genome_index::{GenomeIndex, GenomicPos};
+
+    use super::*;
+
+    fn toy_genome() -> &'static [u8] {
+        // "banana" — well-known SA example, easy to verify by hand
+        b"ACGTACGTNNACGT"
+    }
+
+    #[test]
+    fn test_cache_roundtrip() {
+        let genome = toy_genome();
+        let dir = std::path::Path::new("/home/t_steimle/tmp/");
+
+        let key = CacheKey::from_genome("chr_test", genome);
+
+        let idx1 = UniquenessIndex::build_with_cache(genome, &key, dir).unwrap();
+        // Second call must hit cache
+        let idx2 = UniquenessIndex::build_with_cache(genome, &key, dir).unwrap();
+
+        assert_eq!(idx1.min_unique_len, idx2.min_unique_len);
+    }
+
+    #[test]
+    fn test_cache_invalidated_on_length_mismatch() {
+        let genome1 = b"ACGTACGT".as_slice();
+        let genome2 = b"ACGTACGTNN".as_slice();
+        let dir = std::path::Path::new("/home/t_steimle/tmp/");
+        let key = CacheKey::from_genome("chr_test", genome1);
+
+        UniquenessIndex::build_with_cache(genome1, &key, dir).unwrap();
+        // genome2 has different len → load_from_cache returns Err → rebuild
+        let key2 = CacheKey::from_genome("chr_test", genome2);
+        // different hash → different file, no collision
+        assert_ne!(key.file_name(), key2.file_name());
+    }
+
+    // ── helpers ───────────────────────────────────────────────────────────────
+
+    /// Verify that a given interval is actually unique in the genome:
+    /// the subsequence appears exactly once.
+    fn assert_unique(genome: &[u8], iv: &UniqueInterval) {
+        let needle = &genome[iv.start..iv.end];
+        let count = genome
+            .windows(needle.len())
+            .filter(|w| *w == needle)
+            .count();
+        assert_eq!(
+            count,
+            1,
+            "interval [{}, {}) = {:?} appears {count}x in genome — not unique",
+            iv.start,
+            iv.end,
+            std::str::from_utf8(needle).unwrap_or("<binary>"),
+        );
+    }
+
+    /// Verify that no strictly shorter sub-interval anchored at the same start
+    /// is also unique (minimality check for Left bias).
+    fn assert_minimal_left(genome: &[u8], iv: &UniqueInterval) {
+        if iv.len() <= 1 {
+            return;
+        }
+        let shorter = UniqueInterval {
+            start: iv.start,
+            end: iv.end - 1,
+        };
+        let needle = &genome[shorter.start..shorter.end];
+        let count = genome
+            .windows(needle.len())
+            .filter(|w| *w == needle)
+            .count();
+        assert!(
+            count > 1,
+            "interval [{}, {}) = {:?} is shorter and already unique — minimality violated",
+            shorter.start,
+            shorter.end,
+            std::str::from_utf8(needle).unwrap_or("<binary>"),
+        );
+    }
+
+    /// Same for Right bias: no strictly shorter interval ending at the same end.
+    fn assert_minimal_right(genome: &[u8], iv: &UniqueInterval) {
+        if iv.len() <= 1 {
+            return;
+        }
+        let shorter = UniqueInterval {
+            start: iv.start + 1,
+            end: iv.end,
+        };
+        let needle = &genome[shorter.start..shorter.end];
+        let count = genome
+            .windows(needle.len())
+            .filter(|w| *w == needle)
+            .count();
+        assert!(
+            count > 1,
+            "interval [{}, {}) = {:?} is shorter and already unique — minimality violated",
+            shorter.start,
+            shorter.end,
+            std::str::from_utf8(needle).unwrap_or("<binary>"),
+        );
+    }
+
+    /// Assert that no interval of the same length covering pos is shorter
+    /// (Auto minimality: no other anchor yields a shorter unique window).
+    fn assert_minimal_auto(genome: &[u8], iv: &UniqueInterval, pos: usize) {
+        // Try all anchors in [pos - len + 1 .. pos] and verify none gives shorter
+        let window = iv.len().saturating_sub(1);
+        for a in pos.saturating_sub(window)..=pos {
+            if a + 1 > genome.len() {
+                break;
+            }
+            let max_end = (a + iv.len()).min(genome.len());
+            for end in (a + 1)..max_end {
+                let needle = &genome[a..end];
+                let count = genome
+                    .windows(needle.len())
+                    .filter(|w| *w == needle)
+                    .count();
+                if count == 1 {
+                    panic!(
+                        "found shorter unique interval [{a}, {end}) len={} \
+                         covering pos={pos}, but Auto returned [{}, {}) len={}",
+                        end - a,
+                        iv.start,
+                        iv.end,
+                        iv.len()
+                    );
+                }
+            }
+        }
+    }
+
+    // ── genome fixtures ───────────────────────────────────────────────────────
+
+    /// Simple genome with obvious unique regions.
+    ///
+    /// Layout (positions):
+    ///  0123456789...
+    ///  AAAA TTTT AAAA GCTAGCTA NNNN GCTAGCTA
+    ///                 ^unique^       ^repeat^
+    ///
+    /// "GCTAGCTA" appears twice → not unique
+    /// "TTTT"     appears once  → unique but short
+    /// The N-run should return None for all biases.
+    fn genome_simple() -> Vec<u8> {
+        //0         1         2         3
+        //0123456789012345678901234567890123456789
+        b"AAAATTTTAAAAGCTAGCTANNNNNNNNNNNGCTAGCTA".to_vec()
+    }
+
+    /// Genome where Left/Right/Auto produce verifiably different intervals.
+    ///
+    ///  ACGT ACGT TGCA TGCA AAAA
+    ///  0    4    8    12   16
+    ///
+    /// "ACGTACGT" covers [0..8) — only one occurrence at pos 0
+    /// "TGCATGCA" covers [8..16) — only one occurrence
+    /// Repeated motifs force longer unique strings.
+    fn genome_repeats() -> Vec<u8> {
+        b"ACGTACGTTGCATGCAAAAACCCCCGGGGG".to_vec()
+    }
+
+    /// Genome constructed so that Left, Right and Auto give three distinct
+    /// intervals for a single query position.
+    ///
+    ///  ATCGATCG GGGG ATCGATCG TTTT ATCG CCCC
+    ///  0        8    12       20   24   28
+    ///
+    /// pos=6 (inside first ATCGATCG repeat region):
+    ///   Left  → must extend right to break the repeat
+    ///   Right → extends left into earlier unique context
+    ///   Auto  → picks whichever anchor gives shortest total span
+    fn genome_biased() -> Vec<u8> {
+        //0         1         2         3
+        //012345678901234567890123456789012345
+        b"ATCGATCGGGGGGGGATCGATCGTTTTTATCGCCCCCCCC".to_vec()
+    }
+
+    // ── Left bias tests ───────────────────────────────────────────────────────
+
+    #[test]
+    fn left_interval_starts_at_pos() {
+        let g = genome_repeats();
+        let idx = UniquenessIndex::build(&g);
+
+        for pos in [0, 4, 8, 12] {
+            if let Some(iv) = idx.minimal_unique_interval_containing(pos, AnchorBias::Left) {
+                assert_eq!(iv.start, pos, "Left: interval must start at pos={pos}");
+                assert!(iv.end > pos, "Left: interval must extend past pos");
+                assert_unique(&g, &iv);
+                assert_minimal_left(&g, &iv);
+            }
+        }
+    }
+
+    #[test]
+    fn left_interval_is_unique() {
+        let g = genome_simple();
+        let idx = UniquenessIndex::build(&g);
+
+        // pos=4 is in the TTTT region — unique and short
+        let iv = idx
+            .minimal_unique_interval_containing(4, AnchorBias::Left)
+            .expect("TTTT region should have a unique interval");
+        assert_eq!(iv.start, 4);
+        assert_unique(&g, &iv);
+        assert_minimal_left(&g, &iv);
+    }
+
+    #[test]
+    fn left_returns_none_for_n_run() {
+        let g = genome_simple();
+        let idx = UniquenessIndex::build(&g);
+        // N-run starts at pos 20 in genome_simple
+        // Ns produce repeated k-mers → no unique interval anchored there
+        let result = idx.minimal_unique_interval_containing(22, AnchorBias::Left);
+        // We don't assert None strictly (an N-run touching unique flanks might
+        // resolve), but if Some, it must be genuinely unique.
+        if let Some(iv) = result {
+            assert_unique(&g, &iv);
+        }
+    }
+
+    // ── Right bias tests ──────────────────────────────────────────────────────
+
+    #[test]
+    fn right_interval_covers_pos() {
+        let g = genome_repeats();
+        let idx = UniquenessIndex::build(&g);
+
+        for pos in [5, 10, 15, 20] {
+            if pos >= g.len() {
+                continue;
+            }
+            if let Some(iv) = idx.minimal_unique_interval_containing(pos, AnchorBias::Right) {
+                assert!(iv.start <= pos, "Right: interval must start ≤ pos={pos}");
+                assert!(iv.end > pos, "Right: interval must end   > pos={pos}");
+                assert_unique(&g, &iv);
+                assert_minimal_right(&g, &iv);
+            }
+        }
+    }
+
+    #[test]
+    fn right_interval_ends_as_early_as_possible() {
+        let g = genome_biased();
+        let idx = UniquenessIndex::build(&g);
+        let pos = 6;
+
+        let iv_right = idx
+            .minimal_unique_interval_containing(pos, AnchorBias::Right)
+            .expect("should find a unique interval with Right bias");
+        let iv_left = idx
+            .minimal_unique_interval_containing(pos, AnchorBias::Left)
+            .expect("should find a unique interval with Left bias");
+
+        // Right-biased interval uses a left anchor → end is generally earlier
+        // than Left-biased interval's end (which is anchored at pos and must
+        // extend further right to become unique).
+        // This is not guaranteed in all genomes, but holds for genome_biased.
+        assert!(
+            iv_right.end <= iv_left.end,
+            "Right end={} should be ≤ Left end={} for this genome",
+            iv_right.end,
+            iv_left.end,
+        );
+        assert_unique(&g, &iv_right);
+    }
+
+    // ── Auto bias tests ───────────────────────────────────────────────────────
+
+    #[test]
+    fn auto_is_shortest_among_all_biases() {
+        let g = genome_biased();
+        let idx = UniquenessIndex::build(&g);
+
+        for pos in 0..g.len() {
+            let auto = idx.minimal_unique_interval_containing(pos, AnchorBias::Auto);
+            let left = idx.minimal_unique_interval_containing(pos, AnchorBias::Left);
+            let right = idx.minimal_unique_interval_containing(pos, AnchorBias::Right);
+
+            if let Some(ref a) = auto {
+                assert_unique(&g, a);
+                assert_minimal_auto(&g, a, pos);
+
+                // Auto must be ≤ Left in length
+                if let Some(ref l) = left {
+                    assert!(
+                        a.len() <= l.len(),
+                        "pos={pos}: Auto len={} > Left len={} — Auto must be minimal",
+                        a.len(),
+                        l.len()
+                    );
+                }
+                // Auto must be ≤ Right in length
+                if let Some(ref r) = right {
+                    assert!(
+                        a.len() <= r.len(),
+                        "pos={pos}: Auto len={} > Right len={} — Auto must be minimal",
+                        a.len(),
+                        r.len()
+                    );
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn auto_covers_pos() {
+        let g = genome_repeats();
+        let idx = UniquenessIndex::build(&g);
+
+        for pos in 0..g.len() {
+            if let Some(iv) = idx.minimal_unique_interval_containing(pos, AnchorBias::Auto) {
+                assert!(
+                    iv.start <= pos && iv.end > pos,
+                    "Auto interval [{}, {}) must contain pos={pos}",
+                    iv.start,
+                    iv.end
+                );
+                assert_unique(&g, &iv);
+            }
+        }
+    }
+
+    // ── Cross-bias consistency ────────────────────────────────────────────────
+
+    #[test]
+    fn all_biases_return_unique_intervals() {
+        let g = genome_simple();
+        let idx = UniquenessIndex::build(&g);
+
+        for pos in 0..g.len() {
+            for bias in [AnchorBias::Left, AnchorBias::Right, AnchorBias::Auto] {
+                if let Some(iv) = idx.minimal_unique_interval_containing(pos, bias) {
+                    assert!(iv.start <= pos, "start must be ≤ pos");
+                    assert!(iv.end > pos, "end must be > pos");
+                    assert!(iv.end <= g.len(), "end must be in bounds");
+                    assert_unique(&g, &iv);
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn left_start_constraint_holds_for_all_positions() {
+        let g = genome_repeats();
+        let idx = UniquenessIndex::build(&g);
+
+        for pos in 0..g.len() {
+            if let Some(iv) = idx.minimal_unique_interval_containing(pos, AnchorBias::Left) {
+                assert_eq!(
+                    iv.start, pos,
+                    "Left bias must anchor at pos={pos}, got start={}",
+                    iv.start
+                );
+            }
+        }
+    }
+
+    // ── Cache integration ─────────────────────────────────────────────────────
+
+    #[test]
+    fn cache_preserves_query_results() {
+        let g = genome_biased();
+        let dir = Path::new("/home/t_steimle/tmp/");
+        let key = CacheKey::from_genome("test_contig", &g);
+
+        let idx_built = UniquenessIndex::build_with_cache(&g, &key, dir).unwrap();
+        let idx_cached = UniquenessIndex::build_with_cache(&g, &key, dir).unwrap();
+
+        for pos in 0..g.len() {
+            for bias in [AnchorBias::Left, AnchorBias::Right, AnchorBias::Auto] {
+                assert_eq!(
+                    idx_built.minimal_unique_interval_containing(pos, bias),
+                    idx_cached.minimal_unique_interval_containing(pos, bias),
+                    "cache mismatch at pos={pos} bias={bias:?}",
+                );
+            }
+        }
+    }
+        use crate::helpers::test_init;
+
+
+    #[test]
+    fn uniqueness_genome() -> anyhow::Result<()> {
+        test_init();
+        let index = GenomeIndex::build(
+            "/home/t_steimle/ref/hs1/chm13v2.0.fa",
+            "/home/t_steimle/ref/hs1/hs1_uniqueness",
+            None, // None = tout indexer
+        )?;
+
+        let pos = GenomicPos {
+            contig: "chr1".into(),
+            pos: 248_000_000,
+        };
+
+        for bias in [AnchorBias::Left, AnchorBias::Right, AnchorBias::Auto] {
+            match index.query(&pos, bias) {
+                Some(iv) => println!(
+                    "{bias:?}: {}:{}-{} ({}bp)",
+                    pos.contig,
+                    iv.start,
+                    iv.end,
+                    iv.len()
+                ),
+                None => println!("{bias:?}: non-unique"),
+            }
+        }
+
+        Ok(())
+    }
+}

+ 75 - 0
src/uniqueness/suffix.rs

@@ -0,0 +1,75 @@
+use rayon::prelude::*;
+
+pub fn build_suffix_array(genome: &[u8]) -> Vec<usize> {
+    let n = genome.len();
+    if n == 0 {
+        return Vec::new();
+    }
+
+    let mut sa: Vec<usize> = (0..n).collect();
+    let mut rank: Vec<i64> = genome.iter().map(|&b| b as i64).collect();
+    let mut tmp = vec![0i64; n];
+    let mut gap = 1usize;
+
+    while gap < n {
+        // ── Sort — main bottleneck, worth parallelising ───────────────────
+        sa.par_sort_unstable_by(|&a, &b| {
+            let second = |i: usize| if i + gap < n { rank[i + gap] } else { -1 };
+            (rank[a], second(a)).cmp(&(rank[b], second(b)))
+        });
+
+        // ── Compute key-change flags in parallel ──────────────────────────
+        let differs: Vec<bool> = (1..n)
+            .into_par_iter()
+            .map(|i| {
+                let second = |i: usize| if i + gap < n { rank[i + gap] } else { -1i64 };
+                let prev = sa[i - 1];
+                let cur = sa[i];
+                (rank[prev], second(prev)) != (rank[cur], second(cur))
+            })
+            .collect();
+
+        // ── Prefix sum — sequential, O(n), cache-friendly ────────────────
+        tmp[sa[0]] = 0;
+        for i in 1..n {
+            tmp[sa[i]] = tmp[sa[i - 1]] + if differs[i - 1] { 1 } else { 0 };
+        }
+
+        rank.copy_from_slice(&tmp);
+
+        if rank[sa[n - 1]] == (n - 1) as i64 {
+            break;
+        }
+        gap *= 2;
+    }
+
+    sa
+}
+
+pub fn build_lcp_kasai(genome: &[u8], sa: &[usize]) -> Vec<usize> {
+    let n = genome.len();
+    let mut rank = vec![0usize; n];
+    for (i, &s) in sa.iter().enumerate() {
+        rank[s] = i;
+    }
+
+    let mut lcp = vec![0usize; n];
+    let mut h = 0usize;
+
+    for p in 0..n {
+        let r = rank[p];
+        if r == 0 {
+            h = 0;
+            continue;
+        }
+        let prev = sa[r - 1];
+        while p + h < n && prev + h < n && genome[p + h] == genome[prev + h] {
+            h += 1;
+        }
+        lcp[r] = h;
+        if h > 0 {
+            h -= 1;
+        }
+    }
+    lcp
+}

+ 21 - 11
src/variant/vcf_variant.rs

@@ -226,7 +226,11 @@ impl FromStr for VcfVariant {
         hasher.update(reference.to_string().as_bytes()); // Reference string as bytes
         hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes
         let hash = hasher.finalize();
-        let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
+        let hash = Hash128::new(
+            hash.as_bytes()[..16]
+                .try_into()
+                .expect("blake3 digest is always ≥ 16 bytes"),
+        );
 
         Ok(Self {
             hash,
@@ -841,28 +845,30 @@ impl core::fmt::Display for BNDDesc {
 
 /// Split a multiallelic VcfVariant into N biallelic VcfVariants.
 /// Rewrites A/R-number INFO fields per allele using header metadata.
-pub fn from_multiallelic(variant: VcfVariant) -> Vec<VcfVariant> {
+pub fn from_multiallelic(variant: VcfVariant) -> anyhow::Result<Vec<VcfVariant>> {
     let alt_str = variant.alternative.to_string();
     let alts: Vec<&str> = alt_str.split(',').collect();
 
     // Already biallelic
     if alts.len() == 1 {
-        return vec![variant];
+        return Ok(vec![variant]);
     }
 
     alts.iter()
         .enumerate()
-        .map(|(i, alt)| {
+        .map(|(i, alt)| -> anyhow::Result<VcfVariant> {
             let mut v = variant.clone();
 
             // Rewrite ALT
-            v.alternative = alt.parse().expect("Failed to parse split ALT");
+            v.alternative = alt
+                .parse()
+                .with_context(|| format!("failed to parse split ALT allele: {alt}"))?;
 
             // Rewrite INFO fields (A/R-number)
             v.infos = rewrite_infos(&variant.infos, i);
 
             // Rewrite FORMAT/GT
-            v.formats = rewrite_formats(&variant.formats, i);
+            v.formats = rewrite_formats(&variant.formats, i)?;
 
             // Recompute hash (pos + ref + new alt)
             let mut hasher = blake3::Hasher::new();
@@ -871,9 +877,13 @@ pub fn from_multiallelic(variant: VcfVariant) -> Vec<VcfVariant> {
             hasher.update(v.reference.to_string().as_bytes());
             hasher.update(v.alternative.to_string().as_bytes());
             let hash = hasher.finalize();
-            v.hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
+            v.hash = Hash128::new(
+                hash.as_bytes()[..16]
+                    .try_into()
+                    .expect("blake3 digest is always ≥ 16 bytes"),
+            );
 
-            v
+            Ok(v)
         })
         .collect()
 }
@@ -908,9 +918,9 @@ fn rewrite_infos(infos: &Infos, allele_idx: usize) -> Infos {
     Infos(rewritten)
 }
 
-fn rewrite_formats(formats: &Formats, allele_idx: usize) -> Formats {
+fn rewrite_formats(formats: &Formats, allele_idx: usize) -> anyhow::Result<Formats> {
     if formats.0.is_empty() {
-        return formats.clone();
+        return Ok(formats.clone());
     }
     // Reconstruct a fake FORMAT+sample row to reuse existing FromStr
     let (fmt_str, sample_str): (String, String) = formats.clone().into();
@@ -918,7 +928,7 @@ fn rewrite_formats(formats: &Formats, allele_idx: usize) -> Formats {
     let rewritten_sample = rewrite_sample(&sample_str, &fmt_keys, allele_idx);
     (fmt_str.as_str(), rewritten_sample.as_str())
         .try_into()
-        .expect("Failed to parse rewritten FORMAT")
+        .context("failed to parse rewritten FORMAT field")
 }
 
 /// Recode a sample's GT from multiallelic to biallelic representation.