|
|
@@ -0,0 +1,583 @@
|
|
|
+use std::fs;
|
|
|
+use std::io::{self, BufReader, BufWriter, Read, Write};
|
|
|
+use std::path::{Path, PathBuf};
|
|
|
+use log::{info, warn};
|
|
|
+
|
|
|
+mod cache;
|
|
|
+mod genome_index;
|
|
|
+mod suffix;
|
|
|
+
|
|
|
+pub use cache::CacheKey;
|
|
|
+
|
|
|
+use cache::{load_from_cache, save_to_cache};
|
|
|
+use suffix::{build_lcp_kasai, build_suffix_array};
|
|
|
+
|
|
|
+/// Pure computation: genome bytes → min_unique_len vector.
|
|
|
+/// Separated so GenomeIndex can call it without going through build_with_cache.
|
|
|
+fn build_min_unique_len(genome: &[u8]) -> Vec<usize> {
|
|
|
+ let n = genome.len();
|
|
|
+ let sa = build_suffix_array(genome);
|
|
|
+ let lcp = build_lcp_kasai(genome, &sa);
|
|
|
+
|
|
|
+ let mut rank = vec![0usize; n];
|
|
|
+ for (i, &s) in sa.iter().enumerate() { rank[s] = i; }
|
|
|
+
|
|
|
+ let mut mul = vec![usize::MAX; n];
|
|
|
+ for p in 0..n {
|
|
|
+ let r = rank[p];
|
|
|
+ let left = if r > 0 { lcp[r] } else { 0 };
|
|
|
+ let right = if r + 1 < n { lcp[r + 1] } else { 0 };
|
|
|
+ let k = left.max(right) + 1;
|
|
|
+ mul[p] = if p + k <= n { k } else { usize::MAX };
|
|
|
+ }
|
|
|
+ mul
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Clone, PartialEq, Eq)]
|
|
|
+pub struct UniqueInterval {
|
|
|
+ pub start: usize,
|
|
|
+ pub end: usize,
|
|
|
+}
|
|
|
+
|
|
|
+impl UniqueInterval {
|
|
|
+ pub fn len(&self) -> usize {
|
|
|
+ self.end - self.start
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Clone, Copy)]
|
|
|
+pub enum AnchorBias {
|
|
|
+ Left,
|
|
|
+ Right,
|
|
|
+ Auto,
|
|
|
+}
|
|
|
+
|
|
|
+pub struct UniquenessIndex {
|
|
|
+ min_unique_len: Vec<usize>,
|
|
|
+ genome_len: usize,
|
|
|
+}
|
|
|
+
|
|
|
+impl UniquenessIndex {
|
|
|
+ /// Build from genome bytes, using a disk cache keyed on `cache_key`.
|
|
|
+ /// Cache lives in `cache_dir` (created if absent).
|
|
|
+ /// If the cache file exists and is valid, loading is O(n) read — no recomputation.
|
|
|
+ pub fn build_with_cache(genome: &[u8], key: &CacheKey, cache_dir: &Path) -> io::Result<Self> {
|
|
|
+ let cache_path = cache_dir.join(key.file_name());
|
|
|
+ if cache_path.exists() {
|
|
|
+ match load_from_cache(&cache_path, genome.len()) {
|
|
|
+ Ok(mul) => return Ok(Self::from_raw(mul)),
|
|
|
+ Err(e) => warn!("[cache] invalid ({e}), rebuilding..."),
|
|
|
+ }
|
|
|
+ }
|
|
|
+ let mul = build_min_unique_len(genome);
|
|
|
+ save_to_cache(&cache_path, &mul)?;
|
|
|
+ Ok(Self::from_raw(mul))
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Construct directly from a precomputed `min_unique_len` vector.
|
|
|
+ /// Used when loading from disk cache — skips SA/LCP computation entirely.
|
|
|
+ pub fn from_raw(min_unique_len: Vec<usize>) -> Self {
|
|
|
+ let genome_len = min_unique_len.len();
|
|
|
+ Self {
|
|
|
+ min_unique_len,
|
|
|
+ genome_len,
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Build without cache (e.g. for tests or one-shot use).
|
|
|
+ pub fn build(genome: &[u8]) -> Self {
|
|
|
+ Self::from_raw(build_min_unique_len(genome))
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn minimal_unique_interval_containing(
|
|
|
+ &self,
|
|
|
+ pos: usize,
|
|
|
+ bias: AnchorBias,
|
|
|
+ ) -> Option<UniqueInterval> {
|
|
|
+ assert!(pos < self.genome_len);
|
|
|
+
|
|
|
+ match bias {
|
|
|
+ AnchorBias::Left => {
|
|
|
+ let k = self.min_unique_len[pos];
|
|
|
+ if k == usize::MAX {
|
|
|
+ return None;
|
|
|
+ }
|
|
|
+ Some(UniqueInterval {
|
|
|
+ start: pos,
|
|
|
+ end: pos + k,
|
|
|
+ })
|
|
|
+ }
|
|
|
+
|
|
|
+ AnchorBias::Right => {
|
|
|
+ for a in (0..=pos).rev() {
|
|
|
+ let k = self.min_unique_len[a];
|
|
|
+ if k == usize::MAX {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if a + k > pos {
|
|
|
+ return Some(UniqueInterval {
|
|
|
+ start: a,
|
|
|
+ end: a + k,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ // a + k <= pos and a is decreasing: gap can only grow, bail
|
|
|
+ if pos - a >= self.genome_len {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ None
|
|
|
+ }
|
|
|
+
|
|
|
+ AnchorBias::Auto => {
|
|
|
+ let window = 500;
|
|
|
+ let mut best: Option<UniqueInterval> = None;
|
|
|
+ for a in pos.saturating_sub(window)..=pos {
|
|
|
+ let k = self.min_unique_len[a];
|
|
|
+ if k == usize::MAX || a + k <= pos {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ let c = UniqueInterval {
|
|
|
+ start: a,
|
|
|
+ end: a + k,
|
|
|
+ };
|
|
|
+ best = Some(match best {
|
|
|
+ None => c,
|
|
|
+ Some(b) => {
|
|
|
+ if c.len() < b.len() {
|
|
|
+ c
|
|
|
+ } else {
|
|
|
+ b
|
|
|
+ }
|
|
|
+ }
|
|
|
+ });
|
|
|
+ }
|
|
|
+ best
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn genome_len(&self) -> usize {
|
|
|
+ self.genome_len
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+// tests/uniqueness.rs
|
|
|
+
|
|
|
+#[cfg(test)]
|
|
|
+mod tests {
|
|
|
+ use crate::uniqueness::genome_index::{GenomeIndex, GenomicPos};
|
|
|
+
|
|
|
+ use super::*;
|
|
|
+
|
|
|
+ fn toy_genome() -> &'static [u8] {
|
|
|
+ // "banana" — well-known SA example, easy to verify by hand
|
|
|
+ b"ACGTACGTNNACGT"
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn test_cache_roundtrip() {
|
|
|
+ let genome = toy_genome();
|
|
|
+ let dir = std::path::Path::new("/home/t_steimle/tmp/");
|
|
|
+
|
|
|
+ let key = CacheKey::from_genome("chr_test", genome);
|
|
|
+
|
|
|
+ let idx1 = UniquenessIndex::build_with_cache(genome, &key, dir).unwrap();
|
|
|
+ // Second call must hit cache
|
|
|
+ let idx2 = UniquenessIndex::build_with_cache(genome, &key, dir).unwrap();
|
|
|
+
|
|
|
+ assert_eq!(idx1.min_unique_len, idx2.min_unique_len);
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn test_cache_invalidated_on_length_mismatch() {
|
|
|
+ let genome1 = b"ACGTACGT".as_slice();
|
|
|
+ let genome2 = b"ACGTACGTNN".as_slice();
|
|
|
+ let dir = std::path::Path::new("/home/t_steimle/tmp/");
|
|
|
+ let key = CacheKey::from_genome("chr_test", genome1);
|
|
|
+
|
|
|
+ UniquenessIndex::build_with_cache(genome1, &key, dir).unwrap();
|
|
|
+ // genome2 has different len → load_from_cache returns Err → rebuild
|
|
|
+ let key2 = CacheKey::from_genome("chr_test", genome2);
|
|
|
+ // different hash → different file, no collision
|
|
|
+ assert_ne!(key.file_name(), key2.file_name());
|
|
|
+ }
|
|
|
+
|
|
|
+ // ── helpers ───────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
+ /// Verify that a given interval is actually unique in the genome:
|
|
|
+ /// the subsequence appears exactly once.
|
|
|
+ fn assert_unique(genome: &[u8], iv: &UniqueInterval) {
|
|
|
+ let needle = &genome[iv.start..iv.end];
|
|
|
+ let count = genome
|
|
|
+ .windows(needle.len())
|
|
|
+ .filter(|w| *w == needle)
|
|
|
+ .count();
|
|
|
+ assert_eq!(
|
|
|
+ count,
|
|
|
+ 1,
|
|
|
+ "interval [{}, {}) = {:?} appears {count}x in genome — not unique",
|
|
|
+ iv.start,
|
|
|
+ iv.end,
|
|
|
+ std::str::from_utf8(needle).unwrap_or("<binary>"),
|
|
|
+ );
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Verify that no strictly shorter sub-interval anchored at the same start
|
|
|
+ /// is also unique (minimality check for Left bias).
|
|
|
+ fn assert_minimal_left(genome: &[u8], iv: &UniqueInterval) {
|
|
|
+ if iv.len() <= 1 {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ let shorter = UniqueInterval {
|
|
|
+ start: iv.start,
|
|
|
+ end: iv.end - 1,
|
|
|
+ };
|
|
|
+ let needle = &genome[shorter.start..shorter.end];
|
|
|
+ let count = genome
|
|
|
+ .windows(needle.len())
|
|
|
+ .filter(|w| *w == needle)
|
|
|
+ .count();
|
|
|
+ assert!(
|
|
|
+ count > 1,
|
|
|
+ "interval [{}, {}) = {:?} is shorter and already unique — minimality violated",
|
|
|
+ shorter.start,
|
|
|
+ shorter.end,
|
|
|
+ std::str::from_utf8(needle).unwrap_or("<binary>"),
|
|
|
+ );
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Same for Right bias: no strictly shorter interval ending at the same end.
|
|
|
+ fn assert_minimal_right(genome: &[u8], iv: &UniqueInterval) {
|
|
|
+ if iv.len() <= 1 {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ let shorter = UniqueInterval {
|
|
|
+ start: iv.start + 1,
|
|
|
+ end: iv.end,
|
|
|
+ };
|
|
|
+ let needle = &genome[shorter.start..shorter.end];
|
|
|
+ let count = genome
|
|
|
+ .windows(needle.len())
|
|
|
+ .filter(|w| *w == needle)
|
|
|
+ .count();
|
|
|
+ assert!(
|
|
|
+ count > 1,
|
|
|
+ "interval [{}, {}) = {:?} is shorter and already unique — minimality violated",
|
|
|
+ shorter.start,
|
|
|
+ shorter.end,
|
|
|
+ std::str::from_utf8(needle).unwrap_or("<binary>"),
|
|
|
+ );
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Assert that no interval of the same length covering pos is shorter
|
|
|
+ /// (Auto minimality: no other anchor yields a shorter unique window).
|
|
|
+ fn assert_minimal_auto(genome: &[u8], iv: &UniqueInterval, pos: usize) {
|
|
|
+ // Try all anchors in [pos - len + 1 .. pos] and verify none gives shorter
|
|
|
+ let window = iv.len().saturating_sub(1);
|
|
|
+ for a in pos.saturating_sub(window)..=pos {
|
|
|
+ if a + 1 > genome.len() {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ let max_end = (a + iv.len()).min(genome.len());
|
|
|
+ for end in (a + 1)..max_end {
|
|
|
+ let needle = &genome[a..end];
|
|
|
+ let count = genome
|
|
|
+ .windows(needle.len())
|
|
|
+ .filter(|w| *w == needle)
|
|
|
+ .count();
|
|
|
+ if count == 1 {
|
|
|
+ panic!(
|
|
|
+ "found shorter unique interval [{a}, {end}) len={} \
|
|
|
+ covering pos={pos}, but Auto returned [{}, {}) len={}",
|
|
|
+ end - a,
|
|
|
+ iv.start,
|
|
|
+ iv.end,
|
|
|
+ iv.len()
|
|
|
+ );
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // ── genome fixtures ───────────────────────────────────────────────────────
|
|
|
+
|
|
|
+ /// Simple genome with obvious unique regions.
|
|
|
+ ///
|
|
|
+ /// Layout (positions):
|
|
|
+ /// 0123456789...
|
|
|
+ /// AAAA TTTT AAAA GCTAGCTA NNNN GCTAGCTA
|
|
|
+ /// ^unique^ ^repeat^
|
|
|
+ ///
|
|
|
+ /// "GCTAGCTA" appears twice → not unique
|
|
|
+ /// "TTTT" appears once → unique but short
|
|
|
+ /// The N-run should return None for all biases.
|
|
|
+ fn genome_simple() -> Vec<u8> {
|
|
|
+ //0 1 2 3
|
|
|
+ //0123456789012345678901234567890123456789
|
|
|
+ b"AAAATTTTAAAAGCTAGCTANNNNNNNNNNNGCTAGCTA".to_vec()
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Genome where Left/Right/Auto produce verifiably different intervals.
|
|
|
+ ///
|
|
|
+ /// ACGT ACGT TGCA TGCA AAAA
|
|
|
+ /// 0 4 8 12 16
|
|
|
+ ///
|
|
|
+ /// "ACGTACGT" covers [0..8) — only one occurrence at pos 0
|
|
|
+ /// "TGCATGCA" covers [8..16) — only one occurrence
|
|
|
+ /// Repeated motifs force longer unique strings.
|
|
|
+ fn genome_repeats() -> Vec<u8> {
|
|
|
+ b"ACGTACGTTGCATGCAAAAACCCCCGGGGG".to_vec()
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Genome constructed so that Left, Right and Auto give three distinct
|
|
|
+ /// intervals for a single query position.
|
|
|
+ ///
|
|
|
+ /// ATCGATCG GGGG ATCGATCG TTTT ATCG CCCC
|
|
|
+ /// 0 8 12 20 24 28
|
|
|
+ ///
|
|
|
+ /// pos=6 (inside first ATCGATCG repeat region):
|
|
|
+ /// Left → must extend right to break the repeat
|
|
|
+ /// Right → extends left into earlier unique context
|
|
|
+ /// Auto → picks whichever anchor gives shortest total span
|
|
|
+ fn genome_biased() -> Vec<u8> {
|
|
|
+ //0 1 2 3
|
|
|
+ //012345678901234567890123456789012345
|
|
|
+ b"ATCGATCGGGGGGGGATCGATCGTTTTTATCGCCCCCCCC".to_vec()
|
|
|
+ }
|
|
|
+
|
|
|
+ // ── Left bias tests ───────────────────────────────────────────────────────
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn left_interval_starts_at_pos() {
|
|
|
+ let g = genome_repeats();
|
|
|
+ let idx = UniquenessIndex::build(&g);
|
|
|
+
|
|
|
+ for pos in [0, 4, 8, 12] {
|
|
|
+ if let Some(iv) = idx.minimal_unique_interval_containing(pos, AnchorBias::Left) {
|
|
|
+ assert_eq!(iv.start, pos, "Left: interval must start at pos={pos}");
|
|
|
+ assert!(iv.end > pos, "Left: interval must extend past pos");
|
|
|
+ assert_unique(&g, &iv);
|
|
|
+ assert_minimal_left(&g, &iv);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn left_interval_is_unique() {
|
|
|
+ let g = genome_simple();
|
|
|
+ let idx = UniquenessIndex::build(&g);
|
|
|
+
|
|
|
+ // pos=4 is in the TTTT region — unique and short
|
|
|
+ let iv = idx
|
|
|
+ .minimal_unique_interval_containing(4, AnchorBias::Left)
|
|
|
+ .expect("TTTT region should have a unique interval");
|
|
|
+ assert_eq!(iv.start, 4);
|
|
|
+ assert_unique(&g, &iv);
|
|
|
+ assert_minimal_left(&g, &iv);
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn left_returns_none_for_n_run() {
|
|
|
+ let g = genome_simple();
|
|
|
+ let idx = UniquenessIndex::build(&g);
|
|
|
+ // N-run starts at pos 20 in genome_simple
|
|
|
+ // Ns produce repeated k-mers → no unique interval anchored there
|
|
|
+ let result = idx.minimal_unique_interval_containing(22, AnchorBias::Left);
|
|
|
+ // We don't assert None strictly (an N-run touching unique flanks might
|
|
|
+ // resolve), but if Some, it must be genuinely unique.
|
|
|
+ if let Some(iv) = result {
|
|
|
+ assert_unique(&g, &iv);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // ── Right bias tests ──────────────────────────────────────────────────────
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn right_interval_covers_pos() {
|
|
|
+ let g = genome_repeats();
|
|
|
+ let idx = UniquenessIndex::build(&g);
|
|
|
+
|
|
|
+ for pos in [5, 10, 15, 20] {
|
|
|
+ if pos >= g.len() {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if let Some(iv) = idx.minimal_unique_interval_containing(pos, AnchorBias::Right) {
|
|
|
+ assert!(iv.start <= pos, "Right: interval must start ≤ pos={pos}");
|
|
|
+ assert!(iv.end > pos, "Right: interval must end > pos={pos}");
|
|
|
+ assert_unique(&g, &iv);
|
|
|
+ assert_minimal_right(&g, &iv);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn right_interval_ends_as_early_as_possible() {
|
|
|
+ let g = genome_biased();
|
|
|
+ let idx = UniquenessIndex::build(&g);
|
|
|
+ let pos = 6;
|
|
|
+
|
|
|
+ let iv_right = idx
|
|
|
+ .minimal_unique_interval_containing(pos, AnchorBias::Right)
|
|
|
+ .expect("should find a unique interval with Right bias");
|
|
|
+ let iv_left = idx
|
|
|
+ .minimal_unique_interval_containing(pos, AnchorBias::Left)
|
|
|
+ .expect("should find a unique interval with Left bias");
|
|
|
+
|
|
|
+ // Right-biased interval uses a left anchor → end is generally earlier
|
|
|
+ // than Left-biased interval's end (which is anchored at pos and must
|
|
|
+ // extend further right to become unique).
|
|
|
+ // This is not guaranteed in all genomes, but holds for genome_biased.
|
|
|
+ assert!(
|
|
|
+ iv_right.end <= iv_left.end,
|
|
|
+ "Right end={} should be ≤ Left end={} for this genome",
|
|
|
+ iv_right.end,
|
|
|
+ iv_left.end,
|
|
|
+ );
|
|
|
+ assert_unique(&g, &iv_right);
|
|
|
+ }
|
|
|
+
|
|
|
+ // ── Auto bias tests ───────────────────────────────────────────────────────
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn auto_is_shortest_among_all_biases() {
|
|
|
+ let g = genome_biased();
|
|
|
+ let idx = UniquenessIndex::build(&g);
|
|
|
+
|
|
|
+ for pos in 0..g.len() {
|
|
|
+ let auto = idx.minimal_unique_interval_containing(pos, AnchorBias::Auto);
|
|
|
+ let left = idx.minimal_unique_interval_containing(pos, AnchorBias::Left);
|
|
|
+ let right = idx.minimal_unique_interval_containing(pos, AnchorBias::Right);
|
|
|
+
|
|
|
+ if let Some(ref a) = auto {
|
|
|
+ assert_unique(&g, a);
|
|
|
+ assert_minimal_auto(&g, a, pos);
|
|
|
+
|
|
|
+ // Auto must be ≤ Left in length
|
|
|
+ if let Some(ref l) = left {
|
|
|
+ assert!(
|
|
|
+ a.len() <= l.len(),
|
|
|
+ "pos={pos}: Auto len={} > Left len={} — Auto must be minimal",
|
|
|
+ a.len(),
|
|
|
+ l.len()
|
|
|
+ );
|
|
|
+ }
|
|
|
+ // Auto must be ≤ Right in length
|
|
|
+ if let Some(ref r) = right {
|
|
|
+ assert!(
|
|
|
+ a.len() <= r.len(),
|
|
|
+ "pos={pos}: Auto len={} > Right len={} — Auto must be minimal",
|
|
|
+ a.len(),
|
|
|
+ r.len()
|
|
|
+ );
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn auto_covers_pos() {
|
|
|
+ let g = genome_repeats();
|
|
|
+ let idx = UniquenessIndex::build(&g);
|
|
|
+
|
|
|
+ for pos in 0..g.len() {
|
|
|
+ if let Some(iv) = idx.minimal_unique_interval_containing(pos, AnchorBias::Auto) {
|
|
|
+ assert!(
|
|
|
+ iv.start <= pos && iv.end > pos,
|
|
|
+ "Auto interval [{}, {}) must contain pos={pos}",
|
|
|
+ iv.start,
|
|
|
+ iv.end
|
|
|
+ );
|
|
|
+ assert_unique(&g, &iv);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // ── Cross-bias consistency ────────────────────────────────────────────────
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn all_biases_return_unique_intervals() {
|
|
|
+ let g = genome_simple();
|
|
|
+ let idx = UniquenessIndex::build(&g);
|
|
|
+
|
|
|
+ for pos in 0..g.len() {
|
|
|
+ for bias in [AnchorBias::Left, AnchorBias::Right, AnchorBias::Auto] {
|
|
|
+ if let Some(iv) = idx.minimal_unique_interval_containing(pos, bias) {
|
|
|
+ assert!(iv.start <= pos, "start must be ≤ pos");
|
|
|
+ assert!(iv.end > pos, "end must be > pos");
|
|
|
+ assert!(iv.end <= g.len(), "end must be in bounds");
|
|
|
+ assert_unique(&g, &iv);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn left_start_constraint_holds_for_all_positions() {
|
|
|
+ let g = genome_repeats();
|
|
|
+ let idx = UniquenessIndex::build(&g);
|
|
|
+
|
|
|
+ for pos in 0..g.len() {
|
|
|
+ if let Some(iv) = idx.minimal_unique_interval_containing(pos, AnchorBias::Left) {
|
|
|
+ assert_eq!(
|
|
|
+ iv.start, pos,
|
|
|
+ "Left bias must anchor at pos={pos}, got start={}",
|
|
|
+ iv.start
|
|
|
+ );
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // ── Cache integration ─────────────────────────────────────────────────────
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn cache_preserves_query_results() {
|
|
|
+ let g = genome_biased();
|
|
|
+ let dir = Path::new("/home/t_steimle/tmp/");
|
|
|
+ let key = CacheKey::from_genome("test_contig", &g);
|
|
|
+
|
|
|
+ let idx_built = UniquenessIndex::build_with_cache(&g, &key, dir).unwrap();
|
|
|
+ let idx_cached = UniquenessIndex::build_with_cache(&g, &key, dir).unwrap();
|
|
|
+
|
|
|
+ for pos in 0..g.len() {
|
|
|
+ for bias in [AnchorBias::Left, AnchorBias::Right, AnchorBias::Auto] {
|
|
|
+ assert_eq!(
|
|
|
+ idx_built.minimal_unique_interval_containing(pos, bias),
|
|
|
+ idx_cached.minimal_unique_interval_containing(pos, bias),
|
|
|
+ "cache mismatch at pos={pos} bias={bias:?}",
|
|
|
+ );
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ use crate::helpers::test_init;
|
|
|
+
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn uniqueness_genome() -> anyhow::Result<()> {
|
|
|
+ test_init();
|
|
|
+ let index = GenomeIndex::build(
|
|
|
+ "/home/t_steimle/ref/hs1/chm13v2.0.fa",
|
|
|
+ "/home/t_steimle/ref/hs1/hs1_uniqueness",
|
|
|
+ None, // None = tout indexer
|
|
|
+ )?;
|
|
|
+
|
|
|
+ let pos = GenomicPos {
|
|
|
+ contig: "chr1".into(),
|
|
|
+ pos: 248_000_000,
|
|
|
+ };
|
|
|
+
|
|
|
+ for bias in [AnchorBias::Left, AnchorBias::Right, AnchorBias::Auto] {
|
|
|
+ match index.query(&pos, bias) {
|
|
|
+ Some(iv) => println!(
|
|
|
+ "{bias:?}: {}:{}-{} ({}bp)",
|
|
|
+ pos.contig,
|
|
|
+ iv.start,
|
|
|
+ iv.end,
|
|
|
+ iv.len()
|
|
|
+ ),
|
|
|
+ None => println!("{bias:?}: non-unique"),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+}
|