Ver Fonte

bam annotation par

Thomas há 4 dias atrás
pai
commit
3c15a2820a

+ 63 - 57
Cargo.lock

@@ -174,8 +174,8 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
 [[package]]
 name = "arrow"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -194,8 +194,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-arith"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -207,8 +207,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-array"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "ahash 0.8.12",
  "arrow-buffer",
@@ -224,8 +224,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-buffer"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "bytes",
  "half",
@@ -235,8 +235,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-cast"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -255,8 +255,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-csv"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-array",
  "arrow-cast",
@@ -269,8 +269,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-data"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-buffer",
  "arrow-schema",
@@ -281,8 +281,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-ipc"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -294,8 +294,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-json"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -318,8 +318,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-ord"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -330,8 +330,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-row"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -342,13 +342,13 @@ dependencies = [
 
 [[package]]
 name = "arrow-schema"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 
 [[package]]
 name = "arrow-select"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "ahash 0.8.12",
  "arrow-array",
@@ -360,8 +360,8 @@ dependencies = [
 
 [[package]]
 name = "arrow-string"
-version = "58.3.0"
-source = "git+https://github.com/apache/arrow-rs#2eeb805b8a8b8ca67788917ec2f5220eb3e6f958"
+version = "59.0.0"
+source = "git+https://github.com/apache/arrow-rs#9f96a8f5c3f47c6303c7d17ac7f00ad4ad0df513"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -428,7 +428,7 @@ dependencies = [
  "quote",
  "regex",
  "rustc-hash 1.1.0",
- "shlex",
+ "shlex 1.3.0",
  "syn 2.0.117",
 ]
 
@@ -477,9 +477,9 @@ dependencies = [
 
 [[package]]
 name = "bitflags"
-version = "2.11.1"
+version = "2.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
+checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a"
 
 [[package]]
 name = "bitvec"
@@ -638,14 +638,14 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.62"
+version = "1.2.63"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98"
+checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
  "libc",
- "shlex",
+ "shlex 2.0.1",
 ]
 
 [[package]]
@@ -684,9 +684,9 @@ dependencies = [
 
 [[package]]
 name = "chrono"
-version = "0.4.44"
+version = "0.4.45"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
+checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327"
 dependencies = [
  "iana-time-zone",
  "js-sys",
@@ -1595,9 +1595,9 @@ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
 
 [[package]]
 name = "jiff"
-version = "0.2.27"
+version = "0.2.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "392c70591e8749fe235ddaf513e6f58b26bce3dcc16524cecc8936f75afa161e"
+checksum = "4603d3033e49e2b0e31229fcab20a5d40089c607d975cd9c80551dc69eed9102"
 dependencies = [
  "jiff-static",
  "log",
@@ -1608,9 +1608,9 @@ dependencies = [
 
 [[package]]
 name = "jiff-static"
-version = "0.2.27"
+version = "0.2.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47b605b0c050d845fc355bb11eb3f9a8deddc218ea60c76e61aa1f2adfb2c96a"
+checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1757,9 +1757,9 @@ dependencies = [
 
 [[package]]
 name = "libz-sys"
-version = "1.1.28"
+version = "1.1.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc3a226e576f50782b3305c5ccf458698f92798987f551c6a02efe8276721e22"
+checksum = "85bc9657773828b90eeb625adff10eeac83cc21bbfd8e23a03eaa8a33c9e28d9"
 dependencies = [
  "cc",
  "cmake",
@@ -1797,9 +1797,9 @@ dependencies = [
 
 [[package]]
 name = "log"
-version = "0.4.30"
+version = "0.4.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5"
+checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a"
 
 [[package]]
 name = "lzma-sys"
@@ -2317,7 +2317,7 @@ version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f"
 dependencies = [
- "toml_edit 0.25.11+spec-1.1.0",
+ "toml_edit 0.25.12+spec-1.1.0",
 ]
 
 [[package]]
@@ -2873,6 +2873,12 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
+[[package]]
+name = "shlex"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba"
+
 [[package]]
 name = "sigchld"
 version = "0.2.4"
@@ -3170,9 +3176,9 @@ dependencies = [
 
 [[package]]
 name = "toml_edit"
-version = "0.25.11+spec-1.1.0"
+version = "0.25.12+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b"
+checksum = "d2153edc6955a6c354fad8f5efd38b6a8769bdccf9fe50f8e1329f81b0baa5d7"
 dependencies = [
  "indexmap",
  "toml_datetime 1.1.1+spec-1.1.0",
@@ -3252,9 +3258,9 @@ checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63"
 
 [[package]]
 name = "typenum"
-version = "1.20.0"
+version = "1.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
+checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20"
 
 [[package]]
 name = "unicode-ident"
@@ -3316,9 +3322,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 
 [[package]]
 name = "uuid"
-version = "1.23.1"
+version = "1.23.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76"
+checksum = "d258b83ceec21034727ecee8c382cfa6c3e133699b0742c64571814fb420c9f7"
 dependencies = [
  "getrandom 0.4.2",
  "js-sys",
@@ -3895,9 +3901,9 @@ dependencies = [
 
 [[package]]
 name = "yoke"
-version = "0.8.2"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca"
+checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5"
 dependencies = [
  "stable_deref_trait",
  "yoke-derive",
@@ -3918,18 +3924,18 @@ dependencies = [
 
 [[package]]
 name = "zerocopy"
-version = "0.8.48"
+version = "0.8.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+checksum = "3b065d4f0e55f82fae73202e189638116a87c55ab6b8e6c2721e13dd9d854ad1"
 dependencies = [
  "zerocopy-derive",
 ]
 
 [[package]]
 name = "zerocopy-derive"
-version = "0.8.48"
+version = "0.8.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639"
 dependencies = [
  "proc-macro2",
  "quote",

+ 5 - 0
pandora-config.example.toml

@@ -142,6 +142,11 @@ entropy_seq_len = 10
 # Minimum Shannon entropy threshold.
 min_shannon_entropy = 1.0
 
+# Minimum dbSNP population frequency (max across studies) to flag a variant as a
+# known polymorphism. Variants at or above this frequency with constitutional alt
+# reads are filtered. Set to 0.0 to disable dbSNP-based filtering.
+db_snp_min_freq = 0.001
+
 # Max depth considered "low quality".
 max_depth_low_quality = 20
 

+ 98 - 0
src/annotation/dbsnp.rs

@@ -1,8 +1,17 @@
 use std::{fmt, str::FromStr};
 
 use bitcode::{Decode, Encode};
+use log::{error, warn};
+use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 
+use crate::{
+    annotation::{Annotation, Annotations},
+    io::{readers::TabixReader, vcf::fetch_vcf_with_reader},
+    positions::GenomeRange,
+    variant::vcf_variant::{Info, VcfVariant},
+};
+
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
 pub struct DbSnpFreqEntry {
     pub source: String,
@@ -78,6 +87,17 @@ impl fmt::Display for DbSnpFreq {
 }
 
 impl DbSnpFreq {
+    /// Maximum ALT frequency across real population sources in a dbSNP FREQ field.
+    pub fn max_alt_freq(&self) -> Option<f32> {
+        const EXCLUDED: &[&str] = &["SGDP_PRJ", "dbGaP_PopFreq"];
+        self.0
+            .iter()
+            .filter(|entry| !EXCLUDED.contains(&entry.source.as_str()))
+            .filter_map(|entry| entry.values.get(1).copied().flatten().map(|v| v as f32))
+            .filter(|&v| v > 0.0)
+            .reduce(f32::max)
+    }
+
     /// Average ALT frequency across real population sources in a dbSNP FREQ field.
     pub fn maf(&self) -> Option<f32> {
         const EXCLUDED: &[&str] = &["SGDP_PRJ", "dbGaP_PopFreq"];
@@ -103,3 +123,81 @@ impl DbSnpFreq {
         (count > 0).then_some(sum / count as f32)
     }
 }
+
+/// Annotate `variants` with dbSNP FREQ data in parallel.
+///
+/// Variants that already carry a `GnomAD` annotation are skipped (they are
+/// redundant and will be filtered anyway). Remaining variants are split into
+/// chunks processed by Rayon worker threads. Each worker opens the tabix index
+/// and BGZF file **once** via [`TabixReader`] and reuses that handle for every
+/// variant in its chunk, avoiding per-variant file-open overhead.
+pub fn annotate_dbsnp(
+    variants: &[VcfVariant],
+    annotations: &Annotations,
+    db_snp_path: &str,
+) -> anyhow::Result<()> {
+    if variants.is_empty() {
+        return Ok(());
+    }
+
+    let n_threads = rayon::current_num_threads();
+    let chunk_size = variants.len().div_ceil(n_threads).max(500);
+
+    variants.par_chunks(chunk_size).for_each_init(
+        || TabixReader::open(db_snp_path),
+        |reader_res, chunk| {
+            let Ok(ref mut reader) = reader_res else {
+                error!("Failed to open dbSNP tabix reader for chunk");
+                return;
+            };
+
+            for variant in chunk {
+                if let Some(anns) = annotations.store.get(&variant.hash) {
+                    if anns.iter().any(|a| matches!(a, Annotation::GnomAD(_))) {
+                        continue;
+                    }
+                }
+
+                let region = GenomeRange::new(
+                    &variant.position.contig(),
+                    variant.position.position,
+                    variant.position.position + 1,
+                );
+
+                let dbsnp_records = match fetch_vcf_with_reader(reader, &region) {
+                    Ok(v) => v,
+                    Err(e) => {
+                        warn!(
+                            "dbSNP fetch failed at {}:{}: {e}",
+                            variant.position.contig(),
+                            variant.position.position
+                        );
+                        continue;
+                    }
+                };
+
+                for dv in &dbsnp_records {
+                    if dv != variant {
+                        continue;
+                    }
+                    let freq = dv.infos.0.iter().find_map(|i| {
+                        if let Info::FREQ(f) = i {
+                            Some(f.clone())
+                        } else {
+                            None
+                        }
+                    });
+                    if let Some(freq) = freq {
+                        annotations.insert_update(
+                            variant.hash,
+                            &[Annotation::DbSnp(dv.id.clone(), freq)],
+                        );
+                        break;
+                    }
+                }
+            }
+        },
+    );
+
+    Ok(())
+}

+ 26 - 0
src/annotation/mod.rs

@@ -26,6 +26,7 @@ use crate::{
 use bitcode::{Decode, Encode};
 use cosmic::Cosmic;
 use dashmap::DashMap;
+use dbsnp::DbSnpFreq;
 use gnomad::GnomAD;
 use log::info;
 use rayon::prelude::*;
@@ -65,6 +66,9 @@ pub enum Annotation {
     /// GnomAD population frequency database annotation.
     GnomAD(GnomAD),
 
+    /// dbSNP annotation: rsID and per-population allele frequencies from the FREQ INFO field.
+    DbSnp(String, DbSnpFreq),
+
     /// Flag indicating low Shannon entropy (possibly less reliable region).
     LowEntropy,
 
@@ -225,6 +229,7 @@ impl fmt::Display for Annotation {
             HighConstitAlt => "HighConstitAlt".into(),
             Cosmic(_) => "Cosmic".into(),
             GnomAD(_) => "GnomAD".into(),
+            DbSnp(_, _) => "DbSnp".into(),
             LowEntropy => "LowEntropy".into(),
             VEP(_) => "VEP".into(),
             CpG => "CpG".into(),
@@ -806,3 +811,24 @@ pub fn is_gnomad_and_constit_alt(anns: &[Annotation]) -> bool {
 
     (has_gnomad && has_constit_alt) || only_tumor_with_constit
 }
+
+/// Returns `true` when a variant is annotated with a dbSNP population frequency at or
+/// above `min_freq` **and** has at least one alternate read in the constitutional sample.
+///
+/// Mirrors [`is_gnomad_and_constit_alt`] for dbSNP-sourced population evidence.
+pub fn is_dbsnp_and_constit_alt(anns: &[Annotation], min_freq: f32) -> bool {
+    let has_dbsnp = anns.iter().any(|a| {
+        matches!(a, Annotation::DbSnp(_, freq) if freq.max_alt_freq().unwrap_or(0.0) >= min_freq)
+    });
+    let constit_alt: u16 = anns
+        .iter()
+        .filter_map(|a| {
+            if let Annotation::ConstitAlt(n) = a {
+                Some(*n)
+            } else {
+                None
+            }
+        })
+        .sum();
+    has_dbsnp && constit_alt > 0
+}

+ 1 - 1
src/callers/clairs.rs

@@ -1190,7 +1190,7 @@ mod tests {
     #[test]
     fn clairs_resume() -> anyhow::Result<()> {
         test_init();
-        let id = "DUMCO";
+        let id = "CHALO";
         let config = Config::default();
         let  clairs = ClairS::initialize(id, &config)?;
 

+ 4 - 0
src/config.rs

@@ -140,6 +140,10 @@ pub struct Config {
     /// Minimum Shannon entropy threshold for keeping a variant.
     pub min_shannon_entropy: f64,
 
+    /// Minimum dbSNP population frequency (max across studies) to consider a variant a known
+    /// polymorphism for the dbSNP + constitutional-alt filter.
+    pub db_snp_min_freq: f32,
+
     /// Maximum depth considered "low quality" for certain filters.
     pub max_depth_low_quality: u32,
 

+ 100 - 0
src/io/readers.rs

@@ -278,6 +278,106 @@ where
     Ok(())
 }
 
+/// A reusable handle to a Tabix-indexed BGZF file.
+///
+/// Opening a tabix index and BGZF file is expensive. Use this struct when you
+/// need to issue many region queries against the same file (e.g. one query per
+/// variant in a parallel worker) to pay the open cost only once per thread.
+///
+/// # Example
+/// ```ignore
+/// let mut reader = TabixReader::open("/path/to/file.vcf.gz")?;
+/// reader.fetch_with(&region, |line| { println!("{line}"); Ok(()) })?;
+/// ```
+pub struct TabixReader {
+    index: tabix::Index,
+    reader: bgzf::io::Reader<File>,
+    buf: String,
+}
+
+impl TabixReader {
+    /// Open `bgz_path` and its sidecar `.tbi` index.
+    pub fn open(bgz_path: &str) -> anyhow::Result<Self> {
+        let tbi_path = format!("{bgz_path}.tbi");
+        let index = tabix::fs::read(&tbi_path)
+            .with_context(|| format!("cannot read tabix index: {tbi_path}"))?;
+        let reader = bgzf::io::Reader::new(
+            File::open(bgz_path).with_context(|| format!("cannot open {bgz_path}"))?,
+        );
+        Ok(Self {
+            index,
+            reader,
+            buf: String::new(),
+        })
+    }
+
+    /// Visit every data line overlapping `region`, calling `on_line` for each one.
+    ///
+    /// Mirrors [`fetch_tabix_lines_with`] but reuses the already-open index and
+    /// BGZF reader rather than re-opening them.
+    pub fn fetch_with<F>(&mut self, region: &GenomeRange, mut on_line: F) -> anyhow::Result<()>
+    where
+        F: FnMut(&str) -> anyhow::Result<()>,
+    {
+        if region.range.is_empty() {
+            return Ok(());
+        }
+
+        let rname = region.contig();
+        let ref_seq_id = match self
+            .index
+            .header()
+            .ok_or_else(|| anyhow::anyhow!("tabix index has no header"))?
+            .reference_sequence_names()
+            .get_index_of(rname.as_bytes())
+        {
+            Some(id) => id,
+            None => return Ok(()),
+        };
+
+        let interval_start = Position::try_from(region.range.start as usize + 1)
+            .context("region start overflow")?;
+        let interval_end =
+            Position::try_from(region.range.end as usize).context("region end overflow")?;
+        let interval = Interval::from(interval_start..=interval_end);
+
+        let chunks = self
+            .index
+            .query(ref_seq_id, interval)
+            .context("tabix region query failed")?;
+
+        if chunks.is_empty() {
+            return Ok(());
+        }
+
+        for chunk in chunks {
+            self.reader
+                .seek(chunk.start())
+                .context("BGZF seek failed")?;
+
+            loop {
+                if self.reader.virtual_position() >= chunk.end() {
+                    break;
+                }
+                self.buf.clear();
+                let n = self
+                    .reader
+                    .read_line(&mut self.buf)
+                    .context("BGZF read_line failed")?;
+                if n == 0 {
+                    break;
+                }
+                let line = self.buf.trim_end_matches(|c| c == '\n' || c == '\r');
+                if !line.is_empty() && !line.starts_with('#') {
+                    on_line(line)?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
 /// Convenience wrapper over [`fetch_tabix_lines_with`] that collects results
 /// into a `Vec<String>`.
 ///

+ 25 - 1
src/io/vcf.rs

@@ -29,7 +29,7 @@ use crate::{
 
 use super::{
     dict::read_dict,
-    readers::{fetch_tabix_lines_with, get_reader},
+    readers::{fetch_tabix_lines_with, get_reader, TabixReader},
 };
 
 /// Load a BGZF-compressed or plain VCF file into memory.
@@ -168,6 +168,30 @@ pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<Vcf
     Ok(variants)
 }
 
+/// Like [`fetch_vcf`] but reuses a pre-opened [`TabixReader`] instead of
+/// reopening the index and BGZF file on every call.
+///
+/// Use this inside `par_chunks` / `for_each_init` workers where one reader
+/// is initialised per thread and shared across all variants in that chunk.
+pub fn fetch_vcf_with_reader(
+    reader: &mut TabixReader,
+    region: &GenomeRange,
+) -> anyhow::Result<Vec<VcfVariant>> {
+    let mut variants = Vec::new();
+    reader.fetch_with(region, |line| {
+        let v: VcfVariant = line
+            .parse()
+            .with_context(|| format!("failed to parse VCF line: {line}"))?;
+        if v.position.contig == region.contig && region.range.contains(&v.position.position) {
+            for split in from_multiallelic(v)? {
+                variants.push(split);
+            }
+        }
+        Ok(())
+    })?;
+    Ok(variants)
+}
+
 /// A flat VCF row for tab-separated line parsing.
 ///
 /// Use [`VcfVariant`] for richer parsed access. This struct exists for

+ 4 - 3
src/lib.rs

@@ -416,6 +416,7 @@ mod tests {
     #[test]
     fn snv_parse() -> anyhow::Result<()> {
         init();
+        let config = Config::default();
         // ClairS
         let row = "chr1\t10407\t.\tA\tG\t10.1\tPASS\tF\tGT:GQ:DP:AD:AF\t0/1:10:31:10,20:0.6452";
         let variant: VcfVariant = row.parse()?;
@@ -434,7 +435,7 @@ mod tests {
         variant_col.annotate_with_constit_bam(
             &annotations,
             "/data/longreads_basic_pipe/ACHITE/mrd/ACHITE_mrd_hs1.bam",
-            1,
+            &config,
         )?;
         // DeepVariant
         let row =
@@ -551,7 +552,7 @@ mod tests {
         variant_col.annotate_with_constit_bam(
             &annotations,
             "/data/longreads_basic_pipe/ACHITE/mrd/ACHITE_mrd_hs1.bam",
-            1,
+            &Config::default(),
         )?;
 
         Ok(())
@@ -616,7 +617,7 @@ mod tests {
         variant_col.annotate_with_constit_bam(
             &annotations,
             "/data/longreads_basic_pipe/PASSARD/mrd/PASSARD_mrd_hs1.bam",
-            1,
+            &Config::default(),
         )?;
         println!("{variant_col:?}");
         println!("{annotations:?}");

+ 51 - 9
src/pipes/somatic.rs

@@ -59,7 +59,7 @@
 //! - Pandora export helpers
 //! - Annotation matrix generation utilities
 use crate::{
-    annotation::{is_gnomad_and_constit_alt, Caller},
+    annotation::{dbsnp::annotate_dbsnp, is_dbsnp_and_constit_alt, is_gnomad_and_constit_alt, Caller},
     create_should_run_normal_tumoral, init_solo_callers_normal_tumoral,
     io::bed::read_bed,
     pipes::{Initialize, InitializeSolo, ShouldRun},
@@ -420,7 +420,7 @@ impl Run for SomaticPipe {
         // Annotate with depth and number of alternate reads from constitutional BAM
         info!("Reading Constit BAM file for depth and pileup annotation.");
         variants_collections.iter().try_for_each(|c| {
-            c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150)
+            c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), &self.config)
         })?;
         annotations.somatic_constit_boundaries(
             self.config.somatic_max_alt_constit,
@@ -489,7 +489,6 @@ impl Run for SomaticPipe {
                 &annotations,
                 &self.config.reference,
                 self.config.entropy_seq_len,
-                self.config.somatic_pipe_threads,
             )
         })?;
 
@@ -535,6 +534,40 @@ impl Run for SomaticPipe {
                 "{stats_dir}/{id}_annotations_06_gnomad_filter.json"
             ))?;
 
+        // Annotate with dbSNP (skip variants already carrying a GnomAD annotation)
+        info!("Annotating with dbSNP.");
+        variants_collections
+            .iter()
+            .try_for_each(|c| annotate_dbsnp(&c.variants, &annotations, &config.db_snp))?;
+
+        annotations
+            .callers_stat(Some(Box::new(|v| {
+                matches!(v, Annotation::Callers(_, _) | Annotation::DbSnp(_, _))
+            })))
+            .save_to_json(&format!("{stats_dir}/{id}_annotations_06b_dbsnp.json"))?;
+
+        // Filter: remove variants in dbSNP with sufficient population freq AND constit alt
+        info!(
+            "Filtering out variants in dbSNP with population AF >= {} and constit alt.",
+            config.db_snp_min_freq
+        );
+        somatic_stats.n_dbsnp_constit = annotations
+            .retain_variants(&mut variants_collections, |anns| {
+                !is_dbsnp_and_constit_alt(anns, config.db_snp_min_freq)
+            });
+        info!(
+            "{} variants filtered by dbSNP + constit alt.",
+            somatic_stats.n_dbsnp_constit
+        );
+
+        annotations
+            .callers_stat(Some(Box::new(|v| {
+                matches!(v, Annotation::Callers(_, _) | Annotation::DbSnp(_, _))
+            })))
+            .save_to_json(&format!(
+                "{stats_dir}/{id}_annotations_06c_dbsnp_filter.json"
+            ))?;
+
         // Filter low entropy variants
         annotations.low_shannon_entropy(self.config.min_shannon_entropy);
 
@@ -766,6 +799,10 @@ fn filter_step_summaries(somatic_pipe_stats: &SomaticPipeStats) -> Vec<FilterSte
             name: "gnomad_and_constit_alt".to_string(),
             removed: somatic_pipe_stats.n_high_alt_constit_gnomad,
         },
+        FilterStepSummary {
+            name: "dbsnp_and_constit_alt".to_string(),
+            removed: somatic_pipe_stats.n_dbsnp_constit,
+        },
         FilterStepSummary {
             name: "low_entropy".to_string(),
             removed: somatic_pipe_stats.n_low_entropies,
@@ -817,6 +854,11 @@ pub struct SomaticPipeStats {
     /// [`is_gnomad_and_constit_alt`].
     pub n_high_alt_constit_gnomad: usize,
 
+    /// Number of variants removed because they are present in dbSNP at or above
+    /// `config.db_snp_min_freq` and also show alternate-allele support in the
+    /// constitutional BAM, according to [`is_dbsnp_and_constit_alt`].
+    pub n_dbsnp_constit: usize,
+
     /// Number of variants removed due to low sequence complexity (Shannon entropy),
     /// i.e. annotated with [`Annotation::LowEntropy`].
     pub n_low_entropies: usize,
@@ -1112,8 +1154,6 @@ impl InputStats {
 
 #[cfg(test)]
 mod tests {
-    use rayon::ThreadPoolBuilder;
-
     use super::*;
     use crate::helpers::test_init;
 
@@ -1122,11 +1162,13 @@ mod tests {
         test_init();
         let config = Config::default();
 
-        let pool = ThreadPoolBuilder::new()
-            .num_threads(config.threads.into())
-            .build()?;
+        // let pool = ThreadPoolBuilder::new()
+        //     .num_threads(config.threads.into())
+        //     .build()?;
+        //
+        // pool.install(|| SomaticPipe::initialize("CHALO", &config)?.run())?;
 
-        pool.install(|| SomaticPipe::initialize("DUMCO", &config)?.run())?;
+        SomaticPipe::initialize("DUMCO", &config)?.run()?;
         Ok(())
     }
 }

+ 137 - 126
src/variant/variant_collection.rs

@@ -9,7 +9,7 @@ use std::{
 use crate::{
     callers::nanomonsv::nanomonsv_insert_classify,
     io::{somaticpipe_container::PandoraReader, tsv::TsvLine, vcf::read_vcf},
-    runners::Run,
+    runners::Run, variant::vcf_variant::VariantId,
 };
 use anyhow::Context;
 use bitcode::{Decode, Encode};
@@ -309,13 +309,9 @@ impl VariantCollection {
         Ok(())
     }
 
-    fn chunk_size(&self, max_threads: u8) -> usize {
-        let total_items = self.variants.len();
-        let min_chunk_size = 1000;
-        let max_chunks = max_threads;
-
-        let optimal_chunk_size = total_items.div_ceil(max_chunks as usize);
-        optimal_chunk_size.max(min_chunk_size)
+    fn chunk_size(&self) -> usize {
+        let n_threads = rayon::current_num_threads().max(1);
+        self.variants.len().div_ceil(n_threads).max(1)
     }
 
     /// Annotates variants with local sequence context–based features:
@@ -372,7 +368,6 @@ impl VariantCollection {
         annotations: &Annotations,
         reference: &str,
         seq_len: usize,
-        max_threads: u8,
     ) -> anyhow::Result<()> {
         // Preflight: fail early rather than panicking in rayon workers.
         noodles_fasta::io::indexed_reader::Builder::default()
@@ -382,7 +377,7 @@ impl VariantCollection {
         // Need at least 3 to compute a trinucleotide.
         let seq_len = seq_len.max(3);
 
-        let chunk_size = self.chunk_size(max_threads);
+        let chunk_size = self.chunk_size();
 
         self.variants.par_chunks(chunk_size).for_each_init(
             || noodles_fasta::io::indexed_reader::Builder::default().build_from_path(reference),
@@ -530,7 +525,7 @@ impl VariantCollection {
         &self,
         annotations: &Annotations,
         constit_bam_path: &str,
-        max_threads: u8,
+        config: &Config,
     ) -> anyhow::Result<()> {
         fn folder<'a>(alt_seq: &'a str) -> impl Fn((u32, u32), (String, i32)) -> (u32, u32) + 'a {
             move |(depth_acc, alt_acc), (seq, n): (String, i32)| {
@@ -543,142 +538,158 @@ impl VariantCollection {
             }
         }
 
-        self.variants
-            .par_chunks(self.chunk_size(max_threads))
-            .try_for_each(|chunk| {
-                let mut bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path)
-                    .map_err(|e| anyhow::anyhow!("Failed to open BAM file: {e}"))?;
+        // Preflight: fail early rather than silently inside rayon workers.
+        rust_htslib::bam::IndexedReader::from_path(constit_bam_path)
+            .map_err(|e| anyhow::anyhow!("Failed to open BAM {constit_bam_path}: {e}"))?;
+        noodles_fasta::io::indexed_reader::Builder::default()
+            .build_from_path(&config.reference)
+            .map_err(|e| anyhow::anyhow!("Failed to open FASTA {}: {e}", config.reference))?;
 
-                let c = crate::config::Config::default();
+        let chunk_size = self.chunk_size();
+        let reference = config.reference.clone();
 
-                let mut fasta_reader = noodles_fasta::io::indexed_reader::Builder::default()
-                    .build_from_path(c.reference)?;
+        self.variants
+            .par_chunks(chunk_size)
+            .for_each_init(
+                || {
+                    let bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path);
+                    let fasta = noodles_fasta::io::indexed_reader::Builder::default()
+                        .build_from_path(&reference);
+                    (bam, fasta)
+                },
+                |(bam_res, fasta_res), chunk| {
+                    let Ok(ref mut bam) = bam_res else {
+                        error!("BAM reader unavailable for chunk");
+                        return;
+                    };
+                    let Ok(ref mut fasta_reader) = fasta_res else {
+                        error!("FASTA reader unavailable for chunk");
+                        return;
+                    };
 
-                for var in chunk {
-                    let key = var.hash;
-                    let mut anns = annotations.store.entry(key).or_default();
+                    for var in chunk {
+                        let key = var.hash;
+                        let mut anns = annotations.store.entry(key).or_default();
 
-                    if anns
-                        .iter()
-                        .filter(|e| {
-                            matches!(e, Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_))
-                        })
-                        .count()
-                        != 2
-                    {
-                        match var.alteration_category() {
-                            AlterationCategory::SNV => {
-                                let pileup = counts_at(
-                                    &mut bam,
-                                    &var.position.contig(),
-                                    var.position.position,
-                                )?;
-                                let alt_seq = var.alternative.to_string();
-
-                                let (depth, alt) =
-                                    pileup.into_iter().fold((0, 0), folder(&alt_seq));
-                                // debug!("{} {alt} / {depth}", var.variant_id());
-                                anns.push(Annotation::ConstitDepth(depth as u16));
-                                anns.push(Annotation::ConstitAlt(alt as u16));
-                            }
-                            AlterationCategory::DEL => {
-                                if let Some(del_repr) = var.deletion_desc() {
-                                    let len = var.deletion_len().unwrap_or_default();
+                        if anns
+                            .iter()
+                            .filter(|e| {
+                                matches!(e, Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_))
+                            })
+                            .count()
+                            == 2
+                        {
+                            continue;
+                        }
 
-                                    let pileup_start = crate::collection::bam::nt_pileup_new(
-                                        &mut bam,
+                        let result = (|| -> anyhow::Result<()> {
+                            match var.alteration_category() {
+                                AlterationCategory::SNV => {
+                                    let pileup = counts_at(
+                                        bam,
                                         &var.position.contig(),
-                                        del_repr.start.saturating_sub(1),
-                                        false,
+                                        var.position.position,
                                     )?;
+                                    let alt_seq = var.alternative.to_string();
+                                    let (depth, alt) =
+                                        pileup.into_iter().fold((0, 0), folder(&alt_seq));
+                                    anns.push(Annotation::ConstitDepth(depth as u16));
+                                    anns.push(Annotation::ConstitAlt(alt as u16));
+                                }
+                                AlterationCategory::DEL => {
+                                    if let Some(del_repr) = var.deletion_desc() {
+                                        let len = var.deletion_len().unwrap_or_default();
 
-                                    let pileup_end = crate::collection::bam::nt_pileup_new(
-                                        &mut bam,
-                                        &var.position.contig(),
-                                        del_repr.end.saturating_sub(1),
-                                        false,
-                                    )?;
+                                        let pileup_start = crate::collection::bam::nt_pileup_new(
+                                            bam,
+                                            &var.position.contig(),
+                                            del_repr.start.saturating_sub(1),
+                                            false,
+                                        )?;
 
-                                    let tol = if len > 1 {
-                                        let seq = crate::io::fasta::sequence_range(
-                                            &mut fasta_reader,
+                                        let pileup_end = crate::collection::bam::nt_pileup_new(
+                                            bam,
                                             &var.position.contig(),
-                                            del_repr.start.saturating_sub(1) as usize,
-                                            del_repr.end.saturating_sub(1) as usize,
+                                            del_repr.end.saturating_sub(1),
+                                            false,
                                         )?;
 
-                                        match detect_repetition(&seq) {
-                                            Repeat::None => 0,
-                                            Repeat::RepOne(_, _) => 3,
-                                            Repeat::RepTwo(_, _) => 3,
-                                        }
-                                    } else {
-                                        0
-                                    };
-
-                                    let alt: u32 = pileup_start
-                                        .iter()
-                                        .map(|pb| match pb {
-                                            crate::collection::bam::PileBase::Del((_qn, l))
-                                                if /* end_qnames.contains(qn) */
-                                                     *l >= len.saturating_sub(tol).max(1)
-                                                    && *l <= len + tol =>
-                                            {
-                                                1
+                                        let tol = if len > 1 {
+                                            let seq = crate::io::fasta::sequence_range(
+                                                fasta_reader,
+                                                &var.position.contig(),
+                                                del_repr.start.saturating_sub(1) as usize,
+                                                del_repr.end.saturating_sub(1) as usize,
+                                            )?;
+                                            match detect_repetition(&seq) {
+                                                Repeat::None => 0,
+                                                Repeat::RepOne(_, _) => 3,
+                                                Repeat::RepTwo(_, _) => 3,
                                             }
-                                            _ => 0,
-                                        })
-                                        .sum();
-
-                                    let depth = pileup_start.len().max(pileup_end.len());
-
-                                    // debug!("{} {alt} / {depth} {len}", var.variant_id());
-
-                                    anns.push(Annotation::ConstitAlt(alt as u16));
-                                    anns.push(Annotation::ConstitDepth(depth as u16));
-                                } else {
-                                    anns.push(Annotation::ConstitAlt(0_u16));
-                                    anns.push(Annotation::ConstitDepth(111_u16));
+                                        } else {
+                                            0
+                                        };
+
+                                        let alt: u32 = pileup_start
+                                            .iter()
+                                            .map(|pb| match pb {
+                                                crate::collection::bam::PileBase::Del((_qn, l))
+                                                    if *l >= len.saturating_sub(tol).max(1)
+                                                        && *l <= len + tol =>
+                                                {
+                                                    1
+                                                }
+                                                _ => 0,
+                                            })
+                                            .sum();
+
+                                        let depth = pileup_start.len().max(pileup_end.len());
+                                        anns.push(Annotation::ConstitAlt(alt as u16));
+                                        anns.push(Annotation::ConstitDepth(depth as u16));
+                                    } else {
+                                        anns.push(Annotation::ConstitAlt(0_u16));
+                                        anns.push(Annotation::ConstitDepth(111_u16));
+                                    }
                                 }
-                            }
-                            AlterationCategory::INS => {
-                                let normal_pileup = crate::collection::bam::nt_pileup_new(
-                                    &mut bam,
-                                    &var.position.contig(),
-                                    var.position.position,
-                                    false,
-                                )?;
-                                let normal_depth = normal_pileup.len();
-
-                                let alt_seq = var.inserted_seq().unwrap_or_default();
-                                let alt_seq = alt_seq.as_bytes().to_vec();
-
-                                let mut normal_n_alt = 0;
-                                for pile in normal_pileup {
-                                    if let PileBase::Ins { len, seq } = pile {
-                                        if let Some(seq) = seq {
-                                            let dist = levenshtein_exp(&alt_seq, &seq);
-                                            let dist_frac = dist as f64 / len as f64;
-                                            // dbg!(dist_frac);
-                                            if dist_frac < 0.1 {
+                                AlterationCategory::INS => {
+                                    let normal_pileup = crate::collection::bam::nt_pileup_new(
+                                        bam,
+                                        &var.position.contig(),
+                                        var.position.position,
+                                        false,
+                                    )?;
+                                    let normal_depth = normal_pileup.len();
+                                    let alt_seq = var.inserted_seq().unwrap_or_default();
+                                    let alt_seq = alt_seq.as_bytes().to_vec();
+
+                                    let mut normal_n_alt = 0;
+                                    for pile in normal_pileup {
+                                        if let PileBase::Ins { len, seq } = pile {
+                                            if let Some(seq) = seq {
+                                                let dist = levenshtein_exp(&alt_seq, &seq);
+                                                let dist_frac = dist as f64 / len as f64;
+                                                if dist_frac < 0.1 {
+                                                    normal_n_alt += 1;
+                                                }
+                                            } else if alt_seq.len() as u32 == len {
                                                 normal_n_alt += 1;
                                             }
-                                        } else if alt_seq.len() as u32 == len {
-                                            normal_n_alt += 1;
                                         }
                                     }
+                                    anns.push(Annotation::ConstitAlt(normal_n_alt as u16));
+                                    anns.push(Annotation::ConstitDepth(normal_depth as u16));
                                 }
-
-                                // debug!("{} {alt} / {depth} ", var.variant_id());
-                                anns.push(Annotation::ConstitAlt(normal_n_alt as u16));
-                                anns.push(Annotation::ConstitDepth(normal_depth as u16));
+                                _ => (),
                             }
-                            _ => (),
+                            Ok(())
+                        })();
+
+                        if let Err(e) = result {
+                            warn!("BAM annotation failed for {}: {e}", var.variant_id());
                         }
                     }
-                }
-                anyhow::Ok(())
-            })?;
+                },
+            );
         Ok(())
     }
 
@@ -2388,7 +2399,7 @@ mod tests {
 
         let constit_bam_path = &config.normal_bam("CHAHA");
         let annotations = Annotations::default();
-        coll.annotate_with_constit_bam(&annotations, constit_bam_path, 1)?;
+        coll.annotate_with_constit_bam(&annotations, constit_bam_path, &config)?;
 
         println!("{annotations:#?}");
         Ok(())