Ver Fonte

blake3 hash128

Thomas há 1 ano atrás
pai
commit
0c0c97d84b

+ 16 - 8
src/annotation/mod.rs

@@ -11,7 +11,7 @@ use std::{
     sync::Arc,
 };
 
-use crate::{helpers::mean, variant::variant_collection::VariantCollection};
+use crate::{helpers::{mean, Blake3BuildHasher, Hash128}, variant::variant_collection::VariantCollection};
 use cosmic::Cosmic;
 use dashmap::DashMap;
 use gnomad::GnomAD;
@@ -103,9 +103,17 @@ impl fmt::Display for Caller {
     }
 }
 
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Clone)]
 pub struct Annotations {
-    pub store: DashMap<u128, Vec<Annotation>>,
+    pub store: DashMap<Hash128, Vec<Annotation>, Blake3BuildHasher>,
+}
+
+impl Default for Annotations {
+    fn default() -> Self {
+        Annotations {
+            store: DashMap::with_hasher(Blake3BuildHasher),
+        }
+    }
 }
 
 #[derive(Debug, Default, Clone)]
@@ -115,7 +123,7 @@ pub struct AnnotationsStats {
 }
 
 impl Annotations {
-    pub fn insert_update(&self, key: u128, add: &[Annotation]) {
+    pub fn insert_update(&self, key: Hash128, add: &[Annotation]) {
         self.store
             .entry(key)
             .or_default()
@@ -195,7 +203,7 @@ impl Annotations {
     pub fn get_keys_filter(
         &self,
         filter: impl Fn(&Vec<Annotation>) -> bool + Send + Sync,
-    ) -> Vec<u128> {
+    ) -> Vec<Hash128> {
         self.store
             .par_iter()
             .filter(|entry| filter(entry.value()))
@@ -238,7 +246,7 @@ impl Annotations {
                 c.variants = c
                     .variants
                     .par_iter()
-                    .filter(|a| keys.contains(&a.hash_variant()))
+                    .filter(|a| keys.contains(&a.hash()))
                     // .filter(|a| keys.par_iter().any(|k| k == &a.hash_variant()))
                     .cloned()
                     .collect();
@@ -254,11 +262,11 @@ impl Annotations {
         n_removed
     }
 
-    pub fn retain_keys(&mut self, keys_to_keep: &HashSet<u128>) {
+    pub fn retain_keys(&mut self, keys_to_keep: &HashSet<Hash128>) {
         self.store.retain(|key, _| keys_to_keep.contains(key));
     }
 
-    pub fn remove_keys(&mut self, keys_to_remove: &HashSet<u128>) {
+    pub fn remove_keys(&mut self, keys_to_remove: &HashSet<Hash128>) {
         self.store.retain(|key, _| !keys_to_remove.contains(key));
     }
 

+ 2 - 2
src/callers/clairs.rs

@@ -198,7 +198,7 @@ impl Variants for ClairS {
         let variants = read_vcf(&self.vcf_passed)?;
 
         variants.par_iter().for_each(|v| {
-            annotations.insert_update(v.hash_variant(), &add);
+            annotations.insert_update(v.hash(), &add);
         });
         Ok(VariantCollection {
             variants,
@@ -214,7 +214,7 @@ impl ClairS {
         let add = vec![Annotation::Callers(Caller::ClairS), Annotation::Germline];
         let variants = read_vcf(&self.clair3_germline_passed)?;
         variants.par_iter().for_each(|v| {
-            annotations.insert_update(v.hash_variant(), &add);
+            annotations.insert_update(v.hash(), &add);
         });
 
         Ok(VariantCollection {

+ 1 - 1
src/callers/deep_variant.rs

@@ -151,7 +151,7 @@ impl Variants for DeepVariant {
         info!("Loading variant from DeepVariant {} {} with annotations: {:?}", self.id, self.time, add);
         let variants = read_vcf(&self.vcf_passed)?;
         variants.par_iter().for_each(|v| {
-            annotations.insert_update(v.hash_variant(), &add);
+            annotations.insert_update(v.hash(), &add);
         });
 
         Ok(VariantCollection {

+ 2 - 2
src/callers/nanomonsv.rs

@@ -147,7 +147,7 @@ impl Variants for NanomonSV {
         let variants = read_vcf(&self.vcf_passed)?;
 
         variants.par_iter().for_each(|v| {
-            annotations.insert_update(v.hash_variant(), &add);
+            annotations.insert_update(v.hash(), &add);
         });
         Ok(VariantCollection {
             variants,
@@ -281,7 +281,7 @@ impl Variants for NanomonSVSolo {
         let variants = read_vcf(&self.vcf_passed)?;
 
         variants.par_iter().for_each(|v| {
-            annotations.insert_update(v.hash_variant(), &add);
+            annotations.insert_update(v.hash(), &add);
         });
         Ok(VariantCollection {
             variants,

+ 1 - 1
src/callers/savana.rs

@@ -158,7 +158,7 @@ impl Variants for Savana {
         let variants = read_vcf(&self.passed_vcf)?;
 
         variants.par_iter().for_each(|v| {
-            annotations.insert_update(v.hash_variant(), &add);
+            annotations.insert_update(v.hash(), &add);
         });
         Ok(VariantCollection {
             variants,

+ 1 - 1
src/callers/severus.rs

@@ -161,7 +161,7 @@ impl Variants for Severus {
         let variants = read_vcf(&vcf_passed)?;
 
         variants.par_iter().for_each(|v| {
-            annotations.insert_update(v.hash_variant(), &add);
+            annotations.insert_update(v.hash(), &add);
         });
         Ok(VariantCollection {
             variants,

+ 28 - 0
src/helpers.rs

@@ -1,5 +1,6 @@
 use anyhow::Context;
 use log::warn;
+use serde::{Deserialize, Serialize};
 use std::{
     cmp::Ordering,
     collections::HashMap,
@@ -328,4 +329,31 @@ impl BuildHasher for Blake3BuildHasher {
         Blake3Hash(Blake3Hasher::new())
     }
 }
+// Custom 128-bit hash type
+#[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug)]
+pub struct Hash128([u8; 16]);
 
+// use std::hash::{Hash, Hasher};
+
+impl std::hash::Hash for Hash128 {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        state.write(&self.0);
+    }
+}
+
+
+impl Hash128 {
+    pub fn new(bytes: [u8; 16]) -> Self {
+        Hash128(bytes)
+    }
+    pub fn to_bytes(&self) -> [u8; 16] {
+        self.0
+    }
+}
+
+// impl Hash for Hash128 {
+//     fn hash<H: Hasher>(&self, state: &mut H) {
+//         state.write(&self.0);
+//     }
+// }
+//

+ 53 - 30
src/variant/variant.rs

@@ -1,5 +1,6 @@
 use crate::{
     annotation::Annotations,
+    helpers::Hash128,
     positions::{GenomePosition, GetGenomePosition, VcfPosition},
     runners::Run,
     variant::variant_collection::VariantCollection,
@@ -12,6 +13,7 @@ use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr};
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct VcfVariant {
+    pub hash: Hash128,
     pub position: GenomePosition,
     pub id: String,
     pub reference: ReferenceAlternative,
@@ -68,22 +70,36 @@ impl FromStr for VcfVariant {
             Formats::default()
         };
 
+        let position: GenomePosition = vcf_position.into();
+        let reference: ReferenceAlternative = v
+            .get(3)
+            .ok_or(anyhow!("Can't parse reference from: {s}"))?
+            .parse()
+            .context(format!("Can't parse reference from: {s}"))?;
+        let alternative: ReferenceAlternative = v
+            .get(4)
+            .ok_or(anyhow!("Can't parse alternative from: {s}"))?
+            .parse()
+            .context(format!("Can't parse alternative from: {s}"))?;
+
+
+        let mut hasher = blake3::Hasher::new();
+        hasher.update(&position.contig.to_ne_bytes()); // Contig as native-endian bytes
+        hasher.update(&position.position.to_ne_bytes()); // Position as native-endian bytes
+        hasher.update(reference.to_string().as_bytes()); // Reference string as bytes
+        hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes
+        let hash = hasher.finalize();
+        let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
+
         Ok(Self {
-            position: vcf_position.into(),
+            hash,
+            position,
             id: v
                 .get(2)
                 .ok_or(anyhow!("Can't parse id from: {s}"))?
                 .to_string(),
-            reference: v
-                .get(3)
-                .ok_or(anyhow!("Can't parse reference from: {s}"))?
-                .parse()
-                .context(format!("Can't parse reference from: {s}"))?,
-            alternative: v
-                .get(4)
-                .ok_or(anyhow!("Can't parse alternative from: {s}"))?
-                .parse()
-                .context(format!("Can't parse alternative from: {s}"))?,
+            reference,
+            alternative,
             quality: v
                 .get(5)
                 .map(|s| s.parse::<f32>().ok()) // Try to parse as f64; returns Option<f64>
@@ -131,8 +147,13 @@ impl VcfVariant {
         columns.join("\t")
     }
 
+    pub fn hash(&self) -> Hash128 {
+        self.hash
+    }
+
     pub fn commun_deepvariant_clairs(&self) -> VcfVariant {
         VcfVariant {
+            hash: self.hash,
             position: self.position.clone(),
             id: self.id.clone(),
             reference: self.reference.clone(),
@@ -144,25 +165,27 @@ impl VcfVariant {
         }
     }
 
-    pub fn hash_variant(&self) -> u128 {
-        // Create a new BLAKE3 hasher
-        let mut hasher = blake3::Hasher::new();
-        // Update the hasher with the fields of the variant
-        hasher.update(&self.position.contig.to_ne_bytes()); // Convert position to bytes
-        hasher.update(&self.position.position.to_ne_bytes()); // Convert position to bytes
-        hasher.update(self.reference.to_string().as_bytes()); // Reference string as bytes
-        hasher.update(self.alternative.to_string().as_bytes()); // Alternative string as bytes
-
-        // Finalize the hash and get the output
-        let hash_output = hasher.finalize();
-
-        // Convert the first 16 bytes of the hash output to a u128
-        let mut array = [0u8; 16];
-        array.copy_from_slice(&hash_output.as_bytes()[..16]);
-
-        // Convert to u128
-        u128::from_ne_bytes(array)
-    }
+    // pub fn hash_variant(&self) -> Hash128 {
+    //     // Create a new BLAKE3 hasher
+    //     let mut hasher = blake3::Hasher::new();
+    //     // Update the hasher with the fields of the variant
+    //     hasher.update(&self.position.contig.to_ne_bytes()); // Convert position to bytes
+    //     hasher.update(&self.position.position.to_ne_bytes()); // Convert position to bytes
+    //     hasher.update(self.reference.to_string().as_bytes()); // Reference string as bytes
+    //     hasher.update(self.alternative.to_string().as_bytes()); // Alternative string as bytes
+    //
+    //     // Finalize the hash and get the output
+    //     // let hash_output = hasher.finalize();
+    //
+    //     // Convert the first 16 bytes of the hash output to a u128
+    //     // let mut array = [0u8; 16];
+    //     // array.copy_from_slice(&hash_output.as_bytes()[..16]);
+    //
+    //     // Convert to u128
+    //     // u128::from_ne_bytes(array)
+    //     let hash = hasher.finalize();
+    //     Hash128::new(hash.as_bytes()[..16].try_into().unwrap())
+    // }
 
     pub fn alteration_category(&self) -> AlterationCategory {
         match (&self.reference, &self.alternative) {

+ 18 - 18
src/variant/variant_collection.rs

@@ -23,7 +23,7 @@ use crate::{
         bam::{counts_at, counts_ins_at},
         vcf::Vcf,
     },
-    helpers::{app_storage_dir, estimate_shannon_entropy, temp_file_path},
+    helpers::{app_storage_dir, estimate_shannon_entropy, temp_file_path, Hash128},
     io::{readers::get_reader, vcf::vcf_header},
     pipes::somatic::sequence_at,
 };
@@ -37,18 +37,18 @@ pub struct VariantCollection {
 }
 
 impl VariantCollection {
-    pub fn keys(&self) -> Vec<u128> {
-        self.variants.iter().map(|v| v.hash_variant()).collect()
+    pub fn keys(&self) -> Vec<Hash128> {
+        self.variants.iter().map(|v| v.hash()).collect()
     }
 
-    pub fn retain_keys(&mut self, keys_to_keep: &HashSet<u128>) {
+    pub fn retain_keys(&mut self, keys_to_keep: &HashSet<Hash128>) {
         self.variants
-            .retain(|v| keys_to_keep.contains(&v.hash_variant()));
+            .retain(|v| keys_to_keep.contains(&v.hash()));
     }
 
-    pub fn remove_keys(&mut self, keys_to_remove: &HashSet<u128>) {
+    pub fn remove_keys(&mut self, keys_to_remove: &HashSet<Hash128>) {
         self.variants
-            .retain(|v| !keys_to_remove.contains(&v.hash_variant()));
+            .retain(|v| !keys_to_remove.contains(&v.hash()));
     }
 
     pub fn partition<F>(self, predicate: F) -> (Self, Self)
@@ -102,7 +102,7 @@ impl VariantCollection {
                     .unwrap();
 
                 for c in chunk {
-                    let key = c.hash_variant();
+                    let key = c.hash();
                     let mut anns = annotations.store.entry(key).or_default();
 
                     if !anns
@@ -136,7 +136,7 @@ impl VariantCollection {
                     .map_err(|e| anyhow::anyhow!("Failed to open BAM file: {e}"))?;
 
                 for var in chunk {
-                    let key = var.hash_variant();
+                    let key = var.hash();
                     let mut anns = annotations.store.entry(key).or_default();
 
                     if anns
@@ -233,7 +233,7 @@ impl ExternalAnnotation {
         let mut unfound = Vec::new();
 
         for variant in variants {
-            let hash: u128 = variant.hash_variant();
+            let hash  = variant.hash();
             let mut has_pushed = false;
 
             // Check COSMIC
@@ -277,12 +277,12 @@ impl ExternalAnnotation {
 
     fn get_annotation<T: serde::de::DeserializeOwned>(
         &self,
-        hash: u128,
+        hash: Hash128,
         source: &str,
     ) -> anyhow::Result<Option<T>> {
         let result: SqliteResult<Vec<u8>> = self.conn.query_row(
             "SELECT data FROM annotations WHERE hash = ? AND source = ?",
-            params![hash.to_le_bytes().to_vec(), source],
+            params![hash.to_bytes(), source],
             |row| row.get(0),
         );
 
@@ -354,7 +354,7 @@ impl ExternalAnnotation {
 
                     let (cosmic, gnomad) = parse_echtvar_val(&row.info)?;
 
-                    let hash = chunk[i].hash_variant();
+                    let hash = chunk[i].hash();
 
                     chunk_results.push((hash, cosmic, gnomad));
                 }
@@ -416,7 +416,7 @@ impl ExternalAnnotation {
         let mut unfound = Vec::new();
 
         for variant in variants {
-            let hash: u128 = variant.hash_variant();
+            let hash = variant.hash();
 
             // Check VEP
             match self.get_annotation(hash, "VEP")? {
@@ -495,14 +495,14 @@ impl ExternalAnnotation {
                 fs::remove_file(out_vep)?;
 
                 let mut n_not_vep = 0;
-                let mut chunk_results: Vec<(u128, Vec<VEP>)> = Vec::new();
+                let mut chunk_results: Vec<(Hash128, Vec<VEP>)> = Vec::new();
 
                 chunk.iter().enumerate().for_each(|(i, entry)| {
                     let k = (i + 1) as u64;
 
                     if let Some(vep_lines) = lines.get(&k) {
                         if let Ok(veps) = vep_lines.iter().map(VEP::try_from).collect() {
-                            chunk_results.push((entry.hash_variant(), veps));
+                            chunk_results.push((entry.hash(), veps));
                         }
                     } else {
                         n_not_vep += 1;
@@ -530,12 +530,12 @@ impl ExternalAnnotation {
         Ok(())
     }
 
-    pub fn update_database(&self, hash: u128, source: &str, data: &[u8]) -> anyhow::Result<()> {
+    pub fn update_database(&self, hash: Hash128, source: &str, data: &[u8]) -> anyhow::Result<()> {
         let modified = chrono::Utc::now().to_rfc3339();
 
         self.conn.execute(
             "INSERT OR REPLACE INTO annotations (hash, source, data, modified) VALUES (?, ?, ?, ?)",
-            params![hash.to_le_bytes().to_vec(), source, data, modified],
+            params![hash.to_bytes(), source, data, modified],
         )?;
         Ok(())
     }