Thomas 9 ヶ月 前
コミット
f3dda27be5

+ 31 - 0
Cargo.lock

@@ -515,6 +515,30 @@ version = "0.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
 
+[[package]]
+name = "bitcode"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18c1406a27371b2f76232a2259df6ab607b91b5a0a7476a7729ff590df5a969a"
+dependencies = [
+ "arrayvec",
+ "bitcode_derive",
+ "bytemuck",
+ "glam",
+ "serde",
+]
+
+[[package]]
+name = "bitcode_derive"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42b6b4cb608b8282dc3b53d0f4c9ab404655d562674c682db7e6c0458cc83c23"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.98",
+]
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -1646,6 +1670,12 @@ version = "0.31.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
 
+[[package]]
+name = "glam"
+version = "0.30.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17fcdf9683c406c2fc4d124afd29c0d595e22210d633cbdb8695ba9935ab1dc6"
+
 [[package]]
 name = "glob"
 version = "0.3.2"
@@ -2879,6 +2909,7 @@ dependencies = [
  "anyhow",
  "arrow",
  "bgzip",
+ "bitcode",
  "blake3",
  "byte-unit",
  "chrono",

+ 1 - 0
Cargo.toml

@@ -43,6 +43,7 @@ rand = "0.9.0"
 tar = "0.4.43"
 flatbuffers = "25.2.10"
 ordered-float = { version = "5.0.0", features = ["serde"] }
+bitcode = "0.6.5"
 
 [profile.dev]
 opt-level = 0

+ 11 - 9
src/annotation/cosmic.rs

@@ -1,10 +1,9 @@
 use std::str::FromStr;
 
-use serde::{Serialize, Deserialize};
+use bitcode::{Decode, Encode};
+use serde::{Deserialize, Serialize};
 
-use anyhow::{anyhow, Context, Ok, Result};
-
-#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
+/// Cosmic annotation carrying the count parsed by this type's `FromStr` impl.
+#[derive(Debug, PartialEq, Serialize, Deserialize, Clone, Encode, Decode)]
 pub struct Cosmic {
+    /// Count value read from the Cosmic result string.
     pub cosmic_cnt: u64,
 }
@@ -12,21 +11,24 @@ pub struct Cosmic {
 impl FromStr for Cosmic {
     type Err = anyhow::Error;
 
-    fn from_str(s: &str) -> Result<Self> {
+    /// Parses a `;`-separated Cosmic result string made of exactly three parts.
+    ///
+    /// The count is taken from the third part: it is split on `=` and the
+    /// segment at index 1 is parsed into `cosmic_cnt`.
+    ///
+    /// # Errors
+    /// Returns an error when the input does not have exactly three parts, when
+    /// the first part contains `MISSING`, when the third part has no `=`, or
+    /// when the count segment does not parse as a `u64`.
+    fn from_str(s: &str) -> anyhow::Result<Self> {
         let vs: Vec<&str> = s.split(";").collect();
         if vs.len() != 3 {
-            return Err(anyhow!("Error while parsing Cosmic results not the right number of parts for {s}"));
+            return Err(anyhow::anyhow!(
+                "Error while parsing Cosmic results not the right number of parts for {s}"
+            ));
         }
 
         if vs[0].contains("MISSING") {
-            Err(anyhow!("MISSING values in Cosmic results: {s}"))
+            Err(anyhow::anyhow!("MISSING values in Cosmic results: {s}"))
         } else {
-            let v: Vec<&str> = vs[2].split("=").collect();
+            // `nth(1)` selects the same segment as the former `v[1]` lookup, but a
+            // missing `=` becomes an error instead of an out-of-bounds panic.
+            let cnt = vs[2].split('=').nth(1).ok_or_else(|| {
+                anyhow::anyhow!("Error while parsing Cosmic results missing '=' in {s}")
+            })?;
 
             Ok(Cosmic {
-                cosmic_cnt: v[1].parse().context("parsing cosmic cnt")?,
+                cosmic_cnt: cnt
+                    .parse()
+                    .map_err(|e| anyhow::anyhow!("Failed to parse COSMIC CNT.\n{e}"))?,
             })
         }
     }
 }
-

+ 5 - 5
src/annotation/gnomad.rs

@@ -1,8 +1,8 @@
-use anyhow::{anyhow, Ok, Result};
+use bitcode::{Decode, Encode};
 use serde::{Deserialize, Serialize};
 use std::str::FromStr;
 
-#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
+#[derive(Debug, PartialEq, Serialize, Deserialize, Clone, Encode, Decode)]
 pub struct GnomAD {
     pub gnomad_ac: u64,
     pub gnomad_an: u64,
@@ -22,13 +22,13 @@ pub struct GnomAD {
 impl FromStr for GnomAD {
     type Err = anyhow::Error;
 
-    fn from_str(s: &str) -> Result<Self> {
+    fn from_str(s: &str) -> anyhow::Result<Self> {
         let vs: Vec<_> = s.split(";").collect();
         if vs.len() < 13 {
-            return Err(anyhow!("Error not the right number of parts for {:?}", s));
+            return Err(anyhow::anyhow!("Error not the right number of parts for {:?}", s));
         }
         if vs[0].contains("-1") {
-            return Err(anyhow!(
+            return Err(anyhow::anyhow!(
                 "MISSING values check for -1 before parsing for {:?}",
                 s
             ));

+ 4 - 3
src/annotation/mod.rs

@@ -20,6 +20,7 @@ use crate::{
     helpers::{mean, Blake3BuildHasher, Hash128},
     variant::{variant::AlterationCategory, variant_collection::VariantCollection},
 };
+use bitcode::{Decode, Encode};
 use cosmic::Cosmic;
 use dashmap::DashMap;
 use gnomad::GnomAD;
@@ -28,7 +29,7 @@ use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use vep::{get_best_vep, VEP};
 
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
 pub enum Annotation {
     Callers(Caller, Sample),
     AlterationCategory(AlterationCategory),
@@ -43,7 +44,7 @@ pub enum Annotation {
     VEP(Vec<VEP>),
 }
 
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
 pub enum Sample {
     SoloTumor,
     SoloConstit,
@@ -124,7 +125,7 @@ impl FromStr for Annotation {
     }
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
 pub enum Caller {
     DeepVariant,
     ClairS,

+ 5 - 4
src/annotation/vep.rs

@@ -1,4 +1,5 @@
 use anyhow::anyhow;
+use bitcode::{Decode, Encode};
 use hashbrown::HashMap;
 use itertools::Itertools;
 use log::{debug, warn};
@@ -98,7 +99,7 @@ impl FromStr for VepLine {
 ///
 /// This struct encapsulates various fields that describe the predicted effects
 /// of a variant on genes, transcripts, and proteins, as determined by VEP.
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
 pub struct VEP {
     /// The gene affected by the variant
     pub gene: Option<String>,
@@ -135,7 +136,7 @@ pub struct VEP {
 ///
 /// For more information, see:
 /// <https://ensembl.org/info/genome/variation/prediction/predicted_data.html>
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
 pub enum VepConsequence {
     /// Complete destruction of a transcript
     TranscriptAblation,
@@ -223,7 +224,7 @@ pub enum VepConsequence {
 /// Variant Effect Predictor (VEP).
 ///
 /// The impact categories are ordered from most severe (HIGH) to least severe (MODIFIER).
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
 pub enum VepImpact {
     /// High impact variants are expected to have high (disruptive) impact in the protein,
     /// probably causing protein truncation, loss of function or triggering nonsense mediated decay.
@@ -542,7 +543,7 @@ impl TryFrom<&VepLine> for VEP {
 ///
 /// This struct encapsulates various optional fields that provide extra context
 /// about a variant's predicted effect.
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
 pub struct VEPExtra {
     /// The impact severity of the variant (e.g., HIGH, MODERATE, LOW, MODIFIER)
     pub impact: Option<VepImpact>,

+ 2 - 1
src/helpers.rs

@@ -1,4 +1,5 @@
 use anyhow::Context;
+use bitcode::{Decode, Encode};
 use chrono::{DateTime, TimeZone, Utc};
 use glob::glob;
 use serde::{Deserialize, Serialize};
@@ -391,7 +392,7 @@ impl BuildHasher for Blake3BuildHasher {
     }
 }
 // Custom 128-bit hash type
-#[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug)]
+#[derive(PartialEq, Eq, Clone, Copy, Serialize, Deserialize, Debug, Encode, Decode)]
 pub struct Hash128([u8; 16]);
 
 impl std::hash::Hash for Hash128 {

+ 2 - 1
src/positions.rs

@@ -1,6 +1,7 @@
 use std::{cmp::Ordering, fmt::Display, ops::Range, str::FromStr};
 
 use anyhow::Context;
+use bitcode::{Decode, Encode};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 
@@ -36,7 +37,7 @@ use serde::{Deserialize, Serialize};
 ///     position: 1000,  // 1001st base pair in the chromosome (remember it's 0-based)
 /// };
 /// ```
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash, Default)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash, Default, Encode, Decode)]
 pub struct GenomePosition {
     pub contig: u8,
     pub position: u32,

+ 11 - 10
src/variant/variant.rs

@@ -6,13 +6,14 @@ use crate::{
     variant::variant_collection::VariantCollection,
 };
 use anyhow::{anyhow, Context};
+use bitcode::{Decode, Encode};
 use log::warn;
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr};
 
 /// Represents a variant in the Variant Call Format (VCF).
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
 pub struct VcfVariant {
     /// A 128-bit hash of the variant's key properties for efficient comparison and storage.
     pub hash: Hash128,
@@ -350,7 +351,7 @@ pub struct BNDDesc {
     pub added_nt: String,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
 pub enum AlterationCategory {
     SNV,
     DEL,
@@ -395,7 +396,7 @@ impl From<SVType> for AlterationCategory {
     }
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
 pub enum SVType {
     DEL,
     INS,
@@ -463,7 +464,7 @@ impl Ord for VcfVariant {
 }
 
 /// Info
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
 pub struct Infos(pub Vec<Info>);
 
 impl FromStr for Infos {
@@ -494,7 +495,7 @@ impl fmt::Display for Infos {
 }
 
 #[allow(non_camel_case_types)]
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
 pub enum Info {
     Empty,
     H,
@@ -689,7 +690,7 @@ pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
 }
 
 /// Format
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
 pub enum Format {
     // DeepVariant
     GT(String),
@@ -721,7 +722,7 @@ pub enum Format {
     Other((String, String)), // (key, value)
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
 pub struct Formats(pub Vec<Format>);
 
 impl Formats {
@@ -911,7 +912,7 @@ impl Formats {
 }
 
 /// Filter
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
 pub enum Filter {
     PASS,
     Other(String),
@@ -937,7 +938,7 @@ impl fmt::Display for Filter {
     }
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
 pub enum ReferenceAlternative {
     Nucleotide(Base),
     Nucleotides(Vec<Base>),
@@ -980,7 +981,7 @@ impl fmt::Display for ReferenceAlternative {
     }
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
 pub enum Base {
     A,
     T,

+ 23 - 3
src/variant/variant_collection.rs

@@ -1,12 +1,13 @@
 use std::{
     collections::{BTreeMap, HashMap, HashSet},
     fs::{self, File},
-    io::Write,
+    io::{Read, Write},
     path::Path,
 };
 
 use anyhow::Context;
 use bgzip::{BGZFReader, BGZFWriter};
+use bitcode::{Decode, Encode};
 use csv::ReaderBuilder;
 use dashmap::DashMap;
 use log::{debug, error, info, warn};
@@ -405,7 +406,7 @@ impl VariantCollection {
 ///     annotations: vec![/* Annotation instances */],
 /// };
 /// ```
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, Encode, Decode)]
 pub struct Variant {
     pub hash: Hash128,
     pub position: GenomePosition,
@@ -495,7 +496,7 @@ impl Variant {
 ///
 /// # Note
 /// The `Default` implementation creates an empty vector of variants.
-#[derive(Debug, Default, Serialize, Deserialize, Clone)]
+#[derive(Debug, Default, Serialize, Deserialize, Clone, Encode, Decode)]
 pub struct Variants {
     pub data: Vec<Variant>,
 }
@@ -742,6 +743,25 @@ impl Variants {
         debug!("Successfully loaded variants from {}", filename);
         Ok(variants)
     }
+
+    /// Encodes this collection with `bitcode` and writes the bytes to `filename`.
+    ///
+    /// The destination is opened with `File::create`; any I/O failure is
+    /// returned to the caller as a boxed error.
+    pub fn save_to_file(&self, filename: &str) -> Result<(), Box<dyn std::error::Error>> {
+        info!("Saving file: {filename}");
+        let bytes = bitcode::encode(self);
+        File::create(filename)?.write_all(&bytes)?;
+        Ok(())
+    }
+
+    /// Reads the whole of `filename` and decodes it with `bitcode` into `Self`.
+    ///
+    /// Both I/O failures and decode failures are returned as a boxed error.
+    pub fn load_from_file(filename: &str) -> Result<Self, Box<dyn std::error::Error>> {
+        info!("Load from file: {filename}");
+        let mut bytes = Vec::new();
+        File::open(filename)?.read_to_end(&mut bytes)?;
+        Ok(bitcode::decode(&bytes)?)
+    }
+
+
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]