Forráskód Böngészése

Add Pandora SomaticPipe output container

STEIMLE Thomas 2 hete
szülő
commit
87009c3474

+ 1 - 0
.gitignore

@@ -2,3 +2,4 @@
 .temp_*
 *.out
 *.dat
+/patches/

+ 258 - 42
Cargo.lock

@@ -8,6 +8,41 @@ version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
 
+[[package]]
+name = "aead"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0"
+dependencies = [
+ "crypto-common",
+ "generic-array",
+]
+
+[[package]]
+name = "aes"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
+dependencies = [
+ "cfg-if",
+ "cipher",
+ "cpufeatures",
+]
+
+[[package]]
+name = "aes-gcm"
+version = "0.10.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1"
+dependencies = [
+ "aead",
+ "aes",
+ "cipher",
+ "ctr",
+ "ghash",
+ "subtle",
+]
+
 [[package]]
 name = "ahash"
 version = "0.7.8"
@@ -113,6 +148,18 @@ version = "1.0.100"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
 
+[[package]]
+name = "argon2"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c3610892ee6e0cbce8ae2700349fcf8f98adb0dbfbee85aec3c9179d29cc072"
+dependencies = [
+ "base64ct",
+ "blake2",
+ "cpufeatures",
+ "password-hash",
+]
+
 [[package]]
 name = "arrayref"
 version = "0.3.9"
@@ -358,6 +405,12 @@ version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
+[[package]]
+name = "base64ct"
+version = "1.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06"
+
 [[package]]
 name = "bindgen"
 version = "0.69.5"
@@ -439,6 +492,15 @@ dependencies = [
  "wyz",
 ]
 
+[[package]]
+name = "blake2"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe"
+dependencies = [
+ "digest",
+]
+
 [[package]]
 name = "blake3"
 version = "1.8.2"
@@ -452,6 +514,15 @@ dependencies = [
  "constant_time_eq",
 ]
 
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+
 [[package]]
 name = "borsh"
 version = "1.6.0"
@@ -622,6 +693,16 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "cipher"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
+dependencies = [
+ "crypto-common",
+ "inout",
+]
+
 [[package]]
 name = "clang-sys"
 version = "1.8.1"
@@ -705,6 +786,15 @@ version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "crc32fast"
 version = "1.5.0"
@@ -754,6 +844,17 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
 
+[[package]]
+name = "crypto-common"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
+dependencies = [
+ "generic-array",
+ "rand_core 0.6.4",
+ "typenum",
+]
+
 [[package]]
 name = "csv"
 version = "1.4.0"
@@ -775,6 +876,15 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "ctr"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835"
+dependencies = [
+ "cipher",
+]
+
 [[package]]
 name = "curl-sys"
 version = "0.4.84+curl-8.17.0"
@@ -812,17 +922,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "derive-new"
-version = "0.5.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 1.0.109",
-]
-
 [[package]]
 name = "derive-new"
 version = "0.6.0"
@@ -845,6 +944,17 @@ dependencies = [
  "syn 2.0.111",
 ]
 
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "crypto-common",
+ "subtle",
+]
+
 [[package]]
 name = "directories"
 version = "5.0.1"
@@ -1074,6 +1184,16 @@ version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
 
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
+
 [[package]]
 name = "getrandom"
 version = "0.2.16"
@@ -1097,6 +1217,16 @@ dependencies = [
  "wasip2",
 ]
 
+[[package]]
+name = "ghash"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1"
+dependencies = [
+ "opaque-debug",
+ "polyval",
+]
+
 [[package]]
 name = "glam"
 version = "0.30.9"
@@ -1192,8 +1322,6 @@ dependencies = [
 [[package]]
 name = "hts-sys"
 version = "2.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e38d7f1c121cd22aa214cb4dadd4277dc5447391eac518b899b29ba6356fbbb2"
 dependencies = [
  "bindgen",
  "bzip2-sys",
@@ -1361,6 +1489,15 @@ dependencies = [
  "web-time",
 ]
 
+[[package]]
+name = "inout"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
+dependencies = [
+ "generic-array",
+]
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.2"
@@ -1824,6 +1961,12 @@ version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
 
+[[package]]
+name = "opaque-debug"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
+
 [[package]]
 name = "openssl-src"
 version = "300.5.4+3.5.4"
@@ -1876,7 +2019,7 @@ dependencies = [
 [[package]]
 name = "pandora_lib_assembler"
 version = "0.1.0"
-source = "git+https://git.t0m4.fr/Thomas/pandora_lib_assembler.git#f942be5a9f38dcda572d45231f81d9268bc73197"
+source = "git+https://git.t0m4.fr/Thomas/pandora_lib_assembler.git#3d241b7dbc175701e6454a9befffb4acdcdd26f3"
 dependencies = [
  "anyhow",
  "bam",
@@ -1892,7 +2035,7 @@ dependencies = [
  "petgraph 0.6.5",
  "rayon",
  "regex",
- "rust-htslib 0.47.1",
+ "rust-htslib",
  "seq_io",
  "serde",
  "thiserror 1.0.69",
@@ -1917,7 +2060,7 @@ dependencies = [
 [[package]]
 name = "pandora_lib_igv"
 version = "0.1.0"
-source = "git+https://git.t0m4.fr/Thomas/pandora_lib_igv.git#a2b1426937426b4bd9822f7d12efe892d3ba5666"
+source = "git+https://git.t0m4.fr/Thomas/pandora_lib_igv.git#f31bbb6a04aa5d27cb2c7acd88fa419f308062ce"
 dependencies = [
  "anyhow",
  "base64",
@@ -1931,8 +2074,11 @@ dependencies = [
 name = "pandora_lib_promethion"
 version = "0.1.0"
 dependencies = [
+ "aes-gcm",
  "anyhow",
+ "argon2",
  "arrow",
+ "base64",
  "bitcode",
  "blake3",
  "byte-unit",
@@ -1967,8 +2113,9 @@ dependencies = [
  "rand 0.9.2",
  "rayon",
  "regex",
+ "rmp-serde",
  "rusqlite",
- "rust-htslib 0.51.0",
+ "rust-htslib",
  "rustc-hash 2.1.1",
  "semver 1.0.27",
  "serde",
@@ -1980,6 +2127,7 @@ dependencies = [
  "triple_accel",
  "uuid",
  "walkdir",
+ "zstd",
 ]
 
 [[package]]
@@ -1995,6 +2143,17 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "password-hash"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
+dependencies = [
+ "base64ct",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
 [[package]]
 name = "percent-encoding"
 version = "2.3.2"
@@ -2035,6 +2194,18 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
 
+[[package]]
+name = "polyval"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "opaque-debug",
+ "universal-hash",
+]
+
 [[package]]
 name = "portable-atomic"
 version = "1.11.1"
@@ -2341,6 +2512,25 @@ dependencies = [
  "syn 1.0.109",
 ]
 
+[[package]]
+name = "rmp"
+version = "0.8.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ba8be72d372b2c9b35542551678538b562e7cf86c3315773cae48dfbfe7790c"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "rmp-serde"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72f81bee8c8ef9b577d1681a70ebbc962c232461e397b22c208c43c04b67a155"
+dependencies = [
+ "rmp",
+ "serde",
+]
+
 [[package]]
 name = "rusqlite"
 version = "0.38.0"
@@ -2360,31 +2550,7 @@ dependencies = [
 
 [[package]]
 name = "rust-htslib"
-version = "0.47.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f62b46e47d8b025589565f0eefe83e8646faf0faf74d17014b1b6ea2c1504930"
-dependencies = [
- "bio-types",
- "byteorder",
- "custom_derive",
- "derive-new 0.5.9",
- "hts-sys",
- "ieee754",
- "lazy_static",
- "libc",
- "libz-sys",
- "linear-map",
- "newtype_derive",
- "regex",
- "thiserror 1.0.69",
- "url",
-]
-
-[[package]]
-name = "rust-htslib"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "354416dd2300ff9e7aff8ddc747c875d6c5086f83c2cb2599f3692421c2b77fd"
+version = "1.0.0"
 dependencies = [
  "bio-types",
  "byteorder",
@@ -2716,6 +2882,12 @@ dependencies = [
  "syn 2.0.111",
 ]
 
+[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+
 [[package]]
 name = "syn"
 version = "1.0.109"
@@ -2981,6 +3153,12 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63"
 
+[[package]]
+name = "typenum"
+version = "1.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.22"
@@ -2993,6 +3171,16 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
 
+[[package]]
+name = "universal-hash"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea"
+dependencies = [
+ "crypto-common",
+ "subtle",
+]
+
 [[package]]
 name = "url"
 version = "2.5.7"
@@ -3564,3 +3752,31 @@ name = "zlib-rs"
 version = "0.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "51f936044d677be1a1168fae1d03b583a285a5dd9d8cbf7b24c23aa1fc775235"
+
+[[package]]
+name = "zstd"
+version = "0.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "7.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
+dependencies = [
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.16+zstd.1.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
+dependencies = [
+ "cc",
+ "pkg-config",
+]

+ 10 - 1
Cargo.toml

@@ -7,6 +7,11 @@ edition = "2021"
 log = "^0.4.29"
 env_logger = "^0.11.8"
 anyhow = "1.0.100"
+rmp-serde = "1.3.0"
+zstd = "0.13.3"
+aes-gcm = "0.10.3"
+argon2 = "0.5.3"
+base64 = "0.22.1"
 glob = "0.3.2"
 pandora_lib_assembler = { git = "https://git.t0m4.fr/Thomas/pandora_lib_assembler.git" }
 chrono = { version = "0.4.40", features = ["serde"] }
@@ -22,7 +27,7 @@ rayon = "1.11.0"
 hashbrown = { version = "0.16.1", features = ["rayon"] }
 lazy_static = "1.5.0"
 indicatif = "0.17.8"
-rust-htslib = "0.51.0"
+rust-htslib = "1.0.0"
 arrow = { git = "https://github.com/apache/arrow-rs" }
 # arrow = "54.2.1"
 # bgzip = "0.3.1"
@@ -60,3 +65,7 @@ filetime = "0.2.27"
 opt-level = 0
 debug = false
 
+[patch.crates-io]
+hts-sys = { path = "patches/hts-sys" }
+rust-htslib = { path = "patches/rust-htslib" }
+

+ 5 - 0
README.md

@@ -20,6 +20,11 @@ sudo apt install cmake libclang-dev
 
 ## Usage
 
+### SomaticPipe output container
+
+A proposed single-file container for SomaticPipe results is documented in
+[`docs/somaticpipe-output-format.md`](docs/somaticpipe-output-format.md).
+
 ### Use jq for selecting variants
 
 * Somatic Variants of chrM (25) 

+ 272 - 0
docs/somaticpipe-output-format.md

@@ -0,0 +1,272 @@
+# SomaticPipe Output Container Format
+
+This document proposes a single binary container for SomaticPipe outputs. The goal is to replace scattered `.json.gz`, `.bit`, VCF-derived tables, BAM QC files, pipeline QC files, methylation summaries, CNV files, and fingerprint artifacts with one random-access file that is compact, typed, encrypted, and versioned.
+
+Recommended extension: `.pandora`
+
+## Design Goals
+
+- Store all sample-level SomaticPipe results in one file.
+- Preserve the current merged `Variants` model instead of forcing SNV/SV data into a flat table.
+- Keep optional derived indexes columnar and fast to load with Arrow IPC.
+- Keep small structured metadata simple with MessagePack.
+- Allow random access to each result section without reading the full file.
+- Encrypt patient-identifying result payloads with authenticated encryption.
+- Make corruption and version mismatch failures explicit before decoding payloads.
+- Keep the format append-friendly enough for future sections.
+
+## File Layout
+
+All integers in the fixed prelude are unsigned big-endian.
+
+```text
+[MAGIC: 8 bytes]          "PANDORA\0"
+[VERSION: u16]            format version, initial value 1
+[HEADER_LEN: u64]         compressed header length in bytes
+[HEADER_CHECKSUM: 32]     BLAKE3(header_zstd)
+[HEADER: msgpack + zstd]
+[SECTION 0..N]
+```
+
+The header is not encrypted because readers need the section table before they can load payloads. It must not contain direct clinical results. Patient-identifying fields such as `sample_id` should be either pseudonymized or moved into an encrypted metadata section if the file may leave a controlled environment.
+
+## Header Schema
+
+The header is MessagePack serialized and then zstd-compressed.
+
+```text
+{
+  "format": "somaticpipe.output",
+  "format_version": 1,
+  "producer": {
+    "name": "pandora_lib_promethion",
+    "pipeline": "SomaticPipe",
+    "pipeline_version": "0.1.0",
+    "git_commit": "...",
+    "created_at": "2026-05-27T00:00:00Z"
+  },
+  "sample": {
+    "sample_id": "pseudonym-or-local-id",
+    "tumor_timepoint": "diag",
+    "normal_timepoint": "constit",
+    "reference": "hs1",
+    "reference_digest": "blake3:..."
+  },
+  "compression": {
+    "algorithm": "zstd",
+    "level": 3
+  },
+  "encryption": {
+    "algorithm": "AES-256-GCM",
+    "key_derivation": "Argon2id",
+    "salt": "<base64 16-32 bytes>",
+    "aad": "fixed-prelude + canonical-section-descriptor"
+  },
+  "sections": [
+    {
+      "name": "variants",
+      "kind": "bitcode",
+      "compression": "zstd",
+      "encryption": "AES-256-GCM",
+      "offset": 0,
+      "length": 0,
+      "nonce": "<base64 12 bytes>",
+      "tag": "<base64 16 bytes>",
+      "checksum": "blake3:<ciphertext digest>",
+      "schema_hash": "blake3:<logical schema digest>",
+      "required": true
+    }
+  ]
+}
+```
+
+Offsets are absolute byte offsets from the start of the file. `length` is the number of stored ciphertext bytes of the section; it excludes the GCM tag, which is stored in the header. A reader must validate the magic, version, header checksum, and every offset/length bound before reading any section, and must verify a section's checksum before decrypting and decoding that section.
+
+## Section Encoding
+
+Each encrypted section uses this transform order:
+
+```text
+logical payload -> encode -> zstd compress -> AES-256-GCM encrypt -> write bytes
+```
+
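+The GCM authentication tag is stored in the header. The additional authenticated data should be a canonical representation of the fixed prelude plus the section descriptor with `offset`, `length`, `nonce`, `name`, `kind`, and `schema_hash`. This prevents moving encrypted bytes between sections or files without detection. Note that `HEADER_LEN` and `HEADER_CHECKSUM` depend on the header, which itself contains each section's tag, so only the prelude fields that are known before encryption (`MAGIC` and `VERSION`) can be bound into the AAD.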
+
+Checksums are BLAKE3 over the stored bytes (the ciphertext), not the plaintext. AES-GCM already authenticates the ciphertext and the AAD during decryption; the checksum catches storage corruption before decryption is attempted and helps diagnostics.
+
+## Standard Sections
+
+| Section | Required | Encoding | Notes |
+| --- | --- | --- | --- |
+| `variants` | yes | bitcode or MessagePack + zstd + AES-256-GCM | Canonical merged `Variants` struct. This keeps SNV, indel, SV, BND, and CNV-like VCF calls together. |
+| `variant_index` | no | Arrow IPC file + zstd + AES-256-GCM | Optional flattened projection for fast GUI/search. It is derived from `variants`, not canonical. |
+| `copy_number` | no | MessagePack or Arrow IPC file + zstd + AES-256-GCM | Savana `SavanaCN` / `CNSegment` absolute copy-number segments. |
+| `bam_qc` | yes | MessagePack + zstd + AES-256-GCM | Tumor/normal `WGSBamStats`, including coverage, N50, karyotype, read groups, source files, and flag stats. |
+| `pipe_qc` | yes | MessagePack + zstd + AES-256-GCM | SomaticPipe filtering counters, input caller counts, annotation summaries, VEP stats, tool versions. |
+| `methylation` | no | Arrow IPC file + zstd + AES-256-GCM | Per-region or per-CpG 5mC outputs. |
+| `fingerprint` | yes | MessagePack + zstd + AES-256-GCM | Sample identity/fingerprint data; encrypt by default because it is identifying. |
+| `provenance` | yes | MessagePack + zstd + AES-256-GCM | Input file digests, command lines, container/image versions, config digest. |
+
+Storing the `fingerprint` section unencrypted is risky because fingerprints are identifiers. The safer default is to encrypt it like the other clinical sections. If a public fingerprint is required for indexing, add a separate `public_index` section containing only non-reversible IDs and high-level availability flags.
+
+## Canonical Data Models
+
+### `variants`
+
+The canonical variant section should store the existing merged Rust model:
+
+```text
+Variants {
+  data: Vec<Variant>
+}
+
+Variant {
+  hash: Hash128,
+  position: GenomePosition,
+  reference: ReferenceAlternative,
+  alternative: ReferenceAlternative,
+  vcf_variants: Vec<VcfVariant>,
+  annotations: Vec<Annotation>
+}
+```
+
+This is important because the current model intentionally merges SNV, indel, SV/BND, and caller annotations into one nested representation. A flat Arrow table would lose structure or require many lossy string columns. For version 1, use the existing `bitcode` representation when compatibility with the `.bit` output is desired. MessagePack is acceptable if cross-language readers are more important than Rust-native speed.
+
+### `copy_number`
+
+The CNV section should store Savana absolute copy-number segments:
+
+```text
+SavanaCN {
+  segments: Vec<CNSegment>
+}
+
+CNSegment {
+  chromosome: String,
+  start: u64,
+  end: u64,
+  segment_id: String,
+  bin_count: u32,
+  sum_of_bin_lengths: u64,
+  weight: f64,
+  copy_number: f64,
+  minor_allele_copy_number: Option<f64>,
+  mean_baf: Option<f64>,
+  no_het_snps: u32
+}
+```
+
+This section is distinct from `variants` even if a caller also emits CNV-like VCF records. Segment-level absolute copy number is a continuous genome track and should not be forced into the merged variant list.
+
+### `bam_qc`
+
+The BAM QC section should store one `WGSBamStats` payload per analyzed BAM, keyed by role/timepoint:
+
+```text
+{
+  "tumor": WGSBamStats,
+  "normal": WGSBamStats,
+  "additional_bams": [
+    {"role": "mrd", "stats": WGSBamStats}
+  ]
+}
+```
+
+`WGSBamStats` already contains the useful BAM-level QC fields: total records, passing reads, mapped fraction, unmapped/duplicate/low-MAPQ counts, mapped yield, read lengths, global coverage, per-contig coverage/karyotype, N50, length histogram, read-group stats, source-file stats, and `FlagStats`.
+
+### `pipe_qc`
+
+The pipeline QC section should store SomaticPipe-specific metrics separately from BAM QC:
+
+```text
+{
+  "somatic_pipe_stats": SomaticPipeStats,
+  "variant_stats": VariantsStats,
+  "annotation_stats": {
+    "initial": AnnotationsStats,
+    "post_filters": AnnotationsStats,
+    "vep": VepStats
+  },
+  "caller_outputs": [
+    {"caller": "ClairS Somatic", "n_input": 0, "n_after_filters": 0}
+  ],
+  "filter_steps": [
+    {"name": "germline_or_constit", "removed": 0},
+    {"name": "low_constit_depth", "removed": 0},
+    {"name": "high_constit_alt", "removed": 0},
+    {"name": "gnomad_and_constit_alt", "removed": 0},
+    {"name": "low_entropy", "removed": 0}
+  ]
+}
+```
+
+The current `SomaticPipeStats` tracks the key filtering counters and input categorization. If future code records every intermediate annotation snapshot, store the snapshots here as named MessagePack entries or add optional sections such as `annotations_01_initial`.
+
+## Optional Arrow Indexes
+
+Arrow should be used for derived projections, not for the canonical nested `Variants` payload.
+
+### `variant_index`
+
+This section can be regenerated from `variants`, so it is optional. It is useful for a GUI, quick filtering, and summary browsing before loading the full nested payload.
+
+- `variant_id: utf8`
+- `contig: utf8`
+- `start: int64`
+- `end: int64`
+- `ref: utf8`
+- `alt: utf8`
+- `variant_type: utf8`
+- `callers: list<utf8>`
+- `n_callers: uint8`
+- `tumor_depth: int32`
+- `tumor_alt: int32`
+- `normal_depth: int32`
+- `normal_alt: int32`
+- `vaf: float32`
+- `filters: list<utf8>`
+- `vep_consequence: utf8`
+- `vep_impact: utf8`
+- `gene: utf8`
+- `cosmic_count: int32`
+- `gnomad_af: float32`
+
+### `methylation`
+
+- `contig: utf8`
+- `start: int64`
+- `end: int64`
+- `region_id: utf8`
+- `mod_code: utf8`
+- `valid_coverage: int32`
+- `modified_count: int32`
+- `fraction_modified: float32`
+- `strand: utf8`
+
+## Versioning Rules
+
+- Readers must reject unsupported major versions; the `VERSION` field in the fixed prelude is the major version.
+- New optional sections may be added without changing the major version.
+- Adding nullable columns to Arrow sections is a minor-compatible change.
+- Removing columns, changing types, or changing crypto/compression semantics requires a major version bump.
+- Section names are stable ASCII identifiers.
+
+## Minimal Reader Algorithm
+
+1. Read and validate the 8-byte magic.
+2. Read `VERSION`, `HEADER_LEN`, and `HEADER_CHECKSUM`.
+3. Read `HEADER_LEN` bytes, verify BLAKE3, decompress zstd, decode MessagePack.
+4. Check that every section's `[offset, offset + length)` range lies inside the file and that no two sections overlap.
+5. For the requested section, read bytes and verify BLAKE3 checksum.
+6. Derive the key with Argon2id from the caller-supplied secret and the header `salt`, then decrypt with AES-256-GCM using the section nonce, tag, and AAD.
+7. Decompress zstd.
+8. Decode bitcode, Arrow IPC, or MessagePack according to `kind`.
+
+## Open Implementation Choices
+
+- Use `bitcode` for Rust-native canonical sections that already derive `Encode` / `Decode`, especially `Variants`.
+- Use `rmp-serde` for MessagePack.
+- Use `zstd` crate for compression.
+- Use `aes-gcm` and `argon2` crates for encryption and key derivation.
+- Store Arrow IPC as file format rather than stream format for derived indexes and tabular tracks, because file format carries schema/footer metadata and is better for self-contained sections.
+- Add a `public_index` section only if GUI browsing needs non-secret metadata before decryption.

+ 113 - 0
setup-patches.ps1

@@ -0,0 +1,113 @@
+param()
+$ErrorActionPreference = "Stop"
+
+$found = Resolve-Path "$env:USERPROFILE\.cargo\registry\src\index.crates.io-*\hts-sys-2.2.0" `
+         -ErrorAction SilentlyContinue
+if (-not $found) {
+    Write-Error "hts-sys 2.2.0 not in cargo registry. Run 'cargo fetch' first."
+    exit 1
+}
+
+$dest = Join-Path $PSScriptRoot "patches\hts-sys"
+if (Test-Path $dest) { Remove-Item $dest -Recurse -Force }
+Copy-Item $found.Path $dest -Recurse
+Write-Host "Copied hts-sys 2.2.0 -> patches\hts-sys"
+
+$buildRs = Join-Path $dest "build.rs"
+$text = [System.IO.File]::ReadAllText($buildRs) -replace "`r`n", "`n"
+
+# Fix 1: guard HAVE_DRAND48. drand48/srand48 are absent from MinGW libc.
+$old1 = "    let mut config_lines = vec![`n        `"/* Default config.h generated by build.rs */`",`n        `"#define HAVE_DRAND48 1`",`n    ];"
+$new1 = "    let mut config_lines = vec![`n        `"/* Default config.h generated by build.rs */`",`n    ];`n    if target_os != `"windows`" {`n        config_lines.push(`"#define HAVE_DRAND48 1`");`n    }"
+$result = $text.Replace($old1, $new1)
+if ($result -eq $text) { Write-Warning "Fix 1 pattern not matched - verify hts-sys version" }
+else { Write-Host "Fix 1 applied (HAVE_DRAND48 guard)"; $text = $result }
+
+# Fix 2: run version.sh through bash. Windows CreateProcess cannot execute .sh files.
+$old2 = "        let version = std::process::Command::new(out.join(`"htslib`").join(`"version.sh`"))`n            .output()`n            .expect(`"failed to execute process`");`n        let version_str = std::str::from_utf8(&version.stdout).unwrap().trim();"
+$new2 = "        let version_str = std::process::Command::new(`"bash`")`n            .arg(out.join(`"htslib`").join(`"version.sh`"))`n            .output()`n            .map(|o| std::str::from_utf8(&o.stdout).unwrap_or(`"1.19.1`").trim().to_string())`n            .unwrap_or_else(|_| `"1.19.1`".to_string());"
+$result = $text.Replace($old2, $new2)
+if ($result -eq $text) { Write-Warning "Fix 2 pattern not matched - verify hts-sys version" }
+else { Write-Host "Fix 2 applied (version.sh via bash)"; $text = $result }
+
+# Fix 3: 64-bit file offsets on Windows MinGW.
+# Both the C compiler and bindgen must see the same define so types match.
+$old3 = "    if want_static {`n        cfg.warnings(false).static_flag(true).pic(true);`n    } else {`n        cfg.warnings(false).static_flag(false).pic(true);`n    }"
+$new3 = "    if want_static {`n        cfg.warnings(false).static_flag(true).pic(true);`n    } else {`n        cfg.warnings(false).static_flag(false).pic(true);`n    }`n`n    // Fix 3: 64-bit file offsets on Windows MinGW.`n    // Without this, off_t = i32 and seeks wrap at 2 GB.`n    if target_os == `"windows`" {`n        cfg.define(`"_FILE_OFFSET_BITS`", `"64`");`n    }"
+$result = $text.Replace($old3, $new3)
+if ($result -eq $text) { Write-Warning "Fix 3a pattern not matched - verify hts-sys version" }
+else { Write-Host "Fix 3a applied (_FILE_OFFSET_BITS=64 cc define)"; $text = $result }
+
+$old4 = "        bindgen::Builder::default()`n            .header(`"wrapper.h`")`n            .generate_comments(false)`n            .blocklist_function(`"strtold`")`n            .blocklist_type(`"max_align_t`")`n            .generate()"
+$new4 = "        let mut bindgen_builder = bindgen::Builder::default()`n            .header(`"wrapper.h`")`n            .generate_comments(false)`n            .blocklist_function(`"strtold`")`n            .blocklist_type(`"max_align_t`");`n        if target_os == `"windows`" {`n            bindgen_builder = bindgen_builder.clang_arg(`"-D_FILE_OFFSET_BITS=64`");`n        }`n        bindgen_builder`n            .generate()"
+$result = $text.Replace($old4, $new4)
+if ($result -eq $text) { Write-Warning "Fix 3b pattern not matched - verify hts-sys version" }
+else { Write-Host "Fix 3b applied (_FILE_OFFSET_BITS=64 bindgen)"; $text = $result }
+
+# Fix 4: static-link MinGW regex libraries on Windows.
+# hts_expr.c needs POSIX regex; systre.c needs TRE. libregex depends on
+# libintl/gettext, which depends on libiconv.
+$old5 = "    cfg.file(`"wrapper.c`");`n    cfg.compile(`"hts`");"
+$new5 = "    cfg.file(`"wrapper.c`");`n    cfg.compile(`"hts`");`n`n    // hts_expr.c uses POSIX regex (regcomp/regexec/regfree) -- provided by libregex (gnurx).`n    // systre.c uses the TRE regex API (tre_regexec/tre_regerror) -- provided by libtre.`n    // Both live in the MinGW lib dir; locate it via gcc -print-file-name.`n    // Link statically so the exe has no runtime dependency on .dll files.`n    if target_os == `"windows`" {`n        let compiler = cfg.get_compiler();`n        let mingw_lib = std::process::Command::new(compiler.path())`n            .arg(`"-print-file-name=libtre.a`")`n            .output()`n            .ok()`n            .and_then(|o| String::from_utf8(o.stdout).ok())`n            .map(|s| s.trim().to_string())`n            .filter(|s| s != `"libtre.a`")`n            .and_then(|s| std::path::PathBuf::from(s).parent().map(|p| p.to_path_buf()));`n        if let Some(dir) = mingw_lib {`n            println!(`"cargo:rustc-link-search=native={}`", dir.display());`n        }`n        println!(`"cargo:rustc-link-lib=static=tre`");    // systre.c: tre_regexec/tre_regerror`n        println!(`"cargo:rustc-link-lib=static=regex`");  // hts_expr.c: regcomp/regexec/regfree`n        println!(`"cargo:rustc-link-lib=static=intl`");   // libregex dep: gettext`n        println!(`"cargo:rustc-link-lib=static=iconv`");  // libintl dep: iconv`n    }"
+$result = $text.Replace($old5, $new5)
+if ($result -eq $text) { Write-Warning "Fix 4 pattern not matched - verify hts-sys version" }
+else { Write-Host "Fix 4 applied (static-link regex/tre/intl/iconv on Windows)"; $text = $result }
+
+[System.IO.File]::WriteAllText($buildRs, $text, (New-Object System.Text.UTF8Encoding $false))
+
+$cargoToml = Join-Path $dest "Cargo.toml"
+$manifest = [System.IO.File]::ReadAllText($cargoToml) -replace "`r`n", "`n"
+$oldManifest = "[target.'cfg(all(unix, not(target_os = `"macos`")))'.dependencies.openssl-sys]`nversion = `"0.9.56`"`noptional = true"
+$newManifest = "[target.'cfg(any(windows, all(unix, not(target_os = `"macos`"))))'.dependencies.openssl-sys]`nversion = `"0.9.56`"`noptional = true"
+$result = $manifest.Replace($oldManifest, $newManifest)
+if ($result -eq $manifest) { Write-Warning "Manifest OpenSSL target patch not matched - verify hts-sys version" }
+else {
+    Write-Host "Manifest patch applied (openssl-sys enabled on Windows)"
+    [System.IO.File]::WriteAllText($cargoToml, $result, (New-Object System.Text.UTF8Encoding $false))
+}
+
+Write-Host "patches\hts-sys is ready."
+
+$rhl = Resolve-Path "$env:USERPROFILE\.cargo\registry\src\index.crates.io-*\rust-htslib-1.0.0" `
+       -ErrorAction SilentlyContinue
+if (-not $rhl) {
+    Write-Warning "rust-htslib 1.0.0 not in cargo registry. Run 'cargo fetch' first."
+} else {
+    $rhlDest = Join-Path $PSScriptRoot "patches\rust-htslib"
+    if (Test-Path $rhlDest) { Remove-Item $rhlDest -Recurse -Force }
+    Copy-Item $rhl.Path $rhlDest -Recurse
+    Write-Host "Copied rust-htslib 1.0.0 -> patches\rust-htslib"
+
+    $bamMod = Join-Path $rhlDest "src\bam\mod.rs"
+    $rhlText = [System.IO.File]::ReadAllText($bamMod) -replace "`r`n", "`n"
+
+    $rhlText = $rhlText.Replace("offset as libc::off_t,", "offset as hts_sys::off_t,")
+    if ($rhlText -match "offset as libc::off_t") {
+        Write-Warning "rust-htslib fix A not applied - pattern not matched"
+    } else {
+        Write-Host "rust-htslib fix A applied (libc::off_t -> hts_sys::off_t)"
+    }
+
+    $oldIdxLoad = "        let idx = unsafe { htslib::sam_index_load(htsfile, c_str.as_ptr()) };"
+    $newIdxLoad = "        // flags=0 omits HTS_IDX_SAVE_REMOTE so remote .bai files are not cached locally.`n        let idx = unsafe { htslib::sam_index_load3(htsfile, c_str.as_ptr(), std::ptr::null(), 0) };"
+    $rhlText = $rhlText.Replace($oldIdxLoad, $newIdxLoad)
+    if ($rhlText -match "sam_index_load\(htsfile") {
+        Write-Warning "rust-htslib fix B not applied - pattern not matched"
+    } else {
+        Write-Host "rust-htslib fix B applied (sam_index_load3 no-cache)"
+    }
+
+    $fromUrl = "    pub fn from_url(url: &Url) -> Result<Self> {`n        Self::new(url.as_str().as_bytes())`n    }"
+    $fromUrlAndIndex = "    pub fn from_url(url: &Url) -> Result<Self> {`n        Self::new(url.as_str().as_bytes())`n    }`n`n    /// Open a remote BAM via URL with a pre-downloaded local index file.`n    /// Using a local index bypasses htslib's remote-index caching entirely.`n    pub fn from_url_and_index<P: AsRef<Path>>(url: &Url, index_path: P) -> Result<Self> {`n        Self::new_with_index_path(`n            url.as_str().as_bytes(),`n            &path_as_bytes(index_path, true)?,`n        )`n    }"
+    $count = ([regex]::Matches($rhlText, [regex]::Escape($fromUrl))).Count
+    if ($count -ge 2) {
+        $idx = $rhlText.IndexOf($fromUrl, $rhlText.IndexOf($fromUrl) + 1)
+        $rhlText = $rhlText.Substring(0, $idx) + $fromUrlAndIndex + $rhlText.Substring($idx + $fromUrl.Length)
+        Write-Host "rust-htslib fix C applied (IndexedReader::from_url_and_index)"
+    } else {
+        Write-Warning "rust-htslib fix C not applied - could not locate second from_url in IndexedReader"
+    }
+
+    [System.IO.File]::WriteAllText($bamMod, $rhlText, (New-Object System.Text.UTF8Encoding $false))
+    Write-Host "patches\rust-htslib is ready."
+}

+ 8 - 4
src/collection/flowcells.rs

@@ -3,7 +3,6 @@ use std::{
     fmt,
     fs::File,
     io::{BufReader, Read, Write},
-    os::unix::fs::MetadataExt,
     path::Path,
 };
 
@@ -656,9 +655,14 @@ pub fn scan_local(dir: &str) -> anyhow::Result<ExperimentData> {
             "Failed to access file metadata: {}",
             file.display()
         ))?;
-        let size = metadata.size();
-        let modified = metadata.mtime();
-        let modified_utc: DateTime<Utc> = Utc.timestamp_opt(modified as i64, 0).unwrap();
+        let size = metadata.len();
+        let modified_utc: DateTime<Utc> = metadata
+            .modified()
+            .context(format!(
+                "Failed to access file modification time: {}",
+                file.display()
+            ))?
+            .into();
 
         let path = file.to_string_lossy().into_owned();
 

+ 2 - 2
src/collection/vcf.rs

@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
 use csi::binning_index::ReferenceSequence;
 use glob::glob;
 use log::warn;
-use std::{collections::HashMap, fs::Metadata, os::unix::fs::MetadataExt, path::PathBuf};
+use std::{collections::HashMap, fs::Metadata, path::PathBuf};
 
 use noodles_csi as csi;
 use num_format::{Locale, ToFormattedString};
@@ -54,7 +54,7 @@ impl Vcf {
     }
 
     pub fn size(&self) -> u64 {
-        self.file_metadata.size()
+        self.file_metadata.len()
     }
 
     pub fn tsv(&self) -> anyhow::Result<String> {

+ 2 - 0
src/io/mod.rs

@@ -25,6 +25,7 @@
 //! | [`gff`] | GFF3 feature range extraction |
 //! | [`modkit`] | Modkit bedMethyl pileup parsing, epigenetic activity computation |
 //! | [`straglr`] | Straglr STR genotyper TSV parsing |
+//! | [`somaticpipe_container`] | `.pandora` SomaticPipe result container metadata and payload models |
 //! | [`liftover`] | UCSC chain file parsing and coordinate liftover |
 //! | [`readers`] | Generic BGZF/plain readers, Tabix region fetch (`fetch_tabix_lines_with`) |
 //! | [`writers`] | BGZF writers, `BgzTabixWriter` for combined BGZF + Tabix output |
@@ -45,6 +46,7 @@ pub mod modkit;
 pub mod pod5_footer_generated;
 pub mod pod5_infos;
 pub mod readers;
+pub mod somaticpipe_container;
 pub mod straglr;
 pub mod tsv;
 pub mod vcf;

+ 685 - 0
src/io/somaticpipe_container.rs

@@ -0,0 +1,685 @@
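+//! Metadata and payload models for the `.pandora` SomaticPipe output container.
+//!
+//! Besides the format model (prelude, header, section descriptors and payload
+//! types), this module provides a minimal unencrypted writer ([`write_container`])
+//! and reader ([`read_header`], [`read_section`]). Encrypted sections are described
+//! by the schema but reading and writing them is not implemented yet.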
+
+use std::{
+    collections::BTreeMap,
+    fs::File,
+    io::{Read, Seek, SeekFrom, Write},
+    path::Path,
+};
+
+use anyhow::{bail, Context};
+use chrono::{DateTime, Utc};
+use serde::de::DeserializeOwned;
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    annotation::{AnnotationsStats, VepStats},
+    callers::savana::SavanaCN,
+    collection::bam_stats::WGSBamStats,
+    pipes::somatic::SomaticPipeStats,
+    variant::{variant_collection::Variants, variants_stats::VariantsStats},
+};
+
+/// Magic bytes stored at the very start of a `.pandora` file.
+pub const PANDORA_MAGIC: &[u8; 8] = b"PANDORA\0";
+/// Format version written by this module and the only version accepted on read.
+pub const PANDORA_FORMAT_VERSION: u16 = 1;
+/// Extension string for container files (no leading dot).
+pub const PANDORA_EXTENSION: &str = "pandora";
+/// Size in bytes of the fixed prelude:
+/// magic (8) + version (2) + header length (8) + header checksum (32).
+pub const PANDORA_PRELUDE_LEN: usize = 8 + 2 + 8 + 32;
+
+/// Fixed-size block found at the start of a `.pandora` file.
+///
+/// On disk the fields appear in declaration order with big-endian integers
+/// (see [`ContainerPrelude::from_reader`] and [`ContainerPrelude::write_to`]).
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct ContainerPrelude {
+    /// Magic bytes; compared against [`PANDORA_MAGIC`] when a file is validated.
+    pub magic: [u8; 8],
+    /// Container format version.
+    pub version: u16,
+    /// Length in bytes of the encoded header that follows the prelude.
+    pub header_len: u64,
+    /// BLAKE3 digest of the encoded header bytes.
+    pub header_checksum: [u8; 32],
+}
+
+/// The default prelude carries the current magic and format version; the header
+/// length and checksum are zeroed until a header has actually been encoded.
+impl Default for ContainerPrelude {
+    fn default() -> Self {
+        let header_checksum = [0; 32];
+        ContainerPrelude {
+            header_checksum,
+            header_len: 0,
+            version: PANDORA_FORMAT_VERSION,
+            magic: *PANDORA_MAGIC,
+        }
+    }
+}
+
+impl ContainerPrelude {
+    /// Reads the fixed-size prelude from `reader`.
+    ///
+    /// Fields are consumed in on-disk order (magic, version, header length,
+    /// header checksum); integers are decoded as big-endian. No validation of
+    /// the values is performed here.
+    ///
+    /// # Errors
+    /// Returns an error if the reader cannot supply all prelude bytes.
+    pub fn from_reader(mut reader: impl Read) -> anyhow::Result<Self> {
+        /// Reads exactly `N` bytes from `source` into a fresh array.
+        fn take<R: Read, const N: usize>(source: &mut R) -> std::io::Result<[u8; N]> {
+            let mut buf = [0; N];
+            source.read_exact(&mut buf)?;
+            Ok(buf)
+        }
+
+        let magic: [u8; 8] = take(&mut reader)?;
+        let version: [u8; 2] = take(&mut reader)?;
+        let header_len: [u8; 8] = take(&mut reader)?;
+        let header_checksum: [u8; 32] = take(&mut reader)?;
+
+        let prelude = Self {
+            magic,
+            version: u16::from_be_bytes(version),
+            header_len: u64::from_be_bytes(header_len),
+            header_checksum,
+        };
+        Ok(prelude)
+    }
+
+    /// Writes the prelude to `writer` in on-disk order with big-endian integers.
+    ///
+    /// # Errors
+    /// Returns the first write error encountered.
+    pub fn write_to(&self, mut writer: impl Write) -> anyhow::Result<()> {
+        let version = self.version.to_be_bytes();
+        let header_len = self.header_len.to_be_bytes();
+        let fields: [&[u8]; 4] = [&self.magic, &version, &header_len, &self.header_checksum];
+        for field in &fields {
+            writer.write_all(field)?;
+        }
+        Ok(())
+    }
+}
+
+/// Logical name identifying a section inside the container.
+///
+/// Variant names are serialized in snake_case.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum SectionName {
+    /// Variant collection; produced by `variants_section`.
+    Variants,
+    /// No encoder for this name is defined in this module.
+    VariantIndex,
+    /// Copy-number data; produced by `copy_number_section`.
+    CopyNumber,
+    /// BAM quality-control statistics; produced by `bam_qc_section`.
+    BamQc,
+    /// Pipeline quality-control statistics; produced by `pipe_qc_section`.
+    PipeQc,
+    /// No encoder for this name is defined in this module.
+    Methylation,
+    /// No encoder for this name is defined in this module.
+    Fingerprint,
+    /// Provenance information; produced by `provenance_section`.
+    Provenance,
+    /// No encoder for this name is defined in this module.
+    PublicIndex,
+    /// Free-form name for sections not covered by the variants above.
+    Other(String),
+}
+
+/// Encoding of a section payload (before any compression is applied).
+///
+/// Variant names are serialized in snake_case.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum SectionKind {
+    /// Payload encoded with the `bitcode` crate.
+    Bitcode,
+    /// Payload encoded as MessagePack.
+    MessagePack,
+    /// NOTE(review): presumably an Arrow IPC file; nothing in this module produces it.
+    ArrowIpcFile,
+    /// Opaque bytes with no further structure known to this module.
+    RawBytes,
+}
+
+/// Compression applied to a stored section payload.
+///
+/// Variant names are serialized in snake_case.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum CompressionAlgorithm {
+    /// Payload bytes are stored as-is.
+    None,
+    /// Payload bytes are zstd-compressed.
+    Zstd,
+}
+
+/// Compression settings for the header or for a single section.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct CompressionMetadata {
+    /// Algorithm used for the stored bytes.
+    pub algorithm: CompressionAlgorithm,
+    /// Compression level; the encoders in this module fall back to 3 when `None`.
+    pub level: Option<i32>,
+}
+
+/// Defaults to zstd compression at level 3.
+impl Default for CompressionMetadata {
+    fn default() -> Self {
+        let algorithm = CompressionAlgorithm::Zstd;
+        let level = Some(3);
+        Self { algorithm, level }
+    }
+}
+
+/// Encryption applied to stored section bytes.
+///
+/// Variant names are serialized in snake_case.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum EncryptionAlgorithm {
+    /// Section bytes are not encrypted.
+    None,
+    /// AES-256-GCM; the reader and writer in this module reject it as not implemented yet.
+    Aes256Gcm,
+}
+
+/// Key-derivation function recorded in the header's encryption metadata.
+///
+/// Variant names are serialized in snake_case. No code in this module derives keys.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum KeyDerivationAlgorithm {
+    /// No key derivation.
+    None,
+    /// Argon2id.
+    Argon2id,
+}
+
+/// Container-level encryption settings stored in the header.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct EncryptionMetadata {
+    /// Encryption algorithm; the writer refuses `Aes256Gcm` for now.
+    pub algorithm: EncryptionAlgorithm,
+    /// Key-derivation function associated with `algorithm`.
+    pub key_derivation: KeyDerivationAlgorithm,
+    /// NOTE(review): presumably the base64-encoded key-derivation salt; nothing in
+    /// this module reads or writes it yet.
+    pub salt_b64: Option<String>,
+    /// Free-text description of the additional authenticated data; the default
+    /// value is `"fixed-prelude + canonical-section-descriptor"`.
+    pub aad_context: String,
+}
+
+/// Defaults to an unencrypted container: no algorithm, no key derivation, no salt.
+impl Default for EncryptionMetadata {
+    fn default() -> Self {
+        let aad_context = String::from("fixed-prelude + canonical-section-descriptor");
+        Self {
+            aad_context,
+            salt_b64: None,
+            key_derivation: KeyDerivationAlgorithm::None,
+            algorithm: EncryptionAlgorithm::None,
+        }
+    }
+}
+
+/// Identifies the software that produced a container.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct ProducerMetadata {
+    /// Name of the producing package or program.
+    pub name: String,
+    /// Name of the pipeline that generated the results.
+    pub pipeline: String,
+    /// Version string of the pipeline.
+    pub pipeline_version: String,
+    /// Git commit of the producing build, when known.
+    pub git_commit: Option<String>,
+    /// UTC timestamp recorded when the container metadata was created.
+    pub created_at: DateTime<Utc>,
+}
+
+/// Identifies the sample the container results belong to.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct SampleMetadata {
+    /// Sample identifier.
+    pub sample_id: String,
+    /// Label of the tumor time point, if any.
+    pub tumor_timepoint: Option<String>,
+    /// Label of the normal time point, if any.
+    pub normal_timepoint: Option<String>,
+    /// Name of the reference used, if any.
+    pub reference: Option<String>,
+    /// Digest of the reference, if one was computed.
+    pub reference_digest: Option<String>,
+}
+
+/// Header entry describing where a section is stored and how to decode it.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct SectionDescriptor {
+    /// Logical section name; used for lookup by `ContainerHeader::section`.
+    pub name: SectionName,
+    /// Payload encoding.
+    pub kind: SectionKind,
+    /// Compression applied to the stored bytes.
+    pub compression: CompressionMetadata,
+    /// Encryption applied to the stored bytes; the writer always sets `None`.
+    pub encryption: EncryptionAlgorithm,
+    /// Byte offset of the stored section, measured from the start of the file.
+    pub offset: u64,
+    /// Length in bytes of the stored (post-compression) section.
+    pub length: u64,
+    /// NOTE(review): presumably the base64 AES-GCM nonce; the writer always sets `None`.
+    pub nonce_b64: Option<String>,
+    /// NOTE(review): presumably the base64 AES-GCM tag; the writer always sets `None`.
+    pub tag_b64: Option<String>,
+    /// Checksum of the stored bytes in the form `blake3:<hex digest>`.
+    pub checksum: String,
+    /// Optional schema identifier attached by the section encoder.
+    pub schema_hash: Option<String>,
+    /// Whether the section was marked as required by its producer.
+    pub required: bool,
+}
+
+/// Container header: global metadata plus the table of section descriptors.
+///
+/// It is stored after the prelude as zstd-compressed MessagePack.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct ContainerHeader {
+    /// Format identifier; the reader requires `"somaticpipe.output"`.
+    pub format: String,
+    /// Format version; must equal [`PANDORA_FORMAT_VERSION`] to be written or read.
+    pub format_version: u16,
+    /// Producer information.
+    pub producer: ProducerMetadata,
+    /// Sample information.
+    pub sample: SampleMetadata,
+    /// Compression settings; `level` is used when compressing the header itself.
+    pub compression: CompressionMetadata,
+    /// Container-level encryption settings.
+    pub encryption: EncryptionMetadata,
+    /// Section table; filled in by the writer with final offsets and lengths.
+    pub sections: Vec<SectionDescriptor>,
+}
+
+impl ContainerHeader {
+    /// Returns the first section descriptor whose name equals `name`, if any.
+    pub fn section(&self, name: &SectionName) -> Option<&SectionDescriptor> {
+        for descriptor in &self.sections {
+            if descriptor.name == *name {
+                return Some(descriptor);
+            }
+        }
+        None
+    }
+}
+
+/// A section that has been encoded but not yet compressed or placed in a file.
+#[derive(Debug, Clone)]
+pub struct PendingSection {
+    /// Logical section name.
+    pub name: SectionName,
+    /// Encoding of `payload`.
+    pub kind: SectionKind,
+    /// Compression to apply when the section is stored.
+    pub compression: CompressionMetadata,
+    /// Optional schema identifier copied into the descriptor.
+    pub schema_hash: Option<String>,
+    /// Whether the section is flagged as required in its descriptor.
+    pub required: bool,
+    /// Encoded, uncompressed payload bytes.
+    pub payload: Vec<u8>,
+}
+
+impl PendingSection {
+    /// Creates a required section with default compression and no schema hash.
+    pub fn new(name: SectionName, kind: SectionKind, payload: Vec<u8>) -> Self {
+        let compression = CompressionMetadata::default();
+        PendingSection {
+            name,
+            kind,
+            compression,
+            schema_hash: None,
+            required: true,
+            payload,
+        }
+    }
+
+    /// Marks the section as not required.
+    pub fn optional(self) -> Self {
+        Self {
+            required: false,
+            ..self
+        }
+    }
+
+    /// Attaches a schema hash to the section.
+    pub fn with_schema_hash(self, schema_hash: impl Into<String>) -> Self {
+        Self {
+            schema_hash: Some(schema_hash.into()),
+            ..self
+        }
+    }
+
+    /// Replaces the compression settings used when the section is stored.
+    pub fn with_compression(self, compression: CompressionMetadata) -> Self {
+        Self {
+            compression,
+            ..self
+        }
+    }
+}
+
+/// A section read back from a container, after checksum verification and decompression.
+#[derive(Debug, Clone)]
+pub struct DecodedSection {
+    /// Descriptor of the section as found in the header.
+    pub descriptor: SectionDescriptor,
+    /// Decompressed payload bytes, still in the encoding named by `descriptor.kind`.
+    pub payload: Vec<u8>,
+}
+
+/// Serializes `header` as named-field MessagePack and zstd-compresses the result.
+///
+/// The header is always zstd-compressed, whatever `header.compression.algorithm`
+/// says; only `header.compression.level` is consulted (falling back to 3).
+///
+/// # Errors
+/// Fails if MessagePack serialization or zstd compression fails.
+pub fn encode_header(header: &ContainerHeader) -> anyhow::Result<Vec<u8>> {
+    let message_pack = rmp_serde::to_vec_named(header)?;
+    let level = header.compression.level.unwrap_or(3);
+    let compressed = zstd::bulk::compress(&message_pack, level)
+        .context("failed to zstd-compress .pandora header")?;
+    Ok(compressed)
+}
+
+/// Decodes a header produced by [`encode_header`]: zstd-decompress, then parse
+/// the MessagePack document.
+///
+/// The streaming decoder is used instead of `zstd::bulk::decompress` with a
+/// `usize::MAX` capacity: the bulk API may pre-allocate the requested capacity,
+/// whereas the streaming decoder grows its output buffer with the data produced.
+///
+/// # Errors
+/// Fails if `bytes` is not valid zstd data or the decompressed bytes are not a
+/// valid MessagePack-encoded [`ContainerHeader`].
+pub fn decode_header(bytes: &[u8]) -> anyhow::Result<ContainerHeader> {
+    let unpacked =
+        zstd::stream::decode_all(bytes).context("failed to decompress .pandora header")?;
+    rmp_serde::from_slice(&unpacked).context("failed to decode .pandora header")
+}
+
+/// Writes a `.pandora` container to `path`.
+///
+/// File layout: fixed prelude, encoded header, then every section payload in the
+/// order given by `sections`. Any descriptors already present in
+/// `header.sections` are ignored; the section table is rebuilt from `sections`.
+///
+/// # Errors
+/// Fails if `header.format_version` is not [`PANDORA_FORMAT_VERSION`], if the
+/// header requests AES-256-GCM encryption (not implemented), if a section cannot
+/// be compressed, if section offsets cannot be stabilized, or on any I/O error.
+pub fn write_container(
+    path: impl AsRef<Path>,
+    header: ContainerHeader,
+    sections: Vec<PendingSection>,
+) -> anyhow::Result<()> {
+    let path = path.as_ref();
+
+    if header.format_version != PANDORA_FORMAT_VERSION {
+        bail!(
+            "unsupported .pandora format version for writer: {}",
+            header.format_version
+        );
+    }
+    if matches!(header.encryption.algorithm, EncryptionAlgorithm::Aes256Gcm) {
+        bail!("encrypted .pandora writing is not implemented yet");
+    }
+
+    let stored_sections = sections
+        .into_iter()
+        .map(encode_pending_section)
+        .collect::<anyhow::Result<Vec<_>>>()?;
+
+    // The returned bytes are already the header encoded with the final section
+    // descriptors, so they are written as-is instead of encoding the header again.
+    let (header_bytes, _descriptors) = finalize_header_sections(&header, &stored_sections)?;
+    let header_checksum = blake3::hash(&header_bytes);
+
+    let prelude = ContainerPrelude {
+        header_len: header_bytes.len() as u64,
+        header_checksum: *header_checksum.as_bytes(),
+        ..Default::default()
+    };
+
+    let mut writer =
+        File::create(path).with_context(|| format!("failed to create {}", path.display()))?;
+    prelude
+        .write_to(&mut writer)
+        .with_context(|| format!("failed to write prelude to {}", path.display()))?;
+    writer
+        .write_all(&header_bytes)
+        .with_context(|| format!("failed to write header to {}", path.display()))?;
+    for stored in &stored_sections {
+        writer.write_all(&stored.payload).with_context(|| {
+            format!(
+                "failed to write section {:?} to {}",
+                stored.descriptor.name,
+                path.display()
+            )
+        })?;
+    }
+    Ok(())
+}
+
+/// Opens the container at `path` and returns its validated prelude and header.
+///
+/// The prelude's magic and version are checked, the header bytes are verified
+/// against the prelude's BLAKE3 checksum, and the decoded header's format name
+/// and version are validated.
+///
+/// # Errors
+/// Fails on I/O errors, on an invalid prelude, when the declared header length
+/// does not fit in the file, on a checksum mismatch, or when the header cannot
+/// be decoded or validated.
+pub fn read_header(path: impl AsRef<Path>) -> anyhow::Result<(ContainerPrelude, ContainerHeader)> {
+    let path = path.as_ref();
+    let mut reader =
+        File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
+    let file_len = reader
+        .metadata()
+        .with_context(|| format!("failed to read metadata of {}", path.display()))?
+        .len();
+
+    let prelude = ContainerPrelude::from_reader(&mut reader)
+        .with_context(|| format!("failed to read .pandora prelude from {}", path.display()))?;
+    validate_prelude(&prelude)?;
+
+    // `header_len` comes straight from the file: make sure it fits in what is left
+    // after the prelude before allocating a buffer of that size.
+    let available = file_len.saturating_sub(PANDORA_PRELUDE_LEN as u64);
+    if prelude.header_len > available {
+        bail!(
+            "declared .pandora header length {} exceeds remaining file size {}",
+            prelude.header_len,
+            available
+        );
+    }
+    let header_len = usize::try_from(prelude.header_len)
+        .context(".pandora header length does not fit in memory on this platform")?;
+
+    let mut header_bytes = vec![0; header_len];
+    reader
+        .read_exact(&mut header_bytes)
+        .with_context(|| format!("failed to read .pandora header from {}", path.display()))?;
+    verify_checksum(&header_bytes, &prelude.header_checksum, "header")?;
+
+    let header = decode_header(&header_bytes)?;
+    validate_header(&header)?;
+    Ok((prelude, header))
+}
+
+/// Reads and decompresses the section called `name` from the container at `path`.
+///
+/// Returns `Ok(None)` when the header has no descriptor with that name. The
+/// stored bytes are verified against the descriptor checksum before they are
+/// decompressed. The file is opened once by [`read_header`] and once more here.
+///
+/// # Errors
+/// Fails when the header cannot be read, when the descriptor's offset/length do
+/// not fit inside the file, on I/O errors, on a checksum mismatch, when the
+/// section is encrypted (not implemented), or when decompression fails.
+pub fn read_section(
+    path: impl AsRef<Path>,
+    name: &SectionName,
+) -> anyhow::Result<Option<DecodedSection>> {
+    let path = path.as_ref();
+    let (_, header) = read_header(path)?;
+    let Some(descriptor) = header.section(name).cloned() else {
+        return Ok(None);
+    };
+
+    let mut reader =
+        File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
+    let file_len = reader
+        .metadata()
+        .with_context(|| format!("failed to read metadata of {}", path.display()))?
+        .len();
+
+    // Offset and length come from the file: check them before seeking/allocating.
+    let end = descriptor
+        .offset
+        .checked_add(descriptor.length)
+        .with_context(|| format!("section {:?} offset + length overflows", descriptor.name))?;
+    if end > file_len {
+        bail!(
+            "section {:?} (offset {}, length {}) extends past end of file ({} bytes)",
+            descriptor.name,
+            descriptor.offset,
+            descriptor.length,
+            file_len
+        );
+    }
+    let length = usize::try_from(descriptor.length).with_context(|| {
+        format!(
+            "section {:?} length does not fit in memory on this platform",
+            descriptor.name
+        )
+    })?;
+
+    reader.seek(SeekFrom::Start(descriptor.offset))?;
+
+    let mut stored = vec![0; length];
+    reader.read_exact(&mut stored)?;
+    verify_prefixed_checksum(
+        &stored,
+        &descriptor.checksum,
+        &format!("{:?}", descriptor.name),
+    )?;
+
+    if matches!(descriptor.encryption, EncryptionAlgorithm::Aes256Gcm) {
+        bail!("encrypted .pandora sections are not implemented yet");
+    }
+
+    let payload = decode_section_payload(&stored, &descriptor.compression)?;
+    Ok(Some(DecodedSection {
+        descriptor,
+        payload,
+    }))
+}
+
+pub fn read_required_section(
+    path: impl AsRef<Path>,
+    name: &SectionName,
+) -> anyhow::Result<DecodedSection> {
+    read_section(path, name)?.with_context(|| format!("missing required section: {name:?}"))
+}
+
+/// Builds the pending `Variants` section: bitcode-encoded payload tagged with the
+/// logical schema hash for `"Variants"`.
+pub fn variants_section(variants: &Variants) -> PendingSection {
+    let encoded = bitcode::encode(variants);
+    let schema = logical_schema_hash("Variants");
+    let section = PendingSection::new(SectionName::Variants, SectionKind::Bitcode, encoded);
+    section.with_schema_hash(schema)
+}
+
+/// Decodes a `Variants` section after checking its name and that it is bitcode-encoded.
+pub fn decode_variants_section(section: &DecodedSection) -> anyhow::Result<Variants> {
+    ensure_section_kind(section, &SectionName::Variants, SectionKind::Bitcode)?;
+    let decoded: Result<Variants, _> = bitcode::decode(&section.payload);
+    decoded.context("failed to decode Variants bitcode payload")
+}
+
+/// Builds the pending `CopyNumber` section as MessagePack, tagged with the
+/// logical schema hash for `"SavanaCN"`.
+pub fn copy_number_section(copy_number: &SavanaCN) -> anyhow::Result<PendingSection> {
+    let schema = logical_schema_hash("SavanaCN");
+    message_pack_section(SectionName::CopyNumber, copy_number, schema)
+}
+
+/// Decodes a `CopyNumber` section after checking its name and that it is MessagePack.
+pub fn decode_copy_number_section(section: &DecodedSection) -> anyhow::Result<SavanaCN> {
+    ensure_section_kind(section, &SectionName::CopyNumber, SectionKind::MessagePack)
+        .and_then(|()| decode_message_pack_payload(&section.payload, "SavanaCN"))
+}
+
+/// Builds the pending `BamQc` section as MessagePack, tagged with the logical
+/// schema hash for `"BamQcPayload"`.
+pub fn bam_qc_section(payload: &BamQcPayload) -> anyhow::Result<PendingSection> {
+    let schema = logical_schema_hash("BamQcPayload");
+    message_pack_section(SectionName::BamQc, payload, schema)
+}
+
+/// Decodes a `BamQc` section after checking its name and that it is MessagePack.
+pub fn decode_bam_qc_section(section: &DecodedSection) -> anyhow::Result<BamQcPayload> {
+    ensure_section_kind(section, &SectionName::BamQc, SectionKind::MessagePack)
+        .and_then(|()| decode_message_pack_payload(&section.payload, "BamQcPayload"))
+}
+
+/// Builds the pending `PipeQc` section as MessagePack, tagged with the logical
+/// schema hash for `"PipeQcPayload"`.
+pub fn pipe_qc_section(payload: &PipeQcPayload) -> anyhow::Result<PendingSection> {
+    let schema = logical_schema_hash("PipeQcPayload");
+    message_pack_section(SectionName::PipeQc, payload, schema)
+}
+
+/// Decodes a `PipeQc` section after checking its name and that it is MessagePack.
+pub fn decode_pipe_qc_section(section: &DecodedSection) -> anyhow::Result<PipeQcPayload> {
+    ensure_section_kind(section, &SectionName::PipeQc, SectionKind::MessagePack)
+        .and_then(|()| decode_message_pack_payload(&section.payload, "PipeQcPayload"))
+}
+
+/// Builds the pending `Provenance` section as MessagePack, tagged with the
+/// logical schema hash for `"ProvenancePayload"`.
+pub fn provenance_section(payload: &ProvenancePayload) -> anyhow::Result<PendingSection> {
+    let schema = logical_schema_hash("ProvenancePayload");
+    message_pack_section(SectionName::Provenance, payload, schema)
+}
+
+/// Decodes a `Provenance` section after checking its name and that it is MessagePack.
+pub fn decode_provenance_section(section: &DecodedSection) -> anyhow::Result<ProvenancePayload> {
+    ensure_section_kind(section, &SectionName::Provenance, SectionKind::MessagePack)
+        .and_then(|()| decode_message_pack_payload(&section.payload, "ProvenancePayload"))
+}
+
+/// Encodes `payload` as named-field MessagePack and wraps it in a pending
+/// section called `name`, tagged with `schema_hash`.
+fn message_pack_section<T: Serialize>(
+    name: SectionName,
+    payload: &T,
+    schema_hash: String,
+) -> anyhow::Result<PendingSection> {
+    let encoded = match rmp_serde::to_vec_named(payload) {
+        Ok(bytes) => bytes,
+        Err(err) => {
+            return Err(err)
+                .with_context(|| format!("failed to encode {name:?} MessagePack payload"));
+        }
+    };
+    let section = PendingSection::new(name, SectionKind::MessagePack, encoded);
+    Ok(section.with_schema_hash(schema_hash))
+}
+
+/// Parses `bytes` as MessagePack into `T`; `label` names the payload type in the
+/// error message.
+fn decode_message_pack_payload<T: DeserializeOwned>(
+    bytes: &[u8],
+    label: &str,
+) -> anyhow::Result<T> {
+    let decoded: Result<T, _> = rmp_serde::from_slice(bytes);
+    decoded.with_context(|| format!("failed to decode {label} MessagePack payload"))
+}
+
+/// Checks that `section` carries the expected name and payload encoding.
+///
+/// The name is checked first, so a section with both a wrong name and a wrong
+/// kind is reported as a name mismatch.
+fn ensure_section_kind(
+    section: &DecodedSection,
+    expected_name: &SectionName,
+    expected_kind: SectionKind,
+) -> anyhow::Result<()> {
+    let SectionDescriptor { name, kind, .. } = &section.descriptor;
+
+    if name != expected_name {
+        bail!("expected section {expected_name:?}, found {:?}", name);
+    }
+    if *kind != expected_kind {
+        bail!("expected section kind {expected_kind:?}, found {:?}", kind);
+    }
+    Ok(())
+}
+
+/// Returns `blake3:<hex>` where the digest is computed over the bytes of `name`.
+///
+/// Only the name string is hashed; the structure of the named type is not
+/// inspected.
+fn logical_schema_hash(name: &str) -> String {
+    let digest = blake3::hash(name.as_bytes());
+    format!("blake3:{}", digest.to_hex())
+}
+
+/// Rejects a prelude whose magic or version is not the one this module writes.
+/// The magic is checked before the version.
+fn validate_prelude(prelude: &ContainerPrelude) -> anyhow::Result<()> {
+    let ContainerPrelude { magic, version, .. } = prelude;
+
+    if magic != PANDORA_MAGIC {
+        bail!("invalid .pandora magic");
+    }
+    match *version {
+        PANDORA_FORMAT_VERSION => Ok(()),
+        other => bail!("unsupported .pandora format version: {}", other),
+    }
+}
+
+/// Rejects a header whose format name is not `"somaticpipe.output"` or whose
+/// version differs from [`PANDORA_FORMAT_VERSION`]. The format name is checked first.
+fn validate_header(header: &ContainerHeader) -> anyhow::Result<()> {
+    let format_matches = header.format == "somaticpipe.output";
+    if !format_matches {
+        bail!("invalid .pandora header format: {}", header.format);
+    }
+
+    match header.format_version {
+        PANDORA_FORMAT_VERSION => Ok(()),
+        other => bail!("unsupported .pandora header version: {}", other),
+    }
+}
+
+/// Compares the BLAKE3 digest of `bytes` with `expected`; `label` names the
+/// checked item in the error message.
+fn verify_checksum(bytes: &[u8], expected: &[u8; 32], label: &str) -> anyhow::Result<()> {
+    let digest = blake3::hash(bytes);
+    if digest.as_bytes() == expected {
+        return Ok(());
+    }
+    bail!("{label} checksum mismatch")
+}
+
+/// Verifies `bytes` against a checksum string of the form `blake3:<hex digest>`.
+///
+/// The comparison is an exact string match against the hex digest produced by
+/// `blake3`; `label` names the checked item in error messages.
+fn verify_prefixed_checksum(bytes: &[u8], expected: &str, label: &str) -> anyhow::Result<()> {
+    let hex_digest = match expected.strip_prefix("blake3:") {
+        Some(rest) => rest,
+        None => bail!("{label} checksum does not use blake3 prefix"),
+    };
+
+    let computed = blake3::hash(bytes).to_hex().to_string();
+    if computed == hex_digest {
+        return Ok(());
+    }
+    bail!("{label} checksum mismatch")
+}
+
+/// A section in its on-disk form, paired with the descriptor that will describe it.
+#[derive(Debug, Clone)]
+struct StoredSection {
+    /// Descriptor for the stored bytes; its `offset` is 0 until offsets are assigned.
+    descriptor: SectionDescriptor,
+    /// Bytes exactly as they will be written to the file (after compression).
+    payload: Vec<u8>,
+}
+
+/// Compresses a pending section and builds its descriptor.
+///
+/// The checksum and length describe the stored (post-compression) bytes. The
+/// offset is left at 0 and encryption is always `None`.
+fn encode_pending_section(section: PendingSection) -> anyhow::Result<StoredSection> {
+    let PendingSection {
+        name,
+        kind,
+        compression,
+        schema_hash,
+        required,
+        payload: raw,
+    } = section;
+
+    let payload = encode_section_payload(&raw, &compression)?;
+    let digest = blake3::hash(&payload).to_hex();
+
+    let descriptor = SectionDescriptor {
+        name,
+        kind,
+        compression,
+        encryption: EncryptionAlgorithm::None,
+        offset: 0,
+        length: payload.len() as u64,
+        nonce_b64: None,
+        tag_b64: None,
+        checksum: format!("blake3:{}", digest),
+        schema_hash,
+        required,
+    };
+
+    Ok(StoredSection {
+        descriptor,
+        payload,
+    })
+}
+
+/// Applies the requested compression to `bytes`.
+///
+/// `None` copies the input unchanged; `Zstd` compresses at `compression.level`,
+/// falling back to level 3.
+fn encode_section_payload(
+    bytes: &[u8],
+    compression: &CompressionMetadata,
+) -> anyhow::Result<Vec<u8>> {
+    let level = match compression.algorithm {
+        CompressionAlgorithm::None => return Ok(bytes.to_vec()),
+        CompressionAlgorithm::Zstd => compression.level.unwrap_or(3),
+    };
+    zstd::bulk::compress(bytes, level).context("failed to zstd-compress .pandora section")
+}
+
+/// Reverses [`encode_section_payload`].
+///
+/// `None` copies the input unchanged; `Zstd` decompresses it. The streaming
+/// decoder is used instead of `zstd::bulk::decompress` with a `usize::MAX`
+/// capacity: the bulk API may pre-allocate the requested capacity, whereas the
+/// streaming decoder grows its output buffer with the data produced.
+///
+/// # Errors
+/// Fails if the bytes are not valid zstd data.
+fn decode_section_payload(
+    bytes: &[u8],
+    compression: &CompressionMetadata,
+) -> anyhow::Result<Vec<u8>> {
+    match compression.algorithm {
+        CompressionAlgorithm::None => Ok(bytes.to_vec()),
+        CompressionAlgorithm::Zstd => zstd::stream::decode_all(bytes)
+            .context("failed to zstd-decompress .pandora section"),
+    }
+}
+
+/// Assigns file offsets to every section descriptor and returns the encoded
+/// header together with the finalized descriptors.
+///
+/// Section offsets start right after the prelude and the encoded header, but the
+/// descriptors (including their offsets) are themselves part of the encoded
+/// header, so the encoded size can change with the offset values. The loop
+/// re-encodes a probe header until two consecutive probes have the same encoded
+/// length, giving up after 16 attempts.
+///
+/// # Errors
+/// Fails if the header cannot be encoded or if the encoded length does not
+/// stabilize within 16 iterations.
+fn finalize_header_sections(
+    header: &ContainerHeader,
+    stored_sections: &[StoredSection],
+) -> anyhow::Result<(Vec<u8>, Vec<SectionDescriptor>)> {
+    // Start from the descriptors built by the section encoder (offset still 0).
+    let mut descriptors: Vec<SectionDescriptor> = stored_sections
+        .iter()
+        .map(|stored| stored.descriptor.clone())
+        .collect();
+
+    let mut previous_header_len = None;
+    for _ in 0..16 {
+        // Encode a probe header using the offsets from the previous iteration.
+        let mut probe = header.clone();
+        probe.sections = descriptors.clone();
+        let header_bytes = encode_header(&probe)?;
+        // The first section starts immediately after the prelude and the header.
+        let mut offset = (PANDORA_PRELUDE_LEN + header_bytes.len()) as u64;
+
+        // Sections are laid out back to back in their original order.
+        for (descriptor, stored) in descriptors.iter_mut().zip(stored_sections) {
+            descriptor.offset = offset;
+            descriptor.length = stored.payload.len() as u64;
+            offset += descriptor.length;
+        }
+
+        // Same encoded length as the previous probe: the offsets just assigned
+        // equal the ones the probe was encoded with, so the layout is consistent.
+        if previous_header_len == Some(header_bytes.len()) {
+            let mut final_header = header.clone();
+            final_header.sections = descriptors.clone();
+            return Ok((encode_header(&final_header)?, descriptors));
+        }
+        previous_header_len = Some(header_bytes.len());
+    }
+
+    bail!("failed to stabilize .pandora header length after offset assignment")
+}
+
+/// Payload of the `BamQc` section.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct BamQcPayload {
+    /// BAM statistics keyed by a role string chosen by the producer.
+    pub by_role: BTreeMap<String, WGSBamStats>,
+}
+
+/// Payload of the `PipeQc` section: statistics gathered while the pipeline ran.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct PipeQcPayload {
+    /// Statistics of the somatic pipeline run.
+    pub somatic_pipe_stats: SomaticPipeStats,
+    /// Statistics of the final variant set, when available.
+    pub variant_stats: Option<VariantsStats>,
+    /// Annotation statistics keyed by a stage label chosen by the producer.
+    pub annotation_stats: BTreeMap<String, AnnotationsStats>,
+    /// VEP statistics, when available.
+    pub vep_stats: Option<VepStats>,
+    /// Per-caller summaries.
+    pub caller_outputs: Vec<CallerOutputSummary>,
+    /// Per-filter-step summaries.
+    pub filter_steps: Vec<FilterStepSummary>,
+}
+
+/// Variant counts for a single caller.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct CallerOutputSummary {
+    /// Caller name.
+    pub caller: String,
+    /// Number of input variants from this caller.
+    pub n_input: usize,
+    /// Number of variants left after filtering, when known.
+    pub n_after_filters: Option<usize>,
+}
+
+/// Outcome of a single filtering step.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct FilterStepSummary {
+    /// Name of the filter step.
+    pub name: String,
+    /// Number of variants removed by this step.
+    pub removed: usize,
+}
+
+/// Payload of the `Provenance` section.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct ProvenancePayload {
+    /// Digest of the configuration used for the run, when it could be computed.
+    pub config_digest: Option<String>,
+    /// Digests of input files, keyed by a producer-chosen identifier.
+    pub input_digests: BTreeMap<String, String>,
+    /// Tool versions, keyed by tool name.
+    pub tool_versions: BTreeMap<String, String>,
+    /// Command lines recorded for the run.
+    pub command_lines: Vec<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Builds a minimal, unencrypted header with an empty section table.
+    fn test_header() -> ContainerHeader {
+        let producer = ProducerMetadata {
+            name: "pandora_lib_promethion".to_string(),
+            pipeline: "SomaticPipe".to_string(),
+            pipeline_version: "0.1.0".to_string(),
+            git_commit: None,
+            created_at: Utc::now(),
+        };
+        let sample = SampleMetadata {
+            sample_id: "sample_001".to_string(),
+            tumor_timepoint: Some("diag".to_string()),
+            normal_timepoint: Some("constit".to_string()),
+            reference: Some("hs1".to_string()),
+            reference_digest: None,
+        };
+
+        ContainerHeader {
+            format: "somaticpipe.output".to_string(),
+            format_version: PANDORA_FORMAT_VERSION,
+            producer,
+            sample,
+            compression: CompressionMetadata::default(),
+            encryption: EncryptionMetadata::default(),
+            sections: Vec::new(),
+        }
+    }
+
+    /// A section written to disk is read back with the same payload and name.
+    #[test]
+    fn writes_and_reads_raw_section() -> anyhow::Result<()> {
+        let file_name = format!("{}.pandora", uuid::Uuid::new_v4());
+        let path = std::env::temp_dir().join(file_name);
+        let payload = b"hello pandora".to_vec();
+        let pending = vec![PendingSection::new(
+            SectionName::Provenance,
+            SectionKind::RawBytes,
+            payload.clone(),
+        )];
+
+        write_container(&path, test_header(), pending)?;
+        let decoded = read_required_section(&path, &SectionName::Provenance)?;
+
+        assert_eq!(decoded.payload, payload);
+        assert_eq!(decoded.descriptor.name, SectionName::Provenance);
+
+        std::fs::remove_file(path)?;
+        Ok(())
+    }
+
+    /// An all-zero prelude parses but fails validation.
+    #[test]
+    fn rejects_bad_magic() -> anyhow::Result<()> {
+        let zeroed = [0u8; PANDORA_PRELUDE_LEN];
+        let prelude = ContainerPrelude::from_reader(zeroed.as_slice())?;
+        assert!(validate_prelude(&prelude).is_err());
+        Ok(())
+    }
+}

+ 115 - 8
src/pipes/somatic.rs

@@ -12,7 +12,7 @@ use crate::{
 };
 use log::info;
 use rayon::slice::ParallelSliceMut;
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 use std::{
     collections::BTreeMap,
     fs::{self, File},
@@ -22,11 +22,21 @@ use std::{
 use crate::{
     annotation::{Annotation, Annotations, AnnotationsStats, Sample},
     callers::{
-        clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV,
-        savana::Savana, severus::Severus,
+        clairs::ClairS,
+        deep_somatic::DeepSomatic,
+        deep_variant::DeepVariant,
+        nanomonsv::NanomonSV,
+        savana::{Savana, SavanaCN},
+        severus::Severus,
     },
+    collection::bam_stats::WGSBamStats,
     config::Config,
     create_should_run, init_somatic_callers,
+    io::somaticpipe_container::{
+        bam_qc_section, copy_number_section, pipe_qc_section, provenance_section, variants_section,
+        BamQcPayload, CompressionMetadata, ContainerHeader, EncryptionMetadata, PipeQcPayload,
+        ProducerMetadata, ProvenancePayload, SampleMetadata,
+    },
     runners::Run,
     variant::{
         variant_collection::{ExternalAnnotation, VariantCollection, Variants},
@@ -210,8 +220,9 @@ impl Run for SomaticPipe {
         let result_json = format!("{}/{id}_somatic_variants.json.gz", config.tumoral_dir(&id));
         let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id));
         let result_vcf = format!("{}/{id}_somatic_variants.vcf.gz", config.tumoral_dir(&id));
+        let result_pandora = format!("{}/{id}.pandora", config.tumoral_dir(&id));
 
-        let outputs = [&result_json, &result_bit, &result_vcf];
+        let outputs = [&result_json, &result_bit, &result_vcf, &result_pandora];
         if !config.somatic_pipe_force && outputs.iter().any(|p| Path::new(p).exists()) {
             return Err(anyhow::anyhow!(
                 "Somatic Pipe output already exists for {id}."
@@ -514,8 +525,8 @@ impl Run for SomaticPipe {
         let vep_stats = annotations.vep_stats()?;
         vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?;
 
-        VariantsStats::new(&mut variants, &id, &config, &high_depth_ranges)?
-            .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?;
+        let variant_stats = VariantsStats::new(&mut variants, &id, &config, &high_depth_ranges)?;
+        variant_stats.save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?;
 
         info!("Final unique variants: {}", variants.data.len());
 
@@ -537,11 +548,107 @@ impl Run for SomaticPipe {
         variants.save_to_json(&result_json)?;
         variants.save_to_file(&result_bit)?;
         variants.write_vcf(&result_vcf, &config.dict_file, config.somatic_pipe_force)?;
+        write_pandora_output(
+            &result_pandora,
+            &id,
+            &config,
+            &variants,
+            somatic_stats,
+            variant_stats,
+            vep_stats,
+            &annotations.callers_stat(None),
+        )?;
 
         Ok(())
     }
 }
 
+/// Assembles the `.pandora` container for a finished SomaticPipe run and writes
+/// it to `output_path`.
+///
+/// Sections written:
+/// - `Variants`: always.
+/// - `CopyNumber`: only when `SavanaCN::parse_file` succeeds; flagged optional.
+/// - `BamQc`: only when BAM statistics could be opened for at least one of the
+///   tumor/normal names.
+/// - `PipeQc` and `Provenance`: always.
+///
+/// Failures to load the copy-number or BAM QC inputs are logged and skipped.
+/// A failure to serialize the config for the provenance digest is logged as a
+/// warning and the digest is left empty.
+///
+/// # Errors
+/// Fails if a section cannot be encoded or the container cannot be written.
+// The arguments mirror the independent results produced by the pipeline run;
+// bundling them into a struct would only serve this single call site.
+#[allow(clippy::too_many_arguments)]
+fn write_pandora_output(
+    output_path: &str,
+    id: &str,
+    config: &Config,
+    variants: &Variants,
+    somatic_pipe_stats: SomaticPipeStats,
+    variant_stats: VariantsStats,
+    vep_stats: crate::annotation::VepStats,
+    final_annotation_stats: &AnnotationsStats,
+) -> anyhow::Result<()> {
+    let mut sections = vec![variants_section(variants)];
+
+    match SavanaCN::parse_file(id, config) {
+        Ok(copy_number) => sections.push(copy_number_section(&copy_number)?.optional()),
+        Err(err) => info!("Skipping .pandora copy_number section for {id}: {err}"),
+    }
+
+    // BAM QC is best effort: each role is attempted independently and the section
+    // is only emitted when at least one set of statistics was loaded.
+    let mut bam_qc = BamQcPayload {
+        by_role: BTreeMap::new(),
+    };
+    for (role, name) in [
+        ("tumor", &config.tumoral_name),
+        ("normal", &config.normal_name),
+    ] {
+        match WGSBamStats::open(id, name, config) {
+            Ok(stats) => {
+                bam_qc.by_role.insert(name.clone(), stats);
+            }
+            Err(err) => info!("Skipping .pandora {role} BAM QC for {id}: {err}"),
+        }
+    }
+    if !bam_qc.by_role.is_empty() {
+        sections.push(bam_qc_section(&bam_qc)?);
+    }
+
+    let mut annotation_stats = BTreeMap::new();
+    annotation_stats.insert("final".to_string(), final_annotation_stats.clone());
+
+    let pipe_qc = PipeQcPayload {
+        somatic_pipe_stats,
+        variant_stats: Some(variant_stats),
+        annotation_stats,
+        vep_stats: Some(vep_stats),
+        caller_outputs: Vec::new(),
+        filter_steps: Vec::new(),
+    };
+    sections.push(pipe_qc_section(&pipe_qc)?);
+
+    // The digest stays optional, but a serialization failure is reported instead
+    // of being dropped silently.
+    let config_digest = match serde_json::to_vec(config) {
+        Ok(bytes) => Some(format!("blake3:{}", blake3::hash(&bytes).to_hex())),
+        Err(err) => {
+            log::warn!("Could not compute .pandora config digest for {id}: {err}");
+            None
+        }
+    };
+    let provenance = ProvenancePayload {
+        config_digest,
+        input_digests: BTreeMap::new(),
+        tool_versions: BTreeMap::new(),
+        command_lines: Vec::new(),
+    };
+    sections.push(provenance_section(&provenance)?);
+
+    let header = ContainerHeader {
+        format: "somaticpipe.output".to_string(),
+        format_version: crate::io::somaticpipe_container::PANDORA_FORMAT_VERSION,
+        producer: ProducerMetadata {
+            name: env!("CARGO_PKG_NAME").to_string(),
+            pipeline: "SomaticPipe".to_string(),
+            pipeline_version: env!("CARGO_PKG_VERSION").to_string(),
+            git_commit: option_env!("GIT_COMMIT").map(ToString::to_string),
+            created_at: chrono::Utc::now(),
+        },
+        sample: SampleMetadata {
+            sample_id: id.to_string(),
+            tumor_timepoint: Some(config.tumoral_name.clone()),
+            normal_timepoint: Some(config.normal_name.clone()),
+            reference: Some(config.reference_name.clone()),
+            reference_digest: None,
+        },
+        compression: CompressionMetadata::default(),
+        encryption: EncryptionMetadata::default(),
+        sections: Vec::new(),
+    };
+
+    crate::io::somaticpipe_container::write_container(output_path, header, sections)
+}
+
 pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
     info!("Loading Germline");
     let annotations = Annotations::default();
@@ -563,7 +670,7 @@ pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
 
 /// Holds statistical data for somatic variant pipeline processing,
 /// including summary counts and input categorization.
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
 pub struct SomaticPipeStats {
     /// Summary of input variant collections grouped by sample type.
     pub input: InputStats,
@@ -816,7 +923,7 @@ impl SomaticPipeStats {
 /// Each vector contains tuples of the form `(Annotation, usize)`, where:
 /// - `Annotation` identifies the caller and sample type.
 /// - `usize` represents the number of variants for that annotation.
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
 pub struct InputStats {
     /// Variants from tumor-only samples.
     pub solo_tumor: Vec<(Annotation, usize)>,