Kaynağa Gözat

Add Pandora reader and Windows curl patch

STEIMLE Thomas 2 hafta önce
ebeveyn
işleme
fdc969a486
3 değiştirilmiş dosya ile 176 ekleme ve 9 silme
  1. Cargo.lock (+1 -1)
  2. setup-patches.ps1 (+1 -1)
  3. src/io/somaticpipe_container.rs (+174 -7)

+ 1 - 1
Cargo.lock

@@ -2019,7 +2019,7 @@ dependencies = [
 [[package]]
 name = "pandora_lib_assembler"
 version = "0.1.0"
-source = "git+https://git.t0m4.fr/Thomas/pandora_lib_assembler.git#3d241b7dbc175701e6454a9befffb4acdcdd26f3"
+source = "git+https://git.t0m4.fr/Thomas/pandora_lib_assembler.git#0a2145264fa97784c254ef96d1e2fe6b3f2e62e3"
 dependencies = [
  "anyhow",
  "bam",

+ 1 - 1
setup-patches.ps1

@@ -48,7 +48,7 @@ else { Write-Host "Fix 3b applied (_FILE_OFFSET_BITS=64 bindgen)"; $text = $resu
 # hts_expr.c needs POSIX regex; systre.c needs TRE. libregex depends on
 # libintl/gettext, which depends on libiconv.
 $old5 = "    cfg.file(`"wrapper.c`");`n    cfg.compile(`"hts`");"
-$new5 = "    cfg.file(`"wrapper.c`");`n    cfg.compile(`"hts`");`n`n    // hts_expr.c uses POSIX regex (regcomp/regexec/regfree) -- provided by libregex (gnurx).`n    // systre.c uses the TRE regex API (tre_regexec/tre_regerror) -- provided by libtre.`n    // Both live in the MinGW lib dir; locate it via gcc -print-file-name.`n    // Link statically so the exe has no runtime dependency on .dll files.`n    if target_os == `"windows`" {`n        let compiler = cfg.get_compiler();`n        let mingw_lib = std::process::Command::new(compiler.path())`n            .arg(`"-print-file-name=libtre.a`")`n            .output()`n            .ok()`n            .and_then(|o| String::from_utf8(o.stdout).ok())`n            .map(|s| s.trim().to_string())`n            .filter(|s| s != `"libtre.a`")`n            .and_then(|s| std::path::PathBuf::from(s).parent().map(|p| p.to_path_buf()));`n        if let Some(dir) = mingw_lib {`n            println!(`"cargo:rustc-link-search=native={}`", dir.display());`n        }`n        println!(`"cargo:rustc-link-lib=static=tre`");    // systre.c: tre_regexec/tre_regerror`n        println!(`"cargo:rustc-link-lib=static=regex`");  // hts_expr.c: regcomp/regexec/regfree`n        println!(`"cargo:rustc-link-lib=static=intl`");   // libregex dep: gettext`n        println!(`"cargo:rustc-link-lib=static=iconv`");  // libintl dep: iconv`n    }"
+$new5 = "    cfg.file(`"wrapper.c`");`n    cfg.compile(`"hts`");`n`n    // hts_expr.c uses POSIX regex (regcomp/regexec/regfree) -- provided by libregex (gnurx).`n    // systre.c uses the TRE regex API (tre_regexec/tre_regerror) -- provided by libtre.`n    // Both live in the MinGW lib dir; locate it via gcc -print-file-name.`n    // Link statically so the exe has no runtime dependency on .dll files.`n    if target_os == `"windows`" {`n        let compiler = cfg.get_compiler();`n        let mingw_lib = std::process::Command::new(compiler.path())`n            .arg(`"-print-file-name=libtre.a`")`n            .output()`n            .ok()`n            .and_then(|o| String::from_utf8(o.stdout).ok())`n            .map(|s| s.trim().to_string())`n            .filter(|s| s != `"libtre.a`")`n            .and_then(|s| std::path::PathBuf::from(s).parent().map(|p| p.to_path_buf()));`n        if let Some(dir) = mingw_lib {`n            println!(`"cargo:rustc-link-search=native={}`", dir.display());`n        }`n        println!(`"cargo:rustc-link-lib=dylib:+verbatim=libcurl.dll.a`"); // hfile_libcurl.c: curl_easy/curl_multi APIs`n        println!(`"cargo:rustc-link-lib=static=tre`");    // systre.c: tre_regexec/tre_regerror`n        println!(`"cargo:rustc-link-lib=static=regex`");  // hts_expr.c: regcomp/regexec/regfree`n        println!(`"cargo:rustc-link-lib=static=intl`");   // libregex dep: gettext`n        println!(`"cargo:rustc-link-lib=static=iconv`");  // libintl dep: iconv`n    }"
 $result = $text.Replace($old5, $new5)
 if ($result -eq $text) { Write-Warning "Fix 4 pattern not matched - verify hts-sys version" }
 else { Write-Host "Fix 4 applied (static-link regex/tre/intl/iconv on Windows)"; $text = $result }

+ 174 - 7
src/io/somaticpipe_container.rs

@@ -6,8 +6,8 @@
 use std::{
     collections::BTreeMap,
     fs::File,
-    io::{Read, Seek, SeekFrom, Write},
-    path::Path,
+    io::{Cursor, Read, Seek, SeekFrom, Write},
+    path::{Path, PathBuf},
 };
 
 use anyhow::{bail, Context};
@@ -251,6 +251,141 @@ pub struct DecodedSection {
     pub payload: Vec<u8>,
 }
 
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PandoraSummary {
+    pub prelude: ContainerPrelude,
+    pub header: ContainerHeader,
+    pub sections: Vec<PandoraSectionSummary>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct PandoraSectionSummary {
+    pub name: SectionName,
+    pub kind: SectionKind,
+    pub offset: u64,
+    pub length: u64,
+    pub checksum: String,
+    pub required: bool,
+}
+
+#[derive(Debug, Clone)]
+pub struct PandoraReader {
+    path: PathBuf,
+    prelude: ContainerPrelude,
+    header: ContainerHeader,
+}
+
+impl PandoraReader {
+    pub fn open(path: impl AsRef<Path>) -> anyhow::Result<Self> {
+        let path = path.as_ref().to_path_buf();
+        let (prelude, header) = read_header(&path)?;
+        Ok(Self {
+            path,
+            prelude,
+            header,
+        })
+    }
+
+    pub fn path(&self) -> &Path {
+        &self.path
+    }
+
+    pub fn prelude(&self) -> &ContainerPrelude {
+        &self.prelude
+    }
+
+    pub fn header(&self) -> &ContainerHeader {
+        &self.header
+    }
+
+    pub fn section_descriptor(&self, name: &SectionName) -> Option<&SectionDescriptor> {
+        self.header.section(name)
+    }
+
+    pub fn summary(&self) -> PandoraSummary {
+        PandoraSummary::from_parts(self.prelude.clone(), self.header.clone())
+    }
+
+    pub fn read_section(&self, name: &SectionName) -> anyhow::Result<Option<DecodedSection>> {
+        let Some(descriptor) = self.section_descriptor(name).cloned() else {
+            return Ok(None);
+        };
+        read_described_section(&self.path, descriptor).map(Some)
+    }
+
+    pub fn read_required_section(&self, name: &SectionName) -> anyhow::Result<DecodedSection> {
+        self.read_section(name)?
+            .with_context(|| format!("missing required section: {name:?}"))
+    }
+
+    pub fn variants(&self) -> anyhow::Result<Variants> {
+        let section = self.read_required_section(&SectionName::Variants)?;
+        decode_variants_section(&section)
+    }
+
+    pub fn copy_number(&self) -> anyhow::Result<Option<SavanaCN>> {
+        self.read_section(&SectionName::CopyNumber)?
+            .map(|section| decode_copy_number_section(&section))
+            .transpose()
+    }
+
+    pub fn bam_qc(&self) -> anyhow::Result<Option<BamQcPayload>> {
+        self.read_section(&SectionName::BamQc)?
+            .map(|section| decode_bam_qc_section(&section))
+            .transpose()
+    }
+
+    pub fn pipe_qc(&self) -> anyhow::Result<Option<PipeQcPayload>> {
+        self.read_section(&SectionName::PipeQc)?
+            .map(|section| decode_pipe_qc_section(&section))
+            .transpose()
+    }
+
+    pub fn provenance(&self) -> anyhow::Result<Option<ProvenancePayload>> {
+        self.read_section(&SectionName::Provenance)?
+            .map(|section| decode_provenance_section(&section))
+            .transpose()
+    }
+}
+
+impl PandoraSummary {
+    pub fn from_parts(prelude: ContainerPrelude, header: ContainerHeader) -> Self {
+        let sections = header
+            .sections
+            .iter()
+            .map(PandoraSectionSummary::from)
+            .collect();
+
+        Self {
+            prelude,
+            header,
+            sections,
+        }
+    }
+}
+
+impl From<&SectionDescriptor> for PandoraSectionSummary {
+    fn from(section: &SectionDescriptor) -> Self {
+        Self {
+            name: section.name.clone(),
+            kind: section.kind.clone(),
+            offset: section.offset,
+            length: section.length,
+            checksum: section.checksum.clone(),
+            required: section.required,
+        }
+    }
+}
+
+pub fn open_pandora(path: impl AsRef<Path>) -> anyhow::Result<PandoraReader> {
+    PandoraReader::open(path)
+}
+
+pub fn read_pandora_summary(path: impl AsRef<Path>) -> anyhow::Result<PandoraSummary> {
+    let (prelude, header) = read_header(path)?;
+    Ok(PandoraSummary::from_parts(prelude, header))
+}
+
 pub fn encode_header(header: &ContainerHeader) -> anyhow::Result<Vec<u8>> {
     let packed = rmp_serde::to_vec_named(header)?;
     zstd::bulk::compress(&packed, header.compression.level.unwrap_or(3))
@@ -258,7 +393,7 @@ pub fn encode_header(header: &ContainerHeader) -> anyhow::Result<Vec<u8>> {
 }
 
 pub fn decode_header(bytes: &[u8]) -> anyhow::Result<ContainerHeader> {
-    let unpacked = zstd::bulk::decompress(bytes, usize::MAX)
+    let unpacked = zstd::stream::decode_all(Cursor::new(bytes))
         .context("failed to decompress .pandora header")?;
     rmp_serde::from_slice(&unpacked).context("failed to decode .pandora header")
 }
@@ -283,7 +418,7 @@ pub fn write_container(
         .map(encode_pending_section)
         .collect::<anyhow::Result<Vec<_>>>()?;
 
-    let (header_bytes, descriptors) = finalize_header_sections(&header, &stored_sections)?;
+    let (_, descriptors) = finalize_header_sections(&header, &stored_sections)?;
     header.sections = descriptors;
     let header_bytes = encode_header(&header)?;
     let header_checksum = blake3::hash(&header_bytes);
@@ -328,6 +463,13 @@ pub fn read_section(
         return Ok(None);
     };
 
+    read_described_section(path, descriptor).map(Some)
+}
+
+fn read_described_section(
+    path: impl AsRef<Path>,
+    descriptor: SectionDescriptor,
+) -> anyhow::Result<DecodedSection> {
     let mut reader = File::open(path.as_ref())
         .with_context(|| format!("failed to open {}", path.as_ref().display()))?;
     reader.seek(SeekFrom::Start(descriptor.offset))?;
@@ -345,10 +487,10 @@ pub fn read_section(
     }
 
     let payload = decode_section_payload(&stored, &descriptor.compression)?;
-    Ok(Some(DecodedSection {
+    Ok(DecodedSection {
         descriptor,
         payload,
-    }))
+    })
 }
 
 pub fn read_required_section(
@@ -553,7 +695,7 @@ fn decode_section_payload(
 ) -> anyhow::Result<Vec<u8>> {
     match compression.algorithm {
         CompressionAlgorithm::None => Ok(bytes.to_vec()),
-        CompressionAlgorithm::Zstd => zstd::bulk::decompress(bytes, usize::MAX)
+        CompressionAlgorithm::Zstd => zstd::stream::decode_all(Cursor::new(bytes))
             .context("failed to zstd-decompress .pandora section"),
     }
 }
@@ -675,6 +817,31 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn reader_exposes_summary_and_sections() -> anyhow::Result<()> {
+        let path = std::env::temp_dir().join(format!("{}.pandora", uuid::Uuid::new_v4()));
+        let payload = b"hello reader".to_vec();
+        let section = PendingSection::new(
+            SectionName::Provenance,
+            SectionKind::RawBytes,
+            payload.clone(),
+        );
+
+        write_container(&path, test_header(), vec![section])?;
+
+        let reader = PandoraReader::open(&path)?;
+        let summary = reader.summary();
+        assert_eq!(summary.header.sample.sample_id, "sample_001");
+        assert_eq!(summary.sections.len(), 1);
+        assert_eq!(summary.sections[0].name, SectionName::Provenance);
+
+        let read = reader.read_required_section(&SectionName::Provenance)?;
+        assert_eq!(read.payload, payload);
+
+        std::fs::remove_file(path)?;
+        Ok(())
+    }
+
     #[test]
     fn rejects_bad_magic() -> anyhow::Result<()> {
         let bytes = [0u8; PANDORA_PRELUDE_LEN];