|
@@ -271,6 +271,7 @@ pub struct VariantRecordPointer {
|
|
|
pub block_offset: u64,
|
|
pub block_offset: u64,
|
|
|
pub block_length: u64,
|
|
pub block_length: u64,
|
|
|
pub block_uncompressed_length: u64,
|
|
pub block_uncompressed_length: u64,
|
|
|
|
|
+ pub block_compression: CompressionMetadata,
|
|
|
pub record_offset_in_block: u64,
|
|
pub record_offset_in_block: u64,
|
|
|
pub record_length_in_block: u64,
|
|
pub record_length_in_block: u64,
|
|
|
}
|
|
}
|
|
@@ -606,9 +607,8 @@ pub fn write_container(
|
|
|
.map(encode_pending_section)
|
|
.map(encode_pending_section)
|
|
|
.collect::<anyhow::Result<Vec<_>>>()?;
|
|
.collect::<anyhow::Result<Vec<_>>>()?;
|
|
|
|
|
|
|
|
- let (_, descriptors) = finalize_header_sections(&header, &stored_sections)?;
|
|
|
|
|
|
|
+ let (header_bytes, descriptors) = finalize_header_sections(&header, &stored_sections)?;
|
|
|
header.sections = descriptors;
|
|
header.sections = descriptors;
|
|
|
- let header_bytes = encode_header(&header)?;
|
|
|
|
|
let header_checksum = blake3::hash(&header_bytes);
|
|
let header_checksum = blake3::hash(&header_bytes);
|
|
|
|
|
|
|
|
let prelude = ContainerPrelude {
|
|
let prelude = ContainerPrelude {
|
|
@@ -621,6 +621,13 @@ pub fn write_container(
|
|
|
.with_context(|| format!("failed to create {}", path.as_ref().display()))?;
|
|
.with_context(|| format!("failed to create {}", path.as_ref().display()))?;
|
|
|
prelude.write_to(&mut writer)?;
|
|
prelude.write_to(&mut writer)?;
|
|
|
writer.write_all(&header_bytes)?;
|
|
writer.write_all(&header_bytes)?;
|
|
|
|
|
+ if let Some(first_section) = header.sections.first() {
|
|
|
|
|
+ let written = (PANDORA_PRELUDE_LEN + header_bytes.len()) as u64;
|
|
|
|
|
+ if first_section.offset > written {
|
|
|
|
|
+ let padding = vec![0; (first_section.offset - written) as usize];
|
|
|
|
|
+ writer.write_all(&padding)?;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
for stored in stored_sections {
|
|
for stored in stored_sections {
|
|
|
writer.write_all(&stored.payload)?;
|
|
writer.write_all(&stored.payload)?;
|
|
|
}
|
|
}
|
|
@@ -639,6 +646,11 @@ pub fn read_header(path: impl AsRef<Path>) -> anyhow::Result<(ContainerPrelude,
|
|
|
|
|
|
|
|
let header = decode_header(&header_bytes)?;
|
|
let header = decode_header(&header_bytes)?;
|
|
|
validate_header(&header)?;
|
|
validate_header(&header)?;
|
|
|
|
|
+ validate_section_layout(
|
|
|
|
|
+ &header,
|
|
|
|
|
+ (PANDORA_PRELUDE_LEN + header_bytes.len()) as u64,
|
|
|
|
|
+ reader.metadata()?.len(),
|
|
|
|
|
+ )?;
|
|
|
Ok((prelude, header))
|
|
Ok((prelude, header))
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -902,6 +914,7 @@ fn flush_variant_record_block(
|
|
|
block_offset,
|
|
block_offset,
|
|
|
block_length,
|
|
block_length,
|
|
|
block_uncompressed_length,
|
|
block_uncompressed_length,
|
|
|
|
|
+ block_compression: block_compression.clone(),
|
|
|
record_offset_in_block,
|
|
record_offset_in_block,
|
|
|
record_length_in_block,
|
|
record_length_in_block,
|
|
|
});
|
|
});
|
|
@@ -927,6 +940,8 @@ fn encode_variant_index_arrow(
|
|
|
Field::new("block_offset", DataType::UInt64, false),
|
|
Field::new("block_offset", DataType::UInt64, false),
|
|
|
Field::new("block_length", DataType::UInt64, false),
|
|
Field::new("block_length", DataType::UInt64, false),
|
|
|
Field::new("block_uncompressed_length", DataType::UInt64, false),
|
|
Field::new("block_uncompressed_length", DataType::UInt64, false),
|
|
|
|
|
+ Field::new("block_compression", DataType::Utf8, false),
|
|
|
|
|
+ Field::new("block_compression_level", DataType::UInt8, true),
|
|
|
Field::new("record_offset_in_block", DataType::UInt64, false),
|
|
Field::new("record_offset_in_block", DataType::UInt64, false),
|
|
|
Field::new("record_length_in_block", DataType::UInt64, false),
|
|
Field::new("record_length_in_block", DataType::UInt64, false),
|
|
|
Field::new("contig", DataType::Utf8, false),
|
|
Field::new("contig", DataType::Utf8, false),
|
|
@@ -977,6 +992,24 @@ fn encode_variant_index_arrow(
|
|
|
.map(|row| row.pointer.block_uncompressed_length)
|
|
.map(|row| row.pointer.block_uncompressed_length)
|
|
|
.collect::<Vec<_>>(),
|
|
.collect::<Vec<_>>(),
|
|
|
)),
|
|
)),
|
|
|
|
|
+ Arc::new(StringArray::from(
|
|
|
|
|
+ rows.iter()
|
|
|
|
|
+ .map(|row| match row.pointer.block_compression.algorithm {
|
|
|
|
|
+ CompressionAlgorithm::None => "none",
|
|
|
|
|
+ CompressionAlgorithm::Zstd => "zstd",
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect::<Vec<_>>(),
|
|
|
|
|
+ )),
|
|
|
|
|
+ Arc::new(UInt8Array::from(
|
|
|
|
|
+ rows.iter()
|
|
|
|
|
+ .map(|row| {
|
|
|
|
|
+ row.pointer
|
|
|
|
|
+ .block_compression
|
|
|
|
|
+ .level
|
|
|
|
|
+ .and_then(|level| u8::try_from(level).ok())
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect::<Vec<_>>(),
|
|
|
|
|
+ )),
|
|
|
Arc::new(UInt64Array::from(
|
|
Arc::new(UInt64Array::from(
|
|
|
rows.iter()
|
|
rows.iter()
|
|
|
.map(|row| row.pointer.record_offset_in_block)
|
|
.map(|row| row.pointer.record_offset_in_block)
|
|
@@ -1114,6 +1147,7 @@ fn read_variant_record_from_path(
|
|
|
let descriptor = header
|
|
let descriptor = header
|
|
|
.section(&SectionName::VariantRecords)
|
|
.section(&SectionName::VariantRecords)
|
|
|
.context("missing variant_records section")?;
|
|
.context("missing variant_records section")?;
|
|
|
|
|
+
|
|
|
ensure_variant_record_pointer(descriptor, pointer)?;
|
|
ensure_variant_record_pointer(descriptor, pointer)?;
|
|
|
|
|
|
|
|
let mut reader = File::open(path.as_ref())
|
|
let mut reader = File::open(path.as_ref())
|
|
@@ -1146,10 +1180,11 @@ fn read_variant_records_from_path(
|
|
|
pointer.block_length,
|
|
pointer.block_length,
|
|
|
pointer.block_uncompressed_length,
|
|
pointer.block_uncompressed_length,
|
|
|
);
|
|
);
|
|
|
- if !block_cache.contains_key(&block_key) {
|
|
|
|
|
|
|
+ if let std::collections::btree_map::Entry::Vacant(e) = block_cache.entry(block_key) {
|
|
|
let block = read_variant_record_block(&mut reader, descriptor, pointer)?;
|
|
let block = read_variant_record_block(&mut reader, descriptor, pointer)?;
|
|
|
- block_cache.insert(block_key, block);
|
|
|
|
|
|
|
+ e.insert(block);
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
let block = block_cache
|
|
let block = block_cache
|
|
|
.get(&block_key)
|
|
.get(&block_key)
|
|
|
.context("variant record block missing from cache")?;
|
|
.context("variant record block missing from cache")?;
|
|
@@ -1168,8 +1203,8 @@ fn read_variant_record_block(
|
|
|
|
|
|
|
|
let mut block = vec![0; pointer.block_length as usize];
|
|
let mut block = vec![0; pointer.block_length as usize];
|
|
|
reader.read_exact(&mut block)?;
|
|
reader.read_exact(&mut block)?;
|
|
|
- let block = zstd::stream::decode_all(Cursor::new(block))
|
|
|
|
|
- .context("failed to zstd-decompress variant record block")?;
|
|
|
|
|
|
|
+ let block = decode_section_payload(&block, &pointer.block_compression)
|
|
|
|
|
+ .context("failed to decode variant record block")?;
|
|
|
if block.len() as u64 != pointer.block_uncompressed_length {
|
|
if block.len() as u64 != pointer.block_uncompressed_length {
|
|
|
bail!(
|
|
bail!(
|
|
|
"variant record block length mismatch: expected {}, decoded {}",
|
|
"variant record block length mismatch: expected {}, decoded {}",
|
|
@@ -1401,6 +1436,59 @@ fn validate_header(header: &ContainerHeader) -> anyhow::Result<()> {
|
|
|
Ok(())
|
|
Ok(())
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+fn validate_section_layout(
|
|
|
|
|
+ header: &ContainerHeader,
|
|
|
|
|
+ data_start: u64,
|
|
|
|
|
+ file_len: u64,
|
|
|
|
|
+) -> anyhow::Result<()> {
|
|
|
|
|
+ let mut ranges = Vec::with_capacity(header.sections.len());
|
|
|
|
|
+ let mut seen = Vec::with_capacity(header.sections.len());
|
|
|
|
|
+
|
|
|
|
|
+ for section in &header.sections {
|
|
|
|
|
+ let name = format!("{:?}", section.name);
|
|
|
|
|
+ if seen.iter().any(|existing| existing == &name) {
|
|
|
|
|
+ bail!("duplicate .pandora section: {name}");
|
|
|
|
|
+ }
|
|
|
|
|
+ seen.push(name.clone());
|
|
|
|
|
+
|
|
|
|
|
+ if section.offset < data_start {
|
|
|
|
|
+ bail!(
|
|
|
|
|
+ "{name} section starts inside .pandora prelude/header: offset {} < {}",
|
|
|
|
|
+ section.offset,
|
|
|
|
|
+ data_start
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ let end = section
|
|
|
|
|
+ .offset
|
|
|
|
|
+ .checked_add(section.length)
|
|
|
|
|
+ .with_context(|| format!("{name} section offset/length overflow"))?;
|
|
|
|
|
+ if end > file_len {
|
|
|
|
|
+ bail!(
|
|
|
|
|
+ "{name} section extends beyond file: end {} > {}",
|
|
|
|
|
+ end,
|
|
|
|
|
+ file_len
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+ ranges.push((section.offset, end, name));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ ranges.sort_by_key(|(start, _, _)| *start);
|
|
|
|
|
+ for pair in ranges.windows(2) {
|
|
|
|
|
+ let (_, previous_end, previous_name) = &pair[0];
|
|
|
|
|
+ let (next_start, _, next_name) = &pair[1];
|
|
|
|
|
+ if previous_end > next_start {
|
|
|
|
|
+ bail!(
|
|
|
|
|
+ "{previous_name} section overlaps {next_name}: end {} > start {}",
|
|
|
|
|
+ previous_end,
|
|
|
|
|
+ next_start
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ Ok(())
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
fn verify_checksum(bytes: &[u8], expected: &[u8; 32], label: &str) -> anyhow::Result<()> {
|
|
fn verify_checksum(bytes: &[u8], expected: &[u8; 32], label: &str) -> anyhow::Result<()> {
|
|
|
let actual = blake3::hash(bytes);
|
|
let actual = blake3::hash(bytes);
|
|
|
if actual.as_bytes() != expected {
|
|
if actual.as_bytes() != expected {
|
|
@@ -1474,17 +1562,18 @@ fn finalize_header_sections(
|
|
|
header: &ContainerHeader,
|
|
header: &ContainerHeader,
|
|
|
stored_sections: &[StoredSection],
|
|
stored_sections: &[StoredSection],
|
|
|
) -> anyhow::Result<(Vec<u8>, Vec<SectionDescriptor>)> {
|
|
) -> anyhow::Result<(Vec<u8>, Vec<SectionDescriptor>)> {
|
|
|
- let mut descriptors: Vec<SectionDescriptor> = stored_sections
|
|
|
|
|
|
|
+ let base_descriptors: Vec<SectionDescriptor> = stored_sections
|
|
|
.iter()
|
|
.iter()
|
|
|
.map(|stored| stored.descriptor.clone())
|
|
.map(|stored| stored.descriptor.clone())
|
|
|
.collect();
|
|
.collect();
|
|
|
|
|
+ let mut descriptors = base_descriptors.clone();
|
|
|
|
|
+ let mut probe = header.clone();
|
|
|
|
|
+ probe.sections = descriptors.clone();
|
|
|
|
|
+ let mut reserved_header_len = encode_header(&probe)?.len() + 1024;
|
|
|
|
|
|
|
|
- let mut previous_header_len = None;
|
|
|
|
|
for _ in 0..16 {
|
|
for _ in 0..16 {
|
|
|
- let mut probe = header.clone();
|
|
|
|
|
- probe.sections = descriptors.clone();
|
|
|
|
|
- let header_bytes = encode_header(&probe)?;
|
|
|
|
|
- let mut offset = (PANDORA_PRELUDE_LEN + header_bytes.len()) as u64;
|
|
|
|
|
|
|
+ descriptors = base_descriptors.clone();
|
|
|
|
|
+ let mut offset = (PANDORA_PRELUDE_LEN + reserved_header_len) as u64;
|
|
|
|
|
|
|
|
for (descriptor, stored) in descriptors.iter_mut().zip(stored_sections) {
|
|
for (descriptor, stored) in descriptors.iter_mut().zip(stored_sections) {
|
|
|
descriptor.offset = offset;
|
|
descriptor.offset = offset;
|
|
@@ -1492,15 +1581,17 @@ fn finalize_header_sections(
|
|
|
offset += descriptor.length;
|
|
offset += descriptor.length;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- if previous_header_len == Some(header_bytes.len()) {
|
|
|
|
|
- let mut final_header = header.clone();
|
|
|
|
|
- final_header.sections = descriptors.clone();
|
|
|
|
|
- return Ok((encode_header(&final_header)?, descriptors));
|
|
|
|
|
|
|
+ let mut final_header = header.clone();
|
|
|
|
|
+ final_header.sections = descriptors.clone();
|
|
|
|
|
+ let header_bytes = encode_header(&final_header)?;
|
|
|
|
|
+ if header_bytes.len() <= reserved_header_len {
|
|
|
|
|
+ return Ok((header_bytes, descriptors));
|
|
|
}
|
|
}
|
|
|
- previous_header_len = Some(header_bytes.len());
|
|
|
|
|
|
|
+
|
|
|
|
|
+ reserved_header_len = header_bytes.len() + 1024;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- bail!("failed to stabilize .pandora header length after offset assignment")
|
|
|
|
|
|
|
+ bail!("failed to reserve enough .pandora header space after 16 passes")
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
@@ -1567,6 +1658,24 @@ mod tests {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ fn test_variant(pos: u32, reference: &str, alternative: &str) -> anyhow::Result<Variant> {
|
|
|
|
|
+ let vcf: crate::variant::vcf_variant::VcfVariant = format!(
|
|
|
|
|
+ "chr1\t{}\t.\t{}\t{}\t60\tPASS\t.",
|
|
|
|
|
+ pos + 1,
|
|
|
|
|
+ reference,
|
|
|
|
|
+ alternative
|
|
|
|
|
+ )
|
|
|
|
|
+ .parse()?;
|
|
|
|
|
+ Ok(Variant {
|
|
|
|
|
+ hash: vcf.hash,
|
|
|
|
|
+ position: vcf.position.clone(),
|
|
|
|
|
+ reference: vcf.reference.clone(),
|
|
|
|
|
+ alternative: vcf.alternative.clone(),
|
|
|
|
|
+ vcf_variants: vec![vcf],
|
|
|
|
|
+ annotations: Vec::new(),
|
|
|
|
|
+ })
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
#[test]
|
|
#[test]
|
|
|
fn writes_and_reads_raw_section() -> anyhow::Result<()> {
|
|
fn writes_and_reads_raw_section() -> anyhow::Result<()> {
|
|
|
let path = std::env::temp_dir().join(format!("{}.pandora", uuid::Uuid::new_v4()));
|
|
let path = std::env::temp_dir().join(format!("{}.pandora", uuid::Uuid::new_v4()));
|
|
@@ -1634,6 +1743,74 @@ mod tests {
|
|
|
Ok(())
|
|
Ok(())
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ #[test]
|
|
|
|
|
+ fn reads_lazy_variant_records() -> anyhow::Result<()> {
|
|
|
|
|
+ let path = std::env::temp_dir().join(format!("{}.pandora", uuid::Uuid::new_v4()));
|
|
|
|
|
+ let variants = Variants {
|
|
|
|
|
+ data: vec![test_variant(99, "A", "T")?, test_variant(199, "G", "C")?],
|
|
|
|
|
+ };
|
|
|
|
|
+ let (records, _index) = variant_records_and_index_sections(&variants)?;
|
|
|
|
|
+
|
|
|
|
|
+ write_container(&path, test_header(), vec![records])?;
|
|
|
|
|
+ let pointer = VariantRecordPointer {
|
|
|
|
|
+ variant_idx: 1,
|
|
|
|
|
+ block_offset: 0,
|
|
|
|
|
+ block_length: read_header(&path)?
|
|
|
|
|
+ .1
|
|
|
|
|
+ .section(&SectionName::VariantRecords)
|
|
|
|
|
+ .context("missing variant_records section")?
|
|
|
|
|
+ .length,
|
|
|
|
|
+ block_uncompressed_length: bitcode::encode(&variants.data[0]).len() as u64
|
|
|
|
|
+ + bitcode::encode(&variants.data[1]).len() as u64,
|
|
|
|
|
+ block_compression: CompressionMetadata::default(),
|
|
|
|
|
+ record_offset_in_block: bitcode::encode(&variants.data[0]).len() as u64,
|
|
|
|
|
+ record_length_in_block: bitcode::encode(&variants.data[1]).len() as u64,
|
|
|
|
|
+ };
|
|
|
|
|
+
|
|
|
|
|
+ let reader = PandoraReader::open(&path)?;
|
|
|
|
|
+ let read = reader.read_variant_record(&pointer)?;
|
|
|
|
|
+ assert_eq!(read, variants.data[1]);
|
|
|
|
|
+
|
|
|
|
|
+ std::fs::remove_file(path)?;
|
|
|
|
|
+ Ok(())
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ #[test]
|
|
|
|
|
+ fn rejects_overlapping_section_layout() {
|
|
|
|
|
+ let mut header = test_header();
|
|
|
|
|
+ header.sections = vec![
|
|
|
|
|
+ SectionDescriptor {
|
|
|
|
|
+ name: SectionName::Variants,
|
|
|
|
|
+ kind: SectionKind::RawBytes,
|
|
|
|
|
+ compression: CompressionMetadata::default(),
|
|
|
|
|
+ encryption: EncryptionAlgorithm::None,
|
|
|
|
|
+ offset: 128,
|
|
|
|
|
+ length: 16,
|
|
|
|
|
+ nonce_b64: None,
|
|
|
|
|
+ tag_b64: None,
|
|
|
|
|
+ checksum: "blake3:unused".to_string(),
|
|
|
|
|
+ schema_hash: None,
|
|
|
|
|
+ required: true,
|
|
|
|
|
+ },
|
|
|
|
|
+ SectionDescriptor {
|
|
|
|
|
+ name: SectionName::Provenance,
|
|
|
|
|
+ kind: SectionKind::RawBytes,
|
|
|
|
|
+ compression: CompressionMetadata::default(),
|
|
|
|
|
+ encryption: EncryptionAlgorithm::None,
|
|
|
|
|
+ offset: 140,
|
|
|
|
|
+ length: 8,
|
|
|
|
|
+ nonce_b64: None,
|
|
|
|
|
+ tag_b64: None,
|
|
|
|
|
+ checksum: "blake3:unused".to_string(),
|
|
|
|
|
+ schema_hash: None,
|
|
|
|
|
+ required: true,
|
|
|
|
|
+ },
|
|
|
|
|
+ ];
|
|
|
|
|
+
|
|
|
|
|
+ let err = validate_section_layout(&header, 64, 200).unwrap_err();
|
|
|
|
|
+ assert!(err.to_string().contains("overlaps"));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
#[test]
|
|
#[test]
|
|
|
fn validation_reports_corrupt_section() -> anyhow::Result<()> {
|
|
fn validation_reports_corrupt_section() -> anyhow::Result<()> {
|
|
|
let path = std::env::temp_dir().join(format!("{}.pandora", uuid::Uuid::new_v4()));
|
|
let path = std::env::temp_dir().join(format!("{}.pandora", uuid::Uuid::new_v4()));
|