1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
// SPDX-License-Identifier: Apache-2.0
// Copyright (c) 2024-present, fjall-rs
// Copyright (c) 2026-present, Structured World Foundation
use crate::io::{LittleEndian, ReadBytesExt};
#[cfg(not(feature = "std"))]
use crate::io::{Read, Write};
use crate::{
CompressionType, InternalValue, KeyRange, SeqNo, Slice,
checksum::ChecksumType,
coding::{Decode, Encode},
comparator::default_comparator,
table::{Block, DataBlock},
vlog::BlobFileId,
};
#[cfg(feature = "std")]
use std::io::{Read, Write};
/// Looks up the property `$name` in the metadata block `$block` (using the
/// comparator `$cmp`) and decodes its value as a little-endian `u64`.
///
/// The expansion uses `?`, so it must be invoked inside a function whose
/// error type can absorb the lookup and decode errors. A property that is
/// not present yields `Error::InvalidHeader("BlobFileMeta")`.
macro_rules! read_u64 {
    ($block:expr, $name:expr, $cmp:expr) => {{
        // Point lookup at the highest sequence number.
        let entry = $block
            .point_read($name, SeqNo::MAX, $cmp)?
            .ok_or(crate::Error::InvalidHeader("BlobFileMeta"))?;
        // The integer decoder consumes from a mutable byte-slice cursor.
        let mut cursor = &entry.value[..];
        cursor.read_u64::<LittleEndian>()?
    }};
}
/// Looks up the property `$name` in the metadata block `$block` (using the
/// comparator `$cmp`) and decodes its value as a little-endian `u128`.
///
/// The expansion uses `?`, so it must be invoked inside a function whose
/// error type can absorb the lookup and decode errors. A property that is
/// not present yields `Error::InvalidHeader("BlobFileMeta")`.
macro_rules! read_u128 {
    ($block:expr, $name:expr, $cmp:expr) => {{
        // Point lookup at the highest sequence number.
        let entry = $block
            .point_read($name, SeqNo::MAX, $cmp)?
            .ok_or(crate::Error::InvalidHeader("BlobFileMeta"))?;
        // The integer decoder consumes from a mutable byte-slice cursor.
        let mut cursor = &entry.value[..];
        cursor.read_u128::<LittleEndian>()?
    }};
}
/// Magic bytes that prefix an encoded blob file metadata section.
///
/// `Metadata::encode_into` writes them before the meta block, and
/// `Metadata::from_slice` rejects input that does not start with them.
pub const METADATA_HEADER_MAGIC: &[u8] = b"META";
/// Descriptive metadata of a single blob file.
///
/// Serialized by `Metadata::encode_into` and parsed back by
/// `Metadata::from_slice`.
// Note: `pub` for crate-internal use; parent `vlog` module is NOT
// exported from `lib.rs`, so this struct is not public API.
#[derive(Debug, PartialEq, Eq)]
pub struct Metadata {
    /// ID of the blob file (stored under the `id` property)
    pub id: BlobFileId,

    /// Blob file format version (3 = V3, 4 = V4 with header CRC).
    pub version: u8,

    /// Creation timestamp of the blob file
    // NOTE(review): unit and epoch are not visible in this file — confirm with the writer.
    pub created_at: u128,

    /// Number of KV-pairs in the blob file
    pub item_count: u64,

    /// compressed size in bytes (on disk) (without metadata or trailer)
    ///
    /// Stored under the `file_size` property.
    pub total_compressed_bytes: u64,

    /// true size in bytes (if no compression were used)
    ///
    /// Stored under the `uncompressed_size` property.
    pub total_uncompressed_bytes: u64,

    /// Key range
    ///
    /// Stored as the two properties `key#min` and `key#max`.
    pub key_range: KeyRange,

    /// Compression type used for all blobs in this file
    pub compression: CompressionType,
}
impl Metadata {
pub fn encode_into<W: Write>(&self, writer: &mut W) -> crate::Result<()> {
fn meta(key: &str, value: &[u8]) -> InternalValue {
InternalValue::from_components(key, value, 0, crate::ValueType::Value)
}
// Write header
writer.write_all(METADATA_HEADER_MAGIC)?;
#[rustfmt::skip]
let meta_items = [
meta("blob_file_version", &[self.version]),
meta("checksum_type", &[u8::from(ChecksumType::Xxh3)]),
meta("compression", &self.compression.encode_into_vec()),
meta("crate_version", env!("CARGO_PKG_VERSION").as_bytes()),
meta("created_at", &self.created_at.to_le_bytes()),
meta("file_size", &self.total_compressed_bytes.to_le_bytes()),
meta("id", &self.id.to_le_bytes()),
meta("item_count", &self.item_count.to_le_bytes()),
meta("key#max", self.key_range.max()),
meta("key#min", self.key_range.min()),
meta("uncompressed_size", &self.total_uncompressed_bytes.to_le_bytes()),
];
// NOTE: Just to make sure the items are definitely sorted
#[cfg(debug_assertions)]
{
let is_sorted = meta_items.iter().is_sorted_by_key(|kv| &kv.key);
assert!(is_sorted, "meta items not sorted correctly");
}
// TODO: no binary index
let buf = DataBlock::encode_into_vec(&meta_items, 1, 0.0)?;
// Blob files are currently not encrypted at all: neither this metadata
// block nor the blob value frames/contents are covered by block-level
// encryption. The metadata contains structural fields (version, counts,
// compression) plus key_range (min/max keys), which may leak key
// prefixes, and KV separation can leave large values on disk in
// plaintext. Full blob-level encryption (metadata + contents) is
// planned as a follow-up to block-level encryption.
// TODO: encrypt blob metadata and blob contents when an encryption
// provider is threaded through the blob file writer/reader paths.
Block::write_into(
writer,
&buf,
crate::table::block::BlockIdentity {
// Mirror the table-meta bootstrapping exception:
// blob meta is read via from_slice BEFORE the
// reader knows the BlobFileId (the id is what
// from_slice produces). For write/read AAD to
// match once #251 wires AAD, the writer must
// use the same table_id=0 the reader uses.
// Pre-#251 the value is accepted-but-not-consumed,
// but choosing the asymmetric `self.id` here would
// bake a permanent decrypt-mismatch into any
// encrypted blob meta we ever write.
table_id: 0,
block_type: crate::table::block::BlockType::Meta,
dict_id: 0,
window_log: 0,
},
// Blob-meta blocks are always uncompressed and currently
// never encrypted (see TODO above on blob-level
// encryption). Plain transform here.
&crate::table::block::BlockTransform::PLAIN,
)?;
Ok(())
}
pub fn from_slice(slice: &Slice) -> crate::Result<Self> {
let reader = &mut &slice[..];
// Check header
let mut magic = [0u8; METADATA_HEADER_MAGIC.len()];
reader.read_exact(&mut magic)?;
if magic != METADATA_HEADER_MAGIC {
return Err(crate::Error::InvalidHeader("BlobFileMeta"));
}
// TODO: Block::from_slice
let block = Block::from_reader(
reader,
crate::table::block::BlockIdentity {
// from_slice constructs Self by parsing the blob
// meta — self.id is what THIS read produces, not
// available beforehand. table_id=0 here mirrors
// the table-meta parse path: cross-blob swap
// detection still relies on the meta payload's
// own id field being part of the verified body.
table_id: 0,
block_type: crate::table::block::BlockType::Meta,
dict_id: 0,
window_log: 0,
},
// Blob-meta blocks are always uncompressed and currently
// never encrypted (see TODO above on blob-level
// encryption). Plain transform here.
&crate::table::block::BlockTransform::PLAIN,
)?;
let block = DataBlock::new(block);
// Metadata keys are always lexicographic, so use the default comparator.
let cmp = default_comparator();
let version = {
let bytes = block
.point_read(b"blob_file_version", SeqNo::MAX, &cmp)?
.ok_or(crate::Error::InvalidHeader("BlobFileMeta"))?;
*bytes
.value
.first()
.ok_or(crate::Error::InvalidHeader("BlobFileMeta"))?
};
// Reject unknown versions early to catch corrupted or
// future-incompatible metadata before downstream code
// misinterprets header fields.
match version {
3 | 4 => {}
_ => return Err(crate::Error::InvalidHeader("BlobFileMeta")),
}
let id = read_u64!(block, b"id", &cmp);
let created_at = read_u128!(block, b"created_at", &cmp);
let item_count = read_u64!(block, b"item_count", &cmp);
let file_size = read_u64!(block, b"file_size", &cmp);
let total_uncompressed_bytes = read_u64!(block, b"uncompressed_size", &cmp);
let compression = {
let bytes = block
.point_read(b"compression", SeqNo::MAX, &cmp)?
.ok_or(crate::Error::InvalidHeader("BlobFileMeta"))?;
let mut bytes = &bytes.value[..];
CompressionType::decode_from(&mut bytes)?
};
let key_range = KeyRange::new((
block
.point_read(b"key#min", SeqNo::MAX, &cmp)?
.ok_or(crate::Error::InvalidHeader("BlobFileMeta"))?
.value,
block
.point_read(b"key#max", SeqNo::MAX, &cmp)?
.ok_or(crate::Error::InvalidHeader("BlobFileMeta"))?
.value,
));
Ok(Self {
id,
version,
created_at,
compression,
item_count,
total_compressed_bytes: file_size,
total_uncompressed_bytes,
key_range,
})
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use test_log::test;

    /// Builds the fixed, well-formed metadata value shared by the tests that
    /// go through `encode_into`.
    fn sample_metadata() -> Metadata {
        Metadata {
            id: 0,
            version: 4,
            created_at: 1_234_567_890,
            compression: CompressionType::None,
            item_count: 100,
            total_compressed_bytes: 1024,
            total_uncompressed_bytes: 2048,
            key_range: KeyRange::new((b"a".into(), b"z".into())),
        }
    }

    #[test]
    fn test_blob_file_meta_truncated_returns_err() {
        // Input that ends right after the magic header must return Err, not panic
        let magic_only = Slice::from(METADATA_HEADER_MAGIC.to_vec());
        assert!(Metadata::from_slice(&magic_only).is_err());
    }

    /// Build a metadata block that is structurally valid but omits a required
    /// property (`compression`). `from_slice` must return `Err`, not panic.
    #[test]
    #[expect(clippy::unwrap_used)]
    fn test_blob_file_meta_missing_field_returns_err() {
        use crate::table::block::BlockType;
        use std::io::Write;

        fn prop(name: &str, payload: &[u8]) -> InternalValue {
            InternalValue::from_components(name, payload, 0, crate::ValueType::Value)
        }

        // Every required property EXCEPT `compression`
        #[rustfmt::skip]
        let props = [
            prop("blob_file_version", &[4u8]),
            prop("checksum_type", &[u8::from(ChecksumType::Xxh3)]),
            // "compression" intentionally omitted
            prop("crate_version", env!("CARGO_PKG_VERSION").as_bytes()),
            prop("created_at", &1_234_567_890u128.to_le_bytes()),
            prop("file_size", &1024u64.to_le_bytes()),
            prop("id", &0u64.to_le_bytes()),
            prop("item_count", &100u64.to_le_bytes()),
            prop("key#max", b"z"),
            prop("key#min", b"a"),
            prop("uncompressed_size", &2048u64.to_le_bytes()),
        ];

        let block_payload = DataBlock::encode_into_vec(&props, 1, 0.0).unwrap();

        // Assemble magic + block by hand, the same layout `encode_into` produces.
        let mut raw = Vec::new();
        raw.write_all(METADATA_HEADER_MAGIC).unwrap();
        Block::write_into(
            &mut raw,
            &block_payload,
            crate::table::block::BlockIdentity::for_test(0, BlockType::Meta),
            &crate::table::block::BlockTransform::PLAIN,
        )
        .unwrap();

        let result = Metadata::from_slice(&Slice::from(raw));
        assert!(
            matches!(result, Err(crate::Error::InvalidHeader("BlobFileMeta"))),
            "expected Err(InvalidHeader(\"BlobFileMeta\")), got {result:?}",
        );
    }

    /// Regression test for #195: corrupt the block trailer (last bytes) of a
    /// valid blob file metadata block. `from_slice` must return `Err`, not
    /// panic. The checksum layer catches byte-level corruption before trailer
    /// parsing; the `point_read` → `ok_or` error path for missing/malformed
    /// fields is exercised by `test_blob_file_meta_missing_field_returns_err`.
    #[test]
    #[expect(clippy::unwrap_used)]
    fn test_blob_file_meta_corrupted_trailer_returns_err() {
        let mut raw = Vec::new();
        sample_metadata().encode_into(&mut raw).unwrap();

        // Flip every bit in the last 4 bytes of the block (trailer region).
        // This triggers a ChecksumMismatch in `Block::from_reader` — the
        // first defense layer. The deeper point_read → ok_or path (which
        // previously could panic) is covered separately by
        // `test_blob_file_meta_missing_field_returns_err`, where the block
        // is structurally valid but omits a required property.
        let total = raw.len();
        assert!(total >= 4, "buffer too small for corruption");
        #[expect(clippy::indexing_slicing, reason = "length checked above")]
        for byte in &mut raw[total - 4..] {
            *byte ^= 0xFF;
        }

        let result = Metadata::from_slice(&Slice::from(raw));
        assert!(
            result.is_err(),
            "corrupted trailer must produce Err, got {result:?}",
        );
    }

    #[test]
    #[expect(clippy::unwrap_used)]
    fn test_blob_file_meta_roundtrip() {
        let original = sample_metadata();

        let mut raw = Vec::new();
        original.encode_into(&mut raw).unwrap();

        let decoded = Metadata::from_slice(&Slice::from(raw)).unwrap();
        assert_eq!(original, decoded);
    }
}