net/adapter/net/dataforts/blob/blob_ref.rs
1//! `BlobRef` — typed event-payload that points at content stored
2//! out-of-band in a [`super::BlobAdapter`] backend.
3//!
//! # Wire encoding (v0.15 Small + v0.2 Manifest + v0.3 Tree)
5//!
6//! Every encoded form starts with the four-byte magic
7//! `[0xB0, 0xB1, 0xB2, 0xB3]` followed by a one-byte version
8//! discriminant:
9//!
10//! | Version | Variant | Body layout |
11//! |---|---|---|
12//! | `0x01` | [`BlobRef::Small`] | `[hash 32][size 8][uri …]` — hand-rolled byte layout, v0.15-compatible. |
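//! | `0x02` | [`BlobRef::Manifest`] | `[postcard manifest body …]` — chunked content. See [`BLOB_MANIFEST_BODY_VERSION`]. |
//! | `0x03` | [`BlobRef::Tree`] | `[postcard tree body …]` — hierarchical manifest addressed by a root hash. See [`BLOB_TREE_BODY_VERSION`]. |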
14//!
15//! No length prefix on the Small URI — the encoded form lives inside
16//! an event payload whose length is already framed by the substrate.
17//! The Manifest body is postcard-encoded with its own 1-byte version
18//! prefix (`BLOB_MANIFEST_BODY_VERSION`) so the manifest schema can
19//! evolve independently of the outer wire discriminant.
20//!
21//! Inline event payloads carry no magic (back-compat); the substrate
22//! distinguishes by peeking at the first four bytes. The magic is
23//! four bytes (rather than one) because a single discriminator byte
24//! (`0xB0`) collides with arbitrary binary payloads — protobuf wire
25//! bytes, MessagePack, compressed data — and a false match would
26//! silently re-interpret an inline payload as a `BlobRef` whose
27//! decoded URI gets fetched against the channel's adapter. A
28//! four-byte magic with three high-bit bytes is statistically
29//! unreachable in valid UTF-8 text and rare enough in binary that
30//! decode-then-verify catches the rest.
31//!
32//! # Chunking
33//!
34//! Payloads above [`BLOB_CHUNK_SIZE_BYTES`] (4 MiB) split into
35//! fixed-size chunks; the resulting [`BlobRef::Manifest`] carries the
36//! chunk list. Below the threshold, payloads ride as a single
37//! [`BlobRef::Small`]. Chunk size is fixed across versions for
38//! determinism: two callers chunking the same N-byte payload produce
39//! identical [`ChunkRef`] lists, which deduplicates at the
40//! replication layer for free. See [`chunk_payload`] for the
41//! algorithm + [`byte_range_to_chunks`] for the inverse (resolving a
42//! byte range to chunk indices for partial fetches).
43
44use serde::{Deserialize, Serialize};
45
46use super::error::BlobError;
47
/// Four-byte magic at offset 0 of every encoded [`BlobRef`].
///
/// Distinguishes blob-ref payloads from inline event payloads on
/// every `read_range` / `tail` output. A single-byte discriminator
/// collides too readily with arbitrary binary payloads; four
/// high-bit bytes are improbable enough that decode-then-verify
/// handles the residual cases without misinterpreting attacker-
/// controlled bytes as a `BlobRef`.
pub const BLOB_REF_MAGIC: [u8; 4] = [0xB0, 0xB1, 0xB2, 0xB3];

/// Backwards-compatible single-byte discriminator for code paths
/// that only peek at byte 0 (e.g. the bindings' `EventPayload`
/// classification). Always equal to `BLOB_REF_MAGIC[0]`.
///
/// This is only a cheap "might be a blob" pre-check: the decoder
/// still requires the full four-byte magic, so a match here does
/// not imply the payload decodes as a [`BlobRef`].
pub const BLOB_REF_DISCRIMINATOR: u8 = BLOB_REF_MAGIC[0];
63
/// Outer wire version byte for [`BlobRef::Small`]. v1 is the only
/// Small version this build encodes; the version byte is reserved
/// so future migrations (e.g. BLAKE3-256 → BLAKE3-512, or a
/// multi-hash format) can land without breaking the decoder.
pub const BLOB_REF_VERSION_V1: u8 = 0x01;

/// Outer wire version byte for [`BlobRef::Manifest`]. Lands in v0.2
/// alongside the mesh-native blob storage track. The manifest body
/// schema evolves independently via [`BLOB_MANIFEST_BODY_VERSION`].
pub const BLOB_REF_VERSION_V2_MANIFEST: u8 = 0x02;

/// Outer wire version byte for [`BlobRef::Tree`]. Lands in v0.3
/// alongside the hierarchical-manifest terabyte-scale track. The
/// tree body schema evolves independently via
/// [`BLOB_TREE_BODY_VERSION`].
pub const BLOB_REF_VERSION_V3_TREE: u8 = 0x03;
79
/// Schema version stamped as the first field of the postcard-encoded
/// tree body. It is bumped independently of the outer wire
/// discriminator ([`BLOB_REF_VERSION_V3_TREE`]) so the tree body can
/// evolve without consuming another outer version number.
pub const BLOB_TREE_BODY_VERSION: u8 = 1;

/// Upper bound, in bytes, on the postcard-encoded `BlobRef::Tree`
/// body. A tree body holds only a handful of fixed-size fields plus
/// a URI, so 1 KiB is generous; the cap exists to bound what the
/// decoder will allocate before per-field validation runs.
pub const BLOB_REF_TREE_BODY_MAX_BYTES: usize = 1 << 10;

/// Upper bound on `BlobRef::Tree::total_size`: 2^57 bytes (128 PiB).
/// The bound stops a malicious or buggy publisher from stamping
/// `total_size = u64::MAX` and having that value flow into
/// `Vec::with_capacity` allocations downstream.
///
/// NOTE(review): the previous comment derived this as
/// "128 × 128 × 128 × 128 × 4 MiB", but that product is 2^50
/// (1 PiB). The 2^57 value corresponds to 128^5 × 4 MiB —
/// presumably four levels of fanout-128 nodes over leaves that each
/// reference up to 128 chunks; confirm against `blob_tree`.
pub const BLOB_TREE_MAX_TOTAL_SIZE: u64 = 1u64 << 57;
98
/// Schema version stamped as the first field of the postcard-encoded
/// manifest body. It is bumped independently of the outer wire
/// discriminator ([`BLOB_REF_VERSION_V2_MANIFEST`]) so the manifest
/// schema can evolve (extra fields, new encodings, etc.) without
/// consuming another outer version number.
pub const BLOB_MANIFEST_BODY_VERSION: u8 = 1;

/// Smallest possible encoded [`BlobRef::Small`]: every fixed-width
/// field with an empty URI.
pub const BLOB_REF_SMALL_HEADER_LEN: usize = 4 // magic
    + 1 // outer version byte
    + 32 // content hash
    + 8; // little-endian size
109
/// Upper bound on any single blob payload: 16 GiB. Applies to both
/// the `size` field on a [`BlobRef::Small`] and the `total_size`
/// field on a [`BlobRef::Manifest`].
///
/// Without it, a malicious or buggy publisher could stamp
/// `size = u64::MAX`, which then propagates into
/// `vec![0u8; len as usize]` allocations on the fetch path — OOM on
/// 64-bit targets, silent truncation to short reads on 32-bit.
/// 16 GiB is generous for legitimate multi-GB blobs while still
/// bounded; sites that need more should validate on construction
/// and consider streaming (the BlobAdapter trait's streaming hooks
/// are the right escape valve).
pub const BLOB_REF_MAX_SIZE: u64 = 16u64 << 30;

/// Fixed chunk size for chunked storage: 4 MiB, the locked threshold
/// per [`DATAFORTS_BLOB_STORAGE_PLAN.md`]. It is fixed across
/// versions for determinism — two callers chunking the same N-byte
/// payload produce identical [`ChunkRef`] lists, which deduplicates
/// at the replication layer for free. Payloads at or below this
/// threshold ride as a single [`BlobRef::Small`]; above it, the
/// chunker emits a [`BlobRef::Manifest`].
///
/// [`DATAFORTS_BLOB_STORAGE_PLAN.md`]: ../../../../../docs/plans/DATAFORTS_BLOB_STORAGE_PLAN.md
pub const BLOB_CHUNK_SIZE_BYTES: u64 = 4u64 << 20;

/// Upper bound on the number of chunks a single
/// [`BlobRef::Manifest`] may carry. For scale: 4 GiB / 4 MiB is
/// 1024 chunks, and 16 GiB / 4 MiB (the `BLOB_REF_MAX_SIZE` cap) is
/// 4096 chunks, so 8192 leaves headroom. The cap protects the
/// decoder from a malicious peer stamping `chunks: Vec<…>` with tens
/// of millions of entries (the postcard varint length prefix would
/// otherwise admit up to `u32::MAX` and OOM the decoder).
pub const BLOB_MANIFEST_MAX_CHUNKS: usize = 8 * 1024;
141
/// Replication encoding for a chunked blob. v0.2 only supports
/// `Replicated`; `ReedSolomon { k, m }` is reserved on the wire so
/// v0.3 can land erasure coding without a manifest format change.
///
/// Wire-encoded via postcard; the unit variant `Replicated`
/// occupies 1 byte (varint discriminant 0), `ReedSolomon { k, m }`
/// occupies 3 bytes (varint 1 + two `u8`).
///
/// The variant order determines the postcard discriminants, so
/// reordering or inserting variants ahead of existing ones changes
/// the wire format.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Encoding {
    /// N identical replicas of every chunk; the only encoding
    /// supported in v0.2. Survives loss of `replication_factor - 1`
    /// nodes per chunk; correlated failures depend on placement
    /// tags. See `DATAFORTS_BLOB_STORAGE_PLAN.md` § W-2.
    Replicated,
    /// Reed–Solomon `(k, m)` erasure coding. **Reserved for v0.3**;
    /// constructing this variant is allowed for forward-compat
    /// testing, but the v0.2 store / fetch paths reject it with a
    /// `BlobError::UnsupportedEncoding` variant added in PR-2.
    ReedSolomon {
        /// Data chunks per group.
        k: u8,
        /// Parity chunks per group.
        m: u8,
    },
}
167
/// Reference to a single chunk within a [`BlobRef::Manifest`].
/// Each chunk is a content-addressed RedEX file in the mesh-native
/// storage path (v0.2). The hash is BLAKE3-256 of the chunk's raw
/// bytes; `size` is the chunk's payload length in bytes (≤
/// [`BLOB_CHUNK_SIZE_BYTES`]; only the last chunk may be smaller).
///
/// Serialized with postcard as part of the manifest body, so the
/// field order below is part of the wire format.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ChunkRef {
    /// BLAKE3-256 of the chunk's canonical bytes.
    pub hash: [u8; 32],
    /// Chunk payload length in bytes. Bounded above by
    /// [`BLOB_CHUNK_SIZE_BYTES`]; only the last chunk in a manifest
    /// may be strictly smaller.
    pub size: u32,
}
182
/// Postcard-encoded tree body. Lives inside the
/// [`BlobRef::Tree`] wire form after the four-byte magic +
/// version discriminator. The body itself is tiny — fixed-size
/// fields plus the URI; no embedded chunk list (the chunks live at
/// the referenced [`TreeNode`](super::blob_tree::TreeNode) leaves).
///
/// Field order is part of the wire format (postcard encodes fields
/// in declaration order) and must stay in step with
/// [`TreeBodyRef`].
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
struct TreeBody {
    /// Body schema version; bumps independently of the outer
    /// `BlobRef::Tree` discriminant.
    body_version: u8,
    /// Adapter-routed URI. For the mesh-native path this is
    /// `mesh://<hex-of-root_hash>`; external adapters use their
    /// own scheme.
    uri: String,
    /// Replication / erasure encoding for the chunks. Tree
    /// inherits the same enum surface as Manifest.
    encoding: Encoding,
    /// BLAKE3 hash of the root
    /// [`TreeNode`](super::blob_tree::TreeNode) body. The
    /// substrate fetches the root, verifies its bytes hash to
    /// this value, then walks down.
    root_hash: [u8; 32],
    /// Total reconstructed payload size in bytes. The decoder
    /// trusts this value (same trust model as Manifest's
    /// `total_size`); the tree walk cross-checks against the
    /// sum of leaf chunk sizes at the bottom of each descent.
    total_size: u64,
    /// Tree depth. [`BlobRef::tree`] only accepts values in
    /// `1..=`[`super::blob_tree::MAX_TREE_DEPTH`] (documented as 4),
    /// and the public [`BlobRef::Tree`] variant describes `1` as the
    /// root-as-leaf case.
    ///
    /// NOTE(review): this comment previously described `0` as the
    /// single-leaf tree and `1` as root + leaves, which conflicts
    /// with the constructor rejecting `depth == 0`; confirm the
    /// intended numbering against `blob_tree`.
    depth: u8,
}
216
/// Borrow-only sibling of [`TreeBody`]. Same rationale as
/// [`ManifestBodyRef`]: it lets postcard walk the body without
/// first cloning the URI. (Tree bodies don't have a chunk vector,
/// so the saving here is just the `String` clone, but the symmetry
/// keeps the two `encoded_len` arms consistent.)
///
/// Fields must mirror [`TreeBody`] in name, type and order so the
/// serialized bytes match.
#[derive(Serialize)]
struct TreeBodyRef<'a> {
    /// Body schema version; see [`BLOB_TREE_BODY_VERSION`].
    body_version: u8,
    /// Borrowed adapter-routed URI.
    uri: &'a str,
    /// Replication / erasure encoding for the chunks.
    encoding: Encoding,
    /// Hash of the root tree node.
    root_hash: [u8; 32],
    /// Total payload size in bytes.
    total_size: u64,
    /// Tree depth.
    depth: u8,
}
231
/// Postcard-encoded manifest body. Lives inside the
/// [`BlobRef::Manifest`] wire form after the four-byte magic +
/// version discriminator.
///
/// Field order is part of the wire format (postcard encodes fields
/// in declaration order) and must stay in step with
/// [`ManifestBodyRef`].
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
struct ManifestBody {
    /// Body schema version; bumps independently of the outer
    /// `BlobRef::Manifest` discriminant.
    body_version: u8,
    /// Adapter-routed URI — e.g. `mesh://<hex>`, `s3://bucket/key`.
    /// The scheme picks the adapter; the rest is passed through
    /// opaque.
    uri: String,
    /// Replication / erasure encoding for the chunks.
    encoding: Encoding,
    /// Ordered chunk list. Position N in the vector corresponds to
    /// the byte range `[N * BLOB_CHUNK_SIZE_BYTES, …)`.
    chunks: Vec<ChunkRef>,
    /// Sum of every chunk's `size`. Cached for cheap `BlobRef::size`
    /// without iterating the vector; validated on decode to match
    /// the iterated sum.
    total_size: u64,
}
254
/// Borrow-only sibling of [`ManifestBody`]. Postcard's serializer
/// walks fields in declaration order and encodes `String` /
/// `Vec<T>` identically to `&str` / `&[T]` (same length-prefix +
/// bytes shape), so this type's `Serialize` output is byte-for-
/// byte identical to [`ManifestBody`]'s with the same field
/// values.
///
/// [`BlobRef::encoded_len`] uses it to *measure* the body without
/// cloning the URI and chunk vector — a clone there costs 36 bytes
/// per chunk on every sizing call, which adds up on large
/// manifests.
///
/// Fields must mirror [`ManifestBody`] in name, type and order;
/// only `Serialize` is derived, so this type cannot be decoded
/// into.
#[derive(Serialize)]
struct ManifestBodyRef<'a> {
    /// Body schema version; see [`BLOB_MANIFEST_BODY_VERSION`].
    body_version: u8,
    /// Borrowed adapter-routed URI.
    uri: &'a str,
    /// Replication / erasure encoding for the chunks.
    encoding: Encoding,
    /// Borrowed ordered chunk list.
    chunks: &'a [ChunkRef],
    /// Sum of every chunk's `size`.
    total_size: u64,
}
276
/// Pointer to content stored out-of-band. Round-trips through every
/// binding as a typed value via the public fields; the substrate
/// uses [`Self::encode`] / [`Self::decode`] for the wire form.
///
/// Three variants:
///
/// - [`BlobRef::Small`] — payload ≤ [`BLOB_CHUNK_SIZE_BYTES`]; a
///   single content-addressed blob. Wire-compatible with v0.15.
/// - [`BlobRef::Manifest`] — payload > [`BLOB_CHUNK_SIZE_BYTES`];
///   carries an ordered [`ChunkRef`] list plus an [`Encoding`]
///   discriminant. Each chunk is itself a content-addressed Small
///   blob stored independently via the adapter; the manifest exists
///   only as the routing structure that ties them together.
/// - [`BlobRef::Tree`] — hierarchical manifest (v0.3). Carries only
///   a root hash, a total size and a depth; the chunk references
///   live at the leaves of the tree reachable from `root_hash`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum BlobRef {
    /// Single-blob path. Wire-compatible with v0.15.
    Small {
        /// Encoding version byte. Always [`BLOB_REF_VERSION_V1`] on
        /// fresh constructions; decode preserves the on-wire value so
        /// upstream code can detect forward-compat scenarios.
        version: u8,
        /// Adapter-routed URI — e.g. `s3://bucket/key`,
        /// `ipfs://<cid>`, `file:///abs/path`, `mesh://<hex>`. The
        /// scheme picks the adapter; the rest is passed through
        /// opaque.
        uri: String,
        /// BLAKE3-256 hash of the canonical bytes the URI resolves
        /// to. The substrate verifies this on every successful
        /// fetch; an adversarial adapter cannot fake-verify because
        /// the check runs in the substrate, not the adapter.
        hash: [u8; 32],
        /// Size of the resolved content in bytes. Range-fetch
        /// callers use this to bound their reads; the verification
        /// path uses it to short-circuit obviously-wrong payloads.
        size: u64,
    },
    /// Chunked-blob path (v0.2). Wire version
    /// [`BLOB_REF_VERSION_V2_MANIFEST`]; body schema version
    /// [`BLOB_MANIFEST_BODY_VERSION`].
    Manifest {
        /// Outer wire discriminator (always
        /// [`BLOB_REF_VERSION_V2_MANIFEST`] on fresh constructions).
        version: u8,
        /// Adapter-routed URI.
        uri: String,
        /// Replication / erasure encoding for the chunks.
        encoding: Encoding,
        /// Ordered chunk list. Empty manifests are rejected on
        /// decode (use [`BlobRef::Small`] for zero-byte payloads).
        chunks: Vec<ChunkRef>,
        /// Total payload size = sum of every chunk's `size`. Cached
        /// for cheap `BlobRef::size`; validated on decode against
        /// the iterated sum.
        total_size: u64,
    },
    /// Tree-manifest path (v0.3). Wire version
    /// [`BLOB_REF_VERSION_V3_TREE`]; body schema version
    /// [`BLOB_TREE_BODY_VERSION`]. Lifts the addressable size
    /// from the v0.2 16 GiB cap to 128 PiB at fanout 128 + depth
    /// 4 + 4 MiB chunks. The blob's actual chunk references live
    /// at the [`TreeNode`](super::blob_tree::TreeNode) leaves,
    /// reachable via the tree walk starting from `root_hash`.
    Tree {
        /// Outer wire discriminator (always
        /// [`BLOB_REF_VERSION_V3_TREE`] on fresh constructions).
        version: u8,
        /// Adapter-routed URI. For the mesh-native path this is
        /// `mesh://<hex-of-root_hash>`; external adapters use
        /// their own scheme.
        uri: String,
        /// Replication / erasure encoding (inherits the same
        /// enum surface as `Manifest`).
        encoding: Encoding,
        /// BLAKE3 hash of the root
        /// [`TreeNode`](super::blob_tree::TreeNode) body — the
        /// substrate fetches this hash to start the tree walk.
        root_hash: [u8; 32],
        /// Total payload size in bytes (sum of every leaf
        /// chunk's `size` across the whole tree). Cached for
        /// cheap [`Self::size`].
        total_size: u64,
        /// Tree depth — `1` for root-as-leaf, up to
        /// [`super::blob_tree::MAX_TREE_DEPTH`].
        depth: u8,
    },
}
363
364impl BlobRef {
365 // -----------------------------------------------------------
366 // Construction
367 // -----------------------------------------------------------
368
369 /// Construct a v1 [`BlobRef::Small`]. The caller is responsible
370 /// for the `hash` matching the content at `uri` — the substrate
371 /// verifies on fetch, not on construction.
372 pub fn small(uri: impl Into<String>, hash: [u8; 32], size: u64) -> Self {
373 Self::Small {
374 version: BLOB_REF_VERSION_V1,
375 uri: uri.into(),
376 hash,
377 size,
378 }
379 }
380
381 /// Backwards-compatible alias for [`Self::small`]. Pre-v0.2
382 /// callers used `BlobRef::new(uri, hash, size)` which produced
383 /// the single-blob shape; the new enum surface uses
384 /// [`Self::small`] for the same shape.
385 #[deprecated(
386 since = "0.18.0",
387 note = "use `BlobRef::small` for explicit-variant construction"
388 )]
389 pub fn new(uri: impl Into<String>, hash: [u8; 32], size: u64) -> Self {
390 Self::small(uri, hash, size)
391 }
392
393 /// Construct a v2 [`BlobRef::Manifest`] from a chunk list. The
394 /// caller is responsible for each chunk's hash matching the
395 /// stored chunk; the substrate verifies on fetch.
396 pub fn manifest(
397 uri: impl Into<String>,
398 encoding: Encoding,
399 chunks: Vec<ChunkRef>,
400 ) -> Result<Self, BlobError> {
401 if chunks.is_empty() {
402 return Err(BlobError::Decode(
403 "manifest must carry at least one chunk".to_owned(),
404 ));
405 }
406 if chunks.len() > BLOB_MANIFEST_MAX_CHUNKS {
407 return Err(BlobError::Decode(format!(
408 "manifest chunk count {} exceeds cap {}",
409 chunks.len(),
410 BLOB_MANIFEST_MAX_CHUNKS
411 )));
412 }
413 validate_chunk_sizes(&chunks)?;
414 let total_size: u64 = chunks.iter().map(|c| c.size as u64).sum();
415 if total_size > BLOB_REF_MAX_SIZE {
416 return Err(BlobError::Decode(format!(
417 "manifest total_size {} exceeds cap {}",
418 total_size, BLOB_REF_MAX_SIZE
419 )));
420 }
421 Ok(Self::Manifest {
422 version: BLOB_REF_VERSION_V2_MANIFEST,
423 uri: uri.into(),
424 encoding,
425 chunks,
426 total_size,
427 })
428 }
429
430 /// Construct a v3 [`BlobRef::Tree`]. The caller is responsible
431 /// for `root_hash` matching the BLAKE3 of the root
432 /// [`TreeNode`](super::blob_tree::TreeNode)'s encoded bytes,
433 /// and for `total_size` matching the sum of every leaf
434 /// chunk's `size` across the tree — the substrate verifies the
435 /// hash on tree-walk descent and cross-checks total_size at
436 /// the leaves.
437 ///
438 /// Validates:
439 /// - `total_size > 0` (use [`BlobRef::Small`] for zero-byte payloads).
440 /// - `total_size <= BLOB_TREE_MAX_TOTAL_SIZE` (~128 PiB ceiling).
441 /// - `depth` in `1..=MAX_TREE_DEPTH`.
442 pub fn tree(
443 uri: impl Into<String>,
444 encoding: Encoding,
445 root_hash: [u8; 32],
446 total_size: u64,
447 depth: u8,
448 ) -> Result<Self, BlobError> {
449 if total_size == 0 {
450 return Err(BlobError::Decode(
451 "tree total_size must be > 0; use BlobRef::Small for empty payloads".to_owned(),
452 ));
453 }
454 if total_size > BLOB_TREE_MAX_TOTAL_SIZE {
455 return Err(BlobError::Decode(format!(
456 "tree total_size {} exceeds cap {}",
457 total_size, BLOB_TREE_MAX_TOTAL_SIZE
458 )));
459 }
460 if depth == 0 || depth > super::blob_tree::MAX_TREE_DEPTH {
461 return Err(BlobError::Decode(format!(
462 "tree depth {} out of range 1..={}",
463 depth,
464 super::blob_tree::MAX_TREE_DEPTH
465 )));
466 }
467 Ok(Self::Tree {
468 version: BLOB_REF_VERSION_V3_TREE,
469 uri: uri.into(),
470 encoding,
471 root_hash,
472 total_size,
473 depth,
474 })
475 }
476
477 // -----------------------------------------------------------
478 // Accessors (uniform across variants)
479 // -----------------------------------------------------------
480
481 /// Outer wire version discriminator —
482 /// [`BLOB_REF_VERSION_V1`] for Small, [`BLOB_REF_VERSION_V2_MANIFEST`]
483 /// for Manifest, [`BLOB_REF_VERSION_V3_TREE`] for Tree.
484 pub fn version(&self) -> u8 {
485 match self {
486 Self::Small { version, .. }
487 | Self::Manifest { version, .. }
488 | Self::Tree { version, .. } => *version,
489 }
490 }
491
492 /// Adapter-routed URI. The scheme picks the adapter; the rest is
493 /// passed through opaque.
494 pub fn uri(&self) -> &str {
495 match self {
496 Self::Small { uri, .. } | Self::Manifest { uri, .. } | Self::Tree { uri, .. } => {
497 uri.as_str()
498 }
499 }
500 }
501
502 /// Total payload size in bytes — `size` for Small,
503 /// `total_size` for Manifest, `total_size` for Tree.
504 pub fn size(&self) -> u64 {
505 match self {
506 Self::Small { size, .. } => *size,
507 Self::Manifest { total_size, .. } | Self::Tree { total_size, .. } => *total_size,
508 }
509 }
510
511 /// `true` if this is a chunked-blob manifest (flat
512 /// [`Self::Manifest`] or hierarchical [`Self::Tree`]).
513 pub fn is_chunked(&self) -> bool {
514 matches!(self, Self::Manifest { .. } | Self::Tree { .. })
515 }
516
517 /// `true` if this is a hierarchical-manifest tree.
518 pub fn is_tree(&self) -> bool {
519 matches!(self, Self::Tree { .. })
520 }
521
522 /// The single content hash for a Small blob; `None` for a
523 /// Manifest or Tree (manifests reference many chunks, each
524 /// with its own hash — use [`Self::chunks`] for Manifest or
525 /// [`Self::tree_root_hash`] for Tree).
526 pub fn small_hash(&self) -> Option<&[u8; 32]> {
527 match self {
528 Self::Small { hash, .. } => Some(hash),
529 Self::Manifest { .. } | Self::Tree { .. } => None,
530 }
531 }
532
533 /// The root [`TreeNode`](super::blob_tree::TreeNode) hash for
534 /// a [`Self::Tree`]; `None` for [`Self::Small`] or
535 /// [`Self::Manifest`].
536 pub fn tree_root_hash(&self) -> Option<&[u8; 32]> {
537 match self {
538 Self::Tree { root_hash, .. } => Some(root_hash),
539 Self::Small { .. } | Self::Manifest { .. } => None,
540 }
541 }
542
543 /// The tree depth for a [`Self::Tree`]; `None` for
544 /// [`Self::Small`] or [`Self::Manifest`].
545 pub fn tree_depth(&self) -> Option<u8> {
546 match self {
547 Self::Tree { depth, .. } => Some(*depth),
548 Self::Small { .. } | Self::Manifest { .. } => None,
549 }
550 }
551
552 /// The chunk list for a Manifest; empty slice for a Small or
553 /// Tree (Tree chunks live at the leaf [`TreeNode`](super::blob_tree::TreeNode)s,
554 /// reachable via tree walk — not flattened here).
555 pub fn chunks(&self) -> &[ChunkRef] {
556 match self {
557 Self::Small { .. } | Self::Tree { .. } => &[],
558 Self::Manifest { chunks, .. } => chunks,
559 }
560 }
561
562 /// The encoding tag for a Manifest or Tree; `None` for a
563 /// Small (Small has no encoding because the bytes are stored
564 /// directly).
565 pub fn encoding(&self) -> Option<Encoding> {
566 match self {
567 Self::Small { .. } => None,
568 Self::Manifest { encoding, .. } | Self::Tree { encoding, .. } => Some(*encoding),
569 }
570 }
571
572 // -----------------------------------------------------------
573 // Wire format
574 // -----------------------------------------------------------
575
576 /// Encoded length in bytes. The `Small` variant is O(1) —
577 /// header size plus URI length. The `Manifest` / `Tree`
578 /// variants now use [`postcard::experimental::serialized_size`]
579 /// to *measure* without allocating, per dataforts perf #174.
580 ///
581 /// Pre-fix these variants called `self.encode().len()` — a
582 /// full postcard alloc-encode of the entire body just to
583 /// read `.len()` off the temporary and drop it. For a
584 /// 1000-chunk Manifest, that was 64 KB+ allocated and
585 /// thrown away per `encoded_len` call. Workloads that
586 /// pair `encoded_len` + `encode` (typical sizing-then-emit
587 /// pattern) paid 2× the encode cost.
588 ///
589 /// Post-fix `encoded_len` walks the structure measuring
590 /// without allocating the output buffer — same byte count,
591 /// no `Vec` churn.
592 #[expect(
593 clippy::expect_used,
594 reason = "ManifestBodyRef / TreeBodyRef are composed of sized Serialize types — `postcard::experimental::serialized_size` is infallible against them; mirrors the existing `#[expect]` on `encode()`"
595 )]
596 pub fn encoded_len(&self) -> usize {
597 match self {
598 Self::Small { uri, .. } => BLOB_REF_SMALL_HEADER_LEN + uri.len(),
599 Self::Manifest {
600 uri,
601 encoding,
602 chunks,
603 total_size,
604 ..
605 } => {
606 // Per cubic-dev-ai code review: use the borrow-
607 // only [`ManifestBodyRef`] so the sizing walk
608 // doesn't `chunks.clone()` (36 bytes/chunk × N for
609 // a manifest of N chunks — kilobytes of pointless
610 // allocation per `encoded_len` call on large
611 // manifests) or `uri.clone()`. Postcard's
612 // serializer encodes `&str` and `&[T]` identically
613 // to `String` / `Vec<T>` so the walked byte count
614 // matches `encode()`'s output exactly.
615 let body = ManifestBodyRef {
616 body_version: BLOB_MANIFEST_BODY_VERSION,
617 uri: uri.as_str(),
618 encoding: *encoding,
619 chunks: chunks.as_slice(),
620 total_size: *total_size,
621 };
622 let body_len = postcard::experimental::serialized_size(&body)
623 .expect("manifest body postcard-encodes infallibly");
624 BLOB_REF_MAGIC.len() + 1 + body_len
625 }
626 Self::Tree {
627 uri,
628 encoding,
629 root_hash,
630 total_size,
631 depth,
632 ..
633 } => {
634 // Symmetric with the Manifest arm — borrow rather
635 // than clone the URI for the sizing walk.
636 let body = TreeBodyRef {
637 body_version: BLOB_TREE_BODY_VERSION,
638 uri: uri.as_str(),
639 encoding: *encoding,
640 root_hash: *root_hash,
641 total_size: *total_size,
642 depth: *depth,
643 };
644 let body_len = postcard::experimental::serialized_size(&body)
645 .expect("tree body postcard-encodes infallibly");
646 BLOB_REF_MAGIC.len() + 1 + body_len
647 }
648 }
649 }
650
651 /// Emit the wire form. See the module-level table for the
652 /// byte layout per variant.
653 #[expect(
654 clippy::expect_used,
655 reason = "ManifestBody / TreeBody are composed of sized Serialize types; postcard alloc-encoding is infallible against them"
656 )]
657 pub fn encode(&self) -> Vec<u8> {
658 match self {
659 Self::Small {
660 version,
661 uri,
662 hash,
663 size,
664 } => {
665 let mut buf = Vec::with_capacity(BLOB_REF_SMALL_HEADER_LEN + uri.len());
666 buf.extend_from_slice(&BLOB_REF_MAGIC);
667 buf.push(*version);
668 buf.extend_from_slice(hash);
669 buf.extend_from_slice(&size.to_le_bytes());
670 buf.extend_from_slice(uri.as_bytes());
671 buf
672 }
673 Self::Manifest {
674 version,
675 uri,
676 encoding,
677 chunks,
678 total_size,
679 } => {
680 let body = ManifestBody {
681 body_version: BLOB_MANIFEST_BODY_VERSION,
682 uri: uri.clone(),
683 encoding: *encoding,
684 chunks: chunks.clone(),
685 total_size: *total_size,
686 };
687 // Postcard alloc-encode is infallible against
688 // `Serialize` types whose subobjects are all sized;
689 // every field here is sized. The Result-bearing
690 // signature is for fallible writers (e.g. fixed-size
691 // buffers); we use the heap allocator.
692 let body_bytes = postcard::to_allocvec(&body)
693 .expect("manifest body postcard-encodes infallibly");
694 let mut buf = Vec::with_capacity(5 + body_bytes.len());
695 buf.extend_from_slice(&BLOB_REF_MAGIC);
696 buf.push(*version);
697 buf.extend_from_slice(&body_bytes);
698 buf
699 }
700 Self::Tree {
701 version,
702 uri,
703 encoding,
704 root_hash,
705 total_size,
706 depth,
707 } => {
708 let body = TreeBody {
709 body_version: BLOB_TREE_BODY_VERSION,
710 uri: uri.clone(),
711 encoding: *encoding,
712 root_hash: *root_hash,
713 total_size: *total_size,
714 depth: *depth,
715 };
716 let body_bytes =
717 postcard::to_allocvec(&body).expect("tree body postcard-encodes infallibly");
718 let mut buf = Vec::with_capacity(5 + body_bytes.len());
719 buf.extend_from_slice(&BLOB_REF_MAGIC);
720 buf.push(*version);
721 buf.extend_from_slice(&body_bytes);
722 buf
723 }
724 }
725 }
726
727 /// Decode a wire form. Returns `Ok(None)` when the first four
728 /// bytes are not [`BLOB_REF_MAGIC`] (caller should treat the
729 /// payload as inline). Returns `Err` only when the magic matches
730 /// but the rest of the frame is malformed.
731 pub fn decode(bytes: &[u8]) -> Result<Option<Self>, BlobError> {
732 if bytes.len() < BLOB_REF_MAGIC.len() || bytes[..BLOB_REF_MAGIC.len()] != BLOB_REF_MAGIC {
733 return Ok(None);
734 }
735 if bytes.len() < 5 {
736 return Err(BlobError::Decode(format!(
737 "frame too short for version byte: {} bytes",
738 bytes.len()
739 )));
740 }
741 let version = bytes[4];
742 match version {
743 BLOB_REF_VERSION_V1 => Self::decode_small(version, &bytes[5..]).map(Some),
744 BLOB_REF_VERSION_V2_MANIFEST => Self::decode_manifest(version, &bytes[5..]).map(Some),
745 BLOB_REF_VERSION_V3_TREE => Self::decode_tree(version, &bytes[5..]).map(Some),
746 other => Err(BlobError::UnsupportedVersion(other)),
747 }
748 }
749
750 fn decode_small(version: u8, rest: &[u8]) -> Result<Self, BlobError> {
751 // rest layout: [hash 32][size 8][uri …]
752 if rest.len() < 40 {
753 return Err(BlobError::Decode(format!(
754 "small frame too short: {} bytes after version, need at least 40",
755 rest.len()
756 )));
757 }
758 let mut hash = [0u8; 32];
759 hash.copy_from_slice(&rest[0..32]);
760 let mut size_bytes = [0u8; 8];
761 size_bytes.copy_from_slice(&rest[32..40]);
762 let size = u64::from_le_bytes(size_bytes);
763 if size > BLOB_REF_MAX_SIZE {
764 return Err(BlobError::Decode(format!(
765 "blob size {} exceeds cap {}",
766 size, BLOB_REF_MAX_SIZE
767 )));
768 }
769 let uri = std::str::from_utf8(&rest[40..])
770 .map_err(|e| BlobError::Decode(format!("URI not UTF-8: {}", e)))?
771 .to_owned();
772 Ok(Self::Small {
773 version,
774 uri,
775 hash,
776 size,
777 })
778 }
779
780 fn decode_manifest(version: u8, rest: &[u8]) -> Result<Self, BlobError> {
781 // Bound the wire size BEFORE postcard allocates the
782 // `Vec<ChunkRef>`. Otherwise a malicious peer can stamp
783 // the chunks-length varint up to ~u32::MAX, forcing a
784 // multi-MB Vec allocation before our post-decode cap
785 // check at line ~25 below fires. The legitimate upper
786 // bound for a well-formed manifest body is:
787 //
788 // uri (≤ 8 KiB after the substrate's outer length cap)
789 // + 1 byte encoding discriminant
790 // + 1 byte body_version
791 // + ≤ 10 bytes total_size varint
792 // + ≤ 5 bytes chunks-len varint (covers u32::MAX, far above our cap)
793 // + BLOB_MANIFEST_MAX_CHUNKS chunks × ≤ 50 bytes max
794 // each (32 hash + 5 size varint + 10 offset varint +
795 // framing slack)
796 //
797 // Round up generously to a static upper bound. Anything
798 // past this is by construction malformed; reject without
799 // touching the allocator.
800 const MAX_MANIFEST_WIRE_BYTES: usize = 8192 + 32 + BLOB_MANIFEST_MAX_CHUNKS * 50;
801 if rest.len() > MAX_MANIFEST_WIRE_BYTES {
802 return Err(BlobError::Decode(format!(
803 "manifest body {} bytes exceeds legitimate upper bound {}",
804 rest.len(),
805 MAX_MANIFEST_WIRE_BYTES
806 )));
807 }
808 let body: ManifestBody = postcard::from_bytes(rest)
809 .map_err(|e| BlobError::Decode(format!("manifest body decode failed: {}", e)))?;
810 if body.body_version != BLOB_MANIFEST_BODY_VERSION {
811 return Err(BlobError::UnsupportedVersion(body.body_version));
812 }
813 if body.chunks.is_empty() {
814 return Err(BlobError::Decode(
815 "manifest must carry at least one chunk".to_owned(),
816 ));
817 }
818 if body.chunks.len() > BLOB_MANIFEST_MAX_CHUNKS {
819 return Err(BlobError::Decode(format!(
820 "manifest chunk count {} exceeds cap {}",
821 body.chunks.len(),
822 BLOB_MANIFEST_MAX_CHUNKS
823 )));
824 }
825 validate_chunk_sizes(&body.chunks)?;
826 // Validate the cached total_size matches the iterated sum —
827 // a malicious peer could otherwise lie about total_size to
828 // mislead range math without flipping any chunk's hash.
829 let iterated_sum: u64 = body.chunks.iter().map(|c| c.size as u64).sum();
830 if iterated_sum != body.total_size {
831 return Err(BlobError::Decode(format!(
832 "manifest total_size mismatch: declared {}, iterated {}",
833 body.total_size, iterated_sum
834 )));
835 }
836 if body.total_size > BLOB_REF_MAX_SIZE {
837 return Err(BlobError::Decode(format!(
838 "manifest total_size {} exceeds cap {}",
839 body.total_size, BLOB_REF_MAX_SIZE
840 )));
841 }
842 Ok(Self::Manifest {
843 version,
844 uri: body.uri,
845 encoding: body.encoding,
846 chunks: body.chunks,
847 total_size: body.total_size,
848 })
849 }
850
851 fn decode_tree(version: u8, rest: &[u8]) -> Result<Self, BlobError> {
852 // Bound the wire size BEFORE postcard allocates. The Tree
853 // body carries only fixed-size fields (root_hash, sizes,
854 // depth) plus a URI string — 1 KiB is generous for the
855 // legitimate shape and bounds malicious oversize payloads
856 // before the URI's String allocation runs.
857 if rest.len() > BLOB_REF_TREE_BODY_MAX_BYTES {
858 return Err(BlobError::Decode(format!(
859 "tree body {} bytes exceeds cap {}",
860 rest.len(),
861 BLOB_REF_TREE_BODY_MAX_BYTES
862 )));
863 }
864 let body: TreeBody = postcard::from_bytes(rest)
865 .map_err(|e| BlobError::Decode(format!("tree body decode failed: {}", e)))?;
866 if body.body_version != BLOB_TREE_BODY_VERSION {
867 return Err(BlobError::UnsupportedVersion(body.body_version));
868 }
869 if body.total_size == 0 {
870 return Err(BlobError::Decode(
871 "tree total_size must be > 0; empty payloads use BlobRef::Small".to_owned(),
872 ));
873 }
874 if body.total_size > BLOB_TREE_MAX_TOTAL_SIZE {
875 return Err(BlobError::Decode(format!(
876 "tree total_size {} exceeds cap {}",
877 body.total_size, BLOB_TREE_MAX_TOTAL_SIZE
878 )));
879 }
880 if body.depth == 0 || body.depth > super::blob_tree::MAX_TREE_DEPTH {
881 return Err(BlobError::Decode(format!(
882 "tree depth {} out of range 1..={}",
883 body.depth,
884 super::blob_tree::MAX_TREE_DEPTH
885 )));
886 }
887 // Defensive depth-vs-size lower bound. A well-formed depth=N
888 // tree (N >= 2) requires AT LEAST TREE_FANOUT^(N-1) bytes
889 // to be productive — depth=2 needs > FANOUT (128) bytes
890 // for an Internal root to be useful, depth=3 needs >
891 // FANOUT^2 = 16 384, depth=4 needs > FANOUT^3 ≈ 2 M. A
892 // manifest claiming depth=4 + total_size=1 is structurally
893 // malformed (a single chunk can't justify three internal
894 // levels) — reject before any walk traffic happens. The
895 // walker's depth-shortening check would catch this too,
896 // but at the cost of a round trip to fetch the root.
897 if body.depth >= 2 {
898 let exp = body.depth as u32 - 1;
899 // Compute FANOUT^exp using checked_pow; on overflow
900 // the depth is at the cap and the lower bound is
901 // satisfied by any reasonable total_size, so skip the
902 // check in that direction.
903 if let Some(min_size) = (super::blob_tree::TREE_FANOUT as u64).checked_pow(exp) {
904 if body.total_size < min_size {
905 return Err(BlobError::Decode(format!(
906 "tree depth {} requires total_size >= {} (TREE_FANOUT^(depth-1)); got {}",
907 body.depth, min_size, body.total_size
908 )));
909 }
910 }
911 }
912 Ok(Self::Tree {
913 version,
914 uri: body.uri,
915 encoding: body.encoding,
916 root_hash: body.root_hash,
917 total_size: body.total_size,
918 depth: body.depth,
919 })
920 }
921
922 /// Verify `bytes` resolves to this `BlobRef`'s hash. Only
923 /// defined for [`BlobRef::Small`] — call sites holding a
924 /// Manifest verify chunk-by-chunk via [`Self::chunks`]; call
925 /// sites holding a Tree verify via tree-walk descent (each
926 /// [`TreeNode`](super::blob_tree::TreeNode)'s bytes hash to
927 /// the parent's stored child-hash entry).
928 /// Returns `Ok(())` on match,
929 /// `Err(BlobError::HashMismatch)` otherwise, `Err(BlobError::Decode)`
930 /// on a Manifest / Tree. Runs inside the substrate, not the
931 /// adapter, so an adversarial adapter cannot fake-verify.
932 pub fn verify(&self, bytes: &[u8]) -> Result<(), BlobError> {
933 match self {
934 Self::Small { hash, .. } => {
935 let actual: [u8; 32] = blake3::hash(bytes).into();
936 if actual == *hash {
937 Ok(())
938 } else {
939 Err(BlobError::HashMismatch {
940 expected: *hash,
941 actual,
942 })
943 }
944 }
945 Self::Manifest { .. } => Err(BlobError::Decode(
946 "verify is undefined on a Manifest variant; verify chunks individually".to_owned(),
947 )),
948 Self::Tree { .. } => Err(BlobError::Decode(
949 "verify is undefined on a Tree variant; verify chunks individually via tree walk"
950 .to_owned(),
951 )),
952 }
953 }
954}
955
956// -------------------------------------------------------------------
957// Chunking + range math (pure logic — no I/O)
958// -------------------------------------------------------------------
959
960/// Reject manifests where any chunk size disagrees with the substrate's
961/// fixed [`BLOB_CHUNK_SIZE_BYTES`] stride. Every non-last chunk MUST
962/// be exactly `BLOB_CHUNK_SIZE_BYTES`; the last chunk MAY be smaller
963/// but must be at least one byte. `byte_range_to_chunks` and the
964/// adapter's range slicer rely on the fixed stride; an attacker-stamped
965/// `{size: u32::MAX}` chunk would otherwise either return wrong-window
966/// bytes silently or trip a panicking slice in the consumer.
967fn validate_chunk_sizes(chunks: &[ChunkRef]) -> Result<(), BlobError> {
968 let last = chunks.len() - 1;
969 for (i, chunk) in chunks.iter().enumerate() {
970 let size = chunk.size as u64;
971 if i < last {
972 if size != BLOB_CHUNK_SIZE_BYTES {
973 return Err(BlobError::Decode(format!(
974 "manifest non-last chunk {} has size {} (expected {})",
975 i, size, BLOB_CHUNK_SIZE_BYTES
976 )));
977 }
978 } else {
979 if size == 0 || size > BLOB_CHUNK_SIZE_BYTES {
980 return Err(BlobError::Decode(format!(
981 "manifest last chunk {} has size {} (expected 1..={})",
982 i, size, BLOB_CHUNK_SIZE_BYTES
983 )));
984 }
985 }
986 }
987 Ok(())
988}
989
990/// Outcome of [`chunk_payload`] — either the payload fit below the
991/// threshold (single Small blob shape) or it split into N chunks
992/// plus a manifest.
993#[derive(Clone, Debug)]
994pub enum ChunkedPayload<'a> {
995 /// Payload size ≤ [`BLOB_CHUNK_SIZE_BYTES`]; ride as a single
996 /// content-addressed blob. The caller stores `payload` against
997 /// the resulting hash; the [`BlobRef`] returned by
998 /// [`Self::into_blob_ref`] points at that single content.
999 Inline {
1000 /// BLAKE3 of the whole payload.
1001 hash: [u8; 32],
1002 /// Payload bytes (zero-copy slice into the caller's buffer).
1003 payload: &'a [u8],
1004 },
1005 /// Payload size > [`BLOB_CHUNK_SIZE_BYTES`]; split into N
1006 /// 4-MiB chunks (last chunk may be smaller). The caller stores
1007 /// each chunk independently against its hash; the
1008 /// [`BlobRef::Manifest`] returned by [`Self::into_blob_ref`]
1009 /// references all of them.
1010 Chunked {
1011 /// Each chunk's `(hash, byte-slice)`. Slices are zero-copy
1012 /// views into the caller's buffer.
1013 chunks: Vec<(ChunkRef, &'a [u8])>,
1014 /// Total payload length = sum of chunk lengths.
1015 total_size: u64,
1016 },
1017}
1018
1019impl<'a> ChunkedPayload<'a> {
1020 /// Total payload size — `payload.len()` for Inline, sum of chunk
1021 /// sizes for Chunked.
1022 pub fn size(&self) -> u64 {
1023 match self {
1024 Self::Inline { payload, .. } => payload.len() as u64,
1025 Self::Chunked { total_size, .. } => *total_size,
1026 }
1027 }
1028
1029 /// Convert into the corresponding [`BlobRef`] given the
1030 /// adapter-routed URI. Inline produces [`BlobRef::Small`];
1031 /// Chunked produces [`BlobRef::Manifest`] with the supplied
1032 /// encoding. Returns `Err` only when the chunked variant exceeds
1033 /// [`BLOB_MANIFEST_MAX_CHUNKS`] (defense-in-depth — the chunker
1034 /// already enforces the cap).
1035 pub fn into_blob_ref(
1036 self,
1037 uri: impl Into<String>,
1038 encoding: Encoding,
1039 ) -> Result<BlobRef, BlobError> {
1040 match self {
1041 Self::Inline { hash, payload } => Ok(BlobRef::small(uri, hash, payload.len() as u64)),
1042 Self::Chunked { chunks, .. } => {
1043 let chunk_refs: Vec<ChunkRef> = chunks.into_iter().map(|(r, _)| r).collect();
1044 BlobRef::manifest(uri, encoding, chunk_refs)
1045 }
1046 }
1047 }
1048}
1049
1050/// Split a byte payload into either a single Inline blob or N
1051/// fixed-size chunks, content-addressing each part. Locked decisions:
1052///
1053/// - Threshold is a hard `≤` comparison: payload at exactly
1054/// [`BLOB_CHUNK_SIZE_BYTES`] rides as Inline (the chunker
1055/// wouldn't have anything to split into), payloads strictly larger
1056/// split into N = `ceil(len / BLOB_CHUNK_SIZE_BYTES)` chunks.
1057/// - Chunk size is fixed at [`BLOB_CHUNK_SIZE_BYTES`]; the algorithm
1058/// is deterministic — two callers chunking the same `bytes`
1059/// produce identical hash lists.
1060/// - Empty payload produces an Inline result with `payload = &[]`
1061/// and the BLAKE3-of-empty hash.
1062///
1063/// Rejects payloads larger than [`BLOB_REF_MAX_SIZE`] or whose chunk
1064/// count would exceed [`BLOB_MANIFEST_MAX_CHUNKS`].
1065pub fn chunk_payload(bytes: &[u8]) -> Result<ChunkedPayload<'_>, BlobError> {
1066 let len = bytes.len() as u64;
1067 if len > BLOB_REF_MAX_SIZE {
1068 return Err(BlobError::Decode(format!(
1069 "payload size {} exceeds cap {}",
1070 len, BLOB_REF_MAX_SIZE
1071 )));
1072 }
1073 if len <= BLOB_CHUNK_SIZE_BYTES {
1074 let hash: [u8; 32] = blake3::hash(bytes).into();
1075 return Ok(ChunkedPayload::Inline {
1076 hash,
1077 payload: bytes,
1078 });
1079 }
1080 let chunk_size = BLOB_CHUNK_SIZE_BYTES as usize;
1081 let chunk_count = bytes.len().div_ceil(chunk_size);
1082 if chunk_count > BLOB_MANIFEST_MAX_CHUNKS {
1083 return Err(BlobError::Decode(format!(
1084 "payload requires {} chunks, exceeds cap {}",
1085 chunk_count, BLOB_MANIFEST_MAX_CHUNKS
1086 )));
1087 }
1088 let mut chunks = Vec::with_capacity(chunk_count);
1089 for slice in bytes.chunks(chunk_size) {
1090 let hash: [u8; 32] = blake3::hash(slice).into();
1091 chunks.push((
1092 ChunkRef {
1093 hash,
1094 size: slice.len() as u32,
1095 },
1096 slice,
1097 ));
1098 }
1099 Ok(ChunkedPayload::Chunked {
1100 chunks,
1101 total_size: len,
1102 })
1103}
1104
/// One chunk-range request emitted by [`byte_range_to_chunks`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ChunkRangeRequest {
    /// Index into the manifest's chunk list.
    pub chunk_index: usize,
    /// Byte offset *within the chunk* where the requested range
    /// starts (always 0 for non-boundary chunks; non-zero only for
    /// the first chunk of a partial fetch).
    pub start_in_chunk: u32,
    /// Byte offset *within the chunk* where the requested range
    /// ends (exclusive). Equals the chunk's `size` for non-boundary
    /// chunks; smaller only for the last chunk of a partial fetch.
    pub end_in_chunk: u32,
}

impl ChunkRangeRequest {
    /// Length of the requested slice within this chunk.
    ///
    /// The fields are public, so a hand-built request may have
    /// `start_in_chunk > end_in_chunk`. [`Self::is_empty`] already
    /// treats that shape as empty; `len` agrees by saturating to `0`
    /// instead of underflowing.
    pub fn len(&self) -> u32 {
        self.end_in_chunk.saturating_sub(self.start_in_chunk)
    }

    /// `true` if the requested slice is empty, i.e. the start offset
    /// is at or past the end offset.
    pub fn is_empty(&self) -> bool {
        self.start_in_chunk >= self.end_in_chunk
    }
}
1131
1132/// Translate a global byte range `[start, end)` over a chunked blob
1133/// into the per-chunk fetch requests needed to satisfy it. Returns
1134/// the requests in chunk-index order so the caller can concatenate
1135/// the returned slices in iteration order. The math:
1136///
1137/// - `chunk_index` walks `[start / CHUNK, ceil(end / CHUNK))`.
1138/// - The first chunk's `start_in_chunk` is `start % CHUNK`; every
1139/// later chunk's `start_in_chunk` is `0`.
1140/// - The last chunk's `end_in_chunk` is `((end - 1) % CHUNK) + 1`
1141/// capped at the chunk's actual `size`; every earlier chunk's
1142/// `end_in_chunk` is the chunk's full `size`.
1143///
1144/// Returns an empty `Vec` for empty ranges (`start == end`) or when
1145/// `start >= total_size`. Errors when `end > total_size` or
1146/// `start > end` (callers should range-check before invoking, but
1147/// we surface a typed error to ease use as a defensive backstop).
1148///
1149/// Pure-logic; no chunk fetches happen here.
1150pub fn byte_range_to_chunks(
1151 manifest: &BlobRef,
1152 start: u64,
1153 end: u64,
1154) -> Result<Vec<ChunkRangeRequest>, BlobError> {
1155 let (chunks, total_size) = match manifest {
1156 BlobRef::Manifest {
1157 chunks, total_size, ..
1158 } => (chunks.as_slice(), *total_size),
1159 BlobRef::Small { .. } => {
1160 return Err(BlobError::Decode(
1161 "byte_range_to_chunks called on a Small BlobRef".to_owned(),
1162 ));
1163 }
1164 BlobRef::Tree { .. } => {
1165 // Tree blobs resolve ranges via tree walk
1166 // (A4 `TreeWalker`), not via the flat-manifest
1167 // helper. Callers holding a Tree BlobRef route
1168 // through `MeshBlobAdapter::fetch_range`'s tree
1169 // path directly.
1170 return Err(BlobError::Decode(
1171 "byte_range_to_chunks called on a Tree BlobRef — \
1172 use the tree-walker path instead"
1173 .to_owned(),
1174 ));
1175 }
1176 };
1177 if start > end {
1178 return Err(BlobError::Decode(format!(
1179 "range start {} > end {}",
1180 start, end
1181 )));
1182 }
1183 if end > total_size {
1184 return Err(BlobError::Decode(format!(
1185 "range end {} exceeds total_size {}",
1186 end, total_size
1187 )));
1188 }
1189 if start == end || start >= total_size {
1190 return Ok(Vec::new());
1191 }
1192 let chunk_size = BLOB_CHUNK_SIZE_BYTES;
1193 let first_chunk = (start / chunk_size) as usize;
1194 let last_chunk_inclusive = ((end - 1) / chunk_size) as usize;
1195 let mut out = Vec::with_capacity(last_chunk_inclusive - first_chunk + 1);
1196 for (chunk_index, chunk) in chunks
1197 .iter()
1198 .enumerate()
1199 .skip(first_chunk)
1200 .take(last_chunk_inclusive - first_chunk + 1)
1201 {
1202 let chunk_start_in_blob = chunk_index as u64 * chunk_size;
1203 // Clamp [start, end) against this chunk's
1204 // [chunk_start_in_blob, chunk_start_in_blob + chunk.size).
1205 let local_start = start.saturating_sub(chunk_start_in_blob);
1206 let local_end = (end - chunk_start_in_blob).min(chunk.size as u64);
1207 out.push(ChunkRangeRequest {
1208 chunk_index,
1209 start_in_chunk: local_start as u32,
1210 end_in_chunk: local_end as u32,
1211 });
1212 }
1213 Ok(out)
1214}
1215
1216#[cfg(test)]
1217mod tests {
1218 use super::*;
1219
1220 // -----------------------------------------------------------
1221 // Small variant — round-trip + decode-edge tests
1222 // (preserved from v0.15 for back-compat coverage)
1223 // -----------------------------------------------------------
1224
1225 fn small_fixture() -> BlobRef {
1226 BlobRef::small("s3://bucket/key", [0xAB; 32], 12345)
1227 }
1228
/// A Small ref must survive encode → decode unchanged.
#[test]
fn small_round_trip_encode_decode() {
    let original = small_fixture();
    let wire = original.encode();
    let maybe_ref = BlobRef::decode(&wire).unwrap();
    let decoded = maybe_ref.unwrap();
    assert_eq!(decoded, original);
}
1236
/// Bytes that do not start with the magic decode to `Ok(None)`.
#[test]
fn decode_returns_none_when_magic_missing() {
    let bytes: Vec<u8> = (0x00u8..=0x04).collect();
    assert!(BlobRef::decode(&bytes).unwrap().is_none());
}
1242
/// A prefix that matches only the first one, two, or three magic
/// bytes is still not a `BlobRef`.
#[test]
fn decode_returns_none_for_payloads_starting_with_old_discriminator_only() {
    let partial_magics: [[u8; 4]; 3] = [
        [0xB0, 0x00, 0x00, 0x00],
        [0xB0, 0xB1, 0x00, 0x00],
        [0xB0, 0xB1, 0xB2, 0x00],
    ];
    for prefix in partial_magics.iter() {
        let bytes = prefix.to_vec();
        assert!(BlobRef::decode(&bytes).unwrap().is_none());
    }
}
1252
/// A Small frame cut off inside the hash is a decode error.
#[test]
fn decode_rejects_short_small_frame() {
    let mut bytes = BLOB_REF_MAGIC.to_vec();
    // Version byte, then a single byte of hash — truncated mid-hash.
    bytes.extend_from_slice(&[BLOB_REF_VERSION_V1, 0x00]);
    let err = BlobRef::decode(&bytes).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1261
/// An unrecognised outer version byte surfaces as
/// `UnsupportedVersion` carrying that byte.
#[test]
fn decode_rejects_unknown_outer_version() {
    let mut bytes = small_fixture().encode();
    // Index 4 is the byte right after the four-byte magic.
    bytes[4] = 0xFE;
    let err = BlobRef::decode(&bytes).unwrap_err();
    assert!(matches!(err, BlobError::UnsupportedVersion(0xFE)));
}
1270
/// `encoded_len` for a Small ref equals the real encoded length.
#[test]
fn encoded_len_matches_real_encoding_small() {
    let blob = small_fixture();
    let predicted = blob.encoded_len();
    let actual = blob.encode().len();
    assert_eq!(predicted, actual);
}
1276
/// `verify` succeeds when the bytes hash to the stored hash.
#[test]
fn small_verify_accepts_matching_bytes() {
    let payload = b"the lazy dog";
    let size = payload.len() as u64;
    let hash: [u8; 32] = blake3::hash(payload).into();
    let blob = BlobRef::small("file:///x", hash, size);
    blob.verify(payload).unwrap();
}
1284
/// `verify` reports `HashMismatch` with the stored hash as
/// `expected` and a different `actual`.
#[test]
fn small_verify_rejects_mismatching_bytes() {
    let stored_hash = [0xCC; 32];
    let blob = BlobRef::small("file:///x", stored_hash, 0);
    let err = blob.verify(b"different content").unwrap_err();
    match err {
        BlobError::HashMismatch { expected, actual } => {
            assert_eq!(expected, [0xCC; 32]);
            assert_ne!(actual, expected);
        }
        other => panic!("expected HashMismatch, got {:?}", other),
    }
}
1297
/// A Small frame whose size field is `u64::MAX` is rejected.
#[test]
fn small_decode_rejects_oversize_size_field() {
    let zero_hash = [0u8; 32];
    let oversize = u64::MAX.to_le_bytes();
    // Layout: magic, version, 32-byte hash, 8-byte LE size, empty URI.
    let mut bytes = BLOB_REF_MAGIC.to_vec();
    bytes.push(BLOB_REF_VERSION_V1);
    bytes.extend_from_slice(&zero_hash);
    bytes.extend_from_slice(&oversize);
    let err = BlobRef::decode(&bytes).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1307
/// An empty URI and zero size survive the round trip.
#[test]
fn empty_uri_round_trips_small() {
    let bytes = BlobRef::small("", [0x00; 32], 0).encode();
    let decoded = BlobRef::decode(&bytes).unwrap().unwrap();
    assert_eq!(decoded.uri(), "");
    assert_eq!(decoded.size(), 0);
}
1316
1317 // -----------------------------------------------------------
1318 // Manifest variant — round-trip + decode-edge tests
1319 // -----------------------------------------------------------
1320
1321 fn manifest_fixture(chunk_count: usize) -> BlobRef {
1322 let chunks: Vec<ChunkRef> = (0..chunk_count)
1323 .map(|i| ChunkRef {
1324 hash: [i as u8; 32],
1325 size: BLOB_CHUNK_SIZE_BYTES as u32,
1326 })
1327 .collect();
1328 BlobRef::manifest("mesh://abc", Encoding::Replicated, chunks).unwrap()
1329 }
1330
/// An 8-chunk manifest must survive encode → decode unchanged.
#[test]
fn manifest_round_trip_encode_decode() {
    let original = manifest_fixture(8);
    let wire = original.encode();
    let maybe_ref = BlobRef::decode(&wire).unwrap();
    let decoded = maybe_ref.unwrap();
    assert_eq!(decoded, original);
}
1338
/// Pin dataforts perf #174: `encoded_len` must report exactly the
/// byte count `encode()` produces, for every variant.
///
/// The test compares the two for a Small ref, for Manifests of 1,
/// 8 and 128 chunks, for a Tree ref, and for a ReedSolomon
/// manifest. Any drift in the sizing path — including the
/// header-prefix accounting (4-byte magic + 1-byte version) —
/// shows up as a length mismatch here.
#[test]
fn encoded_len_matches_encode_len_without_allocating() {
    // Small: closed-form size, sanity check.
    let small = small_fixture();
    let small_len = small.encoded_len();
    assert_eq!(small_len, small.encode().len(), "Small parity");

    // Manifest at several chunk counts: smallest, typical, large.
    // Each exercises a different postcard length-prefix size.
    for count in [1usize, 8, 128] {
        let manifest = manifest_fixture(count);
        let measured = manifest.encoded_len();
        let actual = manifest.encode().len();
        assert_eq!(measured, actual, "Manifest({count} chunks) parity");
    }

    // Tree.
    let tree = BlobRef::tree("mesh://tree", Encoding::Replicated, [0xCD; 32], 1024, 3)
        .expect("tree ref");
    let tree_len = tree.encoded_len();
    assert_eq!(tree_len, tree.encode().len(), "Tree parity");

    // ReedSolomon manifest: a different encoding variant may
    // serialize to a different byte count.
    let single_chunk = vec![ChunkRef {
        hash: [0xAA; 32],
        size: 1024,
    }];
    let rs_manifest = BlobRef::manifest(
        "mesh://rs",
        Encoding::ReedSolomon { k: 4, m: 2 },
        single_chunk,
    )
    .unwrap();
    let rs_len = rs_manifest.encoded_len();
    assert_eq!(rs_len, rs_manifest.encode().len(), "RS Manifest parity");
}
1389
/// Pin cubic-dev-ai code review for dataforts perf #174:
/// `ManifestBodyRef` must serialize byte-for-byte identically to
/// the owned `ManifestBody` it mirrors.
///
/// The borrowed form exists so sizing can avoid cloning the chunk
/// vector; that substitution is only safe while both forms produce
/// the same bytes. A refactor that adds, reorders or renames a
/// field in one type but not the other would break that silently.
/// `encoded_len_matches_encode_len_without_allocating` would also
/// notice, but this test narrows the signal to "ref form vs owned
/// form".
#[test]
fn manifest_body_ref_serializes_identically_to_owned_form() {
    let mut chunks: Vec<ChunkRef> = Vec::with_capacity(32);
    for i in 0u32..32 {
        chunks.push(ChunkRef {
            hash: [i as u8; 32],
            size: 1024 + i,
        });
    }

    let borrowed = ManifestBodyRef {
        body_version: BLOB_MANIFEST_BODY_VERSION,
        uri: "mesh://parity-test",
        encoding: Encoding::ReedSolomon { k: 4, m: 2 },
        chunks: chunks.as_slice(),
        total_size: 99_999,
    };
    let owned = ManifestBody {
        body_version: BLOB_MANIFEST_BODY_VERSION,
        uri: "mesh://parity-test".to_string(),
        encoding: Encoding::ReedSolomon { k: 4, m: 2 },
        chunks: chunks.clone(),
        total_size: 99_999,
    };

    let owned_bytes = postcard::to_allocvec(&owned).unwrap();
    let borrowed_bytes = postcard::to_allocvec(&borrowed).unwrap();
    assert_eq!(
        owned_bytes, borrowed_bytes,
        "ManifestBodyRef must serialize byte-for-byte identically to ManifestBody",
    );

    // The measure-only path must agree with the allocating path.
    let measured_owned = postcard::experimental::serialized_size(&owned).unwrap();
    let measured_borrowed = postcard::experimental::serialized_size(&borrowed).unwrap();
    assert_eq!(measured_owned, measured_borrowed);
    assert_eq!(measured_owned, owned_bytes.len());
}
1440
/// Companion to the Manifest parity pin: `TreeBodyRef` must
/// serialize byte-for-byte identically to `TreeBody`.
#[test]
fn tree_body_ref_serializes_identically_to_owned_form() {
    let borrowed = TreeBodyRef {
        body_version: BLOB_TREE_BODY_VERSION,
        uri: "mesh://tree-parity",
        encoding: Encoding::Replicated,
        root_hash: [0xCD; 32],
        total_size: 1_234_567,
        depth: 3,
    };
    let owned = TreeBody {
        body_version: BLOB_TREE_BODY_VERSION,
        uri: "mesh://tree-parity".to_string(),
        encoding: Encoding::Replicated,
        root_hash: [0xCD; 32],
        total_size: 1_234_567,
        depth: 3,
    };

    let owned_bytes = postcard::to_allocvec(&owned).unwrap();
    let borrowed_bytes = postcard::to_allocvec(&borrowed).unwrap();
    assert_eq!(
        owned_bytes, borrowed_bytes,
        "TreeBodyRef must serialize byte-for-byte identically to TreeBody",
    );
}
1468
/// The ReedSolomon encoding parameters survive the round trip.
#[test]
fn manifest_round_trip_with_reed_solomon_reserved() {
    let only_chunk = ChunkRef {
        hash: [0xAA; 32],
        size: 1024,
    };
    let blob = BlobRef::manifest(
        "mesh://rs",
        Encoding::ReedSolomon { k: 4, m: 2 },
        vec![only_chunk],
    )
    .unwrap();
    let wire = blob.encode();
    let decoded = BlobRef::decode(&wire).unwrap().unwrap();
    let expected = Some(Encoding::ReedSolomon { k: 4, m: 2 });
    assert_eq!(decoded.encoding(), expected);
}
1484
/// A manifest with no chunks is rejected at construction.
#[test]
fn manifest_rejects_empty_chunk_list() {
    let no_chunks: Vec<ChunkRef> = Vec::new();
    let err = BlobRef::manifest("mesh://", Encoding::Replicated, no_chunks).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1490
/// One chunk over `BLOB_MANIFEST_MAX_CHUNKS` is rejected.
#[test]
fn manifest_rejects_too_many_chunks() {
    let one_too_many = BLOB_MANIFEST_MAX_CHUNKS + 1;
    let mut chunks: Vec<ChunkRef> = Vec::with_capacity(one_too_many);
    for _ in 0..one_too_many {
        chunks.push(ChunkRef {
            hash: [0; 32],
            size: 1,
        });
    }
    let err = BlobRef::manifest("mesh://", Encoding::Replicated, chunks).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1502
/// Five `u32::MAX`-sized chunks (5 × 4 GiB ≈ 20 GiB, above the
/// 16 GiB cap) are rejected. Note the same input also fails the
/// chunk-size validator, so either check may be the one that fires.
#[test]
fn manifest_rejects_total_size_over_cap() {
    let huge_chunk = ChunkRef {
        hash: [0; 32],
        size: u32::MAX,
    };
    let chunks = vec![huge_chunk; 5];
    let err = BlobRef::manifest("mesh://", Encoding::Replicated, chunks).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1516
/// `byte_range_to_chunks` and the adapter's range slicer depend on
/// the fixed `BLOB_CHUNK_SIZE_BYTES` stride. A peer-crafted
/// manifest with off-stride chunk sizes would make the position
/// math return wrong-window bytes silently, so both `manifest()`
/// and `decode_manifest()` must reject those shapes. Here: a
/// non-last chunk that is too small.
#[test]
fn manifest_rejects_non_last_chunk_smaller_than_stride() {
    let stride = BLOB_CHUNK_SIZE_BYTES as u32;
    // A non-last chunk must be exactly one stride; 1 byte is not.
    let undersized_first = ChunkRef {
        hash: [1; 32],
        size: 1,
    };
    let full_second = ChunkRef {
        hash: [2; 32],
        size: stride,
    };
    let chunks = vec![undersized_first, full_second];
    let err = BlobRef::manifest("mesh://", Encoding::Replicated, chunks).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1537
/// A non-last chunk one byte over the stride is rejected.
#[test]
fn manifest_rejects_non_last_chunk_larger_than_stride() {
    let stride = BLOB_CHUNK_SIZE_BYTES as u32;
    let oversized_first = ChunkRef {
        hash: [1; 32],
        size: stride + 1,
    };
    let full_second = ChunkRef {
        hash: [2; 32],
        size: stride,
    };
    let chunks = vec![oversized_first, full_second];
    let err = BlobRef::manifest("mesh://", Encoding::Replicated, chunks).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1553
/// A last (here: only) chunk one byte over the stride is rejected.
#[test]
fn manifest_rejects_last_chunk_above_stride() {
    let oversized = ChunkRef {
        hash: [1; 32],
        size: (BLOB_CHUNK_SIZE_BYTES as u32) + 1,
    };
    let chunks = vec![oversized];
    let err = BlobRef::manifest("mesh://", Encoding::Replicated, chunks).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1563
/// A zero-byte chunk is rejected even as the last chunk.
#[test]
fn manifest_rejects_zero_size_chunk() {
    let empty_chunk = ChunkRef {
        hash: [1; 32],
        size: 0,
    };
    let chunks = vec![empty_chunk];
    let err = BlobRef::manifest("mesh://", Encoding::Replicated, chunks).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1573
/// A lone chunk shorter than the stride counts as a valid last
/// chunk. A payload under 4 MiB would normally ride as Small, but
/// a one-short-chunk Manifest is structurally legal.
#[test]
fn manifest_accepts_single_short_chunk_as_last() {
    let short_chunk = ChunkRef {
        hash: [1; 32],
        size: 1024,
    };
    let blob = BlobRef::manifest("mesh://", Encoding::Replicated, vec![short_chunk]).unwrap();
    assert_eq!(blob.size(), 1024);
}
1587
/// A full-stride chunk followed by a short last chunk is accepted,
/// and the reported size is their sum.
#[test]
fn manifest_accepts_multichunk_with_short_last() {
    let full = ChunkRef {
        hash: [1; 32],
        size: BLOB_CHUNK_SIZE_BYTES as u32,
    };
    let short_last = ChunkRef {
        hash: [2; 32],
        size: 1024,
    };
    let blob =
        BlobRef::manifest("mesh://", Encoding::Replicated, vec![full, short_last]).unwrap();
    assert_eq!(blob.size(), BLOB_CHUNK_SIZE_BYTES + 1024);
}
1603
/// Decode must reject a manifest body whose declared `total_size`
/// disagrees with the sum of its chunk sizes — otherwise a
/// malicious peer could mislead range math by misreporting the
/// total.
#[test]
fn manifest_decode_detects_total_size_lie() {
    use serde::Serialize;

    // Field order mirrors the real manifest body so postcard emits
    // the same layout.
    #[derive(Serialize)]
    struct LyingBody {
        body_version: u8,
        uri: String,
        encoding: Encoding,
        chunks: Vec<ChunkRef>,
        total_size: u64,
    }

    let single_chunk = ChunkRef {
        hash: [0; 32],
        size: 100,
    };
    let lying = LyingBody {
        body_version: BLOB_MANIFEST_BODY_VERSION,
        uri: "mesh://lie".to_owned(),
        encoding: Encoding::Replicated,
        chunks: vec![single_chunk],
        // Declared 200, but the chunk sizes only add up to 100.
        total_size: 200,
    };

    let body = postcard::to_allocvec(&lying).unwrap();
    let mut bytes = BLOB_REF_MAGIC.to_vec();
    bytes.push(BLOB_REF_VERSION_V2_MANIFEST);
    bytes.extend_from_slice(&body);

    let err = BlobRef::decode(&bytes).unwrap_err();
    assert!(matches!(err, BlobError::Decode(_)));
}
1636
/// A manifest body stamped with an unknown inner version byte
/// surfaces as `UnsupportedVersion` carrying that byte.
#[test]
fn manifest_decode_rejects_unknown_body_version() {
    use serde::Serialize;

    // Field order mirrors the real manifest body so postcard emits
    // the same layout.
    #[derive(Serialize)]
    struct FutureBody {
        body_version: u8,
        uri: String,
        encoding: Encoding,
        chunks: Vec<ChunkRef>,
        total_size: u64,
    }

    let single_chunk = ChunkRef {
        hash: [0; 32],
        size: 1,
    };
    let body = FutureBody {
        body_version: 0xFE,
        uri: "mesh://".to_owned(),
        encoding: Encoding::Replicated,
        chunks: vec![single_chunk],
        total_size: 1,
    };

    let body_bytes = postcard::to_allocvec(&body).unwrap();
    let mut bytes = BLOB_REF_MAGIC.to_vec();
    bytes.push(BLOB_REF_VERSION_V2_MANIFEST);
    bytes.extend_from_slice(&body_bytes);

    let err = BlobRef::decode(&bytes).unwrap_err();
    assert!(matches!(err, BlobError::UnsupportedVersion(0xFE)));
}
1665
/// `size()` on a manifest equals the sum of its chunk sizes.
#[test]
fn manifest_size_matches_iterated_chunk_sum() {
    let blob = manifest_fixture(10);
    let mut iterated: u64 = 0;
    for chunk in blob.chunks().iter() {
        iterated += chunk.size as u64;
    }
    assert_eq!(blob.size(), iterated);
}
1672
/// The shared accessors answer sensibly on both Small and Manifest.
#[test]
fn accessors_uniform_across_variants() {
    // Small: has a whole-content hash, no chunks, no encoding.
    {
        let small = BlobRef::small("file:///s", [0; 32], 99);
        assert_eq!(small.uri(), "file:///s");
        assert_eq!(small.size(), 99);
        assert!(!small.is_chunked());
        assert!(small.small_hash().is_some());
        assert!(small.chunks().is_empty());
        assert_eq!(small.encoding(), None);
    }

    // Manifest: chunked, no whole-content hash, carries an encoding.
    {
        let m = manifest_fixture(3);
        assert_eq!(m.uri(), "mesh://abc");
        assert!(m.is_chunked());
        assert!(m.small_hash().is_none());
        assert_eq!(m.chunks().len(), 3);
        assert_eq!(m.encoding(), Some(Encoding::Replicated));
    }
}
1690
1691 // -----------------------------------------------------------
1692 // Chunking algorithm — idempotency + edge cases
1693 // -----------------------------------------------------------
1694
/// A 1 KiB payload stays Inline and is hashed as a whole.
#[test]
fn chunk_payload_inline_under_threshold() {
    let payload = vec![0x42u8; 1024]; // 1 KiB
    let ChunkedPayload::Inline { payload: p, hash } = chunk_payload(&payload).unwrap() else {
        panic!("expected Inline for 1 KiB payload")
    };
    assert_eq!(p.len(), 1024);
    let expected_hash: [u8; 32] = blake3::hash(&payload).into();
    assert_eq!(hash, expected_hash);
}
1707
#[test]
fn chunk_payload_inline_at_exact_threshold() {
    // A payload of exactly BLOB_CHUNK_SIZE_BYTES is still Inline; the
    // threshold is inclusive.
    let data = vec![0x42u8; BLOB_CHUNK_SIZE_BYTES as usize];
    let outcome = chunk_payload(&data).unwrap();
    assert!(matches!(outcome, ChunkedPayload::Inline { .. }));
}
1716
#[test]
fn chunk_payload_chunks_above_threshold() {
    // One byte over the threshold: expect a full chunk followed by a
    // one-byte tail chunk.
    let data = vec![0x42u8; (BLOB_CHUNK_SIZE_BYTES as usize) + 1];
    let ChunkedPayload::Chunked { chunks, total_size } = chunk_payload(&data).unwrap() else {
        panic!("expected Chunked for 4MiB+1 payload");
    };
    assert_eq!(chunks.len(), 2);
    let full_chunk = &chunks[0].0;
    let tail_chunk = &chunks[1].0;
    assert_eq!(full_chunk.size, BLOB_CHUNK_SIZE_BYTES as u32);
    assert_eq!(tail_chunk.size, 1);
    assert_eq!(total_size, data.len() as u64);
}
1730
#[test]
fn chunk_payload_idempotent_same_bytes_same_hashes() {
    // Chunking the same bytes twice must yield identical ChunkRef
    // lists — the dedup property the replication layer relies on.
    let data: Vec<u8> = (0..(8 * 1024 * 1024 + 17))
        .map(|i| (i % 251) as u8)
        .collect();

    // One chunking pass over `data`, reduced to just the ChunkRefs.
    let chunk_refs_of_run = || {
        let ChunkedPayload::Chunked { chunks, .. } = chunk_payload(&data).unwrap() else {
            panic!("expected Chunked");
        };
        chunks
            .iter()
            .map(|(chunk_ref, _)| *chunk_ref)
            .collect::<Vec<_>>()
    };

    let run_a = chunk_refs_of_run();
    let run_b = chunk_refs_of_run();
    assert_eq!(run_a, run_b);
}
1753
#[test]
fn chunk_payload_empty_is_inline() {
    // The empty payload is Inline and hashes to blake3 of the empty
    // input.
    let empty: Vec<u8> = Vec::new();
    let ChunkedPayload::Inline { payload, hash } = chunk_payload(&empty).unwrap() else {
        panic!("empty payload must be Inline");
    };
    assert!(payload.is_empty());
    let want: [u8; 32] = blake3::hash(b"").into();
    assert_eq!(hash, want);
}
1766
#[test]
fn chunk_payload_rejects_oversize() {
    // An oversize payload cannot actually be allocated in a unit test,
    // so this test does not call `chunk_payload`. Instead it pins the
    // relationship between the two caps.
    //
    // The manifest's chunk-count capacity (MAX_CHUNKS * chunk size) must
    // strictly exceed BLOB_REF_MAX_SIZE. That makes the size cap the
    // tighter bound: any payload within BLOB_REF_MAX_SIZE always fits in
    // the chunk list, so the chunk-count cap cannot be the first limit a
    // growing payload runs into.
    // NOTE(review): the order in which `chunk_payload` performs its cap
    // checks is not visible from this test — confirm against the
    // implementation.
    let chunk_count_capacity = BLOB_MANIFEST_MAX_CHUNKS as u64 * BLOB_CHUNK_SIZE_BYTES;
    assert!(
        chunk_count_capacity > BLOB_REF_MAX_SIZE,
        "chunk-count capacity ({} bytes) must exceed BLOB_REF_MAX_SIZE ({} bytes)",
        chunk_count_capacity,
        BLOB_REF_MAX_SIZE
    );
}
1778
1779 // -----------------------------------------------------------
1780 // byte_range_to_chunks — range math
1781 // -----------------------------------------------------------
1782
1783 fn five_chunk_manifest() -> BlobRef {
1784 // Five 4 MiB chunks (20 MiB total).
1785 let chunks: Vec<ChunkRef> = (0..5)
1786 .map(|i| ChunkRef {
1787 hash: [i as u8; 32],
1788 size: BLOB_CHUNK_SIZE_BYTES as u32,
1789 })
1790 .collect();
1791 BlobRef::manifest("mesh://x", Encoding::Replicated, chunks).unwrap()
1792 }
1793
#[test]
fn range_aligned_single_chunk() {
    // A range covering exactly chunk 0 maps to a single whole-chunk
    // request.
    let manifest = five_chunk_manifest();
    let plan = byte_range_to_chunks(&manifest, 0, BLOB_CHUNK_SIZE_BYTES).unwrap();
    assert_eq!(plan.len(), 1);
    let only = &plan[0];
    assert_eq!(only.chunk_index, 0);
    assert_eq!(only.start_in_chunk, 0);
    assert_eq!(only.end_in_chunk, BLOB_CHUNK_SIZE_BYTES as u32);
}
1803
#[test]
fn range_unaligned_within_one_chunk() {
    // [100, 200) sits entirely inside chunk 0.
    let manifest = five_chunk_manifest();
    let plan = byte_range_to_chunks(&manifest, 100, 200).unwrap();
    assert_eq!(plan.len(), 1);
    let only = &plan[0];
    assert_eq!(only.chunk_index, 0);
    assert_eq!(only.start_in_chunk, 100);
    assert_eq!(only.end_in_chunk, 200);
    assert_eq!(only.len(), 100);
}
1814
#[test]
fn range_spans_two_chunks() {
    let manifest = five_chunk_manifest();
    let chunk_len = BLOB_CHUNK_SIZE_BYTES;
    // Request the last 1 KiB of chunk 0 plus the first 1 KiB of chunk 1.
    let plan = byte_range_to_chunks(&manifest, chunk_len - 1024, chunk_len + 1024).unwrap();
    assert_eq!(plan.len(), 2);

    let left = &plan[0];
    assert_eq!(left.chunk_index, 0);
    assert_eq!(left.start_in_chunk, (chunk_len - 1024) as u32);
    assert_eq!(left.end_in_chunk, chunk_len as u32);

    let right = &plan[1];
    assert_eq!(right.chunk_index, 1);
    assert_eq!(right.start_in_chunk, 0);
    assert_eq!(right.end_in_chunk, 1024);
}
1829
#[test]
fn range_spans_all_chunks() {
    // The full-blob range yields one whole-chunk request per chunk, in
    // order.
    let manifest = five_chunk_manifest();
    let plan = byte_range_to_chunks(&manifest, 0, manifest.size()).unwrap();
    assert_eq!(plan.len(), 5);
    plan.iter().enumerate().for_each(|(position, piece)| {
        assert_eq!(piece.chunk_index, position);
        assert_eq!(piece.start_in_chunk, 0);
        assert_eq!(piece.end_in_chunk, BLOB_CHUNK_SIZE_BYTES as u32);
    });
}
1841
#[test]
fn range_with_partial_last_chunk() {
    // Two-chunk manifest whose final chunk is only 1 KiB, so the
    // request for the last chunk has `end_in_chunk` below the full
    // chunk size.
    let full_chunk = ChunkRef {
        hash: [0; 32],
        size: BLOB_CHUNK_SIZE_BYTES as u32,
    };
    let short_tail = ChunkRef {
        hash: [1; 32],
        size: 1024,
    };
    let manifest =
        BlobRef::manifest("mesh://", Encoding::Replicated, vec![full_chunk, short_tail]).unwrap();

    // All of chunk 0 plus the first 100 bytes of chunk 1.
    let plan = byte_range_to_chunks(&manifest, 0, BLOB_CHUNK_SIZE_BYTES + 100).unwrap();
    assert_eq!(plan.len(), 2);
    let tail_request = &plan[1];
    assert_eq!(tail_request.chunk_index, 1);
    assert_eq!(tail_request.start_in_chunk, 0);
    assert_eq!(tail_request.end_in_chunk, 100);
}
1864
#[test]
fn range_empty_is_empty_request_list() {
    let manifest = five_chunk_manifest();

    // Zero-length range in the middle of the blob.
    let mid_blob = byte_range_to_chunks(&manifest, 100, 100).unwrap();
    assert!(mid_blob.is_empty());

    // Zero-length range starting exactly at end-of-blob is empty too.
    let blob_end = manifest.size();
    let at_end = byte_range_to_chunks(&manifest, blob_end, blob_end).unwrap();
    assert!(at_end.is_empty());
}
1874
#[test]
fn range_rejects_end_past_total_size() {
    // An end offset one byte past the blob is a Decode error.
    let manifest = five_chunk_manifest();
    let past_end = manifest.size() + 1;
    let outcome = byte_range_to_chunks(&manifest, 0, past_end);
    assert!(matches!(outcome, Err(BlobError::Decode(_))));
}
1881
#[test]
fn range_rejects_start_after_end() {
    // An inverted range (start > end) is a Decode error.
    let manifest = five_chunk_manifest();
    let outcome = byte_range_to_chunks(&manifest, 200, 100);
    assert!(matches!(outcome, Err(BlobError::Decode(_))));
}
1888
#[test]
fn range_rejects_call_against_small() {
    // Range math against a Small ref is a Decode error.
    let small_ref = BlobRef::small("file:///x", [0; 32], 100);
    let outcome = byte_range_to_chunks(&small_ref, 0, 50);
    assert!(matches!(outcome, Err(BlobError::Decode(_))));
}
1895
#[test]
fn range_math_reassembles_exact_payload() {
    // End-to-end check: chunk a payload, then for several sub-ranges
    // rebuild the bytes from the chunk-range requests and compare with
    // the matching slice of the source payload.
    let payload: Vec<u8> = (0..(BLOB_CHUNK_SIZE_BYTES as usize * 3 + 1000))
        .map(|i| (i % 251) as u8)
        .collect();

    let ChunkedPayload::Chunked {
        chunks: pieces,
        total_size,
    } = chunk_payload(&payload).unwrap()
    else {
        panic!("expected Chunked");
    };

    // Split the (ref, bytes) pairs into two parallel vectors.
    let (chunk_refs, chunk_bytes): (Vec<ChunkRef>, Vec<&[u8]>) = pieces.iter().copied().unzip();
    let manifest = BlobRef::manifest("mesh://x", Encoding::Replicated, chunk_refs).unwrap();
    assert_eq!(manifest.size(), total_size);

    let cases = [
        (0u64, total_size),
        (10, 5_000_000),
        (BLOB_CHUNK_SIZE_BYTES, BLOB_CHUNK_SIZE_BYTES + 1),
        (total_size - 100, total_size),
    ];
    for (start, end) in cases {
        let plan = byte_range_to_chunks(&manifest, start, end).unwrap();
        let mut rebuilt = Vec::with_capacity((end - start) as usize);
        for request in plan {
            let source = chunk_bytes[request.chunk_index];
            let from = request.start_in_chunk as usize;
            let to = request.end_in_chunk as usize;
            rebuilt.extend_from_slice(&source[from..to]);
        }
        assert_eq!(
            rebuilt,
            payload[start as usize..end as usize],
            "range [{}, {}) reassembly mismatch",
            start,
            end
        );
    }
}
1937
1938 // -----------------------------------------------------------
1939 // BlobRef::Tree (v0.3) constructor + wire round-trip
1940 // -----------------------------------------------------------
1941
/// Fixed 32-byte root hash (every byte `0xAB`) shared by the tree tests.
fn tree_root() -> [u8; 32] {
    let mut root = [0u8; 32];
    root.fill(0xAB);
    root
}
1945
#[test]
fn tree_constructor_sets_version_and_fields() {
    // 64 GiB blob at depth 2; every accessor must reflect the
    // constructor arguments.
    const SIXTY_FOUR_GIB: u64 = 1024 * 1024 * 1024 * 64;
    let root = tree_root();
    let tree = BlobRef::tree(
        String::from("mesh://ab"),
        Encoding::Replicated,
        root,
        SIXTY_FOUR_GIB,
        2,
    )
    .unwrap();

    assert_eq!(tree.version(), BLOB_REF_VERSION_V3_TREE);
    assert_eq!(tree.uri(), "mesh://ab");
    assert_eq!(tree.size(), SIXTY_FOUR_GIB);
    assert_eq!(tree.tree_depth(), Some(2));
    assert_eq!(tree.tree_root_hash(), Some(&root));
    assert_eq!(tree.encoding(), Some(Encoding::Replicated));
    assert!(tree.is_chunked());
    assert!(tree.is_tree());
    assert!(tree.small_hash().is_none());
    assert!(tree.chunks().is_empty());
}
1967
#[test]
fn tree_constructor_rejects_zero_total_size() {
    // total_size == 0 must be refused with a "must be > 0" message.
    let msg = BlobRef::tree("mesh://aa", Encoding::Replicated, tree_root(), 0, 1)
        .unwrap_err()
        .to_string();
    assert!(msg.contains("must be > 0"), "got: {msg}");
}
1974
#[test]
fn tree_constructor_rejects_total_size_above_cap() {
    // One byte over BLOB_TREE_MAX_TOTAL_SIZE must be refused.
    let over_cap = BLOB_TREE_MAX_TOTAL_SIZE + 1;
    let msg = BlobRef::tree("mesh://aa", Encoding::Replicated, tree_root(), over_cap, 4)
        .unwrap_err()
        .to_string();
    assert!(msg.contains("exceeds cap"), "got: {msg}");
}
1988
#[test]
fn tree_constructor_rejects_zero_depth() {
    // depth == 0 must be refused with an error that mentions "depth".
    let msg = BlobRef::tree("mesh://aa", Encoding::Replicated, tree_root(), 1024, 0)
        .unwrap_err()
        .to_string();
    assert!(msg.contains("depth"), "got: {msg}");
}
1996
#[test]
fn tree_constructor_rejects_depth_above_cap() {
    // One level deeper than MAX_TREE_DEPTH must be refused.
    let too_deep = super::super::blob_tree::MAX_TREE_DEPTH + 1;
    let msg = BlobRef::tree("mesh://aa", Encoding::Replicated, tree_root(), 1024, too_deep)
        .unwrap_err()
        .to_string();
    assert!(msg.contains("depth"), "got: {msg}");
}
2010
#[test]
fn tree_encode_decode_round_trips() {
    // Encode a 1 GiB depth-1 tree ref, decode it, and check both whole
    // value equality and each individual field.
    const ONE_GIB: u64 = 1024 * 1024 * 1024;
    let source = BlobRef::tree(
        String::from("mesh://cafe"),
        Encoding::Replicated,
        tree_root(),
        ONE_GIB,
        1,
    )
    .unwrap();

    let wire = source.encode();
    let round_tripped = BlobRef::decode(&wire).unwrap().unwrap();
    assert_eq!(source, round_tripped);

    match round_tripped {
        BlobRef::Tree {
            version: got_version,
            uri: got_uri,
            encoding: got_encoding,
            root_hash: got_root,
            total_size: got_size,
            depth: got_depth,
        } => {
            assert_eq!(got_version, BLOB_REF_VERSION_V3_TREE);
            assert_eq!(got_uri, "mesh://cafe");
            assert_eq!(got_encoding, Encoding::Replicated);
            assert_eq!(got_root, tree_root());
            assert_eq!(got_size, ONE_GIB);
            assert_eq!(got_depth, 1);
        }
        other => panic!("expected Tree, got {:?}", other),
    }
}
2043
#[test]
fn tree_decode_preserves_reedsolomon_encoding_tag() {
    // A ReedSolomon { k, m } encoding tag must survive the wire
    // round-trip on a 1 TiB, depth-3 tree ref.
    let one_tib = 1u64 << 40;
    let source = BlobRef::tree(
        "mesh://ff",
        Encoding::ReedSolomon { k: 10, m: 4 },
        tree_root(),
        one_tib,
        3,
    )
    .unwrap();

    let round_tripped = BlobRef::decode(&source.encode()).unwrap().unwrap();
    let want = Some(Encoding::ReedSolomon { k: 10, m: 4 });
    assert_eq!(round_tripped.encoding(), want);
}
2061
#[test]
fn tree_decode_rejects_unknown_outer_version() {
    // Wire form: magic, an outer version byte that is none of
    // 0x01/0x02/0x03, then 64 zero bytes of filler. Decode must report
    // UnsupportedVersion rather than treat the bytes as a known variant.
    let mut wire = BLOB_REF_MAGIC.to_vec();
    wire.push(0xFE);
    wire.resize(wire.len() + 64, 0u8);

    let err = BlobRef::decode(&wire).unwrap_err();
    assert!(
        matches!(err, BlobError::UnsupportedVersion(0xFE)),
        "expected UnsupportedVersion(0xFE), got {err:?}"
    );
}
2077
#[test]
fn tree_decode_rejects_unknown_body_version() {
    // Offset of the body's first byte: 4 bytes of magic + 1 outer
    // version byte. That first byte is the `body_version: u8` field,
    // which postcard writes as a single raw byte.
    const BODY_VERSION_OFFSET: usize = 5;

    let mut wire = BlobRef::tree("mesh://aa", Encoding::Replicated, tree_root(), 1024, 1)
        .unwrap()
        .encode();
    // Corrupt the body version to a value the decoder does not know;
    // it must be rejected, not silently accepted.
    wire[BODY_VERSION_OFFSET] = 0xEF;

    let err = BlobRef::decode(&wire).unwrap_err();
    assert!(
        matches!(err, BlobError::UnsupportedVersion(0xEF)),
        "expected UnsupportedVersion(0xEF), got {err:?}"
    );
}
2098
#[test]
fn tree_decode_rejects_oversize_body() {
    // Wire form: magic, the Tree outer version, then a zero-filled body
    // one byte longer than BLOB_REF_TREE_BODY_MAX_BYTES. The decoder is
    // expected to refuse it on length alone, so a malicious peer cannot
    // force a large allocation.
    let mut wire = BLOB_REF_MAGIC.to_vec();
    wire.push(BLOB_REF_VERSION_V3_TREE);
    wire.resize(wire.len() + BLOB_REF_TREE_BODY_MAX_BYTES + 1, 0u8);

    let msg = BlobRef::decode(&wire).unwrap_err().to_string();
    assert!(msg.contains("exceeds cap"), "got: {msg}");
}
2113
#[test]
fn tree_decode_rejects_total_size_above_cap() {
    // Build the TreeBody by hand (bypassing the constructor) with a
    // total_size one past BLOB_TREE_MAX_TOTAL_SIZE; decode itself must
    // reject it.
    let oversized = TreeBody {
        body_version: BLOB_TREE_BODY_VERSION,
        uri: String::from("mesh://x"),
        encoding: Encoding::Replicated,
        root_hash: tree_root(),
        total_size: BLOB_TREE_MAX_TOTAL_SIZE + 1,
        depth: 4,
    };

    let mut wire = BLOB_REF_MAGIC.to_vec();
    wire.push(BLOB_REF_VERSION_V3_TREE);
    wire.extend(postcard::to_allocvec(&oversized).unwrap());

    let msg = BlobRef::decode(&wire).unwrap_err().to_string();
    assert!(msg.contains("exceeds cap"), "got: {msg}");
}
2136
#[test]
fn tree_decode_rejects_depth_inconsistent_with_total_size() {
    // A depth-4 tree describing a 1-byte blob is structurally
    // malformed. Decode must fail with the depth-vs-size lower-bound
    // error, so the mismatch surfaces before any tree walk starts.
    let implausible = TreeBody {
        body_version: BLOB_TREE_BODY_VERSION,
        uri: String::from("mesh://x"),
        encoding: Encoding::Replicated,
        root_hash: tree_root(),
        total_size: 1,
        depth: 4,
    };

    let mut wire = BLOB_REF_MAGIC.to_vec();
    wire.push(BLOB_REF_VERSION_V3_TREE);
    wire.extend(postcard::to_allocvec(&implausible).unwrap());

    let msg = BlobRef::decode(&wire).unwrap_err().to_string();
    assert!(
        msg.contains("requires total_size >="),
        "expected depth-vs-size lower-bound error; got: {msg}",
    );
}
2164
#[test]
fn tree_decode_rejects_depth_above_cap() {
    // Hand-built body with depth one past MAX_TREE_DEPTH; decode must
    // fail with an error that mentions "depth".
    let too_deep = TreeBody {
        body_version: BLOB_TREE_BODY_VERSION,
        uri: String::from("mesh://x"),
        encoding: Encoding::Replicated,
        root_hash: tree_root(),
        total_size: 1024,
        depth: super::super::blob_tree::MAX_TREE_DEPTH + 1,
    };

    let mut wire = BLOB_REF_MAGIC.to_vec();
    wire.push(BLOB_REF_VERSION_V3_TREE);
    wire.extend(postcard::to_allocvec(&too_deep).unwrap());

    let msg = BlobRef::decode(&wire).unwrap_err().to_string();
    assert!(msg.contains("depth"), "got: {msg}");
}
2184
#[test]
fn verify_on_tree_returns_typed_error() {
    // `verify` on a Tree ref must fail with an error whose message
    // names the Tree variant.
    let tree = BlobRef::tree("mesh://aa", Encoding::Replicated, tree_root(), 1024, 1).unwrap();
    let msg = tree.verify(b"any bytes").unwrap_err().to_string();
    assert!(
        msg.contains("Tree variant"),
        "Tree verify should surface a typed Decode error pointing at tree-walk; got: {msg}",
    );
}
2195
#[test]
fn tree_does_not_alias_small_or_manifest_via_decode() {
    // Encode one ref of each variant and check that each decodes back
    // to its own variant and reports its own wire version — a Tree wire
    // form must never come back as Small or Manifest.
    let small = BlobRef::small("mesh://aa", [0xAA; 32], 100);
    let single_chunk = ChunkRef {
        hash: [0xBB; 32],
        size: 1024,
    };
    let manifest =
        BlobRef::manifest("mesh://bb", Encoding::Replicated, vec![single_chunk]).unwrap();
    let tree = BlobRef::tree(
        "mesh://cc",
        Encoding::Replicated,
        [0xCC; 32],
        1024 * 1024 * 1024,
        1,
    )
    .unwrap();

    // Encode then decode, unwrapping both the Result and the Option.
    let round_trip = |blob: &BlobRef| -> BlobRef { BlobRef::decode(&blob.encode()).unwrap().unwrap() };
    let small_back = round_trip(&small);
    let manifest_back = round_trip(&manifest);
    let tree_back = round_trip(&tree);

    assert!(matches!(small_back, BlobRef::Small { .. }));
    assert!(matches!(manifest_back, BlobRef::Manifest { .. }));
    assert!(matches!(tree_back, BlobRef::Tree { .. }));
    assert_eq!(small_back.version(), BLOB_REF_VERSION_V1);
    assert_eq!(manifest_back.version(), BLOB_REF_VERSION_V2_MANIFEST);
    assert_eq!(tree_back.version(), BLOB_REF_VERSION_V3_TREE);
}
2231}