triblespace_core/blob.rs
1//! Anything that can be represented as a byte sequence.
2//!
3//! Blobs store larger data items outside tribles and values. For the design
4//! rationale and an extended usage example see the [Blobs
5//! chapter](../book/src/deep-dive/blobs.md) of the Tribles Book.
6
7// Converting Rust types to blobs is infallible in practice, so only `IntoBlob`
8// and `TryFromBlob` are used throughout the codebase. `TryToBlob` and
9// `FromBlob` were never required and have been removed for simplicity.
10
11mod cache;
12mod memoryblobstore;
13/// Built-in blob encoding types and their conversion implementations.
14pub mod encodings;
15
16use crate::metadata::MetaDescribe;
17use crate::inline::encodings::hash::Handle;
18use crate::inline::Inline;
19use crate::inline::InlineEncoding;
20
21use std::convert::Infallible;
22use std::error::Error;
23use std::fmt::Debug;
24use std::fmt::{self};
25use std::hash::Hash;
26use std::marker::PhantomData;
27
28/// Re-export of the blob cache wrapper.
29pub use cache::BlobCache;
30/// Re-export of the in-memory blob store.
31pub use memoryblobstore::MemoryBlobStore;
32
33/// Re-export of `anybytes::Bytes` for blob payloads.
34pub use anybytes::Bytes;
35
36/// A content-addressed value: immutable bytes paired with their
37/// Blake3 handle and a schema marker.
38///
39/// `Blob<S>` is the **heavy form** of a content-addressed payload —
40/// it carries the bytes plus the cached
41/// [`Inline<Handle<S>>`][Handle] that names them. The handle is the
42/// **lightweight form**: a 32-byte reference you can store in
43/// tribles, send across the network, or hand around freely without
44/// dragging the bytes along. `Blob` ↔ `Handle<S>` is the same
45/// "content / reference" duality as `Vec<T>` ↔ `&[T]`, except the
46/// reference is hash-based rather than pointer-based and survives
47/// crossing process boundaries.
48///
49/// The link is enforced by construction:
50/// - [`Blob::new`] hashes the bytes and stores the resulting handle.
51/// Subsequent `get_handle` / `as_ref` calls are O(1).
52/// - [`Blob::with_handle`] is the explicit "trust me" constructor for
53/// read paths where the handle is already known (a blob-store
54/// reader pulling a known-keyed entry, a pile-format decoder where
55/// the index has the hash). Caller asserts `handle == Blake3(bytes)`.
56/// - [`Blob::transmute`] / [`Blob::as_transmute`] preserve the cached
57/// handle across schema casts — the Blake3 hash is over bytes, not
58/// over schema, so the digest survives the phantom change.
59///
60/// `Blob<S>: AsRef<Inline<Handle<S>>>` so `&blob` deref-coerces to the
61/// lightweight reference for free.
62///
63/// The previous shape (`#[repr(transparent)]` around `Bytes`) was
64/// given up deliberately: caching the handle in the struct
65/// eliminates a real double-hash that surfaced at every `insert` site,
66/// and the only call that relied on transparency (`as_transmute`'s
67/// `mem::transmute`) still works because `Blob<S>` and `Blob<T>`
68/// have identical layouts for any `S`/`T: BlobEncoding` (phantoms
69/// are zero-sized, handle is `[u8; 32] + PhantomData`).
70pub struct Blob<S: BlobEncoding> {
71 /// The raw byte content of this blob.
72 pub bytes: Bytes,
73 /// Cached content-addressed handle. Computed eagerly at
74 /// construction time; reused on every `get_handle` call and on
75 /// `MemoryBlobStore::insert`.
76 handle: Inline<Handle<S>>,
77 _schema: PhantomData<S>,
78}
79
80impl<S> Blob<S>
81where
82 S: BlobEncoding,
83 Handle<S>: InlineEncoding,
84{
85 /// Creates a new blob from a sequence of bytes.
86 ///
87 /// **Hashes eagerly**: this call runs Blake3 over `bytes` once and
88 /// caches the resulting handle. Subsequent `get_handle` /
89 /// `MemoryBlobStore::insert` calls reuse the cached value at O(1).
90 /// For most use cases this is what callers want — `Blob::new`
91 /// almost always precedes an `insert` or a `get_handle`. If you
92 /// have a blob path that's *never* hashed and the eager cost
93 /// matters, reach for the raw `Bytes` instead.
94 pub fn new(bytes: Bytes) -> Self {
95 let digest = crate::inline::encodings::hash::Blake3::digest(&bytes);
96 Self {
97 bytes,
98 handle: Inline::new(digest),
99 _schema: PhantomData,
100 }
101 }
102
103 /// Constructs a blob from bytes *and* a precomputed handle,
104 /// skipping the hash step.
105 ///
106 /// Used by blob-store readers (`MemoryBlobStoreReader::get` and
107 /// friends) and pile-format decoders that already know the
108 /// handle the blob is stored under — they read the bytes out of
109 /// their backing storage already keyed by hash, so recomputing
110 /// it would be pure overhead.
111 ///
112 /// # Safety
113 ///
114 /// The caller asserts that `handle == Blake3(bytes)`. The cache
115 /// is trusted on read paths; if these diverge,
116 /// `MemoryBlobStore::insert(blob)` will store the bytes under
117 /// `handle` (not the true Blake3 hash), and subsequent lookups
118 /// will silently miss or return wrong data. Always pair this
119 /// with a hash you got from a trusted source (the same store
120 /// you're reading from, the pile header, a verified network
121 /// fetch). For callers without that guarantee, use
122 /// [`Blob::new`] which hashes from bytes.
123 pub fn with_handle(bytes: Bytes, handle: Inline<Handle<S>>) -> Self {
124 Self {
125 bytes,
126 handle,
127 _schema: PhantomData,
128 }
129 }
130
131 /// Reinterprets the contained bytes as a blob of a different schema.
132 ///
133 /// This is a zero-copy transformation: bytes pass through and the
134 /// cached handle is recast at the phantom level. It does **not**
135 /// validate that the data actually conforms to the new schema.
136 pub fn transmute<T: BlobEncoding>(self) -> Blob<T>
137 where
138 Handle<T>: InlineEncoding,
139 {
140 Blob {
141 bytes: self.bytes,
142 handle: self.handle.transmute(),
143 _schema: PhantomData,
144 }
145 }
146
147 /// Transmutes the blob to a blob of a different schema.
148 /// This is a zero-cost operation.
149 /// If the schema types are not compatible, this will not cause undefined behavior,
150 /// but it might cause unexpected results.
151 ///
152 /// This is primarily used to give blobs with an [UnknownBlob](crate::blob::encodings::UnknownBlob) schema a more specific schema.
153 /// Use with caution.
154 pub fn as_transmute<T: BlobEncoding>(&self) -> &Blob<T> {
155 unsafe { std::mem::transmute(self) }
156 }
157
158 /// Returns the cached Blake3 handle. O(1) — no rehash.
159 ///
160 /// The handle is the *lightweight reference* form of this blob —
161 /// 32 bytes you can store in a trible, share over the network, or
162 /// pass around freely. The blob is the *heavy* form (bytes you
163 /// can decode). Both share the same Blake3 identity.
164 pub fn get_handle(&self) -> Inline<Handle<S>> {
165 self.handle
166 }
167
168 /// Tries to convert the blob to a concrete Rust type.
169 /// If the conversion fails, an error is returned.
170 pub fn try_from_blob<T>(self) -> Result<T, <T as TryFromBlob<S>>::Error>
171 where
172 T: TryFromBlob<S>,
173 {
174 <T as TryFromBlob<S>>::try_from_blob(self)
175 }
176}
177
178impl<T> Clone for Blob<T>
179where
180 T: BlobEncoding,
181 Handle<T>: InlineEncoding,
182{
183 fn clone(&self) -> Self {
184 Self {
185 bytes: self.bytes.clone(),
186 handle: self.handle,
187 _schema: PhantomData,
188 }
189 }
190}
191
192/// `Blob<S>` borrows as the `Inline<Handle<S>>` that references it.
193///
194/// Models the heavy/lightweight duality at the type system level:
195/// a `Blob<S>` IS a content-addressed value, and its `Handle<S>` is
196/// the 32-byte reference form. Coercing a `&Blob<S>` to a
197/// `&Inline<Handle<S>>` is free — the handle is stored as a field —
198/// so code that wants to pass the lightweight reference around
199/// (e.g. inserting into a trible, sending over the network) can
200/// just `blob.as_ref()` instead of `&blob.get_handle()`.
201impl<S> AsRef<Inline<Handle<S>>> for Blob<S>
202where
203 S: BlobEncoding,
204 Handle<S>: InlineEncoding,
205{
206 fn as_ref(&self) -> &Inline<Handle<S>> {
207 &self.handle
208 }
209}
210
211impl<T: BlobEncoding> PartialEq for Blob<T> {
212 fn eq(&self, other: &Self) -> bool {
213 self.bytes == other.bytes
214 }
215}
216
217impl<T: BlobEncoding> Eq for Blob<T> {}
218
219impl<T: BlobEncoding> Hash for Blob<T> {
220 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
221 self.bytes.hash(state);
222 }
223}
224
225impl<T: BlobEncoding> Debug for Blob<T> {
226 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
227 write!(f, "Blob<{}>", std::any::type_name::<T>())
228 }
229}
230
231/// A trait for defining the abstract schema type of a blob.
232/// This is similar to the [`InlineEncoding`] trait in the [`value`](crate::value) module.
233pub trait BlobEncoding: MetaDescribe + Sized + 'static {
234 /// Converts a concrete Rust type to a blob with this schema via [`IntoBlob`].
235 fn blob_from<T: IntoBlob<Self>>(t: T) -> Blob<Self> {
236 t.to_blob()
237 }
238
239 /// Lift a `Blob<Self>` into the [`Encoded`](crate::inline::Encoded)
240 /// sum `entity!{}` consumes — yields
241 /// `Encoded::Blob(blob.transmute())`. The handle lives inside the
242 /// blob; consumers recover it via
243 /// [`Encoded::inline`](crate::inline::Encoded::inline).
244 ///
245 /// Overridable if a schema has unusual storage semantics. The
246 /// inline-path counterpart lives on
247 /// [`InlineEncoding::to_encoded`].
248 fn to_encoded(blob: Blob<Self>) -> crate::inline::Encoded<Handle<Self>>
249 where
250 Handle<Self>: InlineEncoding,
251 {
252 crate::inline::Encoded::Blob(blob.transmute::<crate::blob::encodings::UnknownBlob>())
253 }
254}
255
256/// Shorthand bound for `IntoEncoded<S, Output = Blob<S>>` — "this
257/// source produces a `Blob<S>` for content-addressed storage."
258///
259/// `IntoBlob` is a supertrait alias over
260/// [`IntoEncoded`](crate::inline::IntoEncoded): any type that
261/// implements `IntoEncoded<S>` with `Output = Blob<S>` automatically
262/// becomes `IntoBlob<S>`, and gains the `to_blob(self) -> Blob<S>`
263/// convenience method.
264///
265/// The trait parameter is the [`BlobEncoding`] directly (not
266/// `Handle<S>`) — this is what makes `impl IntoBlob<MyBlobEncoding>
267/// for MyForeignType` legal for downstream crates: the local
268/// `MyBlobEncoding` sits at trait position 0, satisfying Rust's
269/// orphan rule.
270pub trait IntoBlob<S: BlobEncoding>:
271 crate::inline::IntoEncoded<S, Output = Blob<S>>
272{
273 /// Convert directly to `Blob<S>`.
274 fn to_blob(self) -> Blob<S>
275 where
276 Self: Sized,
277 {
278 self.into_encoded()
279 }
280}
281impl<S, T> IntoBlob<S> for T
282where
283 S: BlobEncoding,
284 T: crate::inline::IntoEncoded<S, Output = Blob<S>>,
285{
286}
287
288/// A trait for converting a [Blob] with a specific schema to a Rust type.
289/// This trait is implemented on the concrete Rust type.
290///
291/// This might return an error if the conversion is not possible,
292/// This is the counterpart to the [`IntoBlob`] trait.
293///
294/// See [TryFromInline](crate::inline::TryFromInline) for the counterpart trait for values.
295pub trait TryFromBlob<S: BlobEncoding>: Sized {
296 /// The error type returned when the conversion fails.
297 type Error: Error + Send + Sync + 'static;
298 /// Attempts to convert a blob into this type.
299 fn try_from_blob(b: Blob<S>) -> Result<Self, Self::Error>;
300}
301
302impl<S: BlobEncoding> TryFromBlob<S> for Blob<S> {
303 type Error = Infallible;
304
305 fn try_from_blob(b: Blob<S>) -> Result<Self, Self::Error> {
306 Ok(b)
307 }
308}
309
310/// `Blob<S>` is the identity source for [`IntoEncoded<S>`] in the
311/// blob path: it converts to itself with no allocation, and the
312/// cached handle inside lets every downstream step skip rehashing.
313impl<S: BlobEncoding> crate::inline::Encodes<Blob<S>> for S
314where
315 Handle<S>: InlineEncoding,
316{
317 type Output = Blob<S>;
318 fn encode(source: Blob<S>) -> Blob<S> {
319 source
320 }
321}
322
323/// `Blob<T>` is the `ToEncoded<Handle<T>>` expander: it delegates to
324/// [`BlobEncoding::to_encoded`] for the actual blob-to-Encoded lift. The
325/// trait is the macro-side dispatch shim; the logic lives on
326/// `BlobEncoding` so users (and schemas that need custom storage
327/// semantics) can call or override it directly.
328impl<T> crate::inline::ToEncoded<Handle<T>> for Blob<T>
329where
330 T: BlobEncoding,
331 Handle<T>: InlineEncoding,
332{
333 fn to_encoded(self) -> crate::inline::Encoded<Handle<T>> {
334 <T as BlobEncoding>::to_encoded(self)
335 }
336}
337
338/// Precomputed-handle case: a `Inline<Handle<T>>` can be passed as a
339/// `IntoEncoded<T>` source (T is the BlobEncoding, matching the
340/// `Handle<T>`-attributed field's `Encoding`). Output is the value
341/// itself; no side-blob — caller asserts the bytes live somewhere
342/// resolvable.
343impl<T: BlobEncoding> crate::inline::Encodes<Inline<Handle<T>>> for T
344where
345 Handle<T>: InlineEncoding,
346{
347 type Output = Inline<Handle<T>>;
348 fn encode(source: Inline<Handle<T>>) -> Inline<Handle<T>> {
349 source
350 }
351}
352
353/// Reference form of the precomputed-handle case.
354impl<T: BlobEncoding> crate::inline::Encodes<&Inline<Handle<T>>> for T
355where
356 Handle<T>: InlineEncoding,
357{
358 type Output = Inline<Handle<T>>;
359 fn encode(source: &Inline<Handle<T>>) -> Inline<Handle<T>> {
360 *source
361 }
362}
363
#[cfg(test)]
mod tests {
    use super::*;
    use crate::blob::encodings::longstring::LongString;
    use crate::blob::encodings::UnknownBlob;
    use crate::inline::encodings::hash::Blake3;

    /// Builds an untyped blob from a byte slice via the hashing constructor.
    fn blob_of(payload: &[u8]) -> Blob<UnknownBlob> {
        Blob::new(Bytes::from(payload.to_vec()))
    }

    #[test]
    fn new_computes_and_caches_handle() {
        let b = blob_of(b"hello");
        let h1 = b.get_handle();
        let h2 = b.get_handle();
        // Same handle on repeat: the cache is stable.
        assert_eq!(h1, h2);
        // And it matches a fresh, independent Blake3 of the bytes.
        let independent: Inline<Handle<UnknownBlob>> = Inline::new(Blake3::digest(b"hello"));
        assert_eq!(h1, independent);
    }

    #[test]
    fn with_handle_trusts_the_provided_handle() {
        // Construct a blob with a *deliberately bogus* handle. The
        // cache returns it verbatim, proving we don't recompute from
        // bytes. This is the optimization read paths exploit (they
        // already know the handle, no point re-hashing).
        let bogus: Inline<Handle<UnknownBlob>> = Inline::new([0xAA; 32]);
        let b: Blob<UnknownBlob> = Blob::with_handle(Bytes::from(b"any bytes".to_vec()), bogus);
        assert_eq!(b.get_handle(), bogus);
    }

    #[test]
    fn as_ref_borrows_the_lightweight_handle() {
        let b = blob_of(b"borrow me");
        let h_owned: Inline<Handle<UnknownBlob>> = b.get_handle();
        let h_borrowed: &Inline<Handle<UnknownBlob>> = b.as_ref();
        // Same value, no rehash.
        assert_eq!(h_owned, *h_borrowed);
    }

    #[test]
    fn transmute_carries_cached_handle() {
        let b = blob_of(b"shared");
        let h_before: Inline<Handle<UnknownBlob>> = b.get_handle();
        // Schema cast: handle bytes stay identical, only the phantom
        // changes.
        let b2: Blob<LongString> = b.transmute::<LongString>();
        let h_after = b2.get_handle();
        assert_eq!(h_before.raw, h_after.raw);
    }

    #[test]
    fn as_transmute_views_same_bytes_and_handle() {
        // `as_transmute` is the only unsafe path in this module; the
        // borrowed view must expose exactly the original fields.
        let b = blob_of(b"view me");
        let view: &Blob<LongString> = b.as_transmute::<LongString>();
        assert!(view.bytes == b.bytes);
        assert_eq!(view.get_handle().raw, b.get_handle().raw);
    }

    #[test]
    fn clone_keeps_bytes_and_cached_handle() {
        let b = blob_of(b"copy me");
        let c = b.clone();
        assert!(c == b);
        assert_eq!(c.get_handle(), b.get_handle());
    }
}
417}