Skip to main content

triblespace_core/
blob.rs

1//! Anything that can be represented as a byte sequence.
2//!
3//! Blobs store larger data items outside tribles and values. For the design
4//! rationale and an extended usage example see the [Blobs
5//! chapter](../book/src/deep-dive/blobs.md) of the Tribles Book.
6
7// Converting Rust types to blobs is infallible in practice, so only `IntoBlob`
8// and `TryFromBlob` are used throughout the codebase.  `TryToBlob` and
9// `FromBlob` were never required and have been removed for simplicity.
10
11mod cache;
12mod memoryblobstore;
13/// Built-in blob encoding types and their conversion implementations.
14pub mod encodings;
15
16use crate::metadata::MetaDescribe;
17use crate::inline::encodings::hash::Handle;
18use crate::inline::Inline;
19use crate::inline::InlineEncoding;
20
21use std::convert::Infallible;
22use std::error::Error;
23use std::fmt::Debug;
24use std::fmt::{self};
25use std::hash::Hash;
26use std::marker::PhantomData;
27
28/// Re-export of the blob cache wrapper.
29pub use cache::BlobCache;
30/// Re-export of the in-memory blob store.
31pub use memoryblobstore::MemoryBlobStore;
32
33/// Re-export of `anybytes::Bytes` for blob payloads.
34pub use anybytes::Bytes;
35
36/// A content-addressed value: immutable bytes paired with their
37/// Blake3 handle and a schema marker.
38///
39/// `Blob<S>` is the **heavy form** of a content-addressed payload —
40/// it carries the bytes plus the cached
41/// [`Inline<Handle<S>>`][Handle] that names them. The handle is the
42/// **lightweight form**: a 32-byte reference you can store in
43/// tribles, send across the network, or hand around freely without
44/// dragging the bytes along. `Blob` ↔ `Handle<S>` is the same
45/// "content / reference" duality as `Vec<T>` ↔ `&[T]`, except the
46/// reference is hash-based rather than pointer-based and survives
47/// crossing process boundaries.
48///
49/// The link is enforced by construction:
50/// - [`Blob::new`] hashes the bytes and stores the resulting handle.
51///   Subsequent `get_handle` / `as_ref` calls are O(1).
52/// - [`Blob::with_handle`] is the explicit "trust me" constructor for
53///   read paths where the handle is already known (a blob-store
54///   reader pulling a known-keyed entry, a pile-format decoder where
55///   the index has the hash). Caller asserts `handle == Blake3(bytes)`.
56/// - [`Blob::transmute`] / [`Blob::as_transmute`] preserve the cached
57///   handle across schema casts — the Blake3 hash is over bytes, not
58///   over schema, so the digest survives the phantom change.
59///
60/// `Blob<S>: AsRef<Inline<Handle<S>>>` so `&blob` deref-coerces to the
61/// lightweight reference for free.
62///
63/// The previous shape (`#[repr(transparent)]` around `Bytes`) was
64/// given up deliberately: caching the handle in the struct
65/// eliminates a real double-hash that surfaced at every `insert` site,
66/// and the only call that relied on transparency (`as_transmute`'s
67/// `mem::transmute`) still works because `Blob<S>` and `Blob<T>`
68/// have identical layouts for any `S`/`T: BlobEncoding` (phantoms
69/// are zero-sized, handle is `[u8; 32] + PhantomData`).
70pub struct Blob<S: BlobEncoding> {
71    /// The raw byte content of this blob.
72    pub bytes: Bytes,
73    /// Cached content-addressed handle. Computed eagerly at
74    /// construction time; reused on every `get_handle` call and on
75    /// `MemoryBlobStore::insert`.
76    handle: Inline<Handle<S>>,
77    _schema: PhantomData<S>,
78}
79
80impl<S> Blob<S>
81where
82    S: BlobEncoding,
83    Handle<S>: InlineEncoding,
84{
85    /// Creates a new blob from a sequence of bytes.
86    ///
87    /// **Hashes eagerly**: this call runs Blake3 over `bytes` once and
88    /// caches the resulting handle. Subsequent `get_handle` /
89    /// `MemoryBlobStore::insert` calls reuse the cached value at O(1).
90    /// For most use cases this is what callers want — `Blob::new`
91    /// almost always precedes an `insert` or a `get_handle`. If you
92    /// have a blob path that's *never* hashed and the eager cost
93    /// matters, reach for the raw `Bytes` instead.
94    pub fn new(bytes: Bytes) -> Self {
95        let digest = crate::inline::encodings::hash::Blake3::digest(&bytes);
96        Self {
97            bytes,
98            handle: Inline::new(digest),
99            _schema: PhantomData,
100        }
101    }
102
103    /// Constructs a blob from bytes *and* a precomputed handle,
104    /// skipping the hash step.
105    ///
106    /// Used by blob-store readers (`MemoryBlobStoreReader::get` and
107    /// friends) and pile-format decoders that already know the
108    /// handle the blob is stored under — they read the bytes out of
109    /// their backing storage already keyed by hash, so recomputing
110    /// it would be pure overhead.
111    ///
112    /// # Safety
113    ///
114    /// The caller asserts that `handle == Blake3(bytes)`. The cache
115    /// is trusted on read paths; if these diverge,
116    /// `MemoryBlobStore::insert(blob)` will store the bytes under
117    /// `handle` (not the true Blake3 hash), and subsequent lookups
118    /// will silently miss or return wrong data. Always pair this
119    /// with a hash you got from a trusted source (the same store
120    /// you're reading from, the pile header, a verified network
121    /// fetch). For callers without that guarantee, use
122    /// [`Blob::new`] which hashes from bytes.
123    pub fn with_handle(bytes: Bytes, handle: Inline<Handle<S>>) -> Self {
124        Self {
125            bytes,
126            handle,
127            _schema: PhantomData,
128        }
129    }
130
131    /// Reinterprets the contained bytes as a blob of a different schema.
132    ///
133    /// This is a zero-copy transformation: bytes pass through and the
134    /// cached handle is recast at the phantom level. It does **not**
135    /// validate that the data actually conforms to the new schema.
136    pub fn transmute<T: BlobEncoding>(self) -> Blob<T>
137    where
138        Handle<T>: InlineEncoding,
139    {
140        Blob {
141            bytes: self.bytes,
142            handle: self.handle.transmute(),
143            _schema: PhantomData,
144        }
145    }
146
147    /// Transmutes the blob to a blob of a different schema.
148    /// This is a zero-cost operation.
149    /// If the schema types are not compatible, this will not cause undefined behavior,
150    /// but it might cause unexpected results.
151    ///
152    /// This is primarily used to give blobs with an [UnknownBlob](crate::blob::encodings::UnknownBlob) schema a more specific schema.
153    /// Use with caution.
154    pub fn as_transmute<T: BlobEncoding>(&self) -> &Blob<T> {
155        unsafe { std::mem::transmute(self) }
156    }
157
158    /// Returns the cached Blake3 handle. O(1) — no rehash.
159    ///
160    /// The handle is the *lightweight reference* form of this blob —
161    /// 32 bytes you can store in a trible, share over the network, or
162    /// pass around freely. The blob is the *heavy* form (bytes you
163    /// can decode). Both share the same Blake3 identity.
164    pub fn get_handle(&self) -> Inline<Handle<S>> {
165        self.handle
166    }
167
168    /// Tries to convert the blob to a concrete Rust type.
169    /// If the conversion fails, an error is returned.
170    pub fn try_from_blob<T>(self) -> Result<T, <T as TryFromBlob<S>>::Error>
171    where
172        T: TryFromBlob<S>,
173    {
174        <T as TryFromBlob<S>>::try_from_blob(self)
175    }
176}
177
178impl<T> Clone for Blob<T>
179where
180    T: BlobEncoding,
181    Handle<T>: InlineEncoding,
182{
183    fn clone(&self) -> Self {
184        Self {
185            bytes: self.bytes.clone(),
186            handle: self.handle,
187            _schema: PhantomData,
188        }
189    }
190}
191
192/// `Blob<S>` borrows as the `Inline<Handle<S>>` that references it.
193///
194/// Models the heavy/lightweight duality at the type system level:
195/// a `Blob<S>` IS a content-addressed value, and its `Handle<S>` is
196/// the 32-byte reference form. Coercing a `&Blob<S>` to a
197/// `&Inline<Handle<S>>` is free — the handle is stored as a field —
198/// so code that wants to pass the lightweight reference around
199/// (e.g. inserting into a trible, sending over the network) can
200/// just `blob.as_ref()` instead of `&blob.get_handle()`.
201impl<S> AsRef<Inline<Handle<S>>> for Blob<S>
202where
203    S: BlobEncoding,
204    Handle<S>: InlineEncoding,
205{
206    fn as_ref(&self) -> &Inline<Handle<S>> {
207        &self.handle
208    }
209}
210
211impl<T: BlobEncoding> PartialEq for Blob<T> {
212    fn eq(&self, other: &Self) -> bool {
213        self.bytes == other.bytes
214    }
215}
216
217impl<T: BlobEncoding> Eq for Blob<T> {}
218
219impl<T: BlobEncoding> Hash for Blob<T> {
220    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
221        self.bytes.hash(state);
222    }
223}
224
225impl<T: BlobEncoding> Debug for Blob<T> {
226    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
227        write!(f, "Blob<{}>", std::any::type_name::<T>())
228    }
229}
230
231/// A trait for defining the abstract schema type of a blob.
232/// This is similar to the [`InlineEncoding`] trait in the [`value`](crate::value) module.
233pub trait BlobEncoding: MetaDescribe + Sized + 'static {
234    /// Converts a concrete Rust type to a blob with this schema via [`IntoBlob`].
235    fn blob_from<T: IntoBlob<Self>>(t: T) -> Blob<Self> {
236        t.to_blob()
237    }
238
239    /// Lift a `Blob<Self>` into the [`Encoded`](crate::inline::Encoded)
240    /// sum `entity!{}` consumes — yields
241    /// `Encoded::Blob(blob.transmute())`. The handle lives inside the
242    /// blob; consumers recover it via
243    /// [`Encoded::inline`](crate::inline::Encoded::inline).
244    ///
245    /// Overridable if a schema has unusual storage semantics. The
246    /// inline-path counterpart lives on
247    /// [`InlineEncoding::to_encoded`].
248    fn to_encoded(blob: Blob<Self>) -> crate::inline::Encoded<Handle<Self>>
249    where
250        Handle<Self>: InlineEncoding,
251    {
252        crate::inline::Encoded::Blob(blob.transmute::<crate::blob::encodings::UnknownBlob>())
253    }
254}
255
256/// Shorthand bound for `IntoEncoded<S, Output = Blob<S>>` — "this
257/// source produces a `Blob<S>` for content-addressed storage."
258///
259/// `IntoBlob` is a supertrait alias over
260/// [`IntoEncoded`](crate::inline::IntoEncoded): any type that
261/// implements `IntoEncoded<S>` with `Output = Blob<S>` automatically
262/// becomes `IntoBlob<S>`, and gains the `to_blob(self) -> Blob<S>`
263/// convenience method.
264///
265/// The trait parameter is the [`BlobEncoding`] directly (not
266/// `Handle<S>`) — this is what makes `impl IntoBlob<MyBlobEncoding>
267/// for MyForeignType` legal for downstream crates: the local
268/// `MyBlobEncoding` sits at trait position 0, satisfying Rust's
269/// orphan rule.
270pub trait IntoBlob<S: BlobEncoding>:
271    crate::inline::IntoEncoded<S, Output = Blob<S>>
272{
273    /// Convert directly to `Blob<S>`.
274    fn to_blob(self) -> Blob<S>
275    where
276        Self: Sized,
277    {
278        self.into_encoded()
279    }
280}
281impl<S, T> IntoBlob<S> for T
282where
283    S: BlobEncoding,
284    T: crate::inline::IntoEncoded<S, Output = Blob<S>>,
285{
286}
287
288/// A trait for converting a [Blob] with a specific schema to a Rust type.
289/// This trait is implemented on the concrete Rust type.
290///
291/// This might return an error if the conversion is not possible,
292/// This is the counterpart to the [`IntoBlob`] trait.
293///
294/// See [TryFromInline](crate::inline::TryFromInline) for the counterpart trait for values.
295pub trait TryFromBlob<S: BlobEncoding>: Sized {
296    /// The error type returned when the conversion fails.
297    type Error: Error + Send + Sync + 'static;
298    /// Attempts to convert a blob into this type.
299    fn try_from_blob(b: Blob<S>) -> Result<Self, Self::Error>;
300}
301
302impl<S: BlobEncoding> TryFromBlob<S> for Blob<S> {
303    type Error = Infallible;
304
305    fn try_from_blob(b: Blob<S>) -> Result<Self, Self::Error> {
306        Ok(b)
307    }
308}
309
310/// `Blob<S>` is the identity source for [`IntoEncoded<S>`] in the
311/// blob path: it converts to itself with no allocation, and the
312/// cached handle inside lets every downstream step skip rehashing.
313impl<S: BlobEncoding> crate::inline::Encodes<Blob<S>> for S
314where
315    Handle<S>: InlineEncoding,
316{
317    type Output = Blob<S>;
318    fn encode(source: Blob<S>) -> Blob<S> {
319        source
320    }
321}
322
323/// `Blob<T>` is the `ToEncoded<Handle<T>>` expander: it delegates to
324/// [`BlobEncoding::to_encoded`] for the actual blob-to-Encoded lift. The
325/// trait is the macro-side dispatch shim; the logic lives on
326/// `BlobEncoding` so users (and schemas that need custom storage
327/// semantics) can call or override it directly.
328impl<T> crate::inline::ToEncoded<Handle<T>> for Blob<T>
329where
330    T: BlobEncoding,
331    Handle<T>: InlineEncoding,
332{
333    fn to_encoded(self) -> crate::inline::Encoded<Handle<T>> {
334        <T as BlobEncoding>::to_encoded(self)
335    }
336}
337
338/// Precomputed-handle case: a `Inline<Handle<T>>` can be passed as a
339/// `IntoEncoded<T>` source (T is the BlobEncoding, matching the
340/// `Handle<T>`-attributed field's `Encoding`). Output is the value
341/// itself; no side-blob — caller asserts the bytes live somewhere
342/// resolvable.
343impl<T: BlobEncoding> crate::inline::Encodes<Inline<Handle<T>>> for T
344where
345    Handle<T>: InlineEncoding,
346{
347    type Output = Inline<Handle<T>>;
348    fn encode(source: Inline<Handle<T>>) -> Inline<Handle<T>> {
349        source
350    }
351}
352
353/// Reference form of the precomputed-handle case.
354impl<T: BlobEncoding> crate::inline::Encodes<&Inline<Handle<T>>> for T
355where
356    Handle<T>: InlineEncoding,
357{
358    type Output = Inline<Handle<T>>;
359    fn encode(source: &Inline<Handle<T>>) -> Inline<Handle<T>> {
360        *source
361    }
362}
363
#[cfg(test)]
mod tests {
    use super::*;
    use crate::blob::encodings::longstring::LongString;
    use crate::blob::encodings::UnknownBlob;
    use crate::inline::encodings::hash::Blake3;

    /// Builds a schema-less blob from a byte slice via `Blob::new`.
    fn unknown_blob(payload: &[u8]) -> Blob<UnknownBlob> {
        Blob::new(Bytes::from(payload.to_vec()))
    }

    #[test]
    fn new_computes_and_caches_handle() {
        let blob = unknown_blob(b"hello");
        let first = blob.get_handle();
        let second = blob.get_handle();
        // Asking twice yields the same handle: the cached value is stable.
        assert_eq!(first, second);
        // The cached handle equals an independently computed Blake3
        // digest of the same bytes.
        let expected: Inline<Handle<UnknownBlob>> = Inline::new(Blake3::digest(b"hello"));
        assert_eq!(first, expected);
    }

    #[test]
    fn with_handle_trusts_the_provided_handle() {
        // Pair the bytes with a handle that is *deliberately* not
        // their hash. Getting it back verbatim shows `with_handle`
        // does not recompute from bytes — the shortcut read paths
        // rely on when they already know the handle.
        let bogus: Inline<Handle<UnknownBlob>> = Inline::new([0xAA; 32]);
        let payload = Bytes::from(b"any bytes".to_vec());
        let blob: Blob<UnknownBlob> = Blob::with_handle(payload, bogus);
        assert_eq!(blob.get_handle(), bogus);
    }

    #[test]
    fn as_ref_borrows_the_lightweight_handle() {
        let blob = unknown_blob(b"borrow me");
        let owned: Inline<Handle<UnknownBlob>> = blob.get_handle();
        let borrowed: &Inline<Handle<UnknownBlob>> = blob.as_ref();
        // The borrowed field and the by-value getter agree.
        assert_eq!(owned, *borrowed);
    }

    #[test]
    fn transmute_carries_cached_handle() {
        let original = unknown_blob(b"shared");
        let before: Inline<Handle<UnknownBlob>> = original.get_handle();
        // A schema cast changes only the phantom type; the raw handle
        // bytes must come through untouched.
        let recast: Blob<LongString> = original.transmute::<LongString>();
        let after = recast.get_handle();
        assert_eq!(before.raw, after.raw);
    }
}
417}