cortex_retrieval/embedding/local_stub.rs
1//! Deterministic BLAKE3-based **stub** embedder backend (Phase 4.C / D2-B).
2//!
3//! # This is NOT a semantic embedder
4//!
5//! Read that line again. The vectors produced by [`LocalStubEmbedder`] are
6//! reproducible — same `(text, tags)` produces the same bytes for the lifetime
7//! of the `stub:blake3-v1` backend id — but they carry **no semantic
8//! information**. Two inputs that differ by a single byte produce vectors that
9//! look as different as two inputs from unrelated topics, because BLAKE3 is a
10//! cryptographic hash with strong avalanche properties.
11//!
12//! Comparing two memories' stub embeddings reflects hash-collision properties
13//! of BLAKE3 (effectively: never collide), not semantic similarity. Downstream
14//! `cortex retrieval` operators should expect the stub to give **random
15//! ordering** on real queries — which is the right behavior for "we don't
16//! have a real embedder yet" and prevents operators from accidentally relying
17//! on illusory semantic ranking.
18//!
19//! # Why it exists
20//!
21//! Phase 4.C is landing the composition pipeline that wires lexical retrieval,
22//! FTS5 fuzzy match, and (eventually) semantic embeddings into a single
23//! `score` call. The composer needs *some* `Embedder` to call against so the
24//! pipeline can be tested end-to-end without depending on external ML weights,
25//! ONNX runtimes, or training data. This stub is that "some Embedder".
26//!
27//! Replace this backend with a real one before claiming any **semantic
28//! similarity** capability to operators. The replacement slots in behind the
29//! [`super::Embedder`] trait — the composer does not change.
30//!
31//! # Determinism construction
32//!
33//! Given an input `(text, tags)`:
34//!
35//! 1. Build a canonical, length-prefixed input buffer:
36//! ```text
37//! DOMAIN_TAG_STUB_EMBED // 1 byte: 0x10
38//! || text.len() as u64 (LE) // 8 bytes
39//! || text bytes
40//! || tags.len() as u64 (LE) // 8 bytes
41//! || for each tag in input order:
42//! tag.len() as u64 (LE) // 8 bytes
43//! || tag bytes
44//! ```
45//! The length-prefix + domain-tag framing is the same defense against
46//! boundary-confusion collisions used by `cortex-ledger` event hashing
47//! (see `cortex_ledger::hash` docs).
48//!
//! 2. Feed the buffer into a BLAKE3 [extendable-output][xof] reader and pull
//!    `STUB_DIM * 4 = 512` bytes. Each consecutive 4-byte chunk becomes one
//!    `f32` slot: every byte `b` of the chunk is mapped to
//!    `(b / 255.0) * 2.0 - 1.0`, and the four mapped values are averaged, so
//!    the slot lands in `[-1, 1]` with finer granularity than a single byte
//!    gives. (No integer is decoded from the chunk, so endianness plays no role.)
54//!
55//! 3. **L2-normalize** the vector so cosine similarity (the natural metric
56//! for downstream comparisons) behaves correctly. Without normalization,
57//! cosine similarity would still work mathematically, but the magnitude
58//! of the raw vector would carry implementation noise that propagates
59//! into score weights.
60//!
61//! [xof]: https://docs.rs/blake3/latest/blake3/struct.OutputReader.html
62//!
63//! # Backend id
64//!
65//! `backend_id() = "stub:blake3-v1"`. The `v1` suffix is mandatory: any change
66//! to the input framing, dimensionality, byte-to-float mapping, or
67//! normalization invalidates the deterministic contract for previously stored
68//! vectors and MUST bump the version to `v2`, `v3`, etc. Future real backends
69//! (e.g. `"onnx:minilm-l6-v2"`) coexist with this stub by carrying distinct
70//! ids.
71//!
72//! # Slotting in a real backend
73//!
74//! Future real backends live in sibling modules (e.g. `embedding/onnx.rs`) and
75//! implement the same [`super::Embedder`] trait. The composer accepts a
76//! `Box<dyn Embedder>` (or `Arc<dyn Embedder>`) — wiring the new backend is a
77//! one-line change at the call site. We deliberately do **not** feature-gate
78//! the stub: the stub is harmless to compile and useful as a fallback when a
79//! real backend is unavailable. (If a future requirement demands removing the
80//! stub from release binaries, gate it then with `#[cfg(feature = "stub")]`;
81//! today, runtime registration is sufficient.)
82
83use blake3::Hasher;
84
85use super::{EmbedResult, Embedder};
86
/// First byte hashed for every stub-embed input, keeping this BLAKE3 usage in
/// its own domain.
///
/// The value `0x10` is reserved for the stub embedder. Any other BLAKE3
/// domain must choose a different byte; sharing this one would open the door
/// to cross-domain collisions.
const DOMAIN_TAG_STUB_EMBED: u8 = 0x10;
91
/// Identifier returned by [`LocalStubEmbedder::backend_id`].
///
/// The trailing version is contractual: whenever a change alters the bytes
/// produced by [`Embedder::embed`], the suffix must be bumped so previously
/// stored vectors are not mistaken for the new format.
pub const STUB_BACKEND_ID: &str = "stub:blake3-v1";
97
/// Number of `f32` components in every vector the stub backend emits.
///
/// 128 is big enough to drive the composer's vector-math paths while keeping
/// test fixtures readable. Real backends commonly use 384, 768, or 1536
/// dimensions; nothing requires the stub to match them.
pub const STUB_DIM: usize = 128;
104
/// How many BLAKE3 XOF bytes are consumed to produce one `f32` component.
///
/// Each byte of a chunk is mapped to `[-1, 1]` and the mapped values are
/// averaged, so (up to `f32` rounding) a component is determined by the *sum*
/// of its bytes, which ranges over `0..=1020`. That yields on the order of a
/// thousand distinct levels per component, versus only 256 if a single byte
/// were used. The `256^4` possible byte combinations collapse onto those
/// levels; they are not all distinct outputs. One byte per slot would still be
/// enough for determinism testing, but looks visibly coarse in assertions.
const BYTES_PER_SLOT: usize = 4;
112
/// Stateless, deterministic stub embedder backed by BLAKE3.
///
/// The full contract lives in the module-level docs. The type is zero-sized,
/// so [`LocalStubEmbedder::new`] and `Default::default` are interchangeable
/// and copies are free.
#[derive(Clone, Copy, Debug, Default)]
pub struct LocalStubEmbedder;

impl LocalStubEmbedder {
    /// Creates a stub embedder.
    ///
    /// Equivalent to `Default::default`; usable in `const` contexts.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
127
128impl Embedder for LocalStubEmbedder {
129 fn backend_id(&self) -> &str {
130 STUB_BACKEND_ID
131 }
132
133 fn dim(&self) -> usize {
134 STUB_DIM
135 }
136
137 fn embed(&self, text: &str, tags: &[String]) -> EmbedResult<Vec<f32>> {
138 // Step 1: build the framed input. Length prefixes + domain tag
139 // prevent boundary-confusion across (text, tags) splits.
140 let mut hasher = Hasher::new();
141 hasher.update(&[DOMAIN_TAG_STUB_EMBED]);
142
143 let text_bytes = text.as_bytes();
144 hasher.update(&(text_bytes.len() as u64).to_le_bytes());
145 hasher.update(text_bytes);
146
147 hasher.update(&(tags.len() as u64).to_le_bytes());
148 for tag in tags {
149 let tag_bytes = tag.as_bytes();
150 hasher.update(&(tag_bytes.len() as u64).to_le_bytes());
151 hasher.update(tag_bytes);
152 }
153
154 // Step 2: pull STUB_DIM * BYTES_PER_SLOT bytes from the XOF and map
155 // every 4-byte chunk to an averaged `[-1, 1]` f32.
156 let mut xof = hasher.finalize_xof();
157 let mut bytes = vec![0u8; STUB_DIM * BYTES_PER_SLOT];
158 xof.fill(&mut bytes);
159
160 let mut vec = Vec::with_capacity(STUB_DIM);
161 for chunk in bytes.chunks_exact(BYTES_PER_SLOT) {
162 // Map each byte to [-1, 1] and average. The result is a finer-
163 // grained value in [-1, 1] than a single byte alone would give.
164 let mut acc = 0.0f32;
165 for &b in chunk {
166 acc += (f32::from(b) / 255.0) * 2.0 - 1.0;
167 }
168 vec.push(acc / BYTES_PER_SLOT as f32);
169 }
170
171 // Step 3: L2-normalize so cosine similarity behaves naturally.
172 let norm_sq: f32 = vec.iter().map(|x| x * x).sum();
173 // A zero-norm output from BLAKE3 is astronomically improbable (it
174 // would require every one of 512 bytes to be exactly 0x7F or
175 // 0x80 in a balanced way), but we guard against it for clarity
176 // rather than panicking.
177 if norm_sq == 0.0 {
178 return Ok(vec);
179 }
180 let inv_norm = norm_sq.sqrt().recip();
181 for slot in &mut vec {
182 *slot *= inv_norm;
183 }
184
185 Ok(vec)
186 }
187}
188
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::cosine_similarity;

    /// Builds an owned tag list from string literals.
    fn owned_tags(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| (*name).to_string()).collect()
    }

    /// Euclidean (L2) norm of a vector: square root of the sum of squares.
    fn l2_norm(values: &[f32]) -> f32 {
        let norm_sq: f32 = values.iter().map(|x| x * x).sum();
        norm_sq.sqrt()
    }

    /// Identical input must yield byte-for-byte identical output.
    #[test]
    fn stub_embedder_is_deterministic_for_identical_input() {
        let embedder = LocalStubEmbedder::new();
        let tags = owned_tags(&["alpha", "beta"]);
        let first = embedder.embed("hello world", &tags).unwrap();
        let second = embedder.embed("hello world", &tags).unwrap();
        assert_eq!(
            first, second,
            "stub embedder must be deterministic across calls"
        );
        // A freshly constructed instance must agree as well.
        let fresh = LocalStubEmbedder::new()
            .embed("hello world", &tags)
            .unwrap();
        assert_eq!(first, fresh, "stub embedder must be instance-independent");
    }

    /// Changing the text changes the vector.
    #[test]
    fn stub_embedder_differs_for_different_text() {
        let embedder = LocalStubEmbedder::new();
        let tags = owned_tags(&["alpha"]);
        let hello = embedder.embed("hello world", &tags).unwrap();
        let goodbye = embedder.embed("goodbye world", &tags).unwrap();
        assert_ne!(
            hello, goodbye,
            "different text must produce different vectors"
        );
    }

    /// Changing the tags changes the vector even when the text is the same.
    #[test]
    fn stub_embedder_differs_for_different_tags() {
        let embedder = LocalStubEmbedder::new();
        let with_alpha = embedder.embed("hello", &owned_tags(&["alpha"])).unwrap();
        let with_beta = embedder.embed("hello", &owned_tags(&["beta"])).unwrap();
        assert_ne!(
            with_alpha, with_beta,
            "different tags must produce different vectors"
        );

        // No tags versus one tag must differ too; this exercises the
        // tags.len() framing prefix.
        let no_tags: Vec<String> = Vec::new();
        let untagged = embedder.embed("hello", &no_tags).unwrap();
        assert_ne!(with_alpha, untagged, "no tags vs one tag must differ");

        // Tag order is significant: the framing hashes tags in input order,
        // so ["a", "b"] and ["b", "a"] must not collide. This guards against
        // a future "tag canonicalization" change slipping in silently.
        let forward = embedder.embed("hello", &owned_tags(&["a", "b"])).unwrap();
        let reversed = embedder.embed("hello", &owned_tags(&["b", "a"])).unwrap();
        assert_ne!(
            forward, reversed,
            "tag order must be part of the determinism key"
        );
    }

    /// The backend id is stable and carries a version.
    #[test]
    fn stub_embedder_backend_id_is_versioned() {
        let embedder = LocalStubEmbedder::new();
        let id = embedder.backend_id();
        assert_eq!(id, "stub:blake3-v1");
        assert_eq!(id, STUB_BACKEND_ID);
        // A version segment is required so future stubs can coexist.
        assert!(id.contains("-v"), "backend id must carry a version suffix");
    }

    /// The dimensionality is 128 and the produced vector has that length.
    #[test]
    fn stub_embedder_dim_is_128() {
        let embedder = LocalStubEmbedder::new();
        assert_eq!(embedder.dim(), 128);
        assert_eq!(STUB_DIM, 128);
        let vector = embedder.embed("anything", &[]).unwrap();
        assert_eq!(vector.len(), 128, "embed output length must equal dim()");
    }

    /// Output is L2-normalized: the norm is ~1.0, so cosine similarity on
    /// stub vectors reads as a dot product on unit vectors.
    #[test]
    fn stub_embedder_output_is_unit_normalized() {
        let embedder = LocalStubEmbedder::new();
        let english = owned_tags(&["english"]);
        let fox = embedder.embed("the quick brown fox", &english).unwrap();
        let norm = l2_norm(&fox);
        // 1e-5 is a comfortably loose f32 tolerance for a sum of 128 squared
        // values followed by a square root.
        assert!(
            (norm - 1.0).abs() < 1e-5,
            "expected unit-norm vector, got norm = {norm}"
        );

        // A vector compared with itself must score ~1.0; compared with a
        // different vector the score must stay within [-1, 1].
        let dog = embedder.embed("the lazy dog", &english).unwrap();
        let self_sim = cosine_similarity(&fox, &fox);
        let cross_sim = cosine_similarity(&fox, &dog);
        assert!(
            (self_sim - 1.0).abs() < 1e-5,
            "self-similarity must be ~1.0, got {self_sim}"
        );
        assert!(
            (-1.0..=1.0).contains(&cross_sim),
            "cross-similarity must be in [-1, 1], got {cross_sim}"
        );
        // The stub is NOT semantic, so nothing is asserted about the two
        // English sentences being similar. What is asserted is that they are
        // far from aligned, as expected from unrelated hash outputs.
        assert!(
            cross_sim.abs() < 0.5,
            "BLAKE3 avalanche should keep distinct-input similarity well below 1.0, \
             got {cross_sim}"
        );
    }

    /// Empty text with no tags is valid input and still yields a unit
    /// vector. (Guards against an off-by-one in the framing.)
    #[test]
    fn stub_embedder_handles_empty_inputs() {
        let embedder = LocalStubEmbedder::new();
        let vector = embedder.embed("", &[]).unwrap();
        assert_eq!(vector.len(), STUB_DIM);
        let norm = l2_norm(&vector);
        assert!(
            (norm - 1.0).abs() < 1e-5,
            "empty-input vector must still be unit-normalized, got {norm}"
        );
    }

    /// The length-prefix framing defeats the classic boundary-confusion
    /// collision: ("hello" + ["world"]) and ("helloworld" + []) MUST map to
    /// different vectors even though their raw bytes concatenate alike.
    #[test]
    fn stub_embedder_framing_resists_boundary_confusion() {
        let embedder = LocalStubEmbedder::new();
        let split = embedder.embed("hello", &owned_tags(&["world"])).unwrap();
        let joined = embedder.embed("helloworld", &[]).unwrap();
        assert_ne!(
            split, joined,
            "length-prefix framing must prevent boundary-confusion collision"
        );
    }
}