use blake3::Hasher;
use super::{EmbedResult, Embedder};
/// First byte absorbed by the hasher in `LocalStubEmbedder::embed`, ahead of the
/// text and tags. NOTE(review): presumably keeps this hash use distinct from other
/// BLAKE3 uses in the project — confirm against the other domain tags.
const DOMAIN_TAG_STUB_EMBED: u8 = 0x10;
/// Backend identifier string returned by `LocalStubEmbedder::backend_id`.
pub const STUB_BACKEND_ID: &str = "stub:blake3-v1";
/// Number of `f32` components in every vector produced by the stub embedder.
pub const STUB_DIM: usize = 128;
/// Number of hash-output bytes that are averaged into one vector component.
const BYTES_PER_SLOT: usize = 4;
/// Stateless embedder whose output is derived solely from a BLAKE3 hash of the
/// input text and tags.
///
/// The type carries no fields, so every instance behaves identically.
#[derive(Debug, Default, Clone, Copy)]
pub struct LocalStubEmbedder;
impl LocalStubEmbedder {
/// Creates a stub embedder.
///
/// Declared `const`, so it can also be called in constant contexts.
#[must_use]
pub const fn new() -> Self {
Self
}
}
impl Embedder for LocalStubEmbedder {
fn backend_id(&self) -> &str {
STUB_BACKEND_ID
}
fn dim(&self) -> usize {
STUB_DIM
}
fn embed(&self, text: &str, tags: &[String]) -> EmbedResult<Vec<f32>> {
let mut hasher = Hasher::new();
hasher.update(&[DOMAIN_TAG_STUB_EMBED]);
let text_bytes = text.as_bytes();
hasher.update(&(text_bytes.len() as u64).to_le_bytes());
hasher.update(text_bytes);
hasher.update(&(tags.len() as u64).to_le_bytes());
for tag in tags {
let tag_bytes = tag.as_bytes();
hasher.update(&(tag_bytes.len() as u64).to_le_bytes());
hasher.update(tag_bytes);
}
let mut xof = hasher.finalize_xof();
let mut bytes = vec![0u8; STUB_DIM * BYTES_PER_SLOT];
xof.fill(&mut bytes);
let mut vec = Vec::with_capacity(STUB_DIM);
for chunk in bytes.chunks_exact(BYTES_PER_SLOT) {
let mut acc = 0.0f32;
for &b in chunk {
acc += (f32::from(b) / 255.0) * 2.0 - 1.0;
}
vec.push(acc / BYTES_PER_SLOT as f32);
}
let norm_sq: f32 = vec.iter().map(|x| x * x).sum();
if norm_sq == 0.0 {
return Ok(vec);
}
let inv_norm = norm_sq.sqrt().recip();
for slot in &mut vec {
*slot *= inv_norm;
}
Ok(vec)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::cosine_similarity;

    /// Turns a list of string literals into the owned tag list `embed` expects.
    fn owned_tags(raw: &[&str]) -> Vec<String> {
        raw.iter().map(|tag| (*tag).to_string()).collect()
    }

    /// Euclidean length of `v`, accumulated in `f32`.
    fn l2_norm(v: &[f32]) -> f32 {
        v.iter().map(|x| x * x).sum::<f32>().sqrt()
    }

    /// Same input must yield the same vector, on one instance and across instances.
    #[test]
    fn stub_embedder_is_deterministic_for_identical_input() {
        let tags = owned_tags(&["alpha", "beta"]);
        let embedder = LocalStubEmbedder::new();
        let first = embedder.embed("hello world", &tags).unwrap();
        let second = embedder.embed("hello world", &tags).unwrap();
        assert_eq!(
            first, second,
            "stub embedder must be deterministic across calls"
        );
        let from_fresh_instance = LocalStubEmbedder::new()
            .embed("hello world", &tags)
            .unwrap();
        assert_eq!(
            first, from_fresh_instance,
            "stub embedder must be instance-independent"
        );
    }

    /// Changing only the text must change the vector.
    #[test]
    fn stub_embedder_differs_for_different_text() {
        let tags = owned_tags(&["alpha"]);
        let embedder = LocalStubEmbedder::new();
        let hello = embedder.embed("hello world", &tags).unwrap();
        let goodbye = embedder.embed("goodbye world", &tags).unwrap();
        assert_ne!(
            hello, goodbye,
            "different text must produce different vectors"
        );
    }

    /// Tag content, tag presence and tag order must each change the vector.
    #[test]
    fn stub_embedder_differs_for_different_tags() {
        let embedder = LocalStubEmbedder::new();
        let alpha = embedder.embed("hello", &owned_tags(&["alpha"])).unwrap();
        let beta = embedder.embed("hello", &owned_tags(&["beta"])).unwrap();
        assert_ne!(
            alpha, beta,
            "different tags must produce different vectors"
        );
        let untagged = embedder.embed("hello", &owned_tags(&[])).unwrap();
        assert_ne!(alpha, untagged, "no tags vs one tag must differ");
        let ab = embedder.embed("hello", &owned_tags(&["a", "b"])).unwrap();
        let ba = embedder.embed("hello", &owned_tags(&["b", "a"])).unwrap();
        assert_ne!(ab, ba, "tag order must be part of the determinism key");
    }

    /// The backend id is pinned and carries a version marker.
    #[test]
    fn stub_embedder_backend_id_is_versioned() {
        let embedder = LocalStubEmbedder::new();
        let id = embedder.backend_id();
        assert_eq!(id, "stub:blake3-v1");
        assert_eq!(id, STUB_BACKEND_ID);
        assert!(id.contains("-v"), "backend id must carry a version suffix");
    }

    /// `dim()`, the public constant and the actual output length all agree on 128.
    #[test]
    fn stub_embedder_dim_is_128() {
        let embedder = LocalStubEmbedder::new();
        assert_eq!(embedder.dim(), 128);
        assert_eq!(STUB_DIM, 128);
        let produced = embedder.embed("anything", &[]).unwrap();
        assert_eq!(produced.len(), 128, "embed output length must equal dim()");
    }

    /// Output has unit length, and unrelated inputs are far from parallel.
    #[test]
    fn stub_embedder_output_is_unit_normalized() {
        let embedder = LocalStubEmbedder::new();
        let english = owned_tags(&["english"]);
        let fox = embedder.embed("the quick brown fox", &english).unwrap();
        let norm = l2_norm(&fox);
        assert!(
            (norm - 1.0).abs() < 1e-5,
            "expected unit-norm vector, got norm = {norm}"
        );
        let dog = embedder.embed("the lazy dog", &english).unwrap();
        let self_sim = cosine_similarity(&fox, &fox);
        let cross_sim = cosine_similarity(&fox, &dog);
        assert!(
            (self_sim - 1.0).abs() < 1e-5,
            "self-similarity must be ~1.0, got {self_sim}"
        );
        assert!(
            (-1.0..=1.0).contains(&cross_sim),
            "cross-similarity must be in [-1, 1], got {cross_sim}"
        );
        assert!(
            cross_sim.abs() < 0.5,
            "BLAKE3 avalanche should keep distinct-input similarity well below 1.0, \
             got {cross_sim}"
        );
    }

    /// Empty text with no tags still produces a full-length unit vector.
    #[test]
    fn stub_embedder_handles_empty_inputs() {
        let embedder = LocalStubEmbedder::new();
        let produced = embedder.embed("", &[]).unwrap();
        assert_eq!(produced.len(), STUB_DIM);
        let norm = l2_norm(&produced);
        assert!(
            (norm - 1.0).abs() < 1e-5,
            "empty-input vector must still be unit-normalized, got {norm}"
        );
    }

    /// Moving bytes from a tag into the text must not produce the same vector.
    #[test]
    fn stub_embedder_framing_resists_boundary_confusion() {
        let embedder = LocalStubEmbedder::new();
        let split = embedder.embed("hello", &owned_tags(&["world"])).unwrap();
        let joined = embedder.embed("helloworld", &[]).unwrap();
        assert_ne!(
            split, joined,
            "length-prefix framing must prevent boundary-confusion collision"
        );
    }
}