crtx-retrieval 0.1.1

//! Deterministic BLAKE3-based **stub** embedder backend (Phase 4.C / D2-B).
//!
//! # This is NOT a semantic embedder
//!
//! Read that line again. The vectors produced by [`LocalStubEmbedder`] are
//! reproducible — same `(text, tags)` produces the same bytes for the lifetime
//! of the `stub:blake3-v1` backend id — but they carry **no semantic
//! information**. Two inputs that differ by a single byte produce vectors that
//! look as different as two inputs from unrelated topics, because BLAKE3 is a
//! cryptographic hash with strong avalanche properties.
//!
//! Comparing two memories' stub embeddings reflects hash-collision properties
//! of BLAKE3 (effectively: never collide), not semantic similarity. Downstream
//! `cortex retrieval` operators should expect the stub to give **random
//! ordering** on real queries — which is the right behavior for "we don't
//! have a real embedder yet" and prevents operators from accidentally relying
//! on illusory semantic ranking.
//!
//! # Why it exists
//!
//! Phase 4.C is landing the composition pipeline that wires lexical retrieval,
//! FTS5 fuzzy match, and (eventually) semantic embeddings into a single
//! `score` call. The composer needs *some* `Embedder` to call against so the
//! pipeline can be tested end-to-end without depending on external ML weights,
//! ONNX runtimes, or training data. This stub is that "some Embedder".
//!
//! Replace this backend with a real one before claiming any **semantic
//! similarity** capability to operators. The replacement slots in behind the
//! [`super::Embedder`] trait — the composer does not change.
//!
//! # Determinism construction
//!
//! Given an input `(text, tags)`:
//!
//! 1. Build a canonical, length-prefixed input buffer:
//!    ```text
//!    DOMAIN_TAG_STUB_EMBED            // 1 byte: 0x10
//!    || text.len() as u64 (LE)        // 8 bytes
//!    || text bytes
//!    || tags.len() as u64 (LE)        // 8 bytes
//!    || for each tag in input order:
//!         tag.len() as u64 (LE)       // 8 bytes
//!         || tag bytes
//!    ```
//!    The length-prefix + domain-tag framing is the same defense against
//!    boundary-confusion collisions used by `cortex-ledger` event hashing
//!    (see `cortex_ledger::hash` docs).
//!
//! 2. Feed the buffer into a BLAKE3 [extendable-output][xof] reader and pull
//!    `STUB_DIM * 4 = 512` bytes. Each consecutive 4-byte chunk becomes one
//!    `f32` slot: every byte `b` in the chunk is mapped to
//!    `(b / 255.0) * 2.0 - 1.0` and the four results are averaged, in chunk
//!    order, to land in `[-1, 1]` with finer granularity than a single byte
//!    gives. (The chunk is never reinterpreted as a multi-byte integer, so
//!    endianness plays no part in this step.)
//!
//! 3. **L2-normalize** the vector so cosine similarity (the natural metric
//!    for downstream comparisons) behaves correctly. Without normalization,
//!    cosine similarity would still work mathematically, but the magnitude
//!    of the raw vector would carry implementation noise that propagates
//!    into score weights.
//!
//! [xof]: https://docs.rs/blake3/latest/blake3/struct.OutputReader.html
//!
//! # Backend id
//!
//! `backend_id() = "stub:blake3-v1"`. The `v1` suffix is mandatory: any change
//! to the input framing, dimensionality, byte-to-float mapping, or
//! normalization invalidates the deterministic contract for previously stored
//! vectors and MUST bump the version to `v2`, `v3`, etc. Future real backends
//! (e.g. `"onnx:minilm-l6-v2"`) coexist with this stub by carrying distinct
//! ids.
//!
//! # Slotting in a real backend
//!
//! Future real backends live in sibling modules (e.g. `embedding/onnx.rs`) and
//! implement the same [`super::Embedder`] trait. The composer accepts a
//! `Box<dyn Embedder>` (or `Arc<dyn Embedder>`) — wiring the new backend is a
//! one-line change at the call site. We deliberately do **not** feature-gate
//! the stub: the stub is harmless to compile and useful as a fallback when a
//! real backend is unavailable. (If a future requirement demands removing the
//! stub from release binaries, gate it then with `#[cfg(feature = "stub")]`;
//! today, runtime registration is sufficient.)

use blake3::Hasher;

use super::{EmbedResult, Embedder};

/// Domain tag separating stub-embed BLAKE3 inputs from any other BLAKE3
/// domain. Reserved: 0x10. Reusing this tag for a different purpose is a
/// cross-domain collision risk; pick a different byte.
const DOMAIN_TAG_STUB_EMBED: u8 = 0x10;

/// Backend id reported by [`LocalStubEmbedder::backend_id`].
///
/// The version suffix is part of the contract: bump it on any change that
/// affects the byte-level output of [`Embedder::embed`].
pub const STUB_BACKEND_ID: &str = "stub:blake3-v1";

/// Output dimensionality of the stub backend.
///
/// 128 is a round number large enough to exercise the composer's vector-math
/// code paths but small enough to keep test fixtures readable. Real backends
/// typically pick 384, 768, or 1536; the stub does not need to match.
pub const STUB_DIM: usize = 128;

/// Number of bytes pulled from the BLAKE3 XOF per `f32` slot.
///
/// The bytes of a slot are each mapped to `[-1, 1]` and then averaged, so the
/// slot value is driven by the *sum* of its four bytes (0..=1020) rather than
/// by the full 256^4 space of raw chunks: expect on the order of a thousand
/// distinct levels per component before normalization. That is still finer
/// than the 256 levels a single byte per slot would give, which is enough for
/// determinism testing but visibly coarse in test assertions.
const BYTES_PER_SLOT: usize = 4;

/// Deterministic BLAKE3-based stub embedder.
///
/// See the module docs for the full contract. Construct via
/// [`LocalStubEmbedder::new`] (zero-sized; no state is held).
///
/// Because it is a unit struct deriving `Clone` and `Copy`, values can be
/// passed around by value freely and every instance is interchangeable.
#[derive(Debug, Default, Clone, Copy)]
pub struct LocalStubEmbedder;

impl LocalStubEmbedder {
    /// Construct a stub embedder. Zero-cost; identical to `Default::default`.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}

impl Embedder for LocalStubEmbedder {
    fn backend_id(&self) -> &str {
        STUB_BACKEND_ID
    }

    fn dim(&self) -> usize {
        STUB_DIM
    }

    fn embed(&self, text: &str, tags: &[String]) -> EmbedResult<Vec<f32>> {
        // Step 1: build the framed input. Length prefixes + domain tag
        // prevent boundary-confusion across (text, tags) splits.
        let mut hasher = Hasher::new();
        hasher.update(&[DOMAIN_TAG_STUB_EMBED]);

        let text_bytes = text.as_bytes();
        hasher.update(&(text_bytes.len() as u64).to_le_bytes());
        hasher.update(text_bytes);

        hasher.update(&(tags.len() as u64).to_le_bytes());
        for tag in tags {
            let tag_bytes = tag.as_bytes();
            hasher.update(&(tag_bytes.len() as u64).to_le_bytes());
            hasher.update(tag_bytes);
        }

        // Step 2: pull STUB_DIM * BYTES_PER_SLOT bytes from the XOF and map
        // every 4-byte chunk to an averaged `[-1, 1]` f32.
        let mut xof = hasher.finalize_xof();
        let mut bytes = vec![0u8; STUB_DIM * BYTES_PER_SLOT];
        xof.fill(&mut bytes);

        let mut vec = Vec::with_capacity(STUB_DIM);
        for chunk in bytes.chunks_exact(BYTES_PER_SLOT) {
            // Map each byte to [-1, 1] and average. The result is a finer-
            // grained value in [-1, 1] than a single byte alone would give.
            let mut acc = 0.0f32;
            for &b in chunk {
                acc += (f32::from(b) / 255.0) * 2.0 - 1.0;
            }
            vec.push(acc / BYTES_PER_SLOT as f32);
        }

        // Step 3: L2-normalize so cosine similarity behaves naturally.
        let norm_sq: f32 = vec.iter().map(|x| x * x).sum();
        // A zero-norm output from BLAKE3 is astronomically improbable (it
        // would require every one of 512 bytes to be exactly 0x7F or
        // 0x80 in a balanced way), but we guard against it for clarity
        // rather than panicking.
        if norm_sq == 0.0 {
            return Ok(vec);
        }
        let inv_norm = norm_sq.sqrt().recip();
        for slot in &mut vec {
            *slot *= inv_norm;
        }

        Ok(vec)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::cosine_similarity;

    /// Turns string literals into the owned tag list `embed` expects.
    fn owned(tags: &[&str]) -> Vec<String> {
        tags.iter().map(|tag| (*tag).to_string()).collect()
    }

    /// L2 norm of `v`, accumulated in `f32`.
    fn l2_norm(v: &[f32]) -> f32 {
        v.iter().map(|x| x * x).sum::<f32>().sqrt()
    }

    /// Identical input must yield identical output, both on repeated calls
    /// to one instance and across separately constructed instances.
    #[test]
    fn stub_embedder_is_deterministic_for_identical_input() {
        let tags = owned(&["alpha", "beta"]);
        let embedder = LocalStubEmbedder::new();

        let first = embedder.embed("hello world", &tags).unwrap();
        let second = embedder.embed("hello world", &tags).unwrap();
        assert_eq!(
            first, second,
            "stub embedder must be deterministic across calls"
        );

        let from_fresh = LocalStubEmbedder::new()
            .embed("hello world", &tags)
            .unwrap();
        assert_eq!(
            first, from_fresh,
            "stub embedder must be instance-independent"
        );
    }

    /// Changing only the text changes the vector.
    #[test]
    fn stub_embedder_differs_for_different_text() {
        let tags = owned(&["alpha"]);
        let embedder = LocalStubEmbedder::new();

        let hello = embedder.embed("hello world", &tags).unwrap();
        let goodbye = embedder.embed("goodbye world", &tags).unwrap();
        assert_ne!(
            hello, goodbye,
            "different text must produce different vectors"
        );
    }

    /// Changing only the tags (content, count, or order) changes the vector.
    #[test]
    fn stub_embedder_differs_for_different_tags() {
        let embedder = LocalStubEmbedder::new();

        let alpha = embedder.embed("hello", &owned(&["alpha"])).unwrap();
        let beta = embedder.embed("hello", &owned(&["beta"])).unwrap();
        assert_ne!(alpha, beta, "different tags must produce different vectors");

        // Zero tags versus one tag: covers the tag-count prefix.
        let untagged = embedder.embed("hello", &[]).unwrap();
        assert_ne!(alpha, untagged, "no tags vs one tag must differ");

        // Tags are hashed in input order, so swapping them must change the
        // result. This makes any future tag canonicalization show up as a
        // test failure instead of a silent behavior change.
        let forward = embedder.embed("hello", &owned(&["a", "b"])).unwrap();
        let reversed = embedder.embed("hello", &owned(&["b", "a"])).unwrap();
        assert_ne!(
            forward, reversed,
            "tag order must be part of the determinism key"
        );
    }

    /// The backend id is the documented constant and carries a version.
    #[test]
    fn stub_embedder_backend_id_is_versioned() {
        let id = LocalStubEmbedder::new().backend_id().to_owned();
        assert_eq!(id, "stub:blake3-v1");
        assert_eq!(id, STUB_BACKEND_ID);
        // A version segment is required so later stub revisions can coexist.
        assert!(id.contains("-v"), "backend id must carry a version suffix");
    }

    /// `dim()`, `STUB_DIM`, and the produced vector length all agree on 128.
    #[test]
    fn stub_embedder_dim_is_128() {
        let embedder = LocalStubEmbedder::new();
        assert_eq!(STUB_DIM, 128);
        assert_eq!(embedder.dim(), 128);

        let produced = embedder.embed("anything", &[]).unwrap();
        assert_eq!(produced.len(), 128, "embed output length must equal dim()");
    }

    /// The output has unit L2 norm, so `cosine_similarity` on stub vectors
    /// behaves like a dot product on unit vectors.
    #[test]
    fn stub_embedder_output_is_unit_normalized() {
        let embedder = LocalStubEmbedder::new();
        let tags = owned(&["english"]);

        let v = embedder.embed("the quick brown fox", &tags).unwrap();
        let norm = l2_norm(&v);
        // Only 128 squared terms are summed in f32, so 1e-5 leaves ample
        // headroom over the accumulated rounding error.
        assert!(
            (norm - 1.0).abs() < 1e-5,
            "expected unit-norm vector, got norm = {norm}"
        );

        let v2 = embedder.embed("the lazy dog", &tags).unwrap();

        // A vector compared with itself must score ~1.0.
        let self_sim = cosine_similarity(&v, &v);
        assert!(
            (self_sim - 1.0).abs() < 1e-5,
            "self-similarity must be ~1.0, got {self_sim}"
        );

        // Any pair must score within the valid cosine range.
        let cross_sim = cosine_similarity(&v, &v2);
        assert!(
            (-1.0..=1.0).contains(&cross_sim),
            "cross-similarity must be in [-1, 1], got {cross_sim}"
        );

        // No semantic claim is made about two English sentences sharing a
        // tag. The only expectation is that hashing distinct inputs leaves
        // the two vectors far from aligned.
        assert!(
            cross_sim.abs() < 0.5,
            "BLAKE3 avalanche should keep distinct-input similarity well below 1.0, \
             got {cross_sim}"
        );
    }

    /// Empty text with no tags is accepted and still yields a full-length,
    /// unit-norm vector (guards the framing against off-by-one mistakes).
    #[test]
    fn stub_embedder_handles_empty_inputs() {
        let produced = LocalStubEmbedder::new().embed("", &[]).unwrap();
        assert_eq!(produced.len(), STUB_DIM);

        let norm = l2_norm(&produced);
        assert!(
            (norm - 1.0).abs() < 1e-5,
            "empty-input vector must still be unit-normalized, got {norm}"
        );
    }

    /// Length prefixes keep ("hello", ["world"]) and ("helloworld", []) apart
    /// even though their unframed byte concatenations coincide.
    #[test]
    fn stub_embedder_framing_resists_boundary_confusion() {
        let embedder = LocalStubEmbedder::new();
        let split = embedder.embed("hello", &owned(&["world"])).unwrap();
        let joined = embedder.embed("helloworld", &[]).unwrap();
        assert_ne!(
            split, joined,
            "length-prefix framing must prevent boundary-confusion collision"
        );
    }
}