Skip to main content

smooth_operator/
embedding.rs

1//! Text → vector embedding — the shared seam for dense retrieval.
2//!
//! Both the Postgres adapter (pgvector knowledge base) and the ingestion
//! pipeline need to turn text into dense vectors, and they must agree
//! byte-for-byte: a document embedded at ingest time and a query embedded at
//! retrieval time only land close together if they went through the *same*
//! projection. This module is that one shared home, so the two consumers can
//! never drift.
8//!
9//! - [`Embedder`] — the provider-agnostic trait. One vector per input string,
10//!   each of length [`Embedder::dim`].
11//! - [`DeterministicEmbedder`] — the **default**. A stable hash-based
12//!   pseudo-embedding (FNV-1a token hashing, L2-normalized, no network), so
13//!   conformance tests are reproducible with zero API calls and zero cost.
14//!   Dimension is configurable; the Postgres schema defaults to **1024**
15//!   (mirrors smooai's `knowledge_vectors embedding vector(1024)`, Voyage
16//!   `voyage-3-large` shape).
17//! - [`cosine_similarity`] — a small helper for comparing two vectors (used by
18//!   tests and any in-memory ranking that wants to score by dense similarity).
19//!
20//! Provider-backed embedders live with their consumer: the Postgres adapter's
21//! `GatewayEmbedder` (an OpenAI-compatible `/v1/embeddings` HTTP client over the
22//! SmooAI LiteLLM gateway) implements this same [`Embedder`] trait but stays in
23//! the adapter crate so `core` keeps no heavy HTTP dependency on the dense path.
24//!
25//! ## Dimension decision
26//!
27//! Voyage (`voyage-3-large`, 1024-d) is the production north-star (it backs
28//! smooai's `knowledge_vectors`), but Voyage is *not* exposed on the LiteLLM
29//! gateway. The gateway does expose OpenAI `text-embedding-3-small` (1536-d).
30//! Rather than couple the column width to whichever embedder happens to be
31//! configured, the vector dimension is a first-class parameter — the Postgres
32//! adapter takes its `vector(N)` column width from `embedder.dim()`, so dense
33//! retrieval is always dimension-consistent.
34
35use anyhow::Result;
36use async_trait::async_trait;
37
/// Embedding width used when a caller does not pick one explicitly.
///
/// 1024 follows the Voyage `voyage-3-large` output shape and mirrors smooai's
/// `knowledge_vectors embedding vector(1024)` column.
pub const DEFAULT_EMBEDDING_DIM: usize = 1024;
41
/// Role of the text being embedded: stored corpus document or search query.
///
/// Voyage and most modern embedding models treat the two differently
/// (asymmetric retrieval). The deterministic embedder in this module ignores
/// the value; a provider-backed embedder (e.g. the adapter's `GatewayEmbedder`)
/// is expected to forward it on the request unchanged. Carrying the parameter
/// now keeps the seam honest for when a Voyage-native gateway lands.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum InputType {
    /// The text is a corpus document being embedded for storage.
    Document,
    /// The text is a user query being embedded for retrieval.
    Query,
}
55
56/// Turn text into dense vectors. Implementations must return one vector per
57/// input string, each of length [`Embedder::dim`].
58#[async_trait]
59pub trait Embedder: Send + Sync {
60    /// The fixed output dimension. Must equal the `vector(N)` column width.
61    fn dim(&self) -> usize;
62
63    /// Embed a batch of texts. Returns `texts.len()` vectors, each `dim()` long.
64    ///
65    /// # Errors
66    /// Returns an error if the backing embedding service fails.
67    async fn embed(&self, texts: &[String], input_type: InputType) -> Result<Vec<Vec<f32>>>;
68}
69
70/// Deterministic, network-free pseudo-embedder.
71///
72/// Produces a stable vector from the text via a token-hashing bag-of-words
73/// projection, then L2-normalizes it so cosine distance is well-behaved. Same
74/// text → same vector, always. This makes pgvector retrieval (and ingestion)
75/// tests reproducible without any external service: a document and a query that
76/// share salient tokens land close together in the projected space.
77#[derive(Debug, Clone)]
78pub struct DeterministicEmbedder {
79    dim: usize,
80}
81
82impl DeterministicEmbedder {
83    /// Build with the [`DEFAULT_EMBEDDING_DIM`] (1024).
84    #[must_use]
85    pub fn new() -> Self {
86        Self {
87            dim: DEFAULT_EMBEDDING_DIM,
88        }
89    }
90
91    /// Build with a custom dimension (must match the adapter's `vector(N)`).
92    #[must_use]
93    pub fn with_dim(dim: usize) -> Self {
94        Self { dim }
95    }
96
97    /// FNV-1a hash of a token — cheap and stable across runs/platforms.
98    fn hash_token(token: &str) -> u64 {
99        let mut hash: u64 = 0xcbf2_9ce4_8422_2325;
100        for b in token.bytes() {
101            hash ^= u64::from(b);
102            hash = hash.wrapping_mul(0x0000_0100_0000_01b3);
103        }
104        hash
105    }
106
107    /// Project one text into a normalized vector of `self.dim` floats.
108    fn embed_one(&self, text: &str) -> Vec<f32> {
109        let mut v = vec![0.0_f32; self.dim];
110        let lower = text.to_lowercase();
111        let tokens: Vec<&str> = lower
112            .split(|c: char| !c.is_alphanumeric())
113            .filter(|t| !t.is_empty())
114            .collect();
115
116        for token in tokens {
117            let h = Self::hash_token(token);
118            // Two hashed buckets per token with deterministic signs spreads the
119            // signal so distinct tokens rarely fully collide.
120            let idx_a = (h % self.dim as u64) as usize;
121            let idx_b = ((h >> 32) % self.dim as u64) as usize;
122            let sign_a = if (h & 1) == 0 { 1.0 } else { -1.0 };
123            let sign_b = if (h & 2) == 0 { 1.0 } else { -1.0 };
124            v[idx_a] += sign_a;
125            v[idx_b] += sign_b;
126        }
127
128        // L2-normalize so all vectors live on the unit sphere (cosine == dot).
129        let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
130        if norm > 0.0 {
131            for x in &mut v {
132                *x /= norm;
133            }
134        }
135        v
136    }
137}
138
139impl Default for DeterministicEmbedder {
140    fn default() -> Self {
141        Self::new()
142    }
143}
144
145#[async_trait]
146impl Embedder for DeterministicEmbedder {
147    fn dim(&self) -> usize {
148        self.dim
149    }
150
151    async fn embed(&self, texts: &[String], _input_type: InputType) -> Result<Vec<Vec<f32>>> {
152        Ok(texts.iter().map(|t| self.embed_one(t)).collect())
153    }
154}
155
/// Cosine similarity of two equal-length vectors.
///
/// Returns the dot product over the product of L2 norms, clamped to
/// `[-1.0, 1.0]`. If either vector is zero-length, mismatched in length, or has
/// zero norm — or if the result would not be finite (e.g. an input contains
/// `NaN` or an infinity) — returns `0.0` (orthogonal) rather than `NaN`, so
/// callers can rank without guarding.
///
/// Sums are accumulated in `f64`: the square of any finite `f32` fits in an
/// `f64`, so very large components no longer overflow to infinity (which
/// previously produced `NaN`) and very small ones no longer underflow to a
/// bogus zero norm.
#[must_use]
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // One pass over both slices: dot product and both squared norms.
    let (dot, sq_a, sq_b) = a.iter().zip(b).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(dot, sq_a, sq_b), (&x, &y)| {
            let (x, y) = (f64::from(x), f64::from(y));
            (dot + x * y, sq_a + x * x, sq_b + y * y)
        },
    );
    if sq_a == 0.0 || sq_b == 0.0 {
        return 0.0;
    }

    let cosine = dot / (sq_a.sqrt() * sq_b.sqrt());
    if cosine.is_finite() {
        // Rounding can push the ratio a hair past ±1; the clamp keeps the
        // documented range. The value is in [-1, 1], so narrowing to `f32`
        // only rounds.
        cosine.clamp(-1.0, 1.0) as f32
    } else {
        0.0
    }
}
174
#[cfg(test)]
mod tests {
    use super::*;

    /// Embeds one text and returns its single vector.
    async fn embed_single(
        e: &DeterministicEmbedder,
        text: &str,
        input_type: InputType,
    ) -> Vec<f32> {
        let mut batch = e.embed(&[text.to_string()], input_type).await.unwrap();
        batch.swap_remove(0)
    }

    /// Same text must give the same unit-length vector regardless of
    /// `InputType`.
    #[tokio::test]
    async fn deterministic_is_stable_and_normalized() {
        let e = DeterministicEmbedder::new();
        let a = embed_single(&e, "hello world", InputType::Document).await;
        let b = embed_single(&e, "hello world", InputType::Query).await;

        assert_eq!(a.len(), DEFAULT_EMBEDDING_DIM);
        assert_eq!(a, b, "same text must yield the same vector");

        let norm = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-4, "expected unit norm, got {norm}");
    }

    /// Texts sharing most tokens must score higher than unrelated texts.
    #[tokio::test]
    async fn deterministic_similar_text_is_closer() {
        let texts = [
            "the quick brown fox jumps",
            "the quick brown fox leaps",
            "completely unrelated banana finance report",
        ]
        .map(String::from);

        let vecs = DeterministicEmbedder::new()
            .embed(&texts, InputType::Document)
            .await
            .unwrap();

        let close = cosine_similarity(&vecs[0], &vecs[1]);
        let far = cosine_similarity(&vecs[0], &vecs[2]);
        assert!(
            close > far,
            "shared-token texts should be more similar ({close} vs {far})"
        );
    }

    /// A non-default dimension must be reflected in the output length.
    #[tokio::test]
    async fn custom_dim_respected() {
        let e = DeterministicEmbedder::with_dim(1536);
        let v = embed_single(&e, "x", InputType::Document).await;
        assert_eq!(v.len(), 1536);
    }

    /// Byte-identical-vector guard: a known input must always project to the same
    /// vector prefix. If the hashing/projection ever drifts (across this crate or
    /// a consumer that imports it), this catches it before retrieval changes.
    #[tokio::test]
    async fn known_input_produces_known_vector_prefix() {
        // Captured from the FNV-1a token-hash projection (L2-normalized) at the
        // time of consolidation. Any change to the algorithm shifts these.
        const ONE: f32 = -0.28867513;
        const THREE: f32 = -0.86602545;
        let expected: [f32; 16] = [
            ONE, 0.0, 0.0, ONE, 0.0, 0.0, 0.0, 0.0, //
            0.0, 0.0, 0.0, 0.0, 0.0, ONE, 0.0, THREE,
        ];

        let e = DeterministicEmbedder::with_dim(16);
        let v = embed_single(&e, "return policy refund", InputType::Document).await;
        assert_eq!(v.len(), 16);

        for (i, (got, want)) in v.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-5,
                "vector drift at index {i}: got {got}, expected {want}"
            );
        }
    }

    /// Identical, orthogonal, mismatched-length, and zero-norm inputs.
    #[test]
    fn cosine_similarity_basics() {
        let same = cosine_similarity(&[1.0, 0.0], &[1.0, 0.0]);
        assert!((same - 1.0).abs() < 1e-6);

        let orthogonal = cosine_similarity(&[1.0, 0.0], &[0.0, 1.0]);
        assert!(orthogonal.abs() < 1e-6);

        // Mismatched length / zero norm → 0.0, never NaN.
        assert_eq!(cosine_similarity(&[1.0, 0.0], &[1.0]), 0.0);
        assert_eq!(cosine_similarity(&[0.0, 0.0], &[1.0, 1.0]), 0.0);
    }
}