smooth_operator/embedding.rs
1//! Text → vector embedding — the shared seam for dense retrieval.
2//!
3//! Both the Postgres adapter (pgvector knowledge base) and the ingestion
4//! pipeline need to turn text into dense vectors, and they must agree byte-for-
5//! byte: a document embedded at ingest time and a query embedded at retrieval
6//! time only land close together if they went through the *same* projection.
7//! This module is that one shared home, so the two consumers can never drift.
8//!
9//! - [`Embedder`] — the provider-agnostic trait. One vector per input string,
10//! each of length [`Embedder::dim`].
11//! - [`DeterministicEmbedder`] — the **default**. A stable hash-based
12//! pseudo-embedding (FNV-1a token hashing, L2-normalized, no network), so
13//! conformance tests are reproducible with zero API calls and zero cost.
14//! Dimension is configurable; the Postgres schema defaults to **1024**
15//! (mirrors smooai's `knowledge_vectors embedding vector(1024)`, Voyage
16//! `voyage-3-large` shape).
17//! - [`cosine_similarity`] — a small helper for comparing two vectors (used by
18//! tests and any in-memory ranking that wants to score by dense similarity).
19//!
20//! Provider-backed embedders live with their consumer: the Postgres adapter's
21//! `GatewayEmbedder` (an OpenAI-compatible `/v1/embeddings` HTTP client over the
22//! SmooAI LiteLLM gateway) implements this same [`Embedder`] trait but stays in
23//! the adapter crate so `core` keeps no heavy HTTP dependency on the dense path.
24//!
25//! ## Dimension decision
26//!
27//! Voyage (`voyage-3-large`, 1024-d) is the production north-star (it backs
28//! smooai's `knowledge_vectors`), but Voyage is *not* exposed on the LiteLLM
29//! gateway. The gateway does expose OpenAI `text-embedding-3-small` (1536-d).
//! Rather than hard-code the column width to one provider's shape (1024 or
//! 1536), the vector dimension is a first-class parameter — the Postgres
//! adapter takes its `vector(N)` column width from `embedder.dim()`, so dense
//! retrieval is always dimension-consistent.
34
35use anyhow::Result;
36use async_trait::async_trait;
37
/// Embedding width used when no explicit dimension is requested.
///
/// 1024 matches the Voyage `voyage-3-large` output shape and mirrors smooai's
/// `knowledge_vectors embedding vector(1024)` column.
pub const DEFAULT_EMBEDDING_DIM: usize = 1_024;
41
/// The role a piece of text plays in retrieval: stored document or search query.
///
/// Voyage and most modern embedding models embed the two roles differently
/// (asymmetric retrieval). The deterministic embedder in this module ignores
/// the value; provider-backed embedders (e.g. the adapter's `GatewayEmbedder`)
/// are expected to forward it on the request unchanged. Carrying the parameter
/// now keeps the seam honest for when a Voyage-native gateway lands.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum InputType {
    /// The text is a corpus document being embedded for storage.
    Document,
    /// The text is a user query being embedded for retrieval.
    Query,
}
55
56/// Turn text into dense vectors. Implementations must return one vector per
57/// input string, each of length [`Embedder::dim`].
58#[async_trait]
59pub trait Embedder: Send + Sync {
60 /// The fixed output dimension. Must equal the `vector(N)` column width.
61 fn dim(&self) -> usize;
62
63 /// Embed a batch of texts. Returns `texts.len()` vectors, each `dim()` long.
64 ///
65 /// # Errors
66 /// Returns an error if the backing embedding service fails.
67 async fn embed(&self, texts: &[String], input_type: InputType) -> Result<Vec<Vec<f32>>>;
68}
69
/// Network-free pseudo-embedder whose output depends only on the input text.
///
/// Each text goes through a token-hashing bag-of-words projection into a
/// fixed-width vector that is then L2-normalized, so cosine distance is
/// well-behaved. The same text always produces the same vector, which keeps
/// pgvector retrieval and ingestion tests reproducible without any external
/// service: a document and a query that share salient tokens land close
/// together in the projected space.
#[derive(Clone, Debug)]
pub struct DeterministicEmbedder {
    /// Number of components in every vector this embedder produces.
    dim: usize,
}
81
82impl DeterministicEmbedder {
83 /// Build with the [`DEFAULT_EMBEDDING_DIM`] (1024).
84 #[must_use]
85 pub fn new() -> Self {
86 Self {
87 dim: DEFAULT_EMBEDDING_DIM,
88 }
89 }
90
91 /// Build with a custom dimension (must match the adapter's `vector(N)`).
92 #[must_use]
93 pub fn with_dim(dim: usize) -> Self {
94 Self { dim }
95 }
96
97 /// FNV-1a hash of a token — cheap and stable across runs/platforms.
98 fn hash_token(token: &str) -> u64 {
99 let mut hash: u64 = 0xcbf2_9ce4_8422_2325;
100 for b in token.bytes() {
101 hash ^= u64::from(b);
102 hash = hash.wrapping_mul(0x0000_0100_0000_01b3);
103 }
104 hash
105 }
106
107 /// Project one text into a normalized vector of `self.dim` floats.
108 fn embed_one(&self, text: &str) -> Vec<f32> {
109 let mut v = vec![0.0_f32; self.dim];
110 let lower = text.to_lowercase();
111 let tokens: Vec<&str> = lower
112 .split(|c: char| !c.is_alphanumeric())
113 .filter(|t| !t.is_empty())
114 .collect();
115
116 for token in tokens {
117 let h = Self::hash_token(token);
118 // Two hashed buckets per token with deterministic signs spreads the
119 // signal so distinct tokens rarely fully collide.
120 let idx_a = (h % self.dim as u64) as usize;
121 let idx_b = ((h >> 32) % self.dim as u64) as usize;
122 let sign_a = if (h & 1) == 0 { 1.0 } else { -1.0 };
123 let sign_b = if (h & 2) == 0 { 1.0 } else { -1.0 };
124 v[idx_a] += sign_a;
125 v[idx_b] += sign_b;
126 }
127
128 // L2-normalize so all vectors live on the unit sphere (cosine == dot).
129 let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
130 if norm > 0.0 {
131 for x in &mut v {
132 *x /= norm;
133 }
134 }
135 v
136 }
137}
138
139impl Default for DeterministicEmbedder {
140 fn default() -> Self {
141 Self::new()
142 }
143}
144
145#[async_trait]
146impl Embedder for DeterministicEmbedder {
147 fn dim(&self) -> usize {
148 self.dim
149 }
150
151 async fn embed(&self, texts: &[String], _input_type: InputType) -> Result<Vec<Vec<f32>>> {
152 Ok(texts.iter().map(|t| self.embed_one(t)).collect())
153 }
154}
155
/// Cosine similarity of two equal-length vectors.
///
/// Returns the dot product over the product of the L2 norms, clamped to
/// `[-1.0, 1.0]`. Degenerate inputs return `0.0` (orthogonal) rather than
/// `NaN`, so callers can rank without guarding. Degenerate means: either
/// slice is empty, the lengths differ, either vector has zero norm, or the
/// inputs contain non-finite components.
///
/// Sums are accumulated in `f64` so that very large or very small `f32`
/// components neither overflow to infinity nor underflow to a zero norm.
#[must_use]
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // One pass: dot product and both squared norms, widened to f64.
    let (dot, sq_a, sq_b) = a.iter().zip(b).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(dot, sq_a, sq_b), (&x, &y)| {
            let (x, y) = (f64::from(x), f64::from(y));
            (dot + x * y, sq_a + x * x, sq_b + y * y)
        },
    );

    let denom = sq_a.sqrt() * sq_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }

    let cosine = dot / denom;
    if !cosine.is_finite() {
        // NaN or infinite inputs: treat as "no usable similarity".
        return 0.0;
    }
    // The value is already within rounding distance of [-1, 1]; narrowing to
    // f32 is safe, and the clamp removes any last-bit overshoot.
    (cosine as f32).clamp(-1.0, 1.0)
}
174
#[cfg(test)]
mod tests {
    use super::*;

    /// Embed `texts` with `embedder` under the given `input_type`, panicking
    /// if the embedder reports an error.
    async fn embed_as(
        embedder: &DeterministicEmbedder,
        texts: &[&str],
        input_type: InputType,
    ) -> Vec<Vec<f32>> {
        let owned: Vec<String> = texts.iter().map(|t| (*t).to_string()).collect();
        embedder.embed(&owned, input_type).await.unwrap()
    }

    /// The same text embeds to the same default-width, unit-length vector
    /// whether it is tagged as a document or as a query.
    #[tokio::test]
    async fn deterministic_is_stable_and_normalized() {
        let embedder = DeterministicEmbedder::new();
        let as_document = embed_as(&embedder, &["hello world"], InputType::Document).await;
        let as_query = embed_as(&embedder, &["hello world"], InputType::Query).await;

        assert_eq!(as_document[0].len(), DEFAULT_EMBEDDING_DIM);
        assert_eq!(as_document, as_query, "same text must yield the same vector");

        let norm: f32 = as_document[0].iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-4, "expected unit norm, got {norm}");
    }

    /// Texts that share most tokens score a higher cosine similarity than
    /// texts that share none.
    #[tokio::test]
    async fn deterministic_similar_text_is_closer() {
        let embedder = DeterministicEmbedder::new();
        let corpus = [
            "the quick brown fox jumps",
            "the quick brown fox leaps",
            "completely unrelated banana finance report",
        ];
        let vecs = embed_as(&embedder, &corpus, InputType::Document).await;

        let close = cosine_similarity(&vecs[0], &vecs[1]);
        let far = cosine_similarity(&vecs[0], &vecs[2]);
        assert!(
            close > far,
            "shared-token texts should be more similar ({close} vs {far})"
        );
    }

    /// A custom dimension is reflected in the length of the produced vectors.
    #[tokio::test]
    async fn custom_dim_respected() {
        let embedder = DeterministicEmbedder::with_dim(1536);
        let vectors = embed_as(&embedder, &["x"], InputType::Document).await;
        assert_eq!(vectors[0].len(), 1536);
    }

    /// Drift guard: a known input must keep projecting to the same 16-d vector
    /// (within 1e-5). If the hashing/projection ever changes (in this crate or
    /// in a consumer that imports it), this catches it before retrieval changes.
    #[tokio::test]
    async fn known_input_produces_known_vector_prefix() {
        // Captured from the FNV-1a token-hash projection (L2-normalized) at the
        // time of consolidation. Any change to the algorithm shifts these.
        const SINGLE: f32 = -0.28867513;
        const TRIPLE: f32 = -0.86602545;
        let expected: [f32; 16] = [
            SINGLE, 0.0, 0.0, SINGLE, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, SINGLE, 0.0,
            TRIPLE,
        ];

        let embedder = DeterministicEmbedder::with_dim(16);
        let vectors = embed_as(&embedder, &["return policy refund"], InputType::Document).await;
        let actual = &vectors[0];
        assert_eq!(actual.len(), 16);

        for (i, (got, want)) in actual.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-5,
                "vector drift at index {i}: got {got}, expected {want}"
            );
        }
    }

    /// Identical vectors score 1, orthogonal vectors score 0, and degenerate
    /// inputs score exactly 0.
    #[test]
    fn cosine_similarity_basics() {
        let identical = cosine_similarity(&[1.0, 0.0], &[1.0, 0.0]);
        assert!((identical - 1.0).abs() < 1e-6);

        let orthogonal = cosine_similarity(&[1.0, 0.0], &[0.0, 1.0]);
        assert!(orthogonal.abs() < 1e-6);

        // Mismatched length / zero norm → 0.0, never NaN.
        assert_eq!(cosine_similarity(&[1.0, 0.0], &[1.0]), 0.0);
        assert_eq!(cosine_similarity(&[0.0, 0.0], &[1.0, 1.0]), 0.0);
    }
}
275}