// rlm_rs/embedding/fallback.rs
//! Hash-based fallback embedder.
//!
//! Provides deterministic pseudo-embeddings when `FastEmbed` is not available.
//! Uses content hashing to generate reproducible embeddings that cluster
//! similar text together (based on word overlap, not semantics).

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

use crate::Result;
use crate::embedding::Embedder;
11
/// Hash-based fallback embedder.
///
/// Generates deterministic pseudo-embeddings using a combination of:
/// - Word-level hashing for vocabulary capture
/// - Character n-gram hashing for fuzzy matching
/// - Normalization to unit length for cosine similarity
///
/// This is NOT semantic similarity - it's based on lexical overlap.
/// Use `FastEmbed` for true semantic understanding.
///
/// # Examples
///
/// ```
/// use rlm_rs::embedding::{Embedder, FallbackEmbedder, DEFAULT_DIMENSIONS};
///
/// let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
/// let emb1 = embedder.embed("hello world").unwrap();
/// let emb2 = embedder.embed("hello world").unwrap();
/// assert_eq!(emb1, emb2); // Deterministic
/// ```
// Derives: the embedder is a plain `usize` wrapper with no interior
// mutability, so all of these are free and make the public type usable in
// logs, comparisons, and copies.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FallbackEmbedder {
    // Output vector length; every generated embedding has exactly this many
    // components.
    dimensions: usize,
}
35
36impl FallbackEmbedder {
37    /// Creates a new fallback embedder with the specified dimensions.
38    #[must_use]
39    pub const fn new(dimensions: usize) -> Self {
40        Self { dimensions }
41    }
42
43    /// Hashes a string to a u64 value.
44    fn hash_string(s: &str) -> u64 {
45        let mut hasher = DefaultHasher::new();
46        s.hash(&mut hasher);
47        hasher.finish()
48    }
49
50    /// Generates a pseudo-embedding from text.
51    #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
52    fn generate_embedding(&self, text: &str) -> Vec<f32> {
53        let mut embedding = vec![0.0f32; self.dimensions];
54
55        // Normalize text: lowercase and basic cleanup
56        let normalized: String = text
57            .chars()
58            .map(|c| {
59                if c.is_alphanumeric() || c.is_whitespace() {
60                    c.to_ascii_lowercase()
61                } else {
62                    ' '
63                }
64            })
65            .collect();
66
67        // Split into words
68        let words: Vec<&str> = normalized.split_whitespace().collect();
69
70        // Word-level hashing (primary signal)
71        for word in &words {
72            let hash = Self::hash_string(word);
73            let idx = (hash as usize) % self.dimensions;
74            // Use hash bits to determine sign and magnitude
75            let sign = if (hash >> 32) & 1 == 0 { 1.0 } else { -1.0 };
76            let magnitude = 1.0 + ((hash >> 16) & 0xFF) as f32 / 255.0;
77            embedding[idx] += sign * magnitude;
78        }
79
80        // Character trigram hashing (secondary signal for fuzzy matching)
81        let chars: Vec<char> = normalized.chars().collect();
82        if chars.len() >= 3 {
83            for window in chars.windows(3) {
84                let trigram: String = window.iter().collect();
85                let hash = Self::hash_string(&trigram);
86                let idx = (hash as usize) % self.dimensions;
87                let sign = if (hash >> 32) & 1 == 0 { 0.5 } else { -0.5 };
88                embedding[idx] += sign;
89            }
90        }
91
92        // Normalize to unit length for cosine similarity
93        let magnitude: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
94        if magnitude > 0.0 {
95            for val in &mut embedding {
96                *val /= magnitude;
97            }
98        }
99
100        embedding
101    }
102}
103
104impl Embedder for FallbackEmbedder {
105    fn dimensions(&self) -> usize {
106        self.dimensions
107    }
108
109    fn model_name(&self) -> &'static str {
110        "fallback-hash-v1"
111    }
112
113    fn embed(&self, text: &str) -> Result<Vec<f32>> {
114        Ok(self.generate_embedding(text))
115    }
116
117    fn embed_batch(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
118        // Parallel processing for batch embedding
119        use rayon::prelude::*;
120
121        Ok(texts
122            .par_iter()
123            .map(|text| self.generate_embedding(text))
124            .collect())
125    }
126}
127
// Note: `FallbackEmbedder` auto-derives `Send + Sync` because it only
// contains `Copy` types (`usize`) with no interior mutability.
130
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::{DEFAULT_DIMENSIONS, cosine_similarity};

    /// Identical input must always yield an identical embedding.
    #[test]
    fn test_deterministic() {
        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
        let first = embedder.embed("hello world").unwrap();
        let second = embedder.embed("hello world").unwrap();
        assert_eq!(first, second);
    }

    /// Output length matches the configured dimensionality.
    #[test]
    fn test_dimensions() {
        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
        let embedding = embedder.embed("test").unwrap();
        assert_eq!(embedding.len(), DEFAULT_DIMENSIONS);
    }

    /// Non-empty text produces a unit-length vector.
    #[test]
    fn test_normalized() {
        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
        let embedding = embedder.embed("hello world").unwrap();
        let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    /// Lexical overlap should translate to a higher cosine score.
    #[test]
    fn test_similar_text_higher_similarity() {
        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
        let base = embedder.embed("the quick brown fox").unwrap();
        let near = embedder.embed("the quick brown dog").unwrap();
        let far = embedder.embed("completely unrelated text").unwrap();

        let sim_near = cosine_similarity(&base, &near);
        let sim_far = cosine_similarity(&base, &far);

        assert!(
            sim_near > sim_far,
            "Similar text should have higher similarity: {sim_near} vs {sim_far}"
        );
    }

    /// Batch embedding returns one correctly-sized vector per input.
    #[test]
    fn test_batch_embedding() {
        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
        let inputs = vec!["hello", "world", "test"];
        let embeddings = embedder.embed_batch(&inputs).unwrap();

        assert_eq!(embeddings.len(), 3);
        for embedding in embeddings {
            assert_eq!(embedding.len(), DEFAULT_DIMENSIONS);
        }
    }

    /// Empty text accumulates no words or trigrams, so the (un-normalizable)
    /// result stays a zero vector of the right length.
    #[test]
    fn test_empty_text() {
        let embedder = FallbackEmbedder::new(DEFAULT_DIMENSIONS);
        let embedding = embedder.embed("").unwrap();
        assert_eq!(embedding.len(), DEFAULT_DIMENSIONS);
        assert!(embedding.iter().all(|&v| v == 0.0));
    }
}
195}