rlm_rs/embedding/
fallback.rs1use crate::Result;
8use crate::embedding::Embedder;
9use std::collections::hash_map::DefaultHasher;
10use std::hash::{Hash, Hasher};
11
/// Model-free embedder that derives vectors purely from hashing.
///
/// Each word and each character trigram of the normalised input is hashed into
/// one of `dimensions` buckets, and the resulting vector is scaled to unit
/// length. No model files or network access are involved.
///
/// NOTE(review): hashing goes through [`DefaultHasher`], whose algorithm the
/// standard library does not promise to keep stable across Rust releases.
/// Vectors are reproducible within one build, but vectors persisted by an
/// older build may stop matching after a toolchain upgrade — confirm whether
/// that matters for stored embeddings.
#[derive(Debug, Clone)]
pub struct FallbackEmbedder {
    /// Number of components in every vector this embedder produces.
    dimensions: usize,
}

impl FallbackEmbedder {
    /// Creates an embedder whose vectors have `dimensions` components.
    ///
    /// `dimensions == 0` is accepted; such an embedder produces empty vectors.
    #[must_use]
    pub const fn new(dimensions: usize) -> Self {
        Self { dimensions }
    }

    /// Hashes `s` with a freshly constructed [`DefaultHasher`].
    fn hash_string(s: &str) -> u64 {
        let mut hasher = DefaultHasher::new();
        s.hash(&mut hasher);
        hasher.finish()
    }

    /// Maps `hash` to a vector index in `0..self.dimensions`.
    ///
    /// Callers must ensure `self.dimensions > 0`; a zero divisor would panic.
    // The remainder is strictly below `self.dimensions`, so narrowing it back
    // to `usize` cannot truncate.
    #[allow(clippy::cast_possible_truncation)]
    fn bucket(&self, hash: u64) -> usize {
        // Reduce in `u64` *before* narrowing so the chosen bucket does not
        // depend on the target's pointer width.
        (hash % self.dimensions as u64) as usize
    }

    /// Returns `weight` when bit 32 of `hash` is clear and `-weight` otherwise.
    fn signed(hash: u64, weight: f32) -> f32 {
        if (hash >> 32) & 1 == 0 { weight } else { -weight }
    }

    /// Builds the embedding vector for `text`.
    ///
    /// Steps:
    /// 1. Normalise: characters that are neither alphanumeric nor whitespace
    ///    become a space; ASCII letters are lowercased (non-ASCII letters keep
    ///    their case).
    /// 2. Every whitespace-separated word adds a signed weight in `1.0..=2.0`
    ///    to the bucket selected by its hash.
    /// 3. Every window of three consecutive characters of the normalised text
    ///    adds `+0.5` or `-0.5` to the bucket selected by its hash.
    /// 4. The vector is divided by its Euclidean norm unless that norm is zero.
    ///
    /// Returns a vector of exactly `self.dimensions` components. Input that
    /// produces no words and no trigrams (for example `""`) yields all zeros.
    // The only `u64 -> f32` cast below sees values in `0..=255`, which `f32`
    // represents exactly.
    #[allow(clippy::cast_precision_loss)]
    fn generate_embedding(&self, text: &str) -> Vec<f32> {
        let mut embedding = vec![0.0f32; self.dimensions];

        if self.dimensions == 0 {
            // There are no buckets to fill; bailing out here also avoids a
            // remainder-by-zero panic in `bucket`.
            return embedding;
        }

        let normalized: String = text
            .chars()
            .map(|c| {
                if c.is_alphanumeric() || c.is_whitespace() {
                    c.to_ascii_lowercase()
                } else {
                    ' '
                }
            })
            .collect();

        // Word features: bits 16..24 of the hash pick the weight.
        for word in normalized.split_whitespace() {
            let hash = Self::hash_string(word);
            let weight = 1.0 + ((hash >> 16) & 0xFF) as f32 / 255.0;
            embedding[self.bucket(hash)] += Self::signed(hash, weight);
        }

        // Trigram features: `windows(3)` yields nothing for fewer than three
        // characters, so no explicit length check is needed.
        let chars: Vec<char> = normalized.chars().collect();
        for window in chars.windows(3) {
            let trigram: String = window.iter().collect();
            let hash = Self::hash_string(&trigram);
            embedding[self.bucket(hash)] += Self::signed(hash, 0.5);
        }

        // Scale to unit length; an all-zero vector is left untouched to avoid
        // dividing by zero.
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for val in &mut embedding {
                *val /= norm;
            }
        }

        embedding
    }
}
103
104impl Embedder for FallbackEmbedder {
105 fn dimensions(&self) -> usize {
106 self.dimensions
107 }
108
109 fn model_name(&self) -> &'static str {
110 "fallback-hash-v1"
111 }
112
113 fn embed(&self, text: &str) -> Result<Vec<f32>> {
114 Ok(self.generate_embedding(text))
115 }
116
117 fn embed_batch(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
118 use rayon::prelude::*;
120
121 Ok(texts
122 .par_iter()
123 .map(|text| self.generate_embedding(text))
124 .collect())
125 }
126}
127
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::{DEFAULT_DIMENSIONS, cosine_similarity};

    /// Embedder configuration shared by every test in this module.
    fn default_embedder() -> FallbackEmbedder {
        FallbackEmbedder::new(DEFAULT_DIMENSIONS)
    }

    /// Euclidean length of `vector`.
    fn l2_norm(vector: &[f32]) -> f32 {
        vector.iter().map(|x| x * x).sum::<f32>().sqrt()
    }

    /// Embedding the same text twice gives identical vectors.
    #[test]
    fn test_deterministic() {
        let embedder = default_embedder();
        let first = embedder.embed("hello world").unwrap();
        let second = embedder.embed("hello world").unwrap();
        assert_eq!(first, second);
    }

    /// The output length equals the configured dimension count.
    #[test]
    fn test_dimensions() {
        let vector = default_embedder().embed("test").unwrap();
        assert_eq!(vector.len(), DEFAULT_DIMENSIONS);
    }

    /// Non-empty text produces a unit-length vector.
    #[test]
    fn test_normalized() {
        let vector = default_embedder().embed("hello world").unwrap();
        let norm = l2_norm(&vector);
        assert!((norm - 1.0).abs() < 1e-5);
    }

    /// Texts sharing most words score closer than unrelated texts.
    #[test]
    fn test_similar_text_higher_similarity() {
        let embedder = default_embedder();
        let base = embedder.embed("the quick brown fox").unwrap();
        let near = embedder.embed("the quick brown dog").unwrap();
        let far = embedder.embed("completely unrelated text").unwrap();

        let sim_similar = cosine_similarity(&base, &near);
        let sim_different = cosine_similarity(&base, &far);

        assert!(
            sim_similar > sim_different,
            "Similar text should have higher similarity: {sim_similar} vs {sim_different}"
        );
    }

    /// Batch embedding returns one correctly sized vector per input.
    #[test]
    fn test_batch_embedding() {
        let texts = ["hello", "world", "test"];
        let embeddings = default_embedder().embed_batch(&texts).unwrap();

        assert_eq!(embeddings.len(), 3);
        for vector in &embeddings {
            assert_eq!(vector.len(), DEFAULT_DIMENSIONS);
        }
    }

    /// Empty input yields an all-zero vector of the configured length.
    #[test]
    fn test_empty_text() {
        let vector = default_embedder().embed("").unwrap();
        assert_eq!(vector.len(), DEFAULT_DIMENSIONS);
        assert!(vector.iter().all(|&component| component == 0.0));
    }
}