1use anyhow::Result;
17use serde::{Deserialize, Serialize};
18use tracing;
19
20#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct EmbeddingConfig {
23 pub similarity_threshold: f32,
25 pub batch_size: usize,
27 pub cache_embeddings: bool,
29}
30
31impl Default for EmbeddingConfig {
32 fn default() -> Self {
33 Self {
34 similarity_threshold: 0.7,
35 batch_size: 32,
36 cache_embeddings: true,
37 }
38 }
39}
40
/// Computes the cosine similarity of two vectors, rescaled from `[-1, 1]`
/// to `[0, 1]`.
///
/// * `1.0` — the vectors point in the same direction
/// * `0.5` — the vectors are orthogonal
/// * `0.0` — the vectors point in opposite directions
///
/// Returns `0.0` when the slices differ in length, when they are empty, or
/// when either vector consists solely of zeros.
///
/// The dot product and both squared norms are accumulated in `f64`. Squaring
/// an `f32` can overflow to infinity (components around `1e20`) or underflow
/// to zero (components around `1e-30`) when done in `f32`, which would turn
/// the score of two identical vectors into `NaN` or `0.0`. Every finite `f32`
/// squared fits comfortably in an `f64`, so neither can happen here.
///
/// The raw cosine is clamped to `[-1, 1]` before rescaling so rounding error
/// can never push the returned score outside `[0, 1]`.
///
/// Non-finite components (`NaN`, `±inf`) are not sanitized and typically
/// surface as a `NaN` result.
#[must_use]
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    // Single pass over both slices: (a·b, |a|², |b|²), all in f64.
    let (dot, norm_sq_a, norm_sq_b) = a.iter().zip(b).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(dot, sq_a, sq_b), (&x, &y)| {
            let (x, y) = (f64::from(x), f64::from(y));
            (dot + x * y, sq_a + x * x, sq_b + y * y)
        },
    );

    // A zero vector has no direction; report the lowest score, as callers expect.
    if norm_sq_a == 0.0 || norm_sq_b == 0.0 {
        return 0.0;
    }

    // Take the square roots separately so the denominator cannot overflow.
    let cosine = (dot / (norm_sq_a.sqrt() * norm_sq_b.sqrt())).clamp(-1.0, 1.0);

    // Map [-1, 1] onto [0, 1]; the value is in range, so narrowing to f32 is safe.
    ((cosine + 1.0) / 2.0) as f32
}
60
61pub fn text_to_embedding(text: &str) -> Vec<f32> {
82 use std::collections::hash_map::DefaultHasher;
83 use std::hash::{Hash, Hasher};
84
85 tracing::warn!(
87 "PRODUCTION WARNING: Using hash-based pseudo-embeddings - semantic search will not work correctly! \
88 Text: '{}'. Use real embedding models for production.",
89 text.chars().take(20).collect::<String>()
90 );
91
92 let mut hasher = DefaultHasher::new();
94 text.hash(&mut hasher);
95 let hash = hasher.finish();
96
97 let dimension = 384; let mut embedding = Vec::with_capacity(dimension);
99 let mut seed = hash;
100
101 for _ in 0..dimension {
102 seed = seed.wrapping_mul(1_103_515_245).wrapping_add(12345);
104 let value = ((seed >> 16) as f32) / 32768.0 - 1.0; embedding.push(value);
106 }
107
108 let magnitude = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
110 if magnitude > 0.0 {
111 for x in &mut embedding {
112 *x /= magnitude;
113 }
114 }
115
116 embedding
117}
118
/// Test-only pseudo-embedding generator that performs no logging.
///
/// Hashes `text` with `DefaultHasher`, uses the 64-bit hash as the initial
/// state of a linear congruential generator, draws 384 components from it and
/// scales the vector to unit length (skipped only if its magnitude is zero).
///
/// NOTE(review): the shifted generator state is not masked before the
/// `/ 32768.0 - 1.0` scaling, so raw components span roughly `[-1, 2^33)`
/// rather than `[-1, 1)` — confirm whether that is intended.
#[cfg(test)]
#[must_use]
pub fn text_to_embedding_test(text: &str) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    const DIMENSION: usize = 384;

    let mut hasher = DefaultHasher::new();
    text.hash(&mut hasher);
    let mut state = hasher.finish();

    // Advance the generator once per component.
    let mut embedding: Vec<f32> = std::iter::repeat_with(|| {
        state = state.wrapping_mul(1_103_515_245).wrapping_add(12345);
        ((state >> 16) as f32) / 32768.0 - 1.0
    })
    .take(DIMENSION)
    .collect();

    // Scale to unit length.
    let norm = embedding.iter().map(|c| c * c).sum::<f32>().sqrt();
    if norm > 0.0 {
        for component in embedding.iter_mut() {
            *component /= norm;
        }
    }

    embedding
}
155
156pub fn find_similar_texts(
165 query: &str,
166 candidates: &[String],
167 limit: usize,
168 threshold: f32,
169) -> Vec<(usize, f32, String)> {
170 tracing::warn!(
171 "Using mock embeddings for semantic search - results are not semantically meaningful!"
172 );
173
174 let query_embedding = text_to_embedding(query);
175
176 let mut similarities: Vec<(usize, f32, String)> = candidates
177 .iter()
178 .enumerate()
179 .map(|(i, text)| {
180 let embedding = text_to_embedding(text);
181 let similarity = cosine_similarity(&query_embedding, &embedding);
182 (i, similarity, text.clone())
183 })
184 .filter(|(_, similarity, _)| *similarity >= threshold)
185 .collect();
186
187 similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
189
190 similarities.into_iter().take(limit).collect()
192}
193
194pub fn demonstrate_semantic_search() -> Result<()> {
204 tracing::warn!("🧠 Semantic Search Demonstration (Mock Embeddings)");
205 tracing::warn!("WARNING: This demonstration uses hash-based pseudo-embeddings");
206 tracing::warn!("that are NOT semantically meaningful. Similarity scores are");
207 tracing::warn!("essentially random and do not reflect actual semantic similarity.");
208 tracing::warn!("For production semantic search, use real embedding models.");
209 tracing::info!("Enable with: cargo run --features local-embeddings");
210
211 let episodes = vec![
213 "Implement user authentication with JWT tokens".to_string(),
214 "Build REST API endpoints for user management".to_string(),
215 "Create data validation middleware for API requests".to_string(),
216 "Add rate limiting to prevent API abuse".to_string(),
217 "Implement OAuth2 authentication flow".to_string(),
218 "Design database schema for user profiles".to_string(),
219 "Write unit tests for authentication module".to_string(),
220 "Deploy API to production with Docker".to_string(),
221 "Monitor API performance and error rates".to_string(),
222 "Document API endpoints with OpenAPI spec".to_string(),
223 ];
224
225 let queries = vec![
227 "How to secure API with authentication?",
228 "Need to create user management endpoints",
229 "Add validation to API requests",
230 "Prevent API abuse and rate limiting",
231 ];
232
233 for query in queries {
234 tracing::debug!("Query: \"{}\"", query);
235 let results = find_similar_texts(query, &episodes, 3, 0.5);
236
237 tracing::debug!("Top {} similar episodes:", results.len());
238 for (i, (idx, similarity, text)) in results.iter().enumerate() {
239 tracing::debug!(
240 " {}. [{}] {} (similarity: {:.3})",
241 i + 1,
242 idx,
243 text,
244 similarity
245 );
246 }
247 }
248
249 tracing::debug!("Direct Similarity Examples:");
251 let pairs = vec![
252 ("user authentication", "login system"),
253 ("REST API", "web service endpoints"),
254 ("data validation", "input verification"),
255 ("rate limiting", "API throttling"),
256 ];
257
258 for (text1, text2) in pairs {
259 let emb1 = text_to_embedding(text1);
260 let emb2 = text_to_embedding(text2);
261 let similarity = cosine_similarity(&emb1, &emb2);
262 tracing::debug!(" \"{}\" <-> \"{}\" = {:.3}", text1, text2, similarity);
263 }
264
265 tracing::info!("For real semantic search, use memory-core::embeddings modules");
266 tracing::info!("with proper ONNX models and sentence transformers.");
267
268 Ok(())
269}
270
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used for all floating-point comparisons in this module.
    const TOLERANCE: f32 = 0.001;

    #[test]
    fn test_cosine_similarity() {
        // Identical direction -> top of the [0, 1] scale.
        let vec1 = vec![1.0, 2.0, 3.0];
        let vec2 = vec![1.0, 2.0, 3.0];
        let similarity = cosine_similarity(&vec1, &vec2);
        assert!((similarity - 1.0).abs() < TOLERANCE);

        // Orthogonal -> middle of the scale.
        let vec3 = vec![1.0, 0.0];
        let vec4 = vec![0.0, 1.0];
        let similarity = cosine_similarity(&vec3, &vec4);
        assert!((similarity - 0.5).abs() < TOLERANCE);

        // Opposite direction -> bottom of the scale.
        let vec5 = vec![-1.0, -2.0, -3.0];
        let similarity = cosine_similarity(&vec1, &vec5);
        assert!(similarity.abs() < TOLERANCE);
    }

    #[test]
    fn test_cosine_similarity_degenerate_inputs() {
        let empty: [f32; 0] = [];
        let zeros = [0.0_f32, 0.0, 0.0];
        let ones = [1.0_f32, 1.0, 1.0];

        // Empty, length-mismatched and zero-magnitude inputs all score 0.0.
        assert!(cosine_similarity(&empty, &empty).abs() < f32::EPSILON);
        assert!(cosine_similarity(&ones, &ones[..2]).abs() < f32::EPSILON);
        assert!(cosine_similarity(&zeros, &ones).abs() < f32::EPSILON);
        assert!(cosine_similarity(&ones, &zeros).abs() < f32::EPSILON);
    }

    #[test]
    fn test_text_to_embedding() {
        let embedding1 = text_to_embedding("hello world");
        let embedding2 = text_to_embedding("hello world");
        let embedding3 = text_to_embedding("different text");

        // Same text -> same vector; different text -> different vector.
        assert_eq!(embedding1, embedding2);
        assert_ne!(embedding1, embedding3);

        // Fixed dimensionality.
        assert_eq!(embedding1.len(), 384);
        assert_eq!(embedding3.len(), 384);

        // Unit length after normalization.
        let magnitude1: f32 = embedding1.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((magnitude1 - 1.0).abs() < TOLERANCE);
    }

    #[test]
    fn test_find_similar_texts() {
        let candidates = vec![
            "implement user authentication".to_string(),
            "create REST API endpoints".to_string(),
            "add input validation".to_string(),
            "deploy with Docker".to_string(),
        ];

        let results = find_similar_texts("user login system", &candidates, 2, 0.0);

        // The limit is honoured.
        assert!(results.len() <= 2);

        // Every adjacent pair is ordered best-first, not just the first two.
        for pair in results.windows(2) {
            assert!(pair[0].1 >= pair[1].1);
        }

        // Each hit carries the index and text of the candidate it came from.
        for (idx, _, text) in &results {
            assert_eq!(&candidates[*idx], text);
        }
    }

    #[test]
    fn test_find_similar_texts_edge_cases() {
        let candidates = vec!["alpha".to_string(), "beta".to_string()];

        // A zero limit returns nothing.
        assert!(find_similar_texts("alpha", &candidates, 0, 0.0).is_empty());

        // A threshold above the top of the score scale filters everything out.
        assert!(find_similar_texts("alpha", &candidates, 10, 2.0).is_empty());

        // No candidates, no results.
        assert!(find_similar_texts("alpha", &[], 10, 0.0).is_empty());
    }

    #[test]
    fn test_embedding_config() {
        let config = EmbeddingConfig::default();
        assert!((config.similarity_threshold - 0.7).abs() < f32::EPSILON);
        assert_eq!(config.batch_size, 32);
        assert!(config.cache_embeddings);
    }
}