// do_memory_core/embeddings_simple.rs
//! Simple semantic embeddings implementation
//!
//! **⚠️ WARNING: This module contains mock/test-only implementations**
//!
//! This is a simplified version that demonstrates the concept
//! without all the complex integrations that cause compilation issues.
//! The `text_to_embedding` function uses hash-based pseudo-embeddings
//! that are NOT semantically meaningful and should only be used for:
//! - Unit testing
//! - Development/demonstration purposes
//! - Fallback when real embeddings are unavailable
//!
//! **Production Use:** Use `memory-core/src/embeddings/` module with real
//! embedding models (gte-rs, ONNX runtime) for actual semantic search.

use anyhow::Result;
use serde::{Deserialize, Serialize};
use tracing;

20/// Configuration for embeddings
21#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct EmbeddingConfig {
23    /// Similarity threshold for search (0.0 to 1.0)
24    pub similarity_threshold: f32,
25    /// Maximum batch size for embedding generation
26    pub batch_size: usize,
27    /// Cache embeddings to avoid regeneration
28    pub cache_embeddings: bool,
29}
30
31impl Default for EmbeddingConfig {
32    fn default() -> Self {
33        Self {
34            similarity_threshold: 0.7,
35            batch_size: 32,
36            cache_embeddings: true,
37        }
38    }
39}
40
/// Calculate cosine similarity between two vectors.
///
/// The raw cosine lies in `[-1, 1]`; the result is rescaled to `[0, 1]`,
/// so 1.0 means identical direction and 0.5 means orthogonal.
/// Mismatched lengths, empty input, or a zero-magnitude vector all
/// yield 0.0.
#[must_use]
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // Accumulate dot product and both squared norms in a single pass.
    let mut dot = 0.0_f32;
    let mut norm_a = 0.0_f32;
    let mut norm_b = 0.0_f32;
    for (&x, &y) in a.iter().zip(b.iter()) {
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    let denominator = norm_a.sqrt() * norm_b.sqrt();
    if denominator == 0.0 {
        0.0
    } else {
        // Shift the raw cosine from [-1, 1] into [0, 1].
        (dot / denominator + 1.0) / 2.0
    }
}

61/// Simple text to embedding converter (mock implementation)
62///
63/// **⚠️ PRODUCTION WARNING: This is a mock/test-only implementation**
64///
65/// This function generates deterministic hash-based "embeddings" that are
66/// NOT semantically meaningful. The similarity between these vectors is
67/// essentially random and does not reflect actual semantic similarity.
68///
69/// **Use Cases:**
70/// - Unit testing (deterministic, fast)
71/// - Development/demonstration
72/// - Fallback when real embeddings unavailable
73///
74/// **Do NOT Use For:**
75/// - Production semantic search
76/// - Real similarity calculations
77/// - User-facing features
78///
79/// **For Production:** Use `memory-core::embeddings::LocalEmbeddingProvider`
80/// with the `local-embeddings` feature enabled and real ONNX models.
81pub fn text_to_embedding(text: &str) -> Vec<f32> {
82    use std::collections::hash_map::DefaultHasher;
83    use std::hash::{Hash, Hasher};
84
85    // Emit production warning
86    tracing::warn!(
87        "PRODUCTION WARNING: Using hash-based pseudo-embeddings - semantic search will not work correctly! \
88         Text: '{}'. Use real embedding models for production.",
89        text.chars().take(20).collect::<String>()
90    );
91
92    // Create a deterministic embedding based on text hash
93    let mut hasher = DefaultHasher::new();
94    text.hash(&mut hasher);
95    let hash = hasher.finish();
96
97    let dimension = 384; // Standard sentence transformer dimension
98    let mut embedding = Vec::with_capacity(dimension);
99    let mut seed = hash;
100
101    for _ in 0..dimension {
102        // Simple PRNG to generate values
103        seed = seed.wrapping_mul(1_103_515_245).wrapping_add(12345);
104        let value = ((seed >> 16) as f32) / 32768.0 - 1.0; // Range [-1, 1]
105        embedding.push(value);
106    }
107
108    // Normalize the vector
109    let magnitude = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
110    if magnitude > 0.0 {
111        for x in &mut embedding {
112            *x /= magnitude;
113        }
114    }
115
116    embedding
117}
118
/// Test-only text to embedding converter
///
/// Mirrors the `text_to_embedding` algorithm but skips the production
/// warning, for use in tests and internal testing scenarios.
///
/// Returns a 384-dimensional unit vector derived deterministically from
/// a hash of `text`.
#[cfg(test)]
#[must_use]
pub fn text_to_embedding_test(text: &str) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Seed a small LCG with a deterministic hash of the input text.
    let mut hasher = DefaultHasher::new();
    text.hash(&mut hasher);
    let hash = hasher.finish();

    let dimension = 384; // Standard sentence transformer dimension
    let mut embedding = Vec::with_capacity(dimension);
    let mut seed = hash;

    for _ in 0..dimension {
        // glibc-style LCG step to generate pseudo-random values.
        seed = seed.wrapping_mul(1_103_515_245).wrapping_add(12345);
        // Extract 15 pseudo-random bits and map them onto [-1, 1).
        // BUGFIX: without the `& 0x7FFF` mask, `seed >> 16` is a huge
        // u64 value, so every component came out positive and all
        // embeddings clustered in one orthant, skewing similarities.
        let value = ((seed >> 16) & 0x7FFF) as f32 / 16384.0 - 1.0;
        embedding.push(value);
    }

    // Normalize to a unit vector so cosine similarity is well defined.
    let magnitude = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
    if magnitude > 0.0 {
        for x in &mut embedding {
            *x /= magnitude;
        }
    }

    embedding
}

156/// Find most similar texts from a collection
157///
158/// **⚠️ Uses Mock Embeddings:** This function uses hash-based pseudo-embeddings
159/// that are NOT semantically meaningful. The "similarity" results are
160/// essentially random and should not be used for production semantic search.
161///
162/// For production semantic search, use `memory-core::embeddings::SemanticService`
163/// with real embedding models.
164pub fn find_similar_texts(
165    query: &str,
166    candidates: &[String],
167    limit: usize,
168    threshold: f32,
169) -> Vec<(usize, f32, String)> {
170    tracing::warn!(
171        "Using mock embeddings for semantic search - results are not semantically meaningful!"
172    );
173
174    let query_embedding = text_to_embedding(query);
175
176    let mut similarities: Vec<(usize, f32, String)> = candidates
177        .iter()
178        .enumerate()
179        .map(|(i, text)| {
180            let embedding = text_to_embedding(text);
181            let similarity = cosine_similarity(&query_embedding, &embedding);
182            (i, similarity, text.clone())
183        })
184        .filter(|(_, similarity, _)| *similarity >= threshold)
185        .collect();
186
187    // Sort by similarity (highest first)
188    similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
189
190    // Return top results
191    similarities.into_iter().take(limit).collect()
192}
193
194/// Simple semantic search demonstration
195///
196/// **⚠️ DEMONSTRATION ONLY:** This function uses mock hash-based embeddings
197/// that are NOT semantically meaningful. The results shown here are for
198/// demonstration purposes only and should NOT be used to evaluate the
199/// effectiveness of semantic search.
200///
201/// For real semantic search demonstrations, use the proper embeddings module
202/// with `cargo run --features local-embeddings`.
203pub fn demonstrate_semantic_search() -> Result<()> {
204    tracing::warn!("🧠 Semantic Search Demonstration (Mock Embeddings)");
205    tracing::warn!("WARNING: This demonstration uses hash-based pseudo-embeddings");
206    tracing::warn!("that are NOT semantically meaningful. Similarity scores are");
207    tracing::warn!("essentially random and do not reflect actual semantic similarity.");
208    tracing::warn!("For production semantic search, use real embedding models.");
209    tracing::info!("Enable with: cargo run --features local-embeddings");
210
211    // Sample episode descriptions
212    let episodes = vec![
213        "Implement user authentication with JWT tokens".to_string(),
214        "Build REST API endpoints for user management".to_string(),
215        "Create data validation middleware for API requests".to_string(),
216        "Add rate limiting to prevent API abuse".to_string(),
217        "Implement OAuth2 authentication flow".to_string(),
218        "Design database schema for user profiles".to_string(),
219        "Write unit tests for authentication module".to_string(),
220        "Deploy API to production with Docker".to_string(),
221        "Monitor API performance and error rates".to_string(),
222        "Document API endpoints with OpenAPI spec".to_string(),
223    ];
224
225    // Test queries
226    let queries = vec![
227        "How to secure API with authentication?",
228        "Need to create user management endpoints",
229        "Add validation to API requests",
230        "Prevent API abuse and rate limiting",
231    ];
232
233    for query in queries {
234        tracing::debug!("Query: \"{}\"", query);
235        let results = find_similar_texts(query, &episodes, 3, 0.5);
236
237        tracing::debug!("Top {} similar episodes:", results.len());
238        for (i, (idx, similarity, text)) in results.iter().enumerate() {
239            tracing::debug!(
240                "  {}. [{}] {} (similarity: {:.3})",
241                i + 1,
242                idx,
243                text,
244                similarity
245            );
246        }
247    }
248
249    // Demonstrate similarity calculation
250    tracing::debug!("Direct Similarity Examples:");
251    let pairs = vec![
252        ("user authentication", "login system"),
253        ("REST API", "web service endpoints"),
254        ("data validation", "input verification"),
255        ("rate limiting", "API throttling"),
256    ];
257
258    for (text1, text2) in pairs {
259        let emb1 = text_to_embedding(text1);
260        let emb2 = text_to_embedding(text2);
261        let similarity = cosine_similarity(&emb1, &emb2);
262        tracing::debug!("  \"{}\" <-> \"{}\" = {:.3}", text1, text2, similarity);
263    }
264
265    tracing::info!("For real semantic search, use memory-core::embeddings modules");
266    tracing::info!("with proper ONNX models and sentence transformers.");
267
268    Ok(())
269}
270
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cosine_similarity() {
        // A vector compared with itself scores the maximum of 1.0.
        let v = vec![1.0, 2.0, 3.0];
        let same = vec![1.0, 2.0, 3.0];
        assert!((cosine_similarity(&v, &same) - 1.0).abs() < 0.001);

        // Orthogonal vectors land on the midpoint 0.5 of the [0, 1] scale.
        let x_axis = vec![1.0, 0.0];
        let y_axis = vec![0.0, 1.0];
        assert!((cosine_similarity(&x_axis, &y_axis) - 0.5).abs() < 0.001);
    }

    #[test]
    fn test_text_to_embedding() {
        let first = text_to_embedding("hello world");
        let second = text_to_embedding("hello world");
        let other = text_to_embedding("different text");

        // Deterministic: identical input yields identical output.
        assert_eq!(first, second);

        // Distinct input yields a distinct embedding.
        assert_ne!(first, other);

        // The embedding is normalized to (approximately) unit length.
        let magnitude: f32 = first.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((magnitude - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_find_similar_texts() {
        let candidates: Vec<String> = [
            "implement user authentication",
            "create REST API endpoints",
            "add input validation",
            "deploy with Docker",
        ]
        .iter()
        .map(|s| s.to_string())
        .collect();

        let results = find_similar_texts("user login system", &candidates, 2, 0.0);

        // `limit` caps the number of results.
        assert!(results.len() <= 2);

        // Results arrive best match first.
        if results.len() > 1 {
            assert!(results[0].1 >= results[1].1);
        }
    }

    #[test]
    fn test_embedding_config() {
        let config = EmbeddingConfig::default();
        assert!((config.similarity_threshold - 0.7).abs() < f32::EPSILON);
        assert_eq!(config.batch_size, 32);
        assert!(config.cache_embeddings);
    }
}