Skip to main content

rust_memex/
engine.rs

1//! High-level MemexEngine API for library consumers.
2//!
3//! The `MemexEngine` provides a simple, ergonomic interface for storing and
4//! searching vector embeddings. It wraps the lower-level `StorageManager` and
5//! `EmbeddingClient` to provide a unified API.
6//!
7//! # Example
8//!
9//! ```rust,ignore
10//! use rust_memex::{MemexEngine, MemexConfig};
11//! use serde_json::json;
12//!
13//! #[tokio::main]
14//! async fn main() -> anyhow::Result<()> {
15//!     // Quick setup for an app
16//!     let engine = MemexEngine::for_app("my-app", "documents").await?;
17//!
18//!     // Store a document
19//!     engine.store("doc-1", "Hello world!", json!({"source": "test"})).await?;
20//!
21//!     // Search for similar documents
22//!     let results = engine.search("greeting", 5).await?;
23//!
24//!     // Get by ID
25//!     if let Some(doc) = engine.get("doc-1").await? {
26//!         println!("Found: {}", doc.text);
27//!     }
28//!
29//!     // Delete
30//!     engine.delete("doc-1").await?;
31//!
32//!     Ok(())
33//! }
34//! ```
35
36use anyhow::{Result, anyhow};
37use serde::{Deserialize, Serialize};
38use serde_json::Value;
39use std::collections::HashMap;
40use std::sync::Arc;
41use tokio::sync::Mutex;
42use tracing::{debug, info};
43
44use crate::embeddings::{DEFAULT_REQUIRED_DIMENSION, EmbeddingClient, EmbeddingConfig};
45use crate::rag::{SearchOptions, SearchResult, SliceLayer};
46use crate::search::{
47    BM25Config, BM25Index, HybridConfig, HybridSearchResult, HybridSearcher, SearchMode,
48};
49use crate::storage::{ChromaDocument, StorageManager};
50
51// Re-export SearchResult for convenience
52pub use crate::rag::SearchResult as Document;
53
54/// Configuration for MemexEngine.
55///
56/// Provides sensible defaults while allowing customization of all components.
57#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct MemexConfig {
59    /// Application name (used for default db_path)
60    pub app_name: String,
61    /// Namespace for document isolation
62    pub namespace: String,
63    /// Path to LanceDB storage (defaults to ~/.rmcp-servers/{app_name}/lancedb)
64    #[serde(default)]
65    pub db_path: Option<String>,
66    /// Embedding vector dimension (must match your embedding model)
67    #[serde(default = "default_dimension")]
68    pub dimension: usize,
69    /// Embedding provider configuration
70    #[serde(default)]
71    pub embedding_config: EmbeddingConfig,
72    /// Enable BM25 keyword search
73    #[serde(default)]
74    pub enable_bm25: bool,
75    /// BM25 configuration (if enabled)
76    #[serde(default)]
77    pub bm25_config: Option<BM25Config>,
78    /// Enable hybrid search (vector + BM25 fusion)
79    #[serde(default = "default_enable_hybrid")]
80    pub enable_hybrid: bool,
81    /// Hybrid search configuration
82    #[serde(default)]
83    pub hybrid_config: Option<HybridConfig>,
84}
85
/// Serde/`Default` fallback for `MemexConfig::enable_hybrid`: hybrid search is on
/// unless the caller switches it off.
fn default_enable_hybrid() -> bool {
    const HYBRID_ON_BY_DEFAULT: bool = true;
    HYBRID_ON_BY_DEFAULT
}
89
90fn default_dimension() -> usize {
91    DEFAULT_REQUIRED_DIMENSION
92}
93
94impl Default for MemexConfig {
95    fn default() -> Self {
96        Self {
97            app_name: "memex".to_string(),
98            namespace: "default".to_string(),
99            db_path: None,
100            dimension: default_dimension(),
101            embedding_config: EmbeddingConfig::default(),
102            enable_bm25: false,
103            bm25_config: None,
104            enable_hybrid: default_enable_hybrid(),
105            hybrid_config: None,
106        }
107    }
108}
109
110impl MemexConfig {
111    /// Create a new config for an app with a namespace
112    pub fn new(app_name: impl Into<String>, namespace: impl Into<String>) -> Self {
113        Self {
114            app_name: app_name.into(),
115            namespace: namespace.into(),
116            ..Default::default()
117        }
118    }
119
120    /// Set custom database path
121    pub fn with_db_path(mut self, path: impl Into<String>) -> Self {
122        self.db_path = Some(path.into());
123        self
124    }
125
126    /// Set embedding dimension
127    pub fn with_dimension(mut self, dimension: usize) -> Self {
128        self.dimension = dimension;
129        self.embedding_config.required_dimension = dimension;
130        self
131    }
132
133    /// Set embedding configuration
134    pub fn with_embedding_config(mut self, config: EmbeddingConfig) -> Self {
135        self.dimension = config.required_dimension;
136        self.embedding_config = config;
137        self
138    }
139
140    fn sync_dimension_fields(&mut self) -> Result<()> {
141        if self.dimension == self.embedding_config.required_dimension {
142            return Ok(());
143        }
144
145        let default_dim = default_dimension();
146        if self.dimension == default_dim {
147            self.dimension = self.embedding_config.required_dimension;
148            return Ok(());
149        }
150
151        if self.embedding_config.required_dimension == default_dim {
152            self.embedding_config.required_dimension = self.dimension;
153            return Ok(());
154        }
155
156        Err(anyhow!(
157            "MemexConfig.dimension={} conflicts with embedding_config.required_dimension={}. \
158             Set them to the same value or use with_dimension()/with_embedding_config() so one source of truth updates both.",
159            self.dimension,
160            self.embedding_config.required_dimension
161        ))
162    }
163
164    /// Enable BM25 hybrid search
165    pub fn with_bm25(mut self, config: BM25Config) -> Self {
166        self.enable_bm25 = true;
167        self.bm25_config = Some(config);
168        self
169    }
170
171    /// Get the effective database path
172    pub fn effective_db_path(&self) -> String {
173        self.db_path
174            .clone()
175            .unwrap_or_else(|| format!("~/.rmcp-servers/{}/lancedb", self.app_name))
176    }
177
178    /// Get the effective BM25 path
179    pub fn effective_bm25_path(&self) -> String {
180        self.bm25_config
181            .as_ref()
182            .map(|c| c.index_path.clone())
183            .unwrap_or_else(|| format!("~/.rmcp-servers/{}/bm25", self.app_name))
184    }
185
186    fn hybrid_uses_bm25(&self) -> bool {
187        self.enable_hybrid
188            && self.hybrid_config.clone().unwrap_or_default().mode != SearchMode::Vector
189    }
190
191    fn normalize_bm25_config(&self, mut config: BM25Config) -> BM25Config {
192        if config.index_path == BM25Config::default().index_path {
193            config.index_path = self.effective_bm25_path();
194        }
195        config
196    }
197
198    fn resolved_bm25_config(&self) -> Option<BM25Config> {
199        if !self.enable_bm25 && !self.hybrid_uses_bm25() {
200            return None;
201        }
202
203        let config = self
204            .bm25_config
205            .clone()
206            .or_else(|| {
207                self.hybrid_config
208                    .as_ref()
209                    .filter(|cfg| cfg.mode != SearchMode::Vector)
210                    .map(|cfg| cfg.bm25.clone())
211            })
212            .unwrap_or_default();
213
214        Some(self.normalize_bm25_config(config))
215    }
216
217    fn resolved_hybrid_config(&self) -> HybridConfig {
218        let mut config = self.hybrid_config.clone().unwrap_or_default();
219        if let Some(bm25) = self.resolved_bm25_config() {
220            config.bm25 = bm25;
221        }
222        config
223    }
224}
225
226/// Metadata filter for search and deletion operations.
227///
228/// Used for filtering documents by metadata fields (e.g., patient_id, visit_id).
229/// Supports GDPR-compliant data deletion by patient.
230#[derive(Debug, Clone, Default, Serialize, Deserialize)]
231pub struct MetaFilter {
232    /// Filter by patient ID
233    #[serde(skip_serializing_if = "Option::is_none")]
234    pub patient_id: Option<String>,
235    /// Filter by visit ID
236    #[serde(skip_serializing_if = "Option::is_none")]
237    pub visit_id: Option<String>,
238    /// Filter by document type
239    #[serde(skip_serializing_if = "Option::is_none")]
240    pub doc_type: Option<String>,
241    /// Filter by date range (start)
242    #[serde(skip_serializing_if = "Option::is_none")]
243    pub date_from: Option<String>,
244    /// Filter by date range (end)
245    #[serde(skip_serializing_if = "Option::is_none")]
246    pub date_to: Option<String>,
247    /// Custom metadata key-value filters
248    #[serde(default, skip_serializing_if = "Vec::is_empty")]
249    pub custom: Vec<(String, String)>,
250}
251
252impl MetaFilter {
253    /// Create a filter for a specific patient (GDPR deletion use case)
254    pub fn for_patient(patient_id: impl Into<String>) -> Self {
255        Self {
256            patient_id: Some(patient_id.into()),
257            ..Default::default()
258        }
259    }
260
261    /// Create a filter for a specific visit
262    pub fn for_visit(visit_id: impl Into<String>) -> Self {
263        Self {
264            visit_id: Some(visit_id.into()),
265            ..Default::default()
266        }
267    }
268
269    /// Add a custom metadata filter
270    pub fn with_custom(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
271        self.custom.push((key.into(), value.into()));
272        self
273    }
274
275    /// Check if this filter matches a document's metadata
276    pub fn matches(&self, metadata: &Value) -> bool {
277        if let Some(ref patient_id) = self.patient_id
278            && metadata.get("patient_id").and_then(|v| v.as_str()) != Some(patient_id)
279        {
280            return false;
281        }
282
283        if let Some(ref visit_id) = self.visit_id
284            && metadata.get("visit_id").and_then(|v| v.as_str()) != Some(visit_id)
285        {
286            return false;
287        }
288
289        if let Some(ref doc_type) = self.doc_type
290            && metadata.get("doc_type").and_then(|v| v.as_str()) != Some(doc_type)
291        {
292            return false;
293        }
294
295        // Date range filtering
296        if let Some(ref date_from) = self.date_from
297            && let Some(doc_date) = metadata.get("date").and_then(|v| v.as_str())
298            && doc_date < date_from.as_str()
299        {
300            return false;
301        }
302
303        if let Some(ref date_to) = self.date_to
304            && let Some(doc_date) = metadata.get("date").and_then(|v| v.as_str())
305            && doc_date > date_to.as_str()
306        {
307            return false;
308        }
309
310        // Custom filters
311        for (key, value) in &self.custom {
312            if metadata.get(key).and_then(|v| v.as_str()) != Some(value) {
313                return false;
314            }
315        }
316
317        true
318    }
319}
320
321/// Item for batch storage operations.
322#[derive(Debug, Clone, Serialize, Deserialize)]
323pub struct StoreItem {
324    /// Unique document ID
325    pub id: String,
326    /// Text content to embed and store
327    pub text: String,
328    /// Optional metadata
329    #[serde(default)]
330    pub metadata: Value,
331}
332
333impl StoreItem {
334    /// Create a new store item
335    pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
336        Self {
337            id: id.into(),
338            text: text.into(),
339            metadata: Value::Object(serde_json::Map::new()),
340        }
341    }
342
343    /// Add metadata to this item
344    pub fn with_metadata(mut self, metadata: Value) -> Self {
345        self.metadata = metadata;
346        self
347    }
348}
349
/// Outcome summary of a batch operation.
#[derive(Debug, Clone)]
pub struct BatchResult {
    /// How many items were processed successfully.
    pub success_count: usize,
    /// How many items failed.
    pub failure_count: usize,
    /// Identifiers of the failed items; empty when nothing failed.
    pub failed_ids: Vec<String>,
}
360
361/// Statistics for a single layer in dive results
362#[derive(Debug, Clone, Serialize, Deserialize)]
363pub struct LayerStats {
364    /// Total number of chunks found in this layer
365    pub total_chunks: usize,
366    /// Average score of results in this layer
367    pub avg_score: f32,
368    /// Top keywords across results in this layer
369    pub top_keywords: Vec<String>,
370}
371
372impl LayerStats {
373    /// Create empty layer stats
374    pub fn empty() -> Self {
375        Self {
376            total_chunks: 0,
377            avg_score: 0.0,
378            top_keywords: vec![],
379        }
380    }
381
382    /// Create layer stats from search results
383    pub fn from_results(results: &[SearchResult]) -> Self {
384        if results.is_empty() {
385            return Self::empty();
386        }
387
388        let total_chunks = results.len();
389        let avg_score = results.iter().map(|r| r.score).sum::<f32>() / total_chunks as f32;
390
391        // Aggregate keywords across results
392        let mut keyword_counts: HashMap<String, usize> = HashMap::new();
393        for result in results {
394            for keyword in &result.keywords {
395                *keyword_counts.entry(keyword.clone()).or_insert(0) += 1;
396            }
397        }
398
399        // Sort by frequency and take top 10
400        let mut keywords: Vec<_> = keyword_counts.into_iter().collect();
401        keywords.sort_by_key(|b| std::cmp::Reverse(b.1));
402        let top_keywords = keywords.into_iter().take(10).map(|(k, _)| k).collect();
403
404        Self {
405            total_chunks,
406            avg_score,
407            top_keywords,
408        }
409    }
410}
411
412/// Result of a dive operation for a single layer
413#[derive(Debug, Clone, Serialize, Deserialize)]
414pub struct DiveResult {
415    /// The layer this result is for
416    pub layer: SliceLayer,
417    /// Search results for this layer
418    pub results: Vec<SearchResult>,
419    /// Statistics for this layer
420    pub layer_stats: LayerStats,
421}
422
423/// High-level API for vector memory operations.
424///
425/// MemexEngine provides a simple interface for storing, searching, and managing
426/// vector embeddings. It orchestrates the embedding client and storage manager.
427pub struct MemexEngine {
428    storage: Arc<StorageManager>,
429    embeddings: Arc<Mutex<EmbeddingClient>>,
430    bm25: Option<Arc<BM25Index>>,
431    hybrid_searcher: Option<HybridSearcher>,
432    namespace: String,
433    config: MemexConfig,
434}
435
436impl MemexEngine {
437    /// Create a new MemexEngine with the given configuration.
438    ///
439    /// # Example
440    ///
441    /// ```rust,ignore
442    /// let config = MemexConfig::new("my-app", "documents")
443    ///     .with_dimension(1024);
444    /// let engine = MemexEngine::new(config).await?;
445    /// ```
446    pub async fn new(mut config: MemexConfig) -> Result<Self> {
447        config.sync_dimension_fields()?;
448        let db_path = config.effective_db_path();
449
450        info!(
451            "Initializing MemexEngine: app={}, namespace={}, db={}",
452            config.app_name, config.namespace, db_path
453        );
454
455        // Initialize storage
456        let storage = StorageManager::new_lance_only(&db_path).await?;
457        storage.ensure_collection().await?;
458
459        // Initialize embedding client
460        let embeddings = EmbeddingClient::new(&config.embedding_config).await?;
461
462        info!(
463            "Connected to embedding provider: {} (dim={})",
464            embeddings.connected_to(),
465            embeddings.required_dimension()
466        );
467
468        // Initialize BM25 if enabled
469        let bm25 = config
470            .resolved_bm25_config()
471            .map(|bm25_config| BM25Index::new(&bm25_config).map(Arc::new))
472            .transpose()?;
473
474        let storage_arc = Arc::new(storage);
475
476        // Initialize HybridSearcher if hybrid mode is enabled
477        let hybrid_searcher = if config.enable_hybrid {
478            let hybrid_config = config.resolved_hybrid_config();
479            Some(if let Some(ref bm25_index) = bm25 {
480                HybridSearcher::with_bm25_index(
481                    storage_arc.clone(),
482                    bm25_index.clone(),
483                    hybrid_config,
484                )
485            } else {
486                HybridSearcher::new(storage_arc.clone(), hybrid_config).await?
487            })
488        } else {
489            None
490        };
491
492        Ok(Self {
493            storage: storage_arc,
494            embeddings: Arc::new(Mutex::new(embeddings)),
495            bm25,
496            hybrid_searcher,
497            namespace: config.namespace.clone(),
498            config,
499        })
500    }
501
502    /// Quick setup for an application.
503    ///
504    /// Uses default embedding configuration and auto-detects providers.
505    ///
506    /// # Example
507    ///
508    /// ```rust,ignore
509    /// let engine = MemexEngine::for_app("vista", "patient-notes").await?;
510    /// ```
511    pub async fn for_app(app_name: &str, namespace: &str) -> Result<Self> {
512        let config = MemexConfig::new(app_name, namespace);
513        Self::new(config).await
514    }
515
516    /// Vista-optimized setup with 1024-dimension embeddings.
517    ///
518    /// Uses smaller embedding model (qwen3-embedding:0.6b) for faster inference.
519    ///
520    /// # Example
521    ///
522    /// ```rust,ignore
523    /// let engine = MemexEngine::for_vista().await?;
524    /// ```
525    pub async fn for_vista() -> Result<Self> {
526        use crate::embeddings::ProviderConfig;
527
528        let config = MemexConfig {
529            app_name: "vista".to_string(),
530            namespace: "default".to_string(),
531            db_path: Some("~/.rmcp-servers/vista/lancedb".to_string()),
532            dimension: 1024,
533            embedding_config: EmbeddingConfig {
534                required_dimension: 1024,
535                providers: vec![ProviderConfig {
536                    name: "ollama-vista".to_string(),
537                    base_url: "http://localhost:11434".to_string(),
538                    model: "qwen3-embedding:0.6b".to_string(),
539                    priority: 1,
540                    endpoint: "/v1/embeddings".to_string(),
541                }],
542                ..EmbeddingConfig::default()
543            },
544            enable_bm25: false,
545            bm25_config: None,
546            enable_hybrid: true, // Hybrid enabled for Vista
547            hybrid_config: None,
548        };
549        Self::new(config).await
550    }
551
552    /// Get the namespace this engine operates on
553    pub fn namespace(&self) -> &str {
554        &self.namespace
555    }
556
557    /// Get the configuration
558    pub fn config(&self) -> &MemexConfig {
559        &self.config
560    }
561
562    /// Get the underlying storage manager (for advanced operations)
563    pub fn storage(&self) -> Arc<StorageManager> {
564        self.storage.clone()
565    }
566
567    // =========================================================================
568    // CORE CRUD OPERATIONS
569    // =========================================================================
570
571    /// Store a document with embedding.
572    ///
573    /// The text is automatically embedded using the configured embedding provider.
574    ///
575    /// # Arguments
576    /// * `id` - Unique document identifier
577    /// * `text` - Text content to embed and store
578    /// * `metadata` - Additional metadata (JSON object)
579    ///
580    /// # Example
581    ///
582    /// ```rust,ignore
583    /// engine.store(
584    ///     "visit-123",
585    ///     "Patient presented with lethargy and decreased appetite...",
586    ///     json!({"patient_id": "P-456", "visit_type": "checkup"})
587    /// ).await?;
588    /// ```
589    pub async fn store(&self, id: &str, text: &str, metadata: Value) -> Result<()> {
590        debug!("Storing document: id={}, text_len={}", id, text.len());
591
592        // Generate embedding
593        let embedding = self.embeddings.lock().await.embed(text).await?;
594
595        // Create document
596        let doc = ChromaDocument::new_flat(
597            id.to_string(),
598            self.namespace.clone(),
599            embedding,
600            metadata.clone(),
601            text.to_string(),
602        );
603
604        // Store in vector DB
605        self.storage.add_to_store(vec![doc]).await?;
606
607        // Also index in BM25 if enabled
608        if let Some(ref bm25) = self.bm25 {
609            bm25.add_documents(&[(id.to_string(), self.namespace.clone(), text.to_string())])
610                .await?;
611        }
612
613        debug!("Stored document: id={}", id);
614        Ok(())
615    }
616
617    /// Search for similar documents.
618    ///
619    /// Returns documents ordered by similarity score (highest first).
620    ///
621    /// # Arguments
622    /// * `query` - Search query text
623    /// * `limit` - Maximum number of results
624    ///
625    /// # Example
626    ///
627    /// ```rust,ignore
628    /// let results = engine.search("lethargy symptoms", 10).await?;
629    /// for result in results {
630    ///     println!("{}: {} (score: {})", result.id, result.text, result.score);
631    /// }
632    /// ```
633    pub async fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
634        debug!("Searching: query='{}', limit={}", query, limit);
635
636        // Generate query embedding
637        let query_embedding = self.embeddings.lock().await.embed(query).await?;
638
639        // Search vector store
640        let candidates = self
641            .storage
642            .search_store(Some(&self.namespace), query_embedding, limit)
643            .await?;
644
645        // Convert to SearchResult
646        let results: Vec<SearchResult> = candidates
647            .into_iter()
648            .enumerate()
649            .map(|(idx, doc)| {
650                // Simple inverse-index scoring (better results have lower index)
651                let score = 1.0 - (idx as f32 / (limit as f32 + 1.0));
652                let layer = doc.slice_layer();
653                SearchResult {
654                    id: doc.id,
655                    namespace: doc.namespace,
656                    text: doc.document,
657                    score,
658                    metadata: doc.metadata,
659                    layer,
660                    parent_id: doc.parent_id,
661                    children_ids: doc.children_ids,
662                    keywords: doc.keywords,
663                }
664            })
665            .collect();
666
667        debug!("Search returned {} results", results.len());
668        Ok(results)
669    }
670
671    /// Hybrid search combining vector similarity and BM25 keyword matching.
672    ///
673    /// Returns results with combined scores from both methods.
674    /// Requires `enable_hybrid: true` in config.
675    ///
676    /// # Example
677    ///
678    /// ```rust,ignore
679    /// let results = engine.search_hybrid("when did we buy dragon", 10).await?;
680    /// for r in results {
681    ///     println!("{}: combined={:.3}, vector={:?}, bm25={:?}",
682    ///         r.id, r.combined_score, r.vector_score, r.bm25_score);
683    /// }
684    /// ```
685    pub async fn search_hybrid(
686        &self,
687        query: &str,
688        limit: usize,
689    ) -> Result<Vec<HybridSearchResult>> {
690        debug!("Hybrid search: query='{}', limit={}", query, limit);
691
692        let hybrid = self.hybrid_searcher.as_ref().ok_or_else(|| {
693            anyhow!("Hybrid search not enabled. Set enable_hybrid: true in MemexConfig.")
694        })?;
695
696        // Generate query embedding
697        let query_embedding = self.embeddings.lock().await.embed(query).await?;
698
699        // Perform hybrid search
700        let results = hybrid
701            .search(
702                query,
703                query_embedding,
704                Some(&self.namespace),
705                limit,
706                SearchOptions::default(),
707            )
708            .await?;
709
710        debug!("Hybrid search returned {} results", results.len());
711        Ok(results)
712    }
713
714    /// Search with explicit mode selection.
715    ///
716    /// Allows choosing between vector-only, keyword-only, or hybrid search.
717    ///
718    /// # Example
719    ///
720    /// ```rust,ignore
721    /// use rust_memex::SearchMode;
722    ///
723    /// // Keyword-only for exact matches
724    /// let results = engine.search_with_mode("dragon", 10, SearchMode::Keyword).await?;
725    /// ```
726    pub async fn search_with_mode(
727        &self,
728        query: &str,
729        limit: usize,
730        mode: SearchMode,
731    ) -> Result<Vec<HybridSearchResult>> {
732        debug!("Search with mode: query='{}', mode={:?}", query, mode);
733
734        match mode {
735            SearchMode::Vector => {
736                // Use regular vector search and convert to HybridSearchResult
737                let results = self.search(query, limit).await?;
738                Ok(results
739                    .into_iter()
740                    .map(|r| HybridSearchResult {
741                        id: r.id,
742                        namespace: r.namespace,
743                        document: r.text,
744                        combined_score: r.score,
745                        vector_score: Some(r.score),
746                        bm25_score: None,
747                        metadata: r.metadata,
748                        layer: r.layer,
749                        parent_id: r.parent_id,
750                        children_ids: r.children_ids,
751                        keywords: r.keywords,
752                    })
753                    .collect())
754            }
755            SearchMode::Keyword | SearchMode::Hybrid => {
756                // Use hybrid searcher
757                self.search_hybrid(query, limit).await
758            }
759        }
760    }
761
762    /// Get a document by ID.
763    ///
764    /// # Example
765    ///
766    /// ```rust,ignore
767    /// if let Some(doc) = engine.get("visit-123").await? {
768    ///     println!("Found: {}", doc.text);
769    /// }
770    /// ```
771    pub async fn get(&self, id: &str) -> Result<Option<SearchResult>> {
772        debug!("Getting document: id={}", id);
773
774        if let Some(doc) = self.storage.get_document(&self.namespace, id).await? {
775            let layer = doc.slice_layer();
776            return Ok(Some(SearchResult {
777                id: doc.id,
778                namespace: doc.namespace,
779                text: doc.document,
780                score: 1.0,
781                metadata: doc.metadata,
782                layer,
783                parent_id: doc.parent_id,
784                children_ids: doc.children_ids,
785                keywords: doc.keywords,
786            }));
787        }
788
789        Ok(None)
790    }
791
792    /// Delete a document by ID.
793    ///
794    /// Returns true if a document was deleted, false if not found.
795    ///
796    /// # Example
797    ///
798    /// ```rust,ignore
799    /// if engine.delete("visit-123").await? {
800    ///     println!("Document deleted");
801    /// }
802    /// ```
803    pub async fn delete(&self, id: &str) -> Result<bool> {
804        debug!("Deleting document: id={}", id);
805
806        let deleted = self.storage.delete_document(&self.namespace, id).await?;
807
808        // Also delete from BM25 if enabled
809        if let Some(ref bm25) = self.bm25 {
810            bm25.delete_documents(&[id.to_string()]).await?;
811        }
812
813        Ok(deleted > 0)
814    }
815
816    // =========================================================================
817    // BATCH OPERATIONS
818    // =========================================================================
819
820    /// Store multiple documents in a batch.
821    ///
822    /// More efficient than calling `store()` multiple times as embeddings
823    /// are generated in batches.
824    ///
825    /// # Example
826    ///
827    /// ```rust,ignore
828    /// let items = vec![
829    ///     StoreItem::new("doc-1", "First document").with_metadata(json!({"type": "note"})),
830    ///     StoreItem::new("doc-2", "Second document").with_metadata(json!({"type": "note"})),
831    /// ];
832    /// let result = engine.store_batch(items).await?;
833    /// println!("Stored {} documents", result.success_count);
834    /// ```
835    pub async fn store_batch(&self, items: Vec<StoreItem>) -> Result<BatchResult> {
836        if items.is_empty() {
837            return Ok(BatchResult {
838                success_count: 0,
839                failure_count: 0,
840                failed_ids: vec![],
841            });
842        }
843
844        info!("Batch storing {} documents", items.len());
845
846        // Extract texts for batch embedding
847        let texts: Vec<String> = items.iter().map(|i| i.text.clone()).collect();
848
849        // Generate embeddings in batch
850        let embeddings = self.embeddings.lock().await.embed_batch(&texts).await?;
851
852        // Create documents
853        let mut docs = Vec::with_capacity(items.len());
854        let mut bm25_docs = Vec::new();
855
856        for (item, embedding) in items.iter().zip(embeddings) {
857            let doc = ChromaDocument::new_flat(
858                item.id.clone(),
859                self.namespace.clone(),
860                embedding,
861                item.metadata.clone(),
862                item.text.clone(),
863            );
864            docs.push(doc);
865
866            if self.bm25.is_some() {
867                bm25_docs.push((item.id.clone(), self.namespace.clone(), item.text.clone()));
868            }
869        }
870
871        // Store in vector DB
872        self.storage.add_to_store(docs).await?;
873
874        // Also index in BM25 if enabled
875        if let Some(ref bm25) = self.bm25 {
876            bm25.add_documents(&bm25_docs).await?;
877        }
878
879        Ok(BatchResult {
880            success_count: items.len(),
881            failure_count: 0,
882            failed_ids: vec![],
883        })
884    }
885
886    // =========================================================================
887    // FILTERED OPERATIONS
888    // =========================================================================
889
890    /// Search with metadata filter.
891    ///
892    /// Performs vector search and then filters results by metadata.
893    ///
894    /// # Example
895    ///
896    /// ```rust,ignore
897    /// let filter = MetaFilter::for_patient("P-456");
898    /// let results = engine.search_filtered("symptoms", filter, 10).await?;
899    /// ```
900    pub async fn search_filtered(
901        &self,
902        query: &str,
903        filter: MetaFilter,
904        limit: usize,
905    ) -> Result<Vec<SearchResult>> {
906        // Fetch more candidates than needed, then filter
907        let candidates = self.search(query, limit * 3).await?;
908
909        // Apply metadata filter
910        let filtered: Vec<SearchResult> = candidates
911            .into_iter()
912            .filter(|r| filter.matches(&r.metadata))
913            .take(limit)
914            .collect();
915
916        debug!(
917            "Filtered search: query='{}', filter={:?}, results={}",
918            query,
919            filter,
920            filtered.len()
921        );
922
923        Ok(filtered)
924    }
925
926    /// Delete all documents matching a filter.
927    ///
928    /// This is the primary method for GDPR-compliant data deletion.
929    ///
930    /// # Example
931    ///
932    /// ```rust,ignore
933    /// // Delete all documents for a patient (GDPR request)
934    /// let filter = MetaFilter::for_patient("P-456");
935    /// let deleted = engine.delete_by_filter(filter).await?;
936    /// println!("Deleted {} documents", deleted);
937    /// ```
938    pub async fn delete_by_filter(&self, filter: MetaFilter) -> Result<usize> {
939        info!("Deleting documents by filter: {:?}", filter);
940
941        // We need to search for all matching documents first
942        // This is expensive but necessary for metadata-based filtering
943        // Note: A more efficient implementation would add filter support to StorageManager
944
945        // For now, we'll scan namespace documents and filter in memory.
946        // TODO: Add native metadata filtering to LanceDB queries.
947
948        let mut deleted_ids = Vec::new();
949
950        // Search namespace documents page by page before mutating the table.
951        // Deleting while paginating would shift row offsets and risk skipping
952        // matches in later pages.
953        const BATCH_SIZE: usize = 1000;
954        let mut offset = 0;
955
956        loop {
957            let candidates = self
958                .storage
959                .all_documents_page(Some(&self.namespace), offset, BATCH_SIZE)
960                .await?;
961
962            if candidates.is_empty() {
963                break;
964            }
965
966            let page_len = candidates.len();
967            for doc in candidates {
968                if filter.matches(&doc.metadata) {
969                    deleted_ids.push(doc.id);
970                }
971            }
972
973            if page_len < BATCH_SIZE {
974                break;
975            }
976
977            offset += page_len;
978        }
979
980        for id in &deleted_ids {
981            self.storage.delete_document(&self.namespace, id).await?;
982        }
983
984        // Delete from BM25 if enabled
985        if let Some(ref bm25) = self.bm25
986            && !deleted_ids.is_empty()
987        {
988            bm25.delete_documents(&deleted_ids).await?;
989        }
990
991        let deleted_count = deleted_ids.len();
992        info!("Deleted {} documents by filter", deleted_count);
993        Ok(deleted_count)
994    }
995
996    /// Delete all documents in the namespace.
997    ///
998    /// Use with caution - this removes all data!
999    pub async fn purge_namespace(&self) -> Result<usize> {
1000        info!("Purging namespace: {}", self.namespace);
1001
1002        let deleted = self
1003            .storage
1004            .delete_namespace_documents(&self.namespace)
1005            .await?;
1006
1007        if let Some(ref bm25) = self.bm25 {
1008            bm25.delete_namespace_term(&self.namespace).await?;
1009        }
1010
1011        Ok(deleted)
1012    }
1013
1014    // =========================================================================
1015    // HYBRID SEARCH (BM25 + Vector)
1016    // =========================================================================
1017
1018    /// Hybrid search combining BM25 keyword matching with vector similarity.
1019    ///
1020    /// Requires `enable_bm25: true` in config.
1021    ///
1022    /// # Arguments
1023    /// * `query` - Search query
1024    /// * `limit` - Maximum results
1025    /// * `bm25_weight` - Weight for BM25 scores (0.0-1.0, default 0.3)
1026    #[deprecated(
1027        since = "0.3.1",
1028        note = "Use search_hybrid() with HybridSearcher instead"
1029    )]
1030    pub async fn search_bm25_fusion(
1031        &self,
1032        query: &str,
1033        limit: usize,
1034        bm25_weight: f32,
1035    ) -> Result<Vec<SearchResult>> {
1036        let bm25 = self
1037            .bm25
1038            .as_ref()
1039            .ok_or_else(|| anyhow!("BM25 not enabled. Set enable_bm25: true in MemexConfig."))?;
1040
1041        // Get BM25 results
1042        let bm25_results = bm25.search(query, Some(&self.namespace), limit * 2)?;
1043        let bm25_max_score = bm25_results.first().map(|(_, _, s)| *s).unwrap_or(1.0);
1044
1045        // Get vector results
1046        let vector_results = self.search(query, limit * 2).await?;
1047
1048        // Merge and re-score
1049        use std::collections::HashMap;
1050        let mut scores: HashMap<String, (f32, Option<SearchResult>)> = HashMap::new();
1051
1052        // Add BM25 scores (normalized)
1053        for (id, _namespace, score) in bm25_results {
1054            let normalized = score / bm25_max_score.max(0.001);
1055            scores.insert(id, (normalized * bm25_weight, None));
1056        }
1057
1058        // Add vector scores
1059        let vector_weight = 1.0 - bm25_weight;
1060        for result in vector_results {
1061            let entry = scores.entry(result.id.clone()).or_insert((0.0, None));
1062            entry.0 += result.score * vector_weight;
1063            entry.1 = Some(result);
1064        }
1065
1066        // Collect and sort by combined score
1067        let mut combined: Vec<_> = scores
1068            .into_iter()
1069            .filter_map(|(_id, (score, result))| {
1070                // If we have the full result, use it; otherwise fetch from storage
1071                result.map(|mut r| {
1072                    r.score = score;
1073                    r
1074                })
1075            })
1076            .collect();
1077
1078        combined.sort_by(|a, b| {
1079            b.score
1080                .partial_cmp(&a.score)
1081                .unwrap_or(std::cmp::Ordering::Equal)
1082        });
1083        combined.truncate(limit);
1084
1085        Ok(combined)
1086    }
1087}
1088
/// Unit tests for the configuration, filter, and batch helper types.
#[cfg(test)]
mod tests {
    use super::*;

    /// A patient filter accepts metadata for that patient and rejects others.
    #[test]
    fn test_meta_filter_matches() {
        let patient_filter = MetaFilter::for_patient("P-123");

        let same_patient = serde_json::json!({"patient_id": "P-123", "visit_id": "V-456"});
        let other_patient = serde_json::json!({"patient_id": "P-999", "visit_id": "V-456"});

        assert!(patient_filter.matches(&same_patient));
        assert!(!patient_filter.matches(&other_patient));
    }

    /// Every custom key/value pair must be present for a match.
    #[test]
    fn test_meta_filter_custom() {
        let custom_filter = MetaFilter::default()
            .with_custom("doc_type", "soap_note")
            .with_custom("status", "active");

        let complete = serde_json::json!({"doc_type": "soap_note", "status": "active"});
        let status_absent = serde_json::json!({"doc_type": "soap_note"});

        assert!(custom_filter.matches(&complete));
        assert!(!custom_filter.matches(&status_absent));
    }

    /// The default config uses the default dimension, namespace, and db path.
    #[test]
    fn test_memex_config_defaults() {
        let defaults = MemexConfig::default();

        assert_eq!(defaults.dimension, DEFAULT_REQUIRED_DIMENSION);
        assert_eq!(
            defaults.embedding_config.required_dimension,
            DEFAULT_REQUIRED_DIMENSION
        );
        assert_eq!(defaults.namespace, "default");
        assert_eq!(
            defaults.effective_db_path(),
            "~/.rmcp-servers/memex/lancedb"
        );
    }

    /// Builder methods override the dimension and db path.
    #[test]
    fn test_memex_config_builder() {
        let built = MemexConfig::new("vista", "patients")
            .with_dimension(1024)
            .with_db_path("/custom/path/db");

        assert_eq!(built.app_name, "vista");
        assert_eq!(built.namespace, "patients");
        assert_eq!(built.dimension, 1024);
        assert_eq!(built.embedding_config.required_dimension, 1024);
        assert_eq!(built.effective_db_path(), "/custom/path/db");
    }

    /// Supplying an embedding config carries its dimension over to the config.
    #[test]
    fn test_memex_config_with_embedding_config_syncs_dimension() {
        let custom_embedding = EmbeddingConfig {
            required_dimension: 768,
            ..EmbeddingConfig::default()
        };

        let synced = MemexConfig::new("sync-test", "ns").with_embedding_config(custom_embedding);

        assert_eq!(synced.dimension, 768);
        assert_eq!(synced.embedding_config.required_dimension, 768);
    }

    /// A non-default embedding dimension wins over a default top-level one.
    #[test]
    fn test_memex_config_sync_dimension_fields_uses_non_default_embedding_dimension() {
        let mut pending = MemexConfig::default();
        pending.embedding_config.required_dimension = 1024;

        pending.sync_dimension_fields().unwrap();

        assert_eq!(pending.dimension, 1024);
        assert_eq!(pending.embedding_config.required_dimension, 1024);
    }

    /// Two different non-default dimensions are reported as a conflict.
    #[test]
    fn test_memex_config_sync_dimension_fields_rejects_true_conflict() {
        let mut conflicted = MemexConfig {
            dimension: 768,
            ..MemexConfig::default()
        };
        conflicted.embedding_config.required_dimension = 1024;

        let message = conflicted.sync_dimension_fields().unwrap_err().to_string();

        assert!(message.contains("conflicts with embedding_config.required_dimension"));
    }

    /// `StoreItem` keeps the id, text, and attached metadata.
    #[test]
    fn test_store_item() {
        let greeting = StoreItem::new("doc-1", "Hello world")
            .with_metadata(serde_json::json!({"type": "greeting"}));

        assert_eq!(greeting.id, "doc-1");
        assert_eq!(greeting.text, "Hello world");
        assert_eq!(greeting.metadata["type"], "greeting");
    }

    /// Without explicit metadata a `StoreItem` carries an empty JSON object.
    #[test]
    fn test_store_item_default_metadata() {
        let bare = StoreItem::new("doc-1", "Hello world");

        assert_eq!(bare.id, "doc-1");
        assert_eq!(bare.text, "Hello world");
        assert!(bare.metadata.is_object());
        assert!(bare.metadata.as_object().is_some_and(|fields| fields.is_empty()));
    }

    /// A filter with no constraints accepts any metadata, including none.
    #[test]
    fn test_meta_filter_empty_matches_all() {
        let unconstrained = MetaFilter::default();

        let populated = serde_json::json!({
            "patient_id": "P-123",
            "visit_id": "V-456",
            "random_field": "value"
        });
        let blank = serde_json::json!({});

        assert!(unconstrained.matches(&populated));
        assert!(unconstrained.matches(&blank));
    }

    /// Date bounds are inclusive of in-range dates and skipped when no date exists.
    #[test]
    fn test_meta_filter_date_range() {
        let year_2024 = MetaFilter {
            date_from: Some("2024-01-01".to_string()),
            date_to: Some("2024-12-31".to_string()),
            ..Default::default()
        };

        let cases = [
            // Within range
            (serde_json::json!({"date": "2024-06-15"}), true),
            // Before range
            (serde_json::json!({"date": "2023-12-31"}), false),
            // After range
            (serde_json::json!({"date": "2025-01-01"}), false),
            // No date field still matches (filter only applies if field exists)
            (serde_json::json!({"patient_id": "P-123"}), true),
        ];

        for (metadata, expected) in cases {
            assert_eq!(
                year_2024.matches(&metadata),
                expected,
                "metadata: {metadata}"
            );
        }
    }

    /// A visit filter accepts only metadata with the same visit id.
    #[test]
    fn test_meta_filter_for_visit() {
        let visit_filter = MetaFilter::for_visit("V-789");

        let same_visit = serde_json::json!({"visit_id": "V-789", "patient_id": "P-123"});
        let other_visit = serde_json::json!({"visit_id": "V-other", "patient_id": "P-123"});

        assert!(visit_filter.matches(&same_visit));
        assert!(!visit_filter.matches(&other_visit));
    }

    /// All populated filter fields must match simultaneously.
    #[test]
    fn test_meta_filter_combined() {
        let patient_notes = MetaFilter {
            patient_id: Some("P-123".to_string()),
            doc_type: Some("soap_note".to_string()),
            ..Default::default()
        };

        let cases = [
            // Both match
            (
                serde_json::json!({"patient_id": "P-123", "doc_type": "soap_note"}),
                true,
            ),
            // One doesn't match
            (
                serde_json::json!({"patient_id": "P-123", "doc_type": "prescription"}),
                false,
            ),
            // Missing required field
            (serde_json::json!({"patient_id": "P-123"}), false),
        ];

        for (metadata, expected) in cases {
            assert_eq!(
                patient_notes.matches(&metadata),
                expected,
                "metadata: {metadata}"
            );
        }
    }

    /// `BatchResult` exposes its counters and failed ids as plain fields.
    #[test]
    fn test_batch_result_struct() {
        let outcome = BatchResult {
            success_count: 10,
            failure_count: 2,
            failed_ids: vec!["doc-5".to_string(), "doc-8".to_string()],
        };

        assert_eq!(outcome.success_count, 10);
        assert_eq!(outcome.failure_count, 2);
        assert_eq!(outcome.failed_ids.len(), 2);
        assert!(outcome.failed_ids.iter().any(|id| id == "doc-5"));
    }

    /// `with_bm25` turns BM25 on and stores the supplied config.
    #[test]
    fn test_memex_config_with_bm25() {
        use crate::search::BM25Config;

        let with_keyword_index =
            MemexConfig::new("test-app", "docs").with_bm25(BM25Config::default());

        assert!(with_keyword_index.enable_bm25);
        assert!(with_keyword_index.bm25_config.is_some());
    }

    /// The BM25 path is derived from the app name.
    #[test]
    fn test_memex_config_effective_bm25_path() {
        let app_config = MemexConfig::new("my-app", "docs");

        assert_eq!(
            app_config.effective_bm25_path(),
            "~/.rmcp-servers/my-app/bm25"
        );
    }

    /// The resolved BM25 config points at the app-specific index path.
    #[test]
    fn test_resolved_bm25_config_uses_app_specific_path_for_hybrid_defaults() {
        let resolved = MemexConfig::new("my-app", "docs")
            .resolved_bm25_config()
            .expect("hybrid defaults should provision BM25");

        assert_eq!(resolved.index_path, "~/.rmcp-servers/my-app/bm25");
    }

    /// The resolved hybrid config embeds the same app-specific BM25 path.
    #[test]
    fn test_resolved_hybrid_config_reuses_resolved_bm25_path() {
        let resolved = MemexConfig::new("my-app", "docs").resolved_hybrid_config();

        assert_eq!(resolved.bm25.index_path, "~/.rmcp-servers/my-app/bm25");
    }

    /// `MetaFilter` survives a JSON round trip.
    #[test]
    fn test_meta_filter_serialization() {
        let original = MetaFilter::for_patient("P-123").with_custom("status", "active");

        let encoded = serde_json::to_string(&original).unwrap();
        let restored: MetaFilter = serde_json::from_str(&encoded).unwrap();

        let expected_pair = ("status".to_string(), "active".to_string());
        assert_eq!(restored.patient_id.as_deref(), Some("P-123"));
        assert_eq!(restored.custom.len(), 1);
        assert_eq!(restored.custom[0], expected_pair);
    }

    /// `MemexConfig` survives a JSON round trip.
    #[test]
    fn test_memex_config_serialization() {
        let original = MemexConfig::new("test", "ns")
            .with_dimension(512)
            .with_db_path("/tmp/test");

        let encoded = serde_json::to_string(&original).unwrap();
        let restored: MemexConfig = serde_json::from_str(&encoded).unwrap();

        assert_eq!(restored.app_name, "test");
        assert_eq!(restored.namespace, "ns");
        assert_eq!(restored.dimension, 512);
        assert_eq!(restored.embedding_config.required_dimension, 512);
        assert_eq!(restored.db_path.as_deref(), Some("/tmp/test"));
    }

    /// `StoreItem` survives a JSON round trip.
    #[test]
    fn test_store_item_serialization() {
        let original =
            StoreItem::new("id-1", "content").with_metadata(serde_json::json!({"key": "value"}));

        let encoded = serde_json::to_string(&original).unwrap();
        let restored: StoreItem = serde_json::from_str(&encoded).unwrap();

        assert_eq!(restored.id, "id-1");
        assert_eq!(restored.text, "content");
        assert_eq!(restored.metadata["key"], "value");
    }
}