codemem_engine/
lib.rs

1//! codemem-engine: Domain logic engine for the Codemem memory system.
2//!
3//! This crate contains all business logic, orchestration, and domain operations:
4//! - **index** — ast-grep based code indexing, symbol extraction, reference resolution
5//! - **hooks** — Lifecycle hook handlers (PostToolUse, SessionStart, Stop)
6//! - **watch** — Real-time file watching with debouncing and .gitignore support
7//! - **bm25** — Okapi BM25 scoring with code-aware tokenization
8//! - **scoring** — 9-component hybrid scoring for memory recall
9//! - **patterns** — Cross-session pattern detection
10//! - **compress** — Optional LLM-powered observation compression
11//! - **metrics** — Operational metrics collection
12
13use codemem_core::{
14    CodememConfig, CodememError, ScoringWeights, StorageBackend, VectorBackend, VectorConfig,
15};
16use codemem_storage::graph::GraphEngine;
17use codemem_storage::HnswIndex;
18use codemem_storage::Storage;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::AtomicBool;
21#[cfg(test)]
22use std::sync::atomic::Ordering;
23use std::sync::{Arc, Mutex, RwLock};
24
25pub mod analysis;
26pub mod bm25;
27pub mod compress;
28pub mod consolidation;
29pub mod enrichment;
30mod enrichment_text;
31mod file_indexing;
32mod graph_linking;
33pub mod hooks;
34pub mod index;
35mod memory_ops;
36pub mod metrics;
37pub mod patterns;
38pub mod persistence;
39pub mod recall;
40pub mod scoring;
41pub mod search;
42pub mod watch;
43
44#[cfg(test)]
45#[path = "tests/engine_integration_tests.rs"]
46mod integration_tests;
47
48#[cfg(test)]
49#[path = "tests/enrichment_tests.rs"]
50mod enrichment_tests;
51
52#[cfg(test)]
53#[path = "tests/recall_tests.rs"]
54mod recall_tests;
55
56#[cfg(test)]
57#[path = "tests/search_tests.rs"]
58mod search_tests;
59
60#[cfg(test)]
61#[path = "tests/consolidation_tests.rs"]
62mod consolidation_tests;
63
64#[cfg(test)]
65#[path = "tests/analysis_tests.rs"]
66mod analysis_tests;
67
68#[cfg(test)]
69#[path = "tests/persistence_tests.rs"]
70mod persistence_tests;
71
72// Re-export key index types at crate root for convenience
73pub use index::{
74    ChunkConfig, CodeChunk, CodeParser, Dependency, IndexAndResolveResult, IndexProgress,
75    IndexResult, Indexer, ManifestResult, ParseResult, Reference, ReferenceKind, ReferenceResolver,
76    ResolvedEdge, Symbol, SymbolKind, Visibility, Workspace,
77};
78
79// Re-export key domain types for convenience
80pub use bm25::Bm25Index;
81pub use metrics::InMemoryMetrics;
82
83// Re-export enrichment types
84pub use enrichment::{EnrichResult, EnrichmentPipelineResult};
85
86// Re-export persistence types
87pub use persistence::{edge_weight_for, IndexPersistResult};
88
89// Re-export recall types
90pub use recall::{ExpandedResult, NamespaceStats};
91
92// Re-export search types
93pub use search::{CodeSearchResult, SummaryTreeNode, SymbolSearchResult};
94
95// Re-export analysis types
96pub use analysis::{
97    DecisionChain, DecisionConnection, DecisionEntry, ImpactResult, SessionCheckpointReport,
98};
99
/// A part descriptor for `split_memory()`.
///
/// Derives `Default` (empty `content`, `tags` and `importance` unset) and
/// `PartialEq` so callers can build parts with struct-update syntax and
/// compare them directly. `Eq` is not derived because `importance` is an `f64`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct SplitPart {
    /// Text content of this part.
    pub content: String,
    /// Optional tags for this part; `None` when the caller supplies none.
    pub tags: Option<Vec<String>>,
    /// Optional importance value for this part; `None` when the caller supplies none.
    pub importance: Option<f64>,
}
107
108// ── Index Cache ──────────────────────────────────────────────────────────────
109
/// Cached code-index results for structural queries.
///
/// Held by `CodememEngine` behind `Mutex<Option<IndexCache>>`; the option is
/// `None` until a cache has been stored.
pub struct IndexCache {
    /// Symbols held in the cache.
    pub symbols: Vec<Symbol>,
    /// Code chunks held in the cache.
    pub chunks: Vec<CodeChunk>,
    /// NOTE(review): presumably the root directory the cached index was built
    /// from — confirm against the code that populates the cache.
    pub root_path: String,
}
116
117// ── CodememEngine ────────────────────────────────────────────────────────────
118
/// Core domain engine holding all backends and domain state.
///
/// This struct contains all the business logic for the Codemem memory system.
/// Transport layers (MCP, REST API, CLI) hold a `CodememEngine` and delegate
/// domain operations to it, keeping transport concerns separate.
///
/// **Backend typing**: the vector and graph backends are held as the concrete
/// types `HnswIndex` and `GraphEngine`, so calls on them are statically
/// dispatched (no vtable indirection). The storage backend, by contrast, is
/// held as a `Box<dyn StorageBackend>` trait object and is dynamically
/// dispatched, which allows alternative storage implementations to be
/// supplied to the constructors.
pub struct CodememEngine {
    /// Storage backend, held as a boxed trait object.
    pub(crate) storage: Box<dyn StorageBackend>,
    /// Vector index, guarded by a mutex.
    pub(crate) vector: Mutex<HnswIndex>,
    /// Graph engine, guarded by a mutex.
    pub(crate) graph: Mutex<GraphEngine>,
    /// Optional embedding provider (None if not configured).
    pub(crate) embeddings: Option<Mutex<Box<dyn codemem_embeddings::EmbeddingProvider>>>,
    /// Path to the database file, used to derive the index save path.
    pub(crate) db_path: Option<PathBuf>,
    /// Cached index results for structural queries.
    pub(crate) index_cache: Mutex<Option<IndexCache>>,
    /// Configurable scoring weights for the 9-component hybrid scoring system.
    pub(crate) scoring_weights: RwLock<ScoringWeights>,
    /// BM25 index for code-aware token overlap scoring.
    pub(crate) bm25_index: Mutex<Bm25Index>,
    /// Loaded configuration.
    pub(crate) config: CodememConfig,
    /// Operational metrics collector.
    pub(crate) metrics: Arc<InMemoryMetrics>,
    /// Dirty flag for batch saves: set after `persist_memory_no_save()`,
    /// cleared by `save_index()`.
    dirty: AtomicBool,
    /// Active session ID for auto-populating `session_id` on persisted memories.
    active_session_id: RwLock<Option<String>>,
}
156
157impl CodememEngine {
158    /// Create an engine with storage, vector, graph, and optional embeddings backends.
159    pub fn new(
160        storage: Box<dyn StorageBackend>,
161        vector: HnswIndex,
162        graph: GraphEngine,
163        embeddings: Option<Box<dyn codemem_embeddings::EmbeddingProvider>>,
164    ) -> Self {
165        let config = CodememConfig::load_or_default();
166        Self::new_with_config(storage, vector, graph, embeddings, config)
167    }
168
169    /// Create an engine with an explicit config (avoids double-loading from disk).
170    pub fn new_with_config(
171        storage: Box<dyn StorageBackend>,
172        vector: HnswIndex,
173        graph: GraphEngine,
174        embeddings: Option<Box<dyn codemem_embeddings::EmbeddingProvider>>,
175        config: CodememConfig,
176    ) -> Self {
177        Self {
178            storage,
179            vector: Mutex::new(vector),
180            graph: Mutex::new(graph),
181            embeddings: embeddings.map(Mutex::new),
182            db_path: None,
183            index_cache: Mutex::new(None),
184            scoring_weights: RwLock::new(config.scoring.clone()),
185            bm25_index: Mutex::new(Bm25Index::new()),
186            config,
187            metrics: Arc::new(InMemoryMetrics::new()),
188            dirty: AtomicBool::new(false),
189            active_session_id: RwLock::new(None),
190        }
191    }
192
193    /// Create an engine from a database path, loading all backends.
194    pub fn from_db_path(db_path: &Path) -> Result<Self, CodememError> {
195        // Ensure parent directory exists (e.g. ~/.codemem/)
196        if let Some(parent) = db_path.parent() {
197            if !parent.exists() {
198                std::fs::create_dir_all(parent).map_err(|e| {
199                    CodememError::Storage(format!(
200                        "Failed to create database directory {}: {e}",
201                        parent.display()
202                    ))
203                })?;
204            }
205        }
206
207        let config = CodememConfig::load_or_default();
208
209        // Wire StorageConfig into Storage::open
210        let storage = Storage::open_with_config(
211            db_path,
212            Some(config.storage.cache_size_mb),
213            Some(config.storage.busy_timeout_secs),
214        )?;
215        let vector_config = VectorConfig {
216            dimensions: config.vector.dimensions,
217            ..VectorConfig::default()
218        };
219        let mut vector = HnswIndex::new(vector_config.clone())?;
220
221        // Load existing vector index if it exists
222        let index_path = db_path.with_extension("idx");
223        if index_path.exists() {
224            vector.load(&index_path)?;
225        }
226
227        // C6: Vector index consistency check — compare vector index count vs DB embedding count.
228        // If they mismatch, rebuild the vector index from SQLite embeddings.
229        let vector_count = vector.stats().count;
230        let db_stats = storage.stats()?;
231        let db_embed_count = db_stats.embedding_count;
232        if vector_count != db_embed_count {
233            tracing::warn!(
234                "Vector index ({vector_count}) out of sync with DB ({db_embed_count}), rebuilding..."
235            );
236            // Rebuild: create a fresh index and re-insert all embeddings from DB
237            let mut fresh_vector = HnswIndex::new(vector_config)?;
238            if let Ok(embeddings) = storage.list_all_embeddings() {
239                for (id, embedding) in &embeddings {
240                    if let Err(e) = fresh_vector.insert(id, embedding) {
241                        tracing::warn!("Failed to re-insert embedding {id}: {e}");
242                    }
243                }
244            }
245            vector = fresh_vector;
246            // Save the rebuilt index
247            if let Err(e) = vector.save(&index_path) {
248                tracing::warn!("Failed to save rebuilt vector index: {e}");
249            }
250        }
251
252        // Load graph from storage
253        let graph = GraphEngine::from_storage(&storage)?;
254
255        // Wire EmbeddingConfig into from_env as fallback
256        let embeddings = codemem_embeddings::from_env(Some(&config.embedding)).ok();
257
258        let mut engine =
259            Self::new_with_config(Box::new(storage), vector, graph, embeddings, config);
260        engine.db_path = Some(db_path.to_path_buf());
261
262        // H7: Only compute PageRank at startup; betweenness is computed lazily
263        // via `ensure_betweenness_computed()` when first needed.
264        engine
265            .lock_graph()?
266            .recompute_centrality_with_options(false);
267
268        // Try loading persisted BM25 index; fall back to rebuilding from memories.
269        let bm25_path = db_path.with_extension("bm25");
270        let mut bm25_loaded = false;
271        if bm25_path.exists() {
272            match std::fs::read(&bm25_path) {
273                Ok(data) => match Bm25Index::deserialize(&data) {
274                    Ok(index) => {
275                        let mut bm25 = engine.lock_bm25()?;
276                        *bm25 = index;
277                        bm25_loaded = true;
278                        tracing::info!(
279                            "Loaded BM25 index from disk ({} documents)",
280                            bm25.doc_count
281                        );
282                    }
283                    Err(e) => {
284                        tracing::warn!("Failed to deserialize BM25 index, rebuilding: {e}");
285                    }
286                },
287                Err(e) => {
288                    tracing::warn!("Failed to read BM25 index file, rebuilding: {e}");
289                }
290            }
291        }
292
293        if !bm25_loaded {
294            // Rebuild BM25 index from all existing memories (batch load)
295            if let Ok(ids) = engine.storage.list_memory_ids() {
296                let id_refs: Vec<&str> = ids.iter().map(|s| s.as_str()).collect();
297                if let Ok(memories) = engine.storage.get_memories_batch(&id_refs) {
298                    let mut bm25 = engine.lock_bm25()?;
299                    for memory in &memories {
300                        bm25.add_document(&memory.id, &memory.content);
301                    }
302                    tracing::info!("Rebuilt BM25 index from {} memories", bm25.doc_count);
303                }
304            }
305        }
306
307        Ok(engine)
308    }
309
310    /// Create a minimal engine for testing.
311    pub fn for_testing() -> Self {
312        let storage = Storage::open_in_memory().unwrap();
313        let vector = HnswIndex::with_defaults().unwrap();
314        let graph = GraphEngine::new();
315        let config = CodememConfig::default();
316        Self {
317            storage: Box::new(storage),
318            vector: Mutex::new(vector),
319            graph: Mutex::new(graph),
320            embeddings: None,
321            db_path: None,
322            index_cache: Mutex::new(None),
323            scoring_weights: RwLock::new(config.scoring.clone()),
324            bm25_index: Mutex::new(Bm25Index::new()),
325            config,
326            metrics: Arc::new(InMemoryMetrics::new()),
327            dirty: AtomicBool::new(false),
328            active_session_id: RwLock::new(None),
329        }
330    }
331
332    // ── Lock Helpers ─────────────────────────────────────────────────────────
333
334    pub fn lock_vector(&self) -> Result<std::sync::MutexGuard<'_, HnswIndex>, CodememError> {
335        self.vector
336            .lock()
337            .map_err(|e| CodememError::LockPoisoned(format!("vector: {e}")))
338    }
339
340    pub fn lock_graph(&self) -> Result<std::sync::MutexGuard<'_, GraphEngine>, CodememError> {
341        self.graph
342            .lock()
343            .map_err(|e| CodememError::LockPoisoned(format!("graph: {e}")))
344    }
345
346    pub fn lock_bm25(&self) -> Result<std::sync::MutexGuard<'_, Bm25Index>, CodememError> {
347        self.bm25_index
348            .lock()
349            .map_err(|e| CodememError::LockPoisoned(format!("bm25: {e}")))
350    }
351
352    pub fn lock_embeddings(
353        &self,
354    ) -> Result<
355        Option<std::sync::MutexGuard<'_, Box<dyn codemem_embeddings::EmbeddingProvider>>>,
356        CodememError,
357    > {
358        match &self.embeddings {
359            Some(m) => Ok(Some(m.lock().map_err(|e| {
360                CodememError::LockPoisoned(format!("embeddings: {e}"))
361            })?)),
362            None => Ok(None),
363        }
364    }
365
366    pub fn lock_index_cache(
367        &self,
368    ) -> Result<std::sync::MutexGuard<'_, Option<IndexCache>>, CodememError> {
369        self.index_cache
370            .lock()
371            .map_err(|e| CodememError::LockPoisoned(format!("index_cache: {e}")))
372    }
373
374    pub fn scoring_weights(
375        &self,
376    ) -> Result<std::sync::RwLockReadGuard<'_, ScoringWeights>, CodememError> {
377        self.scoring_weights
378            .read()
379            .map_err(|e| CodememError::LockPoisoned(format!("scoring_weights read: {e}")))
380    }
381
382    pub fn scoring_weights_mut(
383        &self,
384    ) -> Result<std::sync::RwLockWriteGuard<'_, ScoringWeights>, CodememError> {
385        self.scoring_weights
386            .write()
387            .map_err(|e| CodememError::LockPoisoned(format!("scoring_weights write: {e}")))
388    }
389
390    // ── Active Session ───────────────────────────────────────────────────
391
392    /// Set the active session ID for auto-populating `session_id` on persisted memories.
393    pub fn set_active_session(&self, id: Option<String>) {
394        match self.active_session_id.write() {
395            Ok(mut guard) => *guard = id,
396            Err(e) => *e.into_inner() = id,
397        }
398    }
399
400    /// Get the current active session ID.
401    pub fn active_session_id(&self) -> Option<String> {
402        match self.active_session_id.read() {
403            Ok(guard) => guard.clone(),
404            Err(e) => e.into_inner().clone(),
405        }
406    }
407
408    // ── Public Accessors ──────────────────────────────────────────────────
409
410    /// Access the storage backend.
411    pub fn storage(&self) -> &dyn StorageBackend {
412        &*self.storage
413    }
414
415    /// Whether an embedding provider is configured.
416    pub fn has_embeddings(&self) -> bool {
417        self.embeddings.is_some()
418    }
419
420    /// Access the database path (if backed by a file).
421    pub fn db_path(&self) -> Option<&Path> {
422        self.db_path.as_deref()
423    }
424
425    /// Access the loaded configuration.
426    pub fn config(&self) -> &CodememConfig {
427        &self.config
428    }
429
430    /// Access the metrics collector.
431    pub fn metrics(&self) -> &Arc<InMemoryMetrics> {
432        &self.metrics
433    }
434
435    /// Access the raw graph Mutex (for callers that need `&Mutex<GraphEngine>`).
436    pub fn graph_mutex(&self) -> &Mutex<GraphEngine> {
437        &self.graph
438    }
439
440    /// Access the raw vector Mutex (for callers that need `&Mutex<HnswIndex>`).
441    pub fn vector_mutex(&self) -> &Mutex<HnswIndex> {
442        &self.vector
443    }
444
445    /// Access the raw BM25 Mutex (for callers that need `&Mutex<Bm25Index>`).
446    pub fn bm25_mutex(&self) -> &Mutex<Bm25Index> {
447        &self.bm25_index
448    }
449
450    /// Access the raw embeddings Mutex (for callers that need the `Option<&Mutex<...>>`).
451    pub fn embeddings_mutex(
452        &self,
453    ) -> Option<&Mutex<Box<dyn codemem_embeddings::EmbeddingProvider>>> {
454        self.embeddings.as_ref()
455    }
456
457    /// Check if the engine has unsaved changes (dirty flag is set).
458    #[cfg(test)]
459    pub(crate) fn is_dirty(&self) -> bool {
460        self.dirty.load(Ordering::Acquire)
461    }
462}
463
464// Re-export types from file_indexing at crate root for API compatibility
465pub use file_indexing::{IndexEnrichResult, SessionContext};
codemem_engine/lib.rs

codemem_engine/
lib.rs