Skip to main content

codemem_engine/
lib.rs

1//! codemem-engine: Domain logic engine for the Codemem memory system.
2//!
3//! This crate contains all business logic, orchestration, and domain operations:
4//! - **index** — ast-grep based code indexing, symbol extraction, reference resolution
5//! - **hooks** — Lifecycle hook handlers (PostToolUse, SessionStart, Stop)
6//! - **watch** — Real-time file watching with debouncing and .gitignore support
7//! - **bm25** — Okapi BM25 scoring with code-aware tokenization
8//! - **scoring** — 9-component hybrid scoring for memory recall
9//! - **patterns** — Cross-session pattern detection
10//! - **compress** — Optional LLM-powered observation compression
11//! - **metrics** — Operational metrics collection
12
13use codemem_core::{
14    CodememConfig, CodememError, ScoringWeights, StorageBackend, VectorBackend, VectorConfig,
15};
16use codemem_storage::graph::GraphEngine;
17use codemem_storage::HnswIndex;
18use codemem_storage::Storage;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::AtomicBool;
21#[cfg(test)]
22use std::sync::atomic::Ordering;
23use std::sync::{Arc, Mutex, RwLock};
24
25pub mod analysis;
26pub mod bm25;
27pub mod compress;
28pub mod consolidation;
29pub mod enrichment;
30mod enrichment_text;
31mod file_indexing;
32mod graph_linking;
33pub mod hooks;
34pub mod index;
35mod memory_ops;
36pub mod metrics;
37pub mod patterns;
38pub mod persistence;
39pub mod recall;
40pub mod scoring;
41pub mod search;
42pub mod watch;
43
44#[cfg(test)]
45#[path = "tests/engine_integration_tests.rs"]
46mod integration_tests;
47
48#[cfg(test)]
49#[path = "tests/enrichment_tests.rs"]
50mod enrichment_tests;
51
52#[cfg(test)]
53#[path = "tests/recall_tests.rs"]
54mod recall_tests;
55
56#[cfg(test)]
57#[path = "tests/search_tests.rs"]
58mod search_tests;
59
60#[cfg(test)]
61#[path = "tests/consolidation_tests.rs"]
62mod consolidation_tests;
63
64#[cfg(test)]
65#[path = "tests/analysis_tests.rs"]
66mod analysis_tests;
67
68#[cfg(test)]
69#[path = "tests/persistence_tests.rs"]
70mod persistence_tests;
71
72// Re-export key index types at crate root for convenience
73pub use index::{
74    ChunkConfig, CodeChunk, CodeParser, Dependency, IndexAndResolveResult, IndexProgress,
75    IndexResult, Indexer, ManifestResult, ParseResult, Reference, ReferenceKind, ReferenceResolver,
76    ResolvedEdge, Symbol, SymbolKind, Visibility, Workspace,
77};
78
79// Re-export key domain types for convenience
80pub use bm25::Bm25Index;
81pub use metrics::InMemoryMetrics;
82
83// Re-export enrichment types
84pub use enrichment::{EnrichResult, EnrichmentPipelineResult};
85
86// Re-export persistence types
87pub use persistence::{edge_weight_for, IndexPersistResult};
88
89// Re-export recall types
90pub use recall::{ExpandedResult, NamespaceStats};
91
92// Re-export search types
93pub use search::{CodeSearchResult, SummaryTreeNode, SymbolSearchResult};
94
95// Re-export analysis types
96pub use analysis::{
97    DecisionChain, DecisionConnection, DecisionEntry, ImpactResult, SessionCheckpointReport,
98};
99
/// Describes one part to be produced by `split_memory()`.
///
/// Only `content` is mandatory; `tags` and `importance` may be left as `None`.
/// NOTE(review): presumably `None` means "fall back to the values of the memory
/// being split" — confirm against `split_memory()`.
#[derive(Clone, Debug)]
pub struct SplitPart {
    /// Text content of this part.
    pub content: String,
    /// Optional tags for this part.
    pub tags: Option<Vec<String>>,
    /// Optional importance value for this part.
    pub importance: Option<f64>,
}
107
108// ── Index Cache ──────────────────────────────────────────────────────────────
109
/// Cached code-index results for structural queries.
///
/// The engine stores this behind `Mutex<Option<IndexCache>>`, so `None` there
/// means nothing has been cached yet.
pub struct IndexCache {
    /// Symbols held in the cache.
    pub symbols: Vec<Symbol>,
    /// Code chunks held in the cache.
    pub chunks: Vec<CodeChunk>,
    /// NOTE(review): presumably the root directory the cached results were
    /// indexed from — confirm against the code that fills the cache.
    pub root_path: String,
}
116
117// ── CodememEngine ────────────────────────────────────────────────────────────
118
119/// Core domain engine holding all backends and domain state.
120///
121/// This struct contains all the business logic for the Codemem memory system.
122/// Transport layers (MCP, REST API, CLI) hold a `CodememEngine` and delegate
123/// domain operations to it, keeping transport concerns separate.
124///
125/// **Concrete types are intentional**: `CodememEngine` uses concrete backend types
126/// (`Storage`, `HnswIndex`, `GraphEngine`) rather than trait objects (`dyn StorageBackend`,
127/// `dyn VectorBackend`, `dyn GraphBackend`) for performance. This enables monomorphization
128/// (the compiler generates specialized code for each concrete type), eliminates vtable
129/// indirection overhead on every call, and provides predictable memory layout for
130/// cache-friendly access patterns. The trait abstractions exist for testing and
131/// alternative implementations, but the engine itself benefits from static dispatch.
132pub struct CodememEngine {
133    pub(crate) storage: Box<dyn StorageBackend>,
134    pub(crate) vector: Mutex<HnswIndex>,
135    pub(crate) graph: Mutex<GraphEngine>,
136    /// Optional embedding provider (None if not configured).
137    pub(crate) embeddings: Option<Mutex<Box<dyn codemem_embeddings::EmbeddingProvider>>>,
138    /// Path to the database file, used to derive the index save path.
139    pub(crate) db_path: Option<PathBuf>,
140    /// Cached index results for structural queries.
141    pub(crate) index_cache: Mutex<Option<IndexCache>>,
142    /// Configurable scoring weights for the 9-component hybrid scoring system.
143    pub(crate) scoring_weights: RwLock<ScoringWeights>,
144    /// BM25 index for code-aware token overlap scoring.
145    pub(crate) bm25_index: Mutex<Bm25Index>,
146    /// Loaded configuration.
147    pub(crate) config: CodememConfig,
148    /// Operational metrics collector.
149    pub(crate) metrics: Arc<InMemoryMetrics>,
150    /// Dirty flag for batch saves: set after `persist_memory_no_save()`,
151    /// cleared by `save_index()`.
152    dirty: AtomicBool,
153}
154
155impl CodememEngine {
156    /// Create an engine with storage, vector, graph, and optional embeddings backends.
157    pub fn new(
158        storage: Box<dyn StorageBackend>,
159        vector: HnswIndex,
160        graph: GraphEngine,
161        embeddings: Option<Box<dyn codemem_embeddings::EmbeddingProvider>>,
162    ) -> Self {
163        let config = CodememConfig::load_or_default();
164        Self::new_with_config(storage, vector, graph, embeddings, config)
165    }
166
167    /// Create an engine with an explicit config (avoids double-loading from disk).
168    pub fn new_with_config(
169        storage: Box<dyn StorageBackend>,
170        vector: HnswIndex,
171        graph: GraphEngine,
172        embeddings: Option<Box<dyn codemem_embeddings::EmbeddingProvider>>,
173        config: CodememConfig,
174    ) -> Self {
175        Self {
176            storage,
177            vector: Mutex::new(vector),
178            graph: Mutex::new(graph),
179            embeddings: embeddings.map(Mutex::new),
180            db_path: None,
181            index_cache: Mutex::new(None),
182            scoring_weights: RwLock::new(config.scoring.clone()),
183            bm25_index: Mutex::new(Bm25Index::new()),
184            config,
185            metrics: Arc::new(InMemoryMetrics::new()),
186            dirty: AtomicBool::new(false),
187        }
188    }
189
190    /// Create an engine from a database path, loading all backends.
191    pub fn from_db_path(db_path: &Path) -> Result<Self, CodememError> {
192        // Ensure parent directory exists (e.g. ~/.codemem/)
193        if let Some(parent) = db_path.parent() {
194            if !parent.exists() {
195                std::fs::create_dir_all(parent).map_err(|e| {
196                    CodememError::Storage(format!(
197                        "Failed to create database directory {}: {e}",
198                        parent.display()
199                    ))
200                })?;
201            }
202        }
203
204        let config = CodememConfig::load_or_default();
205
206        // Wire StorageConfig into Storage::open
207        let storage = Storage::open_with_config(
208            db_path,
209            Some(config.storage.cache_size_mb),
210            Some(config.storage.busy_timeout_secs),
211        )?;
212        let vector_config = VectorConfig {
213            dimensions: config.vector.dimensions,
214            ..VectorConfig::default()
215        };
216        let mut vector = HnswIndex::new(vector_config.clone())?;
217
218        // Load existing vector index if it exists
219        let index_path = db_path.with_extension("idx");
220        if index_path.exists() {
221            vector.load(&index_path)?;
222        }
223
224        // C6: Vector index consistency check — compare vector index count vs DB embedding count.
225        // If they mismatch, rebuild the vector index from SQLite embeddings.
226        let vector_count = vector.stats().count;
227        let db_stats = storage.stats()?;
228        let db_embed_count = db_stats.embedding_count;
229        if vector_count != db_embed_count {
230            tracing::warn!(
231                "Vector index ({vector_count}) out of sync with DB ({db_embed_count}), rebuilding..."
232            );
233            // Rebuild: create a fresh index and re-insert all embeddings from DB
234            let mut fresh_vector = HnswIndex::new(vector_config)?;
235            if let Ok(embeddings) = storage.list_all_embeddings() {
236                for (id, embedding) in &embeddings {
237                    if let Err(e) = fresh_vector.insert(id, embedding) {
238                        tracing::warn!("Failed to re-insert embedding {id}: {e}");
239                    }
240                }
241            }
242            vector = fresh_vector;
243            // Save the rebuilt index
244            if let Err(e) = vector.save(&index_path) {
245                tracing::warn!("Failed to save rebuilt vector index: {e}");
246            }
247        }
248
249        // Load graph from storage
250        let graph = GraphEngine::from_storage(&storage)?;
251
252        // Wire EmbeddingConfig into from_env as fallback
253        let embeddings = codemem_embeddings::from_env(Some(&config.embedding)).ok();
254
255        let mut engine =
256            Self::new_with_config(Box::new(storage), vector, graph, embeddings, config);
257        engine.db_path = Some(db_path.to_path_buf());
258
259        // H7: Only compute PageRank at startup; betweenness is computed lazily
260        // via `ensure_betweenness_computed()` when first needed.
261        engine
262            .lock_graph()?
263            .recompute_centrality_with_options(false);
264
265        // Try loading persisted BM25 index; fall back to rebuilding from memories.
266        let bm25_path = db_path.with_extension("bm25");
267        let mut bm25_loaded = false;
268        if bm25_path.exists() {
269            match std::fs::read(&bm25_path) {
270                Ok(data) => match Bm25Index::deserialize(&data) {
271                    Ok(index) => {
272                        let mut bm25 = engine.lock_bm25()?;
273                        *bm25 = index;
274                        bm25_loaded = true;
275                        tracing::info!(
276                            "Loaded BM25 index from disk ({} documents)",
277                            bm25.doc_count
278                        );
279                    }
280                    Err(e) => {
281                        tracing::warn!("Failed to deserialize BM25 index, rebuilding: {e}");
282                    }
283                },
284                Err(e) => {
285                    tracing::warn!("Failed to read BM25 index file, rebuilding: {e}");
286                }
287            }
288        }
289
290        if !bm25_loaded {
291            // Rebuild BM25 index from all existing memories (batch load)
292            if let Ok(ids) = engine.storage.list_memory_ids() {
293                let id_refs: Vec<&str> = ids.iter().map(|s| s.as_str()).collect();
294                if let Ok(memories) = engine.storage.get_memories_batch(&id_refs) {
295                    let mut bm25 = engine.lock_bm25()?;
296                    for memory in &memories {
297                        bm25.add_document(&memory.id, &memory.content);
298                    }
299                    tracing::info!("Rebuilt BM25 index from {} memories", bm25.doc_count);
300                }
301            }
302        }
303
304        Ok(engine)
305    }
306
307    /// Create a minimal engine for testing.
308    pub fn for_testing() -> Self {
309        let storage = Storage::open_in_memory().unwrap();
310        let vector = HnswIndex::with_defaults().unwrap();
311        let graph = GraphEngine::new();
312        let config = CodememConfig::default();
313        Self {
314            storage: Box::new(storage),
315            vector: Mutex::new(vector),
316            graph: Mutex::new(graph),
317            embeddings: None,
318            db_path: None,
319            index_cache: Mutex::new(None),
320            scoring_weights: RwLock::new(config.scoring.clone()),
321            bm25_index: Mutex::new(Bm25Index::new()),
322            config,
323            metrics: Arc::new(InMemoryMetrics::new()),
324            dirty: AtomicBool::new(false),
325        }
326    }
327
328    // ── Lock Helpers ─────────────────────────────────────────────────────────
329
330    pub fn lock_vector(&self) -> Result<std::sync::MutexGuard<'_, HnswIndex>, CodememError> {
331        self.vector
332            .lock()
333            .map_err(|e| CodememError::LockPoisoned(format!("vector: {e}")))
334    }
335
336    pub fn lock_graph(&self) -> Result<std::sync::MutexGuard<'_, GraphEngine>, CodememError> {
337        self.graph
338            .lock()
339            .map_err(|e| CodememError::LockPoisoned(format!("graph: {e}")))
340    }
341
342    pub fn lock_bm25(&self) -> Result<std::sync::MutexGuard<'_, Bm25Index>, CodememError> {
343        self.bm25_index
344            .lock()
345            .map_err(|e| CodememError::LockPoisoned(format!("bm25: {e}")))
346    }
347
348    pub fn lock_embeddings(
349        &self,
350    ) -> Result<
351        Option<std::sync::MutexGuard<'_, Box<dyn codemem_embeddings::EmbeddingProvider>>>,
352        CodememError,
353    > {
354        match &self.embeddings {
355            Some(m) => Ok(Some(m.lock().map_err(|e| {
356                CodememError::LockPoisoned(format!("embeddings: {e}"))
357            })?)),
358            None => Ok(None),
359        }
360    }
361
362    pub fn lock_index_cache(
363        &self,
364    ) -> Result<std::sync::MutexGuard<'_, Option<IndexCache>>, CodememError> {
365        self.index_cache
366            .lock()
367            .map_err(|e| CodememError::LockPoisoned(format!("index_cache: {e}")))
368    }
369
370    pub fn scoring_weights(
371        &self,
372    ) -> Result<std::sync::RwLockReadGuard<'_, ScoringWeights>, CodememError> {
373        self.scoring_weights
374            .read()
375            .map_err(|e| CodememError::LockPoisoned(format!("scoring_weights read: {e}")))
376    }
377
378    pub fn scoring_weights_mut(
379        &self,
380    ) -> Result<std::sync::RwLockWriteGuard<'_, ScoringWeights>, CodememError> {
381        self.scoring_weights
382            .write()
383            .map_err(|e| CodememError::LockPoisoned(format!("scoring_weights write: {e}")))
384    }
385
386    // ── Public Accessors ──────────────────────────────────────────────────
387
388    /// Access the storage backend.
389    pub fn storage(&self) -> &dyn StorageBackend {
390        &*self.storage
391    }
392
393    /// Whether an embedding provider is configured.
394    pub fn has_embeddings(&self) -> bool {
395        self.embeddings.is_some()
396    }
397
    /// Access the database path (if backed by a file).
    ///
    /// Returns `None` when no path has been recorded on the engine.
    pub fn db_path(&self) -> Option<&Path> {
        self.db_path.as_deref()
    }
402
    /// Access the loaded configuration.
    ///
    /// Returns a shared reference to the `CodememConfig` stored in the engine.
    pub fn config(&self) -> &CodememConfig {
        &self.config
    }
407
    /// Access the metrics collector.
    ///
    /// Returns a reference to the engine's `Arc`; clone it to obtain a handle
    /// that outlives this borrow.
    pub fn metrics(&self) -> &Arc<InMemoryMetrics> {
        &self.metrics
    }
412
    /// Access the raw graph Mutex (for callers that need `&Mutex<GraphEngine>`).
    ///
    /// Unlike `lock_graph`, this does not lock and does not translate poisoning
    /// into a `CodememError`; the caller handles the lock result itself.
    pub fn graph_mutex(&self) -> &Mutex<GraphEngine> {
        &self.graph
    }
417
    /// Access the raw vector Mutex (for callers that need `&Mutex<HnswIndex>`).
    ///
    /// Unlike `lock_vector`, this does not lock and does not translate poisoning
    /// into a `CodememError`; the caller handles the lock result itself.
    pub fn vector_mutex(&self) -> &Mutex<HnswIndex> {
        &self.vector
    }
422
    /// Access the raw BM25 Mutex (for callers that need `&Mutex<Bm25Index>`).
    ///
    /// Unlike `lock_bm25`, this does not lock and does not translate poisoning
    /// into a `CodememError`; the caller handles the lock result itself.
    pub fn bm25_mutex(&self) -> &Mutex<Bm25Index> {
        &self.bm25_index
    }
427
    /// Access the raw embeddings Mutex (for callers that need the `Option<&Mutex<...>>`).
    ///
    /// Returns `None` when the engine has no embedding provider. Unlike
    /// `lock_embeddings`, this does not lock the mutex.
    pub fn embeddings_mutex(
        &self,
    ) -> Option<&Mutex<Box<dyn codemem_embeddings::EmbeddingProvider>>> {
        self.embeddings.as_ref()
    }
434
    /// Check if the engine has unsaved changes (dirty flag is set).
    ///
    /// Only compiled for tests. Reads the flag with `Ordering::Acquire`.
    #[cfg(test)]
    pub(crate) fn is_dirty(&self) -> bool {
        self.dirty.load(Ordering::Acquire)
    }
440}
441
442// Re-export types from file_indexing at crate root for API compatibility
443pub use file_indexing::{IndexEnrichResult, SessionContext};