// codemem_engine/lib.rs

//! codemem-engine: Domain logic engine for the Codemem memory system.
//!
//! This crate contains all business logic, orchestration, and domain operations:
//! - **index** — ast-grep based code indexing, symbol extraction, reference resolution
//! - **hooks** — Lifecycle hook handlers (PostToolUse, SessionStart, Stop)
//! - **watch** — Real-time file watching with debouncing and .gitignore support
//! - **bm25** — Okapi BM25 scoring with code-aware tokenization
//! - **scoring** — 9-component hybrid scoring for memory recall
//! - **patterns** — Cross-session pattern detection
//! - **compress** — Optional LLM-powered observation compression
//! - **metrics** — Operational metrics collection
12
13use codemem_core::{
14    CodememConfig, CodememError, ScoringWeights, StorageBackend, VectorBackend, VectorConfig,
15};
16pub use codemem_storage::graph::GraphEngine;
17pub use codemem_storage::HnswIndex;
18pub use codemem_storage::Storage;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::AtomicBool;
21#[cfg(test)]
22use std::sync::atomic::Ordering;
23use std::sync::{Arc, Mutex, RwLock};
24
25pub mod analysis;
26pub mod bm25;
27pub mod compress;
28pub mod consolidation;
29pub mod enrichment;
30mod enrichment_text;
31mod file_indexing;
32mod graph_linking;
33pub mod graph_ops;
34pub mod hooks;
35pub mod index;
36pub mod insights;
37mod memory_ops;
38pub mod metrics;
39pub mod patterns;
40pub mod pca;
41pub mod persistence;
42pub mod recall;
43pub mod scoring;
44pub mod search;
45pub mod watch;
46
47#[cfg(test)]
48#[path = "tests/engine_integration_tests.rs"]
49mod integration_tests;
50
51#[cfg(test)]
52#[path = "tests/enrichment_tests.rs"]
53mod enrichment_tests;
54
55#[cfg(test)]
56#[path = "tests/recall_tests.rs"]
57mod recall_tests;
58
59#[cfg(test)]
60#[path = "tests/search_tests.rs"]
61mod search_tests;
62
63#[cfg(test)]
64#[path = "tests/consolidation_tests.rs"]
65mod consolidation_tests;
66
67#[cfg(test)]
68#[path = "tests/analysis_tests.rs"]
69mod analysis_tests;
70
71#[cfg(test)]
72#[path = "tests/persistence_tests.rs"]
73mod persistence_tests;
74
75// Re-export key index types at crate root for convenience
76pub use index::{
77    ChunkConfig, CodeChunk, CodeParser, Dependency, IndexAndResolveResult, IndexProgress,
78    IndexResult, Indexer, ManifestResult, ParseResult, Reference, ReferenceKind, ReferenceResolver,
79    ResolvedEdge, Symbol, SymbolKind, Visibility, Workspace,
80};
81
82// Re-export key domain types for convenience
83pub use bm25::Bm25Index;
84pub use metrics::InMemoryMetrics;
85
86// Re-export enrichment types
87pub use enrichment::{EnrichResult, EnrichmentPipelineResult};
88
89// Re-export persistence types
90pub use persistence::{edge_weight_for, IndexPersistResult};
91
92// Re-export recall types
93pub use recall::{ExpandedResult, NamespaceStats, RecallQuery};
94
95// Re-export search types
96pub use search::{CodeSearchResult, SummaryTreeNode, SymbolSearchResult};
97
98// Re-export analysis types
99pub use analysis::{
100    DecisionChain, DecisionConnection, DecisionEntry, ImpactResult, SessionCheckpointReport,
101};
102
/// Describes one piece of a memory that is being split via `split_memory()`.
///
/// Only `content` is mandatory; `tags` and `importance` may be left as `None`.
#[derive(Debug, Clone)]
pub struct SplitPart {
    /// Text content of this part.
    pub content: String,
    /// Tags for this part, if any were supplied.
    pub tags: Option<Vec<String>>,
    /// Importance value for this part, if one was supplied.
    pub importance: Option<f64>,
}
110
111// ── Index Cache ──────────────────────────────────────────────────────────────
112
113/// Cached code-index results for structural queries.
114pub struct IndexCache {
115    pub symbols: Vec<Symbol>,
116    pub chunks: Vec<CodeChunk>,
117    pub root_path: String,
118}
119
120// ── CodememEngine ────────────────────────────────────────────────────────────
121
122/// Core domain engine holding all backends and domain state.
123///
124/// This struct contains all the business logic for the Codemem memory system.
125/// Transport layers (MCP, REST API, CLI) hold a `CodememEngine` and delegate
126/// domain operations to it, keeping transport concerns separate.
127///
128/// **Concrete types are intentional**: `CodememEngine` uses concrete backend types
129/// (`Storage`, `HnswIndex`, `GraphEngine`) rather than trait objects (`dyn StorageBackend`,
130/// `dyn VectorBackend`, `dyn GraphBackend`) for performance. This enables monomorphization
131/// (the compiler generates specialized code for each concrete type), eliminates vtable
132/// indirection overhead on every call, and provides predictable memory layout for
133/// cache-friendly access patterns. The trait abstractions exist for testing and
134/// alternative implementations, but the engine itself benefits from static dispatch.
135pub struct CodememEngine {
136    pub(crate) storage: Box<dyn StorageBackend>,
137    pub(crate) vector: Mutex<HnswIndex>,
138    pub(crate) graph: Mutex<GraphEngine>,
139    /// Optional embedding provider (None if not configured).
140    pub(crate) embeddings: Option<Mutex<Box<dyn codemem_embeddings::EmbeddingProvider>>>,
141    /// Path to the database file, used to derive the index save path.
142    pub(crate) db_path: Option<PathBuf>,
143    /// Cached index results for structural queries.
144    pub(crate) index_cache: Mutex<Option<IndexCache>>,
145    /// Configurable scoring weights for the 9-component hybrid scoring system.
146    pub(crate) scoring_weights: RwLock<ScoringWeights>,
147    /// BM25 index for code-aware token overlap scoring.
148    pub(crate) bm25_index: Mutex<Bm25Index>,
149    /// Loaded configuration.
150    pub(crate) config: CodememConfig,
151    /// Operational metrics collector.
152    pub(crate) metrics: Arc<InMemoryMetrics>,
153    /// Dirty flag for batch saves: set after `persist_memory_no_save()`,
154    /// cleared by `save_index()`.
155    dirty: AtomicBool,
156    /// Active session ID for auto-populating `session_id` on persisted memories.
157    active_session_id: RwLock<Option<String>>,
158}
159
160impl CodememEngine {
161    /// Create an engine with storage, vector, graph, and optional embeddings backends.
162    pub fn new(
163        storage: Box<dyn StorageBackend>,
164        vector: HnswIndex,
165        graph: GraphEngine,
166        embeddings: Option<Box<dyn codemem_embeddings::EmbeddingProvider>>,
167    ) -> Self {
168        let config = CodememConfig::load_or_default();
169        Self::new_with_config(storage, vector, graph, embeddings, config)
170    }
171
172    /// Create an engine with an explicit config (avoids double-loading from disk).
173    pub fn new_with_config(
174        storage: Box<dyn StorageBackend>,
175        vector: HnswIndex,
176        graph: GraphEngine,
177        embeddings: Option<Box<dyn codemem_embeddings::EmbeddingProvider>>,
178        config: CodememConfig,
179    ) -> Self {
180        Self {
181            storage,
182            vector: Mutex::new(vector),
183            graph: Mutex::new(graph),
184            embeddings: embeddings.map(Mutex::new),
185            db_path: None,
186            index_cache: Mutex::new(None),
187            scoring_weights: RwLock::new(config.scoring.clone()),
188            bm25_index: Mutex::new(Bm25Index::new()),
189            config,
190            metrics: Arc::new(InMemoryMetrics::new()),
191            dirty: AtomicBool::new(false),
192            active_session_id: RwLock::new(None),
193        }
194    }
195
196    /// Create an engine from a database path, loading all backends.
197    pub fn from_db_path(db_path: &Path) -> Result<Self, CodememError> {
198        // Ensure parent directory exists (e.g. ~/.codemem/)
199        if let Some(parent) = db_path.parent() {
200            if !parent.exists() {
201                std::fs::create_dir_all(parent).map_err(|e| {
202                    CodememError::Storage(format!(
203                        "Failed to create database directory {}: {e}",
204                        parent.display()
205                    ))
206                })?;
207            }
208        }
209
210        let config = CodememConfig::load_or_default();
211
212        // Wire StorageConfig into Storage::open
213        let storage = Storage::open_with_config(
214            db_path,
215            Some(config.storage.cache_size_mb),
216            Some(config.storage.busy_timeout_secs),
217        )?;
218        let vector_config = VectorConfig {
219            dimensions: config.vector.dimensions,
220            ..VectorConfig::default()
221        };
222        let mut vector = HnswIndex::new(vector_config.clone())?;
223
224        // Load existing vector index if it exists
225        let index_path = db_path.with_extension("idx");
226        if index_path.exists() {
227            if let Err(e) = vector.load(&index_path) {
228                tracing::warn!("Stale or corrupt vector index, will rebuild: {e}");
229            }
230        }
231
232        // C6: Vector index consistency check — compare vector index count vs DB embedding count.
233        // If they mismatch, rebuild the vector index from SQLite embeddings.
234        let vector_count = vector.stats().count;
235        let db_stats = storage.stats()?;
236        let db_embed_count = db_stats.embedding_count;
237        if vector_count != db_embed_count {
238            tracing::warn!(
239                "Vector index ({vector_count}) out of sync with DB ({db_embed_count}), rebuilding..."
240            );
241            // Rebuild: create a fresh index and re-insert all embeddings from DB
242            let mut fresh_vector = HnswIndex::new(vector_config)?;
243            if let Ok(embeddings) = storage.list_all_embeddings() {
244                for (id, embedding) in &embeddings {
245                    if let Err(e) = fresh_vector.insert(id, embedding) {
246                        tracing::warn!("Failed to re-insert embedding {id}: {e}");
247                    }
248                }
249            }
250            vector = fresh_vector;
251            // Save the rebuilt index
252            if let Err(e) = vector.save(&index_path) {
253                tracing::warn!("Failed to save rebuilt vector index: {e}");
254            }
255        }
256
257        // Load graph from storage
258        let graph = GraphEngine::from_storage(&storage)?;
259
260        // Wire EmbeddingConfig into from_env as fallback
261        let embeddings = codemem_embeddings::from_env(Some(&config.embedding)).ok();
262
263        let mut engine =
264            Self::new_with_config(Box::new(storage), vector, graph, embeddings, config);
265        engine.db_path = Some(db_path.to_path_buf());
266
267        // H7: Only compute PageRank at startup; betweenness is computed lazily
268        // via `ensure_betweenness_computed()` when first needed.
269        engine
270            .lock_graph()?
271            .recompute_centrality_with_options(false);
272
273        // Try loading persisted BM25 index; fall back to rebuilding from memories.
274        let bm25_path = db_path.with_extension("bm25");
275        let mut bm25_loaded = false;
276        if bm25_path.exists() {
277            match std::fs::read(&bm25_path) {
278                Ok(data) => match Bm25Index::deserialize(&data) {
279                    Ok(index) => {
280                        let mut bm25 = engine.lock_bm25()?;
281                        *bm25 = index;
282                        bm25_loaded = true;
283                        tracing::info!(
284                            "Loaded BM25 index from disk ({} documents)",
285                            bm25.doc_count
286                        );
287                    }
288                    Err(e) => {
289                        tracing::warn!("Failed to deserialize BM25 index, rebuilding: {e}");
290                    }
291                },
292                Err(e) => {
293                    tracing::warn!("Failed to read BM25 index file, rebuilding: {e}");
294                }
295            }
296        }
297
298        if !bm25_loaded {
299            // Rebuild BM25 index from all existing memories (batch load)
300            if let Ok(ids) = engine.storage.list_memory_ids() {
301                let id_refs: Vec<&str> = ids.iter().map(|s| s.as_str()).collect();
302                if let Ok(memories) = engine.storage.get_memories_batch(&id_refs) {
303                    let mut bm25 = engine.lock_bm25()?;
304                    for memory in &memories {
305                        bm25.add_document(&memory.id, &memory.content);
306                    }
307                    tracing::info!("Rebuilt BM25 index from {} memories", bm25.doc_count);
308                }
309            }
310        }
311
312        Ok(engine)
313    }
314
315    /// Create a minimal engine for testing.
316    pub fn for_testing() -> Self {
317        let storage = Storage::open_in_memory().unwrap();
318        let vector = HnswIndex::with_defaults().unwrap();
319        let graph = GraphEngine::new();
320        let config = CodememConfig::default();
321        Self {
322            storage: Box::new(storage),
323            vector: Mutex::new(vector),
324            graph: Mutex::new(graph),
325            embeddings: None,
326            db_path: None,
327            index_cache: Mutex::new(None),
328            scoring_weights: RwLock::new(config.scoring.clone()),
329            bm25_index: Mutex::new(Bm25Index::new()),
330            config,
331            metrics: Arc::new(InMemoryMetrics::new()),
332            dirty: AtomicBool::new(false),
333            active_session_id: RwLock::new(None),
334        }
335    }
336
337    // ── Lock Helpers ─────────────────────────────────────────────────────────
338
339    pub fn lock_vector(&self) -> Result<std::sync::MutexGuard<'_, HnswIndex>, CodememError> {
340        self.vector
341            .lock()
342            .map_err(|e| CodememError::LockPoisoned(format!("vector: {e}")))
343    }
344
345    pub fn lock_graph(&self) -> Result<std::sync::MutexGuard<'_, GraphEngine>, CodememError> {
346        self.graph
347            .lock()
348            .map_err(|e| CodememError::LockPoisoned(format!("graph: {e}")))
349    }
350
351    pub fn lock_bm25(&self) -> Result<std::sync::MutexGuard<'_, Bm25Index>, CodememError> {
352        self.bm25_index
353            .lock()
354            .map_err(|e| CodememError::LockPoisoned(format!("bm25: {e}")))
355    }
356
357    pub fn lock_embeddings(
358        &self,
359    ) -> Result<
360        Option<std::sync::MutexGuard<'_, Box<dyn codemem_embeddings::EmbeddingProvider>>>,
361        CodememError,
362    > {
363        match &self.embeddings {
364            Some(m) => Ok(Some(m.lock().map_err(|e| {
365                CodememError::LockPoisoned(format!("embeddings: {e}"))
366            })?)),
367            None => Ok(None),
368        }
369    }
370
371    pub fn lock_index_cache(
372        &self,
373    ) -> Result<std::sync::MutexGuard<'_, Option<IndexCache>>, CodememError> {
374        self.index_cache
375            .lock()
376            .map_err(|e| CodememError::LockPoisoned(format!("index_cache: {e}")))
377    }
378
379    pub fn scoring_weights(
380        &self,
381    ) -> Result<std::sync::RwLockReadGuard<'_, ScoringWeights>, CodememError> {
382        self.scoring_weights
383            .read()
384            .map_err(|e| CodememError::LockPoisoned(format!("scoring_weights read: {e}")))
385    }
386
387    pub fn scoring_weights_mut(
388        &self,
389    ) -> Result<std::sync::RwLockWriteGuard<'_, ScoringWeights>, CodememError> {
390        self.scoring_weights
391            .write()
392            .map_err(|e| CodememError::LockPoisoned(format!("scoring_weights write: {e}")))
393    }
394
395    // ── Active Session ───────────────────────────────────────────────────
396
397    /// Set the active session ID for auto-populating `session_id` on persisted memories.
398    pub fn set_active_session(&self, id: Option<String>) {
399        match self.active_session_id.write() {
400            Ok(mut guard) => *guard = id,
401            Err(e) => *e.into_inner() = id,
402        }
403    }
404
405    /// Get the current active session ID.
406    pub fn active_session_id(&self) -> Option<String> {
407        match self.active_session_id.read() {
408            Ok(guard) => guard.clone(),
409            Err(e) => e.into_inner().clone(),
410        }
411    }
412
413    // ── Public Accessors ──────────────────────────────────────────────────
414
415    /// Access the storage backend.
416    pub fn storage(&self) -> &dyn StorageBackend {
417        &*self.storage
418    }
419
420    /// Whether an embedding provider is configured.
421    pub fn has_embeddings(&self) -> bool {
422        self.embeddings.is_some()
423    }
424
425    /// Access the database path (if backed by a file).
426    pub fn db_path(&self) -> Option<&Path> {
427        self.db_path.as_deref()
428    }
429
430    /// Access the loaded configuration.
431    pub fn config(&self) -> &CodememConfig {
432        &self.config
433    }
434
435    /// Access the metrics collector.
436    pub fn metrics(&self) -> &Arc<InMemoryMetrics> {
437        &self.metrics
438    }
439
440    // ── Closure Accessors (safe read-only access for transport layers) ──
441
442    /// Execute a closure with a locked reference to the graph engine.
443    /// Provides safe read-only access without exposing raw mutexes.
444    pub fn with_graph<F, R>(&self, f: F) -> Result<R, CodememError>
445    where
446        F: FnOnce(&GraphEngine) -> R,
447    {
448        let guard = self.lock_graph()?;
449        Ok(f(&guard))
450    }
451
452    /// Execute a closure with a locked reference to the vector index.
453    /// Provides safe read-only access without exposing raw mutexes.
454    pub fn with_vector<F, R>(&self, f: F) -> Result<R, CodememError>
455    where
456        F: FnOnce(&HnswIndex) -> R,
457    {
458        let guard = self.lock_vector()?;
459        Ok(f(&guard))
460    }
461
462    /// Check if the engine has unsaved changes (dirty flag is set).
463    #[cfg(test)]
464    pub(crate) fn is_dirty(&self) -> bool {
465        self.dirty.load(Ordering::Acquire)
466    }
467
468    // ── Repository Management (delegates to storage) ─────────────────
469
470    /// List all registered repositories.
471    pub fn list_repos(&self) -> Result<Vec<codemem_core::Repository>, CodememError> {
472        self.storage.list_repos()
473    }
474
475    /// Add a new repository.
476    pub fn add_repo(&self, repo: &codemem_core::Repository) -> Result<(), CodememError> {
477        self.storage.add_repo(repo)
478    }
479
480    /// Get a repository by ID.
481    pub fn get_repo(&self, id: &str) -> Result<Option<codemem_core::Repository>, CodememError> {
482        self.storage.get_repo(id)
483    }
484
485    /// Remove a repository by ID.
486    pub fn remove_repo(&self, id: &str) -> Result<bool, CodememError> {
487        self.storage.remove_repo(id)
488    }
489
490    /// Update a repository's status and optionally its last-indexed timestamp.
491    pub fn update_repo_status(
492        &self,
493        id: &str,
494        status: &str,
495        indexed_at: Option<&str>,
496    ) -> Result<(), CodememError> {
497        self.storage.update_repo_status(id, status, indexed_at)
498    }
499}
500
501// Re-export types from file_indexing at crate root for API compatibility
502pub use file_indexing::{AnalyzeOptions, AnalyzeProgress, AnalyzeResult, SessionContext};
503
504// Re-export embeddings types so downstream crates need not depend on codemem-embeddings directly.
505/// Create an embedding provider from environment configuration.
506pub use codemem_embeddings::from_env as embeddings_from_env;
507pub use codemem_embeddings::{EmbeddingProvider, EmbeddingService};