Skip to main content

codemem_engine/
lib.rs

1//! codemem-engine: Domain logic engine for the Codemem memory system.
2//!
3//! This crate contains all business logic, orchestration, and domain operations:
4//! - **index** — ast-grep based code indexing, symbol extraction, reference resolution
5//! - **hooks** — Lifecycle hook handlers (PostToolUse, SessionStart, Stop)
6//! - **watch** — Real-time file watching with debouncing and .gitignore support
7//! - **bm25** — Okapi BM25 scoring with code-aware tokenization
8//! - **scoring** — 9-component hybrid scoring for memory recall
9//! - **patterns** — Cross-session pattern detection
10//! - **compress** — Optional LLM-powered observation compression
11//! - **metrics** — Operational metrics collection
12
13use codemem_core::{
14    CodememConfig, CodememError, ScoringWeights, StorageBackend, VectorBackend, VectorConfig,
15};
16pub use codemem_storage::graph::GraphEngine;
17pub use codemem_storage::HnswIndex;
18pub use codemem_storage::Storage;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::AtomicBool;
21#[cfg(test)]
22use std::sync::atomic::Ordering;
23use std::sync::{Arc, Mutex, OnceLock, RwLock};
24
25pub mod analysis;
26pub mod bm25;
27pub mod compress;
28pub mod consolidation;
29pub mod enrichment;
30mod enrichment_text;
31mod file_indexing;
32mod graph_linking;
33pub mod graph_ops;
34pub mod hooks;
35pub mod index;
36pub mod insights;
37mod memory_ops;
38pub mod metrics;
39pub mod patterns;
40pub mod pca;
41pub mod persistence;
42pub mod recall;
43pub mod scoring;
44pub mod search;
45pub mod watch;
46
47#[cfg(test)]
48#[path = "tests/engine_integration_tests.rs"]
49mod integration_tests;
50
51#[cfg(test)]
52#[path = "tests/enrichment_tests.rs"]
53mod enrichment_tests;
54
55#[cfg(test)]
56#[path = "tests/recall_tests.rs"]
57mod recall_tests;
58
59#[cfg(test)]
60#[path = "tests/search_tests.rs"]
61mod search_tests;
62
63#[cfg(test)]
64#[path = "tests/consolidation_tests.rs"]
65mod consolidation_tests;
66
67#[cfg(test)]
68#[path = "tests/analysis_tests.rs"]
69mod analysis_tests;
70
71#[cfg(test)]
72#[path = "tests/persistence_tests.rs"]
73mod persistence_tests;
74
75// Re-export key index types at crate root for convenience
76pub use index::{
77    ChunkConfig, CodeChunk, CodeParser, Dependency, IndexAndResolveResult, IndexProgress,
78    IndexResult, Indexer, ManifestResult, ParseResult, Reference, ReferenceKind, ReferenceResolver,
79    ResolvedEdge, Symbol, SymbolKind, Visibility, Workspace,
80};
81
82// Re-export key domain types for convenience
83pub use bm25::Bm25Index;
84pub use metrics::InMemoryMetrics;
85
86// Re-export enrichment types
87pub use enrichment::{EnrichResult, EnrichmentPipelineResult};
88
89// Re-export persistence types
90pub use persistence::{edge_weight_for, IndexPersistResult};
91
92// Re-export recall types
93pub use recall::{ExpandedResult, NamespaceStats, RecallQuery};
94
95// Re-export search types
96pub use search::{CodeSearchResult, SummaryTreeNode, SymbolSearchResult};
97
98// Re-export analysis types
99pub use analysis::{
100    DecisionChain, DecisionConnection, DecisionEntry, ImpactResult, SessionCheckpointReport,
101};
102
/// A part descriptor for `split_memory()`.
#[derive(Debug, Clone)]
pub struct SplitPart {
    // Text content of this split-out part.
    pub content: String,
    // Optional tags for the part; semantics of `None` (inherit vs. empty)
    // are decided by the split operation — confirm in `split_memory()`.
    pub tags: Option<Vec<String>>,
    // Optional importance override for the part.
    pub importance: Option<f64>,
}
110
111// ── Index Cache ──────────────────────────────────────────────────────────────
112
/// Cached code-index results for structural queries.
pub struct IndexCache {
    // Symbols extracted by the indexer.
    pub symbols: Vec<Symbol>,
    // Code chunks produced during indexing.
    pub chunks: Vec<CodeChunk>,
    // Root path the index was built from (stored as a string).
    pub root_path: String,
}
119
120// ── CodememEngine ────────────────────────────────────────────────────────────
121
/// Core domain engine holding all backends and domain state.
///
/// This struct contains all the business logic for the Codemem memory system.
/// Transport layers (MCP, REST API, CLI) hold a `CodememEngine` and delegate
/// domain operations to it, keeping transport concerns separate.
///
/// **Dispatch strategy**: the vector and graph backends use concrete types
/// (`HnswIndex`, `GraphEngine`) rather than trait objects for performance —
/// monomorphization, no vtable indirection on every call, and predictable
/// memory layout. The storage backend is the exception: it is held as
/// `Box<dyn StorageBackend>` so alternative storage implementations (and test
/// doubles) can be swapped in at construction time.
pub struct CodememEngine {
    // Storage backend, dynamically dispatched (see dispatch note above).
    pub(crate) storage: Box<dyn StorageBackend>,
    /// Lazily initialized HNSW vector index. Loaded on first `lock_vector()` call.
    pub(crate) vector: OnceLock<Mutex<HnswIndex>>,
    // In-memory knowledge graph, loaded eagerly at construction.
    pub(crate) graph: Mutex<GraphEngine>,
    /// Lazily initialized embedding provider. Loaded on first `lock_embeddings()` call.
    // Outer `Option` is `None` inside the `OnceLock` when init ran but no
    // provider is configured.
    pub(crate) embeddings: OnceLock<Option<Mutex<Box<dyn codemem_embeddings::EmbeddingProvider>>>>,
    /// Path to the database file, used to derive the index save path.
    pub(crate) db_path: Option<PathBuf>,
    /// Cached index results for structural queries.
    pub(crate) index_cache: Mutex<Option<IndexCache>>,
    /// Configurable scoring weights for the 9-component hybrid scoring system.
    pub(crate) scoring_weights: RwLock<ScoringWeights>,
    /// Lazily initialized BM25 index. Loaded on first `lock_bm25()` call.
    pub(crate) bm25_index: OnceLock<Mutex<Bm25Index>>,
    /// Loaded configuration.
    pub(crate) config: CodememConfig,
    /// Operational metrics collector.
    pub(crate) metrics: Arc<InMemoryMetrics>,
    /// Dirty flag for batch saves: set after `persist_memory_no_save()`,
    /// cleared by `save_index()`.
    dirty: AtomicBool,
    /// Active session ID for auto-populating `session_id` on persisted memories.
    active_session_id: RwLock<Option<String>>,
    /// Cached change detector for incremental single-file indexing.
    /// Loaded lazily from storage on first use.
    change_detector: Mutex<Option<index::incremental::ChangeDetector>>,
}
163
164impl CodememEngine {
165    /// Create an engine with storage, vector, graph, and optional embeddings backends.
166    pub fn new(
167        storage: Box<dyn StorageBackend>,
168        vector: HnswIndex,
169        graph: GraphEngine,
170        embeddings: Option<Box<dyn codemem_embeddings::EmbeddingProvider>>,
171    ) -> Self {
172        let config = CodememConfig::load_or_default();
173        Self::new_with_config(storage, vector, graph, embeddings, config)
174    }
175
176    /// Create an engine with an explicit config (avoids double-loading from disk).
177    pub fn new_with_config(
178        storage: Box<dyn StorageBackend>,
179        vector: HnswIndex,
180        graph: GraphEngine,
181        embeddings: Option<Box<dyn codemem_embeddings::EmbeddingProvider>>,
182        config: CodememConfig,
183    ) -> Self {
184        let vector_lock = OnceLock::new();
185        let _ = vector_lock.set(Mutex::new(vector));
186        let embeddings_lock = OnceLock::new();
187        let _ = embeddings_lock.set(embeddings.map(Mutex::new));
188        let bm25_lock = OnceLock::new();
189        let _ = bm25_lock.set(Mutex::new(Bm25Index::new()));
190        Self {
191            storage,
192            vector: vector_lock,
193            graph: Mutex::new(graph),
194            embeddings: embeddings_lock,
195            db_path: None,
196            index_cache: Mutex::new(None),
197            scoring_weights: RwLock::new(config.scoring.clone()),
198            bm25_index: bm25_lock,
199            config,
200            metrics: Arc::new(InMemoryMetrics::new()),
201            dirty: AtomicBool::new(false),
202            active_session_id: RwLock::new(None),
203            change_detector: Mutex::new(None),
204        }
205    }
206
    /// Create an engine from a database path.
    ///
    /// Only loads SQLite storage and the in-memory graph eagerly. The vector index,
    /// BM25 index, and embedding provider are lazily initialized on first access
    /// via `lock_vector()`, `lock_bm25()`, and `lock_embeddings()`. This makes
    /// lightweight callers (lifecycle hooks) fast (~200ms) while full operations
    /// (recall, search, analyze) pay the init cost once on first use.
    ///
    /// # Errors
    /// Returns `CodememError::Storage` if the parent directory cannot be
    /// created, and propagates errors from opening storage or loading the graph.
    pub fn from_db_path(db_path: &Path) -> Result<Self, CodememError> {
        // Ensure parent directory exists (e.g. ~/.codemem/)
        if let Some(parent) = db_path.parent() {
            if !parent.exists() {
                std::fs::create_dir_all(parent).map_err(|e| {
                    CodememError::Storage(format!(
                        "Failed to create database directory {}: {e}",
                        parent.display()
                    ))
                })?;
            }
        }

        let config = CodememConfig::load_or_default();

        // Wire StorageConfig into Storage::open
        let storage = Storage::open_with_config(
            db_path,
            Some(config.storage.cache_size_mb),
            Some(config.storage.busy_timeout_secs),
        )?;

        // Load graph from storage (needed for centrality and graph queries)
        let graph = GraphEngine::from_storage(&storage)?;

        let engine = Self {
            storage: Box::new(storage),
            // Vector / embeddings / BM25 slots stay empty: lazy init on first use.
            vector: OnceLock::new(),
            graph: Mutex::new(graph),
            embeddings: OnceLock::new(),
            db_path: Some(db_path.to_path_buf()),
            index_cache: Mutex::new(None),
            scoring_weights: RwLock::new(config.scoring.clone()),
            bm25_index: OnceLock::new(),
            config,
            metrics: Arc::new(InMemoryMetrics::new()),
            dirty: AtomicBool::new(false),
            active_session_id: RwLock::new(None),
            change_detector: Mutex::new(None),
        };

        // H7: Only compute PageRank at startup; betweenness is computed lazily
        // via `ensure_betweenness_computed()` when first needed.
        engine
            .lock_graph()?
            .recompute_centrality_with_options(false);

        Ok(engine)
    }
263
264    /// Create a minimal engine for testing.
265    pub fn for_testing() -> Self {
266        let storage = Storage::open_in_memory().unwrap();
267        let graph = GraphEngine::new();
268        let config = CodememConfig::default();
269        let vector_lock = OnceLock::new();
270        let _ = vector_lock.set(Mutex::new(HnswIndex::with_defaults().unwrap()));
271        let embeddings_lock = OnceLock::new();
272        let _ = embeddings_lock.set(None);
273        let bm25_lock = OnceLock::new();
274        let _ = bm25_lock.set(Mutex::new(Bm25Index::new()));
275        Self {
276            storage: Box::new(storage),
277            vector: vector_lock,
278            graph: Mutex::new(graph),
279            embeddings: embeddings_lock,
280            db_path: None,
281            index_cache: Mutex::new(None),
282            scoring_weights: RwLock::new(config.scoring.clone()),
283            bm25_index: bm25_lock,
284            config,
285            metrics: Arc::new(InMemoryMetrics::new()),
286            dirty: AtomicBool::new(false),
287            active_session_id: RwLock::new(None),
288            change_detector: Mutex::new(None),
289        }
290    }
291
292    // ── Lock Helpers ─────────────────────────────────────────────────────────
293
294    pub fn lock_vector(&self) -> Result<std::sync::MutexGuard<'_, HnswIndex>, CodememError> {
295        self.vector
296            .get_or_init(|| self.init_vector())
297            .lock()
298            .map_err(|e| CodememError::LockPoisoned(format!("vector: {e}")))
299    }
300
301    pub fn lock_graph(&self) -> Result<std::sync::MutexGuard<'_, GraphEngine>, CodememError> {
302        self.graph
303            .lock()
304            .map_err(|e| CodememError::LockPoisoned(format!("graph: {e}")))
305    }
306
307    pub fn lock_bm25(&self) -> Result<std::sync::MutexGuard<'_, Bm25Index>, CodememError> {
308        self.bm25_index
309            .get_or_init(|| self.init_bm25())
310            .lock()
311            .map_err(|e| CodememError::LockPoisoned(format!("bm25: {e}")))
312    }
313
314    /// Lock the embedding provider, lazily initializing it on first access.
315    ///
316    /// Returns `Ok(None)` if no provider is configured (e.g. `from_env()` fails).
317    pub fn lock_embeddings(
318        &self,
319    ) -> Result<
320        Option<std::sync::MutexGuard<'_, Box<dyn codemem_embeddings::EmbeddingProvider>>>,
321        CodememError,
322    > {
323        match self.embeddings.get_or_init(|| self.init_embeddings()) {
324            Some(m) => Ok(Some(m.lock().map_err(|e| {
325                CodememError::LockPoisoned(format!("embeddings: {e}"))
326            })?)),
327            None => Ok(None),
328        }
329    }
330
331    /// Check if embeddings are already initialized (without triggering lazy init).
332    fn embeddings_ready(&self) -> bool {
333        self.embeddings.get().is_some_and(|opt| opt.is_some())
334    }
335
336    /// Check if the vector index is already initialized (without triggering lazy init).
337    fn vector_ready(&self) -> bool {
338        self.vector.get().is_some()
339    }
340
341    /// Check if the BM25 index is already initialized (without triggering lazy init).
342    fn bm25_ready(&self) -> bool {
343        self.bm25_index.get().is_some()
344    }
345
346    pub fn lock_index_cache(
347        &self,
348    ) -> Result<std::sync::MutexGuard<'_, Option<IndexCache>>, CodememError> {
349        self.index_cache
350            .lock()
351            .map_err(|e| CodememError::LockPoisoned(format!("index_cache: {e}")))
352    }
353
354    pub fn scoring_weights(
355        &self,
356    ) -> Result<std::sync::RwLockReadGuard<'_, ScoringWeights>, CodememError> {
357        self.scoring_weights
358            .read()
359            .map_err(|e| CodememError::LockPoisoned(format!("scoring_weights read: {e}")))
360    }
361
362    pub fn scoring_weights_mut(
363        &self,
364    ) -> Result<std::sync::RwLockWriteGuard<'_, ScoringWeights>, CodememError> {
365        self.scoring_weights
366            .write()
367            .map_err(|e| CodememError::LockPoisoned(format!("scoring_weights write: {e}")))
368    }
369
370    // ── Lazy Initialization ────────────────────────────────────────────
371
    /// Initialize the HNSW vector index: load from disk, run consistency check.
    ///
    /// Called lazily from `lock_vector()`. Builds an index with the configured
    /// dimensionality, loads a persisted index from `<db>.idx` if present, and
    /// rebuilds from the embeddings stored in SQLite when the on-disk index
    /// count disagrees with the database's embedding count.
    fn init_vector(&self) -> Mutex<HnswIndex> {
        let vector_config = VectorConfig {
            dimensions: self.config.vector.dimensions,
            ..VectorConfig::default()
        };
        // Fall back to a default-configured index if construction with the
        // configured dimensions fails.
        let mut vector = HnswIndex::new(vector_config.clone())
            .unwrap_or_else(|_| HnswIndex::with_defaults().expect("default vector index"));

        if let Some(ref db_path) = self.db_path {
            // The persisted index lives next to the DB file, extension `.idx`.
            let index_path = db_path.with_extension("idx");
            if index_path.exists() {
                // Load failure is non-fatal: the consistency check below will
                // trigger a rebuild from SQLite.
                if let Err(e) = vector.load(&index_path) {
                    tracing::warn!("Stale or corrupt vector index, will rebuild: {e}");
                }
            }

            // C6: Consistency check — rebuild if count mismatches DB embedding count.
            let vector_count = vector.stats().count;
            if let Ok(db_stats) = self.storage.stats() {
                let db_embed_count = db_stats.embedding_count;
                if vector_count != db_embed_count {
                    tracing::warn!(
                        "Vector index ({vector_count}) out of sync with DB ({db_embed_count}), rebuilding..."
                    );
                    if let Ok(mut fresh) = HnswIndex::new(vector_config) {
                        // Re-insert every embedding from SQLite; individual
                        // failures are logged but do not abort the rebuild.
                        if let Ok(embeddings) = self.storage.list_all_embeddings() {
                            for (id, emb) in &embeddings {
                                if let Err(e) = fresh.insert(id, emb) {
                                    tracing::warn!("Failed to re-insert embedding {id}: {e}");
                                }
                            }
                        }
                        vector = fresh;
                        // Persist the rebuilt index so the next startup loads it.
                        if let Err(e) = vector.save(&index_path) {
                            tracing::warn!("Failed to save rebuilt vector index: {e}");
                        }
                    }
                }
            }
        }

        Mutex::new(vector)
    }
416
    /// Initialize the BM25 index: load from disk or rebuild from memories.
    ///
    /// Called lazily from `lock_bm25()`. Tries the persisted `<db>.bm25` file
    /// first; when it is missing or unreadable, rebuilds by re-adding every
    /// memory's content from storage. With no `db_path` (in-memory engine),
    /// returns an empty index.
    fn init_bm25(&self) -> Mutex<Bm25Index> {
        let mut bm25 = Bm25Index::new();

        if let Some(ref db_path) = self.db_path {
            // The persisted index lives next to the DB file, extension `.bm25`.
            let bm25_path = db_path.with_extension("bm25");
            let mut loaded = false;
            if bm25_path.exists() {
                if let Ok(data) = std::fs::read(&bm25_path) {
                    if let Ok(index) = Bm25Index::deserialize(&data) {
                        tracing::info!(
                            "Loaded BM25 index from disk ({} documents)",
                            index.doc_count
                        );
                        bm25 = index;
                        loaded = true;
                    }
                }
            }
            if !loaded {
                // Fallback: rebuild from scratch (file missing or corrupt).
                // Errors here are swallowed deliberately — an empty BM25 index
                // degrades ranking but does not break the engine.
                if let Ok(ids) = self.storage.list_memory_ids() {
                    let id_refs: Vec<&str> = ids.iter().map(|s| s.as_str()).collect();
                    if let Ok(memories) = self.storage.get_memories_batch(&id_refs) {
                        for m in &memories {
                            bm25.add_document(&m.id, &m.content);
                        }
                        tracing::info!("Rebuilt BM25 index from {} memories", bm25.doc_count);
                    }
                }
            }
        }

        Mutex::new(bm25)
    }
451
452    /// Initialize the embedding provider from environment/config.
453    ///
454    /// Also backfills embeddings for any memories that were stored without them
455    /// (e.g. by lifecycle hooks that skipped embedding for speed).
456    fn init_embeddings(&self) -> Option<Mutex<Box<dyn codemem_embeddings::EmbeddingProvider>>> {
457        let provider = match codemem_embeddings::from_env(Some(&self.config.embedding)) {
458            Ok(p) => p,
459            Err(e) => {
460                tracing::warn!("Failed to initialize embedding provider: {e}");
461                return None;
462            }
463        };
464
465        // Backfill un-embedded memories (from hooks that skipped embedding)
466        self.backfill_embeddings(&*provider);
467
468        Some(Mutex::new(provider))
469    }
470
471    /// Embed any memories that lack embeddings in SQLite.
472    ///
473    /// This runs during lazy init of the embedding provider to pick up memories
474    /// stored by lightweight hooks without embedding.
475    fn backfill_embeddings(&self, provider: &dyn codemem_embeddings::EmbeddingProvider) {
476        let ids = match self.storage.list_memory_ids() {
477            Ok(ids) => ids,
478            Err(_) => return,
479        };
480
481        let mut to_embed: Vec<(String, String)> = Vec::new();
482        for id in &ids {
483            if self.storage.get_embedding(id).ok().flatten().is_none() {
484                if let Ok(Some(mem)) = self.storage.get_memory_no_touch(id) {
485                    let text = self.enrich_memory_text(
486                        &mem.content,
487                        mem.memory_type,
488                        &mem.tags,
489                        mem.namespace.as_deref(),
490                        Some(&mem.id),
491                    );
492                    to_embed.push((id.clone(), text));
493                }
494            }
495        }
496
497        if to_embed.is_empty() {
498            return;
499        }
500
501        tracing::info!("Backfilling {} un-embedded memories", to_embed.len());
502        let text_refs: Vec<&str> = to_embed.iter().map(|(_, t)| t.as_str()).collect();
503        match provider.embed_batch(&text_refs) {
504            Ok(embeddings) => {
505                for ((id, _), emb) in to_embed.iter().zip(embeddings.iter()) {
506                    let _ = self.storage.store_embedding(id, emb);
507                    // Insert into vector index if already loaded
508                    if let Some(vi_mutex) = self.vector.get() {
509                        if let Ok(mut vi) = vi_mutex.lock().map_err(|e| {
510                            tracing::warn!("Vector lock failed during backfill: {e}");
511                            e
512                        }) {
513                            let _ = vi.insert(id, emb);
514                        }
515                    }
516                }
517                tracing::info!("Backfilled {} embeddings", to_embed.len());
518            }
519            Err(e) => tracing::warn!("Backfill embedding failed: {e}"),
520        }
521    }
522
523    // ── Active Session ───────────────────────────────────────────────────
524
525    /// Set the active session ID for auto-populating `session_id` on persisted memories.
526    pub fn set_active_session(&self, id: Option<String>) {
527        match self.active_session_id.write() {
528            Ok(mut guard) => *guard = id,
529            Err(e) => *e.into_inner() = id,
530        }
531    }
532
533    /// Get the current active session ID.
534    pub fn active_session_id(&self) -> Option<String> {
535        match self.active_session_id.read() {
536            Ok(guard) => guard.clone(),
537            Err(e) => e.into_inner().clone(),
538        }
539    }
540
541    // ── Public Accessors ──────────────────────────────────────────────────
542
543    /// Access the storage backend.
544    pub fn storage(&self) -> &dyn StorageBackend {
545        &*self.storage
546    }
547
548    /// Whether an embedding provider is configured.
549    ///
550    /// Returns `true` if embeddings are already loaded, or if the config suggests
551    /// a provider is available (without triggering lazy init).
552    pub fn has_embeddings(&self) -> bool {
553        match self.embeddings.get() {
554            Some(opt) => opt.is_some(),
555            None => !self.config.embedding.provider.is_empty(),
556        }
557    }
558
559    /// Access the database path (if backed by a file).
560    pub fn db_path(&self) -> Option<&Path> {
561        self.db_path.as_deref()
562    }
563
    /// Access the loaded configuration.
    ///
    /// The config is fixed at engine construction; the scoring weights have
    /// their own `RwLock`-guarded copy for runtime tuning.
    pub fn config(&self) -> &CodememConfig {
        &self.config
    }
568
    /// Access the metrics collector.
    ///
    /// Returned as `&Arc` so callers can cheaply clone a handle for
    /// background tasks.
    pub fn metrics(&self) -> &Arc<InMemoryMetrics> {
        &self.metrics
    }
573
574    // ── Closure Accessors (safe read-only access for transport layers) ──
575
576    /// Execute a closure with a locked reference to the graph engine.
577    /// Provides safe read-only access without exposing raw mutexes.
578    pub fn with_graph<F, R>(&self, f: F) -> Result<R, CodememError>
579    where
580        F: FnOnce(&GraphEngine) -> R,
581    {
582        let guard = self.lock_graph()?;
583        Ok(f(&guard))
584    }
585
586    /// Execute a closure with a locked reference to the vector index.
587    /// Provides safe read-only access without exposing raw mutexes.
588    pub fn with_vector<F, R>(&self, f: F) -> Result<R, CodememError>
589    where
590        F: FnOnce(&HnswIndex) -> R,
591    {
592        let guard = self.lock_vector()?;
593        Ok(f(&guard))
594    }
595
    /// Check if the engine has unsaved changes (dirty flag is set).
    ///
    /// Test-only helper. Uses `Ordering::Acquire` — presumably paired with a
    /// release store where the flag is set elsewhere in the crate; confirm at
    /// the store site.
    #[cfg(test)]
    pub(crate) fn is_dirty(&self) -> bool {
        self.dirty.load(Ordering::Acquire)
    }
601
602    // ── Repository Management (delegates to storage) ─────────────────
603
    /// List all registered repositories.
    ///
    /// Pure delegation to the storage backend; propagates storage errors.
    pub fn list_repos(&self) -> Result<Vec<codemem_core::Repository>, CodememError> {
        self.storage.list_repos()
    }
608
    /// Add a new repository.
    ///
    /// Pure delegation to the storage backend; propagates storage errors.
    pub fn add_repo(&self, repo: &codemem_core::Repository) -> Result<(), CodememError> {
        self.storage.add_repo(repo)
    }
613
    /// Get a repository by ID.
    ///
    /// Returns `Ok(None)` when no repository with that ID exists; propagates
    /// storage errors.
    pub fn get_repo(&self, id: &str) -> Result<Option<codemem_core::Repository>, CodememError> {
        self.storage.get_repo(id)
    }
618
    /// Remove a repository by ID.
    ///
    /// The returned `bool` presumably indicates whether a repository was
    /// actually removed — confirm against the storage implementation.
    pub fn remove_repo(&self, id: &str) -> Result<bool, CodememError> {
        self.storage.remove_repo(id)
    }
623
    /// Update a repository's status and optionally its last-indexed timestamp.
    ///
    /// `status` is a free-form string interpreted by the storage layer
    /// (NOTE(review): an enum would be safer if the status set is closed).
    /// Semantics of `indexed_at = None` are defined by the storage backend —
    /// confirm there whether it clears or preserves the timestamp.
    pub fn update_repo_status(
        &self,
        id: &str,
        status: &str,
        indexed_at: Option<&str>,
    ) -> Result<(), CodememError> {
        self.storage.update_repo_status(id, status, indexed_at)
    }
633}
634
635// Re-export types from file_indexing at crate root for API compatibility
636pub use file_indexing::{AnalyzeOptions, AnalyzeProgress, AnalyzeResult, SessionContext};
637
638// Re-export embeddings types so downstream crates need not depend on codemem-embeddings directly.
639/// Create an embedding provider from environment configuration.
640pub use codemem_embeddings::from_env as embeddings_from_env;
641pub use codemem_embeddings::{EmbeddingProvider, EmbeddingService};