// codemem_engine/file_indexing.rs
1use crate::index::{self, IndexAndResolveResult, Indexer};
2use crate::patterns;
3use crate::CodememEngine;
4use codemem_core::{
5    CodememError, DetectedPattern, GraphBackend, MemoryNode, NodeKind, VectorBackend,
6};
7use std::collections::HashSet;
8use std::path::Path;
9use std::sync::atomic::Ordering;
10
11impl CodememEngine {
12    // ── Index Persistence ────────────────────────────────────────────────
13
14    /// Save the vector and BM25 indexes to disk if a db_path is configured.
15    /// Compacts the HNSW index if ghost entries exceed 20% of live entries.
16    /// Always clears the dirty flag so `flush_if_dirty()` won't double-save.
17    pub fn save_index(&self) {
18        if let Some(ref db_path) = self.db_path {
19            let idx_path = db_path.with_extension("idx");
20            if let Ok(mut vi) = self.lock_vector() {
21                // Compact HNSW if ghost entries exceed threshold
22                if vi.needs_compaction() {
23                    let ghost = vi.ghost_count();
24                    let live = vi.stats().count;
25                    tracing::info!(
26                        "HNSW ghost compaction: {ghost} ghosts / {live} live entries, rebuilding..."
27                    );
28                    if let Ok(embeddings) = self.storage.list_all_embeddings() {
29                        if let Err(e) = vi.rebuild_from_entries(&embeddings) {
30                            tracing::warn!("HNSW compaction failed: {e}");
31                        }
32                    }
33                }
34                if let Err(e) = vi.save(&idx_path) {
35                    tracing::warn!("Failed to save vector index: {e}");
36                }
37            }
38
39            // Persist BM25 index alongside the vector index
40            let bm25_path = db_path.with_extension("bm25");
41            if let Ok(bm25) = self.lock_bm25() {
42                if bm25.needs_save() {
43                    let data = bm25.serialize();
44                    let tmp_path = db_path.with_extension("bm25.tmp");
45                    if let Err(e) = std::fs::write(&tmp_path, &data)
46                        .and_then(|_| std::fs::rename(&tmp_path, &bm25_path))
47                    {
48                        tracing::warn!("Failed to save BM25 index: {e}");
49                    }
50                }
51            }
52        }
53        self.dirty.store(false, Ordering::Release);
54    }
55
56    /// Reload the in-memory graph from the database.
57    pub fn reload_graph(&self) -> Result<(), CodememError> {
58        let new_graph = codemem_storage::graph::GraphEngine::from_storage(&*self.storage)?;
59        let mut graph = self.lock_graph()?;
60        *graph = new_graph;
61        graph.recompute_centrality();
62        Ok(())
63    }
64
65    // ── A2: File Watcher Event Processing ───────────────────────────────
66
67    /// Process a single file watcher event by re-indexing changed/created files
68    /// or cleaning up deleted file nodes.
69    ///
70    /// Call this from a watcher event loop:
71    /// ```ignore
72    /// while let Ok(event) = watcher.receiver().recv() {
73    ///     engine.process_watch_event(&event, namespace, Some(root));
74    /// }
75    /// ```
76    pub fn process_watch_event(
77        &self,
78        event: &crate::watch::WatchEvent,
79        namespace: Option<&str>,
80        project_root: Option<&Path>,
81    ) -> Result<(), CodememError> {
82        match event {
83            crate::watch::WatchEvent::FileChanged(path)
84            | crate::watch::WatchEvent::FileCreated(path) => {
85                self.index_single_file(path, namespace, project_root)?;
86            }
87            crate::watch::WatchEvent::FileDeleted(path) => {
88                // Relativize the deleted path so the node ID matches what was indexed.
89                let rel = if let Some(root) = project_root {
90                    path.strip_prefix(root)
91                        .unwrap_or(path)
92                        .to_string_lossy()
93                        .to_string()
94                } else {
95                    path.to_string_lossy().to_string()
96                };
97                self.cleanup_file_nodes(&rel)?;
98            }
99        }
100        Ok(())
101    }
102
103    /// Index (or re-index) a single file: parse it, persist nodes/edges/embeddings,
104    /// and update the index cache.
105    ///
106    /// `project_root` is used to relativize the absolute `path` so node IDs are
107    /// portable. If `None`, the path is stored as-is (absolute).
108    fn index_single_file(
109        &self,
110        path: &Path,
111        namespace: Option<&str>,
112        project_root: Option<&Path>,
113    ) -> Result<(), CodememError> {
114        let content = std::fs::read(path)?;
115
116        let path_str = if let Some(root) = project_root {
117            path.strip_prefix(root)
118                .unwrap_or(path)
119                .to_string_lossy()
120                .to_string()
121        } else {
122            path.to_string_lossy().to_string()
123        };
124        let parser = index::CodeParser::new();
125
126        let parse_result = match parser.parse_file(&path_str, &content) {
127            Some(pr) => pr,
128            None => return Ok(()), // Unsupported file type or parse failure
129        };
130
131        // Build a minimal IndexAndResolveResult for this single file
132        let mut file_paths = HashSet::new();
133        file_paths.insert(parse_result.file_path.clone());
134
135        let mut resolver = index::ReferenceResolver::new();
136        resolver.add_symbols(&parse_result.symbols);
137        let edges = resolver.resolve_all(&parse_result.references);
138
139        let results = IndexAndResolveResult {
140            index: index::IndexResult {
141                files_scanned: 1,
142                files_parsed: 1,
143                files_skipped: 0,
144                total_symbols: parse_result.symbols.len(),
145                total_references: parse_result.references.len(),
146                total_chunks: parse_result.chunks.len(),
147                parse_results: Vec::new(),
148            },
149            symbols: parse_result.symbols,
150            references: parse_result.references,
151            chunks: parse_result.chunks,
152            file_paths,
153            edges,
154            root_path: project_root
155                .map(|p| p.to_path_buf())
156                .unwrap_or_else(|| path.to_path_buf()),
157        };
158
159        self.persist_index_results(&results, namespace)?;
160        Ok(())
161    }
162
163    // ── A3: File Deletion Cleanup ───────────────────────────────────────
164
165    /// Remove graph nodes, edges, and embeddings for a single deleted file.
166    fn cleanup_file_nodes(&self, file_path: &str) -> Result<(), CodememError> {
167        let file_node_id = format!("file:{file_path}");
168
169        // Remove all chunk nodes for this file
170        let chunk_prefix = format!("chunk:{file_path}:");
171        if let Err(e) = self.storage.delete_graph_nodes_by_prefix(&chunk_prefix) {
172            tracing::warn!("Failed to delete chunk nodes for {file_path}: {e}");
173        }
174
175        // Remove symbol nodes for this file by checking graph
176        let graph = self.lock_graph()?;
177        let sym_ids: Vec<String> = graph
178            .get_all_nodes()
179            .into_iter()
180            .filter(|n| {
181                n.id.starts_with("sym:")
182                    && n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path)
183            })
184            .map(|n| n.id.clone())
185            .collect();
186        drop(graph);
187
188        for sym_id in &sym_ids {
189            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
190                tracing::warn!("Failed to delete graph edges for {sym_id}: {e}");
191            }
192            if let Err(e) = self.storage.delete_graph_node(sym_id) {
193                tracing::warn!("Failed to delete graph node {sym_id}: {e}");
194            }
195            if let Err(e) = self.storage.delete_embedding(sym_id) {
196                tracing::warn!("Failed to delete embedding {sym_id}: {e}");
197            }
198        }
199
200        // Remove file node itself
201        if let Err(e) = self.storage.delete_graph_edges_for_node(&file_node_id) {
202            tracing::warn!("Failed to delete graph edges for {file_node_id}: {e}");
203        }
204        if let Err(e) = self.storage.delete_graph_node(&file_node_id) {
205            tracing::warn!("Failed to delete graph node {file_node_id}: {e}");
206        }
207
208        // Clean up in-memory graph
209        let mut graph = self.lock_graph()?;
210        for sym_id in &sym_ids {
211            if let Err(e) = graph.remove_node(sym_id) {
212                tracing::warn!("Failed to remove {sym_id} from in-memory graph: {e}");
213            }
214        }
215        // Remove chunk nodes from in-memory graph
216        let chunk_ids: Vec<String> = graph
217            .get_all_nodes()
218            .into_iter()
219            .filter(|n| n.id.starts_with(&format!("chunk:{file_path}:")))
220            .map(|n| n.id.clone())
221            .collect();
222        for chunk_id in &chunk_ids {
223            if let Err(e) = graph.remove_node(chunk_id) {
224                tracing::warn!("Failed to remove {chunk_id} from in-memory graph: {e}");
225            }
226        }
227        if let Err(e) = graph.remove_node(&file_node_id) {
228            tracing::warn!("Failed to remove {file_node_id} from in-memory graph: {e}");
229        }
230        drop(graph);
231
232        // Remove stale embeddings from vector index
233        let mut vec = self.lock_vector()?;
234        for sym_id in &sym_ids {
235            if let Err(e) = vec.remove(sym_id) {
236                tracing::warn!("Failed to remove {sym_id} from vector index: {e}");
237            }
238        }
239        for chunk_id in &chunk_ids {
240            if let Err(e) = vec.remove(chunk_id) {
241                tracing::warn!("Failed to remove {chunk_id} from vector index: {e}");
242            }
243        }
244        drop(vec);
245
246        self.save_index();
247        Ok(())
248    }
249
250    /// Compare files on disk vs file nodes in the graph and clean up stale entries.
251    /// Call this after indexing or on watcher delete events.
252    pub fn cleanup_deleted_files(&self, dir_path: &str) -> Result<usize, CodememError> {
253        let dir = Path::new(dir_path);
254        if !dir.is_dir() {
255            return Ok(0);
256        }
257
258        // Collect file: nodes from the graph
259        let graph = self.lock_graph()?;
260        let file_nodes: Vec<String> = graph
261            .get_all_nodes()
262            .into_iter()
263            .filter(|n| n.kind == NodeKind::File && n.label.starts_with(dir_path))
264            .map(|n| n.label.clone())
265            .collect();
266        drop(graph);
267
268        let mut cleaned = 0usize;
269        for file_path in &file_nodes {
270            if !Path::new(file_path).exists() {
271                self.cleanup_file_nodes(file_path)?;
272                cleaned += 1;
273            }
274        }
275
276        if cleaned > 0 {
277            self.lock_graph()?.recompute_centrality();
278        }
279
280        Ok(cleaned)
281    }
282
283    // ── A4: Combined Index + Enrich Pipeline ────────────────────────────
284
285    /// Combined result from `index_and_enrich`.
286    /// Index a codebase and run all enrichment passes in one call.
287    pub fn index_and_enrich(
288        &self,
289        path: &str,
290        namespace: Option<&str>,
291        git_days: u64,
292    ) -> Result<IndexEnrichResult, CodememError> {
293        // 1. Index the codebase
294        let mut indexer = Indexer::new();
295        let index_results = indexer.index_and_resolve(Path::new(path))?;
296        let persist = self.persist_index_results(&index_results, namespace)?;
297
298        // 2. Run all enrichment passes, accumulating total insights
299        let root = Path::new(path);
300        let project_root = Some(root);
301        let mut total_insights = 0usize;
302
303        macro_rules! run_enrich {
304            ($label:expr, $call:expr) => {
305                match $call {
306                    Ok(r) => total_insights += r.insights_stored,
307                    Err(e) => tracing::warn!(concat!($label, " enrichment failed: {}"), e),
308                }
309            };
310        }
311
312        run_enrich!(
313            "git_history",
314            self.enrich_git_history(path, git_days, namespace)
315        );
316        run_enrich!("security", self.enrich_security(namespace));
317        run_enrich!("performance", self.enrich_performance(10, namespace));
318        run_enrich!(
319            "complexity",
320            self.enrich_complexity(namespace, project_root)
321        );
322        run_enrich!("architecture", self.enrich_architecture(namespace));
323        run_enrich!("test_mapping", self.enrich_test_mapping(namespace));
324        run_enrich!("api_surface", self.enrich_api_surface(namespace));
325        run_enrich!("doc_coverage", self.enrich_doc_coverage(namespace));
326        run_enrich!(
327            "code_smells",
328            self.enrich_code_smells(namespace, project_root)
329        );
330        run_enrich!("hot_complex", self.enrich_hot_complex(namespace));
331        run_enrich!("blame", self.enrich_blame(path, namespace));
332        run_enrich!(
333            "security_scan",
334            self.enrich_security_scan(namespace, project_root)
335        );
336        run_enrich!("quality", self.enrich_quality_stratification(namespace));
337        // change_impact is per-file, not included in run_all
338
339        // Recompute centrality after all changes
340        self.lock_graph()?.recompute_centrality();
341
342        Ok(IndexEnrichResult {
343            files_indexed: persist.files_created,
344            symbols_stored: persist.symbols_stored,
345            chunks_stored: persist.chunks_stored,
346            edges_resolved: persist.edges_resolved,
347            symbols_embedded: persist.symbols_embedded,
348            chunks_embedded: persist.chunks_embedded,
349            total_insights,
350        })
351    }
352
353    // ── A8: Session Context Synthesis ───────────────────────────────────
354
355    /// Synthesize context for a new session: recent memories, pending analyses,
356    /// active patterns, and last session summary.
357    pub fn session_context(&self, namespace: Option<&str>) -> Result<SessionContext, CodememError> {
358        let now = chrono::Utc::now();
359        let cutoff_24h = now - chrono::Duration::hours(24);
360
361        // 1. Recent memories (last 24h)
362        let ids = match namespace {
363            Some(ns) => self.storage.list_memory_ids_for_namespace(ns)?,
364            None => self.storage.list_memory_ids()?,
365        };
366
367        let mut recent_memories = Vec::new();
368        let mut pending_analyses = Vec::new();
369
370        for id in ids.iter().rev().take(200) {
371            if let Ok(Some(m)) = self.storage.get_memory_no_touch(id) {
372                // Collect pending analyses
373                if m.tags.contains(&"pending-analysis".to_string()) {
374                    pending_analyses.push(m.clone());
375                }
376                // Collect recent memories from last 24h
377                if m.created_at >= cutoff_24h {
378                    recent_memories.push(m);
379                }
380                if recent_memories.len() >= 50 && pending_analyses.len() >= 10 {
381                    break;
382                }
383            }
384        }
385
386        // 2. Active patterns
387        let session_count = self.storage.session_count(namespace).unwrap_or(1).max(1);
388        let active_patterns = patterns::detect_patterns(
389            &*self.storage,
390            namespace,
391            2, // min_frequency
392            session_count,
393        )
394        .unwrap_or_default();
395
396        // 3. Last session summary
397        let last_session_summary = self
398            .storage
399            .list_sessions(namespace, 1)?
400            .into_iter()
401            .next()
402            .and_then(|s| s.summary);
403
404        Ok(SessionContext {
405            recent_memories,
406            pending_analyses,
407            active_patterns,
408            last_session_summary,
409        })
410    }
411}
412
413// ── Result Types ────────────────────────────────────────────────────────────
414
/// Combined result from `index_and_enrich`.
#[derive(Debug)]
pub struct IndexEnrichResult {
    /// Number of file nodes created during indexing.
    pub files_indexed: usize,
    /// Number of symbol nodes persisted.
    pub symbols_stored: usize,
    /// Number of code chunks persisted.
    pub chunks_stored: usize,
    /// Number of reference edges resolved and stored.
    pub edges_resolved: usize,
    /// Number of symbols that received embeddings.
    pub symbols_embedded: usize,
    /// Number of chunks that received embeddings.
    pub chunks_embedded: usize,
    /// Total insights stored across all enrichment passes.
    pub total_insights: usize,
}
426
427/// Session context synthesized at session start.
428#[derive(Debug)]
429pub struct SessionContext {
430    /// Memories created in the last 24 hours.
431    pub recent_memories: Vec<MemoryNode>,
432    /// Memories tagged `pending-analysis` awaiting code-mapper review.
433    pub pending_analyses: Vec<MemoryNode>,
434    /// Cross-session patterns detected with sufficient frequency.
435    pub active_patterns: Vec<DetectedPattern>,
436    /// Summary text from the most recent session (if any).
437    pub last_session_summary: Option<String>,
438}