//! codemem_engine/file_indexing.rs
//!
//! File-indexing support for `CodememEngine`: index persistence, file-watcher
//! event processing, stale-symbol and deleted-file cleanup, orphan detection,
//! the unified `analyze()` pipeline, and session-context synthesis.
1use crate::index::{self, IndexAndResolveResult, Indexer};
2use crate::patterns;
3use crate::CodememEngine;
4use codemem_core::{CodememError, DetectedPattern, GraphBackend, MemoryNode, VectorBackend};
5use std::collections::HashSet;
6use std::path::Path;
7use std::sync::atomic::Ordering;
8
9impl CodememEngine {
10    // ── Index Persistence ────────────────────────────────────────────────
11
12    /// Save the vector and BM25 indexes to disk if a db_path is configured.
13    /// Compacts the HNSW index if ghost entries exceed 20% of live entries.
14    /// Always clears the dirty flag so `flush_if_dirty()` won't double-save.
15    pub fn save_index(&self) {
16        if let Some(ref db_path) = self.db_path {
17            // Only save vector index if it has been lazily initialized.
18            if self.vector_ready() {
19                let idx_path = db_path.with_extension("idx");
20                if let Ok(mut vi) = self.lock_vector() {
21                    // Compact HNSW if ghost entries exceed threshold
22                    if vi.needs_compaction() {
23                        let ghost = vi.ghost_count();
24                        let live = vi.stats().count;
25                        tracing::info!(
26                            "HNSW ghost compaction: {ghost} ghosts / {live} live entries, rebuilding..."
27                        );
28                        if let Ok(embeddings) = self.storage.list_all_embeddings() {
29                            if let Err(e) = vi.rebuild_from_entries(&embeddings) {
30                                tracing::warn!("HNSW compaction failed: {e}");
31                            }
32                        }
33                    }
34                    if let Err(e) = vi.save(&idx_path) {
35                        tracing::warn!("Failed to save vector index: {e}");
36                    }
37                }
38            }
39
40            // Only save BM25 index if it has been lazily initialized.
41            if self.bm25_ready() {
42                let bm25_path = db_path.with_extension("bm25");
43                if let Ok(bm25) = self.lock_bm25() {
44                    if bm25.needs_save() {
45                        let data = bm25.serialize();
46                        let tmp_path = db_path.with_extension("bm25.tmp");
47                        if let Err(e) = std::fs::write(&tmp_path, &data)
48                            .and_then(|_| std::fs::rename(&tmp_path, &bm25_path))
49                        {
50                            tracing::warn!("Failed to save BM25 index: {e}");
51                        }
52                    }
53                }
54            }
55        }
56        self.dirty.store(false, Ordering::Release);
57    }
58
59    /// Reload the in-memory graph from the database.
60    pub fn reload_graph(&self) -> Result<(), CodememError> {
61        let new_graph = codemem_storage::graph::GraphEngine::from_storage(&*self.storage)?;
62        let mut graph = self.lock_graph()?;
63        *graph = new_graph;
64        graph.recompute_centrality();
65        Ok(())
66    }
67
68    // ── A2: File Watcher Event Processing ───────────────────────────────
69
70    /// Process a single file watcher event by re-indexing changed/created files
71    /// or cleaning up deleted file nodes.
72    ///
73    /// Call this from a watcher event loop:
74    /// ```ignore
75    /// while let Ok(event) = watcher.receiver().recv() {
76    ///     engine.process_watch_event(&event, namespace, Some(root));
77    /// }
78    /// ```
79    pub fn process_watch_event(
80        &self,
81        event: &crate::watch::WatchEvent,
82        namespace: Option<&str>,
83        project_root: Option<&Path>,
84    ) -> Result<(), CodememError> {
85        match event {
86            crate::watch::WatchEvent::FileChanged(path)
87            | crate::watch::WatchEvent::FileCreated(path) => {
88                self.index_single_file(path, namespace, project_root)?;
89            }
90            crate::watch::WatchEvent::FileDeleted(path) => {
91                // Relativize the deleted path so the node ID matches what was indexed.
92                let rel = if let Some(root) = project_root {
93                    path.strip_prefix(root)
94                        .unwrap_or(path)
95                        .to_string_lossy()
96                        .to_string()
97                } else {
98                    path.to_string_lossy().to_string()
99                };
100                self.cleanup_file_nodes(&rel)?;
101            }
102        }
103        Ok(())
104    }
105
    /// Index (or re-index) a single file: parse it, persist nodes/edges/embeddings,
    /// and update the index cache.
    ///
    /// `project_root` is used to relativize the absolute `path` so node IDs are
    /// portable. If `None`, the path is stored as-is (absolute).
    ///
    /// Uses SHA-256 hash dedup to skip re-indexing when content is unchanged.
    /// This prevents duplicate work when both the PostToolUse hook and the
    /// background file watcher fire for the same edit.
    ///
    /// Returns `Ok(())` without indexing when the file is unchanged or when the
    /// parser does not support/cannot parse it.
    fn index_single_file(
        &self,
        path: &Path,
        namespace: Option<&str>,
        project_root: Option<&Path>,
    ) -> Result<(), CodememError> {
        let content = std::fs::read(path)?;

        // Relativize against the project root so node IDs stay portable;
        // strip_prefix fails (and we keep the absolute path) when `path`
        // isn't under `root`.
        let path_str = if let Some(root) = project_root {
            path.strip_prefix(root)
                .unwrap_or(path)
                .to_string_lossy()
                .to_string()
        } else {
            path.to_string_lossy().to_string()
        };

        // SHA-256 dedup: skip if content unchanged since last index.
        // Uses cached ChangeDetector to avoid reloading all hashes from storage per file.
        // NOTE(review): the detector is lazily loaded for the FIRST namespace
        // seen and then cached; if callers mix namespaces across calls the
        // cached hashes may belong to a different namespace — verify.
        let hash = {
            let mut cd_guard = self
                .change_detector
                .lock()
                .map_err(|_| CodememError::LockPoisoned("change_detector".into()))?;
            let ns = namespace.unwrap_or("");
            let cd = cd_guard.get_or_insert_with(|| {
                let mut cd = index::incremental::ChangeDetector::new();
                cd.load_from_storage(&*self.storage, ns);
                cd
            });
            let (changed, hash) = cd.check_changed(&path_str, &content);
            if !changed {
                tracing::debug!("Skipping unchanged file: {path_str}");
                return Ok(());
            }
            hash
        };
        // (Lock released here — parsing/persisting below runs unguarded.)

        let parser = index::CodeParser::new();

        let parse_result = match parser.parse_file(&path_str, &content) {
            Some(pr) => pr,
            None => return Ok(()), // Unsupported file type or parse failure
        };

        // Build a minimal IndexAndResolveResult for this single file
        let mut file_paths = HashSet::new();
        file_paths.insert(parse_result.file_path.clone());

        // Resolve references only against this file's own symbols.
        let mut resolver = index::ReferenceResolver::new();
        resolver.add_symbols(&parse_result.symbols);
        let resolve_result = resolver.resolve_all_with_unresolved(&parse_result.references);

        let results = IndexAndResolveResult {
            index: index::IndexResult {
                files_scanned: 1,
                files_parsed: 1,
                files_skipped: 0,
                total_symbols: parse_result.symbols.len(),
                total_references: parse_result.references.len(),
                total_chunks: parse_result.chunks.len(),
                parse_results: Vec::new(),
            },
            symbols: parse_result.symbols,
            references: parse_result.references,
            chunks: parse_result.chunks,
            file_paths,
            edges: resolve_result.edges,
            unresolved: resolve_result.unresolved,
            root_path: project_root
                .map(|p| p.to_path_buf())
                .unwrap_or_else(|| path.to_path_buf()),
            scip_build: None,
        };

        self.persist_index_results(&results, namespace)?;

        // Record new hash in the cached detector after successful persist,
        // so a failed persist leaves the old hash and the file is retried.
        // Errors here are non-fatal: worst case the file is re-indexed once.
        if let Ok(mut cd_guard) = self.change_detector.lock() {
            if let Some(cd) = cd_guard.as_mut() {
                cd.record_hash(&path_str, hash);
                if let Err(e) = cd.save_to_storage(&*self.storage, namespace.unwrap_or("")) {
                    tracing::warn!("Failed to save file hash for {path_str}: {e}");
                }
            }
        }

        Ok(())
    }
204
205    // ── A2b: Symbol-Level Diff on Re-index ────────────────────────────
206
207    /// Remove symbols that existed for a file before re-indexing but are no
208    /// longer present in the new parse results. Returns count of cleaned symbols.
209    ///
210    /// For code→code edges (CALLS, IMPORTS, etc.), performs a hard delete.
211    /// For memory→symbol edges, creates a live redirected edge pointing to the
212    /// parent file node, preserving the memory→file connection so recall can
213    /// still traverse it. The original edge is then deleted along with the
214    /// stale symbol node.
215    ///
216    /// `old_symbol_ids` should be the set of symbol IDs that existed for this
217    /// file before re-indexing (collected from the in-memory graph by the caller
218    /// in a single pass across all files).
219    pub fn cleanup_stale_symbols(
220        &self,
221        file_path: &str,
222        old_symbol_ids: &HashSet<String>,
223        new_symbol_ids: &HashSet<String>,
224    ) -> Result<usize, CodememError> {
225        // Compute stale set: symbols that existed before but are not in the new parse
226        let stale_ids: Vec<&String> = old_symbol_ids
227            .iter()
228            .filter(|id| !new_symbol_ids.contains(*id))
229            .collect();
230
231        if stale_ids.is_empty() {
232            return Ok(0);
233        }
234
235        let count = stale_ids.len();
236        tracing::info!(
237            "Cleaning up {count} stale symbols for {file_path}: {:?}",
238            stale_ids
239        );
240
241        let file_node_id = format!("file:{file_path}");
242        let mut redirected_pairs: std::collections::HashSet<(String, String)> =
243            std::collections::HashSet::new();
244        let mut redirected_edges: Vec<codemem_core::Edge> = Vec::new();
245        for sym_id in &stale_ids {
246            // Before deleting the symbol, redirect memory→symbol edges to the
247            // parent file node, preserving historical context.
248            // Memory node IDs are UUIDs (no known prefix like sym:/file:/chunk:).
249            let edges = self.storage.get_edges_for_node(sym_id.as_str())?;
250            for edge in &edges {
251                let other = if edge.src.as_str() == sym_id.as_str() {
252                    &edge.dst
253                } else {
254                    &edge.src
255                };
256                let is_code_node = other.starts_with("sym:")
257                    || other.starts_with("file:")
258                    || other.starts_with("chunk:")
259                    || other.starts_with("pkg:");
260                if !is_code_node {
261                    // Skip if we already redirected this memory→file pair
262                    let pair = (other.to_string(), file_node_id.clone());
263                    if !redirected_pairs.insert(pair) {
264                        continue;
265                    }
266                    let mut redirected = edge.clone();
267                    if redirected.src.as_str() == sym_id.as_str() {
268                        redirected.src = file_node_id.clone();
269                    } else {
270                        redirected.dst = file_node_id.clone();
271                    }
272                    // Don't set valid_to — the redirect should be a live,
273                    // queryable edge so recall can still traverse memory→file.
274                    redirected.id = format!("{}-redirected", edge.id);
275                    if let Err(e) = self.storage.insert_graph_edge(&redirected) {
276                        tracing::warn!("Failed to redirect memory edge {}: {e}", edge.id);
277                    }
278                    redirected_edges.push(redirected);
279                }
280            }
281
282            // Delete all edges and the node itself
283            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
284                tracing::warn!("Failed to delete edges for stale symbol {sym_id}: {e}");
285            }
286            if let Err(e) = self.storage.delete_graph_node(sym_id) {
287                tracing::warn!("Failed to delete stale symbol node {sym_id}: {e}");
288            }
289            if let Err(e) = self.storage.delete_embedding(sym_id) {
290                tracing::warn!("Failed to delete embedding for stale symbol {sym_id}: {e}");
291            }
292        }
293
294        // Clean up in-memory graph and vector index
295        {
296            let mut graph = self.lock_graph()?;
297            for sym_id in &stale_ids {
298                if let Err(e) = graph.remove_node(sym_id.as_str()) {
299                    tracing::warn!("Failed to remove stale {sym_id} from in-memory graph: {e}");
300                }
301            }
302            // Add redirected memory→file edges so they're visible to
303            // in-memory traversal (BFS, PageRank, recall) during this session.
304            for edge in redirected_edges {
305                let _ = graph.add_edge(edge);
306            }
307        }
308        {
309            let mut vec = self.lock_vector()?;
310            for sym_id in &stale_ids {
311                if let Err(e) = vec.remove(sym_id.as_str()) {
312                    tracing::warn!("Failed to remove stale {sym_id} from vector index: {e}");
313                }
314            }
315        }
316
317        Ok(count)
318    }
319
320    // ── A3: File Deletion Cleanup ───────────────────────────────────────
321
322    /// Remove graph nodes, edges, and embeddings for a single deleted file.
323    fn cleanup_file_nodes(&self, file_path: &str) -> Result<(), CodememError> {
324        let file_node_id = format!("file:{file_path}");
325
326        // Remove all chunk nodes for this file
327        let chunk_prefix = format!("chunk:{file_path}:");
328        if let Err(e) = self.storage.delete_graph_nodes_by_prefix(&chunk_prefix) {
329            tracing::warn!("Failed to delete chunk nodes for {file_path}: {e}");
330        }
331
332        // Remove symbol nodes for this file by checking graph
333        let graph = self.lock_graph()?;
334        let sym_ids: Vec<String> = graph
335            .get_all_nodes()
336            .into_iter()
337            .filter(|n| {
338                n.id.starts_with("sym:")
339                    && n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path)
340            })
341            .map(|n| n.id.clone())
342            .collect();
343        drop(graph);
344
345        for sym_id in &sym_ids {
346            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
347                tracing::warn!("Failed to delete graph edges for {sym_id}: {e}");
348            }
349            if let Err(e) = self.storage.delete_graph_node(sym_id) {
350                tracing::warn!("Failed to delete graph node {sym_id}: {e}");
351            }
352            if let Err(e) = self.storage.delete_embedding(sym_id) {
353                tracing::warn!("Failed to delete embedding {sym_id}: {e}");
354            }
355        }
356
357        // Remove file node itself
358        if let Err(e) = self.storage.delete_graph_edges_for_node(&file_node_id) {
359            tracing::warn!("Failed to delete graph edges for {file_node_id}: {e}");
360        }
361        if let Err(e) = self.storage.delete_graph_node(&file_node_id) {
362            tracing::warn!("Failed to delete graph node {file_node_id}: {e}");
363        }
364
365        // Clean up in-memory graph
366        let mut graph = self.lock_graph()?;
367        for sym_id in &sym_ids {
368            if let Err(e) = graph.remove_node(sym_id) {
369                tracing::warn!("Failed to remove {sym_id} from in-memory graph: {e}");
370            }
371        }
372        // Remove chunk nodes from in-memory graph
373        let chunk_ids: Vec<String> = graph
374            .get_all_nodes()
375            .into_iter()
376            .filter(|n| n.id.starts_with(&format!("chunk:{file_path}:")))
377            .map(|n| n.id.clone())
378            .collect();
379        for chunk_id in &chunk_ids {
380            if let Err(e) = graph.remove_node(chunk_id) {
381                tracing::warn!("Failed to remove {chunk_id} from in-memory graph: {e}");
382            }
383        }
384        if let Err(e) = graph.remove_node(&file_node_id) {
385            tracing::warn!("Failed to remove {file_node_id} from in-memory graph: {e}");
386        }
387        drop(graph);
388
389        // Remove stale embeddings from vector index
390        let mut vec = self.lock_vector()?;
391        for sym_id in &sym_ids {
392            if let Err(e) = vec.remove(sym_id) {
393                tracing::warn!("Failed to remove {sym_id} from vector index: {e}");
394            }
395        }
396        for chunk_id in &chunk_ids {
397            if let Err(e) = vec.remove(chunk_id) {
398                tracing::warn!("Failed to remove {chunk_id} from vector index: {e}");
399            }
400        }
401        drop(vec);
402
403        self.save_index();
404        Ok(())
405    }
406
407    // ── A3b: Orphan Detection ─────────────────────────────────────────
408
409    /// Scan for orphaned symbol/chunk nodes whose files no longer exist on disk.
410    /// Also cleans up dangling edges (src or dst node doesn't exist).
411    /// Returns `(symbols_cleaned, edges_cleaned)`.
412    ///
413    /// When `project_root` is `None`, file-existence checks are skipped
414    /// (only dangling edge cleanup runs) to avoid CWD-dependent path
415    /// resolution that could cause mass deletion.
416    pub fn detect_orphans(
417        &self,
418        project_root: Option<&Path>,
419    ) -> Result<(usize, usize), CodememError> {
420        // Use storage for both nodes and edges to avoid in-memory/storage sync races.
421        let all_nodes = self.storage.all_graph_nodes()?;
422        let node_ids: HashSet<String> = all_nodes.iter().map(|n| n.id.clone()).collect();
423
424        let mut orphan_sym_ids: Vec<String> = Vec::new();
425
426        // Only check file existence when we have a known project root.
427        // Without it, relative paths resolve against CWD which may be wrong.
428        if let Some(root) = project_root {
429            for node in &all_nodes {
430                if !node.id.starts_with("sym:") && !node.id.starts_with("chunk:") {
431                    continue;
432                }
433                let file_path = match node.payload.get("file_path").and_then(|v| v.as_str()) {
434                    Some(fp) => fp,
435                    None => continue,
436                };
437                let abs_path = root.join(file_path);
438                if !abs_path.exists() {
439                    orphan_sym_ids.push(node.id.clone());
440                }
441            }
442        }
443
444        // Also find dangling edges (src or dst doesn't exist in graph)
445        let all_edges = self.storage.all_graph_edges()?;
446        let mut dangling_edge_ids: Vec<String> = Vec::new();
447        for edge in &all_edges {
448            if !node_ids.contains(&edge.src) || !node_ids.contains(&edge.dst) {
449                dangling_edge_ids.push(edge.id.clone());
450            }
451        }
452
453        let symbols_cleaned = orphan_sym_ids.len();
454
455        // Clean up orphan nodes
456        for sym_id in &orphan_sym_ids {
457            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
458                tracing::warn!("Orphan cleanup: failed to delete edges for {sym_id}: {e}");
459            }
460            if let Err(e) = self.storage.delete_graph_node(sym_id) {
461                tracing::warn!("Orphan cleanup: failed to delete node {sym_id}: {e}");
462            }
463            if let Err(e) = self.storage.delete_embedding(sym_id) {
464                tracing::warn!("Orphan cleanup: failed to delete embedding {sym_id}: {e}");
465            }
466        }
467
468        // Clean up orphan nodes from in-memory graph + vector
469        if !orphan_sym_ids.is_empty() {
470            if let Ok(mut graph) = self.lock_graph() {
471                for sym_id in &orphan_sym_ids {
472                    let _ = graph.remove_node(sym_id);
473                }
474            }
475            if let Ok(mut vec) = self.lock_vector() {
476                for sym_id in &orphan_sym_ids {
477                    let _ = vec.remove(sym_id);
478                }
479            }
480        }
481
482        // Delete dangling edges that weren't already removed by node cleanup
483        let mut edges_cleaned = 0usize;
484        for edge_id in &dangling_edge_ids {
485            match self.storage.delete_graph_edge(edge_id) {
486                Ok(true) => edges_cleaned += 1,
487                Ok(false) => {} // Already deleted by node cleanup above
488                Err(e) => {
489                    tracing::warn!("Orphan cleanup: failed to delete dangling edge {edge_id}: {e}");
490                }
491            }
492        }
493
494        if symbols_cleaned > 0 || edges_cleaned > 0 {
495            tracing::info!(
496                "Orphan scan: cleaned {symbols_cleaned} symbol/chunk nodes, {edges_cleaned} dangling edges"
497            );
498        }
499
500        Ok((symbols_cleaned, edges_cleaned))
501    }
502
503    // ── A4: Unified Analyze Pipeline ────────────────────────────────────
504
    /// Full analysis pipeline: index → persist → enrich → recompute centrality.
    ///
    /// This is the single entry point for all callers (CLI, MCP, API).
    /// Supports incremental indexing via `ChangeDetector`, progress callbacks,
    /// and returns comprehensive results.
    pub fn analyze(&self, options: AnalyzeOptions<'_>) -> Result<AnalyzeResult, CodememError> {
        let root = options.path;

        // Eagerly initialize embeddings/vector/BM25 for the full analysis pipeline.
        // This triggers lazy init so that embed_and_persist() finds them ready.
        // Skip if embeddings are not needed.
        // (Locks are taken purely for the init side effect and released at once.)
        if !options.skip_embed {
            drop(self.lock_embeddings());
            drop(self.lock_vector());
            drop(self.lock_bm25());
        }

        // 0. SCIP phase: run indexers, parse results, build graph data.
        // Runs BEFORE ast-grep so we know which files SCIP covered.
        // A SCIP failure is non-fatal: we log and fall back to ast-grep only.
        let (scip_covered, scip_build) = if !options.skip_scip && self.config.scip.enabled {
            match self.run_scip_phase(root, options.namespace) {
                Ok((covered, build)) => (Some(covered), Some(build)),
                Err(e) => {
                    tracing::warn!("SCIP phase failed, falling back to ast-grep only: {e}");
                    (None, None)
                }
            }
        } else {
            (None, None)
        };

        // Capture SCIP stats now — `scip_build` is moved into the indexer below.
        let scip_nodes_created = scip_build.as_ref().map_or(0, |b| b.nodes.len());
        let scip_edges_created = scip_build.as_ref().map_or(0, |b| b.edges.len());
        let scip_files_covered = scip_covered.as_ref().map_or(0, |s| s.len());

        // 1. Index (ast-grep skips symbol extraction for SCIP-covered files)
        // When force=true, ignore the change detector so all files are re-processed.
        let mut indexer = match options.change_detector {
            Some(cd) if !options.force => Indexer::with_change_detector(cd),
            _ => Indexer::new(),
        };
        let resolved =
            indexer.index_and_resolve_with_scip(root, scip_covered.as_ref(), scip_build)?;

        // 2. Persist (with or without progress callback)
        let persist = if options.skip_embed {
            self.persist_graph_only(&resolved, Some(options.namespace))?
        } else if let Some(ref on_progress) = options.progress {
            self.persist_index_results_with_progress(
                &resolved,
                Some(options.namespace),
                |done, total| {
                    on_progress(AnalyzeProgress::Embedding { done, total });
                },
            )?
        } else {
            self.persist_index_results(&resolved, Some(options.namespace))?
        };

        // Cache results for structural queries.
        // This partially moves `resolved` (symbols/chunks); only the
        // `resolved.index` stats are read afterwards.
        {
            if let Ok(mut cache) = self.lock_index_cache() {
                *cache = Some(crate::IndexCache {
                    symbols: resolved.symbols,
                    chunks: resolved.chunks,
                    root_path: root.to_string_lossy().to_string(),
                });
            }
        }

        // 3. Enrich (skip if requested) — skipping yields an empty result object.
        let enrichment = if options.skip_enrich {
            crate::enrichment::EnrichmentPipelineResult {
                results: serde_json::json!({}),
                total_insights: 0,
            }
        } else {
            let path_str = root.to_str().unwrap_or("");
            self.run_enrichments(
                path_str,
                &[],
                options.git_days,
                Some(options.namespace),
                None,
            )
        };

        // 4. Recompute centrality
        self.lock_graph()?.recompute_centrality();

        // 5. Compute summary stats
        let top_nodes = self.find_important_nodes(10, 0.85).unwrap_or_default();
        let community_count = self.louvain_communities(1.0).map(|c| c.len()).unwrap_or(0);

        // 6. Save indexes
        self.save_index();

        // Save incremental state. Only reached when persist succeeded (the
        // `?` above returns early on failure), so new file hashes are never
        // recorded for a run that didn't persist.
        indexer
            .change_detector()
            .save_to_storage(self.storage(), options.namespace)?;

        Ok(AnalyzeResult {
            files_parsed: resolved.index.files_parsed,
            files_skipped: resolved.index.files_skipped,
            symbols_found: resolved.index.total_symbols,
            edges_resolved: persist.edges_resolved,
            chunks_stored: persist.chunks_stored,
            symbols_embedded: persist.symbols_embedded,
            chunks_embedded: persist.chunks_embedded,
            chunks_pruned: persist.chunks_pruned,
            symbols_pruned: persist.symbols_pruned,
            enrichment_results: enrichment.results,
            total_insights: enrichment.total_insights,
            top_nodes,
            community_count,
            scip_nodes_created,
            scip_edges_created,
            scip_files_covered,
        })
    }
626
627    /// Run the SCIP phase: orchestrate indexers, parse results, build graph data.
628    fn run_scip_phase(
629        &self,
630        root: &Path,
631        namespace: &str,
632    ) -> Result<(HashSet<String>, index::scip::graph_builder::ScipBuildResult), CodememError> {
633        let orchestrator =
634            index::scip::orchestrate::ScipOrchestrator::new(self.config.scip.clone());
635        let orch_result = orchestrator.run(root, namespace)?;
636
637        if orch_result.scip_result.covered_files.is_empty() {
638            return Ok((
639                HashSet::new(),
640                index::scip::graph_builder::ScipBuildResult::default(),
641            ));
642        }
643
644        for (lang, err) in &orch_result.failed_languages {
645            tracing::warn!("SCIP indexer for {:?} failed: {}", lang, err);
646        }
647        for lang in &orch_result.indexed_languages {
648            tracing::info!("SCIP indexed {:?} successfully", lang);
649        }
650
651        let build = index::scip::graph_builder::build_graph(
652            &orch_result.scip_result,
653            Some(namespace),
654            &self.config.scip,
655        );
656        let covered: HashSet<String> = build.files_covered.clone();
657
658        tracing::info!(
659            "SCIP phase: {} nodes, {} edges, {} ext nodes, {} files covered, {} doc memories",
660            build.nodes.len(),
661            build.edges.len(),
662            build.ext_nodes_created,
663            covered.len(),
664            build.doc_memories_created,
665        );
666
667        Ok((covered, build))
668    }
669
670    // ── A8: Session Context Synthesis ───────────────────────────────────
671
672    /// Synthesize context for a new session: recent memories, pending analyses,
673    /// active patterns, and last session summary.
674    pub fn session_context(&self, namespace: Option<&str>) -> Result<SessionContext, CodememError> {
675        let now = chrono::Utc::now();
676        let cutoff_24h = now - chrono::Duration::hours(24);
677
678        // 1. Recent memories (last 24h)
679        let ids = match namespace {
680            Some(ns) => self.storage.list_memory_ids_for_namespace(ns)?,
681            None => self.storage.list_memory_ids()?,
682        };
683
684        let mut recent_memories = Vec::new();
685        let mut pending_analyses = Vec::new();
686
687        for id in ids.iter().rev().take(200) {
688            if let Ok(Some(m)) = self.storage.get_memory_no_touch(id) {
689                // Collect pending analyses
690                if m.tags.contains(&"pending-analysis".to_string()) {
691                    pending_analyses.push(m.clone());
692                }
693                // Collect recent memories from last 24h
694                if m.created_at >= cutoff_24h {
695                    recent_memories.push(m);
696                }
697                if recent_memories.len() >= 50 && pending_analyses.len() >= 10 {
698                    break;
699                }
700            }
701        }
702
703        // 2. Active patterns
704        let session_count = self.storage.session_count(namespace).unwrap_or(1).max(1);
705        let active_patterns = patterns::detect_patterns(
706            &*self.storage,
707            namespace,
708            2, // min_frequency
709            session_count,
710        )
711        .unwrap_or_default();
712
713        // 3. Last session summary
714        let last_session_summary = self
715            .storage
716            .list_sessions(namespace, 1)?
717            .into_iter()
718            .next()
719            .and_then(|s| s.summary);
720
721        Ok(SessionContext {
722            recent_memories,
723            pending_analyses,
724            active_patterns,
725            last_session_summary,
726        })
727    }
728}
729
730// ── Result Types ────────────────────────────────────────────────────────────
731
/// Options for the unified `analyze()` pipeline.
pub struct AnalyzeOptions<'a> {
    /// Root directory to index.
    pub path: &'a Path,
    /// Namespace under which results are persisted.
    pub namespace: &'a str,
    /// Look-back window in days, forwarded to the enrichment phase.
    pub git_days: u64,
    /// Optional pre-loaded change detector for incremental indexing;
    /// ignored when `force` is set.
    pub change_detector: Option<index::incremental::ChangeDetector>,
    /// Callback invoked with progress events (currently only embedding progress).
    pub progress: Option<Box<dyn Fn(AnalyzeProgress) + Send + 'a>>,
    /// Skip SCIP indexing — use ast-grep only (faster, less accurate).
    pub skip_scip: bool,
    /// Skip embedding phase (graph + chunks stored but not vectorized).
    pub skip_embed: bool,
    /// Skip enrichment phase (no git-history/complexity/etc. analysis).
    pub skip_enrich: bool,
    /// Force re-index even when file SHAs haven't changed.
    pub force: bool,
}
748
/// Progress events emitted during analysis.
#[derive(Debug, Clone)]
pub enum AnalyzeProgress {
    /// Embedding phase progress: `done` of `total` items embedded so far.
    Embedding { done: usize, total: usize },
}
754
/// Result of the unified `analyze()` pipeline.
#[derive(Debug)]
pub struct AnalyzeResult {
    /// Files parsed in this run.
    pub files_parsed: usize,
    /// Files skipped — presumably unchanged since the last run; TODO confirm criteria.
    pub files_skipped: usize,
    /// Symbols discovered across parsed files.
    pub symbols_found: usize,
    /// Symbol-reference edges successfully resolved.
    pub edges_resolved: usize,
    /// Code chunks persisted to storage.
    pub chunks_stored: usize,
    /// Symbols that received vector embeddings.
    pub symbols_embedded: usize,
    /// Chunks that received vector embeddings.
    pub chunks_embedded: usize,
    /// Stale chunks removed during this run.
    pub chunks_pruned: usize,
    /// Stale symbols removed during this run.
    pub symbols_pruned: usize,
    /// Raw JSON output of the enrichment phase (shape depends on which enrichers ran).
    pub enrichment_results: serde_json::Value,
    /// Total insights produced by enrichment.
    pub total_insights: usize,
    /// Highest-ranked graph nodes (see [`crate::graph_ops::RankedNode`]).
    pub top_nodes: Vec<crate::graph_ops::RankedNode>,
    /// Number of graph communities detected.
    pub community_count: usize,
    /// SCIP nodes created (sym: + ext: nodes).
    pub scip_nodes_created: usize,
    /// SCIP edges created (CALLS, IMPORTS, READS, WRITES, IMPLEMENTS, etc.).
    pub scip_edges_created: usize,
    /// Files covered by SCIP indexers (ast-grep skipped symbol extraction for these).
    pub scip_files_covered: usize,
}
778
/// Session context synthesized at session start.
///
/// Bundles recent activity, pending work, detected patterns, and the previous
/// session's summary into a single snapshot.
#[derive(Debug)]
pub struct SessionContext {
    /// Memories created in the last 24 hours (filtered by `created_at`).
    pub recent_memories: Vec<MemoryNode>,
    /// Memories tagged `pending-analysis` awaiting code-mapper review.
    pub pending_analyses: Vec<MemoryNode>,
    /// Cross-session patterns detected with sufficient frequency.
    pub active_patterns: Vec<DetectedPattern>,
    /// Summary text from the most recent session (if any).
    pub last_session_summary: Option<String>,
}
791
#[cfg(test)]
mod tests {
    use super::*;
    use codemem_core::{Edge, GraphBackend, GraphNode, NodeKind, RelationshipType};
    use std::collections::{HashMap, HashSet};

    /// Build an engine backed by a throwaway SQLite file in a temp directory.
    fn test_engine() -> CodememEngine {
        let tmp = tempfile::tempdir().unwrap();
        let db = tmp.path().join("test.db");
        // Leak the TempDir so the directory outlives the engine; tests are
        // short-lived processes, so the leak is harmless.
        let _ = Box::leak(Box::new(tmp));
        CodememEngine::from_db_path(&db).unwrap()
    }

    /// Construct a `GraphNode`, optionally carrying a `file_path` payload entry.
    fn mk_node(id: &str, kind: NodeKind, file_path: Option<&str>) -> GraphNode {
        let payload: HashMap<String, serde_json::Value> = file_path
            .into_iter()
            .map(|fp| {
                (
                    "file_path".to_string(),
                    serde_json::Value::String(fp.to_string()),
                )
            })
            .collect();
        GraphNode {
            id: id.to_string(),
            kind,
            label: id.to_string(),
            payload,
            centrality: 0.0,
            memory_id: None,
            namespace: None,
        }
    }

    /// Construct a unit-weight edge with a deterministic id.
    fn mk_edge(src: &str, dst: &str, rel: RelationshipType) -> Edge {
        Edge {
            id: format!("{rel}:{src}->{dst}"),
            src: src.to_string(),
            dst: dst.to_string(),
            relationship: rel,
            weight: 1.0,
            properties: HashMap::new(),
            created_at: chrono::Utc::now(),
            valid_from: None,
            valid_to: None,
        }
    }

    // ── cleanup_stale_symbols tests ──────────────────────────────────────

    #[test]
    fn cleanup_stale_symbols_deletes_stale_nodes() {
        let engine = test_engine();

        // One file containing two symbols; `sym:a::stale` vanishes from the
        // new snapshot and must be removed, `sym:a::keep` survives.
        let keep = mk_node("sym:a::keep", NodeKind::Function, Some("src/a.rs"));
        let stale = mk_node("sym:a::stale", NodeKind::Function, Some("src/a.rs"));

        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(mk_node("file:src/a.rs", NodeKind::File, None))
                .unwrap();
            g.add_node(keep.clone()).unwrap();
            g.add_node(stale.clone()).unwrap();
            for sym in ["sym:a::keep", "sym:a::stale"] {
                g.add_edge(mk_edge("file:src/a.rs", sym, RelationshipType::Contains))
                    .unwrap();
            }
        }
        // Mirror the topology into storage so cleanup can find the edges.
        let _ = engine
            .storage
            .insert_graph_node(&mk_node("file:src/a.rs", NodeKind::File, None));
        let _ = engine.storage.insert_graph_node(&keep);
        let _ = engine.storage.insert_graph_node(&stale);
        for sym in ["sym:a::keep", "sym:a::stale"] {
            let _ = engine.storage.insert_graph_edge(&mk_edge(
                "file:src/a.rs",
                sym,
                RelationshipType::Contains,
            ));
        }

        let old_ids: HashSet<String> =
            HashSet::from(["sym:a::keep".to_string(), "sym:a::stale".to_string()]);
        let new_ids: HashSet<String> = HashSet::from(["sym:a::keep".to_string()]);

        let cleaned = engine
            .cleanup_stale_symbols("src/a.rs", &old_ids, &new_ids)
            .unwrap();
        assert_eq!(cleaned, 1);

        // Only the stale node may have been dropped from the in-memory graph.
        let g = engine.lock_graph().unwrap();
        assert!(g.get_node("sym:a::stale").unwrap().is_none());
        assert!(g.get_node("sym:a::keep").unwrap().is_some());
    }

    #[test]
    fn cleanup_stale_symbols_redirects_memory_edges_to_graph() {
        let engine = test_engine();

        let file = mk_node("file:src/a.rs", NodeKind::File, None);
        let stale = mk_node("sym:a::old_fn", NodeKind::Function, Some("src/a.rs"));
        let mem = mk_node("mem-uuid-123", NodeKind::Memory, None);

        {
            let mut g = engine.lock_graph().unwrap();
            for n in [&file, &stale, &mem] {
                g.add_node(n.clone()).unwrap();
            }
            g.add_edge(mk_edge(
                "file:src/a.rs",
                "sym:a::old_fn",
                RelationshipType::Contains,
            ))
            .unwrap();
            g.add_edge(mk_edge(
                "mem-uuid-123",
                "sym:a::old_fn",
                RelationshipType::RelatesTo,
            ))
            .unwrap();
        }
        let _ = engine.storage.insert_graph_node(&file);
        let _ = engine.storage.insert_graph_node(&stale);
        let _ = engine.storage.insert_graph_node(&mem);
        let _ = engine.storage.insert_graph_edge(&mk_edge(
            "file:src/a.rs",
            "sym:a::old_fn",
            RelationshipType::Contains,
        ));
        let _ = engine.storage.insert_graph_edge(&mk_edge(
            "mem-uuid-123",
            "sym:a::old_fn",
            RelationshipType::RelatesTo,
        ));

        let old_ids: HashSet<String> = HashSet::from(["sym:a::old_fn".to_string()]);
        let new_ids: HashSet<String> = HashSet::new();

        engine
            .cleanup_stale_symbols("src/a.rs", &old_ids, &new_ids)
            .unwrap();

        // Deleting the symbol must reroute the memory link to the file node,
        // with the redirect marked in the edge id.
        let g = engine.lock_graph().unwrap();
        let has_redirect = g.get_edges("file:src/a.rs").unwrap().iter().any(|e| {
            (e.src == "mem-uuid-123" || e.dst == "mem-uuid-123") && e.id.contains("-redirected")
        });
        assert!(
            has_redirect,
            "redirected memory→file edge should be in the in-memory graph"
        );
    }

    #[test]
    fn cleanup_stale_symbols_deduplicates_redirects() {
        let engine = test_engine();

        let file = mk_node("file:src/a.rs", NodeKind::File, None);
        let fn1 = mk_node("sym:a::fn1", NodeKind::Function, Some("src/a.rs"));
        let fn2 = mk_node("sym:a::fn2", NodeKind::Function, Some("src/a.rs"));
        let mem = mk_node("mem-uuid-456", NodeKind::Memory, None);

        // Same memory linked to two symbols in the same file: both go stale,
        // but only a single redirect edge to the file should result.
        for n in [&file, &fn1, &fn2, &mem] {
            let _ = engine.storage.insert_graph_node(n);
        }
        for sym in ["sym:a::fn1", "sym:a::fn2"] {
            let _ = engine.storage.insert_graph_edge(&mk_edge(
                "mem-uuid-456",
                sym,
                RelationshipType::RelatesTo,
            ));
        }

        {
            let mut g = engine.lock_graph().unwrap();
            for n in [file, fn1, fn2, mem] {
                g.add_node(n).unwrap();
            }
        }

        let old_ids: HashSet<String> =
            HashSet::from(["sym:a::fn1".to_string(), "sym:a::fn2".to_string()]);
        let new_ids: HashSet<String> = HashSet::new();

        engine
            .cleanup_stale_symbols("src/a.rs", &old_ids, &new_ids)
            .unwrap();

        // Exactly one redirect edge must exist, not one per stale symbol.
        let g = engine.lock_graph().unwrap();
        let redirect_count = g
            .get_edges("file:src/a.rs")
            .unwrap()
            .iter()
            .filter(|e| e.id.contains("-redirected"))
            .count();
        assert_eq!(
            redirect_count, 1,
            "should have exactly 1 redirected edge, got {redirect_count}"
        );
    }

    // ── detect_orphans tests ─────────────────────────────────────────────

    #[test]
    fn detect_orphans_skips_file_check_when_no_root() {
        let engine = test_engine();

        // A symbol pointing at a path that cannot possibly exist.
        let sym = mk_node(
            "sym:nonexistent::fn",
            NodeKind::Function,
            Some("does/not/exist.rs"),
        );
        let _ = engine.storage.insert_graph_node(&sym);
        engine.lock_graph().unwrap().add_node(sym).unwrap();

        // Without a project root there is no file-existence check, so nothing
        // may be deleted on that basis.
        let (symbols_cleaned, _) = engine.detect_orphans(None).unwrap();
        assert_eq!(
            symbols_cleaned, 0,
            "detect_orphans(None) should not delete nodes based on file existence"
        );
    }

    #[test]
    fn detect_orphans_removes_missing_files_with_root() {
        let root = tempfile::tempdir().unwrap();
        let engine = CodememEngine::from_db_path(&root.path().join("test.db")).unwrap();

        // The referenced file never exists under the project root → orphan.
        let sym = mk_node(
            "sym:missing::fn",
            NodeKind::Function,
            Some("src/missing.rs"),
        );
        let _ = engine.storage.insert_graph_node(&sym);
        engine.lock_graph().unwrap().add_node(sym).unwrap();

        let (symbols_cleaned, _) = engine.detect_orphans(Some(root.path())).unwrap();
        assert_eq!(symbols_cleaned, 1);
    }

    #[test]
    fn detect_orphans_keeps_existing_files() {
        let root = tempfile::tempdir().unwrap();
        let engine = CodememEngine::from_db_path(&root.path().join("test.db")).unwrap();

        // Materialize src/exists.rs so the orphan pass sees a live file.
        let src_dir = root.path().join("src");
        std::fs::create_dir_all(&src_dir).unwrap();
        std::fs::write(src_dir.join("exists.rs"), "fn main() {}").unwrap();

        let sym = mk_node(
            "sym:exists::main",
            NodeKind::Function,
            Some("src/exists.rs"),
        );
        let _ = engine.storage.insert_graph_node(&sym);
        engine.lock_graph().unwrap().add_node(sym).unwrap();

        let (symbols_cleaned, _) = engine.detect_orphans(Some(root.path())).unwrap();
        assert_eq!(symbols_cleaned, 0);
    }

    // Note: dangling edge cleanup in detect_orphans is a defensive no-op
    // because graph_edges has ON DELETE CASCADE foreign keys on src/dst.
    // Deleting a node automatically cascades to its edges in SQLite.
}