// codemem_engine/file_indexing.rs

1use crate::index::{self, IndexAndResolveResult, Indexer};
2use crate::patterns;
3use crate::CodememEngine;
4use codemem_core::{CodememError, DetectedPattern, MemoryNode};
5use std::collections::HashSet;
6use std::path::Path;
7use std::sync::atomic::Ordering;
8
9/// Check if a file is a spec file (OpenAPI/AsyncAPI) by name or content.
10///
11/// First checks well-known filenames. For other YAML/JSON files, peeks at
12/// the first bytes to look for `"openapi"`, `"swagger"`, or `"asyncapi"` keys.
/// Check if a file is a spec file (OpenAPI/AsyncAPI) by name or content.
///
/// First checks well-known filenames. For other YAML/JSON files, peeks at
/// the first bytes to look for `"openapi"`, `"swagger"`, or `"asyncapi"` keys.
fn is_spec_file_with_content(path: &str, content: &[u8]) -> bool {
    let filename = path.rsplit('/').next().unwrap_or(path);
    let filename_lower = filename.to_lowercase();

    // Fast path: well-known names
    if matches!(
        filename_lower.as_str(),
        "openapi.yaml"
            | "openapi.yml"
            | "openapi.json"
            | "swagger.yaml"
            | "swagger.yml"
            | "swagger.json"
            | "asyncapi.yaml"
            | "asyncapi.yml"
            | "asyncapi.json"
    ) {
        return true;
    }

    // Only YAML/JSON files can be specs; bail out for everything else.
    let is_yaml_json = filename_lower.ends_with(".yaml")
        || filename_lower.ends_with(".yml")
        || filename_lower.ends_with(".json");
    if !is_yaml_json {
        return false;
    }

    // Peek at up to the first 300 bytes for spec-identifying keys.
    // If the 300-byte cut lands inside a multi-byte UTF-8 character,
    // fall back to the longest valid prefix instead of discarding the
    // whole peek (a plain `unwrap_or("")` would misclassify such files).
    let raw = &content[..content.len().min(300)];
    let peek = match std::str::from_utf8(raw) {
        Ok(s) => s,
        // valid_up_to() marks the end of the longest valid UTF-8 prefix,
        // so re-slicing there is guaranteed to succeed.
        Err(e) => std::str::from_utf8(&raw[..e.valid_up_to()]).unwrap_or(""),
    };
    let peek_lower = peek.to_lowercase();
    peek_lower.contains("\"openapi\"")
        || peek_lower.contains("\"swagger\"")
        || peek_lower.contains("\"asyncapi\"")
        || peek_lower.contains("openapi:")
        || peek_lower.contains("swagger:")
        || peek_lower.contains("asyncapi:")
}
50
51impl CodememEngine {
52    // ── Index Persistence ────────────────────────────────────────────────
53
54    /// Save the vector and BM25 indexes to disk if a db_path is configured.
55    /// Compacts the HNSW index if ghost entries exceed 20% of live entries.
56    /// Always clears the dirty flag so `flush_if_dirty()` won't double-save.
57    pub fn save_index(&self) {
58        if let Some(ref db_path) = self.db_path {
59            // Only save vector index if it has been lazily initialized.
60            if self.vector_ready() {
61                let idx_path = db_path.with_extension("idx");
62                if let Ok(mut vi) = self.lock_vector() {
63                    // Compact HNSW if ghost entries exceed threshold
64                    if vi.needs_compaction() {
65                        let ghost = vi.ghost_count();
66                        let live = vi.stats().count;
67                        tracing::info!(
68                            "HNSW ghost compaction: {ghost} ghosts / {live} live entries, rebuilding..."
69                        );
70                        if let Ok(embeddings) = self.storage.list_all_embeddings() {
71                            if let Err(e) = vi.rebuild_from_entries(&embeddings) {
72                                tracing::warn!("HNSW compaction failed: {e}");
73                            }
74                        }
75                    }
76                    if let Err(e) = vi.save(&idx_path) {
77                        tracing::warn!("Failed to save vector index: {e}");
78                    }
79                }
80            }
81
82            // Only save BM25 index if it has been lazily initialized.
83            if self.bm25_ready() {
84                let bm25_path = db_path.with_extension("bm25");
85                if let Ok(bm25) = self.lock_bm25() {
86                    if bm25.needs_save() {
87                        let data = bm25.serialize();
88                        let tmp_path = db_path.with_extension("bm25.tmp");
89                        if let Err(e) = std::fs::write(&tmp_path, &data)
90                            .and_then(|_| std::fs::rename(&tmp_path, &bm25_path))
91                        {
92                            tracing::warn!("Failed to save BM25 index: {e}");
93                        }
94                    }
95                }
96            }
97        }
98        self.dirty.store(false, Ordering::Release);
99    }
100
101    /// Reload the in-memory graph from the database.
102    ///
103    /// **Note**: This loads graph structure but does NOT recompute centrality scores.
104    /// Callers must recompute for their namespace after reloading via
105    /// `recompute_centrality_for_namespace(namespace)` to avoid cross-project PageRank pollution.
106    pub fn reload_graph(&self) -> Result<(), CodememError> {
107        let new_graph = codemem_storage::graph::GraphEngine::from_storage(&*self.storage)?;
108        let mut graph = self.lock_graph()?;
109        *graph = Box::new(new_graph);
110        Ok(())
111    }
112
113    // ── A2: File Watcher Event Processing ───────────────────────────────
114
115    /// Process a single file watcher event by re-indexing changed/created files
116    /// or cleaning up deleted file nodes.
117    ///
118    /// Call this from a watcher event loop:
119    /// ```ignore
120    /// while let Ok(event) = watcher.receiver().recv() {
121    ///     engine.process_watch_event(&event, namespace, Some(root));
122    /// }
123    /// ```
124    pub fn process_watch_event(
125        &self,
126        event: &crate::watch::WatchEvent,
127        namespace: Option<&str>,
128        project_root: Option<&Path>,
129    ) -> Result<(), CodememError> {
130        match event {
131            crate::watch::WatchEvent::FileChanged(path)
132            | crate::watch::WatchEvent::FileCreated(path) => {
133                self.index_single_file(path, namespace, project_root)?;
134            }
135            crate::watch::WatchEvent::FileDeleted(path) => {
136                // Relativize the deleted path so the node ID matches what was indexed.
137                let rel = if let Some(root) = project_root {
138                    path.strip_prefix(root)
139                        .unwrap_or(path)
140                        .to_string_lossy()
141                        .to_string()
142                } else {
143                    path.to_string_lossy().to_string()
144                };
145                self.cleanup_file_nodes(&rel)?;
146            }
147        }
148        Ok(())
149    }
150
    /// Index (or re-index) a single file: parse it, persist nodes/edges/embeddings,
    /// and update the index cache.
    ///
    /// `project_root` is used to relativize the absolute `path` so node IDs are
    /// portable. If `None`, the path is stored as-is (absolute).
    ///
    /// Uses SHA-256 hash dedup to skip re-indexing when content is unchanged.
    /// This prevents duplicate work when both the PostToolUse hook and the
    /// background file watcher fire for the same edit.
    fn index_single_file(
        &self,
        path: &Path,
        namespace: Option<&str>,
        project_root: Option<&Path>,
    ) -> Result<(), CodememError> {
        // Read raw bytes; the parser layers below decide how to decode them.
        let content = std::fs::read(path)?;

        // Relativize against the project root so node IDs stay portable;
        // if stripping fails (path outside root) or there is no root,
        // fall back to the path as given.
        let path_str = if let Some(root) = project_root {
            path.strip_prefix(root)
                .unwrap_or(path)
                .to_string_lossy()
                .to_string()
        } else {
            path.to_string_lossy().to_string()
        };

        // SHA-256 dedup: skip if content unchanged since last index.
        // Uses cached ChangeDetector to avoid reloading all hashes from storage per file.
        // The detector lock is scoped to this block so it is released before
        // the (potentially slow) parse/persist work below.
        let hash = {
            let mut cd_guard = self
                .change_detector
                .lock()
                .map_err(|_| CodememError::LockPoisoned("change_detector".into()))?;
            let ns = namespace.unwrap_or("");
            // Lazily build the detector on first use, seeding it from storage.
            let cd = cd_guard.get_or_insert_with(|| {
                let mut cd = index::incremental::ChangeDetector::new();
                cd.load_from_storage(&*self.storage, ns);
                cd
            });
            let (changed, hash) = cd.check_changed(&path_str, &content);
            if !changed {
                tracing::debug!("Skipping unchanged file: {path_str}");
                return Ok(());
            }
            // Expire static-analysis memories linked to symbols in this changed file
            if self.config.memory.expire_enrichments_on_reindex {
                match self.storage.expire_memories_for_file(&path_str) {
                    Ok(0) => {}
                    Ok(n) => tracing::debug!("Expired {n} enrichment memories for {path_str}"),
                    Err(e) => tracing::warn!("Failed to expire memories for {path_str}: {e}"),
                }
            }
            hash
        };

        // Check if this is a spec file (OpenAPI/AsyncAPI). If so, re-parse it
        // and update endpoints/channels rather than treating it as code.
        if is_spec_file_with_content(&path_str, &content) {
            self.reparse_spec_file(path, namespace.unwrap_or(""))?;
            // Record hash after successful spec parse so the dedup check
            // short-circuits next time this content is seen.
            if let Ok(mut cd_guard) = self.change_detector.lock() {
                if let Some(cd) = cd_guard.as_mut() {
                    cd.record_hash(&path_str, hash);
                    let _ = cd.save_to_storage(&*self.storage, namespace.unwrap_or(""));
                }
            }
            return Ok(());
        }

        let parser = index::CodeParser::new();

        let parse_result = match parser.parse_file(&path_str, &content) {
            Some(pr) => pr,
            None => return Ok(()), // Unsupported file type or parse failure
        };

        // Build a minimal IndexAndResolveResult for this single file
        let mut file_paths = HashSet::new();
        file_paths.insert(parse_result.file_path.clone());

        // Populate the resolver with ALL known symbols from the in-memory graph
        // so cross-file references (calls to functions in other files) can be
        // resolved. Without this, only same-file references would resolve,
        // causing the graph to gradually lose cross-file edges between full
        // re-indexes.
        let mut resolver = index::ReferenceResolver::new();
        resolver.add_symbols(&parse_result.symbols);
        if let Ok(graph) = self.lock_graph() {
            let graph_symbols: Vec<index::Symbol> = graph
                .get_all_nodes()
                .iter()
                .filter(|n| n.id.starts_with("sym:"))
                .filter_map(index::symbol::symbol_from_graph_node)
                .collect();
            resolver.add_symbols(&graph_symbols);
        }
        let resolve_result = resolver.resolve_all_with_unresolved(&parse_result.references);

        // Shape the single-file parse into the same result type the full
        // indexing pipeline produces, so persist_index_results() can be reused.
        let results = IndexAndResolveResult {
            index: index::IndexResult {
                files_scanned: 1,
                files_parsed: 1,
                files_skipped: 0,
                total_symbols: parse_result.symbols.len(),
                total_references: parse_result.references.len(),
                total_chunks: parse_result.chunks.len(),
                parse_results: Vec::new(),
            },
            symbols: parse_result.symbols,
            references: parse_result.references,
            chunks: parse_result.chunks,
            file_paths,
            edges: resolve_result.edges,
            unresolved: resolve_result.unresolved,
            root_path: project_root
                .map(|p| p.to_path_buf())
                .unwrap_or_else(|| path.to_path_buf()),
            scip_build: None,
        };

        self.persist_index_results(&results, namespace)?;

        // Record new hash in the cached detector after successful persist —
        // doing it only now means a failed persist will be retried on the
        // next event rather than silently skipped as "unchanged".
        if let Ok(mut cd_guard) = self.change_detector.lock() {
            if let Some(cd) = cd_guard.as_mut() {
                cd.record_hash(&path_str, hash);
                if let Err(e) = cd.save_to_storage(&*self.storage, namespace.unwrap_or("")) {
                    tracing::warn!("Failed to save file hash for {path_str}: {e}");
                }
            }
        }

        Ok(())
    }
285
286    // ── A2b: Symbol-Level Diff on Re-index ────────────────────────────
287
    /// Remove symbols that existed for a file before re-indexing but are no
    /// longer present in the new parse results. Returns count of cleaned symbols.
    ///
    /// For code→code edges (CALLS, IMPORTS, etc.), performs a hard delete.
    /// For memory→symbol edges, creates a live redirected edge pointing to the
    /// parent file node, preserving the memory→file connection so recall can
    /// still traverse it. The original edge is then deleted along with the
    /// stale symbol node.
    ///
    /// `old_symbol_ids` should be the set of symbol IDs that existed for this
    /// file before re-indexing (collected from the in-memory graph by the caller
    /// in a single pass across all files).
    pub fn cleanup_stale_symbols(
        &self,
        file_path: &str,
        old_symbol_ids: &HashSet<String>,
        new_symbol_ids: &HashSet<String>,
    ) -> Result<usize, CodememError> {
        // Compute stale set: symbols that existed before but are not in the new parse
        let stale_ids: Vec<&String> = old_symbol_ids
            .iter()
            .filter(|id| !new_symbol_ids.contains(*id))
            .collect();

        if stale_ids.is_empty() {
            return Ok(0);
        }

        let count = stale_ids.len();
        tracing::info!(
            "Cleaning up {count} stale symbols for {file_path}: {:?}",
            stale_ids
        );

        let file_node_id = format!("file:{file_path}");
        // Dedup set: multiple stale symbols may carry edges to the same memory
        // node; each (memory, file) pair should get at most one redirect.
        let mut redirected_pairs: std::collections::HashSet<(String, String)> =
            std::collections::HashSet::new();
        // Redirects created in storage are collected here so they can also be
        // inserted into the in-memory graph after deletion (see below).
        let mut redirected_edges: Vec<codemem_core::Edge> = Vec::new();
        for sym_id in &stale_ids {
            // Before deleting the symbol, redirect memory→symbol edges to the
            // parent file node, preserving historical context.
            // Memory node IDs are UUIDs (no known prefix like sym:/file:/chunk:).
            let edges = self.storage.get_edges_for_node(sym_id.as_str())?;
            for edge in &edges {
                // Identify the node on the other end of the edge, regardless
                // of direction.
                let other = if edge.src.as_str() == sym_id.as_str() {
                    &edge.dst
                } else {
                    &edge.src
                };
                // Anything without a code-node prefix is treated as a memory node.
                let is_code_node = other.starts_with("sym:")
                    || other.starts_with("file:")
                    || other.starts_with("chunk:")
                    || other.starts_with("pkg:");
                if !is_code_node {
                    // Skip if we already redirected this memory→file pair
                    let pair = (other.to_string(), file_node_id.clone());
                    if !redirected_pairs.insert(pair) {
                        continue;
                    }
                    // Clone the edge and swap whichever endpoint referenced the
                    // stale symbol for the parent file node.
                    let mut redirected = edge.clone();
                    if redirected.src.as_str() == sym_id.as_str() {
                        redirected.src = file_node_id.clone();
                    } else {
                        redirected.dst = file_node_id.clone();
                    }
                    // Don't set valid_to — the redirect should be a live,
                    // queryable edge so recall can still traverse memory→file.
                    redirected.id = format!("{}-redirected", edge.id);
                    if let Err(e) = self.storage.insert_graph_edge(&redirected) {
                        tracing::warn!("Failed to redirect memory edge {}: {e}", edge.id);
                    }
                    redirected_edges.push(redirected);
                }
            }

            // Delete all edges and the node itself (edges first, then node,
            // then embedding; each step is best-effort and logged on failure).
            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
                tracing::warn!("Failed to delete edges for stale symbol {sym_id}: {e}");
            }
            if let Err(e) = self.storage.delete_graph_node(sym_id) {
                tracing::warn!("Failed to delete stale symbol node {sym_id}: {e}");
            }
            if let Err(e) = self.storage.delete_embedding(sym_id) {
                tracing::warn!("Failed to delete embedding for stale symbol {sym_id}: {e}");
            }
        }

        // Clean up in-memory graph and vector index
        {
            let mut graph = self.lock_graph()?;
            for sym_id in &stale_ids {
                if let Err(e) = graph.remove_node(sym_id.as_str()) {
                    tracing::warn!("Failed to remove stale {sym_id} from in-memory graph: {e}");
                }
            }
            // Add redirected memory→file edges so they're visible to
            // in-memory traversal (BFS, PageRank, recall) during this session.
            for edge in redirected_edges {
                let _ = graph.add_edge(edge);
            }
        }
        {
            let mut vec = self.lock_vector()?;
            for sym_id in &stale_ids {
                if let Err(e) = vec.remove(sym_id.as_str()) {
                    tracing::warn!("Failed to remove stale {sym_id} from vector index: {e}");
                }
            }
        }
        // Remove stale entries from BM25 index so deleted/renamed symbols
        // don't persist in text search results or skew IDF calculations.
        if let Ok(mut bm25) = self.lock_bm25() {
            for sym_id in &stale_ids {
                bm25.remove_document(sym_id);
            }
        }

        Ok(count)
    }
407
408    // ── A3: File Deletion Cleanup ───────────────────────────────────────
409
410    /// Remove graph nodes, edges, and embeddings for a single deleted file.
411    fn cleanup_file_nodes(&self, file_path: &str) -> Result<(), CodememError> {
412        let file_node_id = format!("file:{file_path}");
413
414        // Remove all chunk nodes for this file
415        let chunk_prefix = format!("chunk:{file_path}:");
416        if let Err(e) = self.storage.delete_graph_nodes_by_prefix(&chunk_prefix) {
417            tracing::warn!("Failed to delete chunk nodes for {file_path}: {e}");
418        }
419
420        // Remove symbol nodes for this file by checking graph
421        let graph = self.lock_graph()?;
422        let sym_ids: Vec<String> = graph
423            .get_all_nodes()
424            .into_iter()
425            .filter(|n| {
426                n.id.starts_with("sym:")
427                    && n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path)
428            })
429            .map(|n| n.id.clone())
430            .collect();
431        drop(graph);
432
433        for sym_id in &sym_ids {
434            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
435                tracing::warn!("Failed to delete graph edges for {sym_id}: {e}");
436            }
437            if let Err(e) = self.storage.delete_graph_node(sym_id) {
438                tracing::warn!("Failed to delete graph node {sym_id}: {e}");
439            }
440            if let Err(e) = self.storage.delete_embedding(sym_id) {
441                tracing::warn!("Failed to delete embedding {sym_id}: {e}");
442            }
443        }
444
445        // Remove file node itself
446        if let Err(e) = self.storage.delete_graph_edges_for_node(&file_node_id) {
447            tracing::warn!("Failed to delete graph edges for {file_node_id}: {e}");
448        }
449        if let Err(e) = self.storage.delete_graph_node(&file_node_id) {
450            tracing::warn!("Failed to delete graph node {file_node_id}: {e}");
451        }
452
453        // Clean up in-memory graph
454        let mut graph = self.lock_graph()?;
455        for sym_id in &sym_ids {
456            if let Err(e) = graph.remove_node(sym_id) {
457                tracing::warn!("Failed to remove {sym_id} from in-memory graph: {e}");
458            }
459        }
460        // Remove chunk nodes from in-memory graph
461        let chunk_ids: Vec<String> = graph
462            .get_all_nodes()
463            .into_iter()
464            .filter(|n| n.id.starts_with(&format!("chunk:{file_path}:")))
465            .map(|n| n.id.clone())
466            .collect();
467        for chunk_id in &chunk_ids {
468            if let Err(e) = graph.remove_node(chunk_id) {
469                tracing::warn!("Failed to remove {chunk_id} from in-memory graph: {e}");
470            }
471        }
472        if let Err(e) = graph.remove_node(&file_node_id) {
473            tracing::warn!("Failed to remove {file_node_id} from in-memory graph: {e}");
474        }
475        drop(graph);
476
477        // Remove stale embeddings from vector index
478        let mut vec = self.lock_vector()?;
479        for sym_id in &sym_ids {
480            if let Err(e) = vec.remove(sym_id) {
481                tracing::warn!("Failed to remove {sym_id} from vector index: {e}");
482            }
483        }
484        for chunk_id in &chunk_ids {
485            if let Err(e) = vec.remove(chunk_id) {
486                tracing::warn!("Failed to remove {chunk_id} from vector index: {e}");
487            }
488        }
489        drop(vec);
490
491        // Remove deleted symbols/chunks from BM25 index
492        if let Ok(mut bm25) = self.lock_bm25() {
493            for sym_id in &sym_ids {
494                bm25.remove_document(sym_id);
495            }
496            for chunk_id in &chunk_ids {
497                bm25.remove_document(chunk_id);
498            }
499        }
500
501        self.save_index();
502        Ok(())
503    }
504
505    // ── A3b: Orphan Detection ─────────────────────────────────────────
506
507    /// Scan for orphaned symbol/chunk nodes whose files no longer exist on disk.
508    /// Also cleans up dangling edges (src or dst node doesn't exist).
509    /// Returns `(symbols_cleaned, edges_cleaned)`.
510    ///
511    /// When `project_root` is `None`, file-existence checks are skipped
512    /// (only dangling edge cleanup runs) to avoid CWD-dependent path
513    /// resolution that could cause mass deletion.
514    pub fn detect_orphans(
515        &self,
516        project_root: Option<&Path>,
517    ) -> Result<(usize, usize), CodememError> {
518        // Use storage for both nodes and edges to avoid in-memory/storage sync races.
519        let all_nodes = self.storage.all_graph_nodes()?;
520        let node_ids: HashSet<String> = all_nodes.iter().map(|n| n.id.clone()).collect();
521
522        let mut orphan_sym_ids: Vec<String> = Vec::new();
523
524        // Only check file existence when we have a known project root.
525        // Without it, relative paths resolve against CWD which may be wrong.
526        if let Some(root) = project_root {
527            for node in &all_nodes {
528                // Collect sym: and chunk: nodes whose backing files are gone
529                if node.id.starts_with("sym:") || node.id.starts_with("chunk:") {
530                    let file_path = match node.payload.get("file_path").and_then(|v| v.as_str()) {
531                        Some(fp) => fp,
532                        None => continue,
533                    };
534                    let abs_path = root.join(file_path);
535                    if !abs_path.exists() {
536                        orphan_sym_ids.push(node.id.clone());
537                    }
538                }
539                // Collect file: nodes whose backing files are gone
540                else if let Some(fp) = node.id.strip_prefix("file:") {
541                    let abs_path = root.join(fp);
542                    if !abs_path.exists() {
543                        orphan_sym_ids.push(node.id.clone());
544                    }
545                }
546                // Collect pkg: nodes whose directories are gone
547                else if let Some(dir) = node.id.strip_prefix("pkg:") {
548                    let dir_trimmed = dir.trim_end_matches('/');
549                    let abs_path = root.join(dir_trimmed);
550                    if !abs_path.exists() {
551                        orphan_sym_ids.push(node.id.clone());
552                    }
553                }
554            }
555        }
556
557        // Also find dangling edges (src or dst doesn't exist in graph)
558        let all_edges = self.storage.all_graph_edges()?;
559        let mut dangling_edge_ids: Vec<String> = Vec::new();
560        for edge in &all_edges {
561            if !node_ids.contains(&edge.src) || !node_ids.contains(&edge.dst) {
562                dangling_edge_ids.push(edge.id.clone());
563            }
564        }
565
566        let symbols_cleaned = orphan_sym_ids.len();
567
568        // Clean up orphan nodes
569        for sym_id in &orphan_sym_ids {
570            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
571                tracing::warn!("Orphan cleanup: failed to delete edges for {sym_id}: {e}");
572            }
573            if let Err(e) = self.storage.delete_graph_node(sym_id) {
574                tracing::warn!("Orphan cleanup: failed to delete node {sym_id}: {e}");
575            }
576            if let Err(e) = self.storage.delete_embedding(sym_id) {
577                tracing::warn!("Orphan cleanup: failed to delete embedding {sym_id}: {e}");
578            }
579        }
580
581        // Clean up orphan nodes from in-memory graph + vector
582        if !orphan_sym_ids.is_empty() {
583            if let Ok(mut graph) = self.lock_graph() {
584                for sym_id in &orphan_sym_ids {
585                    let _ = graph.remove_node(sym_id);
586                }
587            }
588            if let Ok(mut vec) = self.lock_vector() {
589                for sym_id in &orphan_sym_ids {
590                    let _ = vec.remove(sym_id);
591                }
592            }
593        }
594
595        // Delete dangling edges that weren't already removed by node cleanup
596        let mut edges_cleaned = 0usize;
597        for edge_id in &dangling_edge_ids {
598            match self.storage.delete_graph_edge(edge_id) {
599                Ok(true) => edges_cleaned += 1,
600                Ok(false) => {} // Already deleted by node cleanup above
601                Err(e) => {
602                    tracing::warn!("Orphan cleanup: failed to delete dangling edge {edge_id}: {e}");
603                }
604            }
605        }
606
607        if symbols_cleaned > 0 || edges_cleaned > 0 {
608            tracing::info!(
609                "Orphan scan: cleaned {symbols_cleaned} symbol/chunk nodes, {edges_cleaned} dangling edges"
610            );
611        }
612
613        Ok((symbols_cleaned, edges_cleaned))
614    }
615
616    // ── A3c: Spec File Re-parsing ──────────────────────────────────────
617
618    /// Re-parse a spec file (OpenAPI/AsyncAPI) and update stored endpoints/channels.
619    fn reparse_spec_file(&self, path: &Path, namespace: &str) -> Result<(), CodememError> {
620        use crate::index::spec_parser::{parse_asyncapi, parse_openapi, SpecFileResult};
621
622        let result = if let Some(openapi) = parse_openapi(path) {
623            Some(SpecFileResult::OpenApi(openapi))
624        } else {
625            parse_asyncapi(path).map(SpecFileResult::AsyncApi)
626        };
627
628        match result {
629            Some(SpecFileResult::OpenApi(spec)) => {
630                for ep in &spec.endpoints {
631                    let _ = self.storage.store_api_endpoint(
632                        &ep.method,
633                        &ep.path,
634                        ep.operation_id.as_deref().unwrap_or(""),
635                        namespace,
636                    );
637                }
638                tracing::info!(
639                    "Re-parsed OpenAPI spec: {} endpoints from {}",
640                    spec.endpoints.len(),
641                    path.display()
642                );
643            }
644            Some(SpecFileResult::AsyncApi(spec)) => {
645                for ch in &spec.channels {
646                    let _ = self.storage.store_event_channel(
647                        &ch.channel,
648                        &ch.direction,
649                        ch.protocol.as_deref().unwrap_or(""),
650                        ch.operation_id.as_deref().unwrap_or(""),
651                        namespace,
652                        ch.description.as_deref().unwrap_or(""),
653                    );
654                }
655                tracing::info!(
656                    "Re-parsed AsyncAPI spec: {} channels from {}",
657                    spec.channels.len(),
658                    path.display()
659                );
660            }
661            None => {
662                tracing::debug!("Not a recognized spec file: {}", path.display());
663            }
664        }
665        Ok(())
666    }
667
668    // ── A4: Unified Analyze Pipeline ────────────────────────────────────
669
670    /// Full analysis pipeline: index → persist → enrich → recompute centrality.
671    ///
672    /// This is the single entry point for all callers (CLI, MCP, API).
673    /// Supports incremental indexing via `ChangeDetector`, progress callbacks,
674    /// and returns comprehensive results.
675    pub fn analyze(&self, options: AnalyzeOptions<'_>) -> Result<AnalyzeResult, CodememError> {
676        let root = options.path;
677
678        // Eagerly initialize embeddings/vector/BM25 for the full analysis pipeline.
679        // This triggers lazy init so that embed_and_persist() finds them ready.
680        // Skip if embeddings are not needed.
681        if !options.skip_embed {
682            drop(self.lock_embeddings());
683            drop(self.lock_vector());
684            drop(self.lock_bm25());
685        }
686
687        // 0. SCIP phase: run indexers, parse results, build graph data.
688        // Runs BEFORE ast-grep so we know which files SCIP covered.
689        let (scip_covered, scip_build) = if !options.skip_scip && self.config.scip.enabled {
690            match self.run_scip_phase(root, options.namespace) {
691                Ok((covered, build)) => (Some(covered), Some(build)),
692                Err(e) => {
693                    tracing::warn!("SCIP phase failed, falling back to ast-grep only: {e}");
694                    (None, None)
695                }
696            }
697        } else {
698            (None, None)
699        };
700
701        let scip_nodes_created = scip_build.as_ref().map_or(0, |b| b.nodes.len());
702        let scip_edges_created = scip_build.as_ref().map_or(0, |b| b.edges.len());
703        let scip_files_covered = scip_covered.as_ref().map_or(0, |s| s.len());
704
705        // 1. Index (ast-grep skips symbol extraction for SCIP-covered files)
706        // When force=true, ignore the change detector so all files are re-processed.
707        let mut indexer = match options.change_detector {
708            Some(cd) if !options.force => Indexer::with_change_detector(cd),
709            _ => Indexer::new(),
710        };
711        let resolved =
712            indexer.index_and_resolve_with_scip(root, scip_covered.as_ref(), scip_build)?;
713
714        // 2. Persist (with or without progress callback)
715        let persist = if options.skip_embed {
716            self.persist_graph_only(&resolved, Some(options.namespace))?
717        } else if let Some(ref on_progress) = options.progress {
718            self.persist_index_results_with_progress(
719                &resolved,
720                Some(options.namespace),
721                |done, total| {
722                    on_progress(AnalyzeProgress::Embedding { done, total });
723                },
724            )?
725        } else {
726            self.persist_index_results(&resolved, Some(options.namespace))?
727        };
728
729        // Cache results for structural queries
730        {
731            if let Ok(mut cache) = self.lock_index_cache() {
732                *cache = Some(crate::IndexCache {
733                    symbols: resolved.symbols,
734                    chunks: resolved.chunks,
735                    root_path: root.to_string_lossy().to_string(),
736                });
737            }
738        }
739
740        // 3. Enrich (skip if requested)
741        let enrichment = if options.skip_enrich {
742            crate::enrichment::EnrichmentPipelineResult {
743                results: serde_json::json!({}),
744                total_insights: 0,
745            }
746        } else {
747            let path_str = root.to_str().unwrap_or("");
748            self.run_enrichments(
749                path_str,
750                &[],
751                options.git_days,
752                Some(options.namespace),
753                None,
754            )
755        };
756
757        // 4. Recompute centrality scoped to this namespace so cross-project
758        //    scores from other indexed repos don't pollute PageRank here.
759        //    (PageRank only; betweenness is lazy/on-demand because Brandes'
760        //    algorithm is O(sqrt(n) * (V+E)) even with sampling, which is
761        //    too slow for large SCIP-indexed graphs with 50K+ nodes.)
762        self.lock_graph()?
763            .recompute_centrality_for_namespace(options.namespace);
764
765        // 5. Compute summary stats
766        let top_nodes = self
767            .find_important_nodes(10, 0.85, Some(options.namespace))
768            .unwrap_or_default();
769        let community_count = self.louvain_communities(1.0).map(|c| c.len()).unwrap_or(0);
770
771        // 6. Save indexes
772        self.save_index();
773
774        // Save incremental state
775        indexer
776            .change_detector()
777            .save_to_storage(self.storage(), options.namespace)?;
778
779        Ok(AnalyzeResult {
780            files_parsed: resolved.index.files_parsed,
781            files_skipped: resolved.index.files_skipped,
782            symbols_found: resolved.index.total_symbols,
783            edges_resolved: persist.edges_resolved,
784            chunks_stored: persist.chunks_stored,
785            symbols_embedded: persist.symbols_embedded,
786            chunks_embedded: persist.chunks_embedded,
787            chunks_pruned: persist.chunks_pruned,
788            symbols_pruned: persist.symbols_pruned,
789            enrichment_results: enrichment.results,
790            total_insights: enrichment.total_insights,
791            top_nodes,
792            community_count,
793            scip_nodes_created,
794            scip_edges_created,
795            scip_files_covered,
796        })
797    }
798
799    /// Run the SCIP phase: orchestrate indexers, parse results, build graph data.
800    fn run_scip_phase(
801        &self,
802        root: &Path,
803        namespace: &str,
804    ) -> Result<(HashSet<String>, index::scip::graph_builder::ScipBuildResult), CodememError> {
805        let orchestrator =
806            index::scip::orchestrate::ScipOrchestrator::new(self.config.scip.clone());
807        let orch_result = orchestrator.run(root, namespace)?;
808
809        if orch_result.scip_result.covered_files.is_empty() {
810            return Ok((
811                HashSet::new(),
812                index::scip::graph_builder::ScipBuildResult::default(),
813            ));
814        }
815
816        for (lang, err) in &orch_result.failed_languages {
817            tracing::warn!("SCIP indexer for {:?} failed: {}", lang, err);
818        }
819        for lang in &orch_result.indexed_languages {
820            tracing::info!("SCIP indexed {:?} successfully", lang);
821        }
822
823        let build = index::scip::graph_builder::build_graph(
824            &orch_result.scip_result,
825            Some(namespace),
826            &self.config.scip,
827        );
828        let covered: HashSet<String> = build.files_covered.clone();
829
830        tracing::info!(
831            "SCIP phase: {} nodes, {} edges, {} ext nodes, {} files covered, {} doc memories",
832            build.nodes.len(),
833            build.edges.len(),
834            build.ext_nodes_created,
835            covered.len(),
836            build.doc_memories_created,
837        );
838
839        Ok((covered, build))
840    }
841
842    // ── A8: Session Context Synthesis ───────────────────────────────────
843
844    /// Synthesize context for a new session: recent memories, pending analyses,
845    /// active patterns, and last session summary.
846    pub fn session_context(&self, namespace: Option<&str>) -> Result<SessionContext, CodememError> {
847        let now = chrono::Utc::now();
848        let cutoff_24h = now - chrono::Duration::hours(24);
849
850        // 1. Recent memories (last 24h)
851        let ids = match namespace {
852            Some(ns) => self.storage.list_memory_ids_for_namespace(ns)?,
853            None => self.storage.list_memory_ids()?,
854        };
855
856        let mut recent_memories = Vec::new();
857        let mut pending_analyses = Vec::new();
858
859        for id in ids.iter().rev().take(200) {
860            if let Ok(Some(m)) = self.storage.get_memory_no_touch(id) {
861                // Collect pending analyses
862                if m.tags.contains(&"pending-analysis".to_string()) {
863                    pending_analyses.push(m.clone());
864                }
865                // Collect recent memories from last 24h
866                if m.created_at >= cutoff_24h {
867                    recent_memories.push(m);
868                }
869                if recent_memories.len() >= 50 && pending_analyses.len() >= 10 {
870                    break;
871                }
872            }
873        }
874
875        // 2. Active patterns
876        let session_count = self.storage.session_count(namespace).unwrap_or(1).max(1);
877        let active_patterns = patterns::detect_patterns(
878            &*self.storage,
879            namespace,
880            2, // min_frequency
881            session_count,
882        )
883        .unwrap_or_default();
884
885        // 3. Last session summary
886        let last_session_summary = self
887            .storage
888            .list_sessions(namespace, 1)?
889            .into_iter()
890            .next()
891            .and_then(|s| s.summary);
892
893        Ok(SessionContext {
894            recent_memories,
895            pending_analyses,
896            active_patterns,
897            last_session_summary,
898        })
899    }
900}
901
902// ── Result Types ────────────────────────────────────────────────────────────
903
/// Options for the unified `analyze()` pipeline.
pub struct AnalyzeOptions<'a> {
    /// Project root directory to analyze.
    pub path: &'a Path,
    /// Namespace under which graph nodes are persisted and centrality is scoped.
    pub namespace: &'a str,
    /// Git-history window (in days) forwarded to the enrichment pipeline.
    pub git_days: u64,
    /// Optional incremental-change detector; when present (and `force` is
    /// false) the indexer uses it to skip unchanged files.
    pub change_detector: Option<index::incremental::ChangeDetector>,
    /// Optional callback invoked with progress events during the embedding phase.
    pub progress: Option<Box<dyn Fn(AnalyzeProgress) + Send + 'a>>,
    /// Skip SCIP indexing — use ast-grep only (faster, less accurate).
    pub skip_scip: bool,
    /// Skip embedding phase (graph + chunks stored but not vectorized).
    pub skip_embed: bool,
    /// Skip enrichment phase (no git-history/complexity/etc. analysis).
    pub skip_enrich: bool,
    /// Force re-index even when file SHAs haven't changed.
    pub force: bool,
}
920
/// Progress events emitted during analysis.
#[derive(Debug, Clone)]
pub enum AnalyzeProgress {
    /// Embedding progress: `done` out of `total` items embedded so far.
    Embedding { done: usize, total: usize },
}
926
/// Result of the unified `analyze()` pipeline.
#[derive(Debug)]
pub struct AnalyzeResult {
    /// Files the indexer parsed this run.
    pub files_parsed: usize,
    /// Files the indexer skipped this run.
    pub files_skipped: usize,
    /// Total symbols found by the indexer.
    pub symbols_found: usize,
    /// Graph edges resolved during the persistence phase.
    pub edges_resolved: usize,
    /// Code chunks stored during the persistence phase.
    pub chunks_stored: usize,
    /// Symbols embedded into the vector index.
    pub symbols_embedded: usize,
    /// Chunks embedded into the vector index.
    pub chunks_embedded: usize,
    /// Chunks pruned during persistence.
    pub chunks_pruned: usize,
    /// Symbols pruned during persistence.
    pub symbols_pruned: usize,
    /// JSON payload produced by the enrichment pipeline (empty object when skipped).
    pub enrichment_results: serde_json::Value,
    /// Total insights produced by the enrichment pipeline.
    pub total_insights: usize,
    /// Top-ranked nodes by importance (up to 10, threshold 0.85).
    pub top_nodes: Vec<crate::graph_ops::RankedNode>,
    /// Number of Louvain communities detected (0 if detection failed).
    pub community_count: usize,
    /// SCIP nodes created (sym: + ext: nodes).
    pub scip_nodes_created: usize,
    /// SCIP edges created (CALLS, IMPORTS, READS, WRITES, IMPLEMENTS, etc.).
    pub scip_edges_created: usize,
    /// Files covered by SCIP indexers (ast-grep skipped symbol extraction for these).
    pub scip_files_covered: usize,
}
950
/// Session context synthesized at session start.
///
/// Produced by `CodememEngine::session_context`.
#[derive(Debug)]
pub struct SessionContext {
    /// Memories created in the last 24 hours.
    pub recent_memories: Vec<MemoryNode>,
    /// Memories tagged `pending-analysis` awaiting code-mapper review.
    pub pending_analyses: Vec<MemoryNode>,
    /// Cross-session patterns detected with sufficient frequency.
    pub active_patterns: Vec<DetectedPattern>,
    /// Summary text from the most recent session (if any).
    pub last_session_summary: Option<String>,
}
963
#[cfg(test)]
mod tests {
    use super::*;
    use codemem_core::{Edge, GraphNode, NodeKind, RelationshipType};
    use std::collections::{HashMap, HashSet};

    /// Create a test engine backed by a temporary database.
    fn test_engine() -> CodememEngine {
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("test.db");
        // Keep the tempdir alive by leaking it (tests are short-lived).
        let _ = Box::leak(Box::new(dir));
        CodememEngine::from_db_path(&db_path).unwrap()
    }

    /// Build a minimal `GraphNode` with the given id/kind, optionally
    /// attaching a `file_path` payload entry (how symbol nodes record the
    /// file they came from).
    fn graph_node(id: &str, kind: NodeKind, file_path: Option<&str>) -> GraphNode {
        let mut payload = HashMap::new();
        if let Some(fp) = file_path {
            payload.insert(
                "file_path".to_string(),
                serde_json::Value::String(fp.to_string()),
            );
        }
        GraphNode {
            id: id.to_string(),
            kind,
            label: id.to_string(),
            payload,
            centrality: 0.0,
            memory_id: None,
            namespace: None,
            valid_from: None,
            valid_to: None,
        }
    }

    /// Build an `Edge` with a deterministic id of the form `"{rel}:{src}->{dst}"`
    /// and unit weight.
    fn edge(src: &str, dst: &str, rel: RelationshipType) -> Edge {
        Edge {
            id: format!("{rel}:{src}->{dst}"),
            src: src.to_string(),
            dst: dst.to_string(),
            relationship: rel,
            weight: 1.0,
            properties: HashMap::new(),
            created_at: chrono::Utc::now(),
            valid_from: None,
            valid_to: None,
        }
    }

    // ── cleanup_stale_symbols tests ──────────────────────────────────────

    /// A symbol present in `old_ids` but absent from `new_ids` must be
    /// deleted from the in-memory graph; surviving symbols stay.
    #[test]
    fn cleanup_stale_symbols_deletes_stale_nodes() {
        let engine = test_engine();

        // Set up: file with two symbols, one will become stale
        let file = graph_node("file:src/a.rs", NodeKind::File, None);
        let sym_keep = graph_node("sym:a::keep", NodeKind::Function, Some("src/a.rs"));
        let sym_stale = graph_node("sym:a::stale", NodeKind::Function, Some("src/a.rs"));

        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(file).unwrap();
            g.add_node(sym_keep.clone()).unwrap();
            g.add_node(sym_stale.clone()).unwrap();
            g.add_edge(edge(
                "file:src/a.rs",
                "sym:a::keep",
                RelationshipType::Contains,
            ))
            .unwrap();
            g.add_edge(edge(
                "file:src/a.rs",
                "sym:a::stale",
                RelationshipType::Contains,
            ))
            .unwrap();
        }
        // Also persist to storage so cleanup can find edges
        let _ =
            engine
                .storage
                .insert_graph_node(&graph_node("file:src/a.rs", NodeKind::File, None));
        let _ = engine.storage.insert_graph_node(&sym_keep);
        let _ = engine.storage.insert_graph_node(&sym_stale);
        let _ = engine.storage.insert_graph_edge(&edge(
            "file:src/a.rs",
            "sym:a::keep",
            RelationshipType::Contains,
        ));
        let _ = engine.storage.insert_graph_edge(&edge(
            "file:src/a.rs",
            "sym:a::stale",
            RelationshipType::Contains,
        ));

        // old = {keep, stale}, new = {keep} → exactly one symbol is stale
        let old_ids: HashSet<String> = ["sym:a::keep", "sym:a::stale"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let new_ids: HashSet<String> = ["sym:a::keep"].iter().map(|s| s.to_string()).collect();

        let cleaned = engine
            .cleanup_stale_symbols("src/a.rs", &old_ids, &new_ids)
            .unwrap();
        assert_eq!(cleaned, 1);

        // Stale node should be gone from in-memory graph
        let g = engine.lock_graph().unwrap();
        assert!(g.get_node("sym:a::stale").unwrap().is_none());
        assert!(g.get_node("sym:a::keep").unwrap().is_some());
    }

    /// A memory linked to a stale symbol must not lose its link: cleanup is
    /// expected to redirect the memory edge to the symbol's file node, with
    /// an edge id containing "-redirected".
    #[test]
    fn cleanup_stale_symbols_redirects_memory_edges_to_graph() {
        let engine = test_engine();

        let file = graph_node("file:src/a.rs", NodeKind::File, None);
        let sym_stale = graph_node("sym:a::old_fn", NodeKind::Function, Some("src/a.rs"));
        let mem = graph_node("mem-uuid-123", NodeKind::Memory, None);

        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(file.clone()).unwrap();
            g.add_node(sym_stale.clone()).unwrap();
            g.add_node(mem.clone()).unwrap();
            g.add_edge(edge(
                "file:src/a.rs",
                "sym:a::old_fn",
                RelationshipType::Contains,
            ))
            .unwrap();
            g.add_edge(edge(
                "mem-uuid-123",
                "sym:a::old_fn",
                RelationshipType::RelatesTo,
            ))
            .unwrap();
        }
        let _ = engine.storage.insert_graph_node(&file);
        let _ = engine.storage.insert_graph_node(&sym_stale);
        let _ = engine.storage.insert_graph_node(&mem);
        let _ = engine.storage.insert_graph_edge(&edge(
            "file:src/a.rs",
            "sym:a::old_fn",
            RelationshipType::Contains,
        ));
        let _ = engine.storage.insert_graph_edge(&edge(
            "mem-uuid-123",
            "sym:a::old_fn",
            RelationshipType::RelatesTo,
        ));

        // Every old symbol is stale (new set is empty)
        let old_ids: HashSet<String> = ["sym:a::old_fn"].iter().map(|s| s.to_string()).collect();
        let new_ids: HashSet<String> = HashSet::new();

        engine
            .cleanup_stale_symbols("src/a.rs", &old_ids, &new_ids)
            .unwrap();

        // The redirected edge should be in the in-memory graph
        let g = engine.lock_graph().unwrap();
        let file_edges = g.get_edges("file:src/a.rs").unwrap();
        let has_redirect = file_edges.iter().any(|e| {
            (e.src == "mem-uuid-123" || e.dst == "mem-uuid-123") && e.id.contains("-redirected")
        });
        assert!(
            has_redirect,
            "redirected memory→file edge should be in the in-memory graph"
        );
    }

    /// When one memory is linked to several stale symbols in the same file,
    /// only a single redirected memory→file edge should be created.
    #[test]
    fn cleanup_stale_symbols_deduplicates_redirects() {
        let engine = test_engine();

        let file = graph_node("file:src/a.rs", NodeKind::File, None);
        let sym1 = graph_node("sym:a::fn1", NodeKind::Function, Some("src/a.rs"));
        let sym2 = graph_node("sym:a::fn2", NodeKind::Function, Some("src/a.rs"));
        let mem = graph_node("mem-uuid-456", NodeKind::Memory, None);

        // Same memory linked to two symbols in the same file
        let _ = engine.storage.insert_graph_node(&file);
        let _ = engine.storage.insert_graph_node(&sym1);
        let _ = engine.storage.insert_graph_node(&sym2);
        let _ = engine.storage.insert_graph_node(&mem);
        let _ = engine.storage.insert_graph_edge(&edge(
            "mem-uuid-456",
            "sym:a::fn1",
            RelationshipType::RelatesTo,
        ));
        let _ = engine.storage.insert_graph_edge(&edge(
            "mem-uuid-456",
            "sym:a::fn2",
            RelationshipType::RelatesTo,
        ));

        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(file).unwrap();
            g.add_node(sym1).unwrap();
            g.add_node(sym2).unwrap();
            g.add_node(mem).unwrap();
        }

        // Both symbols become stale at once
        let old_ids: HashSet<String> = ["sym:a::fn1", "sym:a::fn2"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let new_ids: HashSet<String> = HashSet::new();

        engine
            .cleanup_stale_symbols("src/a.rs", &old_ids, &new_ids)
            .unwrap();

        // Should have exactly one redirect edge, not two
        let g = engine.lock_graph().unwrap();
        let file_edges = g.get_edges("file:src/a.rs").unwrap();
        let redirect_count = file_edges
            .iter()
            .filter(|e| e.id.contains("-redirected"))
            .count();
        assert_eq!(
            redirect_count, 1,
            "should have exactly 1 redirected edge, got {redirect_count}"
        );
    }

    // ── detect_orphans tests ─────────────────────────────────────────────

    /// Without a project root, detect_orphans must not delete symbols based
    /// on file existence (it has no root to resolve file paths against).
    #[test]
    fn detect_orphans_skips_file_check_when_no_root() {
        let engine = test_engine();

        // Add a symbol node with a file path that definitely doesn't exist
        let sym = graph_node(
            "sym:nonexistent::fn",
            NodeKind::Function,
            Some("does/not/exist.rs"),
        );
        let _ = engine.storage.insert_graph_node(&sym);
        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(sym).unwrap();
        }

        // With None, should NOT delete the node (skips file existence check)
        let (symbols_cleaned, _) = engine.detect_orphans(None).unwrap();
        assert_eq!(
            symbols_cleaned, 0,
            "detect_orphans(None) should not delete nodes based on file existence"
        );
    }

    /// With a project root, a symbol whose file is missing under that root
    /// is treated as orphaned and removed.
    #[test]
    fn detect_orphans_removes_missing_files_with_root() {
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("test.db");
        let engine = CodememEngine::from_db_path(&db_path).unwrap();

        // Add a symbol whose file doesn't exist under the project root
        let sym = graph_node(
            "sym:missing::fn",
            NodeKind::Function,
            Some("src/missing.rs"),
        );
        let _ = engine.storage.insert_graph_node(&sym);
        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(sym).unwrap();
        }

        let (symbols_cleaned, _) = engine.detect_orphans(Some(dir.path())).unwrap();
        assert_eq!(symbols_cleaned, 1);
    }

    /// With a project root, a symbol whose file DOES exist must be kept.
    #[test]
    fn detect_orphans_keeps_existing_files() {
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("test.db");
        let engine = CodememEngine::from_db_path(&db_path).unwrap();

        // Create the actual file so it won't be orphaned
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).unwrap();
        std::fs::write(src_dir.join("exists.rs"), "fn main() {}").unwrap();

        let sym = graph_node(
            "sym:exists::main",
            NodeKind::Function,
            Some("src/exists.rs"),
        );
        let _ = engine.storage.insert_graph_node(&sym);
        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(sym).unwrap();
        }

        let (symbols_cleaned, _) = engine.detect_orphans(Some(dir.path())).unwrap();
        assert_eq!(symbols_cleaned, 0);
    }

    // Note: dangling edge cleanup in detect_orphans is a defensive no-op
    // because graph_edges has ON DELETE CASCADE foreign keys on src/dst.
    // Deleting a node automatically cascades to its edges in SQLite.
}