Skip to main content

codemem_engine/
file_indexing.rs

1use crate::index::{self, IndexAndResolveResult, Indexer};
2use crate::patterns;
3use crate::CodememEngine;
4use codemem_core::{CodememError, DetectedPattern, MemoryNode};
5use std::collections::HashSet;
6use std::path::Path;
7use std::sync::atomic::Ordering;
8
/// Check if a file is a spec file (OpenAPI/AsyncAPI) by name or content.
///
/// First checks well-known filenames. For other YAML/JSON files, peeks at
/// the first bytes to look for `"openapi"`, `"swagger"`, or `"asyncapi"` keys.
fn is_spec_file_with_content(path: &str, content: &[u8]) -> bool {
    // Last '/'-separated component; `rsplit` always yields at least one item.
    let filename = path.rsplit('/').next().unwrap_or(path);
    let filename_lower = filename.to_lowercase();

    // Fast path: well-known names
    if matches!(
        filename_lower.as_str(),
        "openapi.yaml"
            | "openapi.yml"
            | "openapi.json"
            | "swagger.yaml"
            | "swagger.yml"
            | "swagger.json"
            | "asyncapi.yaml"
            | "asyncapi.yml"
            | "asyncapi.json"
    ) {
        return true;
    }

    // For any YAML/JSON file, peek at content for spec-identifying keys
    let is_yaml_json = filename_lower.ends_with(".yaml")
        || filename_lower.ends_with(".yml")
        || filename_lower.ends_with(".json");
    if !is_yaml_json {
        return false;
    }

    // Peek at up to the first 300 bytes. Use a lossy conversion: the 300-byte
    // cut can land inside a multi-byte UTF-8 sequence (and real files may hold
    // stray bytes), and the previous `from_utf8(...).unwrap_or("")` discarded
    // the ENTIRE peek in that case, hiding spec keys earlier in the file.
    let peek = String::from_utf8_lossy(&content[..content.len().min(300)]);
    let peek_lower = peek.to_lowercase();
    peek_lower.contains("\"openapi\"")
        || peek_lower.contains("\"swagger\"")
        || peek_lower.contains("\"asyncapi\"")
        || peek_lower.contains("openapi:")
        || peek_lower.contains("swagger:")
        || peek_lower.contains("asyncapi:")
}
50
51impl CodememEngine {
52    // ── Index Persistence ────────────────────────────────────────────────
53
54    /// Save the vector and BM25 indexes to disk if a db_path is configured.
55    /// Compacts the HNSW index if ghost entries exceed 20% of live entries.
56    /// Always clears the dirty flag so `flush_if_dirty()` won't double-save.
57    pub fn save_index(&self) {
58        if let Some(ref db_path) = self.db_path {
59            // Only save vector index if it has been lazily initialized.
60            if self.vector_ready() {
61                let idx_path = db_path.with_extension("idx");
62                if let Ok(mut vi) = self.lock_vector() {
63                    // Compact HNSW if ghost entries exceed threshold
64                    if vi.needs_compaction() {
65                        let ghost = vi.ghost_count();
66                        let live = vi.stats().count;
67                        tracing::info!(
68                            "HNSW ghost compaction: {ghost} ghosts / {live} live entries, rebuilding..."
69                        );
70                        if let Ok(embeddings) = self.storage.list_all_embeddings() {
71                            if let Err(e) = vi.rebuild_from_entries(&embeddings) {
72                                tracing::warn!("HNSW compaction failed: {e}");
73                            }
74                        }
75                    }
76                    if let Err(e) = vi.save(&idx_path) {
77                        tracing::warn!("Failed to save vector index: {e}");
78                    }
79                }
80            }
81
82            // Only save BM25 index if it has been lazily initialized.
83            if self.bm25_ready() {
84                let bm25_path = db_path.with_extension("bm25");
85                if let Ok(bm25) = self.lock_bm25() {
86                    if bm25.needs_save() {
87                        let data = bm25.serialize();
88                        let tmp_path = db_path.with_extension("bm25.tmp");
89                        if let Err(e) = std::fs::write(&tmp_path, &data)
90                            .and_then(|_| std::fs::rename(&tmp_path, &bm25_path))
91                        {
92                            tracing::warn!("Failed to save BM25 index: {e}");
93                        }
94                    }
95                }
96            }
97        }
98        self.dirty.store(false, Ordering::Release);
99    }
100
101    /// Reload the in-memory graph from the database.
102    pub fn reload_graph(&self) -> Result<(), CodememError> {
103        let new_graph = codemem_storage::graph::GraphEngine::from_storage(&*self.storage)?;
104        let mut graph = self.lock_graph()?;
105        *graph = Box::new(new_graph);
106        graph.recompute_centrality();
107        Ok(())
108    }
109
110    // ── A2: File Watcher Event Processing ───────────────────────────────
111
112    /// Process a single file watcher event by re-indexing changed/created files
113    /// or cleaning up deleted file nodes.
114    ///
115    /// Call this from a watcher event loop:
116    /// ```ignore
117    /// while let Ok(event) = watcher.receiver().recv() {
118    ///     engine.process_watch_event(&event, namespace, Some(root));
119    /// }
120    /// ```
121    pub fn process_watch_event(
122        &self,
123        event: &crate::watch::WatchEvent,
124        namespace: Option<&str>,
125        project_root: Option<&Path>,
126    ) -> Result<(), CodememError> {
127        match event {
128            crate::watch::WatchEvent::FileChanged(path)
129            | crate::watch::WatchEvent::FileCreated(path) => {
130                self.index_single_file(path, namespace, project_root)?;
131            }
132            crate::watch::WatchEvent::FileDeleted(path) => {
133                // Relativize the deleted path so the node ID matches what was indexed.
134                let rel = if let Some(root) = project_root {
135                    path.strip_prefix(root)
136                        .unwrap_or(path)
137                        .to_string_lossy()
138                        .to_string()
139                } else {
140                    path.to_string_lossy().to_string()
141                };
142                self.cleanup_file_nodes(&rel)?;
143            }
144        }
145        Ok(())
146    }
147
    /// Index (or re-index) a single file: parse it, persist nodes/edges/embeddings,
    /// and update the index cache.
    ///
    /// `project_root` is used to relativize the absolute `path` so node IDs are
    /// portable. If `None`, the path is stored as-is (absolute).
    ///
    /// Uses SHA-256 hash dedup to skip re-indexing when content is unchanged.
    /// This prevents duplicate work when both the PostToolUse hook and the
    /// background file watcher fire for the same edit.
    fn index_single_file(
        &self,
        path: &Path,
        namespace: Option<&str>,
        project_root: Option<&Path>,
    ) -> Result<(), CodememError> {
        let content = std::fs::read(path)?;

        // Relativize so node IDs match those produced by full indexing runs.
        let path_str = if let Some(root) = project_root {
            path.strip_prefix(root)
                .unwrap_or(path)
                .to_string_lossy()
                .to_string()
        } else {
            path.to_string_lossy().to_string()
        };

        // SHA-256 dedup: skip if content unchanged since last index.
        // Uses cached ChangeDetector to avoid reloading all hashes from storage per file.
        let hash = {
            let mut cd_guard = self
                .change_detector
                .lock()
                .map_err(|_| CodememError::LockPoisoned("change_detector".into()))?;
            let ns = namespace.unwrap_or("");
            // Lazily build the detector on first use, seeded from storage.
            let cd = cd_guard.get_or_insert_with(|| {
                let mut cd = index::incremental::ChangeDetector::new();
                cd.load_from_storage(&*self.storage, ns);
                cd
            });
            let (changed, hash) = cd.check_changed(&path_str, &content);
            if !changed {
                tracing::debug!("Skipping unchanged file: {path_str}");
                return Ok(());
            }
            // Expire static-analysis memories linked to symbols in this changed file
            if self.config.memory.expire_enrichments_on_reindex {
                match self.storage.expire_memories_for_file(&path_str) {
                    Ok(0) => {}
                    Ok(n) => tracing::debug!("Expired {n} enrichment memories for {path_str}"),
                    Err(e) => tracing::warn!("Failed to expire memories for {path_str}: {e}"),
                }
            }
            // Guard drops here so the record_hash calls below can re-lock.
            hash
        };

        // Check if this is a spec file (OpenAPI/AsyncAPI). If so, re-parse it
        // and update endpoints/channels rather than treating it as code.
        if is_spec_file_with_content(&path_str, &content) {
            self.reparse_spec_file(path, namespace.unwrap_or(""))?;
            // Record hash after successful spec parse
            if let Ok(mut cd_guard) = self.change_detector.lock() {
                if let Some(cd) = cd_guard.as_mut() {
                    cd.record_hash(&path_str, hash);
                    let _ = cd.save_to_storage(&*self.storage, namespace.unwrap_or(""));
                }
            }
            return Ok(());
        }

        let parser = index::CodeParser::new();

        let parse_result = match parser.parse_file(&path_str, &content) {
            Some(pr) => pr,
            None => return Ok(()), // Unsupported file type or parse failure
        };

        // Build a minimal IndexAndResolveResult for this single file
        let mut file_paths = HashSet::new();
        file_paths.insert(parse_result.file_path.clone());

        // Populate the resolver with ALL known symbols from the in-memory graph
        // so cross-file references (calls to functions in other files) can be
        // resolved. Without this, only same-file references would resolve,
        // causing the graph to gradually lose cross-file edges between full
        // re-indexes.
        let mut resolver = index::ReferenceResolver::new();
        resolver.add_symbols(&parse_result.symbols);
        if let Ok(graph) = self.lock_graph() {
            let graph_symbols: Vec<index::Symbol> = graph
                .get_all_nodes()
                .iter()
                .filter(|n| n.id.starts_with("sym:"))
                .filter_map(index::symbol::symbol_from_graph_node)
                .collect();
            resolver.add_symbols(&graph_symbols);
        }
        let resolve_result = resolver.resolve_all_with_unresolved(&parse_result.references);

        // Synthesize a one-file result so the shared persistence path can be reused.
        let results = IndexAndResolveResult {
            index: index::IndexResult {
                files_scanned: 1,
                files_parsed: 1,
                files_skipped: 0,
                total_symbols: parse_result.symbols.len(),
                total_references: parse_result.references.len(),
                total_chunks: parse_result.chunks.len(),
                parse_results: Vec::new(),
            },
            symbols: parse_result.symbols,
            references: parse_result.references,
            chunks: parse_result.chunks,
            file_paths,
            edges: resolve_result.edges,
            unresolved: resolve_result.unresolved,
            root_path: project_root
                .map(|p| p.to_path_buf())
                .unwrap_or_else(|| path.to_path_buf()),
            scip_build: None,
        };

        self.persist_index_results(&results, namespace)?;

        // Record new hash in the cached detector after successful persist
        if let Ok(mut cd_guard) = self.change_detector.lock() {
            if let Some(cd) = cd_guard.as_mut() {
                cd.record_hash(&path_str, hash);
                if let Err(e) = cd.save_to_storage(&*self.storage, namespace.unwrap_or("")) {
                    tracing::warn!("Failed to save file hash for {path_str}: {e}");
                }
            }
        }

        Ok(())
    }
282
283    // ── A2b: Symbol-Level Diff on Re-index ────────────────────────────
284
    /// Remove symbols that existed for a file before re-indexing but are no
    /// longer present in the new parse results. Returns count of cleaned symbols.
    ///
    /// For code→code edges (CALLS, IMPORTS, etc.), performs a hard delete.
    /// For memory→symbol edges, creates a live redirected edge pointing to the
    /// parent file node, preserving the memory→file connection so recall can
    /// still traverse it. The original edge is then deleted along with the
    /// stale symbol node.
    ///
    /// `old_symbol_ids` should be the set of symbol IDs that existed for this
    /// file before re-indexing (collected from the in-memory graph by the caller
    /// in a single pass across all files).
    pub fn cleanup_stale_symbols(
        &self,
        file_path: &str,
        old_symbol_ids: &HashSet<String>,
        new_symbol_ids: &HashSet<String>,
    ) -> Result<usize, CodememError> {
        // Compute stale set: symbols that existed before but are not in the new parse
        let stale_ids: Vec<&String> = old_symbol_ids
            .iter()
            .filter(|id| !new_symbol_ids.contains(*id))
            .collect();

        if stale_ids.is_empty() {
            return Ok(0);
        }

        let count = stale_ids.len();
        tracing::info!(
            "Cleaning up {count} stale symbols for {file_path}: {:?}",
            stale_ids
        );

        let file_node_id = format!("file:{file_path}");
        // Tracks (memory_id, file_id) pairs already redirected, so a memory
        // linked to several stale symbols in this file gets only one redirect.
        let mut redirected_pairs: std::collections::HashSet<(String, String)> =
            std::collections::HashSet::new();
        let mut redirected_edges: Vec<codemem_core::Edge> = Vec::new();
        for sym_id in &stale_ids {
            // Before deleting the symbol, redirect memory→symbol edges to the
            // parent file node, preserving historical context.
            // Memory node IDs are UUIDs (no known prefix like sym:/file:/chunk:).
            let edges = self.storage.get_edges_for_node(sym_id.as_str())?;
            for edge in &edges {
                // `other` is the edge endpoint that is NOT the stale symbol.
                let other = if edge.src.as_str() == sym_id.as_str() {
                    &edge.dst
                } else {
                    &edge.src
                };
                let is_code_node = other.starts_with("sym:")
                    || other.starts_with("file:")
                    || other.starts_with("chunk:")
                    || other.starts_with("pkg:");
                if !is_code_node {
                    // Skip if we already redirected this memory→file pair
                    let pair = (other.to_string(), file_node_id.clone());
                    if !redirected_pairs.insert(pair) {
                        continue;
                    }
                    // Rewrite whichever endpoint was the stale symbol to point
                    // at the parent file node instead.
                    let mut redirected = edge.clone();
                    if redirected.src.as_str() == sym_id.as_str() {
                        redirected.src = file_node_id.clone();
                    } else {
                        redirected.dst = file_node_id.clone();
                    }
                    // Don't set valid_to — the redirect should be a live,
                    // queryable edge so recall can still traverse memory→file.
                    redirected.id = format!("{}-redirected", edge.id);
                    if let Err(e) = self.storage.insert_graph_edge(&redirected) {
                        tracing::warn!("Failed to redirect memory edge {}: {e}", edge.id);
                    }
                    redirected_edges.push(redirected);
                }
            }

            // Delete all edges and the node itself
            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
                tracing::warn!("Failed to delete edges for stale symbol {sym_id}: {e}");
            }
            if let Err(e) = self.storage.delete_graph_node(sym_id) {
                tracing::warn!("Failed to delete stale symbol node {sym_id}: {e}");
            }
            if let Err(e) = self.storage.delete_embedding(sym_id) {
                tracing::warn!("Failed to delete embedding for stale symbol {sym_id}: {e}");
            }
        }

        // Clean up in-memory graph and vector index
        {
            let mut graph = self.lock_graph()?;
            for sym_id in &stale_ids {
                if let Err(e) = graph.remove_node(sym_id.as_str()) {
                    tracing::warn!("Failed to remove stale {sym_id} from in-memory graph: {e}");
                }
            }
            // Add redirected memory→file edges so they're visible to
            // in-memory traversal (BFS, PageRank, recall) during this session.
            for edge in redirected_edges {
                let _ = graph.add_edge(edge);
            }
        }
        {
            let mut vec = self.lock_vector()?;
            for sym_id in &stale_ids {
                if let Err(e) = vec.remove(sym_id.as_str()) {
                    tracing::warn!("Failed to remove stale {sym_id} from vector index: {e}");
                }
            }
        }
        // Remove stale entries from BM25 index so deleted/renamed symbols
        // don't persist in text search results or skew IDF calculations.
        if let Ok(mut bm25) = self.lock_bm25() {
            for sym_id in &stale_ids {
                bm25.remove_document(sym_id);
            }
        }

        Ok(count)
    }
404
405    // ── A3: File Deletion Cleanup ───────────────────────────────────────
406
407    /// Remove graph nodes, edges, and embeddings for a single deleted file.
408    fn cleanup_file_nodes(&self, file_path: &str) -> Result<(), CodememError> {
409        let file_node_id = format!("file:{file_path}");
410
411        // Remove all chunk nodes for this file
412        let chunk_prefix = format!("chunk:{file_path}:");
413        if let Err(e) = self.storage.delete_graph_nodes_by_prefix(&chunk_prefix) {
414            tracing::warn!("Failed to delete chunk nodes for {file_path}: {e}");
415        }
416
417        // Remove symbol nodes for this file by checking graph
418        let graph = self.lock_graph()?;
419        let sym_ids: Vec<String> = graph
420            .get_all_nodes()
421            .into_iter()
422            .filter(|n| {
423                n.id.starts_with("sym:")
424                    && n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path)
425            })
426            .map(|n| n.id.clone())
427            .collect();
428        drop(graph);
429
430        for sym_id in &sym_ids {
431            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
432                tracing::warn!("Failed to delete graph edges for {sym_id}: {e}");
433            }
434            if let Err(e) = self.storage.delete_graph_node(sym_id) {
435                tracing::warn!("Failed to delete graph node {sym_id}: {e}");
436            }
437            if let Err(e) = self.storage.delete_embedding(sym_id) {
438                tracing::warn!("Failed to delete embedding {sym_id}: {e}");
439            }
440        }
441
442        // Remove file node itself
443        if let Err(e) = self.storage.delete_graph_edges_for_node(&file_node_id) {
444            tracing::warn!("Failed to delete graph edges for {file_node_id}: {e}");
445        }
446        if let Err(e) = self.storage.delete_graph_node(&file_node_id) {
447            tracing::warn!("Failed to delete graph node {file_node_id}: {e}");
448        }
449
450        // Clean up in-memory graph
451        let mut graph = self.lock_graph()?;
452        for sym_id in &sym_ids {
453            if let Err(e) = graph.remove_node(sym_id) {
454                tracing::warn!("Failed to remove {sym_id} from in-memory graph: {e}");
455            }
456        }
457        // Remove chunk nodes from in-memory graph
458        let chunk_ids: Vec<String> = graph
459            .get_all_nodes()
460            .into_iter()
461            .filter(|n| n.id.starts_with(&format!("chunk:{file_path}:")))
462            .map(|n| n.id.clone())
463            .collect();
464        for chunk_id in &chunk_ids {
465            if let Err(e) = graph.remove_node(chunk_id) {
466                tracing::warn!("Failed to remove {chunk_id} from in-memory graph: {e}");
467            }
468        }
469        if let Err(e) = graph.remove_node(&file_node_id) {
470            tracing::warn!("Failed to remove {file_node_id} from in-memory graph: {e}");
471        }
472        drop(graph);
473
474        // Remove stale embeddings from vector index
475        let mut vec = self.lock_vector()?;
476        for sym_id in &sym_ids {
477            if let Err(e) = vec.remove(sym_id) {
478                tracing::warn!("Failed to remove {sym_id} from vector index: {e}");
479            }
480        }
481        for chunk_id in &chunk_ids {
482            if let Err(e) = vec.remove(chunk_id) {
483                tracing::warn!("Failed to remove {chunk_id} from vector index: {e}");
484            }
485        }
486        drop(vec);
487
488        // Remove deleted symbols/chunks from BM25 index
489        if let Ok(mut bm25) = self.lock_bm25() {
490            for sym_id in &sym_ids {
491                bm25.remove_document(sym_id);
492            }
493            for chunk_id in &chunk_ids {
494                bm25.remove_document(chunk_id);
495            }
496        }
497
498        self.save_index();
499        Ok(())
500    }
501
502    // ── A3b: Orphan Detection ─────────────────────────────────────────
503
504    /// Scan for orphaned symbol/chunk nodes whose files no longer exist on disk.
505    /// Also cleans up dangling edges (src or dst node doesn't exist).
506    /// Returns `(symbols_cleaned, edges_cleaned)`.
507    ///
508    /// When `project_root` is `None`, file-existence checks are skipped
509    /// (only dangling edge cleanup runs) to avoid CWD-dependent path
510    /// resolution that could cause mass deletion.
511    pub fn detect_orphans(
512        &self,
513        project_root: Option<&Path>,
514    ) -> Result<(usize, usize), CodememError> {
515        // Use storage for both nodes and edges to avoid in-memory/storage sync races.
516        let all_nodes = self.storage.all_graph_nodes()?;
517        let node_ids: HashSet<String> = all_nodes.iter().map(|n| n.id.clone()).collect();
518
519        let mut orphan_sym_ids: Vec<String> = Vec::new();
520
521        // Only check file existence when we have a known project root.
522        // Without it, relative paths resolve against CWD which may be wrong.
523        if let Some(root) = project_root {
524            for node in &all_nodes {
525                // Collect sym: and chunk: nodes whose backing files are gone
526                if node.id.starts_with("sym:") || node.id.starts_with("chunk:") {
527                    let file_path = match node.payload.get("file_path").and_then(|v| v.as_str()) {
528                        Some(fp) => fp,
529                        None => continue,
530                    };
531                    let abs_path = root.join(file_path);
532                    if !abs_path.exists() {
533                        orphan_sym_ids.push(node.id.clone());
534                    }
535                }
536                // Collect file: nodes whose backing files are gone
537                else if let Some(fp) = node.id.strip_prefix("file:") {
538                    let abs_path = root.join(fp);
539                    if !abs_path.exists() {
540                        orphan_sym_ids.push(node.id.clone());
541                    }
542                }
543                // Collect pkg: nodes whose directories are gone
544                else if let Some(dir) = node.id.strip_prefix("pkg:") {
545                    let dir_trimmed = dir.trim_end_matches('/');
546                    let abs_path = root.join(dir_trimmed);
547                    if !abs_path.exists() {
548                        orphan_sym_ids.push(node.id.clone());
549                    }
550                }
551            }
552        }
553
554        // Also find dangling edges (src or dst doesn't exist in graph)
555        let all_edges = self.storage.all_graph_edges()?;
556        let mut dangling_edge_ids: Vec<String> = Vec::new();
557        for edge in &all_edges {
558            if !node_ids.contains(&edge.src) || !node_ids.contains(&edge.dst) {
559                dangling_edge_ids.push(edge.id.clone());
560            }
561        }
562
563        let symbols_cleaned = orphan_sym_ids.len();
564
565        // Clean up orphan nodes
566        for sym_id in &orphan_sym_ids {
567            if let Err(e) = self.storage.delete_graph_edges_for_node(sym_id) {
568                tracing::warn!("Orphan cleanup: failed to delete edges for {sym_id}: {e}");
569            }
570            if let Err(e) = self.storage.delete_graph_node(sym_id) {
571                tracing::warn!("Orphan cleanup: failed to delete node {sym_id}: {e}");
572            }
573            if let Err(e) = self.storage.delete_embedding(sym_id) {
574                tracing::warn!("Orphan cleanup: failed to delete embedding {sym_id}: {e}");
575            }
576        }
577
578        // Clean up orphan nodes from in-memory graph + vector
579        if !orphan_sym_ids.is_empty() {
580            if let Ok(mut graph) = self.lock_graph() {
581                for sym_id in &orphan_sym_ids {
582                    let _ = graph.remove_node(sym_id);
583                }
584            }
585            if let Ok(mut vec) = self.lock_vector() {
586                for sym_id in &orphan_sym_ids {
587                    let _ = vec.remove(sym_id);
588                }
589            }
590        }
591
592        // Delete dangling edges that weren't already removed by node cleanup
593        let mut edges_cleaned = 0usize;
594        for edge_id in &dangling_edge_ids {
595            match self.storage.delete_graph_edge(edge_id) {
596                Ok(true) => edges_cleaned += 1,
597                Ok(false) => {} // Already deleted by node cleanup above
598                Err(e) => {
599                    tracing::warn!("Orphan cleanup: failed to delete dangling edge {edge_id}: {e}");
600                }
601            }
602        }
603
604        if symbols_cleaned > 0 || edges_cleaned > 0 {
605            tracing::info!(
606                "Orphan scan: cleaned {symbols_cleaned} symbol/chunk nodes, {edges_cleaned} dangling edges"
607            );
608        }
609
610        Ok((symbols_cleaned, edges_cleaned))
611    }
612
613    // ── A3c: Spec File Re-parsing ──────────────────────────────────────
614
615    /// Re-parse a spec file (OpenAPI/AsyncAPI) and update stored endpoints/channels.
616    fn reparse_spec_file(&self, path: &Path, namespace: &str) -> Result<(), CodememError> {
617        use crate::index::spec_parser::{parse_asyncapi, parse_openapi, SpecFileResult};
618
619        let result = if let Some(openapi) = parse_openapi(path) {
620            Some(SpecFileResult::OpenApi(openapi))
621        } else {
622            parse_asyncapi(path).map(SpecFileResult::AsyncApi)
623        };
624
625        match result {
626            Some(SpecFileResult::OpenApi(spec)) => {
627                for ep in &spec.endpoints {
628                    let _ = self.storage.store_api_endpoint(
629                        &ep.method,
630                        &ep.path,
631                        ep.operation_id.as_deref().unwrap_or(""),
632                        namespace,
633                    );
634                }
635                tracing::info!(
636                    "Re-parsed OpenAPI spec: {} endpoints from {}",
637                    spec.endpoints.len(),
638                    path.display()
639                );
640            }
641            Some(SpecFileResult::AsyncApi(spec)) => {
642                for ch in &spec.channels {
643                    let _ = self.storage.store_event_channel(
644                        &ch.channel,
645                        &ch.direction,
646                        ch.protocol.as_deref().unwrap_or(""),
647                        ch.operation_id.as_deref().unwrap_or(""),
648                        namespace,
649                        ch.description.as_deref().unwrap_or(""),
650                    );
651                }
652                tracing::info!(
653                    "Re-parsed AsyncAPI spec: {} channels from {}",
654                    spec.channels.len(),
655                    path.display()
656                );
657            }
658            None => {
659                tracing::debug!("Not a recognized spec file: {}", path.display());
660            }
661        }
662        Ok(())
663    }
664
665    // ── A4: Unified Analyze Pipeline ────────────────────────────────────
666
667    /// Full analysis pipeline: index → persist → enrich → recompute centrality.
668    ///
669    /// This is the single entry point for all callers (CLI, MCP, API).
670    /// Supports incremental indexing via `ChangeDetector`, progress callbacks,
671    /// and returns comprehensive results.
672    pub fn analyze(&self, options: AnalyzeOptions<'_>) -> Result<AnalyzeResult, CodememError> {
673        let root = options.path;
674
675        // Eagerly initialize embeddings/vector/BM25 for the full analysis pipeline.
676        // This triggers lazy init so that embed_and_persist() finds them ready.
677        // Skip if embeddings are not needed.
678        if !options.skip_embed {
679            drop(self.lock_embeddings());
680            drop(self.lock_vector());
681            drop(self.lock_bm25());
682        }
683
684        // 0. SCIP phase: run indexers, parse results, build graph data.
685        // Runs BEFORE ast-grep so we know which files SCIP covered.
686        let (scip_covered, scip_build) = if !options.skip_scip && self.config.scip.enabled {
687            match self.run_scip_phase(root, options.namespace) {
688                Ok((covered, build)) => (Some(covered), Some(build)),
689                Err(e) => {
690                    tracing::warn!("SCIP phase failed, falling back to ast-grep only: {e}");
691                    (None, None)
692                }
693            }
694        } else {
695            (None, None)
696        };
697
698        let scip_nodes_created = scip_build.as_ref().map_or(0, |b| b.nodes.len());
699        let scip_edges_created = scip_build.as_ref().map_or(0, |b| b.edges.len());
700        let scip_files_covered = scip_covered.as_ref().map_or(0, |s| s.len());
701
702        // 1. Index (ast-grep skips symbol extraction for SCIP-covered files)
703        // When force=true, ignore the change detector so all files are re-processed.
704        let mut indexer = match options.change_detector {
705            Some(cd) if !options.force => Indexer::with_change_detector(cd),
706            _ => Indexer::new(),
707        };
708        let resolved =
709            indexer.index_and_resolve_with_scip(root, scip_covered.as_ref(), scip_build)?;
710
711        // 2. Persist (with or without progress callback)
712        let persist = if options.skip_embed {
713            self.persist_graph_only(&resolved, Some(options.namespace))?
714        } else if let Some(ref on_progress) = options.progress {
715            self.persist_index_results_with_progress(
716                &resolved,
717                Some(options.namespace),
718                |done, total| {
719                    on_progress(AnalyzeProgress::Embedding { done, total });
720                },
721            )?
722        } else {
723            self.persist_index_results(&resolved, Some(options.namespace))?
724        };
725
726        // Cache results for structural queries
727        {
728            if let Ok(mut cache) = self.lock_index_cache() {
729                *cache = Some(crate::IndexCache {
730                    symbols: resolved.symbols,
731                    chunks: resolved.chunks,
732                    root_path: root.to_string_lossy().to_string(),
733                });
734            }
735        }
736
737        // 3. Enrich (skip if requested)
738        let enrichment = if options.skip_enrich {
739            crate::enrichment::EnrichmentPipelineResult {
740                results: serde_json::json!({}),
741                total_insights: 0,
742            }
743        } else {
744            let path_str = root.to_str().unwrap_or("");
745            self.run_enrichments(
746                path_str,
747                &[],
748                options.git_days,
749                Some(options.namespace),
750                None,
751            )
752        };
753
754        // 4. Recompute centrality
755        self.lock_graph()?.recompute_centrality();
756
757        // 5. Compute summary stats
758        let top_nodes = self.find_important_nodes(10, 0.85).unwrap_or_default();
759        let community_count = self.louvain_communities(1.0).map(|c| c.len()).unwrap_or(0);
760
761        // 6. Save indexes
762        self.save_index();
763
764        // Save incremental state
765        indexer
766            .change_detector()
767            .save_to_storage(self.storage(), options.namespace)?;
768
769        Ok(AnalyzeResult {
770            files_parsed: resolved.index.files_parsed,
771            files_skipped: resolved.index.files_skipped,
772            symbols_found: resolved.index.total_symbols,
773            edges_resolved: persist.edges_resolved,
774            chunks_stored: persist.chunks_stored,
775            symbols_embedded: persist.symbols_embedded,
776            chunks_embedded: persist.chunks_embedded,
777            chunks_pruned: persist.chunks_pruned,
778            symbols_pruned: persist.symbols_pruned,
779            enrichment_results: enrichment.results,
780            total_insights: enrichment.total_insights,
781            top_nodes,
782            community_count,
783            scip_nodes_created,
784            scip_edges_created,
785            scip_files_covered,
786        })
787    }
788
789    /// Run the SCIP phase: orchestrate indexers, parse results, build graph data.
790    fn run_scip_phase(
791        &self,
792        root: &Path,
793        namespace: &str,
794    ) -> Result<(HashSet<String>, index::scip::graph_builder::ScipBuildResult), CodememError> {
795        let orchestrator =
796            index::scip::orchestrate::ScipOrchestrator::new(self.config.scip.clone());
797        let orch_result = orchestrator.run(root, namespace)?;
798
799        if orch_result.scip_result.covered_files.is_empty() {
800            return Ok((
801                HashSet::new(),
802                index::scip::graph_builder::ScipBuildResult::default(),
803            ));
804        }
805
806        for (lang, err) in &orch_result.failed_languages {
807            tracing::warn!("SCIP indexer for {:?} failed: {}", lang, err);
808        }
809        for lang in &orch_result.indexed_languages {
810            tracing::info!("SCIP indexed {:?} successfully", lang);
811        }
812
813        let build = index::scip::graph_builder::build_graph(
814            &orch_result.scip_result,
815            Some(namespace),
816            &self.config.scip,
817        );
818        let covered: HashSet<String> = build.files_covered.clone();
819
820        tracing::info!(
821            "SCIP phase: {} nodes, {} edges, {} ext nodes, {} files covered, {} doc memories",
822            build.nodes.len(),
823            build.edges.len(),
824            build.ext_nodes_created,
825            covered.len(),
826            build.doc_memories_created,
827        );
828
829        Ok((covered, build))
830    }
831
832    // ── A8: Session Context Synthesis ───────────────────────────────────
833
834    /// Synthesize context for a new session: recent memories, pending analyses,
835    /// active patterns, and last session summary.
836    pub fn session_context(&self, namespace: Option<&str>) -> Result<SessionContext, CodememError> {
837        let now = chrono::Utc::now();
838        let cutoff_24h = now - chrono::Duration::hours(24);
839
840        // 1. Recent memories (last 24h)
841        let ids = match namespace {
842            Some(ns) => self.storage.list_memory_ids_for_namespace(ns)?,
843            None => self.storage.list_memory_ids()?,
844        };
845
846        let mut recent_memories = Vec::new();
847        let mut pending_analyses = Vec::new();
848
849        for id in ids.iter().rev().take(200) {
850            if let Ok(Some(m)) = self.storage.get_memory_no_touch(id) {
851                // Collect pending analyses
852                if m.tags.contains(&"pending-analysis".to_string()) {
853                    pending_analyses.push(m.clone());
854                }
855                // Collect recent memories from last 24h
856                if m.created_at >= cutoff_24h {
857                    recent_memories.push(m);
858                }
859                if recent_memories.len() >= 50 && pending_analyses.len() >= 10 {
860                    break;
861                }
862            }
863        }
864
865        // 2. Active patterns
866        let session_count = self.storage.session_count(namespace).unwrap_or(1).max(1);
867        let active_patterns = patterns::detect_patterns(
868            &*self.storage,
869            namespace,
870            2, // min_frequency
871            session_count,
872        )
873        .unwrap_or_default();
874
875        // 3. Last session summary
876        let last_session_summary = self
877            .storage
878            .list_sessions(namespace, 1)?
879            .into_iter()
880            .next()
881            .and_then(|s| s.summary);
882
883        Ok(SessionContext {
884            recent_memories,
885            pending_analyses,
886            active_patterns,
887            last_session_summary,
888        })
889    }
890}
891
892// ── Result Types ────────────────────────────────────────────────────────────
893
/// Options for the unified `analyze()` pipeline.
pub struct AnalyzeOptions<'a> {
    /// Project root to analyze.
    pub path: &'a Path,
    /// Namespace under which graph data and memories are stored.
    pub namespace: &'a str,
    /// Number of days of git history passed to the enrichment phase.
    pub git_days: u64,
    /// Optional incremental-change detector; honored only when `force` is
    /// false (the indexer ignores it under `force`).
    pub change_detector: Option<index::incremental::ChangeDetector>,
    /// Optional callback receiving [`AnalyzeProgress`] events
    /// (currently emitted during the embedding phase).
    pub progress: Option<Box<dyn Fn(AnalyzeProgress) + Send + 'a>>,
    /// Skip SCIP indexing — use ast-grep only (faster, less accurate).
    pub skip_scip: bool,
    /// Skip embedding phase (graph + chunks stored but not vectorized).
    pub skip_embed: bool,
    /// Skip enrichment phase (no git-history/complexity/etc. analysis).
    pub skip_enrich: bool,
    /// Force re-index even when file SHAs haven't changed.
    pub force: bool,
}
910
/// Progress events emitted during analysis.
#[derive(Debug, Clone)]
pub enum AnalyzeProgress {
    /// Embedding phase progress: `done` of `total` items embedded so far.
    Embedding { done: usize, total: usize },
}
916
/// Result of the unified `analyze()` pipeline.
#[derive(Debug)]
pub struct AnalyzeResult {
    /// Files parsed by the indexer.
    pub files_parsed: usize,
    /// Files skipped by the indexer (e.g. unchanged per the change detector).
    pub files_skipped: usize,
    /// Total symbols found during indexing.
    pub symbols_found: usize,
    /// Graph edges resolved during persistence.
    pub edges_resolved: usize,
    /// Code chunks written to storage.
    pub chunks_stored: usize,
    /// Symbols embedded into the vector index (0 when embedding is skipped).
    pub symbols_embedded: usize,
    /// Chunks embedded into the vector index (0 when embedding is skipped).
    pub chunks_embedded: usize,
    /// Stale chunks pruned during persistence.
    pub chunks_pruned: usize,
    /// Stale symbols pruned during persistence.
    pub symbols_pruned: usize,
    /// Per-enricher JSON results (empty object when enrichment is skipped).
    pub enrichment_results: serde_json::Value,
    /// Total insights produced across all enrichers.
    pub total_insights: usize,
    /// Highest-centrality nodes (up to 10) for the post-analysis summary.
    pub top_nodes: Vec<crate::graph_ops::RankedNode>,
    /// Number of Louvain communities detected in the graph.
    pub community_count: usize,
    /// SCIP nodes created (sym: + ext: nodes).
    pub scip_nodes_created: usize,
    /// SCIP edges created (CALLS, IMPORTS, READS, WRITES, IMPLEMENTS, etc.).
    pub scip_edges_created: usize,
    /// Files covered by SCIP indexers (ast-grep skipped symbol extraction for these).
    pub scip_files_covered: usize,
}
940
/// Session context synthesized at session start.
///
/// Produced by [`CodememEngine::session_context`].
#[derive(Debug)]
pub struct SessionContext {
    /// Memories created in the last 24 hours.
    pub recent_memories: Vec<MemoryNode>,
    /// Memories tagged `pending-analysis` awaiting code-mapper review.
    pub pending_analyses: Vec<MemoryNode>,
    /// Cross-session patterns detected with sufficient frequency.
    pub active_patterns: Vec<DetectedPattern>,
    /// Summary text from the most recent session (if any).
    pub last_session_summary: Option<String>,
}
953
#[cfg(test)]
mod tests {
    use super::*;
    use codemem_core::{Edge, GraphNode, NodeKind, RelationshipType};
    use std::collections::{HashMap, HashSet};

    /// Create a test engine backed by a temporary database.
    fn test_engine() -> CodememEngine {
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("test.db");
        // Keep the tempdir alive by leaking it (tests are short-lived).
        let _ = Box::leak(Box::new(dir));
        CodememEngine::from_db_path(&db_path).unwrap()
    }

    /// Build a minimal `GraphNode`; when `file_path` is given it is stored in
    /// the payload under the `"file_path"` key, which the cleanup/orphan
    /// routines under test read to associate symbols with files.
    fn graph_node(id: &str, kind: NodeKind, file_path: Option<&str>) -> GraphNode {
        let mut payload = HashMap::new();
        if let Some(fp) = file_path {
            payload.insert(
                "file_path".to_string(),
                serde_json::Value::String(fp.to_string()),
            );
        }
        GraphNode {
            id: id.to_string(),
            kind,
            label: id.to_string(),
            payload,
            centrality: 0.0,
            memory_id: None,
            namespace: None,
            valid_from: None,
            valid_to: None,
        }
    }

    /// Build an `Edge` with a deterministic id of the form `"{rel}:{src}->{dst}"`.
    fn edge(src: &str, dst: &str, rel: RelationshipType) -> Edge {
        Edge {
            id: format!("{rel}:{src}->{dst}"),
            src: src.to_string(),
            dst: dst.to_string(),
            relationship: rel,
            weight: 1.0,
            properties: HashMap::new(),
            created_at: chrono::Utc::now(),
            valid_from: None,
            valid_to: None,
        }
    }

    // ── cleanup_stale_symbols tests ──────────────────────────────────────

    #[test]
    fn cleanup_stale_symbols_deletes_stale_nodes() {
        let engine = test_engine();

        // Set up: file with two symbols, one will become stale
        let file = graph_node("file:src/a.rs", NodeKind::File, None);
        let sym_keep = graph_node("sym:a::keep", NodeKind::Function, Some("src/a.rs"));
        let sym_stale = graph_node("sym:a::stale", NodeKind::Function, Some("src/a.rs"));

        // Populate the in-memory graph (scoped so the lock is released
        // before cleanup_stale_symbols needs it).
        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(file).unwrap();
            g.add_node(sym_keep.clone()).unwrap();
            g.add_node(sym_stale.clone()).unwrap();
            g.add_edge(edge(
                "file:src/a.rs",
                "sym:a::keep",
                RelationshipType::Contains,
            ))
            .unwrap();
            g.add_edge(edge(
                "file:src/a.rs",
                "sym:a::stale",
                RelationshipType::Contains,
            ))
            .unwrap();
        }
        // Also persist to storage so cleanup can find edges
        let _ =
            engine
                .storage
                .insert_graph_node(&graph_node("file:src/a.rs", NodeKind::File, None));
        let _ = engine.storage.insert_graph_node(&sym_keep);
        let _ = engine.storage.insert_graph_node(&sym_stale);
        let _ = engine.storage.insert_graph_edge(&edge(
            "file:src/a.rs",
            "sym:a::keep",
            RelationshipType::Contains,
        ));
        let _ = engine.storage.insert_graph_edge(&edge(
            "file:src/a.rs",
            "sym:a::stale",
            RelationshipType::Contains,
        ));

        // Old index had both symbols; the new index keeps only one.
        let old_ids: HashSet<String> = ["sym:a::keep", "sym:a::stale"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let new_ids: HashSet<String> = ["sym:a::keep"].iter().map(|s| s.to_string()).collect();

        let cleaned = engine
            .cleanup_stale_symbols("src/a.rs", &old_ids, &new_ids)
            .unwrap();
        assert_eq!(cleaned, 1);

        // Stale node should be gone from in-memory graph
        let g = engine.lock_graph().unwrap();
        assert!(g.get_node("sym:a::stale").unwrap().is_none());
        assert!(g.get_node("sym:a::keep").unwrap().is_some());
    }

    #[test]
    fn cleanup_stale_symbols_redirects_memory_edges_to_graph() {
        let engine = test_engine();

        // A memory linked to a symbol that is about to disappear.
        let file = graph_node("file:src/a.rs", NodeKind::File, None);
        let sym_stale = graph_node("sym:a::old_fn", NodeKind::Function, Some("src/a.rs"));
        let mem = graph_node("mem-uuid-123", NodeKind::Memory, None);

        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(file.clone()).unwrap();
            g.add_node(sym_stale.clone()).unwrap();
            g.add_node(mem.clone()).unwrap();
            g.add_edge(edge(
                "file:src/a.rs",
                "sym:a::old_fn",
                RelationshipType::Contains,
            ))
            .unwrap();
            g.add_edge(edge(
                "mem-uuid-123",
                "sym:a::old_fn",
                RelationshipType::RelatesTo,
            ))
            .unwrap();
        }
        // Mirror the same nodes/edges into storage (dual-write, like indexing does).
        let _ = engine.storage.insert_graph_node(&file);
        let _ = engine.storage.insert_graph_node(&sym_stale);
        let _ = engine.storage.insert_graph_node(&mem);
        let _ = engine.storage.insert_graph_edge(&edge(
            "file:src/a.rs",
            "sym:a::old_fn",
            RelationshipType::Contains,
        ));
        let _ = engine.storage.insert_graph_edge(&edge(
            "mem-uuid-123",
            "sym:a::old_fn",
            RelationshipType::RelatesTo,
        ));

        // New index no longer contains the symbol at all.
        let old_ids: HashSet<String> = ["sym:a::old_fn"].iter().map(|s| s.to_string()).collect();
        let new_ids: HashSet<String> = HashSet::new();

        engine
            .cleanup_stale_symbols("src/a.rs", &old_ids, &new_ids)
            .unwrap();

        // The redirected edge should be in the in-memory graph
        let g = engine.lock_graph().unwrap();
        let file_edges = g.get_edges("file:src/a.rs").unwrap();
        let has_redirect = file_edges.iter().any(|e| {
            (e.src == "mem-uuid-123" || e.dst == "mem-uuid-123") && e.id.contains("-redirected")
        });
        assert!(
            has_redirect,
            "redirected memory→file edge should be in the in-memory graph"
        );
    }

    #[test]
    fn cleanup_stale_symbols_deduplicates_redirects() {
        let engine = test_engine();

        let file = graph_node("file:src/a.rs", NodeKind::File, None);
        let sym1 = graph_node("sym:a::fn1", NodeKind::Function, Some("src/a.rs"));
        let sym2 = graph_node("sym:a::fn2", NodeKind::Function, Some("src/a.rs"));
        let mem = graph_node("mem-uuid-456", NodeKind::Memory, None);

        // Same memory linked to two symbols in the same file
        let _ = engine.storage.insert_graph_node(&file);
        let _ = engine.storage.insert_graph_node(&sym1);
        let _ = engine.storage.insert_graph_node(&sym2);
        let _ = engine.storage.insert_graph_node(&mem);
        let _ = engine.storage.insert_graph_edge(&edge(
            "mem-uuid-456",
            "sym:a::fn1",
            RelationshipType::RelatesTo,
        ));
        let _ = engine.storage.insert_graph_edge(&edge(
            "mem-uuid-456",
            "sym:a::fn2",
            RelationshipType::RelatesTo,
        ));

        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(file).unwrap();
            g.add_node(sym1).unwrap();
            g.add_node(sym2).unwrap();
            g.add_node(mem).unwrap();
        }

        // Both symbols vanish in the new index.
        let old_ids: HashSet<String> = ["sym:a::fn1", "sym:a::fn2"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let new_ids: HashSet<String> = HashSet::new();

        engine
            .cleanup_stale_symbols("src/a.rs", &old_ids, &new_ids)
            .unwrap();

        // Should have exactly one redirect edge, not two
        let g = engine.lock_graph().unwrap();
        let file_edges = g.get_edges("file:src/a.rs").unwrap();
        let redirect_count = file_edges
            .iter()
            .filter(|e| e.id.contains("-redirected"))
            .count();
        assert_eq!(
            redirect_count, 1,
            "should have exactly 1 redirected edge, got {redirect_count}"
        );
    }

    // ── detect_orphans tests ─────────────────────────────────────────────

    #[test]
    fn detect_orphans_skips_file_check_when_no_root() {
        let engine = test_engine();

        // Add a symbol node with a file path that definitely doesn't exist
        let sym = graph_node(
            "sym:nonexistent::fn",
            NodeKind::Function,
            Some("does/not/exist.rs"),
        );
        let _ = engine.storage.insert_graph_node(&sym);
        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(sym).unwrap();
        }

        // With None, should NOT delete the node (skips file existence check)
        let (symbols_cleaned, _) = engine.detect_orphans(None).unwrap();
        assert_eq!(
            symbols_cleaned, 0,
            "detect_orphans(None) should not delete nodes based on file existence"
        );
    }

    #[test]
    fn detect_orphans_removes_missing_files_with_root() {
        // Build the engine directly so the tempdir doubles as the project root.
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("test.db");
        let engine = CodememEngine::from_db_path(&db_path).unwrap();

        // Add a symbol whose file doesn't exist under the project root
        let sym = graph_node(
            "sym:missing::fn",
            NodeKind::Function,
            Some("src/missing.rs"),
        );
        let _ = engine.storage.insert_graph_node(&sym);
        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(sym).unwrap();
        }

        let (symbols_cleaned, _) = engine.detect_orphans(Some(dir.path())).unwrap();
        assert_eq!(symbols_cleaned, 1);
    }

    #[test]
    fn detect_orphans_keeps_existing_files() {
        let dir = tempfile::tempdir().unwrap();
        let db_path = dir.path().join("test.db");
        let engine = CodememEngine::from_db_path(&db_path).unwrap();

        // Create the actual file so it won't be orphaned
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).unwrap();
        std::fs::write(src_dir.join("exists.rs"), "fn main() {}").unwrap();

        let sym = graph_node(
            "sym:exists::main",
            NodeKind::Function,
            Some("src/exists.rs"),
        );
        let _ = engine.storage.insert_graph_node(&sym);
        {
            let mut g = engine.lock_graph().unwrap();
            g.add_node(sym).unwrap();
        }

        let (symbols_cleaned, _) = engine.detect_orphans(Some(dir.path())).unwrap();
        assert_eq!(symbols_cleaned, 0);
    }

    // Note: dangling edge cleanup in detect_orphans is a defensive no-op
    // because graph_edges has ON DELETE CASCADE foreign keys on src/dst.
    // Deleting a node automatically cascades to its edges in SQLite.
}