Skip to main content

codemem_engine/persistence/
mod.rs

1//! Graph persistence: persist indexing results (file/package/symbol/chunk nodes,
2//! edges, embeddings, compaction) into the storage and graph backends.
3
4mod compaction;
5pub mod cross_repo;
6
7use crate::index::{CodeChunk, ResolvedEdge, Symbol};
8use crate::IndexAndResolveResult;
9use codemem_core::{CodememError, Edge, GraphConfig, GraphNode, NodeKind, RelationshipType};
10use std::collections::{HashMap, HashSet};
11
/// Counts of what was persisted by `persist_index_results`.
///
/// The same struct is returned by `persist_graph_only` and
/// `persist_index_results_with_progress`.
#[derive(Debug, Default)]
pub struct IndexPersistResult {
    /// Number of file paths in the index results (`results.file_paths.len()`).
    pub files_created: usize,
    /// Number of distinct package (directory) nodes built from the file paths.
    pub packages_created: usize,
    /// Number of symbols in the index results (`results.symbols.len()`).
    pub symbols_stored: usize,
    /// Number of chunk nodes built from the index results.
    pub chunks_stored: usize,
    /// Number of resolved edges in the index results (`results.edges.len()`).
    pub edges_resolved: usize,
    /// Symbols embedded by the embedding phase; always 0 in graph-only mode.
    pub symbols_embedded: usize,
    /// Chunks embedded by the embedding phase; always 0 in graph-only mode.
    pub chunks_embedded: usize,
    /// Chunks pruned by compaction; 0 when `auto_compact` is disabled.
    pub chunks_pruned: usize,
    /// Symbols pruned by compaction; 0 when `auto_compact` is disabled.
    pub symbols_pruned: usize,
}
25
/// Counts of what was persisted by `persist_cross_repo_data`.
///
/// NOTE(review): the code that fills these counters is not in this part of
/// the file (presumably the `cross_repo` submodule — confirm), so the exact
/// meaning of each field should be verified against that producer.
#[derive(Debug, Default)]
pub struct CrossRepoPersistResult {
    pub packages_registered: usize,
    pub unresolved_refs_stored: usize,
    pub forward_edges_created: usize,
    pub backward_edges_created: usize,
    pub endpoints_detected: usize,
    pub client_calls_detected: usize,
    pub spec_endpoints_detected: usize,
    pub event_channels_detected: usize,
    pub http_edges_matched: usize,
    pub event_edges_matched: usize,
}
40
41/// Return the edge weight for a given relationship type, using config overrides
42/// for the three most common types (Contains, Calls, Imports).
43pub fn edge_weight_for(rel: &RelationshipType, config: &GraphConfig) -> f64 {
44    match rel {
45        RelationshipType::Calls => config.calls_edge_weight,
46        RelationshipType::Imports => config.imports_edge_weight,
47        RelationshipType::Contains => config.contains_edge_weight,
48        RelationshipType::TypeDefinition => config.type_definition_edge_weight,
49        RelationshipType::Reads => config.reads_edge_weight,
50        RelationshipType::Writes => config.writes_edge_weight,
51        RelationshipType::Overrides => config.overrides_edge_weight,
52        RelationshipType::Implements | RelationshipType::Inherits => 0.8,
53        RelationshipType::DependsOn => 0.7,
54        RelationshipType::CoChanged => 0.6,
55        RelationshipType::EvolvedInto | RelationshipType::Summarizes => 0.7,
56        RelationshipType::PartOf => 0.4,
57        RelationshipType::RelatesTo | RelationshipType::SharesTheme => 0.3,
58        RelationshipType::HttpCalls => 0.7,
59        RelationshipType::PublishesTo | RelationshipType::SubscribesTo => 0.6,
60        RelationshipType::ModifiedBy => 0.4,
61        _ => 0.5,
62    }
63}
64
/// Intermediate counts from graph node persistence (before embedding).
struct GraphPersistCounts {
    /// Number of distinct package (directory) nodes built by `build_package_tree`.
    packages_created: usize,
    /// Number of chunk nodes built by `build_chunk_nodes`.
    chunks_stored: usize,
}
70
71impl super::CodememEngine {
72    /// Persist all indexing results (file nodes, package tree, symbol nodes, chunk nodes,
73    /// edges, embeddings, compaction) into storage and the in-memory graph.
74    ///
75    /// This is the full persistence pipeline called after `Indexer::index_and_resolve()`.
76    pub fn persist_index_results(
77        &self,
78        results: &IndexAndResolveResult,
79        namespace: Option<&str>,
80    ) -> Result<IndexPersistResult, CodememError> {
81        self.persist_index_results_with_progress(results, namespace, |_, _| {})
82    }
83
84    /// Like `persist_index_results`, but skips the embedding phase entirely.
85    /// Stores graph nodes, edges, and chunks without vectorizing them.
86    /// Also skips cross-repo linking — this is a fast graph-only mode intended
87    /// for rapid iteration (e.g., `--skip-embed`). Run a full `analyze` to
88    /// populate cross-repo data.
89    pub fn persist_graph_only(
90        &self,
91        results: &IndexAndResolveResult,
92        namespace: Option<&str>,
93    ) -> Result<IndexPersistResult, CodememError> {
94        let seen_files = &results.file_paths;
95        let graph_counts = self.persist_graph_nodes(results, namespace)?;
96
97        let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
98            self.compact_graph(seen_files)
99        } else {
100            (0, 0)
101        };
102
103        Ok(IndexPersistResult {
104            files_created: seen_files.len(),
105            packages_created: graph_counts.packages_created,
106            symbols_stored: results.symbols.len(),
107            chunks_stored: graph_counts.chunks_stored,
108            edges_resolved: results.edges.len(),
109            symbols_embedded: 0,
110            chunks_embedded: 0,
111            chunks_pruned,
112            symbols_pruned,
113        })
114    }
115
116    /// Like `persist_index_results`, but calls `on_progress(done, total)` during
117    /// the embedding phase so callers can display progress.
118    pub fn persist_index_results_with_progress(
119        &self,
120        results: &IndexAndResolveResult,
121        namespace: Option<&str>,
122        on_progress: impl Fn(usize, usize),
123    ) -> Result<IndexPersistResult, CodememError> {
124        let seen_files = &results.file_paths;
125
126        // 1. Persist all graph nodes and edges
127        let graph_counts = self.persist_graph_nodes(results, namespace)?;
128
129        // 2. Embed symbols and chunks
130        let (symbols_embedded, chunks_embedded) = self.embed_and_persist(
131            &results.symbols,
132            &results.chunks,
133            &results.edges,
134            on_progress,
135        )?;
136
137        // 3. Auto-compact
138        let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
139            self.compact_graph(seen_files)
140        } else {
141            (0, 0)
142        };
143
144        Ok(IndexPersistResult {
145            files_created: seen_files.len(),
146            packages_created: graph_counts.packages_created,
147            symbols_stored: results.symbols.len(),
148            chunks_stored: graph_counts.chunks_stored,
149            edges_resolved: results.edges.len(),
150            symbols_embedded,
151            chunks_embedded,
152            chunks_pruned,
153            symbols_pruned,
154        })
155    }
156
157    // ── Graph Node Persistence ───────────────────────────────────────────
158
159    /// Persist file, package, symbol, chunk nodes and all edges into storage
160    /// and the in-memory graph. Returns counts for the result struct.
161    fn persist_graph_nodes(
162        &self,
163        results: &IndexAndResolveResult,
164        namespace: Option<&str>,
165    ) -> Result<GraphPersistCounts, CodememError> {
166        let all_symbols = &results.symbols;
167        let all_chunks = &results.chunks;
168        let seen_files = &results.file_paths;
169        let edges = &results.edges;
170
171        let now = chrono::Utc::now();
172        let ns_string = namespace.map(|s| s.to_string());
173        let contains_weight = edge_weight_for(&RelationshipType::Contains, &self.config.graph);
174
175        let mut graph = self.lock_graph()?;
176
177        // ── File nodes
178        let file_nodes: Vec<GraphNode> = seen_files
179            .iter()
180            .map(|file_path| {
181                let mut payload = HashMap::new();
182                payload.insert(
183                    "file_path".to_string(),
184                    serde_json::Value::String(file_path.clone()),
185                );
186                GraphNode {
187                    id: format!("file:{file_path}"),
188                    kind: NodeKind::File,
189                    label: file_path.clone(),
190                    payload,
191                    centrality: 0.0,
192                    memory_id: None,
193                    namespace: ns_string.clone(),
194                    valid_from: None,
195                    valid_to: None,
196                }
197            })
198            .collect();
199        self.persist_nodes_to_storage_and_graph(&file_nodes, &mut **graph);
200
201        // ── Package (directory) nodes
202        let (dir_nodes, dir_edges, created_dirs) =
203            self.build_package_tree(seen_files, &ns_string, contains_weight, now);
204        self.persist_nodes_to_storage_and_graph(&dir_nodes, &mut **graph);
205        self.persist_edges_to_storage_and_graph(&dir_edges, &mut **graph);
206
207        // ── Symbol nodes + file→symbol edges
208        let (sym_nodes, sym_edges) =
209            Self::build_symbol_nodes(all_symbols, &ns_string, contains_weight, now);
210
211        // Clean up stale symbols: single pass over in-memory graph to collect
212        // existing symbols grouped by file, then diff against new parse results.
213        //
214        // Lock protocol: We collect old symbols while holding the graph lock,
215        // then drop it so `cleanup_stale_symbols` can acquire graph + vector
216        // locks internally. The re-acquire below is safe: cleanup only removes
217        // stale nodes that won't conflict with the inserts that follow.
218        let mut old_syms_by_file: HashMap<String, HashSet<String>> = HashMap::new();
219        for node in graph.get_all_nodes() {
220            if !node.id.starts_with("sym:") {
221                continue;
222            }
223            // Skip SCIP-sourced symbols (explicit and synthetic containment nodes)
224            // — they're managed by the SCIP pipeline, not ast-grep. Without this
225            // guard, re-indexing deletes all SCIP sym: nodes because their IDs
226            // don't match ast-grep's qualified names.
227            if matches!(
228                node.payload.get("source").and_then(|v| v.as_str()),
229                Some("scip" | "scip-synthetic")
230            ) {
231                continue;
232            }
233            let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) else {
234                continue;
235            };
236            if !seen_files.contains(fp) {
237                continue;
238            }
239            old_syms_by_file
240                .entry(fp.to_string())
241                .or_default()
242                .insert(node.id);
243        }
244        drop(graph);
245        for file_path in seen_files {
246            let new_sym_ids: HashSet<String> = sym_nodes
247                .iter()
248                .filter(|n| {
249                    n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path.as_str())
250                })
251                .map(|n| n.id.clone())
252                .collect();
253            let empty = HashSet::new();
254            let old_sym_ids = old_syms_by_file.get(file_path).unwrap_or(&empty);
255            if let Err(e) = self.cleanup_stale_symbols(file_path, old_sym_ids, &new_sym_ids) {
256                tracing::warn!("Failed to cleanup stale symbols for {file_path}: {e}");
257            }
258        }
259        let mut graph = self.lock_graph()?; // Re-acquire lock
260
261        self.persist_nodes_to_storage_and_graph(&sym_nodes, &mut **graph);
262        self.persist_edges_to_storage_and_graph(&sym_edges, &mut **graph);
263
264        // ── Resolved reference edges
265        let ref_edges = Self::build_reference_edges(edges, &self.config.graph, now);
266        self.persist_edges_to_storage_and_graph(&ref_edges, &mut **graph);
267
268        // ── SCIP nodes + edges (compiler-grade)
269        if let Some(ref scip_build) = results.scip_build {
270            // Clean up stale SCIP nodes: collect existing SCIP-sourced sym: nodes
271            // for files covered by this SCIP run, then remove any not in the new set.
272            let new_scip_ids: HashSet<&str> =
273                scip_build.nodes.iter().map(|n| n.id.as_str()).collect();
274            let mut stale_scip_ids = Vec::new();
275            for node in graph.get_all_nodes() {
276                if !node.id.starts_with("sym:") {
277                    continue;
278                }
279                if !matches!(
280                    node.payload.get("source").and_then(|v| v.as_str()),
281                    Some("scip" | "scip-synthetic")
282                ) {
283                    continue;
284                }
285                if !new_scip_ids.contains(node.id.as_str()) {
286                    // Only clean up nodes in files that SCIP covered this run.
287                    if let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) {
288                        if seen_files.contains(fp) {
289                            stale_scip_ids.push(node.id.clone());
290                        }
291                    }
292                }
293            }
294            for stale_id in &stale_scip_ids {
295                let _ = graph.remove_node(stale_id);
296                let _ = self.storage.delete_graph_nodes_by_prefix(stale_id);
297                // Clean up orphan doc memories for removed symbols.
298                if let Some(qname) = stale_id.strip_prefix("sym:") {
299                    let doc_id = format!("scip-doc:{qname}");
300                    let _ = self.storage.delete_memory(&doc_id);
301                }
302            }
303            if !stale_scip_ids.is_empty() {
304                tracing::info!(
305                    "Cleaned up {} stale SCIP nodes from re-index",
306                    stale_scip_ids.len()
307                );
308            }
309
310            self.persist_nodes_to_storage_and_graph(&scip_build.nodes, &mut **graph);
311
312            // Multi-layer fusion: merge confidence when ast-grep and SCIP agree.
313            // Superseded ast-grep edges are removed to avoid duplicates.
314            let (fused_edges, superseded_ids) = Self::fuse_edges(&ref_edges, &scip_build.edges);
315
316            // Remove the low-confidence ast-grep edges that were fused into SCIP edges.
317            for edge_id in &superseded_ids {
318                let _ = graph.remove_edge(edge_id);
319                let _ = self.storage.delete_graph_edge(edge_id);
320            }
321
322            self.persist_edges_to_storage_and_graph(&fused_edges, &mut **graph);
323
324            // Persist hover doc memories and their RELATES_TO edges.
325            for (memory, related_node_id) in &scip_build.memories {
326                let _ = self.storage.insert_memory(memory);
327                let relates_edge = Edge {
328                    id: format!("relates:{}->mem:{}", related_node_id, memory.id),
329                    src: related_node_id.clone(),
330                    dst: format!("mem:{}", memory.id),
331                    relationship: RelationshipType::RelatesTo,
332                    weight: 0.3,
333                    properties: HashMap::new(),
334                    created_at: now,
335                    valid_from: Some(now),
336                    valid_to: None,
337                };
338                let _ = graph.add_edge(relates_edge.clone());
339                let _ = self.storage.insert_graph_edges_batch(&[relates_edge]);
340            }
341        }
342
343        // ── Chunk nodes + file→chunk / symbol→chunk edges
344        for file_path in seen_files {
345            let prefix = format!("chunk:{file_path}:");
346            let _ = self.storage.delete_graph_nodes_by_prefix(&prefix);
347        }
348        let (chunk_nodes, chunk_edges) =
349            Self::build_chunk_nodes(all_chunks, &ns_string, contains_weight, now);
350        let chunk_count = chunk_nodes.len();
351        self.persist_nodes_to_storage_and_graph(&chunk_nodes, &mut **graph);
352        self.persist_edges_to_storage_and_graph(&chunk_edges, &mut **graph);
353
354        drop(graph);
355
356        Ok(GraphPersistCounts {
357            packages_created: created_dirs,
358            chunks_stored: chunk_count,
359        })
360    }
361
362    /// Batch-insert nodes into both SQLite and the in-memory graph.
363    fn persist_nodes_to_storage_and_graph(
364        &self,
365        nodes: &[GraphNode],
366        graph: &mut dyn codemem_core::GraphBackend,
367    ) {
368        if let Err(e) = self.storage.insert_graph_nodes_batch(nodes) {
369            tracing::warn!("Failed to batch-insert {} graph nodes: {e}", nodes.len());
370        }
371        for node in nodes {
372            let _ = graph.add_node(node.clone());
373        }
374    }
375
376    /// Batch-insert edges into both SQLite and the in-memory graph.
377    fn persist_edges_to_storage_and_graph(
378        &self,
379        edges: &[Edge],
380        graph: &mut dyn codemem_core::GraphBackend,
381    ) {
382        if let Err(e) = self.storage.insert_graph_edges_batch(edges) {
383            tracing::warn!("Failed to batch-insert {} graph edges: {e}", edges.len());
384        }
385        for edge in edges {
386            let _ = graph.add_edge(edge.clone());
387        }
388    }
389
390    /// Build directory/package nodes and CONTAINS edges from file paths.
391    /// Returns (nodes, edges, number_of_dirs_created).
392    fn build_package_tree(
393        &self,
394        seen_files: &HashSet<String>,
395        ns_string: &Option<String>,
396        contains_weight: f64,
397        now: chrono::DateTime<chrono::Utc>,
398    ) -> (Vec<GraphNode>, Vec<Edge>, usize) {
399        let mut created_dirs: HashSet<String> = HashSet::new();
400        let mut created_edge_ids: HashSet<String> = HashSet::new();
401        let mut dir_nodes = Vec::new();
402        let mut dir_edges = Vec::new();
403
404        for file_path in seen_files {
405            let p = std::path::Path::new(file_path);
406            let mut ancestors: Vec<String> = Vec::new();
407            let mut current = p.parent();
408            while let Some(dir) = current {
409                let dir_str = dir.to_string_lossy().to_string();
410                if dir_str.is_empty() || dir_str == "." {
411                    break;
412                }
413                ancestors.push(dir_str);
414                current = dir.parent();
415            }
416            ancestors.reverse();
417            for (i, dir_str) in ancestors.iter().enumerate() {
418                let pkg_id = format!("pkg:{dir_str}/");
419                if created_dirs.insert(pkg_id.clone()) {
420                    dir_nodes.push(GraphNode {
421                        id: pkg_id.clone(),
422                        kind: NodeKind::Package,
423                        label: format!("{dir_str}/"),
424                        payload: HashMap::new(),
425                        centrality: 0.0,
426                        memory_id: None,
427                        namespace: ns_string.clone(),
428                        valid_from: None,
429                        valid_to: None,
430                    });
431                }
432                if i == 0 {
433                    continue;
434                }
435                let parent_pkg_id = format!("pkg:{}/", ancestors[i - 1]);
436                let edge_id = format!("contains:{parent_pkg_id}->{pkg_id}");
437                // Use local set for O(1) dedup instead of querying the graph
438                // for every directory. Edges persisted via INSERT OR REPLACE
439                // handle pre-existing edges from prior runs.
440                if !created_edge_ids.insert(edge_id.clone()) {
441                    continue;
442                }
443                dir_edges.push(Edge {
444                    id: edge_id,
445                    src: parent_pkg_id,
446                    dst: pkg_id.clone(),
447                    relationship: RelationshipType::Contains,
448                    weight: contains_weight,
449                    valid_from: Some(now),
450                    valid_to: None,
451                    properties: HashMap::new(),
452                    created_at: now,
453                });
454            }
455            if let Some(last_dir) = ancestors.last() {
456                let parent_pkg_id = format!("pkg:{last_dir}/");
457                let file_node_id = format!("file:{file_path}");
458                let edge_id = format!("contains:{parent_pkg_id}->{file_node_id}");
459                dir_edges.push(Edge {
460                    id: edge_id,
461                    src: parent_pkg_id,
462                    dst: file_node_id,
463                    relationship: RelationshipType::Contains,
464                    weight: contains_weight,
465                    valid_from: Some(now),
466                    valid_to: None,
467                    properties: HashMap::new(),
468                    created_at: now,
469                });
470            }
471        }
472
473        let count = created_dirs.len();
474        (dir_nodes, dir_edges, count)
475    }
476
477    /// Build symbol graph nodes and file→symbol CONTAINS edges.
478    fn build_symbol_nodes(
479        symbols: &[Symbol],
480        ns_string: &Option<String>,
481        contains_weight: f64,
482        now: chrono::DateTime<chrono::Utc>,
483    ) -> (Vec<GraphNode>, Vec<Edge>) {
484        let mut sym_nodes = Vec::with_capacity(symbols.len());
485        let mut sym_edges = Vec::with_capacity(symbols.len());
486
487        for sym in symbols {
488            let kind = NodeKind::from(sym.kind);
489            let payload = Self::build_symbol_payload(sym);
490
491            let sym_node_id = format!("sym:{}", sym.qualified_name);
492            sym_nodes.push(GraphNode {
493                id: sym_node_id.clone(),
494                kind,
495                label: sym.qualified_name.clone(),
496                payload,
497                centrality: 0.0,
498                memory_id: None,
499                namespace: ns_string.clone(),
500                valid_from: None,
501                valid_to: None,
502            });
503
504            let file_node_id = format!("file:{}", sym.file_path);
505            sym_edges.push(Edge {
506                id: format!("contains:{file_node_id}->{sym_node_id}"),
507                src: file_node_id,
508                dst: sym_node_id,
509                relationship: RelationshipType::Contains,
510                weight: contains_weight,
511                valid_from: Some(now),
512                valid_to: None,
513                properties: HashMap::new(),
514                created_at: now,
515            });
516        }
517
518        (sym_nodes, sym_edges)
519    }
520
521    /// Build the payload HashMap for a symbol's graph node.
522    fn build_symbol_payload(sym: &Symbol) -> HashMap<String, serde_json::Value> {
523        let mut payload = HashMap::new();
524        payload.insert(
525            "symbol_kind".to_string(),
526            serde_json::Value::String(sym.kind.to_string()),
527        );
528        payload.insert(
529            "signature".to_string(),
530            serde_json::Value::String(sym.signature.clone()),
531        );
532        payload.insert(
533            "file_path".to_string(),
534            serde_json::Value::String(sym.file_path.clone()),
535        );
536        payload.insert("line_start".to_string(), serde_json::json!(sym.line_start));
537        payload.insert("line_end".to_string(), serde_json::json!(sym.line_end));
538        payload.insert(
539            "visibility".to_string(),
540            serde_json::Value::String(sym.visibility.to_string()),
541        );
542        if let Some(ref doc) = sym.doc_comment {
543            payload.insert(
544                "doc_comment".to_string(),
545                serde_json::Value::String(doc.clone()),
546            );
547        }
548        if !sym.parameters.is_empty() {
549            payload.insert(
550                "parameters".to_string(),
551                serde_json::to_value(&sym.parameters).unwrap_or_default(),
552            );
553        }
554        if let Some(ref ret) = sym.return_type {
555            payload.insert(
556                "return_type".to_string(),
557                serde_json::Value::String(ret.clone()),
558            );
559        }
560        if sym.is_async {
561            payload.insert("is_async".to_string(), serde_json::json!(true));
562        }
563        if !sym.attributes.is_empty() {
564            payload.insert(
565                "attributes".to_string(),
566                serde_json::to_value(&sym.attributes).unwrap_or_default(),
567            );
568        }
569        if !sym.throws.is_empty() {
570            payload.insert(
571                "throws".to_string(),
572                serde_json::to_value(&sym.throws).unwrap_or_default(),
573            );
574        }
575        if let Some(ref gp) = sym.generic_params {
576            payload.insert(
577                "generic_params".to_string(),
578                serde_json::Value::String(gp.clone()),
579            );
580        }
581        if sym.is_abstract {
582            payload.insert("is_abstract".to_string(), serde_json::json!(true));
583        }
584        if let Some(ref parent) = sym.parent {
585            payload.insert(
586                "parent".to_string(),
587                serde_json::Value::String(parent.clone()),
588            );
589        }
590        payload
591    }
592
593    /// Build edges from resolved cross-file references.
594    /// ast-grep base confidence for multi-layer fusion.
595    const AST_GREP_BASE_CONFIDENCE: f64 = 0.10;
596
597    fn build_reference_edges(
598        edges: &[ResolvedEdge],
599        graph_config: &GraphConfig,
600        now: chrono::DateTime<chrono::Utc>,
601    ) -> Vec<Edge> {
602        edges
603            .iter()
604            .map(|edge| {
605                let mut properties = HashMap::new();
606                properties.insert("source".to_string(), serde_json::json!("ast-grep"));
607                properties.insert(
608                    "confidence".to_string(),
609                    serde_json::json!(Self::AST_GREP_BASE_CONFIDENCE),
610                );
611                properties.insert("source_layers".to_string(), serde_json::json!(["ast-grep"]));
612                // Scale edge weight by resolution confidence so low-confidence
613                // guesses (simple-name fallback) carry less weight in PageRank
614                // and betweenness centrality than exact matches.
615                let base_weight = edge_weight_for(&edge.relationship, graph_config);
616                let weight = base_weight * edge.resolution_confidence;
617                Edge {
618                    id: format!(
619                        "ref:{}->{}:{}",
620                        edge.source_qualified_name, edge.target_qualified_name, edge.relationship
621                    ),
622                    src: format!("sym:{}", edge.source_qualified_name),
623                    dst: format!("sym:{}", edge.target_qualified_name),
624                    relationship: edge.relationship,
625                    weight,
626                    valid_from: Some(now),
627                    valid_to: None,
628                    properties,
629                    created_at: now,
630                }
631            })
632            .collect()
633    }
634
635    /// Multi-layer edge fusion: when ast-grep and SCIP produce the same edge
636    /// (same src, dst, relationship), sum their confidences and merge source_layers.
637    /// SCIP edges not in ast-grep pass through unchanged.
638    ///
639    /// Returns `(fused_scip_edges, superseded_ast_grep_edge_ids)`. The caller must
640    /// remove the superseded ast-grep edges to avoid duplicates in the graph.
641    fn fuse_edges(ast_grep_edges: &[Edge], scip_edges: &[Edge]) -> (Vec<Edge>, Vec<String>) {
642        // Index ast-grep edges by (src, dst, relationship_str) → edge ID for O(1) lookup.
643        let ast_grep_index: HashMap<(String, String, String), &str> = ast_grep_edges
644            .iter()
645            .map(|e| {
646                (
647                    (e.src.clone(), e.dst.clone(), e.relationship.to_string()),
648                    e.id.as_str(),
649                )
650            })
651            .collect();
652
653        let mut superseded_ids = Vec::new();
654
655        let fused = scip_edges
656            .iter()
657            .map(|scip_edge| {
658                let key = (
659                    scip_edge.src.clone(),
660                    scip_edge.dst.clone(),
661                    scip_edge.relationship.to_string(),
662                );
663                if let Some(&ast_edge_id) = ast_grep_index.get(&key) {
664                    // Both layers agree — fuse confidence and mark ast-grep edge for removal.
665                    superseded_ids.push(ast_edge_id.to_string());
666                    let mut fused = scip_edge.clone();
667                    let scip_conf = scip_edge
668                        .properties
669                        .get("confidence")
670                        .and_then(|v| v.as_f64())
671                        .unwrap_or(0.15);
672                    let fused_conf = scip_conf + Self::AST_GREP_BASE_CONFIDENCE;
673                    fused
674                        .properties
675                        .insert("confidence".to_string(), serde_json::json!(fused_conf));
676                    fused.properties.insert(
677                        "source_layers".to_string(),
678                        serde_json::json!(["ast-grep", "scip"]),
679                    );
680                    fused
681                } else {
682                    scip_edge.clone()
683                }
684            })
685            .collect();
686
687        (fused, superseded_ids)
688    }
689
690    /// Build chunk graph nodes and file→chunk / symbol→chunk CONTAINS edges.
691    fn build_chunk_nodes(
692        chunks: &[CodeChunk],
693        ns_string: &Option<String>,
694        contains_weight: f64,
695        now: chrono::DateTime<chrono::Utc>,
696    ) -> (Vec<GraphNode>, Vec<Edge>) {
697        let mut chunk_nodes = Vec::with_capacity(chunks.len());
698        let mut chunk_edges = Vec::with_capacity(chunks.len() * 2);
699
700        for chunk in chunks {
701            let chunk_id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
702
703            let mut payload = HashMap::new();
704            payload.insert(
705                "file_path".to_string(),
706                serde_json::Value::String(chunk.file_path.clone()),
707            );
708            payload.insert(
709                "line_start".to_string(),
710                serde_json::json!(chunk.line_start),
711            );
712            payload.insert("line_end".to_string(), serde_json::json!(chunk.line_end));
713            payload.insert(
714                "node_kind".to_string(),
715                serde_json::Value::String(chunk.node_kind.clone()),
716            );
717            payload.insert(
718                "non_ws_chars".to_string(),
719                serde_json::json!(chunk.non_ws_chars),
720            );
721            if let Some(ref parent) = chunk.parent_symbol {
722                payload.insert(
723                    "parent_symbol".to_string(),
724                    serde_json::Value::String(parent.clone()),
725                );
726            }
727
728            chunk_nodes.push(GraphNode {
729                id: chunk_id.clone(),
730                kind: NodeKind::Chunk,
731                label: format!(
732                    "chunk:{}:{}..{}",
733                    chunk.file_path, chunk.line_start, chunk.line_end
734                ),
735                payload,
736                centrality: 0.0,
737                memory_id: None,
738                namespace: ns_string.clone(),
739                valid_from: None,
740                valid_to: None,
741            });
742
743            let file_node_id = format!("file:{}", chunk.file_path);
744            chunk_edges.push(Edge {
745                id: format!("contains:{file_node_id}->{chunk_id}"),
746                src: file_node_id,
747                dst: chunk_id.clone(),
748                relationship: RelationshipType::Contains,
749                weight: contains_weight,
750                valid_from: Some(now),
751                valid_to: None,
752                properties: HashMap::new(),
753                created_at: now,
754            });
755
756            if let Some(ref parent_sym) = chunk.parent_symbol {
757                let parent_node_id = format!("sym:{parent_sym}");
758                chunk_edges.push(Edge {
759                    id: format!("contains:{parent_node_id}->{chunk_id}"),
760                    src: parent_node_id,
761                    dst: chunk_id,
762                    relationship: RelationshipType::Contains,
763                    weight: contains_weight,
764                    valid_from: Some(now),
765                    valid_to: None,
766                    properties: HashMap::new(),
767                    created_at: now,
768                });
769            }
770        }
771
772        (chunk_nodes, chunk_edges)
773    }
774
775    // ── Embedding Persistence ────────────────────────────────────────────
776
777    /// Embed symbols and chunks, persisting embeddings to SQLite and the
778    /// vector index in batches with progress reporting.
779    ///
780    /// Returns (symbols_embedded, chunks_embedded).
781    fn embed_and_persist(
782        &self,
783        symbols: &[Symbol],
784        chunks: &[CodeChunk],
785        edges: &[ResolvedEdge],
786        on_progress: impl Fn(usize, usize),
787    ) -> Result<(usize, usize), CodememError> {
788        let mut symbols_embedded = 0usize;
789        let mut chunks_embedded = 0usize;
790
791        // Quick check: skip expensive text enrichment if embedding provider isn't loaded.
792        // This avoids triggering lazy init during lightweight operations (hooks).
793        if !self.embeddings_ready() {
794            return Ok((0, 0));
795        }
796
797        // Phase 1: Collect enriched texts without holding any lock.
798        let sym_texts: Vec<(String, String)> = symbols
799            .iter()
800            .map(|sym| {
801                let id = format!("sym:{}", sym.qualified_name);
802                let text = self.enrich_symbol_text(sym, edges);
803                (id, text)
804            })
805            .collect();
806        let chunk_texts: Vec<(String, String)> = chunks
807            .iter()
808            .map(|chunk| {
809                let id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
810                let text = self.enrich_chunk_text(chunk);
811                (id, text)
812            })
813            .collect();
814
815        // Phase 2+3: Embed in batches and persist progressively.
816        let embed_batch_size = self.config.embedding.batch_size;
817
818        let all_pairs: Vec<(String, String)> = sym_texts.into_iter().chain(chunk_texts).collect();
819        let total = all_pairs.len();
820        let sym_count = symbols.len();
821        let mut done = 0usize;
822
823        for batch in all_pairs.chunks(embed_batch_size) {
824            let texts: Vec<&str> = batch.iter().map(|(_, t)| t.as_str()).collect();
825
826            let t0 = std::time::Instant::now();
827            let embed_result = {
828                let emb = self.lock_embeddings()?;
829                match emb {
830                    Some(emb_guard) => emb_guard.embed_batch(&texts),
831                    None => break,
832                }
833            };
834
835            match embed_result {
836                Ok(embeddings) => {
837                    let embed_ms = t0.elapsed().as_millis();
838
839                    let t1 = std::time::Instant::now();
840                    let pairs: Vec<(&str, &[f32])> = batch
841                        .iter()
842                        .zip(embeddings.iter())
843                        .map(|((id, _), emb_vec)| (id.as_str(), emb_vec.as_slice()))
844                        .collect();
845                    if let Err(e) = self.storage.store_embeddings_batch(&pairs) {
846                        tracing::warn!("Failed to batch-store embeddings: {e}");
847                    }
848                    let sqlite_ms = t1.elapsed().as_millis();
849
850                    let t2 = std::time::Instant::now();
851                    let batch_items: Vec<(String, Vec<f32>)> = batch
852                        .iter()
853                        .zip(embeddings.into_iter())
854                        .map(|((id, _), emb_vec)| (id.clone(), emb_vec))
855                        .collect();
856                    let batch_len = batch_items.len();
857                    {
858                        let mut vec = self.lock_vector()?;
859                        if let Err(e) = vec.insert_batch(&batch_items) {
860                            tracing::warn!("Failed to batch-insert into vector index: {e}");
861                        }
862                    }
863                    let vector_ms = t2.elapsed().as_millis();
864
865                    let syms_in_batch = batch_len.min(sym_count.saturating_sub(done));
866                    symbols_embedded += syms_in_batch;
867                    chunks_embedded += batch_len - syms_in_batch;
868                    done += batch_len;
869
870                    tracing::debug!(
871                        "Embed batch {}: embed={embed_ms}ms sqlite={sqlite_ms}ms vector={vector_ms}ms",
872                        batch_len
873                    );
874                }
875                Err(e) => {
876                    tracing::warn!("embed_batch failed for chunk of {} texts: {e}", batch.len());
877                }
878            }
879            on_progress(done, total);
880        }
881        self.save_index();
882
883        Ok((symbols_embedded, chunks_embedded))
884    }
885}