Skip to main content

codemem_engine/persistence/
mod.rs

1//! Graph persistence: persist indexing results (file/package/symbol/chunk nodes,
2//! edges, embeddings, compaction) into the storage and graph backends.
3
4mod compaction;
5pub mod cross_repo;
6
7use crate::index::{CodeChunk, ResolvedEdge, Symbol};
8use crate::IndexAndResolveResult;
9use codemem_core::{CodememError, Edge, GraphConfig, GraphNode, NodeKind, RelationshipType};
10use std::collections::{HashMap, HashSet};
11
/// Counts of what was persisted by `persist_index_results`.
///
/// Every field is a plain counter, so `Default` yields an all-zero result.
/// `Clone`, `PartialEq` and `Eq` are derived so callers and tests can copy
/// and compare results directly.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct IndexPersistResult {
    /// Number of file paths covered by the indexing run.
    pub files_created: usize,
    /// Number of distinct package (directory) nodes built from those paths.
    pub packages_created: usize,
    /// Number of symbols in the indexing results.
    pub symbols_stored: usize,
    /// Number of chunk nodes built from the indexing results.
    pub chunks_stored: usize,
    /// Number of resolved reference edges in the indexing results.
    pub edges_resolved: usize,
    /// Number of symbols embedded (0 when embedding is skipped).
    pub symbols_embedded: usize,
    /// Number of chunks embedded (0 when embedding is skipped).
    pub chunks_embedded: usize,
    /// Number of chunks removed by auto-compaction (0 when disabled).
    pub chunks_pruned: usize,
    /// Number of symbols removed by auto-compaction (0 when disabled).
    pub symbols_pruned: usize,
}
25
/// Counts of what was persisted by `persist_cross_repo_data`.
///
/// Every field is a plain counter, so `Default` yields an all-zero result.
/// `Clone`, `PartialEq` and `Eq` are derived so callers and tests can copy
/// and compare results directly.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct CrossRepoPersistResult {
    pub packages_registered: usize,
    pub unresolved_refs_stored: usize,
    pub forward_edges_created: usize,
    pub backward_edges_created: usize,
    pub endpoints_detected: usize,
    pub client_calls_detected: usize,
    pub spec_endpoints_detected: usize,
    pub event_channels_detected: usize,
    pub http_edges_matched: usize,
    pub event_edges_matched: usize,
}
40
41/// Return the edge weight for a given relationship type, using config overrides
42/// for the three most common types (Contains, Calls, Imports).
43pub fn edge_weight_for(rel: &RelationshipType, config: &GraphConfig) -> f64 {
44    match rel {
45        RelationshipType::Calls => config.calls_edge_weight,
46        RelationshipType::Imports => config.imports_edge_weight,
47        RelationshipType::Contains => config.contains_edge_weight,
48        RelationshipType::TypeDefinition => config.type_definition_edge_weight,
49        RelationshipType::Reads => config.reads_edge_weight,
50        RelationshipType::Writes => config.writes_edge_weight,
51        RelationshipType::Overrides => config.overrides_edge_weight,
52        RelationshipType::Implements | RelationshipType::Inherits => 0.8,
53        RelationshipType::DependsOn => 0.7,
54        RelationshipType::CoChanged => 0.6,
55        RelationshipType::EvolvedInto | RelationshipType::Summarizes => 0.7,
56        RelationshipType::PartOf => 0.4,
57        RelationshipType::RelatesTo | RelationshipType::SharesTheme => 0.3,
58        RelationshipType::HttpCalls => 0.7,
59        RelationshipType::PublishesTo | RelationshipType::SubscribesTo => 0.6,
60        _ => 0.5,
61    }
62}
63
/// Intermediate counts from graph node persistence (before embedding).
///
/// Derives `Debug` and `Default` to match the public result structs in this
/// module.
#[derive(Debug, Default)]
struct GraphPersistCounts {
    /// Number of distinct package (directory) nodes built for this run.
    packages_created: usize,
    /// Number of chunk nodes built for this run.
    chunks_stored: usize,
}
69
70impl super::CodememEngine {
71    /// Persist all indexing results (file nodes, package tree, symbol nodes, chunk nodes,
72    /// edges, embeddings, compaction) into storage and the in-memory graph.
73    ///
74    /// This is the full persistence pipeline called after `Indexer::index_and_resolve()`.
75    pub fn persist_index_results(
76        &self,
77        results: &IndexAndResolveResult,
78        namespace: Option<&str>,
79    ) -> Result<IndexPersistResult, CodememError> {
80        self.persist_index_results_with_progress(results, namespace, |_, _| {})
81    }
82
83    /// Like `persist_index_results`, but skips the embedding phase entirely.
84    /// Stores graph nodes, edges, and chunks without vectorizing them.
85    /// Also skips cross-repo linking — this is a fast graph-only mode intended
86    /// for rapid iteration (e.g., `--skip-embed`). Run a full `analyze` to
87    /// populate cross-repo data.
88    pub fn persist_graph_only(
89        &self,
90        results: &IndexAndResolveResult,
91        namespace: Option<&str>,
92    ) -> Result<IndexPersistResult, CodememError> {
93        let seen_files = &results.file_paths;
94        let graph_counts = self.persist_graph_nodes(results, namespace)?;
95
96        let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
97            self.compact_graph(seen_files)
98        } else {
99            (0, 0)
100        };
101
102        Ok(IndexPersistResult {
103            files_created: seen_files.len(),
104            packages_created: graph_counts.packages_created,
105            symbols_stored: results.symbols.len(),
106            chunks_stored: graph_counts.chunks_stored,
107            edges_resolved: results.edges.len(),
108            symbols_embedded: 0,
109            chunks_embedded: 0,
110            chunks_pruned,
111            symbols_pruned,
112        })
113    }
114
115    /// Like `persist_index_results`, but calls `on_progress(done, total)` during
116    /// the embedding phase so callers can display progress.
117    pub fn persist_index_results_with_progress(
118        &self,
119        results: &IndexAndResolveResult,
120        namespace: Option<&str>,
121        on_progress: impl Fn(usize, usize),
122    ) -> Result<IndexPersistResult, CodememError> {
123        let seen_files = &results.file_paths;
124
125        // 1. Persist all graph nodes and edges
126        let graph_counts = self.persist_graph_nodes(results, namespace)?;
127
128        // 2. Embed symbols and chunks
129        let (symbols_embedded, chunks_embedded) = self.embed_and_persist(
130            &results.symbols,
131            &results.chunks,
132            &results.edges,
133            on_progress,
134        )?;
135
136        // 3. Auto-compact
137        let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
138            self.compact_graph(seen_files)
139        } else {
140            (0, 0)
141        };
142
143        Ok(IndexPersistResult {
144            files_created: seen_files.len(),
145            packages_created: graph_counts.packages_created,
146            symbols_stored: results.symbols.len(),
147            chunks_stored: graph_counts.chunks_stored,
148            edges_resolved: results.edges.len(),
149            symbols_embedded,
150            chunks_embedded,
151            chunks_pruned,
152            symbols_pruned,
153        })
154    }
155
156    // ── Graph Node Persistence ───────────────────────────────────────────
157
158    /// Persist file, package, symbol, chunk nodes and all edges into storage
159    /// and the in-memory graph. Returns counts for the result struct.
160    fn persist_graph_nodes(
161        &self,
162        results: &IndexAndResolveResult,
163        namespace: Option<&str>,
164    ) -> Result<GraphPersistCounts, CodememError> {
165        let all_symbols = &results.symbols;
166        let all_chunks = &results.chunks;
167        let seen_files = &results.file_paths;
168        let edges = &results.edges;
169
170        let now = chrono::Utc::now();
171        let ns_string = namespace.map(|s| s.to_string());
172        let contains_weight = edge_weight_for(&RelationshipType::Contains, &self.config.graph);
173
174        let mut graph = self.lock_graph()?;
175
176        // ── File nodes
177        let file_nodes: Vec<GraphNode> = seen_files
178            .iter()
179            .map(|file_path| {
180                let mut payload = HashMap::new();
181                payload.insert(
182                    "file_path".to_string(),
183                    serde_json::Value::String(file_path.clone()),
184                );
185                GraphNode {
186                    id: format!("file:{file_path}"),
187                    kind: NodeKind::File,
188                    label: file_path.clone(),
189                    payload,
190                    centrality: 0.0,
191                    memory_id: None,
192                    namespace: ns_string.clone(),
193                }
194            })
195            .collect();
196        self.persist_nodes_to_storage_and_graph(&file_nodes, &mut **graph);
197
198        // ── Package (directory) nodes
199        let (dir_nodes, dir_edges, created_dirs) =
200            self.build_package_tree(seen_files, &ns_string, contains_weight, now);
201        self.persist_nodes_to_storage_and_graph(&dir_nodes, &mut **graph);
202        self.persist_edges_to_storage_and_graph(&dir_edges, &mut **graph);
203
204        // ── Symbol nodes + file→symbol edges
205        let (sym_nodes, sym_edges) =
206            Self::build_symbol_nodes(all_symbols, &ns_string, contains_weight, now);
207
208        // Clean up stale symbols: single pass over in-memory graph to collect
209        // existing symbols grouped by file, then diff against new parse results.
210        //
211        // Lock protocol: We collect old symbols while holding the graph lock,
212        // then drop it so `cleanup_stale_symbols` can acquire graph + vector
213        // locks internally. The re-acquire below is safe: cleanup only removes
214        // stale nodes that won't conflict with the inserts that follow.
215        let mut old_syms_by_file: HashMap<String, HashSet<String>> = HashMap::new();
216        for node in graph.get_all_nodes() {
217            if !node.id.starts_with("sym:") {
218                continue;
219            }
220            // Skip SCIP-sourced symbols (explicit and synthetic containment nodes)
221            // — they're managed by the SCIP pipeline, not ast-grep. Without this
222            // guard, re-indexing deletes all SCIP sym: nodes because their IDs
223            // don't match ast-grep's qualified names.
224            if matches!(
225                node.payload.get("source").and_then(|v| v.as_str()),
226                Some("scip" | "scip-synthetic")
227            ) {
228                continue;
229            }
230            let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) else {
231                continue;
232            };
233            if !seen_files.contains(fp) {
234                continue;
235            }
236            old_syms_by_file
237                .entry(fp.to_string())
238                .or_default()
239                .insert(node.id);
240        }
241        drop(graph);
242        for file_path in seen_files {
243            let new_sym_ids: HashSet<String> = sym_nodes
244                .iter()
245                .filter(|n| {
246                    n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path.as_str())
247                })
248                .map(|n| n.id.clone())
249                .collect();
250            let empty = HashSet::new();
251            let old_sym_ids = old_syms_by_file.get(file_path).unwrap_or(&empty);
252            if let Err(e) = self.cleanup_stale_symbols(file_path, old_sym_ids, &new_sym_ids) {
253                tracing::warn!("Failed to cleanup stale symbols for {file_path}: {e}");
254            }
255        }
256        let mut graph = self.lock_graph()?; // Re-acquire lock
257
258        self.persist_nodes_to_storage_and_graph(&sym_nodes, &mut **graph);
259        self.persist_edges_to_storage_and_graph(&sym_edges, &mut **graph);
260
261        // ── Resolved reference edges
262        let ref_edges = Self::build_reference_edges(edges, &self.config.graph, now);
263        self.persist_edges_to_storage_and_graph(&ref_edges, &mut **graph);
264
265        // ── SCIP nodes + edges (compiler-grade)
266        if let Some(ref scip_build) = results.scip_build {
267            // Clean up stale SCIP nodes: collect existing SCIP-sourced sym: nodes
268            // for files covered by this SCIP run, then remove any not in the new set.
269            let new_scip_ids: HashSet<&str> =
270                scip_build.nodes.iter().map(|n| n.id.as_str()).collect();
271            let mut stale_scip_ids = Vec::new();
272            for node in graph.get_all_nodes() {
273                if !node.id.starts_with("sym:") {
274                    continue;
275                }
276                if !matches!(
277                    node.payload.get("source").and_then(|v| v.as_str()),
278                    Some("scip" | "scip-synthetic")
279                ) {
280                    continue;
281                }
282                if !new_scip_ids.contains(node.id.as_str()) {
283                    // Only clean up nodes in files that SCIP covered this run.
284                    if let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) {
285                        if seen_files.contains(fp) {
286                            stale_scip_ids.push(node.id.clone());
287                        }
288                    }
289                }
290            }
291            for stale_id in &stale_scip_ids {
292                let _ = graph.remove_node(stale_id);
293                let _ = self.storage.delete_graph_nodes_by_prefix(stale_id);
294                // Clean up orphan doc memories for removed symbols.
295                if let Some(qname) = stale_id.strip_prefix("sym:") {
296                    let doc_id = format!("scip-doc:{qname}");
297                    let _ = self.storage.delete_memory(&doc_id);
298                }
299            }
300            if !stale_scip_ids.is_empty() {
301                tracing::info!(
302                    "Cleaned up {} stale SCIP nodes from re-index",
303                    stale_scip_ids.len()
304                );
305            }
306
307            self.persist_nodes_to_storage_and_graph(&scip_build.nodes, &mut **graph);
308
309            // Multi-layer fusion: merge confidence when ast-grep and SCIP agree.
310            // Superseded ast-grep edges are removed to avoid duplicates.
311            let (fused_edges, superseded_ids) = Self::fuse_edges(&ref_edges, &scip_build.edges);
312
313            // Remove the low-confidence ast-grep edges that were fused into SCIP edges.
314            for edge_id in &superseded_ids {
315                let _ = graph.remove_edge(edge_id);
316                let _ = self.storage.delete_graph_edge(edge_id);
317            }
318
319            self.persist_edges_to_storage_and_graph(&fused_edges, &mut **graph);
320
321            // Persist hover doc memories and their RELATES_TO edges.
322            for (memory, related_node_id) in &scip_build.memories {
323                let _ = self.storage.insert_memory(memory);
324                let relates_edge = Edge {
325                    id: format!("relates:{}->mem:{}", related_node_id, memory.id),
326                    src: related_node_id.clone(),
327                    dst: format!("mem:{}", memory.id),
328                    relationship: RelationshipType::RelatesTo,
329                    weight: 0.3,
330                    properties: HashMap::new(),
331                    created_at: now,
332                    valid_from: Some(now),
333                    valid_to: None,
334                };
335                let _ = graph.add_edge(relates_edge.clone());
336                let _ = self.storage.insert_graph_edges_batch(&[relates_edge]);
337            }
338        }
339
340        // ── Chunk nodes + file→chunk / symbol→chunk edges
341        for file_path in seen_files {
342            let prefix = format!("chunk:{file_path}:");
343            let _ = self.storage.delete_graph_nodes_by_prefix(&prefix);
344        }
345        let (chunk_nodes, chunk_edges) =
346            Self::build_chunk_nodes(all_chunks, &ns_string, contains_weight, now);
347        let chunk_count = chunk_nodes.len();
348        self.persist_nodes_to_storage_and_graph(&chunk_nodes, &mut **graph);
349        self.persist_edges_to_storage_and_graph(&chunk_edges, &mut **graph);
350
351        drop(graph);
352
353        Ok(GraphPersistCounts {
354            packages_created: created_dirs,
355            chunks_stored: chunk_count,
356        })
357    }
358
359    /// Batch-insert nodes into both SQLite and the in-memory graph.
360    fn persist_nodes_to_storage_and_graph(
361        &self,
362        nodes: &[GraphNode],
363        graph: &mut dyn codemem_core::GraphBackend,
364    ) {
365        if let Err(e) = self.storage.insert_graph_nodes_batch(nodes) {
366            tracing::warn!("Failed to batch-insert {} graph nodes: {e}", nodes.len());
367        }
368        for node in nodes {
369            let _ = graph.add_node(node.clone());
370        }
371    }
372
373    /// Batch-insert edges into both SQLite and the in-memory graph.
374    fn persist_edges_to_storage_and_graph(
375        &self,
376        edges: &[Edge],
377        graph: &mut dyn codemem_core::GraphBackend,
378    ) {
379        if let Err(e) = self.storage.insert_graph_edges_batch(edges) {
380            tracing::warn!("Failed to batch-insert {} graph edges: {e}", edges.len());
381        }
382        for edge in edges {
383            let _ = graph.add_edge(edge.clone());
384        }
385    }
386
387    /// Build directory/package nodes and CONTAINS edges from file paths.
388    /// Returns (nodes, edges, number_of_dirs_created).
389    fn build_package_tree(
390        &self,
391        seen_files: &HashSet<String>,
392        ns_string: &Option<String>,
393        contains_weight: f64,
394        now: chrono::DateTime<chrono::Utc>,
395    ) -> (Vec<GraphNode>, Vec<Edge>, usize) {
396        let mut created_dirs: HashSet<String> = HashSet::new();
397        let mut created_edge_ids: HashSet<String> = HashSet::new();
398        let mut dir_nodes = Vec::new();
399        let mut dir_edges = Vec::new();
400
401        for file_path in seen_files {
402            let p = std::path::Path::new(file_path);
403            let mut ancestors: Vec<String> = Vec::new();
404            let mut current = p.parent();
405            while let Some(dir) = current {
406                let dir_str = dir.to_string_lossy().to_string();
407                if dir_str.is_empty() || dir_str == "." {
408                    break;
409                }
410                ancestors.push(dir_str);
411                current = dir.parent();
412            }
413            ancestors.reverse();
414            for (i, dir_str) in ancestors.iter().enumerate() {
415                let pkg_id = format!("pkg:{dir_str}/");
416                if created_dirs.insert(pkg_id.clone()) {
417                    dir_nodes.push(GraphNode {
418                        id: pkg_id.clone(),
419                        kind: NodeKind::Package,
420                        label: format!("{dir_str}/"),
421                        payload: HashMap::new(),
422                        centrality: 0.0,
423                        memory_id: None,
424                        namespace: ns_string.clone(),
425                    });
426                }
427                if i == 0 {
428                    continue;
429                }
430                let parent_pkg_id = format!("pkg:{}/", ancestors[i - 1]);
431                let edge_id = format!("contains:{parent_pkg_id}->{pkg_id}");
432                // Use local set for O(1) dedup instead of querying the graph
433                // for every directory. Edges persisted via INSERT OR REPLACE
434                // handle pre-existing edges from prior runs.
435                if !created_edge_ids.insert(edge_id.clone()) {
436                    continue;
437                }
438                dir_edges.push(Edge {
439                    id: edge_id,
440                    src: parent_pkg_id,
441                    dst: pkg_id.clone(),
442                    relationship: RelationshipType::Contains,
443                    weight: contains_weight,
444                    valid_from: Some(now),
445                    valid_to: None,
446                    properties: HashMap::new(),
447                    created_at: now,
448                });
449            }
450            if let Some(last_dir) = ancestors.last() {
451                let parent_pkg_id = format!("pkg:{last_dir}/");
452                let file_node_id = format!("file:{file_path}");
453                let edge_id = format!("contains:{parent_pkg_id}->{file_node_id}");
454                dir_edges.push(Edge {
455                    id: edge_id,
456                    src: parent_pkg_id,
457                    dst: file_node_id,
458                    relationship: RelationshipType::Contains,
459                    weight: contains_weight,
460                    valid_from: Some(now),
461                    valid_to: None,
462                    properties: HashMap::new(),
463                    created_at: now,
464                });
465            }
466        }
467
468        let count = created_dirs.len();
469        (dir_nodes, dir_edges, count)
470    }
471
472    /// Build symbol graph nodes and file→symbol CONTAINS edges.
473    fn build_symbol_nodes(
474        symbols: &[Symbol],
475        ns_string: &Option<String>,
476        contains_weight: f64,
477        now: chrono::DateTime<chrono::Utc>,
478    ) -> (Vec<GraphNode>, Vec<Edge>) {
479        let mut sym_nodes = Vec::with_capacity(symbols.len());
480        let mut sym_edges = Vec::with_capacity(symbols.len());
481
482        for sym in symbols {
483            let kind = NodeKind::from(sym.kind);
484            let payload = Self::build_symbol_payload(sym);
485
486            let sym_node_id = format!("sym:{}", sym.qualified_name);
487            sym_nodes.push(GraphNode {
488                id: sym_node_id.clone(),
489                kind,
490                label: sym.qualified_name.clone(),
491                payload,
492                centrality: 0.0,
493                memory_id: None,
494                namespace: ns_string.clone(),
495            });
496
497            let file_node_id = format!("file:{}", sym.file_path);
498            sym_edges.push(Edge {
499                id: format!("contains:{file_node_id}->{sym_node_id}"),
500                src: file_node_id,
501                dst: sym_node_id,
502                relationship: RelationshipType::Contains,
503                weight: contains_weight,
504                valid_from: Some(now),
505                valid_to: None,
506                properties: HashMap::new(),
507                created_at: now,
508            });
509        }
510
511        (sym_nodes, sym_edges)
512    }
513
514    /// Build the payload HashMap for a symbol's graph node.
515    fn build_symbol_payload(sym: &Symbol) -> HashMap<String, serde_json::Value> {
516        let mut payload = HashMap::new();
517        payload.insert(
518            "symbol_kind".to_string(),
519            serde_json::Value::String(sym.kind.to_string()),
520        );
521        payload.insert(
522            "signature".to_string(),
523            serde_json::Value::String(sym.signature.clone()),
524        );
525        payload.insert(
526            "file_path".to_string(),
527            serde_json::Value::String(sym.file_path.clone()),
528        );
529        payload.insert("line_start".to_string(), serde_json::json!(sym.line_start));
530        payload.insert("line_end".to_string(), serde_json::json!(sym.line_end));
531        payload.insert(
532            "visibility".to_string(),
533            serde_json::Value::String(sym.visibility.to_string()),
534        );
535        if let Some(ref doc) = sym.doc_comment {
536            payload.insert(
537                "doc_comment".to_string(),
538                serde_json::Value::String(doc.clone()),
539            );
540        }
541        if !sym.parameters.is_empty() {
542            payload.insert(
543                "parameters".to_string(),
544                serde_json::to_value(&sym.parameters).unwrap_or_default(),
545            );
546        }
547        if let Some(ref ret) = sym.return_type {
548            payload.insert(
549                "return_type".to_string(),
550                serde_json::Value::String(ret.clone()),
551            );
552        }
553        if sym.is_async {
554            payload.insert("is_async".to_string(), serde_json::json!(true));
555        }
556        if !sym.attributes.is_empty() {
557            payload.insert(
558                "attributes".to_string(),
559                serde_json::to_value(&sym.attributes).unwrap_or_default(),
560            );
561        }
562        if !sym.throws.is_empty() {
563            payload.insert(
564                "throws".to_string(),
565                serde_json::to_value(&sym.throws).unwrap_or_default(),
566            );
567        }
568        if let Some(ref gp) = sym.generic_params {
569            payload.insert(
570                "generic_params".to_string(),
571                serde_json::Value::String(gp.clone()),
572            );
573        }
574        if sym.is_abstract {
575            payload.insert("is_abstract".to_string(), serde_json::json!(true));
576        }
577        if let Some(ref parent) = sym.parent {
578            payload.insert(
579                "parent".to_string(),
580                serde_json::Value::String(parent.clone()),
581            );
582        }
583        payload
584    }
585
586    /// Build edges from resolved cross-file references.
587    /// ast-grep base confidence for multi-layer fusion.
588    const AST_GREP_BASE_CONFIDENCE: f64 = 0.10;
589
590    fn build_reference_edges(
591        edges: &[ResolvedEdge],
592        graph_config: &GraphConfig,
593        now: chrono::DateTime<chrono::Utc>,
594    ) -> Vec<Edge> {
595        edges
596            .iter()
597            .map(|edge| {
598                let mut properties = HashMap::new();
599                properties.insert("source".to_string(), serde_json::json!("ast-grep"));
600                properties.insert(
601                    "confidence".to_string(),
602                    serde_json::json!(Self::AST_GREP_BASE_CONFIDENCE),
603                );
604                properties.insert("source_layers".to_string(), serde_json::json!(["ast-grep"]));
605                // Scale edge weight by resolution confidence so low-confidence
606                // guesses (simple-name fallback) carry less weight in PageRank
607                // and betweenness centrality than exact matches.
608                let base_weight = edge_weight_for(&edge.relationship, graph_config);
609                let weight = base_weight * edge.resolution_confidence;
610                Edge {
611                    id: format!(
612                        "ref:{}->{}:{}",
613                        edge.source_qualified_name, edge.target_qualified_name, edge.relationship
614                    ),
615                    src: format!("sym:{}", edge.source_qualified_name),
616                    dst: format!("sym:{}", edge.target_qualified_name),
617                    relationship: edge.relationship,
618                    weight,
619                    valid_from: Some(now),
620                    valid_to: None,
621                    properties,
622                    created_at: now,
623                }
624            })
625            .collect()
626    }
627
628    /// Multi-layer edge fusion: when ast-grep and SCIP produce the same edge
629    /// (same src, dst, relationship), sum their confidences and merge source_layers.
630    /// SCIP edges not in ast-grep pass through unchanged.
631    ///
632    /// Returns `(fused_scip_edges, superseded_ast_grep_edge_ids)`. The caller must
633    /// remove the superseded ast-grep edges to avoid duplicates in the graph.
634    fn fuse_edges(ast_grep_edges: &[Edge], scip_edges: &[Edge]) -> (Vec<Edge>, Vec<String>) {
635        // Index ast-grep edges by (src, dst, relationship_str) → edge ID for O(1) lookup.
636        let ast_grep_index: HashMap<(String, String, String), &str> = ast_grep_edges
637            .iter()
638            .map(|e| {
639                (
640                    (e.src.clone(), e.dst.clone(), e.relationship.to_string()),
641                    e.id.as_str(),
642                )
643            })
644            .collect();
645
646        let mut superseded_ids = Vec::new();
647
648        let fused = scip_edges
649            .iter()
650            .map(|scip_edge| {
651                let key = (
652                    scip_edge.src.clone(),
653                    scip_edge.dst.clone(),
654                    scip_edge.relationship.to_string(),
655                );
656                if let Some(&ast_edge_id) = ast_grep_index.get(&key) {
657                    // Both layers agree — fuse confidence and mark ast-grep edge for removal.
658                    superseded_ids.push(ast_edge_id.to_string());
659                    let mut fused = scip_edge.clone();
660                    let scip_conf = scip_edge
661                        .properties
662                        .get("confidence")
663                        .and_then(|v| v.as_f64())
664                        .unwrap_or(0.15);
665                    let fused_conf = scip_conf + Self::AST_GREP_BASE_CONFIDENCE;
666                    fused
667                        .properties
668                        .insert("confidence".to_string(), serde_json::json!(fused_conf));
669                    fused.properties.insert(
670                        "source_layers".to_string(),
671                        serde_json::json!(["ast-grep", "scip"]),
672                    );
673                    fused
674                } else {
675                    scip_edge.clone()
676                }
677            })
678            .collect();
679
680        (fused, superseded_ids)
681    }
682
683    /// Build chunk graph nodes and file→chunk / symbol→chunk CONTAINS edges.
684    fn build_chunk_nodes(
685        chunks: &[CodeChunk],
686        ns_string: &Option<String>,
687        contains_weight: f64,
688        now: chrono::DateTime<chrono::Utc>,
689    ) -> (Vec<GraphNode>, Vec<Edge>) {
690        let mut chunk_nodes = Vec::with_capacity(chunks.len());
691        let mut chunk_edges = Vec::with_capacity(chunks.len() * 2);
692
693        for chunk in chunks {
694            let chunk_id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
695
696            let mut payload = HashMap::new();
697            payload.insert(
698                "file_path".to_string(),
699                serde_json::Value::String(chunk.file_path.clone()),
700            );
701            payload.insert(
702                "line_start".to_string(),
703                serde_json::json!(chunk.line_start),
704            );
705            payload.insert("line_end".to_string(), serde_json::json!(chunk.line_end));
706            payload.insert(
707                "node_kind".to_string(),
708                serde_json::Value::String(chunk.node_kind.clone()),
709            );
710            payload.insert(
711                "non_ws_chars".to_string(),
712                serde_json::json!(chunk.non_ws_chars),
713            );
714            if let Some(ref parent) = chunk.parent_symbol {
715                payload.insert(
716                    "parent_symbol".to_string(),
717                    serde_json::Value::String(parent.clone()),
718                );
719            }
720
721            chunk_nodes.push(GraphNode {
722                id: chunk_id.clone(),
723                kind: NodeKind::Chunk,
724                label: format!(
725                    "chunk:{}:{}..{}",
726                    chunk.file_path, chunk.line_start, chunk.line_end
727                ),
728                payload,
729                centrality: 0.0,
730                memory_id: None,
731                namespace: ns_string.clone(),
732            });
733
734            let file_node_id = format!("file:{}", chunk.file_path);
735            chunk_edges.push(Edge {
736                id: format!("contains:{file_node_id}->{chunk_id}"),
737                src: file_node_id,
738                dst: chunk_id.clone(),
739                relationship: RelationshipType::Contains,
740                weight: contains_weight,
741                valid_from: Some(now),
742                valid_to: None,
743                properties: HashMap::new(),
744                created_at: now,
745            });
746
747            if let Some(ref parent_sym) = chunk.parent_symbol {
748                let parent_node_id = format!("sym:{parent_sym}");
749                chunk_edges.push(Edge {
750                    id: format!("contains:{parent_node_id}->{chunk_id}"),
751                    src: parent_node_id,
752                    dst: chunk_id,
753                    relationship: RelationshipType::Contains,
754                    weight: contains_weight,
755                    valid_from: Some(now),
756                    valid_to: None,
757                    properties: HashMap::new(),
758                    created_at: now,
759                });
760            }
761        }
762
763        (chunk_nodes, chunk_edges)
764    }
765
766    // ── Embedding Persistence ────────────────────────────────────────────
767
768    /// Embed symbols and chunks, persisting embeddings to SQLite and the
769    /// vector index in batches with progress reporting.
770    ///
771    /// Returns (symbols_embedded, chunks_embedded).
772    fn embed_and_persist(
773        &self,
774        symbols: &[Symbol],
775        chunks: &[CodeChunk],
776        edges: &[ResolvedEdge],
777        on_progress: impl Fn(usize, usize),
778    ) -> Result<(usize, usize), CodememError> {
779        let mut symbols_embedded = 0usize;
780        let mut chunks_embedded = 0usize;
781
782        // Quick check: skip expensive text enrichment if embedding provider isn't loaded.
783        // This avoids triggering lazy init during lightweight operations (hooks).
784        if !self.embeddings_ready() {
785            return Ok((0, 0));
786        }
787
788        // Phase 1: Collect enriched texts without holding any lock.
789        let sym_texts: Vec<(String, String)> = symbols
790            .iter()
791            .map(|sym| {
792                let id = format!("sym:{}", sym.qualified_name);
793                let text = self.enrich_symbol_text(sym, edges);
794                (id, text)
795            })
796            .collect();
797        let chunk_texts: Vec<(String, String)> = chunks
798            .iter()
799            .map(|chunk| {
800                let id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
801                let text = self.enrich_chunk_text(chunk);
802                (id, text)
803            })
804            .collect();
805
806        // Phase 2+3: Embed in batches and persist progressively.
807        let embed_batch_size = self.config.embedding.batch_size;
808
809        let all_pairs: Vec<(String, String)> = sym_texts.into_iter().chain(chunk_texts).collect();
810        let total = all_pairs.len();
811        let sym_count = symbols.len();
812        let mut done = 0usize;
813
814        for batch in all_pairs.chunks(embed_batch_size) {
815            let texts: Vec<&str> = batch.iter().map(|(_, t)| t.as_str()).collect();
816
817            let t0 = std::time::Instant::now();
818            let embed_result = {
819                let emb = self.lock_embeddings()?;
820                match emb {
821                    Some(emb_guard) => emb_guard.embed_batch(&texts),
822                    None => break,
823                }
824            };
825
826            match embed_result {
827                Ok(embeddings) => {
828                    let embed_ms = t0.elapsed().as_millis();
829
830                    let t1 = std::time::Instant::now();
831                    let pairs: Vec<(&str, &[f32])> = batch
832                        .iter()
833                        .zip(embeddings.iter())
834                        .map(|((id, _), emb_vec)| (id.as_str(), emb_vec.as_slice()))
835                        .collect();
836                    if let Err(e) = self.storage.store_embeddings_batch(&pairs) {
837                        tracing::warn!("Failed to batch-store embeddings: {e}");
838                    }
839                    let sqlite_ms = t1.elapsed().as_millis();
840
841                    let t2 = std::time::Instant::now();
842                    let batch_items: Vec<(String, Vec<f32>)> = batch
843                        .iter()
844                        .zip(embeddings.into_iter())
845                        .map(|((id, _), emb_vec)| (id.clone(), emb_vec))
846                        .collect();
847                    let batch_len = batch_items.len();
848                    {
849                        let mut vec = self.lock_vector()?;
850                        if let Err(e) = vec.insert_batch(&batch_items) {
851                            tracing::warn!("Failed to batch-insert into vector index: {e}");
852                        }
853                    }
854                    let vector_ms = t2.elapsed().as_millis();
855
856                    let syms_in_batch = batch_len.min(sym_count.saturating_sub(done));
857                    symbols_embedded += syms_in_batch;
858                    chunks_embedded += batch_len - syms_in_batch;
859                    done += batch_len;
860
861                    tracing::debug!(
862                        "Embed batch {}: embed={embed_ms}ms sqlite={sqlite_ms}ms vector={vector_ms}ms",
863                        batch_len
864                    );
865                }
866                Err(e) => {
867                    tracing::warn!("embed_batch failed for chunk of {} texts: {e}", batch.len());
868                }
869            }
870            on_progress(done, total);
871        }
872        self.save_index();
873
874        Ok((symbols_embedded, chunks_embedded))
875    }
876}