Skip to main content

codemem_engine/persistence/
mod.rs

1//! Graph persistence: persist indexing results (file/package/symbol/chunk nodes,
2//! edges, embeddings, compaction) into the storage and graph backends.
3
4mod compaction;
5pub mod cross_repo;
6
7use crate::index::{CodeChunk, ResolvedEdge, Symbol};
8use crate::IndexAndResolveResult;
9use codemem_core::{
10    CodememError, Edge, GraphBackend, GraphConfig, GraphNode, NodeKind, RelationshipType,
11    VectorBackend,
12};
13use std::collections::{HashMap, HashSet};
14
/// Counts of what was persisted by `persist_index_results`.
///
/// Every field is a plain tally, so the struct can be cloned and compared
/// directly (handy for assertions in tests and for diffing two runs).
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct IndexPersistResult {
    /// Number of file paths in the indexing result (one file node is written per path).
    pub files_created: usize,
    /// Number of distinct package (directory) nodes built from those file paths.
    pub packages_created: usize,
    /// Number of symbols in the indexing result.
    pub symbols_stored: usize,
    /// Number of chunk nodes built from the indexing result.
    pub chunks_stored: usize,
    /// Number of resolved reference edges in the indexing result.
    pub edges_resolved: usize,
    /// Symbols embedded; always 0 in graph-only mode.
    pub symbols_embedded: usize,
    /// Chunks embedded; always 0 in graph-only mode.
    pub chunks_embedded: usize,
    /// Chunks removed by compaction; 0 when auto-compaction is disabled.
    pub chunks_pruned: usize,
    /// Symbols removed by compaction; 0 when auto-compaction is disabled.
    pub symbols_pruned: usize,
}
28
/// Counts of what was persisted by `persist_cross_repo_data`.
///
/// All fields are plain tallies, so the struct can be cloned and compared
/// directly. The code that fills them lives outside this module file; the
/// field names describe what each tally counts.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct CrossRepoPersistResult {
    pub packages_registered: usize,
    pub unresolved_refs_stored: usize,
    pub forward_edges_created: usize,
    pub backward_edges_created: usize,
    pub endpoints_detected: usize,
    pub client_calls_detected: usize,
}
39
40/// Return the edge weight for a given relationship type, using config overrides
41/// for the three most common types (Contains, Calls, Imports).
42pub fn edge_weight_for(rel: &RelationshipType, config: &GraphConfig) -> f64 {
43    match rel {
44        RelationshipType::Calls => config.calls_edge_weight,
45        RelationshipType::Imports => config.imports_edge_weight,
46        RelationshipType::Contains => config.contains_edge_weight,
47        RelationshipType::TypeDefinition => config.type_definition_edge_weight,
48        RelationshipType::Reads => config.reads_edge_weight,
49        RelationshipType::Writes => config.writes_edge_weight,
50        RelationshipType::Overrides => config.overrides_edge_weight,
51        RelationshipType::Implements | RelationshipType::Inherits => 0.8,
52        RelationshipType::DependsOn => 0.7,
53        RelationshipType::CoChanged => 0.6,
54        RelationshipType::EvolvedInto | RelationshipType::Summarizes => 0.7,
55        RelationshipType::PartOf => 0.4,
56        RelationshipType::RelatesTo | RelationshipType::SharesTheme => 0.3,
57        _ => 0.5,
58    }
59}
60
/// Intermediate counts from graph node persistence (before embedding).
///
/// Derives `Debug`/`Default` like the public result structs so the counts can
/// be logged and zero-initialised.
#[derive(Debug, Default)]
struct GraphPersistCounts {
    /// Number of distinct package (directory) nodes built for this batch.
    packages_created: usize,
    /// Number of chunk nodes built for this batch.
    chunks_stored: usize,
}
66
67impl super::CodememEngine {
68    /// Persist all indexing results (file nodes, package tree, symbol nodes, chunk nodes,
69    /// edges, embeddings, compaction) into storage and the in-memory graph.
70    ///
71    /// This is the full persistence pipeline called after `Indexer::index_and_resolve()`.
72    pub fn persist_index_results(
73        &self,
74        results: &IndexAndResolveResult,
75        namespace: Option<&str>,
76    ) -> Result<IndexPersistResult, CodememError> {
77        self.persist_index_results_with_progress(results, namespace, |_, _| {})
78    }
79
80    /// Like `persist_index_results`, but skips the embedding phase entirely.
81    /// Stores graph nodes, edges, and chunks without vectorizing them.
82    /// Also skips cross-repo linking — this is a fast graph-only mode intended
83    /// for rapid iteration (e.g., `--skip-embed`). Run a full `analyze` to
84    /// populate cross-repo data.
85    pub fn persist_graph_only(
86        &self,
87        results: &IndexAndResolveResult,
88        namespace: Option<&str>,
89    ) -> Result<IndexPersistResult, CodememError> {
90        let seen_files = &results.file_paths;
91        let graph_counts = self.persist_graph_nodes(results, namespace)?;
92
93        let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
94            self.compact_graph(seen_files)
95        } else {
96            (0, 0)
97        };
98
99        Ok(IndexPersistResult {
100            files_created: seen_files.len(),
101            packages_created: graph_counts.packages_created,
102            symbols_stored: results.symbols.len(),
103            chunks_stored: graph_counts.chunks_stored,
104            edges_resolved: results.edges.len(),
105            symbols_embedded: 0,
106            chunks_embedded: 0,
107            chunks_pruned,
108            symbols_pruned,
109        })
110    }
111
112    /// Like `persist_index_results`, but calls `on_progress(done, total)` during
113    /// the embedding phase so callers can display progress.
114    pub fn persist_index_results_with_progress(
115        &self,
116        results: &IndexAndResolveResult,
117        namespace: Option<&str>,
118        on_progress: impl Fn(usize, usize),
119    ) -> Result<IndexPersistResult, CodememError> {
120        let seen_files = &results.file_paths;
121
122        // 1. Persist all graph nodes and edges
123        let graph_counts = self.persist_graph_nodes(results, namespace)?;
124
125        // 2. Embed symbols and chunks
126        let (symbols_embedded, chunks_embedded) = self.embed_and_persist(
127            &results.symbols,
128            &results.chunks,
129            &results.edges,
130            on_progress,
131        )?;
132
133        // 3. Auto-compact
134        let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
135            self.compact_graph(seen_files)
136        } else {
137            (0, 0)
138        };
139
140        Ok(IndexPersistResult {
141            files_created: seen_files.len(),
142            packages_created: graph_counts.packages_created,
143            symbols_stored: results.symbols.len(),
144            chunks_stored: graph_counts.chunks_stored,
145            edges_resolved: results.edges.len(),
146            symbols_embedded,
147            chunks_embedded,
148            chunks_pruned,
149            symbols_pruned,
150        })
151    }
152
153    // ── Graph Node Persistence ───────────────────────────────────────────
154
155    /// Persist file, package, symbol, chunk nodes and all edges into storage
156    /// and the in-memory graph. Returns counts for the result struct.
157    fn persist_graph_nodes(
158        &self,
159        results: &IndexAndResolveResult,
160        namespace: Option<&str>,
161    ) -> Result<GraphPersistCounts, CodememError> {
162        let all_symbols = &results.symbols;
163        let all_chunks = &results.chunks;
164        let seen_files = &results.file_paths;
165        let edges = &results.edges;
166
167        let now = chrono::Utc::now();
168        let ns_string = namespace.map(|s| s.to_string());
169        let contains_weight = edge_weight_for(&RelationshipType::Contains, &self.config.graph);
170
171        let mut graph = self.lock_graph()?;
172
173        // ── File nodes
174        let file_nodes: Vec<GraphNode> = seen_files
175            .iter()
176            .map(|file_path| {
177                let mut payload = HashMap::new();
178                payload.insert(
179                    "file_path".to_string(),
180                    serde_json::Value::String(file_path.clone()),
181                );
182                GraphNode {
183                    id: format!("file:{file_path}"),
184                    kind: NodeKind::File,
185                    label: file_path.clone(),
186                    payload,
187                    centrality: 0.0,
188                    memory_id: None,
189                    namespace: ns_string.clone(),
190                }
191            })
192            .collect();
193        self.persist_nodes_to_storage_and_graph(&file_nodes, &mut graph);
194
195        // ── Package (directory) nodes
196        let (dir_nodes, dir_edges, created_dirs) =
197            self.build_package_tree(seen_files, &ns_string, contains_weight, now, &graph);
198        self.persist_nodes_to_storage_and_graph(&dir_nodes, &mut graph);
199        self.persist_edges_to_storage_and_graph(&dir_edges, &mut graph);
200
201        // ── Symbol nodes + file→symbol edges
202        let (sym_nodes, sym_edges) =
203            Self::build_symbol_nodes(all_symbols, &ns_string, contains_weight, now);
204
205        // Clean up stale symbols: single pass over in-memory graph to collect
206        // existing symbols grouped by file, then diff against new parse results.
207        //
208        // Lock protocol: We collect old symbols while holding the graph lock,
209        // then drop it so `cleanup_stale_symbols` can acquire graph + vector
210        // locks internally. The re-acquire below is safe: cleanup only removes
211        // stale nodes that won't conflict with the inserts that follow.
212        let mut old_syms_by_file: HashMap<String, HashSet<String>> = HashMap::new();
213        for node in graph.get_all_nodes() {
214            if !node.id.starts_with("sym:") {
215                continue;
216            }
217            // Skip SCIP-sourced symbols (explicit and synthetic containment nodes)
218            // — they're managed by the SCIP pipeline, not ast-grep. Without this
219            // guard, re-indexing deletes all SCIP sym: nodes because their IDs
220            // don't match ast-grep's qualified names.
221            if matches!(
222                node.payload.get("source").and_then(|v| v.as_str()),
223                Some("scip" | "scip-synthetic")
224            ) {
225                continue;
226            }
227            let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) else {
228                continue;
229            };
230            if !seen_files.contains(fp) {
231                continue;
232            }
233            old_syms_by_file
234                .entry(fp.to_string())
235                .or_default()
236                .insert(node.id);
237        }
238        drop(graph);
239        for file_path in seen_files {
240            let new_sym_ids: HashSet<String> = sym_nodes
241                .iter()
242                .filter(|n| {
243                    n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path.as_str())
244                })
245                .map(|n| n.id.clone())
246                .collect();
247            let empty = HashSet::new();
248            let old_sym_ids = old_syms_by_file.get(file_path).unwrap_or(&empty);
249            if let Err(e) = self.cleanup_stale_symbols(file_path, old_sym_ids, &new_sym_ids) {
250                tracing::warn!("Failed to cleanup stale symbols for {file_path}: {e}");
251            }
252        }
253        let mut graph = self.lock_graph()?; // Re-acquire lock
254
255        self.persist_nodes_to_storage_and_graph(&sym_nodes, &mut graph);
256        self.persist_edges_to_storage_and_graph(&sym_edges, &mut graph);
257
258        // ── Resolved reference edges
259        let ref_edges = Self::build_reference_edges(edges, &self.config.graph, now);
260        self.persist_edges_to_storage_and_graph(&ref_edges, &mut graph);
261
262        // ── SCIP nodes + edges (compiler-grade)
263        if let Some(ref scip_build) = results.scip_build {
264            // Clean up stale SCIP nodes: collect existing SCIP-sourced sym: nodes
265            // for files covered by this SCIP run, then remove any not in the new set.
266            let new_scip_ids: HashSet<&str> =
267                scip_build.nodes.iter().map(|n| n.id.as_str()).collect();
268            let mut stale_scip_ids = Vec::new();
269            for node in graph.get_all_nodes() {
270                if !node.id.starts_with("sym:") {
271                    continue;
272                }
273                if !matches!(
274                    node.payload.get("source").and_then(|v| v.as_str()),
275                    Some("scip" | "scip-synthetic")
276                ) {
277                    continue;
278                }
279                if !new_scip_ids.contains(node.id.as_str()) {
280                    // Only clean up nodes in files that SCIP covered this run.
281                    if let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) {
282                        if seen_files.contains(fp) {
283                            stale_scip_ids.push(node.id.clone());
284                        }
285                    }
286                }
287            }
288            for stale_id in &stale_scip_ids {
289                let _ = graph.remove_node(stale_id);
290                let _ = self.storage.delete_graph_nodes_by_prefix(stale_id);
291                // Clean up orphan doc memories for removed symbols.
292                if let Some(qname) = stale_id.strip_prefix("sym:") {
293                    let doc_id = format!("scip-doc:{qname}");
294                    let _ = self.storage.delete_memory(&doc_id);
295                }
296            }
297            if !stale_scip_ids.is_empty() {
298                tracing::info!(
299                    "Cleaned up {} stale SCIP nodes from re-index",
300                    stale_scip_ids.len()
301                );
302            }
303
304            self.persist_nodes_to_storage_and_graph(&scip_build.nodes, &mut graph);
305
306            // Multi-layer fusion: merge confidence when ast-grep and SCIP agree.
307            // Superseded ast-grep edges are removed to avoid duplicates.
308            let (fused_edges, superseded_ids) = Self::fuse_edges(&ref_edges, &scip_build.edges);
309
310            // Remove the low-confidence ast-grep edges that were fused into SCIP edges.
311            for edge_id in &superseded_ids {
312                let _ = graph.remove_edge(edge_id);
313                let _ = self.storage.delete_graph_edge(edge_id);
314            }
315
316            self.persist_edges_to_storage_and_graph(&fused_edges, &mut graph);
317
318            // Persist hover doc memories and their RELATES_TO edges.
319            for (memory, related_node_id) in &scip_build.memories {
320                let _ = self.storage.insert_memory(memory);
321                let relates_edge = Edge {
322                    id: format!("relates:{}->mem:{}", related_node_id, memory.id),
323                    src: related_node_id.clone(),
324                    dst: format!("mem:{}", memory.id),
325                    relationship: RelationshipType::RelatesTo,
326                    weight: 0.3,
327                    properties: HashMap::new(),
328                    created_at: now,
329                    valid_from: Some(now),
330                    valid_to: None,
331                };
332                let _ = graph.add_edge(relates_edge.clone());
333                let _ = self.storage.insert_graph_edges_batch(&[relates_edge]);
334            }
335        }
336
337        // ── Chunk nodes + file→chunk / symbol→chunk edges
338        for file_path in seen_files {
339            let prefix = format!("chunk:{file_path}:");
340            let _ = self.storage.delete_graph_nodes_by_prefix(&prefix);
341        }
342        let (chunk_nodes, chunk_edges) =
343            Self::build_chunk_nodes(all_chunks, &ns_string, contains_weight, now);
344        let chunk_count = chunk_nodes.len();
345        self.persist_nodes_to_storage_and_graph(&chunk_nodes, &mut graph);
346        self.persist_edges_to_storage_and_graph(&chunk_edges, &mut graph);
347
348        drop(graph);
349
350        Ok(GraphPersistCounts {
351            packages_created: created_dirs,
352            chunks_stored: chunk_count,
353        })
354    }
355
356    /// Batch-insert nodes into both SQLite and the in-memory graph.
357    fn persist_nodes_to_storage_and_graph(
358        &self,
359        nodes: &[GraphNode],
360        graph: &mut crate::GraphEngine,
361    ) {
362        if let Err(e) = self.storage.insert_graph_nodes_batch(nodes) {
363            tracing::warn!("Failed to batch-insert {} graph nodes: {e}", nodes.len());
364        }
365        for node in nodes {
366            let _ = graph.add_node(node.clone());
367        }
368    }
369
370    /// Batch-insert edges into both SQLite and the in-memory graph.
371    fn persist_edges_to_storage_and_graph(&self, edges: &[Edge], graph: &mut crate::GraphEngine) {
372        if let Err(e) = self.storage.insert_graph_edges_batch(edges) {
373            tracing::warn!("Failed to batch-insert {} graph edges: {e}", edges.len());
374        }
375        for edge in edges {
376            let _ = graph.add_edge(edge.clone());
377        }
378    }
379
380    /// Build directory/package nodes and CONTAINS edges from file paths.
381    /// Returns (nodes, edges, number_of_dirs_created).
382    fn build_package_tree(
383        &self,
384        seen_files: &HashSet<String>,
385        ns_string: &Option<String>,
386        contains_weight: f64,
387        now: chrono::DateTime<chrono::Utc>,
388        graph: &crate::GraphEngine,
389    ) -> (Vec<GraphNode>, Vec<Edge>, usize) {
390        let mut created_dirs: HashSet<String> = HashSet::new();
391        let mut dir_nodes = Vec::new();
392        let mut dir_edges = Vec::new();
393
394        for file_path in seen_files {
395            let p = std::path::Path::new(file_path);
396            let mut ancestors: Vec<String> = Vec::new();
397            let mut current = p.parent();
398            while let Some(dir) = current {
399                let dir_str = dir.to_string_lossy().to_string();
400                if dir_str.is_empty() || dir_str == "." {
401                    break;
402                }
403                ancestors.push(dir_str);
404                current = dir.parent();
405            }
406            ancestors.reverse();
407            for (i, dir_str) in ancestors.iter().enumerate() {
408                let pkg_id = format!("pkg:{dir_str}/");
409                if created_dirs.insert(pkg_id.clone()) {
410                    dir_nodes.push(GraphNode {
411                        id: pkg_id.clone(),
412                        kind: NodeKind::Package,
413                        label: format!("{dir_str}/"),
414                        payload: HashMap::new(),
415                        centrality: 0.0,
416                        memory_id: None,
417                        namespace: ns_string.clone(),
418                    });
419                }
420                if i == 0 {
421                    continue;
422                }
423                let parent_pkg_id = format!("pkg:{}/", ancestors[i - 1]);
424                let edge_id = format!("contains:{parent_pkg_id}->{pkg_id}");
425                if graph
426                    .get_edges(&parent_pkg_id)
427                    .unwrap_or_default()
428                    .iter()
429                    .any(|e| e.id == edge_id)
430                {
431                    continue;
432                }
433                dir_edges.push(Edge {
434                    id: edge_id,
435                    src: parent_pkg_id,
436                    dst: pkg_id.clone(),
437                    relationship: RelationshipType::Contains,
438                    weight: contains_weight,
439                    valid_from: Some(now),
440                    valid_to: None,
441                    properties: HashMap::new(),
442                    created_at: now,
443                });
444            }
445            if let Some(last_dir) = ancestors.last() {
446                let parent_pkg_id = format!("pkg:{last_dir}/");
447                let file_node_id = format!("file:{file_path}");
448                let edge_id = format!("contains:{parent_pkg_id}->{file_node_id}");
449                dir_edges.push(Edge {
450                    id: edge_id,
451                    src: parent_pkg_id,
452                    dst: file_node_id,
453                    relationship: RelationshipType::Contains,
454                    weight: contains_weight,
455                    valid_from: Some(now),
456                    valid_to: None,
457                    properties: HashMap::new(),
458                    created_at: now,
459                });
460            }
461        }
462
463        let count = created_dirs.len();
464        (dir_nodes, dir_edges, count)
465    }
466
467    /// Build symbol graph nodes and file→symbol CONTAINS edges.
468    fn build_symbol_nodes(
469        symbols: &[Symbol],
470        ns_string: &Option<String>,
471        contains_weight: f64,
472        now: chrono::DateTime<chrono::Utc>,
473    ) -> (Vec<GraphNode>, Vec<Edge>) {
474        let mut sym_nodes = Vec::with_capacity(symbols.len());
475        let mut sym_edges = Vec::with_capacity(symbols.len());
476
477        for sym in symbols {
478            let kind = NodeKind::from(sym.kind);
479            let payload = Self::build_symbol_payload(sym);
480
481            let sym_node_id = format!("sym:{}", sym.qualified_name);
482            sym_nodes.push(GraphNode {
483                id: sym_node_id.clone(),
484                kind,
485                label: sym.qualified_name.clone(),
486                payload,
487                centrality: 0.0,
488                memory_id: None,
489                namespace: ns_string.clone(),
490            });
491
492            let file_node_id = format!("file:{}", sym.file_path);
493            sym_edges.push(Edge {
494                id: format!("contains:{file_node_id}->{sym_node_id}"),
495                src: file_node_id,
496                dst: sym_node_id,
497                relationship: RelationshipType::Contains,
498                weight: contains_weight,
499                valid_from: Some(now),
500                valid_to: None,
501                properties: HashMap::new(),
502                created_at: now,
503            });
504        }
505
506        (sym_nodes, sym_edges)
507    }
508
509    /// Build the payload HashMap for a symbol's graph node.
510    fn build_symbol_payload(sym: &Symbol) -> HashMap<String, serde_json::Value> {
511        let mut payload = HashMap::new();
512        payload.insert(
513            "symbol_kind".to_string(),
514            serde_json::Value::String(sym.kind.to_string()),
515        );
516        payload.insert(
517            "signature".to_string(),
518            serde_json::Value::String(sym.signature.clone()),
519        );
520        payload.insert(
521            "file_path".to_string(),
522            serde_json::Value::String(sym.file_path.clone()),
523        );
524        payload.insert("line_start".to_string(), serde_json::json!(sym.line_start));
525        payload.insert("line_end".to_string(), serde_json::json!(sym.line_end));
526        payload.insert(
527            "visibility".to_string(),
528            serde_json::Value::String(sym.visibility.to_string()),
529        );
530        if let Some(ref doc) = sym.doc_comment {
531            payload.insert(
532                "doc_comment".to_string(),
533                serde_json::Value::String(doc.clone()),
534            );
535        }
536        if !sym.parameters.is_empty() {
537            payload.insert(
538                "parameters".to_string(),
539                serde_json::to_value(&sym.parameters).unwrap_or_default(),
540            );
541        }
542        if let Some(ref ret) = sym.return_type {
543            payload.insert(
544                "return_type".to_string(),
545                serde_json::Value::String(ret.clone()),
546            );
547        }
548        if sym.is_async {
549            payload.insert("is_async".to_string(), serde_json::json!(true));
550        }
551        if !sym.attributes.is_empty() {
552            payload.insert(
553                "attributes".to_string(),
554                serde_json::to_value(&sym.attributes).unwrap_or_default(),
555            );
556        }
557        if !sym.throws.is_empty() {
558            payload.insert(
559                "throws".to_string(),
560                serde_json::to_value(&sym.throws).unwrap_or_default(),
561            );
562        }
563        if let Some(ref gp) = sym.generic_params {
564            payload.insert(
565                "generic_params".to_string(),
566                serde_json::Value::String(gp.clone()),
567            );
568        }
569        if sym.is_abstract {
570            payload.insert("is_abstract".to_string(), serde_json::json!(true));
571        }
572        if let Some(ref parent) = sym.parent {
573            payload.insert(
574                "parent".to_string(),
575                serde_json::Value::String(parent.clone()),
576            );
577        }
578        payload
579    }
580
581    /// Build edges from resolved cross-file references.
582    /// ast-grep base confidence for multi-layer fusion.
583    const AST_GREP_BASE_CONFIDENCE: f64 = 0.10;
584
585    fn build_reference_edges(
586        edges: &[ResolvedEdge],
587        graph_config: &GraphConfig,
588        now: chrono::DateTime<chrono::Utc>,
589    ) -> Vec<Edge> {
590        edges
591            .iter()
592            .map(|edge| {
593                let mut properties = HashMap::new();
594                properties.insert("source".to_string(), serde_json::json!("ast-grep"));
595                properties.insert(
596                    "confidence".to_string(),
597                    serde_json::json!(Self::AST_GREP_BASE_CONFIDENCE),
598                );
599                properties.insert("source_layers".to_string(), serde_json::json!(["ast-grep"]));
600                Edge {
601                    id: format!(
602                        "ref:{}->{}:{}",
603                        edge.source_qualified_name, edge.target_qualified_name, edge.relationship
604                    ),
605                    src: format!("sym:{}", edge.source_qualified_name),
606                    dst: format!("sym:{}", edge.target_qualified_name),
607                    relationship: edge.relationship,
608                    weight: edge_weight_for(&edge.relationship, graph_config),
609                    valid_from: Some(now),
610                    valid_to: None,
611                    properties,
612                    created_at: now,
613                }
614            })
615            .collect()
616    }
617
618    /// Multi-layer edge fusion: when ast-grep and SCIP produce the same edge
619    /// (same src, dst, relationship), sum their confidences and merge source_layers.
620    /// SCIP edges not in ast-grep pass through unchanged.
621    ///
622    /// Returns `(fused_scip_edges, superseded_ast_grep_edge_ids)`. The caller must
623    /// remove the superseded ast-grep edges to avoid duplicates in the graph.
624    fn fuse_edges(ast_grep_edges: &[Edge], scip_edges: &[Edge]) -> (Vec<Edge>, Vec<String>) {
625        // Index ast-grep edges by (src, dst, relationship_str) → edge ID for O(1) lookup.
626        let ast_grep_index: HashMap<(String, String, String), &str> = ast_grep_edges
627            .iter()
628            .map(|e| {
629                (
630                    (e.src.clone(), e.dst.clone(), e.relationship.to_string()),
631                    e.id.as_str(),
632                )
633            })
634            .collect();
635
636        let mut superseded_ids = Vec::new();
637
638        let fused = scip_edges
639            .iter()
640            .map(|scip_edge| {
641                let key = (
642                    scip_edge.src.clone(),
643                    scip_edge.dst.clone(),
644                    scip_edge.relationship.to_string(),
645                );
646                if let Some(&ast_edge_id) = ast_grep_index.get(&key) {
647                    // Both layers agree — fuse confidence and mark ast-grep edge for removal.
648                    superseded_ids.push(ast_edge_id.to_string());
649                    let mut fused = scip_edge.clone();
650                    let scip_conf = scip_edge
651                        .properties
652                        .get("confidence")
653                        .and_then(|v| v.as_f64())
654                        .unwrap_or(0.15);
655                    let fused_conf = scip_conf + Self::AST_GREP_BASE_CONFIDENCE;
656                    fused
657                        .properties
658                        .insert("confidence".to_string(), serde_json::json!(fused_conf));
659                    fused.properties.insert(
660                        "source_layers".to_string(),
661                        serde_json::json!(["ast-grep", "scip"]),
662                    );
663                    fused
664                } else {
665                    scip_edge.clone()
666                }
667            })
668            .collect();
669
670        (fused, superseded_ids)
671    }
672
673    /// Build chunk graph nodes and file→chunk / symbol→chunk CONTAINS edges.
674    fn build_chunk_nodes(
675        chunks: &[CodeChunk],
676        ns_string: &Option<String>,
677        contains_weight: f64,
678        now: chrono::DateTime<chrono::Utc>,
679    ) -> (Vec<GraphNode>, Vec<Edge>) {
680        let mut chunk_nodes = Vec::with_capacity(chunks.len());
681        let mut chunk_edges = Vec::with_capacity(chunks.len() * 2);
682
683        for chunk in chunks {
684            let chunk_id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
685
686            let mut payload = HashMap::new();
687            payload.insert(
688                "file_path".to_string(),
689                serde_json::Value::String(chunk.file_path.clone()),
690            );
691            payload.insert(
692                "line_start".to_string(),
693                serde_json::json!(chunk.line_start),
694            );
695            payload.insert("line_end".to_string(), serde_json::json!(chunk.line_end));
696            payload.insert(
697                "node_kind".to_string(),
698                serde_json::Value::String(chunk.node_kind.clone()),
699            );
700            payload.insert(
701                "non_ws_chars".to_string(),
702                serde_json::json!(chunk.non_ws_chars),
703            );
704            if let Some(ref parent) = chunk.parent_symbol {
705                payload.insert(
706                    "parent_symbol".to_string(),
707                    serde_json::Value::String(parent.clone()),
708                );
709            }
710
711            chunk_nodes.push(GraphNode {
712                id: chunk_id.clone(),
713                kind: NodeKind::Chunk,
714                label: format!(
715                    "chunk:{}:{}..{}",
716                    chunk.file_path, chunk.line_start, chunk.line_end
717                ),
718                payload,
719                centrality: 0.0,
720                memory_id: None,
721                namespace: ns_string.clone(),
722            });
723
724            let file_node_id = format!("file:{}", chunk.file_path);
725            chunk_edges.push(Edge {
726                id: format!("contains:{file_node_id}->{chunk_id}"),
727                src: file_node_id,
728                dst: chunk_id.clone(),
729                relationship: RelationshipType::Contains,
730                weight: contains_weight,
731                valid_from: Some(now),
732                valid_to: None,
733                properties: HashMap::new(),
734                created_at: now,
735            });
736
737            if let Some(ref parent_sym) = chunk.parent_symbol {
738                let parent_node_id = format!("sym:{parent_sym}");
739                chunk_edges.push(Edge {
740                    id: format!("contains:{parent_node_id}->{chunk_id}"),
741                    src: parent_node_id,
742                    dst: chunk_id,
743                    relationship: RelationshipType::Contains,
744                    weight: contains_weight,
745                    valid_from: Some(now),
746                    valid_to: None,
747                    properties: HashMap::new(),
748                    created_at: now,
749                });
750            }
751        }
752
753        (chunk_nodes, chunk_edges)
754    }
755
756    // ── Embedding Persistence ────────────────────────────────────────────
757
758    /// Embed symbols and chunks, persisting embeddings to SQLite and the
759    /// vector index in batches with progress reporting.
760    ///
761    /// Returns (symbols_embedded, chunks_embedded).
762    fn embed_and_persist(
763        &self,
764        symbols: &[Symbol],
765        chunks: &[CodeChunk],
766        edges: &[ResolvedEdge],
767        on_progress: impl Fn(usize, usize),
768    ) -> Result<(usize, usize), CodememError> {
769        let mut symbols_embedded = 0usize;
770        let mut chunks_embedded = 0usize;
771
772        // Quick check: skip expensive text enrichment if embedding provider isn't loaded.
773        // This avoids triggering lazy init during lightweight operations (hooks).
774        if !self.embeddings_ready() {
775            return Ok((0, 0));
776        }
777
778        // Phase 1: Collect enriched texts without holding any lock.
779        let sym_texts: Vec<(String, String)> = symbols
780            .iter()
781            .map(|sym| {
782                let id = format!("sym:{}", sym.qualified_name);
783                let text = self.enrich_symbol_text(sym, edges);
784                (id, text)
785            })
786            .collect();
787        let chunk_texts: Vec<(String, String)> = chunks
788            .iter()
789            .map(|chunk| {
790                let id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
791                let text = self.enrich_chunk_text(chunk);
792                (id, text)
793            })
794            .collect();
795
796        // Phase 2+3: Embed in batches and persist progressively.
797        let embed_batch_size = self.config.embedding.batch_size;
798
799        let all_pairs: Vec<(String, String)> = sym_texts.into_iter().chain(chunk_texts).collect();
800        let total = all_pairs.len();
801        let sym_count = symbols.len();
802        let mut done = 0usize;
803
804        for batch in all_pairs.chunks(embed_batch_size) {
805            let texts: Vec<&str> = batch.iter().map(|(_, t)| t.as_str()).collect();
806
807            let t0 = std::time::Instant::now();
808            let embed_result = {
809                let emb = self.lock_embeddings()?;
810                match emb {
811                    Some(emb_guard) => emb_guard.embed_batch(&texts),
812                    None => break,
813                }
814            };
815
816            match embed_result {
817                Ok(embeddings) => {
818                    let embed_ms = t0.elapsed().as_millis();
819
820                    let t1 = std::time::Instant::now();
821                    let pairs: Vec<(&str, &[f32])> = batch
822                        .iter()
823                        .zip(embeddings.iter())
824                        .map(|((id, _), emb_vec)| (id.as_str(), emb_vec.as_slice()))
825                        .collect();
826                    if let Err(e) = self.storage.store_embeddings_batch(&pairs) {
827                        tracing::warn!("Failed to batch-store embeddings: {e}");
828                    }
829                    let sqlite_ms = t1.elapsed().as_millis();
830
831                    let t2 = std::time::Instant::now();
832                    let batch_items: Vec<(String, Vec<f32>)> = batch
833                        .iter()
834                        .zip(embeddings.into_iter())
835                        .map(|((id, _), emb_vec)| (id.clone(), emb_vec))
836                        .collect();
837                    let batch_len = batch_items.len();
838                    {
839                        let mut vec = self.lock_vector()?;
840                        if let Err(e) = vec.insert_batch(&batch_items) {
841                            tracing::warn!("Failed to batch-insert into vector index: {e}");
842                        }
843                    }
844                    let vector_ms = t2.elapsed().as_millis();
845
846                    let syms_in_batch = batch_len.min(sym_count.saturating_sub(done));
847                    symbols_embedded += syms_in_batch;
848                    chunks_embedded += batch_len - syms_in_batch;
849                    done += batch_len;
850
851                    tracing::debug!(
852                        "Embed batch {}: embed={embed_ms}ms sqlite={sqlite_ms}ms vector={vector_ms}ms",
853                        batch_len
854                    );
855                }
856                Err(e) => {
857                    tracing::warn!("embed_batch failed for chunk of {} texts: {e}", batch.len());
858                }
859            }
860            on_progress(done, total);
861        }
862        self.save_index();
863
864        Ok((symbols_embedded, chunks_embedded))
865    }
866}