Skip to main content

codemem_engine/persistence/
mod.rs

1//! Graph persistence: persist indexing results (file/package/symbol/chunk nodes,
2//! edges, embeddings, compaction) into the storage and graph backends.
3
4mod compaction;
5pub mod cross_repo;
6
7use crate::index::{CodeChunk, ResolvedEdge, Symbol};
8use crate::IndexAndResolveResult;
9use codemem_core::{CodememError, Edge, GraphConfig, GraphNode, NodeKind, RelationshipType};
10use std::collections::{HashMap, HashSet};
11
/// Counts of what was persisted by `persist_index_results`.
///
/// Also returned by `persist_graph_only`, in which case both embedding
/// counters are zero.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct IndexPersistResult {
    /// Number of file paths in the indexing results. This is the size of the
    /// processed file set, not the number of file nodes that were new.
    pub files_created: usize,
    /// Number of distinct package (directory) nodes derived from the file paths.
    pub packages_created: usize,
    /// Number of symbols in the indexing results.
    pub symbols_stored: usize,
    /// Number of chunk nodes built from the indexing results.
    pub chunks_stored: usize,
    /// Number of resolved reference edges in the indexing results.
    pub edges_resolved: usize,
    /// Symbols embedded during the embedding phase (0 when embedding is skipped).
    pub symbols_embedded: usize,
    /// Chunks embedded during the embedding phase (0 when embedding is skipped).
    pub chunks_embedded: usize,
    /// Chunks removed by auto-compaction (0 when auto-compaction is disabled).
    pub chunks_pruned: usize,
    /// Symbols removed by auto-compaction (0 when auto-compaction is disabled).
    pub symbols_pruned: usize,
}
25
/// Counts of what was persisted by `persist_cross_repo_data`.
///
/// All counters start at zero via `Default`; the type is cheap to clone and
/// supports equality so results can be compared directly in assertions.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct CrossRepoPersistResult {
    pub packages_registered: usize,
    pub unresolved_refs_stored: usize,
    pub forward_edges_created: usize,
    pub backward_edges_created: usize,
    pub endpoints_detected: usize,
    pub client_calls_detected: usize,
    pub spec_endpoints_detected: usize,
    pub event_channels_detected: usize,
    pub http_edges_matched: usize,
    pub event_edges_matched: usize,
}
40
41/// Return the edge weight for a given relationship type, using config overrides
42/// for the three most common types (Contains, Calls, Imports).
43pub fn edge_weight_for(rel: &RelationshipType, config: &GraphConfig) -> f64 {
44    match rel {
45        RelationshipType::Calls => config.calls_edge_weight,
46        RelationshipType::Imports => config.imports_edge_weight,
47        RelationshipType::Contains => config.contains_edge_weight,
48        RelationshipType::TypeDefinition => config.type_definition_edge_weight,
49        RelationshipType::Reads => config.reads_edge_weight,
50        RelationshipType::Writes => config.writes_edge_weight,
51        RelationshipType::Overrides => config.overrides_edge_weight,
52        RelationshipType::Implements | RelationshipType::Inherits => 0.8,
53        RelationshipType::DependsOn => 0.7,
54        RelationshipType::CoChanged => 0.6,
55        RelationshipType::EvolvedInto | RelationshipType::Summarizes => 0.7,
56        RelationshipType::PartOf => 0.4,
57        RelationshipType::RelatesTo | RelationshipType::SharesTheme => 0.3,
58        RelationshipType::HttpCalls => 0.7,
59        RelationshipType::PublishesTo | RelationshipType::SubscribesTo => 0.6,
60        RelationshipType::ModifiedBy => 0.4,
61        _ => 0.5,
62    }
63}
64
/// Intermediate counts from graph node persistence (before embedding).
///
/// Produced by `persist_graph_nodes` and copied into `IndexPersistResult`
/// by the public persistence entry points.
struct GraphPersistCounts {
    /// Number of distinct package (directory) nodes built from the file paths.
    packages_created: usize,
    /// Number of chunk nodes built from the indexing results.
    chunks_stored: usize,
}
70
71impl super::CodememEngine {
72    /// Persist all indexing results (file nodes, package tree, symbol nodes, chunk nodes,
73    /// edges, embeddings, compaction) into storage and the in-memory graph.
74    ///
75    /// This is the full persistence pipeline called after `Indexer::index_and_resolve()`.
76    pub fn persist_index_results(
77        &self,
78        results: &IndexAndResolveResult,
79        namespace: Option<&str>,
80    ) -> Result<IndexPersistResult, CodememError> {
81        self.persist_index_results_with_progress(results, namespace, |_, _| {})
82    }
83
84    /// Like `persist_index_results`, but skips the embedding phase entirely.
85    /// Stores graph nodes, edges, and chunks without vectorizing them.
86    /// Also skips cross-repo linking — this is a fast graph-only mode intended
87    /// for rapid iteration (e.g., `--skip-embed`). Run a full `analyze` to
88    /// populate cross-repo data.
89    pub fn persist_graph_only(
90        &self,
91        results: &IndexAndResolveResult,
92        namespace: Option<&str>,
93    ) -> Result<IndexPersistResult, CodememError> {
94        let seen_files = &results.file_paths;
95        let graph_counts = self.persist_graph_nodes(results, namespace)?;
96
97        let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
98            self.compact_graph(seen_files, namespace)
99        } else {
100            (0, 0)
101        };
102
103        Ok(IndexPersistResult {
104            files_created: seen_files.len(),
105            packages_created: graph_counts.packages_created,
106            symbols_stored: results.symbols.len(),
107            chunks_stored: graph_counts.chunks_stored,
108            edges_resolved: results.edges.len(),
109            symbols_embedded: 0,
110            chunks_embedded: 0,
111            chunks_pruned,
112            symbols_pruned,
113        })
114    }
115
116    /// Like `persist_index_results`, but calls `on_progress(done, total)` during
117    /// the embedding phase so callers can display progress.
118    pub fn persist_index_results_with_progress(
119        &self,
120        results: &IndexAndResolveResult,
121        namespace: Option<&str>,
122        on_progress: impl Fn(usize, usize),
123    ) -> Result<IndexPersistResult, CodememError> {
124        let seen_files = &results.file_paths;
125
126        // 1. Persist all graph nodes and edges
127        let graph_counts = self.persist_graph_nodes(results, namespace)?;
128
129        // 2. Embed symbols and chunks
130        let (symbols_embedded, chunks_embedded) = self.embed_and_persist(
131            &results.symbols,
132            &results.chunks,
133            &results.edges,
134            on_progress,
135        )?;
136
137        // 3. Auto-compact
138        let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
139            self.compact_graph(seen_files, namespace)
140        } else {
141            (0, 0)
142        };
143
144        Ok(IndexPersistResult {
145            files_created: seen_files.len(),
146            packages_created: graph_counts.packages_created,
147            symbols_stored: results.symbols.len(),
148            chunks_stored: graph_counts.chunks_stored,
149            edges_resolved: results.edges.len(),
150            symbols_embedded,
151            chunks_embedded,
152            chunks_pruned,
153            symbols_pruned,
154        })
155    }
156
157    // ── Graph Node Persistence ───────────────────────────────────────────
158
159    /// Persist file, package, symbol, chunk nodes and all edges into storage
160    /// and the in-memory graph. Returns counts for the result struct.
161    fn persist_graph_nodes(
162        &self,
163        results: &IndexAndResolveResult,
164        namespace: Option<&str>,
165    ) -> Result<GraphPersistCounts, CodememError> {
166        let all_symbols = &results.symbols;
167        let all_chunks = &results.chunks;
168        let seen_files = &results.file_paths;
169        let edges = &results.edges;
170
171        let now = chrono::Utc::now();
172        let ns_string = namespace.map(|s| s.to_string());
173        let contains_weight = edge_weight_for(&RelationshipType::Contains, &self.config.graph);
174
175        let mut graph = self.lock_graph()?;
176
177        // ── File nodes
178        let file_nodes: Vec<GraphNode> = seen_files
179            .iter()
180            .map(|file_path| {
181                let mut payload = HashMap::new();
182                payload.insert(
183                    "file_path".to_string(),
184                    serde_json::Value::String(file_path.clone()),
185                );
186                GraphNode {
187                    id: format!("file:{file_path}"),
188                    kind: NodeKind::File,
189                    label: file_path.clone(),
190                    payload,
191                    centrality: 0.0,
192                    memory_id: None,
193                    namespace: ns_string.clone(),
194                    valid_from: None,
195                    valid_to: None,
196                }
197            })
198            .collect();
199        self.persist_nodes_to_storage_and_graph(&file_nodes, &mut **graph);
200
201        // ── Package (directory) nodes
202        let (dir_nodes, dir_edges, created_dirs) =
203            self.build_package_tree(seen_files, &ns_string, contains_weight, now);
204        self.persist_nodes_to_storage_and_graph(&dir_nodes, &mut **graph);
205        self.persist_edges_to_storage_and_graph(&dir_edges, &mut **graph);
206
207        // ── Symbol nodes + file→symbol edges
208        let (sym_nodes, sym_edges) =
209            Self::build_symbol_nodes(all_symbols, &ns_string, contains_weight, now);
210
211        // Clean up stale symbols: single pass over in-memory graph to collect
212        // existing symbols grouped by file, then diff against new parse results.
213        //
214        // Lock protocol: We collect old symbols while holding the graph lock,
215        // then drop it so `cleanup_stale_symbols` can acquire graph + vector
216        // locks internally. The re-acquire below is safe: cleanup only removes
217        // stale nodes that won't conflict with the inserts that follow.
218        let mut old_syms_by_file: HashMap<String, HashSet<String>> = HashMap::new();
219        for node in graph.get_all_nodes() {
220            if !node.id.starts_with("sym:") {
221                continue;
222            }
223            // Skip SCIP-sourced symbols (explicit and synthetic containment nodes)
224            // — they're managed by the SCIP pipeline, not ast-grep. Without this
225            // guard, re-indexing deletes all SCIP sym: nodes because their IDs
226            // don't match ast-grep's qualified names.
227            if matches!(
228                node.payload.get("source").and_then(|v| v.as_str()),
229                Some("scip" | "scip-synthetic")
230            ) {
231                continue;
232            }
233            let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) else {
234                continue;
235            };
236            if !seen_files.contains(fp) {
237                continue;
238            }
239            old_syms_by_file
240                .entry(fp.to_string())
241                .or_default()
242                .insert(node.id);
243        }
244        drop(graph);
245        for file_path in seen_files {
246            let new_sym_ids: HashSet<String> = sym_nodes
247                .iter()
248                .filter(|n| {
249                    n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path.as_str())
250                })
251                .map(|n| n.id.clone())
252                .collect();
253            let empty = HashSet::new();
254            let old_sym_ids = old_syms_by_file.get(file_path).unwrap_or(&empty);
255            if let Err(e) = self.cleanup_stale_symbols(file_path, old_sym_ids, &new_sym_ids) {
256                tracing::warn!("Failed to cleanup stale symbols for {file_path}: {e}");
257            }
258        }
259        let mut graph = self.lock_graph()?; // Re-acquire lock
260
261        self.persist_nodes_to_storage_and_graph(&sym_nodes, &mut **graph);
262        self.persist_edges_to_storage_and_graph(&sym_edges, &mut **graph);
263
264        // ── Resolved reference edges
265        let ref_edges = Self::build_reference_edges(edges, &self.config.graph, now);
266        self.persist_edges_to_storage_and_graph(&ref_edges, &mut **graph);
267
268        // ── SCIP nodes + edges (compiler-grade)
269        if let Some(ref scip_build) = results.scip_build {
270            // Clean up stale SCIP nodes: collect existing SCIP-sourced sym: nodes
271            // for files covered by this SCIP run, then remove any not in the new set.
272            let new_scip_ids: HashSet<&str> =
273                scip_build.nodes.iter().map(|n| n.id.as_str()).collect();
274            let mut stale_scip_ids = Vec::new();
275            for node in graph.get_all_nodes() {
276                if !node.id.starts_with("sym:") {
277                    continue;
278                }
279                if !matches!(
280                    node.payload.get("source").and_then(|v| v.as_str()),
281                    Some("scip" | "scip-synthetic")
282                ) {
283                    continue;
284                }
285                if !new_scip_ids.contains(node.id.as_str()) {
286                    // Only clean up nodes in files that SCIP covered this run.
287                    if let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) {
288                        if seen_files.contains(fp) {
289                            stale_scip_ids.push(node.id.clone());
290                        }
291                    }
292                }
293            }
294            for stale_id in &stale_scip_ids {
295                let _ = graph.remove_node(stale_id);
296                let _ = self.storage.delete_graph_nodes_by_prefix(stale_id);
297                // Clean up orphan doc memories for removed symbols.
298                if let Some(qname) = stale_id.strip_prefix("sym:") {
299                    let doc_id = format!("scip-doc:{qname}");
300                    let _ = self.storage.delete_memory(&doc_id);
301                }
302            }
303            if !stale_scip_ids.is_empty() {
304                tracing::info!(
305                    "Cleaned up {} stale SCIP nodes from re-index",
306                    stale_scip_ids.len()
307                );
308            }
309
310            self.persist_nodes_to_storage_and_graph(&scip_build.nodes, &mut **graph);
311
312            // Multi-layer fusion: merge confidence when ast-grep and SCIP agree.
313            // Superseded ast-grep edges are removed to avoid duplicates.
314            let (fused_edges, superseded_ids) = Self::fuse_edges(&ref_edges, &scip_build.edges);
315
316            // Remove the low-confidence ast-grep edges that were fused into SCIP edges.
317            for edge_id in &superseded_ids {
318                let _ = graph.remove_edge(edge_id);
319                let _ = self.storage.delete_graph_edge(edge_id);
320            }
321
322            self.persist_edges_to_storage_and_graph(&fused_edges, &mut **graph);
323
324            // Persist hover doc memories and their RELATES_TO edges.
325            for (memory, related_node_id) in &scip_build.memories {
326                let _ = self.storage.insert_memory(memory);
327                let relates_edge = Edge {
328                    id: format!("relates:{}->mem:{}", related_node_id, memory.id),
329                    src: related_node_id.clone(),
330                    dst: format!("mem:{}", memory.id),
331                    relationship: RelationshipType::RelatesTo,
332                    weight: 0.3,
333                    properties: HashMap::new(),
334                    created_at: now,
335                    valid_from: Some(now),
336                    valid_to: None,
337                };
338                let _ = graph.add_edge(relates_edge.clone());
339                let _ = self.storage.insert_graph_edges_batch(&[relates_edge]);
340            }
341        }
342
343        // ── Chunk nodes + file→chunk / symbol→chunk edges
344        for file_path in seen_files {
345            let prefix = format!("chunk:{file_path}:");
346            let _ = self.storage.delete_graph_nodes_by_prefix(&prefix);
347        }
348        let (chunk_nodes, chunk_edges) =
349            Self::build_chunk_nodes(all_chunks, &ns_string, contains_weight, now);
350        let chunk_count = chunk_nodes.len();
351        self.persist_nodes_to_storage_and_graph(&chunk_nodes, &mut **graph);
352        self.persist_edges_to_storage_and_graph(&chunk_edges, &mut **graph);
353
354        drop(graph);
355
356        Ok(GraphPersistCounts {
357            packages_created: created_dirs,
358            chunks_stored: chunk_count,
359        })
360    }
361
362    /// Batch-insert nodes into both SQLite and the in-memory graph.
363    fn persist_nodes_to_storage_and_graph(
364        &self,
365        nodes: &[GraphNode],
366        graph: &mut dyn codemem_core::GraphBackend,
367    ) {
368        if let Err(e) = self.storage.insert_graph_nodes_batch(nodes) {
369            tracing::warn!("Failed to batch-insert {} graph nodes: {e}", nodes.len());
370        }
371        for node in nodes {
372            let _ = graph.add_node(node.clone());
373        }
374    }
375
376    /// Batch-insert edges into both SQLite and the in-memory graph.
377    /// Filters out edges whose src or dst nodes don't exist in storage
378    /// to avoid FOREIGN KEY constraint violations (e.g. edges referencing
379    /// external symbols that weren't indexed).
380    fn persist_edges_to_storage_and_graph(
381        &self,
382        edges: &[Edge],
383        graph: &mut dyn codemem_core::GraphBackend,
384    ) {
385        // Collect all referenced node IDs and check which exist in storage.
386        let mut referenced_ids: std::collections::HashSet<&str> = std::collections::HashSet::new();
387        for edge in edges {
388            referenced_ids.insert(&edge.src);
389            referenced_ids.insert(&edge.dst);
390        }
391        let existing_ids: std::collections::HashSet<String> = referenced_ids
392            .iter()
393            .filter(|id| self.storage.get_graph_node(id).ok().flatten().is_some())
394            .map(|id| id.to_string())
395            .collect();
396
397        let valid_edges: Vec<&Edge> = edges
398            .iter()
399            .filter(|e| existing_ids.contains(&e.src) && existing_ids.contains(&e.dst))
400            .collect();
401
402        let skipped = edges.len() - valid_edges.len();
403        if skipped > 0 {
404            tracing::debug!("Skipped {} edges referencing non-existent nodes", skipped);
405        }
406
407        let owned: Vec<Edge> = valid_edges.into_iter().cloned().collect();
408        if let Err(e) = self.storage.insert_graph_edges_batch(&owned) {
409            tracing::warn!("Failed to batch-insert {} graph edges: {e}", owned.len());
410        }
411        for edge in &owned {
412            let _ = graph.add_edge(edge.clone());
413        }
414    }
415
416    /// Build directory/package nodes and CONTAINS edges from file paths.
417    /// Returns (nodes, edges, number_of_dirs_created).
418    fn build_package_tree(
419        &self,
420        seen_files: &HashSet<String>,
421        ns_string: &Option<String>,
422        contains_weight: f64,
423        now: chrono::DateTime<chrono::Utc>,
424    ) -> (Vec<GraphNode>, Vec<Edge>, usize) {
425        let mut created_dirs: HashSet<String> = HashSet::new();
426        let mut created_edge_ids: HashSet<String> = HashSet::new();
427        let mut dir_nodes = Vec::new();
428        let mut dir_edges = Vec::new();
429
430        for file_path in seen_files {
431            let p = std::path::Path::new(file_path);
432            let mut ancestors: Vec<String> = Vec::new();
433            let mut current = p.parent();
434            while let Some(dir) = current {
435                let dir_str = dir.to_string_lossy().to_string();
436                if dir_str.is_empty() || dir_str == "." {
437                    break;
438                }
439                ancestors.push(dir_str);
440                current = dir.parent();
441            }
442            ancestors.reverse();
443            for (i, dir_str) in ancestors.iter().enumerate() {
444                let pkg_id = format!("pkg:{dir_str}/");
445                if created_dirs.insert(pkg_id.clone()) {
446                    dir_nodes.push(GraphNode {
447                        id: pkg_id.clone(),
448                        kind: NodeKind::Package,
449                        label: format!("{dir_str}/"),
450                        payload: HashMap::new(),
451                        centrality: 0.0,
452                        memory_id: None,
453                        namespace: ns_string.clone(),
454                        valid_from: None,
455                        valid_to: None,
456                    });
457                }
458                if i == 0 {
459                    continue;
460                }
461                let parent_pkg_id = format!("pkg:{}/", ancestors[i - 1]);
462                let edge_id = format!("contains:{parent_pkg_id}->{pkg_id}");
463                // Use local set for O(1) dedup instead of querying the graph
464                // for every directory. Edges persisted via INSERT OR REPLACE
465                // handle pre-existing edges from prior runs.
466                if !created_edge_ids.insert(edge_id.clone()) {
467                    continue;
468                }
469                dir_edges.push(Edge {
470                    id: edge_id,
471                    src: parent_pkg_id,
472                    dst: pkg_id.clone(),
473                    relationship: RelationshipType::Contains,
474                    weight: contains_weight,
475                    valid_from: Some(now),
476                    valid_to: None,
477                    properties: HashMap::new(),
478                    created_at: now,
479                });
480            }
481            if let Some(last_dir) = ancestors.last() {
482                let parent_pkg_id = format!("pkg:{last_dir}/");
483                let file_node_id = format!("file:{file_path}");
484                let edge_id = format!("contains:{parent_pkg_id}->{file_node_id}");
485                dir_edges.push(Edge {
486                    id: edge_id,
487                    src: parent_pkg_id,
488                    dst: file_node_id,
489                    relationship: RelationshipType::Contains,
490                    weight: contains_weight,
491                    valid_from: Some(now),
492                    valid_to: None,
493                    properties: HashMap::new(),
494                    created_at: now,
495                });
496            }
497        }
498
499        let count = created_dirs.len();
500        (dir_nodes, dir_edges, count)
501    }
502
503    /// Build symbol graph nodes and file→symbol CONTAINS edges.
504    fn build_symbol_nodes(
505        symbols: &[Symbol],
506        ns_string: &Option<String>,
507        contains_weight: f64,
508        now: chrono::DateTime<chrono::Utc>,
509    ) -> (Vec<GraphNode>, Vec<Edge>) {
510        let mut sym_nodes = Vec::with_capacity(symbols.len());
511        let mut sym_edges = Vec::with_capacity(symbols.len());
512
513        for sym in symbols {
514            let kind = NodeKind::from(sym.kind);
515            let payload = Self::build_symbol_payload(sym);
516
517            let sym_node_id = format!("sym:{}", sym.qualified_name);
518            sym_nodes.push(GraphNode {
519                id: sym_node_id.clone(),
520                kind,
521                label: sym.qualified_name.clone(),
522                payload,
523                centrality: 0.0,
524                memory_id: None,
525                namespace: ns_string.clone(),
526                valid_from: None,
527                valid_to: None,
528            });
529
530            let file_node_id = format!("file:{}", sym.file_path);
531            sym_edges.push(Edge {
532                id: format!("contains:{file_node_id}->{sym_node_id}"),
533                src: file_node_id,
534                dst: sym_node_id,
535                relationship: RelationshipType::Contains,
536                weight: contains_weight,
537                valid_from: Some(now),
538                valid_to: None,
539                properties: HashMap::new(),
540                created_at: now,
541            });
542        }
543
544        (sym_nodes, sym_edges)
545    }
546
547    /// Build the payload HashMap for a symbol's graph node.
548    fn build_symbol_payload(sym: &Symbol) -> HashMap<String, serde_json::Value> {
549        let mut payload = HashMap::new();
550        payload.insert(
551            "symbol_kind".to_string(),
552            serde_json::Value::String(sym.kind.to_string()),
553        );
554        payload.insert(
555            "signature".to_string(),
556            serde_json::Value::String(sym.signature.clone()),
557        );
558        payload.insert(
559            "file_path".to_string(),
560            serde_json::Value::String(sym.file_path.clone()),
561        );
562        payload.insert("line_start".to_string(), serde_json::json!(sym.line_start));
563        payload.insert("line_end".to_string(), serde_json::json!(sym.line_end));
564        payload.insert(
565            "visibility".to_string(),
566            serde_json::Value::String(sym.visibility.to_string()),
567        );
568        if let Some(ref doc) = sym.doc_comment {
569            payload.insert(
570                "doc_comment".to_string(),
571                serde_json::Value::String(doc.clone()),
572            );
573        }
574        if !sym.parameters.is_empty() {
575            payload.insert(
576                "parameters".to_string(),
577                serde_json::to_value(&sym.parameters).unwrap_or_default(),
578            );
579        }
580        if let Some(ref ret) = sym.return_type {
581            payload.insert(
582                "return_type".to_string(),
583                serde_json::Value::String(ret.clone()),
584            );
585        }
586        if sym.is_async {
587            payload.insert("is_async".to_string(), serde_json::json!(true));
588        }
589        if !sym.attributes.is_empty() {
590            payload.insert(
591                "attributes".to_string(),
592                serde_json::to_value(&sym.attributes).unwrap_or_default(),
593            );
594        }
595        if !sym.throws.is_empty() {
596            payload.insert(
597                "throws".to_string(),
598                serde_json::to_value(&sym.throws).unwrap_or_default(),
599            );
600        }
601        if let Some(ref gp) = sym.generic_params {
602            payload.insert(
603                "generic_params".to_string(),
604                serde_json::Value::String(gp.clone()),
605            );
606        }
607        if sym.is_abstract {
608            payload.insert("is_abstract".to_string(), serde_json::json!(true));
609        }
610        if let Some(ref parent) = sym.parent {
611            payload.insert(
612                "parent".to_string(),
613                serde_json::Value::String(parent.clone()),
614            );
615        }
616        payload
617    }
618
619    /// Build edges from resolved cross-file references.
620    /// ast-grep base confidence for multi-layer fusion.
621    const AST_GREP_BASE_CONFIDENCE: f64 = 0.10;
622
623    fn build_reference_edges(
624        edges: &[ResolvedEdge],
625        graph_config: &GraphConfig,
626        now: chrono::DateTime<chrono::Utc>,
627    ) -> Vec<Edge> {
628        edges
629            .iter()
630            .map(|edge| {
631                let mut properties = HashMap::new();
632                properties.insert("source".to_string(), serde_json::json!("ast-grep"));
633                properties.insert(
634                    "confidence".to_string(),
635                    serde_json::json!(Self::AST_GREP_BASE_CONFIDENCE),
636                );
637                properties.insert("source_layers".to_string(), serde_json::json!(["ast-grep"]));
638                // Scale edge weight by resolution confidence so low-confidence
639                // guesses (simple-name fallback) carry less weight in PageRank
640                // and betweenness centrality than exact matches.
641                let base_weight = edge_weight_for(&edge.relationship, graph_config);
642                let weight = base_weight * edge.resolution_confidence;
643                Edge {
644                    id: format!(
645                        "ref:{}->{}:{}",
646                        edge.source_qualified_name, edge.target_qualified_name, edge.relationship
647                    ),
648                    src: format!("sym:{}", edge.source_qualified_name),
649                    dst: format!("sym:{}", edge.target_qualified_name),
650                    relationship: edge.relationship,
651                    weight,
652                    valid_from: Some(now),
653                    valid_to: None,
654                    properties,
655                    created_at: now,
656                }
657            })
658            .collect()
659    }
660
661    /// Multi-layer edge fusion: when ast-grep and SCIP produce the same edge
662    /// (same src, dst, relationship), sum their confidences and merge source_layers.
663    /// SCIP edges not in ast-grep pass through unchanged.
664    ///
665    /// Returns `(fused_scip_edges, superseded_ast_grep_edge_ids)`. The caller must
666    /// remove the superseded ast-grep edges to avoid duplicates in the graph.
667    fn fuse_edges(ast_grep_edges: &[Edge], scip_edges: &[Edge]) -> (Vec<Edge>, Vec<String>) {
668        // Index ast-grep edges by (src, dst, relationship_str) → edge ID for O(1) lookup.
669        let ast_grep_index: HashMap<(String, String, String), &str> = ast_grep_edges
670            .iter()
671            .map(|e| {
672                (
673                    (e.src.clone(), e.dst.clone(), e.relationship.to_string()),
674                    e.id.as_str(),
675                )
676            })
677            .collect();
678
679        let mut superseded_ids = Vec::new();
680
681        let fused = scip_edges
682            .iter()
683            .map(|scip_edge| {
684                let key = (
685                    scip_edge.src.clone(),
686                    scip_edge.dst.clone(),
687                    scip_edge.relationship.to_string(),
688                );
689                if let Some(&ast_edge_id) = ast_grep_index.get(&key) {
690                    // Both layers agree — fuse confidence and mark ast-grep edge for removal.
691                    superseded_ids.push(ast_edge_id.to_string());
692                    let mut fused = scip_edge.clone();
693                    let scip_conf = scip_edge
694                        .properties
695                        .get("confidence")
696                        .and_then(|v| v.as_f64())
697                        .unwrap_or(0.15);
698                    let fused_conf = scip_conf + Self::AST_GREP_BASE_CONFIDENCE;
699                    fused
700                        .properties
701                        .insert("confidence".to_string(), serde_json::json!(fused_conf));
702                    fused.properties.insert(
703                        "source_layers".to_string(),
704                        serde_json::json!(["ast-grep", "scip"]),
705                    );
706                    fused
707                } else {
708                    scip_edge.clone()
709                }
710            })
711            .collect();
712
713        (fused, superseded_ids)
714    }
715
716    /// Build chunk graph nodes and file→chunk / symbol→chunk CONTAINS edges.
717    fn build_chunk_nodes(
718        chunks: &[CodeChunk],
719        ns_string: &Option<String>,
720        contains_weight: f64,
721        now: chrono::DateTime<chrono::Utc>,
722    ) -> (Vec<GraphNode>, Vec<Edge>) {
723        let mut chunk_nodes = Vec::with_capacity(chunks.len());
724        let mut chunk_edges = Vec::with_capacity(chunks.len() * 2);
725
726        for chunk in chunks {
727            let chunk_id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
728
729            let mut payload = HashMap::new();
730            payload.insert(
731                "file_path".to_string(),
732                serde_json::Value::String(chunk.file_path.clone()),
733            );
734            payload.insert(
735                "line_start".to_string(),
736                serde_json::json!(chunk.line_start),
737            );
738            payload.insert("line_end".to_string(), serde_json::json!(chunk.line_end));
739            payload.insert(
740                "node_kind".to_string(),
741                serde_json::Value::String(chunk.node_kind.clone()),
742            );
743            payload.insert(
744                "non_ws_chars".to_string(),
745                serde_json::json!(chunk.non_ws_chars),
746            );
747            if let Some(ref parent) = chunk.parent_symbol {
748                payload.insert(
749                    "parent_symbol".to_string(),
750                    serde_json::Value::String(parent.clone()),
751                );
752            }
753
754            chunk_nodes.push(GraphNode {
755                id: chunk_id.clone(),
756                kind: NodeKind::Chunk,
757                label: format!(
758                    "chunk:{}:{}..{}",
759                    chunk.file_path, chunk.line_start, chunk.line_end
760                ),
761                payload,
762                centrality: 0.0,
763                memory_id: None,
764                namespace: ns_string.clone(),
765                valid_from: None,
766                valid_to: None,
767            });
768
769            let file_node_id = format!("file:{}", chunk.file_path);
770            chunk_edges.push(Edge {
771                id: format!("contains:{file_node_id}->{chunk_id}"),
772                src: file_node_id,
773                dst: chunk_id.clone(),
774                relationship: RelationshipType::Contains,
775                weight: contains_weight,
776                valid_from: Some(now),
777                valid_to: None,
778                properties: HashMap::new(),
779                created_at: now,
780            });
781
782            if let Some(ref parent_sym) = chunk.parent_symbol {
783                let parent_node_id = format!("sym:{parent_sym}");
784                chunk_edges.push(Edge {
785                    id: format!("contains:{parent_node_id}->{chunk_id}"),
786                    src: parent_node_id,
787                    dst: chunk_id,
788                    relationship: RelationshipType::Contains,
789                    weight: contains_weight,
790                    valid_from: Some(now),
791                    valid_to: None,
792                    properties: HashMap::new(),
793                    created_at: now,
794                });
795            }
796        }
797
798        (chunk_nodes, chunk_edges)
799    }
800
801    // ── Embedding Persistence ────────────────────────────────────────────
802
803    /// Embed symbols and chunks, persisting embeddings to SQLite and the
804    /// vector index in batches with progress reporting.
805    ///
806    /// Returns (symbols_embedded, chunks_embedded).
807    fn embed_and_persist(
808        &self,
809        symbols: &[Symbol],
810        chunks: &[CodeChunk],
811        edges: &[ResolvedEdge],
812        on_progress: impl Fn(usize, usize),
813    ) -> Result<(usize, usize), CodememError> {
814        let mut symbols_embedded = 0usize;
815        let mut chunks_embedded = 0usize;
816
817        // Quick check: skip expensive text enrichment if embedding provider isn't loaded.
818        // This avoids triggering lazy init during lightweight operations (hooks).
819        if !self.embeddings_ready() {
820            return Ok((0, 0));
821        }
822
823        // Phase 1: Collect enriched texts without holding any lock.
824        let sym_texts: Vec<(String, String)> = symbols
825            .iter()
826            .map(|sym| {
827                let id = format!("sym:{}", sym.qualified_name);
828                let text = self.enrich_symbol_text(sym, edges);
829                (id, text)
830            })
831            .collect();
832        let chunk_texts: Vec<(String, String)> = chunks
833            .iter()
834            .map(|chunk| {
835                let id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
836                let text = self.enrich_chunk_text(chunk);
837                (id, text)
838            })
839            .collect();
840
841        // Phase 2+3: Embed in batches and persist progressively.
842        let embed_batch_size = self.config.embedding.batch_size;
843
844        let all_pairs: Vec<(String, String)> = sym_texts.into_iter().chain(chunk_texts).collect();
845        let total = all_pairs.len();
846        let sym_count = symbols.len();
847        let mut done = 0usize;
848
849        for batch in all_pairs.chunks(embed_batch_size) {
850            let texts: Vec<&str> = batch.iter().map(|(_, t)| t.as_str()).collect();
851
852            let t0 = std::time::Instant::now();
853            let embed_result = {
854                let emb = self.lock_embeddings()?;
855                match emb {
856                    Some(emb_guard) => emb_guard.embed_batch(&texts),
857                    None => break,
858                }
859            };
860
861            match embed_result {
862                Ok(embeddings) => {
863                    let embed_ms = t0.elapsed().as_millis();
864
865                    let t1 = std::time::Instant::now();
866                    let pairs: Vec<(&str, &[f32])> = batch
867                        .iter()
868                        .zip(embeddings.iter())
869                        .map(|((id, _), emb_vec)| (id.as_str(), emb_vec.as_slice()))
870                        .collect();
871                    if let Err(e) = self.storage.store_embeddings_batch(&pairs) {
872                        tracing::warn!("Failed to batch-store embeddings: {e}");
873                    }
874                    let sqlite_ms = t1.elapsed().as_millis();
875
876                    let t2 = std::time::Instant::now();
877                    let batch_items: Vec<(String, Vec<f32>)> = batch
878                        .iter()
879                        .zip(embeddings.into_iter())
880                        .map(|((id, _), emb_vec)| (id.clone(), emb_vec))
881                        .collect();
882                    let batch_len = batch_items.len();
883                    {
884                        let mut vec = self.lock_vector()?;
885                        if let Err(e) = vec.insert_batch(&batch_items) {
886                            tracing::warn!("Failed to batch-insert into vector index: {e}");
887                        }
888                    }
889                    let vector_ms = t2.elapsed().as_millis();
890
891                    let syms_in_batch = batch_len.min(sym_count.saturating_sub(done));
892                    symbols_embedded += syms_in_batch;
893                    chunks_embedded += batch_len - syms_in_batch;
894                    done += batch_len;
895
896                    tracing::debug!(
897                        "Embed batch {}: embed={embed_ms}ms sqlite={sqlite_ms}ms vector={vector_ms}ms",
898                        batch_len
899                    );
900                }
901                Err(e) => {
902                    tracing::warn!("embed_batch failed for chunk of {} texts: {e}", batch.len());
903                }
904            }
905            on_progress(done, total);
906        }
907        self.save_index();
908
909        Ok((symbols_embedded, chunks_embedded))
910    }
911}