1mod compaction;
5pub mod cross_repo;
6
7use crate::index::{CodeChunk, ResolvedEdge, Symbol};
8use crate::IndexAndResolveResult;
9use codemem_core::{CodememError, Edge, GraphConfig, GraphNode, NodeKind, RelationshipType};
10use std::collections::{HashMap, HashSet};
11
/// Counters describing what a single index-persistence run wrote and pruned.
///
/// Returned by the `persist_*` entry points on the engine.
#[derive(Debug, Default)]
pub struct IndexPersistResult {
    /// Number of file paths in the index result (one file node is built per path).
    pub files_created: usize,
    /// Number of distinct package (directory) nodes built for those files.
    pub packages_created: usize,
    /// Number of symbols in the index result.
    pub symbols_stored: usize,
    /// Number of chunk nodes built from the index result.
    pub chunks_stored: usize,
    /// Number of resolved reference edges in the index result.
    pub edges_resolved: usize,
    /// Symbols that were embedded; `0` on the graph-only path.
    pub symbols_embedded: usize,
    /// Chunks that were embedded; `0` on the graph-only path.
    pub chunks_embedded: usize,
    /// Chunks removed by graph compaction; `0` when auto-compaction is disabled.
    pub chunks_pruned: usize,
    /// Symbols removed by graph compaction; `0` when auto-compaction is disabled.
    pub symbols_pruned: usize,
}
25
/// Counters describing a cross-repository persistence run.
///
/// NOTE(review): none of these fields are written in this file — presumably the
/// `cross_repo` submodule fills them in. The per-field notes below are inferred from the
/// field names only; confirm against the code that populates them.
#[derive(Debug, Default)]
pub struct CrossRepoPersistResult {
    /// Presumably the number of packages registered.
    pub packages_registered: usize,
    /// Presumably the number of unresolved references stored.
    pub unresolved_refs_stored: usize,
    /// Presumably the number of forward edges created.
    pub forward_edges_created: usize,
    /// Presumably the number of backward edges created.
    pub backward_edges_created: usize,
    /// Presumably the number of endpoints detected.
    pub endpoints_detected: usize,
    /// Presumably the number of client calls detected.
    pub client_calls_detected: usize,
    /// Presumably the number of endpoints detected from specs.
    pub spec_endpoints_detected: usize,
    /// Presumably the number of event channels detected.
    pub event_channels_detected: usize,
    /// Presumably the number of HTTP edges matched.
    pub http_edges_matched: usize,
    /// Presumably the number of event edges matched.
    pub event_edges_matched: usize,
}
40
41pub fn edge_weight_for(rel: &RelationshipType, config: &GraphConfig) -> f64 {
44 match rel {
45 RelationshipType::Calls => config.calls_edge_weight,
46 RelationshipType::Imports => config.imports_edge_weight,
47 RelationshipType::Contains => config.contains_edge_weight,
48 RelationshipType::TypeDefinition => config.type_definition_edge_weight,
49 RelationshipType::Reads => config.reads_edge_weight,
50 RelationshipType::Writes => config.writes_edge_weight,
51 RelationshipType::Overrides => config.overrides_edge_weight,
52 RelationshipType::Implements | RelationshipType::Inherits => 0.8,
53 RelationshipType::DependsOn => 0.7,
54 RelationshipType::CoChanged => 0.6,
55 RelationshipType::EvolvedInto | RelationshipType::Summarizes => 0.7,
56 RelationshipType::PartOf => 0.4,
57 RelationshipType::RelatesTo | RelationshipType::SharesTheme => 0.3,
58 RelationshipType::HttpCalls => 0.7,
59 RelationshipType::PublishesTo | RelationshipType::SubscribesTo => 0.6,
60 RelationshipType::ModifiedBy => 0.4,
61 _ => 0.5,
62 }
63}
64
/// Counts produced by the graph-persistence step and folded into `IndexPersistResult`.
struct GraphPersistCounts {
    /// Number of distinct package (directory) nodes built.
    packages_created: usize,
    /// Number of chunk nodes built.
    chunks_stored: usize,
}
70
71impl super::CodememEngine {
72 pub fn persist_index_results(
77 &self,
78 results: &IndexAndResolveResult,
79 namespace: Option<&str>,
80 ) -> Result<IndexPersistResult, CodememError> {
81 self.persist_index_results_with_progress(results, namespace, |_, _| {})
82 }
83
84 pub fn persist_graph_only(
90 &self,
91 results: &IndexAndResolveResult,
92 namespace: Option<&str>,
93 ) -> Result<IndexPersistResult, CodememError> {
94 let seen_files = &results.file_paths;
95 let graph_counts = self.persist_graph_nodes(results, namespace)?;
96
97 let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
98 self.compact_graph(seen_files, namespace)
99 } else {
100 (0, 0)
101 };
102
103 Ok(IndexPersistResult {
104 files_created: seen_files.len(),
105 packages_created: graph_counts.packages_created,
106 symbols_stored: results.symbols.len(),
107 chunks_stored: graph_counts.chunks_stored,
108 edges_resolved: results.edges.len(),
109 symbols_embedded: 0,
110 chunks_embedded: 0,
111 chunks_pruned,
112 symbols_pruned,
113 })
114 }
115
116 pub fn persist_index_results_with_progress(
119 &self,
120 results: &IndexAndResolveResult,
121 namespace: Option<&str>,
122 on_progress: impl Fn(usize, usize),
123 ) -> Result<IndexPersistResult, CodememError> {
124 let seen_files = &results.file_paths;
125
126 let graph_counts = self.persist_graph_nodes(results, namespace)?;
128
129 let (symbols_embedded, chunks_embedded) = self.embed_and_persist(
131 &results.symbols,
132 &results.chunks,
133 &results.edges,
134 on_progress,
135 )?;
136
137 let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
139 self.compact_graph(seen_files, namespace)
140 } else {
141 (0, 0)
142 };
143
144 Ok(IndexPersistResult {
145 files_created: seen_files.len(),
146 packages_created: graph_counts.packages_created,
147 symbols_stored: results.symbols.len(),
148 chunks_stored: graph_counts.chunks_stored,
149 edges_resolved: results.edges.len(),
150 symbols_embedded,
151 chunks_embedded,
152 chunks_pruned,
153 symbols_pruned,
154 })
155 }
156
157 fn persist_graph_nodes(
162 &self,
163 results: &IndexAndResolveResult,
164 namespace: Option<&str>,
165 ) -> Result<GraphPersistCounts, CodememError> {
166 let all_symbols = &results.symbols;
167 let all_chunks = &results.chunks;
168 let seen_files = &results.file_paths;
169 let edges = &results.edges;
170
171 let now = chrono::Utc::now();
172 let ns_string = namespace.map(|s| s.to_string());
173 let contains_weight = edge_weight_for(&RelationshipType::Contains, &self.config.graph);
174
175 let mut graph = self.lock_graph()?;
176
177 let file_nodes: Vec<GraphNode> = seen_files
179 .iter()
180 .map(|file_path| {
181 let mut payload = HashMap::new();
182 payload.insert(
183 "file_path".to_string(),
184 serde_json::Value::String(file_path.clone()),
185 );
186 GraphNode {
187 id: format!("file:{file_path}"),
188 kind: NodeKind::File,
189 label: file_path.clone(),
190 payload,
191 centrality: 0.0,
192 memory_id: None,
193 namespace: ns_string.clone(),
194 valid_from: None,
195 valid_to: None,
196 }
197 })
198 .collect();
199 self.persist_nodes_to_storage_and_graph(&file_nodes, &mut **graph);
200
201 let (dir_nodes, dir_edges, created_dirs) =
203 self.build_package_tree(seen_files, &ns_string, contains_weight, now);
204 self.persist_nodes_to_storage_and_graph(&dir_nodes, &mut **graph);
205 self.persist_edges_to_storage_and_graph(&dir_edges, &mut **graph);
206
207 let (sym_nodes, sym_edges) =
209 Self::build_symbol_nodes(all_symbols, &ns_string, contains_weight, now);
210
211 let mut old_syms_by_file: HashMap<String, HashSet<String>> = HashMap::new();
219 for node in graph.get_all_nodes() {
220 if !node.id.starts_with("sym:") {
221 continue;
222 }
223 if matches!(
228 node.payload.get("source").and_then(|v| v.as_str()),
229 Some("scip" | "scip-synthetic")
230 ) {
231 continue;
232 }
233 let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) else {
234 continue;
235 };
236 if !seen_files.contains(fp) {
237 continue;
238 }
239 old_syms_by_file
240 .entry(fp.to_string())
241 .or_default()
242 .insert(node.id);
243 }
244 drop(graph);
245 for file_path in seen_files {
246 let new_sym_ids: HashSet<String> = sym_nodes
247 .iter()
248 .filter(|n| {
249 n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path.as_str())
250 })
251 .map(|n| n.id.clone())
252 .collect();
253 let empty = HashSet::new();
254 let old_sym_ids = old_syms_by_file.get(file_path).unwrap_or(&empty);
255 if let Err(e) = self.cleanup_stale_symbols(file_path, old_sym_ids, &new_sym_ids) {
256 tracing::warn!("Failed to cleanup stale symbols for {file_path}: {e}");
257 }
258 }
259 let mut graph = self.lock_graph()?; self.persist_nodes_to_storage_and_graph(&sym_nodes, &mut **graph);
262 self.persist_edges_to_storage_and_graph(&sym_edges, &mut **graph);
263
264 let ref_edges = Self::build_reference_edges(edges, &self.config.graph, now);
266 self.persist_edges_to_storage_and_graph(&ref_edges, &mut **graph);
267
268 if let Some(ref scip_build) = results.scip_build {
270 let new_scip_ids: HashSet<&str> =
273 scip_build.nodes.iter().map(|n| n.id.as_str()).collect();
274 let mut stale_scip_ids = Vec::new();
275 for node in graph.get_all_nodes() {
276 if !node.id.starts_with("sym:") {
277 continue;
278 }
279 if !matches!(
280 node.payload.get("source").and_then(|v| v.as_str()),
281 Some("scip" | "scip-synthetic")
282 ) {
283 continue;
284 }
285 if !new_scip_ids.contains(node.id.as_str()) {
286 if let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) {
288 if seen_files.contains(fp) {
289 stale_scip_ids.push(node.id.clone());
290 }
291 }
292 }
293 }
294 for stale_id in &stale_scip_ids {
295 let _ = graph.remove_node(stale_id);
296 let _ = self.storage.delete_graph_nodes_by_prefix(stale_id);
297 if let Some(qname) = stale_id.strip_prefix("sym:") {
299 let doc_id = format!("scip-doc:{qname}");
300 let _ = self.storage.delete_memory(&doc_id);
301 }
302 }
303 if !stale_scip_ids.is_empty() {
304 tracing::info!(
305 "Cleaned up {} stale SCIP nodes from re-index",
306 stale_scip_ids.len()
307 );
308 }
309
310 self.persist_nodes_to_storage_and_graph(&scip_build.nodes, &mut **graph);
311
312 let (fused_edges, superseded_ids) = Self::fuse_edges(&ref_edges, &scip_build.edges);
315
316 for edge_id in &superseded_ids {
318 let _ = graph.remove_edge(edge_id);
319 let _ = self.storage.delete_graph_edge(edge_id);
320 }
321
322 self.persist_edges_to_storage_and_graph(&fused_edges, &mut **graph);
323
324 for (memory, related_node_id) in &scip_build.memories {
326 let _ = self.storage.insert_memory(memory);
327 let relates_edge = Edge {
328 id: format!("relates:{}->mem:{}", related_node_id, memory.id),
329 src: related_node_id.clone(),
330 dst: format!("mem:{}", memory.id),
331 relationship: RelationshipType::RelatesTo,
332 weight: 0.3,
333 properties: HashMap::new(),
334 created_at: now,
335 valid_from: Some(now),
336 valid_to: None,
337 };
338 let _ = graph.add_edge(relates_edge.clone());
339 let _ = self.storage.insert_graph_edges_batch(&[relates_edge]);
340 }
341 }
342
343 for file_path in seen_files {
345 let prefix = format!("chunk:{file_path}:");
346 let _ = self.storage.delete_graph_nodes_by_prefix(&prefix);
347 }
348 let (chunk_nodes, chunk_edges) =
349 Self::build_chunk_nodes(all_chunks, &ns_string, contains_weight, now);
350 let chunk_count = chunk_nodes.len();
351 self.persist_nodes_to_storage_and_graph(&chunk_nodes, &mut **graph);
352 self.persist_edges_to_storage_and_graph(&chunk_edges, &mut **graph);
353
354 drop(graph);
355
356 Ok(GraphPersistCounts {
357 packages_created: created_dirs,
358 chunks_stored: chunk_count,
359 })
360 }
361
362 fn persist_nodes_to_storage_and_graph(
364 &self,
365 nodes: &[GraphNode],
366 graph: &mut dyn codemem_core::GraphBackend,
367 ) {
368 if let Err(e) = self.storage.insert_graph_nodes_batch(nodes) {
369 tracing::warn!("Failed to batch-insert {} graph nodes: {e}", nodes.len());
370 }
371 for node in nodes {
372 let _ = graph.add_node(node.clone());
373 }
374 }
375
376 fn persist_edges_to_storage_and_graph(
381 &self,
382 edges: &[Edge],
383 graph: &mut dyn codemem_core::GraphBackend,
384 ) {
385 let mut referenced_ids: std::collections::HashSet<&str> = std::collections::HashSet::new();
387 for edge in edges {
388 referenced_ids.insert(&edge.src);
389 referenced_ids.insert(&edge.dst);
390 }
391 let existing_ids: std::collections::HashSet<String> = referenced_ids
392 .iter()
393 .filter(|id| self.storage.get_graph_node(id).ok().flatten().is_some())
394 .map(|id| id.to_string())
395 .collect();
396
397 let valid_edges: Vec<&Edge> = edges
398 .iter()
399 .filter(|e| existing_ids.contains(&e.src) && existing_ids.contains(&e.dst))
400 .collect();
401
402 let skipped = edges.len() - valid_edges.len();
403 if skipped > 0 {
404 tracing::debug!("Skipped {} edges referencing non-existent nodes", skipped);
405 }
406
407 let owned: Vec<Edge> = valid_edges.into_iter().cloned().collect();
408 if let Err(e) = self.storage.insert_graph_edges_batch(&owned) {
409 tracing::warn!("Failed to batch-insert {} graph edges: {e}", owned.len());
410 }
411 for edge in &owned {
412 let _ = graph.add_edge(edge.clone());
413 }
414 }
415
416 fn build_package_tree(
419 &self,
420 seen_files: &HashSet<String>,
421 ns_string: &Option<String>,
422 contains_weight: f64,
423 now: chrono::DateTime<chrono::Utc>,
424 ) -> (Vec<GraphNode>, Vec<Edge>, usize) {
425 let mut created_dirs: HashSet<String> = HashSet::new();
426 let mut created_edge_ids: HashSet<String> = HashSet::new();
427 let mut dir_nodes = Vec::new();
428 let mut dir_edges = Vec::new();
429
430 for file_path in seen_files {
431 let p = std::path::Path::new(file_path);
432 let mut ancestors: Vec<String> = Vec::new();
433 let mut current = p.parent();
434 while let Some(dir) = current {
435 let dir_str = dir.to_string_lossy().to_string();
436 if dir_str.is_empty() || dir_str == "." {
437 break;
438 }
439 ancestors.push(dir_str);
440 current = dir.parent();
441 }
442 ancestors.reverse();
443 for (i, dir_str) in ancestors.iter().enumerate() {
444 let pkg_id = format!("pkg:{dir_str}/");
445 if created_dirs.insert(pkg_id.clone()) {
446 dir_nodes.push(GraphNode {
447 id: pkg_id.clone(),
448 kind: NodeKind::Package,
449 label: format!("{dir_str}/"),
450 payload: HashMap::new(),
451 centrality: 0.0,
452 memory_id: None,
453 namespace: ns_string.clone(),
454 valid_from: None,
455 valid_to: None,
456 });
457 }
458 if i == 0 {
459 continue;
460 }
461 let parent_pkg_id = format!("pkg:{}/", ancestors[i - 1]);
462 let edge_id = format!("contains:{parent_pkg_id}->{pkg_id}");
463 if !created_edge_ids.insert(edge_id.clone()) {
467 continue;
468 }
469 dir_edges.push(Edge {
470 id: edge_id,
471 src: parent_pkg_id,
472 dst: pkg_id.clone(),
473 relationship: RelationshipType::Contains,
474 weight: contains_weight,
475 valid_from: Some(now),
476 valid_to: None,
477 properties: HashMap::new(),
478 created_at: now,
479 });
480 }
481 if let Some(last_dir) = ancestors.last() {
482 let parent_pkg_id = format!("pkg:{last_dir}/");
483 let file_node_id = format!("file:{file_path}");
484 let edge_id = format!("contains:{parent_pkg_id}->{file_node_id}");
485 dir_edges.push(Edge {
486 id: edge_id,
487 src: parent_pkg_id,
488 dst: file_node_id,
489 relationship: RelationshipType::Contains,
490 weight: contains_weight,
491 valid_from: Some(now),
492 valid_to: None,
493 properties: HashMap::new(),
494 created_at: now,
495 });
496 }
497 }
498
499 let count = created_dirs.len();
500 (dir_nodes, dir_edges, count)
501 }
502
503 fn build_symbol_nodes(
505 symbols: &[Symbol],
506 ns_string: &Option<String>,
507 contains_weight: f64,
508 now: chrono::DateTime<chrono::Utc>,
509 ) -> (Vec<GraphNode>, Vec<Edge>) {
510 let mut sym_nodes = Vec::with_capacity(symbols.len());
511 let mut sym_edges = Vec::with_capacity(symbols.len());
512
513 for sym in symbols {
514 let kind = NodeKind::from(sym.kind);
515 let payload = Self::build_symbol_payload(sym);
516
517 let sym_node_id = format!("sym:{}", sym.qualified_name);
518 sym_nodes.push(GraphNode {
519 id: sym_node_id.clone(),
520 kind,
521 label: sym.qualified_name.clone(),
522 payload,
523 centrality: 0.0,
524 memory_id: None,
525 namespace: ns_string.clone(),
526 valid_from: None,
527 valid_to: None,
528 });
529
530 let file_node_id = format!("file:{}", sym.file_path);
531 sym_edges.push(Edge {
532 id: format!("contains:{file_node_id}->{sym_node_id}"),
533 src: file_node_id,
534 dst: sym_node_id,
535 relationship: RelationshipType::Contains,
536 weight: contains_weight,
537 valid_from: Some(now),
538 valid_to: None,
539 properties: HashMap::new(),
540 created_at: now,
541 });
542 }
543
544 (sym_nodes, sym_edges)
545 }
546
547 fn build_symbol_payload(sym: &Symbol) -> HashMap<String, serde_json::Value> {
549 let mut payload = HashMap::new();
550 payload.insert(
551 "symbol_kind".to_string(),
552 serde_json::Value::String(sym.kind.to_string()),
553 );
554 payload.insert(
555 "signature".to_string(),
556 serde_json::Value::String(sym.signature.clone()),
557 );
558 payload.insert(
559 "file_path".to_string(),
560 serde_json::Value::String(sym.file_path.clone()),
561 );
562 payload.insert("line_start".to_string(), serde_json::json!(sym.line_start));
563 payload.insert("line_end".to_string(), serde_json::json!(sym.line_end));
564 payload.insert(
565 "visibility".to_string(),
566 serde_json::Value::String(sym.visibility.to_string()),
567 );
568 if let Some(ref doc) = sym.doc_comment {
569 payload.insert(
570 "doc_comment".to_string(),
571 serde_json::Value::String(doc.clone()),
572 );
573 }
574 if !sym.parameters.is_empty() {
575 payload.insert(
576 "parameters".to_string(),
577 serde_json::to_value(&sym.parameters).unwrap_or_default(),
578 );
579 }
580 if let Some(ref ret) = sym.return_type {
581 payload.insert(
582 "return_type".to_string(),
583 serde_json::Value::String(ret.clone()),
584 );
585 }
586 if sym.is_async {
587 payload.insert("is_async".to_string(), serde_json::json!(true));
588 }
589 if !sym.attributes.is_empty() {
590 payload.insert(
591 "attributes".to_string(),
592 serde_json::to_value(&sym.attributes).unwrap_or_default(),
593 );
594 }
595 if !sym.throws.is_empty() {
596 payload.insert(
597 "throws".to_string(),
598 serde_json::to_value(&sym.throws).unwrap_or_default(),
599 );
600 }
601 if let Some(ref gp) = sym.generic_params {
602 payload.insert(
603 "generic_params".to_string(),
604 serde_json::Value::String(gp.clone()),
605 );
606 }
607 if sym.is_abstract {
608 payload.insert("is_abstract".to_string(), serde_json::json!(true));
609 }
610 if let Some(ref parent) = sym.parent {
611 payload.insert(
612 "parent".to_string(),
613 serde_json::Value::String(parent.clone()),
614 );
615 }
616 payload
617 }
618
    /// Confidence recorded on edges produced by the ast-grep layer alone. It is also the
    /// amount added to a SCIP edge's confidence when both layers report the same edge.
    const AST_GREP_BASE_CONFIDENCE: f64 = 0.10;
622
623 fn build_reference_edges(
624 edges: &[ResolvedEdge],
625 graph_config: &GraphConfig,
626 now: chrono::DateTime<chrono::Utc>,
627 ) -> Vec<Edge> {
628 edges
629 .iter()
630 .map(|edge| {
631 let mut properties = HashMap::new();
632 properties.insert("source".to_string(), serde_json::json!("ast-grep"));
633 properties.insert(
634 "confidence".to_string(),
635 serde_json::json!(Self::AST_GREP_BASE_CONFIDENCE),
636 );
637 properties.insert("source_layers".to_string(), serde_json::json!(["ast-grep"]));
638 let base_weight = edge_weight_for(&edge.relationship, graph_config);
642 let weight = base_weight * edge.resolution_confidence;
643 Edge {
644 id: format!(
645 "ref:{}->{}:{}",
646 edge.source_qualified_name, edge.target_qualified_name, edge.relationship
647 ),
648 src: format!("sym:{}", edge.source_qualified_name),
649 dst: format!("sym:{}", edge.target_qualified_name),
650 relationship: edge.relationship,
651 weight,
652 valid_from: Some(now),
653 valid_to: None,
654 properties,
655 created_at: now,
656 }
657 })
658 .collect()
659 }
660
661 fn fuse_edges(ast_grep_edges: &[Edge], scip_edges: &[Edge]) -> (Vec<Edge>, Vec<String>) {
668 let ast_grep_index: HashMap<(String, String, String), &str> = ast_grep_edges
670 .iter()
671 .map(|e| {
672 (
673 (e.src.clone(), e.dst.clone(), e.relationship.to_string()),
674 e.id.as_str(),
675 )
676 })
677 .collect();
678
679 let mut superseded_ids = Vec::new();
680
681 let fused = scip_edges
682 .iter()
683 .map(|scip_edge| {
684 let key = (
685 scip_edge.src.clone(),
686 scip_edge.dst.clone(),
687 scip_edge.relationship.to_string(),
688 );
689 if let Some(&ast_edge_id) = ast_grep_index.get(&key) {
690 superseded_ids.push(ast_edge_id.to_string());
692 let mut fused = scip_edge.clone();
693 let scip_conf = scip_edge
694 .properties
695 .get("confidence")
696 .and_then(|v| v.as_f64())
697 .unwrap_or(0.15);
698 let fused_conf = scip_conf + Self::AST_GREP_BASE_CONFIDENCE;
699 fused
700 .properties
701 .insert("confidence".to_string(), serde_json::json!(fused_conf));
702 fused.properties.insert(
703 "source_layers".to_string(),
704 serde_json::json!(["ast-grep", "scip"]),
705 );
706 fused
707 } else {
708 scip_edge.clone()
709 }
710 })
711 .collect();
712
713 (fused, superseded_ids)
714 }
715
716 fn build_chunk_nodes(
718 chunks: &[CodeChunk],
719 ns_string: &Option<String>,
720 contains_weight: f64,
721 now: chrono::DateTime<chrono::Utc>,
722 ) -> (Vec<GraphNode>, Vec<Edge>) {
723 let mut chunk_nodes = Vec::with_capacity(chunks.len());
724 let mut chunk_edges = Vec::with_capacity(chunks.len() * 2);
725
726 for chunk in chunks {
727 let chunk_id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
728
729 let mut payload = HashMap::new();
730 payload.insert(
731 "file_path".to_string(),
732 serde_json::Value::String(chunk.file_path.clone()),
733 );
734 payload.insert(
735 "line_start".to_string(),
736 serde_json::json!(chunk.line_start),
737 );
738 payload.insert("line_end".to_string(), serde_json::json!(chunk.line_end));
739 payload.insert(
740 "node_kind".to_string(),
741 serde_json::Value::String(chunk.node_kind.clone()),
742 );
743 payload.insert(
744 "non_ws_chars".to_string(),
745 serde_json::json!(chunk.non_ws_chars),
746 );
747 if let Some(ref parent) = chunk.parent_symbol {
748 payload.insert(
749 "parent_symbol".to_string(),
750 serde_json::Value::String(parent.clone()),
751 );
752 }
753
754 chunk_nodes.push(GraphNode {
755 id: chunk_id.clone(),
756 kind: NodeKind::Chunk,
757 label: format!(
758 "chunk:{}:{}..{}",
759 chunk.file_path, chunk.line_start, chunk.line_end
760 ),
761 payload,
762 centrality: 0.0,
763 memory_id: None,
764 namespace: ns_string.clone(),
765 valid_from: None,
766 valid_to: None,
767 });
768
769 let file_node_id = format!("file:{}", chunk.file_path);
770 chunk_edges.push(Edge {
771 id: format!("contains:{file_node_id}->{chunk_id}"),
772 src: file_node_id,
773 dst: chunk_id.clone(),
774 relationship: RelationshipType::Contains,
775 weight: contains_weight,
776 valid_from: Some(now),
777 valid_to: None,
778 properties: HashMap::new(),
779 created_at: now,
780 });
781
782 if let Some(ref parent_sym) = chunk.parent_symbol {
783 let parent_node_id = format!("sym:{parent_sym}");
784 chunk_edges.push(Edge {
785 id: format!("contains:{parent_node_id}->{chunk_id}"),
786 src: parent_node_id,
787 dst: chunk_id,
788 relationship: RelationshipType::Contains,
789 weight: contains_weight,
790 valid_from: Some(now),
791 valid_to: None,
792 properties: HashMap::new(),
793 created_at: now,
794 });
795 }
796 }
797
798 (chunk_nodes, chunk_edges)
799 }
800
801 fn embed_and_persist(
808 &self,
809 symbols: &[Symbol],
810 chunks: &[CodeChunk],
811 edges: &[ResolvedEdge],
812 on_progress: impl Fn(usize, usize),
813 ) -> Result<(usize, usize), CodememError> {
814 let mut symbols_embedded = 0usize;
815 let mut chunks_embedded = 0usize;
816
817 if !self.embeddings_ready() {
820 return Ok((0, 0));
821 }
822
823 let sym_texts: Vec<(String, String)> = symbols
825 .iter()
826 .map(|sym| {
827 let id = format!("sym:{}", sym.qualified_name);
828 let text = self.enrich_symbol_text(sym, edges);
829 (id, text)
830 })
831 .collect();
832 let chunk_texts: Vec<(String, String)> = chunks
833 .iter()
834 .map(|chunk| {
835 let id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
836 let text = self.enrich_chunk_text(chunk);
837 (id, text)
838 })
839 .collect();
840
841 let embed_batch_size = self.config.embedding.batch_size;
843
844 let all_pairs: Vec<(String, String)> = sym_texts.into_iter().chain(chunk_texts).collect();
845 let total = all_pairs.len();
846 let sym_count = symbols.len();
847 let mut done = 0usize;
848
849 for batch in all_pairs.chunks(embed_batch_size) {
850 let texts: Vec<&str> = batch.iter().map(|(_, t)| t.as_str()).collect();
851
852 let t0 = std::time::Instant::now();
853 let embed_result = {
854 let emb = self.lock_embeddings()?;
855 match emb {
856 Some(emb_guard) => emb_guard.embed_batch(&texts),
857 None => break,
858 }
859 };
860
861 match embed_result {
862 Ok(embeddings) => {
863 let embed_ms = t0.elapsed().as_millis();
864
865 let t1 = std::time::Instant::now();
866 let pairs: Vec<(&str, &[f32])> = batch
867 .iter()
868 .zip(embeddings.iter())
869 .map(|((id, _), emb_vec)| (id.as_str(), emb_vec.as_slice()))
870 .collect();
871 if let Err(e) = self.storage.store_embeddings_batch(&pairs) {
872 tracing::warn!("Failed to batch-store embeddings: {e}");
873 }
874 let sqlite_ms = t1.elapsed().as_millis();
875
876 let t2 = std::time::Instant::now();
877 let batch_items: Vec<(String, Vec<f32>)> = batch
878 .iter()
879 .zip(embeddings.into_iter())
880 .map(|((id, _), emb_vec)| (id.clone(), emb_vec))
881 .collect();
882 let batch_len = batch_items.len();
883 {
884 let mut vec = self.lock_vector()?;
885 if let Err(e) = vec.insert_batch(&batch_items) {
886 tracing::warn!("Failed to batch-insert into vector index: {e}");
887 }
888 }
889 let vector_ms = t2.elapsed().as_millis();
890
891 let syms_in_batch = batch_len.min(sym_count.saturating_sub(done));
892 symbols_embedded += syms_in_batch;
893 chunks_embedded += batch_len - syms_in_batch;
894 done += batch_len;
895
896 tracing::debug!(
897 "Embed batch {}: embed={embed_ms}ms sqlite={sqlite_ms}ms vector={vector_ms}ms",
898 batch_len
899 );
900 }
901 Err(e) => {
902 tracing::warn!("embed_batch failed for chunk of {} texts: {e}", batch.len());
903 }
904 }
905 on_progress(done, total);
906 }
907 self.save_index();
908
909 Ok((symbols_embedded, chunks_embedded))
910 }
911}