1mod compaction;
5pub mod cross_repo;
6
7use crate::index::{CodeChunk, ResolvedEdge, Symbol};
8use crate::IndexAndResolveResult;
9use codemem_core::{CodememError, Edge, GraphConfig, GraphNode, NodeKind, RelationshipType};
10use std::collections::{HashMap, HashSet};
11
/// Counters summarising one index-persistence run (see
/// `persist_graph_only` / `persist_index_results_with_progress`).
#[derive(Debug, Default)]
pub struct IndexPersistResult {
    /// Number of file paths in the indexing result.
    pub files_created: usize,
    /// Number of distinct `pkg:` directory nodes built for those files.
    pub packages_created: usize,
    /// Number of symbols in the indexing result.
    pub symbols_stored: usize,
    /// Number of chunk nodes built from the indexing result.
    pub chunks_stored: usize,
    /// Number of resolved reference edges in the indexing result.
    pub edges_resolved: usize,
    /// Symbols counted as embedded; always 0 on the graph-only path.
    pub symbols_embedded: usize,
    /// Chunks counted as embedded; always 0 on the graph-only path.
    pub chunks_embedded: usize,
    /// First value reported by `compact_graph`; 0 when `chunking.auto_compact` is off.
    pub chunks_pruned: usize,
    /// Second value reported by `compact_graph`; 0 when `chunking.auto_compact` is off.
    pub symbols_pruned: usize,
}
25
/// Counters for a cross-repository persistence pass.
///
/// NOTE(review): nothing in this file populates these fields — presumably the
/// `cross_repo` submodule does. The field meanings are only suggested by their
/// names; confirm against the code that fills them in before relying on them.
#[derive(Debug, Default)]
pub struct CrossRepoPersistResult {
    pub packages_registered: usize,
    pub unresolved_refs_stored: usize,
    pub forward_edges_created: usize,
    pub backward_edges_created: usize,
    pub endpoints_detected: usize,
    pub client_calls_detected: usize,
    pub spec_endpoints_detected: usize,
    pub event_channels_detected: usize,
    pub http_edges_matched: usize,
    pub event_edges_matched: usize,
}
40
41pub fn edge_weight_for(rel: &RelationshipType, config: &GraphConfig) -> f64 {
44 match rel {
45 RelationshipType::Calls => config.calls_edge_weight,
46 RelationshipType::Imports => config.imports_edge_weight,
47 RelationshipType::Contains => config.contains_edge_weight,
48 RelationshipType::TypeDefinition => config.type_definition_edge_weight,
49 RelationshipType::Reads => config.reads_edge_weight,
50 RelationshipType::Writes => config.writes_edge_weight,
51 RelationshipType::Overrides => config.overrides_edge_weight,
52 RelationshipType::Implements | RelationshipType::Inherits => 0.8,
53 RelationshipType::DependsOn => 0.7,
54 RelationshipType::CoChanged => 0.6,
55 RelationshipType::EvolvedInto | RelationshipType::Summarizes => 0.7,
56 RelationshipType::PartOf => 0.4,
57 RelationshipType::RelatesTo | RelationshipType::SharesTheme => 0.3,
58 RelationshipType::HttpCalls => 0.7,
59 RelationshipType::PublishesTo | RelationshipType::SubscribesTo => 0.6,
60 RelationshipType::ModifiedBy => 0.4,
61 _ => 0.5,
62 }
63}
64
/// Internal counts handed back by `persist_graph_nodes`.
struct GraphPersistCounts {
    /// Number of distinct package (directory) nodes built by `build_package_tree`.
    packages_created: usize,
    /// Number of chunk nodes built by `build_chunk_nodes`.
    chunks_stored: usize,
}
70
71impl super::CodememEngine {
72 pub fn persist_index_results(
77 &self,
78 results: &IndexAndResolveResult,
79 namespace: Option<&str>,
80 ) -> Result<IndexPersistResult, CodememError> {
81 self.persist_index_results_with_progress(results, namespace, |_, _| {})
82 }
83
84 pub fn persist_graph_only(
90 &self,
91 results: &IndexAndResolveResult,
92 namespace: Option<&str>,
93 ) -> Result<IndexPersistResult, CodememError> {
94 let seen_files = &results.file_paths;
95 let graph_counts = self.persist_graph_nodes(results, namespace)?;
96
97 let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
98 self.compact_graph(seen_files)
99 } else {
100 (0, 0)
101 };
102
103 Ok(IndexPersistResult {
104 files_created: seen_files.len(),
105 packages_created: graph_counts.packages_created,
106 symbols_stored: results.symbols.len(),
107 chunks_stored: graph_counts.chunks_stored,
108 edges_resolved: results.edges.len(),
109 symbols_embedded: 0,
110 chunks_embedded: 0,
111 chunks_pruned,
112 symbols_pruned,
113 })
114 }
115
116 pub fn persist_index_results_with_progress(
119 &self,
120 results: &IndexAndResolveResult,
121 namespace: Option<&str>,
122 on_progress: impl Fn(usize, usize),
123 ) -> Result<IndexPersistResult, CodememError> {
124 let seen_files = &results.file_paths;
125
126 let graph_counts = self.persist_graph_nodes(results, namespace)?;
128
129 let (symbols_embedded, chunks_embedded) = self.embed_and_persist(
131 &results.symbols,
132 &results.chunks,
133 &results.edges,
134 on_progress,
135 )?;
136
137 let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
139 self.compact_graph(seen_files)
140 } else {
141 (0, 0)
142 };
143
144 Ok(IndexPersistResult {
145 files_created: seen_files.len(),
146 packages_created: graph_counts.packages_created,
147 symbols_stored: results.symbols.len(),
148 chunks_stored: graph_counts.chunks_stored,
149 edges_resolved: results.edges.len(),
150 symbols_embedded,
151 chunks_embedded,
152 chunks_pruned,
153 symbols_pruned,
154 })
155 }
156
157 fn persist_graph_nodes(
162 &self,
163 results: &IndexAndResolveResult,
164 namespace: Option<&str>,
165 ) -> Result<GraphPersistCounts, CodememError> {
166 let all_symbols = &results.symbols;
167 let all_chunks = &results.chunks;
168 let seen_files = &results.file_paths;
169 let edges = &results.edges;
170
171 let now = chrono::Utc::now();
172 let ns_string = namespace.map(|s| s.to_string());
173 let contains_weight = edge_weight_for(&RelationshipType::Contains, &self.config.graph);
174
175 let mut graph = self.lock_graph()?;
176
177 let file_nodes: Vec<GraphNode> = seen_files
179 .iter()
180 .map(|file_path| {
181 let mut payload = HashMap::new();
182 payload.insert(
183 "file_path".to_string(),
184 serde_json::Value::String(file_path.clone()),
185 );
186 GraphNode {
187 id: format!("file:{file_path}"),
188 kind: NodeKind::File,
189 label: file_path.clone(),
190 payload,
191 centrality: 0.0,
192 memory_id: None,
193 namespace: ns_string.clone(),
194 valid_from: None,
195 valid_to: None,
196 }
197 })
198 .collect();
199 self.persist_nodes_to_storage_and_graph(&file_nodes, &mut **graph);
200
201 let (dir_nodes, dir_edges, created_dirs) =
203 self.build_package_tree(seen_files, &ns_string, contains_weight, now);
204 self.persist_nodes_to_storage_and_graph(&dir_nodes, &mut **graph);
205 self.persist_edges_to_storage_and_graph(&dir_edges, &mut **graph);
206
207 let (sym_nodes, sym_edges) =
209 Self::build_symbol_nodes(all_symbols, &ns_string, contains_weight, now);
210
211 let mut old_syms_by_file: HashMap<String, HashSet<String>> = HashMap::new();
219 for node in graph.get_all_nodes() {
220 if !node.id.starts_with("sym:") {
221 continue;
222 }
223 if matches!(
228 node.payload.get("source").and_then(|v| v.as_str()),
229 Some("scip" | "scip-synthetic")
230 ) {
231 continue;
232 }
233 let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) else {
234 continue;
235 };
236 if !seen_files.contains(fp) {
237 continue;
238 }
239 old_syms_by_file
240 .entry(fp.to_string())
241 .or_default()
242 .insert(node.id);
243 }
244 drop(graph);
245 for file_path in seen_files {
246 let new_sym_ids: HashSet<String> = sym_nodes
247 .iter()
248 .filter(|n| {
249 n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path.as_str())
250 })
251 .map(|n| n.id.clone())
252 .collect();
253 let empty = HashSet::new();
254 let old_sym_ids = old_syms_by_file.get(file_path).unwrap_or(&empty);
255 if let Err(e) = self.cleanup_stale_symbols(file_path, old_sym_ids, &new_sym_ids) {
256 tracing::warn!("Failed to cleanup stale symbols for {file_path}: {e}");
257 }
258 }
259 let mut graph = self.lock_graph()?; self.persist_nodes_to_storage_and_graph(&sym_nodes, &mut **graph);
262 self.persist_edges_to_storage_and_graph(&sym_edges, &mut **graph);
263
264 let ref_edges = Self::build_reference_edges(edges, &self.config.graph, now);
266 self.persist_edges_to_storage_and_graph(&ref_edges, &mut **graph);
267
268 if let Some(ref scip_build) = results.scip_build {
270 let new_scip_ids: HashSet<&str> =
273 scip_build.nodes.iter().map(|n| n.id.as_str()).collect();
274 let mut stale_scip_ids = Vec::new();
275 for node in graph.get_all_nodes() {
276 if !node.id.starts_with("sym:") {
277 continue;
278 }
279 if !matches!(
280 node.payload.get("source").and_then(|v| v.as_str()),
281 Some("scip" | "scip-synthetic")
282 ) {
283 continue;
284 }
285 if !new_scip_ids.contains(node.id.as_str()) {
286 if let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) {
288 if seen_files.contains(fp) {
289 stale_scip_ids.push(node.id.clone());
290 }
291 }
292 }
293 }
294 for stale_id in &stale_scip_ids {
295 let _ = graph.remove_node(stale_id);
296 let _ = self.storage.delete_graph_nodes_by_prefix(stale_id);
297 if let Some(qname) = stale_id.strip_prefix("sym:") {
299 let doc_id = format!("scip-doc:{qname}");
300 let _ = self.storage.delete_memory(&doc_id);
301 }
302 }
303 if !stale_scip_ids.is_empty() {
304 tracing::info!(
305 "Cleaned up {} stale SCIP nodes from re-index",
306 stale_scip_ids.len()
307 );
308 }
309
310 self.persist_nodes_to_storage_and_graph(&scip_build.nodes, &mut **graph);
311
312 let (fused_edges, superseded_ids) = Self::fuse_edges(&ref_edges, &scip_build.edges);
315
316 for edge_id in &superseded_ids {
318 let _ = graph.remove_edge(edge_id);
319 let _ = self.storage.delete_graph_edge(edge_id);
320 }
321
322 self.persist_edges_to_storage_and_graph(&fused_edges, &mut **graph);
323
324 for (memory, related_node_id) in &scip_build.memories {
326 let _ = self.storage.insert_memory(memory);
327 let relates_edge = Edge {
328 id: format!("relates:{}->mem:{}", related_node_id, memory.id),
329 src: related_node_id.clone(),
330 dst: format!("mem:{}", memory.id),
331 relationship: RelationshipType::RelatesTo,
332 weight: 0.3,
333 properties: HashMap::new(),
334 created_at: now,
335 valid_from: Some(now),
336 valid_to: None,
337 };
338 let _ = graph.add_edge(relates_edge.clone());
339 let _ = self.storage.insert_graph_edges_batch(&[relates_edge]);
340 }
341 }
342
343 for file_path in seen_files {
345 let prefix = format!("chunk:{file_path}:");
346 let _ = self.storage.delete_graph_nodes_by_prefix(&prefix);
347 }
348 let (chunk_nodes, chunk_edges) =
349 Self::build_chunk_nodes(all_chunks, &ns_string, contains_weight, now);
350 let chunk_count = chunk_nodes.len();
351 self.persist_nodes_to_storage_and_graph(&chunk_nodes, &mut **graph);
352 self.persist_edges_to_storage_and_graph(&chunk_edges, &mut **graph);
353
354 drop(graph);
355
356 Ok(GraphPersistCounts {
357 packages_created: created_dirs,
358 chunks_stored: chunk_count,
359 })
360 }
361
362 fn persist_nodes_to_storage_and_graph(
364 &self,
365 nodes: &[GraphNode],
366 graph: &mut dyn codemem_core::GraphBackend,
367 ) {
368 if let Err(e) = self.storage.insert_graph_nodes_batch(nodes) {
369 tracing::warn!("Failed to batch-insert {} graph nodes: {e}", nodes.len());
370 }
371 for node in nodes {
372 let _ = graph.add_node(node.clone());
373 }
374 }
375
376 fn persist_edges_to_storage_and_graph(
378 &self,
379 edges: &[Edge],
380 graph: &mut dyn codemem_core::GraphBackend,
381 ) {
382 if let Err(e) = self.storage.insert_graph_edges_batch(edges) {
383 tracing::warn!("Failed to batch-insert {} graph edges: {e}", edges.len());
384 }
385 for edge in edges {
386 let _ = graph.add_edge(edge.clone());
387 }
388 }
389
390 fn build_package_tree(
393 &self,
394 seen_files: &HashSet<String>,
395 ns_string: &Option<String>,
396 contains_weight: f64,
397 now: chrono::DateTime<chrono::Utc>,
398 ) -> (Vec<GraphNode>, Vec<Edge>, usize) {
399 let mut created_dirs: HashSet<String> = HashSet::new();
400 let mut created_edge_ids: HashSet<String> = HashSet::new();
401 let mut dir_nodes = Vec::new();
402 let mut dir_edges = Vec::new();
403
404 for file_path in seen_files {
405 let p = std::path::Path::new(file_path);
406 let mut ancestors: Vec<String> = Vec::new();
407 let mut current = p.parent();
408 while let Some(dir) = current {
409 let dir_str = dir.to_string_lossy().to_string();
410 if dir_str.is_empty() || dir_str == "." {
411 break;
412 }
413 ancestors.push(dir_str);
414 current = dir.parent();
415 }
416 ancestors.reverse();
417 for (i, dir_str) in ancestors.iter().enumerate() {
418 let pkg_id = format!("pkg:{dir_str}/");
419 if created_dirs.insert(pkg_id.clone()) {
420 dir_nodes.push(GraphNode {
421 id: pkg_id.clone(),
422 kind: NodeKind::Package,
423 label: format!("{dir_str}/"),
424 payload: HashMap::new(),
425 centrality: 0.0,
426 memory_id: None,
427 namespace: ns_string.clone(),
428 valid_from: None,
429 valid_to: None,
430 });
431 }
432 if i == 0 {
433 continue;
434 }
435 let parent_pkg_id = format!("pkg:{}/", ancestors[i - 1]);
436 let edge_id = format!("contains:{parent_pkg_id}->{pkg_id}");
437 if !created_edge_ids.insert(edge_id.clone()) {
441 continue;
442 }
443 dir_edges.push(Edge {
444 id: edge_id,
445 src: parent_pkg_id,
446 dst: pkg_id.clone(),
447 relationship: RelationshipType::Contains,
448 weight: contains_weight,
449 valid_from: Some(now),
450 valid_to: None,
451 properties: HashMap::new(),
452 created_at: now,
453 });
454 }
455 if let Some(last_dir) = ancestors.last() {
456 let parent_pkg_id = format!("pkg:{last_dir}/");
457 let file_node_id = format!("file:{file_path}");
458 let edge_id = format!("contains:{parent_pkg_id}->{file_node_id}");
459 dir_edges.push(Edge {
460 id: edge_id,
461 src: parent_pkg_id,
462 dst: file_node_id,
463 relationship: RelationshipType::Contains,
464 weight: contains_weight,
465 valid_from: Some(now),
466 valid_to: None,
467 properties: HashMap::new(),
468 created_at: now,
469 });
470 }
471 }
472
473 let count = created_dirs.len();
474 (dir_nodes, dir_edges, count)
475 }
476
477 fn build_symbol_nodes(
479 symbols: &[Symbol],
480 ns_string: &Option<String>,
481 contains_weight: f64,
482 now: chrono::DateTime<chrono::Utc>,
483 ) -> (Vec<GraphNode>, Vec<Edge>) {
484 let mut sym_nodes = Vec::with_capacity(symbols.len());
485 let mut sym_edges = Vec::with_capacity(symbols.len());
486
487 for sym in symbols {
488 let kind = NodeKind::from(sym.kind);
489 let payload = Self::build_symbol_payload(sym);
490
491 let sym_node_id = format!("sym:{}", sym.qualified_name);
492 sym_nodes.push(GraphNode {
493 id: sym_node_id.clone(),
494 kind,
495 label: sym.qualified_name.clone(),
496 payload,
497 centrality: 0.0,
498 memory_id: None,
499 namespace: ns_string.clone(),
500 valid_from: None,
501 valid_to: None,
502 });
503
504 let file_node_id = format!("file:{}", sym.file_path);
505 sym_edges.push(Edge {
506 id: format!("contains:{file_node_id}->{sym_node_id}"),
507 src: file_node_id,
508 dst: sym_node_id,
509 relationship: RelationshipType::Contains,
510 weight: contains_weight,
511 valid_from: Some(now),
512 valid_to: None,
513 properties: HashMap::new(),
514 created_at: now,
515 });
516 }
517
518 (sym_nodes, sym_edges)
519 }
520
521 fn build_symbol_payload(sym: &Symbol) -> HashMap<String, serde_json::Value> {
523 let mut payload = HashMap::new();
524 payload.insert(
525 "symbol_kind".to_string(),
526 serde_json::Value::String(sym.kind.to_string()),
527 );
528 payload.insert(
529 "signature".to_string(),
530 serde_json::Value::String(sym.signature.clone()),
531 );
532 payload.insert(
533 "file_path".to_string(),
534 serde_json::Value::String(sym.file_path.clone()),
535 );
536 payload.insert("line_start".to_string(), serde_json::json!(sym.line_start));
537 payload.insert("line_end".to_string(), serde_json::json!(sym.line_end));
538 payload.insert(
539 "visibility".to_string(),
540 serde_json::Value::String(sym.visibility.to_string()),
541 );
542 if let Some(ref doc) = sym.doc_comment {
543 payload.insert(
544 "doc_comment".to_string(),
545 serde_json::Value::String(doc.clone()),
546 );
547 }
548 if !sym.parameters.is_empty() {
549 payload.insert(
550 "parameters".to_string(),
551 serde_json::to_value(&sym.parameters).unwrap_or_default(),
552 );
553 }
554 if let Some(ref ret) = sym.return_type {
555 payload.insert(
556 "return_type".to_string(),
557 serde_json::Value::String(ret.clone()),
558 );
559 }
560 if sym.is_async {
561 payload.insert("is_async".to_string(), serde_json::json!(true));
562 }
563 if !sym.attributes.is_empty() {
564 payload.insert(
565 "attributes".to_string(),
566 serde_json::to_value(&sym.attributes).unwrap_or_default(),
567 );
568 }
569 if !sym.throws.is_empty() {
570 payload.insert(
571 "throws".to_string(),
572 serde_json::to_value(&sym.throws).unwrap_or_default(),
573 );
574 }
575 if let Some(ref gp) = sym.generic_params {
576 payload.insert(
577 "generic_params".to_string(),
578 serde_json::Value::String(gp.clone()),
579 );
580 }
581 if sym.is_abstract {
582 payload.insert("is_abstract".to_string(), serde_json::json!(true));
583 }
584 if let Some(ref parent) = sym.parent {
585 payload.insert(
586 "parent".to_string(),
587 serde_json::Value::String(parent.clone()),
588 );
589 }
590 payload
591 }
592
593 const AST_GREP_BASE_CONFIDENCE: f64 = 0.10;
596
597 fn build_reference_edges(
598 edges: &[ResolvedEdge],
599 graph_config: &GraphConfig,
600 now: chrono::DateTime<chrono::Utc>,
601 ) -> Vec<Edge> {
602 edges
603 .iter()
604 .map(|edge| {
605 let mut properties = HashMap::new();
606 properties.insert("source".to_string(), serde_json::json!("ast-grep"));
607 properties.insert(
608 "confidence".to_string(),
609 serde_json::json!(Self::AST_GREP_BASE_CONFIDENCE),
610 );
611 properties.insert("source_layers".to_string(), serde_json::json!(["ast-grep"]));
612 let base_weight = edge_weight_for(&edge.relationship, graph_config);
616 let weight = base_weight * edge.resolution_confidence;
617 Edge {
618 id: format!(
619 "ref:{}->{}:{}",
620 edge.source_qualified_name, edge.target_qualified_name, edge.relationship
621 ),
622 src: format!("sym:{}", edge.source_qualified_name),
623 dst: format!("sym:{}", edge.target_qualified_name),
624 relationship: edge.relationship,
625 weight,
626 valid_from: Some(now),
627 valid_to: None,
628 properties,
629 created_at: now,
630 }
631 })
632 .collect()
633 }
634
635 fn fuse_edges(ast_grep_edges: &[Edge], scip_edges: &[Edge]) -> (Vec<Edge>, Vec<String>) {
642 let ast_grep_index: HashMap<(String, String, String), &str> = ast_grep_edges
644 .iter()
645 .map(|e| {
646 (
647 (e.src.clone(), e.dst.clone(), e.relationship.to_string()),
648 e.id.as_str(),
649 )
650 })
651 .collect();
652
653 let mut superseded_ids = Vec::new();
654
655 let fused = scip_edges
656 .iter()
657 .map(|scip_edge| {
658 let key = (
659 scip_edge.src.clone(),
660 scip_edge.dst.clone(),
661 scip_edge.relationship.to_string(),
662 );
663 if let Some(&ast_edge_id) = ast_grep_index.get(&key) {
664 superseded_ids.push(ast_edge_id.to_string());
666 let mut fused = scip_edge.clone();
667 let scip_conf = scip_edge
668 .properties
669 .get("confidence")
670 .and_then(|v| v.as_f64())
671 .unwrap_or(0.15);
672 let fused_conf = scip_conf + Self::AST_GREP_BASE_CONFIDENCE;
673 fused
674 .properties
675 .insert("confidence".to_string(), serde_json::json!(fused_conf));
676 fused.properties.insert(
677 "source_layers".to_string(),
678 serde_json::json!(["ast-grep", "scip"]),
679 );
680 fused
681 } else {
682 scip_edge.clone()
683 }
684 })
685 .collect();
686
687 (fused, superseded_ids)
688 }
689
690 fn build_chunk_nodes(
692 chunks: &[CodeChunk],
693 ns_string: &Option<String>,
694 contains_weight: f64,
695 now: chrono::DateTime<chrono::Utc>,
696 ) -> (Vec<GraphNode>, Vec<Edge>) {
697 let mut chunk_nodes = Vec::with_capacity(chunks.len());
698 let mut chunk_edges = Vec::with_capacity(chunks.len() * 2);
699
700 for chunk in chunks {
701 let chunk_id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
702
703 let mut payload = HashMap::new();
704 payload.insert(
705 "file_path".to_string(),
706 serde_json::Value::String(chunk.file_path.clone()),
707 );
708 payload.insert(
709 "line_start".to_string(),
710 serde_json::json!(chunk.line_start),
711 );
712 payload.insert("line_end".to_string(), serde_json::json!(chunk.line_end));
713 payload.insert(
714 "node_kind".to_string(),
715 serde_json::Value::String(chunk.node_kind.clone()),
716 );
717 payload.insert(
718 "non_ws_chars".to_string(),
719 serde_json::json!(chunk.non_ws_chars),
720 );
721 if let Some(ref parent) = chunk.parent_symbol {
722 payload.insert(
723 "parent_symbol".to_string(),
724 serde_json::Value::String(parent.clone()),
725 );
726 }
727
728 chunk_nodes.push(GraphNode {
729 id: chunk_id.clone(),
730 kind: NodeKind::Chunk,
731 label: format!(
732 "chunk:{}:{}..{}",
733 chunk.file_path, chunk.line_start, chunk.line_end
734 ),
735 payload,
736 centrality: 0.0,
737 memory_id: None,
738 namespace: ns_string.clone(),
739 valid_from: None,
740 valid_to: None,
741 });
742
743 let file_node_id = format!("file:{}", chunk.file_path);
744 chunk_edges.push(Edge {
745 id: format!("contains:{file_node_id}->{chunk_id}"),
746 src: file_node_id,
747 dst: chunk_id.clone(),
748 relationship: RelationshipType::Contains,
749 weight: contains_weight,
750 valid_from: Some(now),
751 valid_to: None,
752 properties: HashMap::new(),
753 created_at: now,
754 });
755
756 if let Some(ref parent_sym) = chunk.parent_symbol {
757 let parent_node_id = format!("sym:{parent_sym}");
758 chunk_edges.push(Edge {
759 id: format!("contains:{parent_node_id}->{chunk_id}"),
760 src: parent_node_id,
761 dst: chunk_id,
762 relationship: RelationshipType::Contains,
763 weight: contains_weight,
764 valid_from: Some(now),
765 valid_to: None,
766 properties: HashMap::new(),
767 created_at: now,
768 });
769 }
770 }
771
772 (chunk_nodes, chunk_edges)
773 }
774
775 fn embed_and_persist(
782 &self,
783 symbols: &[Symbol],
784 chunks: &[CodeChunk],
785 edges: &[ResolvedEdge],
786 on_progress: impl Fn(usize, usize),
787 ) -> Result<(usize, usize), CodememError> {
788 let mut symbols_embedded = 0usize;
789 let mut chunks_embedded = 0usize;
790
791 if !self.embeddings_ready() {
794 return Ok((0, 0));
795 }
796
797 let sym_texts: Vec<(String, String)> = symbols
799 .iter()
800 .map(|sym| {
801 let id = format!("sym:{}", sym.qualified_name);
802 let text = self.enrich_symbol_text(sym, edges);
803 (id, text)
804 })
805 .collect();
806 let chunk_texts: Vec<(String, String)> = chunks
807 .iter()
808 .map(|chunk| {
809 let id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
810 let text = self.enrich_chunk_text(chunk);
811 (id, text)
812 })
813 .collect();
814
815 let embed_batch_size = self.config.embedding.batch_size;
817
818 let all_pairs: Vec<(String, String)> = sym_texts.into_iter().chain(chunk_texts).collect();
819 let total = all_pairs.len();
820 let sym_count = symbols.len();
821 let mut done = 0usize;
822
823 for batch in all_pairs.chunks(embed_batch_size) {
824 let texts: Vec<&str> = batch.iter().map(|(_, t)| t.as_str()).collect();
825
826 let t0 = std::time::Instant::now();
827 let embed_result = {
828 let emb = self.lock_embeddings()?;
829 match emb {
830 Some(emb_guard) => emb_guard.embed_batch(&texts),
831 None => break,
832 }
833 };
834
835 match embed_result {
836 Ok(embeddings) => {
837 let embed_ms = t0.elapsed().as_millis();
838
839 let t1 = std::time::Instant::now();
840 let pairs: Vec<(&str, &[f32])> = batch
841 .iter()
842 .zip(embeddings.iter())
843 .map(|((id, _), emb_vec)| (id.as_str(), emb_vec.as_slice()))
844 .collect();
845 if let Err(e) = self.storage.store_embeddings_batch(&pairs) {
846 tracing::warn!("Failed to batch-store embeddings: {e}");
847 }
848 let sqlite_ms = t1.elapsed().as_millis();
849
850 let t2 = std::time::Instant::now();
851 let batch_items: Vec<(String, Vec<f32>)> = batch
852 .iter()
853 .zip(embeddings.into_iter())
854 .map(|((id, _), emb_vec)| (id.clone(), emb_vec))
855 .collect();
856 let batch_len = batch_items.len();
857 {
858 let mut vec = self.lock_vector()?;
859 if let Err(e) = vec.insert_batch(&batch_items) {
860 tracing::warn!("Failed to batch-insert into vector index: {e}");
861 }
862 }
863 let vector_ms = t2.elapsed().as_millis();
864
865 let syms_in_batch = batch_len.min(sym_count.saturating_sub(done));
866 symbols_embedded += syms_in_batch;
867 chunks_embedded += batch_len - syms_in_batch;
868 done += batch_len;
869
870 tracing::debug!(
871 "Embed batch {}: embed={embed_ms}ms sqlite={sqlite_ms}ms vector={vector_ms}ms",
872 batch_len
873 );
874 }
875 Err(e) => {
876 tracing::warn!("embed_batch failed for chunk of {} texts: {e}", batch.len());
877 }
878 }
879 on_progress(done, total);
880 }
881 self.save_index();
882
883 Ok((symbols_embedded, chunks_embedded))
884 }
885}