1mod compaction;
5pub mod cross_repo;
6
7use crate::index::{CodeChunk, ResolvedEdge, Symbol};
8use crate::IndexAndResolveResult;
9use codemem_core::{CodememError, Edge, GraphConfig, GraphNode, NodeKind, RelationshipType};
10use std::collections::{HashMap, HashSet};
11
/// Summary counters returned after persisting the results of one indexing run.
///
/// Built by `persist_graph_only` and `persist_index_results_with_progress`.
/// `Clone`, `PartialEq` and `Eq` are derived so callers and tests can copy and
/// compare results directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct IndexPersistResult {
    /// Number of file paths in the indexing result (`results.file_paths.len()`).
    pub files_created: usize,
    /// Number of distinct package (directory) nodes built from those file paths.
    pub packages_created: usize,
    /// Number of symbols in the indexing result (`results.symbols.len()`).
    pub symbols_stored: usize,
    /// Number of chunk nodes built from the indexing result.
    pub chunks_stored: usize,
    /// Number of resolved reference edges in the indexing result (`results.edges.len()`).
    pub edges_resolved: usize,
    /// Symbols counted as embedded; always 0 on the graph-only path.
    pub symbols_embedded: usize,
    /// Chunks counted as embedded; always 0 on the graph-only path.
    pub chunks_embedded: usize,
    /// Chunks pruned by graph compaction; 0 when `chunking.auto_compact` is off.
    pub chunks_pruned: usize,
    /// Symbols pruned by graph compaction; 0 when `chunking.auto_compact` is off.
    pub symbols_pruned: usize,
}
25
/// Summary counters for cross-repository linking.
///
/// NOTE(review): nothing in this file constructs or fills this struct; it is
/// presumably populated by the `cross_repo` submodule — confirm the exact
/// meaning of each counter there.
///
/// `Clone`, `PartialEq` and `Eq` are derived so callers and tests can copy and
/// compare results directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CrossRepoPersistResult {
    pub packages_registered: usize,
    pub unresolved_refs_stored: usize,
    pub forward_edges_created: usize,
    pub backward_edges_created: usize,
    pub endpoints_detected: usize,
    pub client_calls_detected: usize,
    pub spec_endpoints_detected: usize,
    pub event_channels_detected: usize,
    pub http_edges_matched: usize,
    pub event_edges_matched: usize,
}
40
41pub fn edge_weight_for(rel: &RelationshipType, config: &GraphConfig) -> f64 {
44 match rel {
45 RelationshipType::Calls => config.calls_edge_weight,
46 RelationshipType::Imports => config.imports_edge_weight,
47 RelationshipType::Contains => config.contains_edge_weight,
48 RelationshipType::TypeDefinition => config.type_definition_edge_weight,
49 RelationshipType::Reads => config.reads_edge_weight,
50 RelationshipType::Writes => config.writes_edge_weight,
51 RelationshipType::Overrides => config.overrides_edge_weight,
52 RelationshipType::Implements | RelationshipType::Inherits => 0.8,
53 RelationshipType::DependsOn => 0.7,
54 RelationshipType::CoChanged => 0.6,
55 RelationshipType::EvolvedInto | RelationshipType::Summarizes => 0.7,
56 RelationshipType::PartOf => 0.4,
57 RelationshipType::RelatesTo | RelationshipType::SharesTheme => 0.3,
58 RelationshipType::HttpCalls => 0.7,
59 RelationshipType::PublishesTo | RelationshipType::SubscribesTo => 0.6,
60 _ => 0.5,
61 }
62}
63
/// Counts produced by `persist_graph_nodes` that its callers copy into
/// `IndexPersistResult`.
struct GraphPersistCounts {
    /// Distinct package (directory) nodes reported by `build_package_tree`.
    packages_created: usize,
    /// Chunk nodes produced by `build_chunk_nodes`.
    chunks_stored: usize,
}
69
70impl super::CodememEngine {
71 pub fn persist_index_results(
76 &self,
77 results: &IndexAndResolveResult,
78 namespace: Option<&str>,
79 ) -> Result<IndexPersistResult, CodememError> {
80 self.persist_index_results_with_progress(results, namespace, |_, _| {})
81 }
82
83 pub fn persist_graph_only(
89 &self,
90 results: &IndexAndResolveResult,
91 namespace: Option<&str>,
92 ) -> Result<IndexPersistResult, CodememError> {
93 let seen_files = &results.file_paths;
94 let graph_counts = self.persist_graph_nodes(results, namespace)?;
95
96 let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
97 self.compact_graph(seen_files)
98 } else {
99 (0, 0)
100 };
101
102 Ok(IndexPersistResult {
103 files_created: seen_files.len(),
104 packages_created: graph_counts.packages_created,
105 symbols_stored: results.symbols.len(),
106 chunks_stored: graph_counts.chunks_stored,
107 edges_resolved: results.edges.len(),
108 symbols_embedded: 0,
109 chunks_embedded: 0,
110 chunks_pruned,
111 symbols_pruned,
112 })
113 }
114
115 pub fn persist_index_results_with_progress(
118 &self,
119 results: &IndexAndResolveResult,
120 namespace: Option<&str>,
121 on_progress: impl Fn(usize, usize),
122 ) -> Result<IndexPersistResult, CodememError> {
123 let seen_files = &results.file_paths;
124
125 let graph_counts = self.persist_graph_nodes(results, namespace)?;
127
128 let (symbols_embedded, chunks_embedded) = self.embed_and_persist(
130 &results.symbols,
131 &results.chunks,
132 &results.edges,
133 on_progress,
134 )?;
135
136 let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
138 self.compact_graph(seen_files)
139 } else {
140 (0, 0)
141 };
142
143 Ok(IndexPersistResult {
144 files_created: seen_files.len(),
145 packages_created: graph_counts.packages_created,
146 symbols_stored: results.symbols.len(),
147 chunks_stored: graph_counts.chunks_stored,
148 edges_resolved: results.edges.len(),
149 symbols_embedded,
150 chunks_embedded,
151 chunks_pruned,
152 symbols_pruned,
153 })
154 }
155
156 fn persist_graph_nodes(
161 &self,
162 results: &IndexAndResolveResult,
163 namespace: Option<&str>,
164 ) -> Result<GraphPersistCounts, CodememError> {
165 let all_symbols = &results.symbols;
166 let all_chunks = &results.chunks;
167 let seen_files = &results.file_paths;
168 let edges = &results.edges;
169
170 let now = chrono::Utc::now();
171 let ns_string = namespace.map(|s| s.to_string());
172 let contains_weight = edge_weight_for(&RelationshipType::Contains, &self.config.graph);
173
174 let mut graph = self.lock_graph()?;
175
176 let file_nodes: Vec<GraphNode> = seen_files
178 .iter()
179 .map(|file_path| {
180 let mut payload = HashMap::new();
181 payload.insert(
182 "file_path".to_string(),
183 serde_json::Value::String(file_path.clone()),
184 );
185 GraphNode {
186 id: format!("file:{file_path}"),
187 kind: NodeKind::File,
188 label: file_path.clone(),
189 payload,
190 centrality: 0.0,
191 memory_id: None,
192 namespace: ns_string.clone(),
193 }
194 })
195 .collect();
196 self.persist_nodes_to_storage_and_graph(&file_nodes, &mut **graph);
197
198 let (dir_nodes, dir_edges, created_dirs) =
200 self.build_package_tree(seen_files, &ns_string, contains_weight, now);
201 self.persist_nodes_to_storage_and_graph(&dir_nodes, &mut **graph);
202 self.persist_edges_to_storage_and_graph(&dir_edges, &mut **graph);
203
204 let (sym_nodes, sym_edges) =
206 Self::build_symbol_nodes(all_symbols, &ns_string, contains_weight, now);
207
208 let mut old_syms_by_file: HashMap<String, HashSet<String>> = HashMap::new();
216 for node in graph.get_all_nodes() {
217 if !node.id.starts_with("sym:") {
218 continue;
219 }
220 if matches!(
225 node.payload.get("source").and_then(|v| v.as_str()),
226 Some("scip" | "scip-synthetic")
227 ) {
228 continue;
229 }
230 let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) else {
231 continue;
232 };
233 if !seen_files.contains(fp) {
234 continue;
235 }
236 old_syms_by_file
237 .entry(fp.to_string())
238 .or_default()
239 .insert(node.id);
240 }
241 drop(graph);
242 for file_path in seen_files {
243 let new_sym_ids: HashSet<String> = sym_nodes
244 .iter()
245 .filter(|n| {
246 n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path.as_str())
247 })
248 .map(|n| n.id.clone())
249 .collect();
250 let empty = HashSet::new();
251 let old_sym_ids = old_syms_by_file.get(file_path).unwrap_or(&empty);
252 if let Err(e) = self.cleanup_stale_symbols(file_path, old_sym_ids, &new_sym_ids) {
253 tracing::warn!("Failed to cleanup stale symbols for {file_path}: {e}");
254 }
255 }
256 let mut graph = self.lock_graph()?; self.persist_nodes_to_storage_and_graph(&sym_nodes, &mut **graph);
259 self.persist_edges_to_storage_and_graph(&sym_edges, &mut **graph);
260
261 let ref_edges = Self::build_reference_edges(edges, &self.config.graph, now);
263 self.persist_edges_to_storage_and_graph(&ref_edges, &mut **graph);
264
265 if let Some(ref scip_build) = results.scip_build {
267 let new_scip_ids: HashSet<&str> =
270 scip_build.nodes.iter().map(|n| n.id.as_str()).collect();
271 let mut stale_scip_ids = Vec::new();
272 for node in graph.get_all_nodes() {
273 if !node.id.starts_with("sym:") {
274 continue;
275 }
276 if !matches!(
277 node.payload.get("source").and_then(|v| v.as_str()),
278 Some("scip" | "scip-synthetic")
279 ) {
280 continue;
281 }
282 if !new_scip_ids.contains(node.id.as_str()) {
283 if let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) {
285 if seen_files.contains(fp) {
286 stale_scip_ids.push(node.id.clone());
287 }
288 }
289 }
290 }
291 for stale_id in &stale_scip_ids {
292 let _ = graph.remove_node(stale_id);
293 let _ = self.storage.delete_graph_nodes_by_prefix(stale_id);
294 if let Some(qname) = stale_id.strip_prefix("sym:") {
296 let doc_id = format!("scip-doc:{qname}");
297 let _ = self.storage.delete_memory(&doc_id);
298 }
299 }
300 if !stale_scip_ids.is_empty() {
301 tracing::info!(
302 "Cleaned up {} stale SCIP nodes from re-index",
303 stale_scip_ids.len()
304 );
305 }
306
307 self.persist_nodes_to_storage_and_graph(&scip_build.nodes, &mut **graph);
308
309 let (fused_edges, superseded_ids) = Self::fuse_edges(&ref_edges, &scip_build.edges);
312
313 for edge_id in &superseded_ids {
315 let _ = graph.remove_edge(edge_id);
316 let _ = self.storage.delete_graph_edge(edge_id);
317 }
318
319 self.persist_edges_to_storage_and_graph(&fused_edges, &mut **graph);
320
321 for (memory, related_node_id) in &scip_build.memories {
323 let _ = self.storage.insert_memory(memory);
324 let relates_edge = Edge {
325 id: format!("relates:{}->mem:{}", related_node_id, memory.id),
326 src: related_node_id.clone(),
327 dst: format!("mem:{}", memory.id),
328 relationship: RelationshipType::RelatesTo,
329 weight: 0.3,
330 properties: HashMap::new(),
331 created_at: now,
332 valid_from: Some(now),
333 valid_to: None,
334 };
335 let _ = graph.add_edge(relates_edge.clone());
336 let _ = self.storage.insert_graph_edges_batch(&[relates_edge]);
337 }
338 }
339
340 for file_path in seen_files {
342 let prefix = format!("chunk:{file_path}:");
343 let _ = self.storage.delete_graph_nodes_by_prefix(&prefix);
344 }
345 let (chunk_nodes, chunk_edges) =
346 Self::build_chunk_nodes(all_chunks, &ns_string, contains_weight, now);
347 let chunk_count = chunk_nodes.len();
348 self.persist_nodes_to_storage_and_graph(&chunk_nodes, &mut **graph);
349 self.persist_edges_to_storage_and_graph(&chunk_edges, &mut **graph);
350
351 drop(graph);
352
353 Ok(GraphPersistCounts {
354 packages_created: created_dirs,
355 chunks_stored: chunk_count,
356 })
357 }
358
359 fn persist_nodes_to_storage_and_graph(
361 &self,
362 nodes: &[GraphNode],
363 graph: &mut dyn codemem_core::GraphBackend,
364 ) {
365 if let Err(e) = self.storage.insert_graph_nodes_batch(nodes) {
366 tracing::warn!("Failed to batch-insert {} graph nodes: {e}", nodes.len());
367 }
368 for node in nodes {
369 let _ = graph.add_node(node.clone());
370 }
371 }
372
373 fn persist_edges_to_storage_and_graph(
375 &self,
376 edges: &[Edge],
377 graph: &mut dyn codemem_core::GraphBackend,
378 ) {
379 if let Err(e) = self.storage.insert_graph_edges_batch(edges) {
380 tracing::warn!("Failed to batch-insert {} graph edges: {e}", edges.len());
381 }
382 for edge in edges {
383 let _ = graph.add_edge(edge.clone());
384 }
385 }
386
387 fn build_package_tree(
390 &self,
391 seen_files: &HashSet<String>,
392 ns_string: &Option<String>,
393 contains_weight: f64,
394 now: chrono::DateTime<chrono::Utc>,
395 ) -> (Vec<GraphNode>, Vec<Edge>, usize) {
396 let mut created_dirs: HashSet<String> = HashSet::new();
397 let mut created_edge_ids: HashSet<String> = HashSet::new();
398 let mut dir_nodes = Vec::new();
399 let mut dir_edges = Vec::new();
400
401 for file_path in seen_files {
402 let p = std::path::Path::new(file_path);
403 let mut ancestors: Vec<String> = Vec::new();
404 let mut current = p.parent();
405 while let Some(dir) = current {
406 let dir_str = dir.to_string_lossy().to_string();
407 if dir_str.is_empty() || dir_str == "." {
408 break;
409 }
410 ancestors.push(dir_str);
411 current = dir.parent();
412 }
413 ancestors.reverse();
414 for (i, dir_str) in ancestors.iter().enumerate() {
415 let pkg_id = format!("pkg:{dir_str}/");
416 if created_dirs.insert(pkg_id.clone()) {
417 dir_nodes.push(GraphNode {
418 id: pkg_id.clone(),
419 kind: NodeKind::Package,
420 label: format!("{dir_str}/"),
421 payload: HashMap::new(),
422 centrality: 0.0,
423 memory_id: None,
424 namespace: ns_string.clone(),
425 });
426 }
427 if i == 0 {
428 continue;
429 }
430 let parent_pkg_id = format!("pkg:{}/", ancestors[i - 1]);
431 let edge_id = format!("contains:{parent_pkg_id}->{pkg_id}");
432 if !created_edge_ids.insert(edge_id.clone()) {
436 continue;
437 }
438 dir_edges.push(Edge {
439 id: edge_id,
440 src: parent_pkg_id,
441 dst: pkg_id.clone(),
442 relationship: RelationshipType::Contains,
443 weight: contains_weight,
444 valid_from: Some(now),
445 valid_to: None,
446 properties: HashMap::new(),
447 created_at: now,
448 });
449 }
450 if let Some(last_dir) = ancestors.last() {
451 let parent_pkg_id = format!("pkg:{last_dir}/");
452 let file_node_id = format!("file:{file_path}");
453 let edge_id = format!("contains:{parent_pkg_id}->{file_node_id}");
454 dir_edges.push(Edge {
455 id: edge_id,
456 src: parent_pkg_id,
457 dst: file_node_id,
458 relationship: RelationshipType::Contains,
459 weight: contains_weight,
460 valid_from: Some(now),
461 valid_to: None,
462 properties: HashMap::new(),
463 created_at: now,
464 });
465 }
466 }
467
468 let count = created_dirs.len();
469 (dir_nodes, dir_edges, count)
470 }
471
472 fn build_symbol_nodes(
474 symbols: &[Symbol],
475 ns_string: &Option<String>,
476 contains_weight: f64,
477 now: chrono::DateTime<chrono::Utc>,
478 ) -> (Vec<GraphNode>, Vec<Edge>) {
479 let mut sym_nodes = Vec::with_capacity(symbols.len());
480 let mut sym_edges = Vec::with_capacity(symbols.len());
481
482 for sym in symbols {
483 let kind = NodeKind::from(sym.kind);
484 let payload = Self::build_symbol_payload(sym);
485
486 let sym_node_id = format!("sym:{}", sym.qualified_name);
487 sym_nodes.push(GraphNode {
488 id: sym_node_id.clone(),
489 kind,
490 label: sym.qualified_name.clone(),
491 payload,
492 centrality: 0.0,
493 memory_id: None,
494 namespace: ns_string.clone(),
495 });
496
497 let file_node_id = format!("file:{}", sym.file_path);
498 sym_edges.push(Edge {
499 id: format!("contains:{file_node_id}->{sym_node_id}"),
500 src: file_node_id,
501 dst: sym_node_id,
502 relationship: RelationshipType::Contains,
503 weight: contains_weight,
504 valid_from: Some(now),
505 valid_to: None,
506 properties: HashMap::new(),
507 created_at: now,
508 });
509 }
510
511 (sym_nodes, sym_edges)
512 }
513
514 fn build_symbol_payload(sym: &Symbol) -> HashMap<String, serde_json::Value> {
516 let mut payload = HashMap::new();
517 payload.insert(
518 "symbol_kind".to_string(),
519 serde_json::Value::String(sym.kind.to_string()),
520 );
521 payload.insert(
522 "signature".to_string(),
523 serde_json::Value::String(sym.signature.clone()),
524 );
525 payload.insert(
526 "file_path".to_string(),
527 serde_json::Value::String(sym.file_path.clone()),
528 );
529 payload.insert("line_start".to_string(), serde_json::json!(sym.line_start));
530 payload.insert("line_end".to_string(), serde_json::json!(sym.line_end));
531 payload.insert(
532 "visibility".to_string(),
533 serde_json::Value::String(sym.visibility.to_string()),
534 );
535 if let Some(ref doc) = sym.doc_comment {
536 payload.insert(
537 "doc_comment".to_string(),
538 serde_json::Value::String(doc.clone()),
539 );
540 }
541 if !sym.parameters.is_empty() {
542 payload.insert(
543 "parameters".to_string(),
544 serde_json::to_value(&sym.parameters).unwrap_or_default(),
545 );
546 }
547 if let Some(ref ret) = sym.return_type {
548 payload.insert(
549 "return_type".to_string(),
550 serde_json::Value::String(ret.clone()),
551 );
552 }
553 if sym.is_async {
554 payload.insert("is_async".to_string(), serde_json::json!(true));
555 }
556 if !sym.attributes.is_empty() {
557 payload.insert(
558 "attributes".to_string(),
559 serde_json::to_value(&sym.attributes).unwrap_or_default(),
560 );
561 }
562 if !sym.throws.is_empty() {
563 payload.insert(
564 "throws".to_string(),
565 serde_json::to_value(&sym.throws).unwrap_or_default(),
566 );
567 }
568 if let Some(ref gp) = sym.generic_params {
569 payload.insert(
570 "generic_params".to_string(),
571 serde_json::Value::String(gp.clone()),
572 );
573 }
574 if sym.is_abstract {
575 payload.insert("is_abstract".to_string(), serde_json::json!(true));
576 }
577 if let Some(ref parent) = sym.parent {
578 payload.insert(
579 "parent".to_string(),
580 serde_json::Value::String(parent.clone()),
581 );
582 }
583 payload
584 }
585
586 const AST_GREP_BASE_CONFIDENCE: f64 = 0.10;
589
590 fn build_reference_edges(
591 edges: &[ResolvedEdge],
592 graph_config: &GraphConfig,
593 now: chrono::DateTime<chrono::Utc>,
594 ) -> Vec<Edge> {
595 edges
596 .iter()
597 .map(|edge| {
598 let mut properties = HashMap::new();
599 properties.insert("source".to_string(), serde_json::json!("ast-grep"));
600 properties.insert(
601 "confidence".to_string(),
602 serde_json::json!(Self::AST_GREP_BASE_CONFIDENCE),
603 );
604 properties.insert("source_layers".to_string(), serde_json::json!(["ast-grep"]));
605 let base_weight = edge_weight_for(&edge.relationship, graph_config);
609 let weight = base_weight * edge.resolution_confidence;
610 Edge {
611 id: format!(
612 "ref:{}->{}:{}",
613 edge.source_qualified_name, edge.target_qualified_name, edge.relationship
614 ),
615 src: format!("sym:{}", edge.source_qualified_name),
616 dst: format!("sym:{}", edge.target_qualified_name),
617 relationship: edge.relationship,
618 weight,
619 valid_from: Some(now),
620 valid_to: None,
621 properties,
622 created_at: now,
623 }
624 })
625 .collect()
626 }
627
628 fn fuse_edges(ast_grep_edges: &[Edge], scip_edges: &[Edge]) -> (Vec<Edge>, Vec<String>) {
635 let ast_grep_index: HashMap<(String, String, String), &str> = ast_grep_edges
637 .iter()
638 .map(|e| {
639 (
640 (e.src.clone(), e.dst.clone(), e.relationship.to_string()),
641 e.id.as_str(),
642 )
643 })
644 .collect();
645
646 let mut superseded_ids = Vec::new();
647
648 let fused = scip_edges
649 .iter()
650 .map(|scip_edge| {
651 let key = (
652 scip_edge.src.clone(),
653 scip_edge.dst.clone(),
654 scip_edge.relationship.to_string(),
655 );
656 if let Some(&ast_edge_id) = ast_grep_index.get(&key) {
657 superseded_ids.push(ast_edge_id.to_string());
659 let mut fused = scip_edge.clone();
660 let scip_conf = scip_edge
661 .properties
662 .get("confidence")
663 .and_then(|v| v.as_f64())
664 .unwrap_or(0.15);
665 let fused_conf = scip_conf + Self::AST_GREP_BASE_CONFIDENCE;
666 fused
667 .properties
668 .insert("confidence".to_string(), serde_json::json!(fused_conf));
669 fused.properties.insert(
670 "source_layers".to_string(),
671 serde_json::json!(["ast-grep", "scip"]),
672 );
673 fused
674 } else {
675 scip_edge.clone()
676 }
677 })
678 .collect();
679
680 (fused, superseded_ids)
681 }
682
683 fn build_chunk_nodes(
685 chunks: &[CodeChunk],
686 ns_string: &Option<String>,
687 contains_weight: f64,
688 now: chrono::DateTime<chrono::Utc>,
689 ) -> (Vec<GraphNode>, Vec<Edge>) {
690 let mut chunk_nodes = Vec::with_capacity(chunks.len());
691 let mut chunk_edges = Vec::with_capacity(chunks.len() * 2);
692
693 for chunk in chunks {
694 let chunk_id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
695
696 let mut payload = HashMap::new();
697 payload.insert(
698 "file_path".to_string(),
699 serde_json::Value::String(chunk.file_path.clone()),
700 );
701 payload.insert(
702 "line_start".to_string(),
703 serde_json::json!(chunk.line_start),
704 );
705 payload.insert("line_end".to_string(), serde_json::json!(chunk.line_end));
706 payload.insert(
707 "node_kind".to_string(),
708 serde_json::Value::String(chunk.node_kind.clone()),
709 );
710 payload.insert(
711 "non_ws_chars".to_string(),
712 serde_json::json!(chunk.non_ws_chars),
713 );
714 if let Some(ref parent) = chunk.parent_symbol {
715 payload.insert(
716 "parent_symbol".to_string(),
717 serde_json::Value::String(parent.clone()),
718 );
719 }
720
721 chunk_nodes.push(GraphNode {
722 id: chunk_id.clone(),
723 kind: NodeKind::Chunk,
724 label: format!(
725 "chunk:{}:{}..{}",
726 chunk.file_path, chunk.line_start, chunk.line_end
727 ),
728 payload,
729 centrality: 0.0,
730 memory_id: None,
731 namespace: ns_string.clone(),
732 });
733
734 let file_node_id = format!("file:{}", chunk.file_path);
735 chunk_edges.push(Edge {
736 id: format!("contains:{file_node_id}->{chunk_id}"),
737 src: file_node_id,
738 dst: chunk_id.clone(),
739 relationship: RelationshipType::Contains,
740 weight: contains_weight,
741 valid_from: Some(now),
742 valid_to: None,
743 properties: HashMap::new(),
744 created_at: now,
745 });
746
747 if let Some(ref parent_sym) = chunk.parent_symbol {
748 let parent_node_id = format!("sym:{parent_sym}");
749 chunk_edges.push(Edge {
750 id: format!("contains:{parent_node_id}->{chunk_id}"),
751 src: parent_node_id,
752 dst: chunk_id,
753 relationship: RelationshipType::Contains,
754 weight: contains_weight,
755 valid_from: Some(now),
756 valid_to: None,
757 properties: HashMap::new(),
758 created_at: now,
759 });
760 }
761 }
762
763 (chunk_nodes, chunk_edges)
764 }
765
766 fn embed_and_persist(
773 &self,
774 symbols: &[Symbol],
775 chunks: &[CodeChunk],
776 edges: &[ResolvedEdge],
777 on_progress: impl Fn(usize, usize),
778 ) -> Result<(usize, usize), CodememError> {
779 let mut symbols_embedded = 0usize;
780 let mut chunks_embedded = 0usize;
781
782 if !self.embeddings_ready() {
785 return Ok((0, 0));
786 }
787
788 let sym_texts: Vec<(String, String)> = symbols
790 .iter()
791 .map(|sym| {
792 let id = format!("sym:{}", sym.qualified_name);
793 let text = self.enrich_symbol_text(sym, edges);
794 (id, text)
795 })
796 .collect();
797 let chunk_texts: Vec<(String, String)> = chunks
798 .iter()
799 .map(|chunk| {
800 let id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
801 let text = self.enrich_chunk_text(chunk);
802 (id, text)
803 })
804 .collect();
805
806 let embed_batch_size = self.config.embedding.batch_size;
808
809 let all_pairs: Vec<(String, String)> = sym_texts.into_iter().chain(chunk_texts).collect();
810 let total = all_pairs.len();
811 let sym_count = symbols.len();
812 let mut done = 0usize;
813
814 for batch in all_pairs.chunks(embed_batch_size) {
815 let texts: Vec<&str> = batch.iter().map(|(_, t)| t.as_str()).collect();
816
817 let t0 = std::time::Instant::now();
818 let embed_result = {
819 let emb = self.lock_embeddings()?;
820 match emb {
821 Some(emb_guard) => emb_guard.embed_batch(&texts),
822 None => break,
823 }
824 };
825
826 match embed_result {
827 Ok(embeddings) => {
828 let embed_ms = t0.elapsed().as_millis();
829
830 let t1 = std::time::Instant::now();
831 let pairs: Vec<(&str, &[f32])> = batch
832 .iter()
833 .zip(embeddings.iter())
834 .map(|((id, _), emb_vec)| (id.as_str(), emb_vec.as_slice()))
835 .collect();
836 if let Err(e) = self.storage.store_embeddings_batch(&pairs) {
837 tracing::warn!("Failed to batch-store embeddings: {e}");
838 }
839 let sqlite_ms = t1.elapsed().as_millis();
840
841 let t2 = std::time::Instant::now();
842 let batch_items: Vec<(String, Vec<f32>)> = batch
843 .iter()
844 .zip(embeddings.into_iter())
845 .map(|((id, _), emb_vec)| (id.clone(), emb_vec))
846 .collect();
847 let batch_len = batch_items.len();
848 {
849 let mut vec = self.lock_vector()?;
850 if let Err(e) = vec.insert_batch(&batch_items) {
851 tracing::warn!("Failed to batch-insert into vector index: {e}");
852 }
853 }
854 let vector_ms = t2.elapsed().as_millis();
855
856 let syms_in_batch = batch_len.min(sym_count.saturating_sub(done));
857 symbols_embedded += syms_in_batch;
858 chunks_embedded += batch_len - syms_in_batch;
859 done += batch_len;
860
861 tracing::debug!(
862 "Embed batch {}: embed={embed_ms}ms sqlite={sqlite_ms}ms vector={vector_ms}ms",
863 batch_len
864 );
865 }
866 Err(e) => {
867 tracing::warn!("embed_batch failed for chunk of {} texts: {e}", batch.len());
868 }
869 }
870 on_progress(done, total);
871 }
872 self.save_index();
873
874 Ok((symbols_embedded, chunks_embedded))
875 }
876}