1mod compaction;
5pub mod cross_repo;
6
7use crate::index::{CodeChunk, ResolvedEdge, Symbol};
8use crate::IndexAndResolveResult;
9use codemem_core::{
10 CodememError, Edge, GraphBackend, GraphConfig, GraphNode, NodeKind, RelationshipType,
11 VectorBackend,
12};
13use std::collections::{HashMap, HashSet};
14
/// Counters summarizing one persistence pass over indexing results.
///
/// All counters start at zero via [`Default`]. The type is also `Clone` and comparable so
/// callers and tests can snapshot and compare summaries.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct IndexPersistResult {
    /// Number of file paths in the persisted result set.
    pub files_created: usize,
    /// Number of distinct package (directory) IDs derived from those file paths.
    pub packages_created: usize,
    /// Number of symbols in the persisted result set.
    pub symbols_stored: usize,
    /// Number of chunk nodes built for the result set.
    pub chunks_stored: usize,
    /// Number of resolved reference edges in the result set.
    pub edges_resolved: usize,
    /// Number of symbols whose embeddings were produced (zero for graph-only persistence).
    pub symbols_embedded: usize,
    /// Number of chunks whose embeddings were produced (zero for graph-only persistence).
    pub chunks_embedded: usize,
    /// Chunks removed by graph compaction; zero when auto-compaction is disabled.
    pub chunks_pruned: usize,
    /// Symbols removed by graph compaction; zero when auto-compaction is disabled.
    pub symbols_pruned: usize,
}
28
/// Counters describing a cross-repository persistence pass.
///
/// All counters start at zero via [`Default`]. The type is also `Clone` and comparable so
/// callers and tests can snapshot and compare summaries.
///
/// NOTE(review): the code that fills these fields is not in this part of the file; the
/// per-field descriptions below are inferred from the field names — confirm against the
/// cross-repo persistence code.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct CrossRepoPersistResult {
    /// Presumably the number of packages registered.
    pub packages_registered: usize,
    /// Presumably the number of unresolved references stored.
    pub unresolved_refs_stored: usize,
    /// Presumably the number of forward cross-repo edges created.
    pub forward_edges_created: usize,
    /// Presumably the number of backward cross-repo edges created.
    pub backward_edges_created: usize,
    /// Presumably the number of endpoints detected.
    pub endpoints_detected: usize,
    /// Presumably the number of client calls detected.
    pub client_calls_detected: usize,
}
39
40pub fn edge_weight_for(rel: &RelationshipType, config: &GraphConfig) -> f64 {
43 match rel {
44 RelationshipType::Calls => config.calls_edge_weight,
45 RelationshipType::Imports => config.imports_edge_weight,
46 RelationshipType::Contains => config.contains_edge_weight,
47 RelationshipType::TypeDefinition => config.type_definition_edge_weight,
48 RelationshipType::Reads => config.reads_edge_weight,
49 RelationshipType::Writes => config.writes_edge_weight,
50 RelationshipType::Overrides => config.overrides_edge_weight,
51 RelationshipType::Implements | RelationshipType::Inherits => 0.8,
52 RelationshipType::DependsOn => 0.7,
53 RelationshipType::CoChanged => 0.6,
54 RelationshipType::EvolvedInto | RelationshipType::Summarizes => 0.7,
55 RelationshipType::PartOf => 0.4,
56 RelationshipType::RelatesTo | RelationshipType::SharesTheme => 0.3,
57 _ => 0.5,
58 }
59}
60
/// Counts produced while persisting graph nodes, returned to the public persist entry points.
///
/// Derives `Debug` and `Default` to match the other result structs in this file.
#[derive(Debug, Default)]
struct GraphPersistCounts {
    /// Number of distinct package (directory) IDs derived from the indexed file paths.
    packages_created: usize,
    /// Number of chunk nodes built from the indexing results.
    chunks_stored: usize,
}
66
67impl super::CodememEngine {
68 pub fn persist_index_results(
73 &self,
74 results: &IndexAndResolveResult,
75 namespace: Option<&str>,
76 ) -> Result<IndexPersistResult, CodememError> {
77 self.persist_index_results_with_progress(results, namespace, |_, _| {})
78 }
79
80 pub fn persist_graph_only(
86 &self,
87 results: &IndexAndResolveResult,
88 namespace: Option<&str>,
89 ) -> Result<IndexPersistResult, CodememError> {
90 let seen_files = &results.file_paths;
91 let graph_counts = self.persist_graph_nodes(results, namespace)?;
92
93 let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
94 self.compact_graph(seen_files)
95 } else {
96 (0, 0)
97 };
98
99 Ok(IndexPersistResult {
100 files_created: seen_files.len(),
101 packages_created: graph_counts.packages_created,
102 symbols_stored: results.symbols.len(),
103 chunks_stored: graph_counts.chunks_stored,
104 edges_resolved: results.edges.len(),
105 symbols_embedded: 0,
106 chunks_embedded: 0,
107 chunks_pruned,
108 symbols_pruned,
109 })
110 }
111
112 pub fn persist_index_results_with_progress(
115 &self,
116 results: &IndexAndResolveResult,
117 namespace: Option<&str>,
118 on_progress: impl Fn(usize, usize),
119 ) -> Result<IndexPersistResult, CodememError> {
120 let seen_files = &results.file_paths;
121
122 let graph_counts = self.persist_graph_nodes(results, namespace)?;
124
125 let (symbols_embedded, chunks_embedded) = self.embed_and_persist(
127 &results.symbols,
128 &results.chunks,
129 &results.edges,
130 on_progress,
131 )?;
132
133 let (chunks_pruned, symbols_pruned) = if self.config.chunking.auto_compact {
135 self.compact_graph(seen_files)
136 } else {
137 (0, 0)
138 };
139
140 Ok(IndexPersistResult {
141 files_created: seen_files.len(),
142 packages_created: graph_counts.packages_created,
143 symbols_stored: results.symbols.len(),
144 chunks_stored: graph_counts.chunks_stored,
145 edges_resolved: results.edges.len(),
146 symbols_embedded,
147 chunks_embedded,
148 chunks_pruned,
149 symbols_pruned,
150 })
151 }
152
153 fn persist_graph_nodes(
158 &self,
159 results: &IndexAndResolveResult,
160 namespace: Option<&str>,
161 ) -> Result<GraphPersistCounts, CodememError> {
162 let all_symbols = &results.symbols;
163 let all_chunks = &results.chunks;
164 let seen_files = &results.file_paths;
165 let edges = &results.edges;
166
167 let now = chrono::Utc::now();
168 let ns_string = namespace.map(|s| s.to_string());
169 let contains_weight = edge_weight_for(&RelationshipType::Contains, &self.config.graph);
170
171 let mut graph = self.lock_graph()?;
172
173 let file_nodes: Vec<GraphNode> = seen_files
175 .iter()
176 .map(|file_path| {
177 let mut payload = HashMap::new();
178 payload.insert(
179 "file_path".to_string(),
180 serde_json::Value::String(file_path.clone()),
181 );
182 GraphNode {
183 id: format!("file:{file_path}"),
184 kind: NodeKind::File,
185 label: file_path.clone(),
186 payload,
187 centrality: 0.0,
188 memory_id: None,
189 namespace: ns_string.clone(),
190 }
191 })
192 .collect();
193 self.persist_nodes_to_storage_and_graph(&file_nodes, &mut graph);
194
195 let (dir_nodes, dir_edges, created_dirs) =
197 self.build_package_tree(seen_files, &ns_string, contains_weight, now, &graph);
198 self.persist_nodes_to_storage_and_graph(&dir_nodes, &mut graph);
199 self.persist_edges_to_storage_and_graph(&dir_edges, &mut graph);
200
201 let (sym_nodes, sym_edges) =
203 Self::build_symbol_nodes(all_symbols, &ns_string, contains_weight, now);
204
205 let mut old_syms_by_file: HashMap<String, HashSet<String>> = HashMap::new();
213 for node in graph.get_all_nodes() {
214 if !node.id.starts_with("sym:") {
215 continue;
216 }
217 if matches!(
222 node.payload.get("source").and_then(|v| v.as_str()),
223 Some("scip" | "scip-synthetic")
224 ) {
225 continue;
226 }
227 let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) else {
228 continue;
229 };
230 if !seen_files.contains(fp) {
231 continue;
232 }
233 old_syms_by_file
234 .entry(fp.to_string())
235 .or_default()
236 .insert(node.id);
237 }
238 drop(graph);
239 for file_path in seen_files {
240 let new_sym_ids: HashSet<String> = sym_nodes
241 .iter()
242 .filter(|n| {
243 n.payload.get("file_path").and_then(|v| v.as_str()) == Some(file_path.as_str())
244 })
245 .map(|n| n.id.clone())
246 .collect();
247 let empty = HashSet::new();
248 let old_sym_ids = old_syms_by_file.get(file_path).unwrap_or(&empty);
249 if let Err(e) = self.cleanup_stale_symbols(file_path, old_sym_ids, &new_sym_ids) {
250 tracing::warn!("Failed to cleanup stale symbols for {file_path}: {e}");
251 }
252 }
253 let mut graph = self.lock_graph()?; self.persist_nodes_to_storage_and_graph(&sym_nodes, &mut graph);
256 self.persist_edges_to_storage_and_graph(&sym_edges, &mut graph);
257
258 let ref_edges = Self::build_reference_edges(edges, &self.config.graph, now);
260 self.persist_edges_to_storage_and_graph(&ref_edges, &mut graph);
261
262 if let Some(ref scip_build) = results.scip_build {
264 let new_scip_ids: HashSet<&str> =
267 scip_build.nodes.iter().map(|n| n.id.as_str()).collect();
268 let mut stale_scip_ids = Vec::new();
269 for node in graph.get_all_nodes() {
270 if !node.id.starts_with("sym:") {
271 continue;
272 }
273 if !matches!(
274 node.payload.get("source").and_then(|v| v.as_str()),
275 Some("scip" | "scip-synthetic")
276 ) {
277 continue;
278 }
279 if !new_scip_ids.contains(node.id.as_str()) {
280 if let Some(fp) = node.payload.get("file_path").and_then(|v| v.as_str()) {
282 if seen_files.contains(fp) {
283 stale_scip_ids.push(node.id.clone());
284 }
285 }
286 }
287 }
288 for stale_id in &stale_scip_ids {
289 let _ = graph.remove_node(stale_id);
290 let _ = self.storage.delete_graph_nodes_by_prefix(stale_id);
291 if let Some(qname) = stale_id.strip_prefix("sym:") {
293 let doc_id = format!("scip-doc:{qname}");
294 let _ = self.storage.delete_memory(&doc_id);
295 }
296 }
297 if !stale_scip_ids.is_empty() {
298 tracing::info!(
299 "Cleaned up {} stale SCIP nodes from re-index",
300 stale_scip_ids.len()
301 );
302 }
303
304 self.persist_nodes_to_storage_and_graph(&scip_build.nodes, &mut graph);
305
306 let (fused_edges, superseded_ids) = Self::fuse_edges(&ref_edges, &scip_build.edges);
309
310 for edge_id in &superseded_ids {
312 let _ = graph.remove_edge(edge_id);
313 let _ = self.storage.delete_graph_edge(edge_id);
314 }
315
316 self.persist_edges_to_storage_and_graph(&fused_edges, &mut graph);
317
318 for (memory, related_node_id) in &scip_build.memories {
320 let _ = self.storage.insert_memory(memory);
321 let relates_edge = Edge {
322 id: format!("relates:{}->mem:{}", related_node_id, memory.id),
323 src: related_node_id.clone(),
324 dst: format!("mem:{}", memory.id),
325 relationship: RelationshipType::RelatesTo,
326 weight: 0.3,
327 properties: HashMap::new(),
328 created_at: now,
329 valid_from: Some(now),
330 valid_to: None,
331 };
332 let _ = graph.add_edge(relates_edge.clone());
333 let _ = self.storage.insert_graph_edges_batch(&[relates_edge]);
334 }
335 }
336
337 for file_path in seen_files {
339 let prefix = format!("chunk:{file_path}:");
340 let _ = self.storage.delete_graph_nodes_by_prefix(&prefix);
341 }
342 let (chunk_nodes, chunk_edges) =
343 Self::build_chunk_nodes(all_chunks, &ns_string, contains_weight, now);
344 let chunk_count = chunk_nodes.len();
345 self.persist_nodes_to_storage_and_graph(&chunk_nodes, &mut graph);
346 self.persist_edges_to_storage_and_graph(&chunk_edges, &mut graph);
347
348 drop(graph);
349
350 Ok(GraphPersistCounts {
351 packages_created: created_dirs,
352 chunks_stored: chunk_count,
353 })
354 }
355
356 fn persist_nodes_to_storage_and_graph(
358 &self,
359 nodes: &[GraphNode],
360 graph: &mut crate::GraphEngine,
361 ) {
362 if let Err(e) = self.storage.insert_graph_nodes_batch(nodes) {
363 tracing::warn!("Failed to batch-insert {} graph nodes: {e}", nodes.len());
364 }
365 for node in nodes {
366 let _ = graph.add_node(node.clone());
367 }
368 }
369
370 fn persist_edges_to_storage_and_graph(&self, edges: &[Edge], graph: &mut crate::GraphEngine) {
372 if let Err(e) = self.storage.insert_graph_edges_batch(edges) {
373 tracing::warn!("Failed to batch-insert {} graph edges: {e}", edges.len());
374 }
375 for edge in edges {
376 let _ = graph.add_edge(edge.clone());
377 }
378 }
379
380 fn build_package_tree(
383 &self,
384 seen_files: &HashSet<String>,
385 ns_string: &Option<String>,
386 contains_weight: f64,
387 now: chrono::DateTime<chrono::Utc>,
388 graph: &crate::GraphEngine,
389 ) -> (Vec<GraphNode>, Vec<Edge>, usize) {
390 let mut created_dirs: HashSet<String> = HashSet::new();
391 let mut dir_nodes = Vec::new();
392 let mut dir_edges = Vec::new();
393
394 for file_path in seen_files {
395 let p = std::path::Path::new(file_path);
396 let mut ancestors: Vec<String> = Vec::new();
397 let mut current = p.parent();
398 while let Some(dir) = current {
399 let dir_str = dir.to_string_lossy().to_string();
400 if dir_str.is_empty() || dir_str == "." {
401 break;
402 }
403 ancestors.push(dir_str);
404 current = dir.parent();
405 }
406 ancestors.reverse();
407 for (i, dir_str) in ancestors.iter().enumerate() {
408 let pkg_id = format!("pkg:{dir_str}/");
409 if created_dirs.insert(pkg_id.clone()) {
410 dir_nodes.push(GraphNode {
411 id: pkg_id.clone(),
412 kind: NodeKind::Package,
413 label: format!("{dir_str}/"),
414 payload: HashMap::new(),
415 centrality: 0.0,
416 memory_id: None,
417 namespace: ns_string.clone(),
418 });
419 }
420 if i == 0 {
421 continue;
422 }
423 let parent_pkg_id = format!("pkg:{}/", ancestors[i - 1]);
424 let edge_id = format!("contains:{parent_pkg_id}->{pkg_id}");
425 if graph
426 .get_edges(&parent_pkg_id)
427 .unwrap_or_default()
428 .iter()
429 .any(|e| e.id == edge_id)
430 {
431 continue;
432 }
433 dir_edges.push(Edge {
434 id: edge_id,
435 src: parent_pkg_id,
436 dst: pkg_id.clone(),
437 relationship: RelationshipType::Contains,
438 weight: contains_weight,
439 valid_from: Some(now),
440 valid_to: None,
441 properties: HashMap::new(),
442 created_at: now,
443 });
444 }
445 if let Some(last_dir) = ancestors.last() {
446 let parent_pkg_id = format!("pkg:{last_dir}/");
447 let file_node_id = format!("file:{file_path}");
448 let edge_id = format!("contains:{parent_pkg_id}->{file_node_id}");
449 dir_edges.push(Edge {
450 id: edge_id,
451 src: parent_pkg_id,
452 dst: file_node_id,
453 relationship: RelationshipType::Contains,
454 weight: contains_weight,
455 valid_from: Some(now),
456 valid_to: None,
457 properties: HashMap::new(),
458 created_at: now,
459 });
460 }
461 }
462
463 let count = created_dirs.len();
464 (dir_nodes, dir_edges, count)
465 }
466
467 fn build_symbol_nodes(
469 symbols: &[Symbol],
470 ns_string: &Option<String>,
471 contains_weight: f64,
472 now: chrono::DateTime<chrono::Utc>,
473 ) -> (Vec<GraphNode>, Vec<Edge>) {
474 let mut sym_nodes = Vec::with_capacity(symbols.len());
475 let mut sym_edges = Vec::with_capacity(symbols.len());
476
477 for sym in symbols {
478 let kind = NodeKind::from(sym.kind);
479 let payload = Self::build_symbol_payload(sym);
480
481 let sym_node_id = format!("sym:{}", sym.qualified_name);
482 sym_nodes.push(GraphNode {
483 id: sym_node_id.clone(),
484 kind,
485 label: sym.qualified_name.clone(),
486 payload,
487 centrality: 0.0,
488 memory_id: None,
489 namespace: ns_string.clone(),
490 });
491
492 let file_node_id = format!("file:{}", sym.file_path);
493 sym_edges.push(Edge {
494 id: format!("contains:{file_node_id}->{sym_node_id}"),
495 src: file_node_id,
496 dst: sym_node_id,
497 relationship: RelationshipType::Contains,
498 weight: contains_weight,
499 valid_from: Some(now),
500 valid_to: None,
501 properties: HashMap::new(),
502 created_at: now,
503 });
504 }
505
506 (sym_nodes, sym_edges)
507 }
508
509 fn build_symbol_payload(sym: &Symbol) -> HashMap<String, serde_json::Value> {
511 let mut payload = HashMap::new();
512 payload.insert(
513 "symbol_kind".to_string(),
514 serde_json::Value::String(sym.kind.to_string()),
515 );
516 payload.insert(
517 "signature".to_string(),
518 serde_json::Value::String(sym.signature.clone()),
519 );
520 payload.insert(
521 "file_path".to_string(),
522 serde_json::Value::String(sym.file_path.clone()),
523 );
524 payload.insert("line_start".to_string(), serde_json::json!(sym.line_start));
525 payload.insert("line_end".to_string(), serde_json::json!(sym.line_end));
526 payload.insert(
527 "visibility".to_string(),
528 serde_json::Value::String(sym.visibility.to_string()),
529 );
530 if let Some(ref doc) = sym.doc_comment {
531 payload.insert(
532 "doc_comment".to_string(),
533 serde_json::Value::String(doc.clone()),
534 );
535 }
536 if !sym.parameters.is_empty() {
537 payload.insert(
538 "parameters".to_string(),
539 serde_json::to_value(&sym.parameters).unwrap_or_default(),
540 );
541 }
542 if let Some(ref ret) = sym.return_type {
543 payload.insert(
544 "return_type".to_string(),
545 serde_json::Value::String(ret.clone()),
546 );
547 }
548 if sym.is_async {
549 payload.insert("is_async".to_string(), serde_json::json!(true));
550 }
551 if !sym.attributes.is_empty() {
552 payload.insert(
553 "attributes".to_string(),
554 serde_json::to_value(&sym.attributes).unwrap_or_default(),
555 );
556 }
557 if !sym.throws.is_empty() {
558 payload.insert(
559 "throws".to_string(),
560 serde_json::to_value(&sym.throws).unwrap_or_default(),
561 );
562 }
563 if let Some(ref gp) = sym.generic_params {
564 payload.insert(
565 "generic_params".to_string(),
566 serde_json::Value::String(gp.clone()),
567 );
568 }
569 if sym.is_abstract {
570 payload.insert("is_abstract".to_string(), serde_json::json!(true));
571 }
572 if let Some(ref parent) = sym.parent {
573 payload.insert(
574 "parent".to_string(),
575 serde_json::Value::String(parent.clone()),
576 );
577 }
578 payload
579 }
580
/// Confidence recorded on edges derived from ast-grep resolution.
///
/// Stored as the `confidence` property of every reference edge, and added to a SCIP edge's
/// confidence when the two sources agree on the same relationship.
const AST_GREP_BASE_CONFIDENCE: f64 = 0.1;
584
585 fn build_reference_edges(
586 edges: &[ResolvedEdge],
587 graph_config: &GraphConfig,
588 now: chrono::DateTime<chrono::Utc>,
589 ) -> Vec<Edge> {
590 edges
591 .iter()
592 .map(|edge| {
593 let mut properties = HashMap::new();
594 properties.insert("source".to_string(), serde_json::json!("ast-grep"));
595 properties.insert(
596 "confidence".to_string(),
597 serde_json::json!(Self::AST_GREP_BASE_CONFIDENCE),
598 );
599 properties.insert("source_layers".to_string(), serde_json::json!(["ast-grep"]));
600 Edge {
601 id: format!(
602 "ref:{}->{}:{}",
603 edge.source_qualified_name, edge.target_qualified_name, edge.relationship
604 ),
605 src: format!("sym:{}", edge.source_qualified_name),
606 dst: format!("sym:{}", edge.target_qualified_name),
607 relationship: edge.relationship,
608 weight: edge_weight_for(&edge.relationship, graph_config),
609 valid_from: Some(now),
610 valid_to: None,
611 properties,
612 created_at: now,
613 }
614 })
615 .collect()
616 }
617
618 fn fuse_edges(ast_grep_edges: &[Edge], scip_edges: &[Edge]) -> (Vec<Edge>, Vec<String>) {
625 let ast_grep_index: HashMap<(String, String, String), &str> = ast_grep_edges
627 .iter()
628 .map(|e| {
629 (
630 (e.src.clone(), e.dst.clone(), e.relationship.to_string()),
631 e.id.as_str(),
632 )
633 })
634 .collect();
635
636 let mut superseded_ids = Vec::new();
637
638 let fused = scip_edges
639 .iter()
640 .map(|scip_edge| {
641 let key = (
642 scip_edge.src.clone(),
643 scip_edge.dst.clone(),
644 scip_edge.relationship.to_string(),
645 );
646 if let Some(&ast_edge_id) = ast_grep_index.get(&key) {
647 superseded_ids.push(ast_edge_id.to_string());
649 let mut fused = scip_edge.clone();
650 let scip_conf = scip_edge
651 .properties
652 .get("confidence")
653 .and_then(|v| v.as_f64())
654 .unwrap_or(0.15);
655 let fused_conf = scip_conf + Self::AST_GREP_BASE_CONFIDENCE;
656 fused
657 .properties
658 .insert("confidence".to_string(), serde_json::json!(fused_conf));
659 fused.properties.insert(
660 "source_layers".to_string(),
661 serde_json::json!(["ast-grep", "scip"]),
662 );
663 fused
664 } else {
665 scip_edge.clone()
666 }
667 })
668 .collect();
669
670 (fused, superseded_ids)
671 }
672
673 fn build_chunk_nodes(
675 chunks: &[CodeChunk],
676 ns_string: &Option<String>,
677 contains_weight: f64,
678 now: chrono::DateTime<chrono::Utc>,
679 ) -> (Vec<GraphNode>, Vec<Edge>) {
680 let mut chunk_nodes = Vec::with_capacity(chunks.len());
681 let mut chunk_edges = Vec::with_capacity(chunks.len() * 2);
682
683 for chunk in chunks {
684 let chunk_id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
685
686 let mut payload = HashMap::new();
687 payload.insert(
688 "file_path".to_string(),
689 serde_json::Value::String(chunk.file_path.clone()),
690 );
691 payload.insert(
692 "line_start".to_string(),
693 serde_json::json!(chunk.line_start),
694 );
695 payload.insert("line_end".to_string(), serde_json::json!(chunk.line_end));
696 payload.insert(
697 "node_kind".to_string(),
698 serde_json::Value::String(chunk.node_kind.clone()),
699 );
700 payload.insert(
701 "non_ws_chars".to_string(),
702 serde_json::json!(chunk.non_ws_chars),
703 );
704 if let Some(ref parent) = chunk.parent_symbol {
705 payload.insert(
706 "parent_symbol".to_string(),
707 serde_json::Value::String(parent.clone()),
708 );
709 }
710
711 chunk_nodes.push(GraphNode {
712 id: chunk_id.clone(),
713 kind: NodeKind::Chunk,
714 label: format!(
715 "chunk:{}:{}..{}",
716 chunk.file_path, chunk.line_start, chunk.line_end
717 ),
718 payload,
719 centrality: 0.0,
720 memory_id: None,
721 namespace: ns_string.clone(),
722 });
723
724 let file_node_id = format!("file:{}", chunk.file_path);
725 chunk_edges.push(Edge {
726 id: format!("contains:{file_node_id}->{chunk_id}"),
727 src: file_node_id,
728 dst: chunk_id.clone(),
729 relationship: RelationshipType::Contains,
730 weight: contains_weight,
731 valid_from: Some(now),
732 valid_to: None,
733 properties: HashMap::new(),
734 created_at: now,
735 });
736
737 if let Some(ref parent_sym) = chunk.parent_symbol {
738 let parent_node_id = format!("sym:{parent_sym}");
739 chunk_edges.push(Edge {
740 id: format!("contains:{parent_node_id}->{chunk_id}"),
741 src: parent_node_id,
742 dst: chunk_id,
743 relationship: RelationshipType::Contains,
744 weight: contains_weight,
745 valid_from: Some(now),
746 valid_to: None,
747 properties: HashMap::new(),
748 created_at: now,
749 });
750 }
751 }
752
753 (chunk_nodes, chunk_edges)
754 }
755
756 fn embed_and_persist(
763 &self,
764 symbols: &[Symbol],
765 chunks: &[CodeChunk],
766 edges: &[ResolvedEdge],
767 on_progress: impl Fn(usize, usize),
768 ) -> Result<(usize, usize), CodememError> {
769 let mut symbols_embedded = 0usize;
770 let mut chunks_embedded = 0usize;
771
772 if !self.embeddings_ready() {
775 return Ok((0, 0));
776 }
777
778 let sym_texts: Vec<(String, String)> = symbols
780 .iter()
781 .map(|sym| {
782 let id = format!("sym:{}", sym.qualified_name);
783 let text = self.enrich_symbol_text(sym, edges);
784 (id, text)
785 })
786 .collect();
787 let chunk_texts: Vec<(String, String)> = chunks
788 .iter()
789 .map(|chunk| {
790 let id = format!("chunk:{}:{}", chunk.file_path, chunk.index);
791 let text = self.enrich_chunk_text(chunk);
792 (id, text)
793 })
794 .collect();
795
796 let embed_batch_size = self.config.embedding.batch_size;
798
799 let all_pairs: Vec<(String, String)> = sym_texts.into_iter().chain(chunk_texts).collect();
800 let total = all_pairs.len();
801 let sym_count = symbols.len();
802 let mut done = 0usize;
803
804 for batch in all_pairs.chunks(embed_batch_size) {
805 let texts: Vec<&str> = batch.iter().map(|(_, t)| t.as_str()).collect();
806
807 let t0 = std::time::Instant::now();
808 let embed_result = {
809 let emb = self.lock_embeddings()?;
810 match emb {
811 Some(emb_guard) => emb_guard.embed_batch(&texts),
812 None => break,
813 }
814 };
815
816 match embed_result {
817 Ok(embeddings) => {
818 let embed_ms = t0.elapsed().as_millis();
819
820 let t1 = std::time::Instant::now();
821 let pairs: Vec<(&str, &[f32])> = batch
822 .iter()
823 .zip(embeddings.iter())
824 .map(|((id, _), emb_vec)| (id.as_str(), emb_vec.as_slice()))
825 .collect();
826 if let Err(e) = self.storage.store_embeddings_batch(&pairs) {
827 tracing::warn!("Failed to batch-store embeddings: {e}");
828 }
829 let sqlite_ms = t1.elapsed().as_millis();
830
831 let t2 = std::time::Instant::now();
832 let batch_items: Vec<(String, Vec<f32>)> = batch
833 .iter()
834 .zip(embeddings.into_iter())
835 .map(|((id, _), emb_vec)| (id.clone(), emb_vec))
836 .collect();
837 let batch_len = batch_items.len();
838 {
839 let mut vec = self.lock_vector()?;
840 if let Err(e) = vec.insert_batch(&batch_items) {
841 tracing::warn!("Failed to batch-insert into vector index: {e}");
842 }
843 }
844 let vector_ms = t2.elapsed().as_millis();
845
846 let syms_in_batch = batch_len.min(sym_count.saturating_sub(done));
847 symbols_embedded += syms_in_batch;
848 chunks_embedded += batch_len - syms_in_batch;
849 done += batch_len;
850
851 tracing::debug!(
852 "Embed batch {}: embed={embed_ms}ms sqlite={sqlite_ms}ms vector={vector_ms}ms",
853 batch_len
854 );
855 }
856 Err(e) => {
857 tracing::warn!("embed_batch failed for chunk of {} texts: {e}", batch.len());
858 }
859 }
860 on_progress(done, total);
861 }
862 self.save_index();
863
864 Ok((symbols_embedded, chunks_embedded))
865 }
866}