1pub mod adaptive;
2pub mod bm25;
4pub mod causal_analysis;
6pub mod enriched;
8#[cfg(feature = "pagerank")]
10pub mod hipporag_ppr;
11pub mod hybrid;
13pub mod pagerank_retrieval;
14pub mod symbolic_anchoring;
16
17#[cfg(feature = "parallel-processing")]
18use crate::parallel::ParallelProcessor;
19use crate::{
20 config::Config,
21 core::{ChunkId, EntityId, KnowledgeGraph},
22 summarization::DocumentTree,
23 vector::{EmbeddingGenerator, VectorUtils},
24 Result,
25};
26use std::collections::{HashMap, HashSet};
27
28pub use bm25::{BM25Result, BM25Retriever, Document as BM25Document};
29pub use enriched::{EnrichedRetrievalConfig, EnrichedRetriever};
30pub use hybrid::{FusionMethod, HybridConfig, HybridRetriever, HybridSearchResult};
31
32#[cfg(feature = "pagerank")]
33pub use pagerank_retrieval::{PageRankRetrievalSystem, ScoredResult};
34
35#[cfg(feature = "pagerank")]
36pub use hipporag_ppr::{Fact, HippoRAGConfig, HippoRAGRetriever};
37
38use crate::vector::store::VectorStore;
39
/// Coordinates the retrieval pipeline: vector similarity, graph-based
/// expansion, and optional feature-gated sub-retrievers (PageRank,
/// enriched metadata, LazyGraphRAG concept filtering).
pub struct RetrievalSystem {
    /// Backing vector index for entity and chunk embeddings.
    vector_store: std::sync::Arc<dyn VectorStore>,
    /// Produces embeddings for queries and content.
    embedding_generator: EmbeddingGenerator,
    /// Weights, thresholds and limits used during retrieval.
    config: RetrievalConfig,
    /// Optional batch-processing helper; `None` until configured.
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    /// Optional PageRank retriever; set by `initialize_pagerank`.
    #[cfg(feature = "pagerank")]
    pagerank_retriever: Option<PageRankRetrievalSystem>,
    /// Optional metadata-aware retriever; set by `initialize_enriched`.
    enriched_retriever: Option<EnrichedRetriever>,
    /// Whether concept-based filtering is active.
    #[cfg(feature = "lazygraphrag")]
    concept_filtering_enabled: bool,
}
53
54impl RetrievalSystem {
55 pub fn new(config: &Config) -> Result<Self> {
57 let retrieval_config = RetrievalConfig {
58 top_k: config.retrieval.top_k,
59 similarity_threshold: 0.35,
60 max_expansion_depth: 2,
61 entity_weight: 0.4,
62 chunk_weight: 0.4,
63 graph_weight: 0.2,
64 #[cfg(feature = "lazygraphrag")]
65 use_concept_filtering: false,
66 #[cfg(feature = "lazygraphrag")]
67 concept_top_k: 20,
68 };
69
70 let vector_store =
73 std::sync::Arc::new(crate::vector::memory_store::MemoryVectorStore::new());
74
75 Ok(Self {
76 vector_store,
77 embedding_generator: EmbeddingGenerator::new(128), config: retrieval_config,
79 #[cfg(feature = "parallel-processing")]
80 parallel_processor: None,
81 #[cfg(feature = "pagerank")]
82 pagerank_retriever: None,
83 enriched_retriever: None,
84 #[cfg(feature = "lazygraphrag")]
85 concept_filtering_enabled: false,
86 })
87 }
88}
89
/// Tunable parameters controlling retrieval behavior.
#[derive(Debug, Clone)]
pub struct RetrievalConfig {
    /// Maximum number of results returned by a query.
    pub top_k: usize,
    /// Minimum similarity score for vector-store hits to be kept.
    pub similarity_threshold: f32,
    /// Maximum relationship-expansion depth for entity search.
    pub max_expansion_depth: usize,
    /// Score weight applied to entity results.
    pub entity_weight: f32,
    /// Score weight applied to chunk results.
    pub chunk_weight: f32,
    /// Score weight applied to graph-derived results.
    pub graph_weight: f32,
    /// Whether LazyGraphRAG concept filtering is enabled.
    #[cfg(feature = "lazygraphrag")]
    pub use_concept_filtering: bool,
    /// How many concepts to keep when concept filtering is on.
    #[cfg(feature = "lazygraphrag")]
    pub concept_top_k: usize,
}
112
impl Default for RetrievalConfig {
    /// Balanced defaults: 10 results, 0.7 similarity floor, equal
    /// entity/chunk weights.
    ///
    /// NOTE(review): `RetrievalSystem::new` overrides the similarity
    /// threshold to 0.35 — confirm which value is the intended canonical
    /// default.
    fn default() -> Self {
        Self {
            top_k: 10,
            similarity_threshold: 0.7,
            max_expansion_depth: 2,
            entity_weight: 0.4,
            chunk_weight: 0.4,
            graph_weight: 0.2,
            #[cfg(feature = "lazygraphrag")]
            use_concept_filtering: false,
            #[cfg(feature = "lazygraphrag")]
            concept_top_k: 20,
        }
    }
}
129
/// A single retrieval hit from any strategy, normalized for ranking.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Identifier of the underlying entity/chunk/summary (or a synthetic
    /// id for fused results).
    pub id: String,
    /// Text payload (entity name, chunk content, summary text, ...).
    pub content: String,
    /// Relevance score; strategies multiply in their own weights.
    pub score: f32,
    /// Which retrieval strategy/shape produced this result.
    pub result_type: ResultType,
    /// Names of entities associated with this result.
    pub entities: Vec<String>,
    /// Ids of chunks (or documents) this result was derived from.
    pub source_chunks: Vec<String>,
}
146
/// Origin/shape of a [`SearchResult`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ResultType {
    /// A knowledge-graph entity.
    Entity,
    /// A text chunk.
    Chunk,
    /// A path between entities in the graph.
    GraphPath,
    /// A hierarchical document summary.
    HierarchicalSummary,
    /// A result fused from multiple strategies.
    Hybrid,
}
161
/// An answer enriched with provenance: confidence, cited sources, and the
/// reasoning steps used to assemble it.
#[derive(Debug, Clone)]
pub struct ExplainedAnswer {
    /// The answer text itself.
    pub answer: String,
    /// Overall confidence in [0.0, 1.0], derived from result scores.
    pub confidence: f32,
    /// Up to five cited sources with excerpts.
    pub sources: Vec<SourceReference>,
    /// Synthetic trace of how the answer was assembled.
    pub reasoning_steps: Vec<ReasoningStep>,
    /// Entity names surfaced by the underlying results.
    pub key_entities: Vec<String>,
    /// Optional query classification; `None` when not analyzed.
    pub query_analysis: Option<QueryAnalysis>,
}
205
/// A citation attached to an [`ExplainedAnswer`].
#[derive(Debug, Clone)]
pub struct SourceReference {
    /// Id of the cited entity/chunk/summary.
    pub id: String,
    /// What kind of source this reference points at.
    pub source_type: SourceType,
    /// Short excerpt of the source content.
    pub excerpt: String,
    /// Relevance score carried over from the search result.
    pub relevance_score: f32,
}
218
/// Kind of source a [`SourceReference`] cites.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceType {
    /// A raw text chunk.
    TextChunk,
    /// A knowledge-graph entity.
    Entity,
    /// A relationship/path between entities.
    Relationship,
    /// A hierarchical summary.
    Summary,
}
231
/// One step in the synthetic reasoning trace of an [`ExplainedAnswer`].
#[derive(Debug, Clone)]
pub struct ReasoningStep {
    /// 1-based position of this step in the trace.
    pub step_number: u8,
    /// Human-readable description of what the step did.
    pub description: String,
    /// Entity names this step relied on (may be empty).
    pub entities_used: Vec<String>,
    /// Optional supporting excerpt.
    pub evidence_snippet: Option<String>,
    /// Confidence attributed to this step, in [0.0, 1.0].
    pub confidence: f32,
}
246
247impl ExplainedAnswer {
248 pub fn from_results(answer: String, search_results: &[SearchResult], query: &str) -> Self {
250 let confidence = if search_results.is_empty() {
252 0.0
253 } else {
254 let total_score: f32 = search_results.iter().map(|r| r.score).sum();
255 let avg_score = total_score / search_results.len() as f32;
256 (avg_score * 0.7 + 0.3).min(1.0).max(0.0)
258 };
259
260 let sources: Vec<SourceReference> = search_results
262 .iter()
263 .take(5) .map(|r| SourceReference {
265 id: r.id.clone(),
266 source_type: match r.result_type {
267 ResultType::Entity => SourceType::Entity,
268 ResultType::Chunk => SourceType::TextChunk,
269 ResultType::GraphPath => SourceType::Relationship,
270 ResultType::HierarchicalSummary => SourceType::Summary,
271 ResultType::Hybrid => SourceType::TextChunk,
272 },
273 excerpt: if r.content.len() > 200 {
274 format!("{}...", &r.content[..200])
275 } else {
276 r.content.clone()
277 },
278 relevance_score: r.score,
279 })
280 .collect();
281
282 let mut reasoning_steps = Vec::new();
284 let mut step_num = 1u8;
285
286 reasoning_steps.push(ReasoningStep {
288 step_number: step_num,
289 description: format!("Analyzed query: \"{}\"", query),
290 entities_used: vec![],
291 evidence_snippet: None,
292 confidence: 0.95,
293 });
294 step_num += 1;
295
296 let unique_entities: HashSet<_> = search_results
298 .iter()
299 .flat_map(|r| r.entities.iter().cloned())
300 .collect();
301 if !unique_entities.is_empty() {
302 reasoning_steps.push(ReasoningStep {
303 step_number: step_num,
304 description: format!("Found {} relevant entities", unique_entities.len()),
305 entities_used: unique_entities.iter().take(5).cloned().collect(),
306 evidence_snippet: None,
307 confidence: 0.85,
308 });
309 step_num += 1;
310 }
311
312 let chunk_count = search_results
314 .iter()
315 .filter(|r| r.result_type == ResultType::Chunk || r.result_type == ResultType::Hybrid)
316 .count();
317 if chunk_count > 0 {
318 reasoning_steps.push(ReasoningStep {
319 step_number: step_num,
320 description: format!("Retrieved {} relevant text chunks", chunk_count),
321 entities_used: vec![],
322 evidence_snippet: search_results.first().map(|r| {
323 if r.content.len() > 100 {
324 format!("{}...", &r.content[..100])
325 } else {
326 r.content.clone()
327 }
328 }),
329 confidence,
330 });
331 step_num += 1;
332 }
333
334 reasoning_steps.push(ReasoningStep {
336 step_number: step_num,
337 description: "Synthesized answer from retrieved information".to_string(),
338 entities_used: unique_entities.into_iter().take(3).collect(),
339 evidence_snippet: None,
340 confidence,
341 });
342
343 let key_entities: Vec<String> = search_results
345 .iter()
346 .flat_map(|r| r.entities.iter().cloned())
347 .take(10)
348 .collect();
349
350 Self {
351 answer,
352 confidence,
353 sources,
354 reasoning_steps,
355 key_entities,
356 query_analysis: None,
357 }
358 }
359
360 pub fn format_display(&self) -> String {
362 let mut output = String::new();
363
364 output.push_str(&format!("**Answer:** {}\n\n", self.answer));
366
367 output.push_str(&format!(
369 "**Confidence:** {:.0}%\n\n",
370 self.confidence * 100.0
371 ));
372
373 if !self.reasoning_steps.is_empty() {
375 output.push_str("**Reasoning:**\n");
376 for step in &self.reasoning_steps {
377 output.push_str(&format!(
378 "{}. {} (confidence: {:.0}%)\n",
379 step.step_number,
380 step.description,
381 step.confidence * 100.0
382 ));
383 if let Some(evidence) = &step.evidence_snippet {
384 output.push_str(&format!(" Evidence: \"{}\"\n", evidence));
385 }
386 }
387 output.push('\n');
388 }
389
390 if !self.sources.is_empty() {
392 output.push_str("**Sources:**\n");
393 for (i, source) in self.sources.iter().enumerate() {
394 output.push_str(&format!(
395 "{}. [{:?}] {} (relevance: {:.0}%)\n",
396 i + 1,
397 source.source_type,
398 source.id,
399 source.relevance_score * 100.0
400 ));
401 }
402 }
403
404 output
405 }
406}
407
/// Classification of a query, used to pick retrieval strategies.
#[derive(Debug, Clone)]
pub struct QueryAnalysis {
    /// Broad category of the query.
    pub query_type: QueryType,
    /// Graph entity names mentioned in the query.
    pub key_entities: Vec<String>,
    /// Non-entity, non-stop-word concept terms from the query.
    pub concepts: Vec<String>,
    /// Inferred user intent.
    pub intent: QueryIntent,
    /// Rough complexity estimate in [0.0, 1.0].
    pub complexity_score: f32,
}
426
/// Broad category of a query, inferred by `analyze_query`.
#[derive(Debug, Clone, PartialEq)]
pub enum QueryType {
    /// Centered on a single known entity.
    EntityFocused,
    /// About abstract concepts rather than concrete entities.
    Conceptual,
    /// A concrete fact-lookup question.
    Factual,
    /// Open-ended exploration.
    Exploratory,
    /// About how multiple entities relate to each other.
    Relationship,
}
441
/// What kind of answer the user is after, inferred from keywords.
#[derive(Debug, Clone, PartialEq)]
pub enum QueryIntent {
    /// High-level summary ("overview", "about", ...).
    Overview,
    /// Precise, specific detail ("exactly", "specific", ...).
    Detailed,
    /// Comparison between things ("vs", "difference", ...).
    Comparative,
    /// Cause/effect ("why", "because", ...).
    Causal,
    /// Time-oriented ("when", "before", "after", ...).
    Temporal,
}
456
/// Diagnostic output of a query classification pass.
#[derive(Debug, Clone)]
pub struct QueryAnalysisResult {
    /// Category the query was classified as.
    pub query_type: QueryType,
    /// Confidence in the classification, in [0.0, 1.0].
    pub confidence: f32,
    /// Keywords that triggered the classification.
    pub keywords_matched: Vec<String>,
    /// Human-readable names of suggested retrieval strategies.
    pub suggested_strategies: Vec<String>,
    /// Rough complexity estimate in [0.0, 1.0].
    pub complexity_score: f32,
}
471
/// A completed query together with its results and metadata.
#[derive(Debug, Clone)]
pub struct QueryResult {
    /// The original query string.
    pub query: String,
    /// Ranked search results.
    pub results: Vec<SearchResult>,
    /// Optional synthesized summary of the results.
    pub summary: Option<String>,
    /// Free-form key/value metadata about the run.
    pub metadata: HashMap<String, String>,
}
484
485impl RetrievalSystem {
486 #[cfg(feature = "parallel-processing")]
488 pub fn with_parallel_processing(
489 vector_store: std::sync::Arc<dyn VectorStore>,
490 embedding_generator: EmbeddingGenerator,
491 parallel_processor: ParallelProcessor,
492 ) -> Result<Self> {
493 let retrieval_config = RetrievalConfig::default();
498
499 Ok(Self {
500 vector_store,
501 embedding_generator,
502 config: retrieval_config,
503 parallel_processor: Some(parallel_processor),
504 #[cfg(feature = "pagerank")]
505 pagerank_retriever: None,
506 enriched_retriever: None,
507 #[cfg(feature = "lazygraphrag")]
508 concept_filtering_enabled: false,
509 })
510 }
511
512 pub async fn index_graph(&self, graph: &KnowledgeGraph) -> Result<()> {
514 for entity in graph.entities() {
516 if let Some(embedding) = &entity.embedding {
517 let id = format!("entity:{}", entity.id);
518 self.vector_store
520 .add_vector(&id, embedding.clone(), HashMap::new())
521 .await?;
522 }
523 }
524
525 for chunk in graph.chunks() {
527 if let Some(embedding) = &chunk.embedding {
528 let id = format!("chunk:{}", chunk.id);
529 self.vector_store
530 .add_vector(&id, embedding.clone(), HashMap::new())
531 .await?;
532 }
533 }
534
535 self.vector_store.initialize().await?;
537
538 Ok(())
539 }
540
541 #[cfg(feature = "pagerank")]
543 pub fn initialize_pagerank(&mut self, graph: &KnowledgeGraph) -> Result<()> {
544 use crate::graph::pagerank::{PageRankConfig, ScoreWeights};
545
546 tracing::debug!("Initializing high-performance PageRank retrieval system...");
547
548 let pagerank_config = PageRankConfig {
549 damping_factor: 0.85,
550 max_iterations: 50, tolerance: 1e-5, personalized: true,
553 #[cfg(feature = "parallel-processing")]
554 parallel_enabled: self.parallel_processor.is_some(),
555 #[cfg(not(feature = "parallel-processing"))]
556 parallel_enabled: false,
557 cache_size: 2000, sparse_threshold: 500,
559 incremental_updates: true,
560 simd_block_size: 64, };
562
563 let score_weights = ScoreWeights {
564 vector_weight: 0.3,
565 pagerank_weight: 0.5, chunk_weight: 0.15,
567 relationship_weight: 0.05,
568 };
569
570 let mut pagerank_retriever = PageRankRetrievalSystem::new(self.config.top_k)
571 .with_pagerank_config(pagerank_config)
572 .with_score_weights(score_weights)
573 .with_incremental_mode(true)
574 .with_min_threshold(0.05);
575
576 pagerank_retriever.precompute_global_pagerank(graph)?;
581
582 self.pagerank_retriever = Some(pagerank_retriever);
583
584 tracing::debug!("PageRank retrieval system initialized with 27x performance optimizations");
585 Ok(())
586 }
587
588 pub fn initialize_enriched(&mut self, config: Option<EnrichedRetrievalConfig>) -> Result<()> {
590 tracing::debug!("Initializing enriched metadata-aware retrieval system...");
591
592 let enriched_config = config.unwrap_or_default();
593 let enriched_retriever = EnrichedRetriever::with_config(enriched_config);
594
595 self.enriched_retriever = Some(enriched_retriever);
596
597 tracing::debug!("Enriched retrieval system initialized with metadata boosting");
598 Ok(())
599 }
600
601 #[cfg(feature = "pagerank")]
603 pub fn pagerank_query(
604 &self,
605 query: &str,
606 graph: &KnowledgeGraph,
607 max_results: Option<usize>,
608 ) -> Result<Vec<ScoredResult>> {
609 if let Some(pagerank_retriever) = &self.pagerank_retriever {
610 pagerank_retriever.search_with_pagerank(query, graph, max_results)
611 } else {
612 Err(crate::core::GraphRAGError::Retrieval {
613 message: "PageRank retriever not initialized. Call initialize_pagerank() first."
614 .to_string(),
615 })
616 }
617 }
618
619 #[cfg(feature = "pagerank")]
621 pub fn pagerank_batch_query(
622 &self,
623 queries: &[&str],
624 graph: &KnowledgeGraph,
625 max_results_per_query: Option<usize>,
626 ) -> Result<Vec<Vec<ScoredResult>>> {
627 if let Some(pagerank_retriever) = &self.pagerank_retriever {
628 pagerank_retriever.batch_search(queries, graph, max_results_per_query)
629 } else {
630 Err(crate::core::GraphRAGError::Retrieval {
631 message: "PageRank retriever not initialized. Call initialize_pagerank() first."
632 .to_string(),
633 })
634 }
635 }
636
    /// Placeholder query entry point: performs no retrieval and simply
    /// echoes the query back as a single formatted string.
    ///
    /// NOTE(review): callers wanting real results should use
    /// `hybrid_query` / `hybrid_query_with_trees` instead.
    pub fn query(&self, query: &str) -> Result<Vec<String>> {
        Ok(vec![format!("Results for query: {}", query)])
    }
648
649 pub async fn hybrid_query(
651 &mut self,
652 query: &str,
653 graph: &KnowledgeGraph,
654 ) -> Result<Vec<SearchResult>> {
655 self.hybrid_query_with_trees(query, graph, &HashMap::new())
656 .await
657 }
658
    /// Full adaptive retrieval pipeline.
    ///
    /// Analyzes the query, embeds it, runs adaptive multi-strategy
    /// retrieval (consulting hierarchical summaries from `document_trees`
    /// when available), then applies enriched metadata boosting and
    /// structural filtering if that retriever is initialized.
    pub async fn hybrid_query_with_trees(
        &mut self,
        query: &str,
        graph: &KnowledgeGraph,
        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
    ) -> Result<Vec<SearchResult>> {
        let analysis = self.analyze_query(query, graph)?;

        let query_embedding = self.embedding_generator.generate_embedding(query);

        let mut results = self
            .execute_adaptive_retrieval(query, &query_embedding, graph, document_trees, &analysis)
            .await?;

        // Optional post-processing: metadata-aware boosting then
        // structure-based filtering.
        if let Some(enriched_retriever) = &self.enriched_retriever {
            results = enriched_retriever.boost_with_metadata(results, query, graph)?;

            results = enriched_retriever.filter_by_structure(query, results, graph)?;
        }

        Ok(results)
    }
688
689 pub async fn legacy_hybrid_query(
691 &mut self,
692 query: &str,
693 graph: &KnowledgeGraph,
694 ) -> Result<Vec<SearchResult>> {
695 let query_embedding = self.embedding_generator.generate_embedding(query);
697
698 let results = self.comprehensive_search(&query_embedding, graph).await?;
700
701 Ok(results)
702 }
703
    /// Fills in missing embeddings for all chunks and entities in `graph`,
    /// then indexes the graph into the vector store.
    ///
    /// Prefers the feature-gated "parallel" path when a processor is
    /// configured; otherwise falls back to the sequential implementation.
    pub async fn add_embeddings_to_graph(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
        #[cfg(feature = "parallel-processing")]
        if let Some(processor) = self.parallel_processor.clone() {
            return self.add_embeddings_parallel(graph, &processor).await;
        }

        self.add_embeddings_sequential(graph).await
    }
713
    #[cfg(feature = "parallel-processing")]
    /// "Parallel" embedding path: gathers the texts needing embeddings,
    /// generates them, writes them back, and re-indexes the graph.
    ///
    /// NOTE(review): despite the name, generation here is sequential — the
    /// processor is only consulted for the batching decision/log line.
    async fn add_embeddings_parallel(
        &mut self,
        graph: &mut KnowledgeGraph,
        processor: &ParallelProcessor,
    ) -> Result<()> {
        // Collect owned (id, text) pairs up front so the graph can be
        // mutated afterwards without overlapping borrows.
        let mut chunk_texts = Vec::new();
        let mut entity_texts = Vec::new();

        for chunk in graph.chunks() {
            if chunk.embedding.is_none() {
                chunk_texts.push((chunk.id.clone(), chunk.content.clone()));
            }
        }

        for entity in graph.entities() {
            if entity.embedding.is_none() {
                // Entities are embedded from "<name> <type>" text.
                let entity_text = format!("{} {}", entity.name, entity.entity_type);
                entity_texts.push((entity.id.clone(), entity_text));
            }
        }

        let total_items = chunk_texts.len() + entity_texts.len();
        if processor.should_use_parallel(total_items) {
            tracing::debug!(
                "Processing {total_items} embeddings with enhanced sequential approach"
            );
        }

        for (chunk_id, text) in chunk_texts {
            let embedding = self.embedding_generator.generate_embedding(&text);
            if let Some(chunk) = graph.get_chunk_mut(&chunk_id) {
                chunk.embedding = Some(embedding);
            }
        }

        for (entity_id, text) in entity_texts {
            let embedding = self.embedding_generator.generate_embedding(&text);
            if let Some(entity) = graph.get_entity_mut(&entity_id) {
                entity.embedding = Some(embedding);
            }
        }

        // Push the new embeddings into the vector store.
        self.index_graph(graph).await?;

        Ok(())
    }
772
773 async fn add_embeddings_sequential(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
775 let _total_chunks = graph.chunks().count();
777 let _total_entities = graph.entities().count();
778 let mut chunk_count = 0;
782 for chunk in graph.chunks_mut() {
783 if chunk.embedding.is_none() {
784 let embedding = self.embedding_generator.generate_embedding(&chunk.content);
785 chunk.embedding = Some(embedding);
786 chunk_count += 1;
787 }
788 }
789
790 let mut entity_count = 0;
792 for entity in graph.entities_mut() {
793 if entity.embedding.is_none() {
794 let entity_text = format!("{} {}", entity.name, entity.entity_type);
796 let embedding = self.embedding_generator.generate_embedding(&entity_text);
797 entity.embedding = Some(embedding);
798 entity_count += 1;
799 }
800 }
801
802 tracing::debug!(
803 "Generated embeddings for {chunk_count} chunks and {entity_count} entities"
804 );
805
806 self.index_graph(graph).await?;
809
810 Ok(())
811 }
812
813 pub async fn batch_query(
816 &mut self,
817 queries: &[&str],
818 graph: &KnowledgeGraph,
819 ) -> Result<Vec<Vec<SearchResult>>> {
820 let processor =
821 self.parallel_processor
822 .as_ref()
823 .ok_or_else(|| crate::core::GraphRAGError::Config {
824 message: "Parallel processor not initialized".to_string(),
825 })?;
826
827 if !processor.should_use_parallel(queries.len()) {
828 let mut results = Vec::new();
830 for &query in queries {
831 results.push(self.hybrid_query(query, graph).await?);
832 }
833 return Ok(results);
834 }
835
836 #[cfg(feature = "parallel-processing")]
837 {
838 let chunk_size = processor.config().chunk_batch_size.min(queries.len());
843 tracing::debug!(
844 "Processing {} queries with enhanced sequential approach (chunk size: {})",
845 queries.len(),
846 chunk_size
847 );
848
849 let mut all_results = Vec::new();
850 for &query in queries {
851 match self.hybrid_query(query, graph).await {
852 Ok(results) => all_results.push(results),
853 Err(e) => {
854 tracing::warn!("Error processing query '{query}': {e}");
855 all_results.push(Vec::new());
856 },
857 }
858 }
859
860 Ok(all_results)
861 }
862
863 #[cfg(not(feature = "parallel-processing"))]
864 {
865 let mut results = Vec::new();
867 for &query in queries {
868 results.push(self.hybrid_query(query, graph).await?);
869 }
870 Ok(results)
871 }
872 }
873
    /// Classifies a query to drive retrieval strategy selection.
    ///
    /// Extracts entities mentioned in the query (by substring overlap with
    /// graph entity names), derives non-entity concept words, and infers a
    /// [`QueryType`], a [`QueryIntent`] and a rough complexity score.
    pub fn analyze_query(&self, query: &str, graph: &KnowledgeGraph) -> Result<QueryAnalysis> {
        let query_lower = query.to_lowercase();
        let words: Vec<&str> = query_lower.split_whitespace().collect();

        // Entities whose lowercase names overlap any query word (in either
        // containment direction).
        let mut key_entities = Vec::new();
        for entity in graph.entities() {
            let entity_name_lower = entity.name.to_lowercase();
            if words
                .iter()
                .any(|&word| entity_name_lower.contains(word) || word.contains(&entity_name_lower))
            {
                key_entities.push(entity.name.clone());
            }
        }

        // Concept words: longer non-stop-words not already covered by a
        // matched entity name.
        let concepts: Vec<String> = words
            .iter()
            .filter(|&&word| word.len() > 3 && !self.is_stop_word(word))
            .filter(|&&word| {
                !key_entities.iter().any(|entity| {
                    entity.to_lowercase().contains(word) || word.contains(&entity.to_lowercase())
                })
            })
            .map(|&word| word.to_string())
            .collect();

        // Query type: multiple entities -> Relationship, one -> EntityFocused,
        // otherwise keyword heuristics (helpers defined elsewhere in file).
        let query_type = if !key_entities.is_empty() && key_entities.len() > 1 {
            QueryType::Relationship
        } else if !key_entities.is_empty() {
            QueryType::EntityFocused
        } else if self.has_abstract_concepts(&words) {
            QueryType::Conceptual
        } else if self.has_question_words(&words) {
            QueryType::Exploratory
        } else {
            QueryType::Factual
        };

        // Intent: first keyword family that matches wins; falls back to
        // Detailed.
        let intent = if words
            .iter()
            .any(|&w| ["overview", "summary", "general", "about"].contains(&w))
        {
            QueryIntent::Overview
        } else if words
            .iter()
            .any(|&w| ["detailed", "specific", "exactly", "precise"].contains(&w))
        {
            QueryIntent::Detailed
        } else if words
            .iter()
            .any(|&w| ["compare", "vs", "versus", "between", "difference"].contains(&w))
        {
            QueryIntent::Comparative
        } else if words
            .iter()
            .any(|&w| ["cause", "why", "because", "lead", "result"].contains(&w))
        {
            QueryIntent::Causal
        } else if words
            .iter()
            .any(|&w| ["when", "time", "before", "after", "during"].contains(&w))
        {
            QueryIntent::Temporal
        } else {
            QueryIntent::Detailed
        };

        // Complexity grows with word, entity, and concept counts; capped
        // at 1.0.
        let complexity_score = (words.len() as f32 * 0.1
            + key_entities.len() as f32 * 0.3
            + concepts.len() as f32 * 0.2)
            .min(1.0);

        Ok(QueryAnalysis {
            query_type,
            key_entities,
            concepts,
            intent,
            complexity_score,
        })
    }
960
    /// Runs the strategy mix chosen by [`Self::calculate_strategy_weights`]
    /// for the analyzed query and merges everything into a ranked,
    /// deduplicated top-k list.
    ///
    /// Zero-weight strategies are skipped; high-complexity queries also
    /// trigger advanced graph traversal, and cross-strategy fusion adds
    /// boosted hybrid results before final ranking.
    pub async fn execute_adaptive_retrieval(
        &mut self,
        query: &str,
        query_embedding: &[f32],
        graph: &KnowledgeGraph,
        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
        analysis: &QueryAnalysis,
    ) -> Result<Vec<SearchResult>> {
        let mut all_results = Vec::new();

        let (vector_weight, graph_weight, hierarchical_weight) =
            self.calculate_strategy_weights(analysis);

        // Strategy 1: vector similarity.
        if vector_weight > 0.0 {
            let mut vector_results = self
                .vector_similarity_search(query_embedding, graph)
                .await?;
            for result in &mut vector_results {
                result.score *= vector_weight;
            }
            all_results.extend(vector_results);
        }

        // Strategy 2: graph search — entity-centric when the query names
        // entities, generic entity expansion otherwise.
        if graph_weight > 0.0 {
            let mut graph_results = match analysis.query_type {
                QueryType::EntityFocused | QueryType::Relationship => {
                    self.entity_centric_search(query_embedding, graph, &analysis.key_entities)?
                },
                _ => self.entity_based_search(query_embedding, graph)?,
            };
            for result in &mut graph_results {
                result.score *= graph_weight;
            }
            all_results.extend(graph_results);
        }

        // Strategy 3: hierarchical summaries, only when trees exist.
        if hierarchical_weight > 0.0 && !document_trees.is_empty() {
            let mut hierarchical_results =
                self.hierarchical_search(query, document_trees, analysis)?;
            for result in &mut hierarchical_results {
                result.score *= hierarchical_weight;
            }
            all_results.extend(hierarchical_results);
        }

        // Extra traversal for complex queries.
        if analysis.complexity_score > 0.7 {
            let traversal_results =
                self.advanced_graph_traversal(query_embedding, graph, analysis)?;
            all_results.extend(traversal_results);
        }

        // Reward results that multiple strategies agree on.
        let fusion_results = self.cross_strategy_fusion(&all_results, analysis)?;
        all_results.extend(fusion_results);

        let final_results = self.adaptive_rank_and_deduplicate(all_results, analysis)?;

        Ok(final_results.into_iter().take(self.config.top_k).collect())
    }
1027
1028 pub async fn comprehensive_search(
1030 &self,
1031 query_embedding: &[f32],
1032 graph: &KnowledgeGraph,
1033 ) -> Result<Vec<SearchResult>> {
1034 let mut all_results = Vec::new();
1035
1036 let vector_results = self
1038 .vector_similarity_search(query_embedding, graph)
1039 .await?;
1040 all_results.extend(vector_results);
1041
1042 let entity_results = self.entity_based_search(query_embedding, graph)?;
1044 all_results.extend(entity_results);
1045
1046 let graph_results = self.graph_traversal_search(query_embedding, graph)?;
1048 all_results.extend(graph_results);
1049
1050 let final_results = self.rank_and_deduplicate(all_results)?;
1052
1053 Ok(final_results.into_iter().take(self.config.top_k).collect())
1054 }
1055
1056 async fn vector_similarity_search(
1058 &self,
1059 query_embedding: &[f32],
1060 graph: &KnowledgeGraph,
1061 ) -> Result<Vec<SearchResult>> {
1062 let mut results = Vec::new();
1063
1064 let similar_vectors = self
1068 .vector_store
1069 .search(query_embedding, self.config.top_k * 2)
1070 .await?;
1071
1072 for store_result in similar_vectors {
1073 let id = store_result.id;
1074 let similarity = store_result.score;
1075 if similarity >= self.config.similarity_threshold {
1076 let result = if id.starts_with("entity:") {
1077 let entity_id = EntityId::new(id.strip_prefix("entity:").unwrap().to_string());
1078 graph.get_entity(&entity_id).map(|entity| SearchResult {
1079 id: entity.id.to_string(),
1080 content: entity.name.clone(),
1081 score: similarity * self.config.entity_weight,
1082 result_type: ResultType::Entity,
1083 entities: vec![entity.name.clone()],
1084 source_chunks: entity
1085 .mentions
1086 .iter()
1087 .map(|m| m.chunk_id.to_string())
1088 .collect(),
1089 })
1090 } else if id.starts_with("chunk:") {
1091 let chunk_id = ChunkId::new(id.strip_prefix("chunk:").unwrap().to_string());
1092 if let Some(chunk) = graph.get_chunk(&chunk_id) {
1093 let entity_names: Vec<String> = chunk
1094 .entities
1095 .iter()
1096 .filter_map(|eid| graph.get_entity(eid))
1097 .map(|e| e.name.clone())
1098 .collect();
1099
1100 Some(SearchResult {
1101 id: chunk.id.to_string(),
1102 content: chunk.content.clone(),
1103 score: similarity * self.config.chunk_weight,
1104 result_type: ResultType::Chunk,
1105 entities: entity_names,
1106 source_chunks: vec![chunk.id.to_string()],
1107 })
1108 } else {
1109 None
1110 }
1111 } else {
1112 None
1113 };
1114
1115 if let Some(search_result) = result {
1116 results.push(search_result);
1117 }
1118 }
1119 }
1120
1121 Ok(results)
1122 }
1123
    /// Entity-expansion search: finds entities similar to the query
    /// embedding, expands through relationships up to the configured
    /// depth, and scores expanded hits with a small penalty.
    fn entity_based_search(
        &self,
        query_embedding: &[f32],
        graph: &KnowledgeGraph,
    ) -> Result<Vec<SearchResult>> {
        let mut results = Vec::new();
        let mut visited = HashSet::new();

        // Seed entities ranked by similarity (helper defined elsewhere).
        let entity_similarities = self.find_relevant_entities(query_embedding, graph)?;

        // Only the top 5 seeds are expanded.
        for (entity_id, similarity) in entity_similarities.into_iter().take(5) {
            if visited.contains(&entity_id) {
                continue;
            }

            let expanded_entities = self.expand_through_relationships(
                &entity_id,
                graph,
                self.config.max_expansion_depth,
                &mut visited,
            )?;

            for expanded_entity_id in expanded_entities {
                if let Some(entity) = graph.get_entity(&expanded_entity_id) {
                    // The seed keeps full score; expansion hops are
                    // discounted by 20%.
                    let expansion_penalty = if expanded_entity_id == entity_id {
                        1.0
                    } else {
                        0.8
                    };

                    results.push(SearchResult {
                        id: entity.id.to_string(),
                        content: format!("{} ({})", entity.name, entity.entity_type),
                        score: similarity * expansion_penalty * self.config.entity_weight,
                        result_type: ResultType::Entity,
                        entities: vec![entity.name.clone()],
                        source_chunks: entity
                            .mentions
                            .iter()
                            .map(|m| m.chunk_id.to_string())
                            .collect(),
                    });
                }
            }
        }

        Ok(results)
    }
1175
1176 fn calculate_strategy_weights(&self, analysis: &QueryAnalysis) -> (f32, f32, f32) {
1178 match (&analysis.query_type, &analysis.intent) {
1179 (QueryType::EntityFocused, _) => (0.5, 0.4, 0.1),
1182 (QueryType::Relationship, _) => (0.3, 0.6, 0.1),
1183 (QueryType::Conceptual, QueryIntent::Overview) => (0.2, 0.2, 0.6),
1184 (QueryType::Conceptual, _) => (0.4, 0.3, 0.3),
1185 (QueryType::Exploratory, QueryIntent::Overview) => (0.3, 0.2, 0.5),
1186 (QueryType::Exploratory, _) => (0.4, 0.4, 0.2),
1187 (QueryType::Factual, _) => (0.6, 0.3, 0.1),
1188 }
1189 }
1190
    /// Relationship-aware search seeded from the query's named entities.
    ///
    /// Each named entity found in the graph is returned with a fixed high
    /// score; its direct neighbors are scored by relationship confidence
    /// scaled by the similarity between the query embedding and the
    /// relation type's embedding.
    fn entity_centric_search(
        &mut self,
        query_embedding: &[f32],
        graph: &KnowledgeGraph,
        key_entities: &[String],
    ) -> Result<Vec<SearchResult>> {
        let mut results = Vec::new();
        let mut visited = HashSet::new();

        for entity_name in key_entities {
            // Case-insensitive exact name match against the graph.
            if let Some(entity) = graph
                .entities()
                .find(|e| e.name.eq_ignore_ascii_case(entity_name))
            {
                // The named entity itself gets a fixed near-top score.
                results.push(SearchResult {
                    id: entity.id.to_string(),
                    content: format!("{} ({})", entity.name, entity.entity_type),
                    score: 0.9,
                    result_type: ResultType::Entity,
                    entities: vec![entity.name.clone()],
                    source_chunks: entity
                        .mentions
                        .iter()
                        .map(|m| m.chunk_id.to_string())
                        .collect(),
                });

                // One-hop neighbors, each emitted at most once.
                let neighbors = graph.get_neighbors(&entity.id);
                for (neighbor, relationship) in neighbors {
                    if !visited.contains(&neighbor.id) {
                        visited.insert(neighbor.id.clone());

                        // Score neighbors by how well the relation type
                        // matches the query.
                        let rel_embedding = self
                            .embedding_generator
                            .generate_embedding(&relationship.relation_type);
                        let rel_similarity =
                            VectorUtils::cosine_similarity(query_embedding, &rel_embedding);

                        results.push(SearchResult {
                            id: neighbor.id.to_string(),
                            content: format!("{} ({})", neighbor.name, neighbor.entity_type),
                            score: 0.7 * relationship.confidence * (1.0 + rel_similarity),
                            result_type: ResultType::Entity,
                            entities: vec![neighbor.name.clone()],
                            source_chunks: neighbor
                                .mentions
                                .iter()
                                .map(|m| m.chunk_id.to_string())
                                .collect(),
                        });
                    }
                }
            }
        }

        Ok(results)
    }
1253
1254 fn hierarchical_search(
1256 &self,
1257 query: &str,
1258 document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
1259 analysis: &QueryAnalysis,
1260 ) -> Result<Vec<SearchResult>> {
1261 let mut results = Vec::new();
1262 let max_results_per_tree = match analysis.intent {
1263 QueryIntent::Overview => 3,
1264 QueryIntent::Detailed => 8,
1265 _ => 5,
1266 };
1267
1268 for (doc_id, tree) in document_trees.iter() {
1269 let tree_summaries = tree.query(query, max_results_per_tree)?;
1270
1271 for (idx, summary) in tree_summaries.iter().enumerate() {
1272 let level_bonus = match analysis.intent {
1274 QueryIntent::Overview => 0.3,
1275 QueryIntent::Detailed => 0.2,
1276 _ => 0.0,
1277 };
1278
1279 results.push(SearchResult {
1280 id: format!("{}:summary:{}", doc_id, idx),
1281 content: summary.summary.clone(),
1282 score: summary.score + level_bonus,
1283 result_type: ResultType::HierarchicalSummary,
1284 entities: Vec::new(),
1285 source_chunks: vec![doc_id.to_string()],
1286 });
1287 }
1288 }
1289
1290 Ok(results)
1291 }
1292
1293 fn advanced_graph_traversal(
1295 &self,
1296 query_embedding: &[f32],
1297 graph: &KnowledgeGraph,
1298 analysis: &QueryAnalysis,
1299 ) -> Result<Vec<SearchResult>> {
1300 let mut results = Vec::new();
1301
1302 if analysis.query_type == QueryType::Relationship && analysis.key_entities.len() >= 2 {
1303 results.extend(self.find_entity_paths(graph, &analysis.key_entities)?);
1305 }
1306
1307 if analysis.complexity_score > 0.8 {
1308 results.extend(self.community_based_search(query_embedding, graph)?);
1310 }
1311
1312 Ok(results)
1313 }
1314
    /// Detects results that multiple strategies agree on and emits boosted
    /// [`ResultType::Hybrid`] fusion results for them.
    ///
    /// Results are grouped by a 50-char content-prefix key; any group
    /// containing more than one distinct result type yields a fusion
    /// result scored as the group average plus 0.2 per extra agreeing
    /// type, capped at 1.0.
    fn cross_strategy_fusion(
        &self,
        all_results: &[SearchResult],
        _analysis: &QueryAnalysis,
    ) -> Result<Vec<SearchResult>> {
        let mut fusion_results = Vec::new();

        // Group by truncated content so near-identical hits collide.
        let mut content_groups: HashMap<String, Vec<&SearchResult>> = HashMap::new();

        for result in all_results {
            let content_key = Self::safe_truncate(&result.content, 50);

            content_groups.entry(content_key).or_default().push(result);
        }

        for (content_key, group) in content_groups {
            if group.len() > 1 {
                let types: HashSet<_> = group.iter().map(|r| &r.result_type).collect();
                if types.len() > 1 {
                    // Cross-strategy agreement: average score plus a boost
                    // proportional to the number of distinct types.
                    let avg_score = group.iter().map(|r| r.score).sum::<f32>() / group.len() as f32;
                    let boost = 0.2 * (types.len() - 1) as f32;

                    // Union of entities and source chunks across the group.
                    let all_entities: HashSet<_> =
                        group.iter().flat_map(|r| r.entities.iter()).collect();

                    let all_chunks: HashSet<_> =
                        group.iter().flat_map(|r| r.source_chunks.iter()).collect();

                    fusion_results.push(SearchResult {
                        id: format!(
                            "fusion_{}",
                            content_key.chars().take(10).collect::<String>()
                        ),
                        content: group[0].content.clone(),
                        score: (avg_score + boost).min(1.0),
                        result_type: ResultType::Hybrid,
                        entities: all_entities.into_iter().cloned().collect(),
                        source_chunks: all_chunks.into_iter().cloned().collect(),
                    });
                }
            }
        }

        Ok(fusion_results)
    }
1364
1365 fn adaptive_rank_and_deduplicate(
1367 &self,
1368 mut results: Vec<SearchResult>,
1369 analysis: &QueryAnalysis,
1370 ) -> Result<Vec<SearchResult>> {
1371 for result in &mut results {
1373 match analysis.query_type {
1374 QueryType::EntityFocused => {
1375 if result.result_type == ResultType::Entity {
1376 result.score *= 1.2;
1377 }
1378 },
1379 QueryType::Conceptual => {
1380 if result.result_type == ResultType::HierarchicalSummary {
1381 result.score *= 1.1;
1382 }
1383 },
1384 QueryType::Relationship => {
1385 if result.entities.len() > 1 {
1386 result.score *= 1.15;
1387 }
1388 },
1389 _ => {},
1390 }
1391
1392 for entity in &analysis.key_entities {
1394 if result
1395 .entities
1396 .iter()
1397 .any(|e| e.eq_ignore_ascii_case(entity))
1398 {
1399 result.score *= 1.1;
1400 }
1401 }
1402 }
1403
1404 results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
1406
1407 let mut deduplicated = Vec::new();
1409 let mut seen_content = HashSet::new();
1410 let mut type_counts: HashMap<ResultType, usize> = HashMap::new();
1411
1412 for result in results {
1413 let content_signature = self.create_content_signature(&result.content);
1414
1415 if !seen_content.contains(&content_signature) {
1416 let type_count = type_counts.get(&result.result_type).unwrap_or(&0);
1417
1418 let max_per_type = match result.result_type {
1420 ResultType::Entity => self.config.top_k / 3,
1421 ResultType::Chunk => self.config.top_k / 2,
1422 ResultType::HierarchicalSummary => self.config.top_k / 4,
1423 ResultType::Hybrid => self.config.top_k / 4,
1424 ResultType::GraphPath => self.config.top_k / 5,
1425 };
1426
1427 if *type_count < max_per_type {
1428 seen_content.insert(content_signature);
1429 *type_counts.entry(result.result_type.clone()).or_insert(0) += 1;
1430 deduplicated.push(result);
1431 }
1432 }
1433 }
1434
1435 Ok(deduplicated)
1436 }
1437
1438 fn find_entity_paths(
1440 &self,
1441 graph: &KnowledgeGraph,
1442 key_entities: &[String],
1443 ) -> Result<Vec<SearchResult>> {
1444 let mut results = Vec::new();
1445
1446 if key_entities.len() < 2 {
1447 return Ok(results);
1448 }
1449
1450 if let (Some(source), Some(target)) = (
1452 graph
1453 .entities()
1454 .find(|e| e.name.eq_ignore_ascii_case(&key_entities[0])),
1455 graph
1456 .entities()
1457 .find(|e| e.name.eq_ignore_ascii_case(&key_entities[1])),
1458 ) {
1459 let path_description =
1460 format!("Connection between {} and {}", source.name, target.name);
1461 let neighbors_source = graph.get_neighbors(&source.id);
1462 let neighbors_target = graph.get_neighbors(&target.id);
1463
1464 if neighbors_source
1466 .iter()
1467 .any(|(neighbor, _)| neighbor.id == target.id)
1468 {
1469 results.push(SearchResult {
1470 id: format!("path_{}_{}", source.id, target.id),
1471 content: format!("Direct relationship: {path_description}"),
1472 score: 0.8,
1473 result_type: ResultType::GraphPath,
1474 entities: vec![source.name.clone(), target.name.clone()],
1475 source_chunks: Vec::new(),
1476 });
1477 }
1478
1479 for (neighbor_s, rel_s) in &neighbors_source {
1481 for (neighbor_t, rel_t) in &neighbors_target {
1482 if neighbor_s.id == neighbor_t.id {
1483 results.push(SearchResult {
1484 id: format!("path_{}_{}_{}", source.id, neighbor_s.id, target.id),
1485 content: format!(
1486 "Indirect relationship via {}: {} -> {} -> {}",
1487 neighbor_s.name, source.name, neighbor_s.name, target.name
1488 ),
1489 score: 0.6 * rel_s.confidence * rel_t.confidence,
1490 result_type: ResultType::GraphPath,
1491 entities: vec![
1492 source.name.clone(),
1493 neighbor_s.name.clone(),
1494 target.name.clone(),
1495 ],
1496 source_chunks: Vec::new(),
1497 });
1498 }
1499 }
1500 }
1501 }
1502
1503 Ok(results)
1504 }
1505
1506 fn community_based_search(
1508 &self,
1509 query_embedding: &[f32],
1510 graph: &KnowledgeGraph,
1511 ) -> Result<Vec<SearchResult>> {
1512 let mut results = Vec::new();
1513 let mut entity_scores: HashMap<String, f32> = HashMap::new();
1514
1515 for entity in graph.entities() {
1517 let neighbors = graph.get_neighbors(&entity.id);
1518 let centrality_score = neighbors.len() as f32 * 0.1;
1519
1520 if let Some(embedding) = &entity.embedding {
1522 let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1523 entity_scores.insert(entity.id.to_string(), centrality_score + similarity);
1524 }
1525 }
1526
1527 let mut sorted_entities: Vec<_> = entity_scores.iter().collect();
1529 sorted_entities.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());
1530
1531 for (entity_id, score) in sorted_entities.iter().take(3) {
1532 if let Some(entity) = graph.entities().find(|e| e.id.to_string() == **entity_id) {
1533 let mut entity_context = String::new();
1535 for mention in entity.mentions.iter().take(2) {
1536 if let Some(chunk) = graph.chunks().find(|c| c.id == mention.chunk_id) {
1537 let chunk_excerpt = if chunk.content.len() > 200 {
1538 format!("{}...", &chunk.content[..200])
1539 } else {
1540 chunk.content.clone()
1541 };
1542 entity_context.push_str(&chunk_excerpt);
1543 entity_context.push(' ');
1544 }
1545 }
1546
1547 if entity_context.is_empty() {
1549 entity_context = format!(
1550 "{} is a {} character in the story.",
1551 entity.name, entity.entity_type
1552 );
1553 }
1554
1555 results.push(SearchResult {
1556 id: entity.id.to_string(),
1557 content: entity_context,
1558 score: **score,
1559 result_type: ResultType::Entity,
1560 entities: vec![entity.name.clone()],
1561 source_chunks: entity
1562 .mentions
1563 .iter()
1564 .map(|m| m.chunk_id.to_string())
1565 .collect(),
1566 });
1567 }
1568 }
1569
1570 Ok(results)
1571 }
1572
1573 fn has_abstract_concepts(&self, words: &[&str]) -> bool {
1575 const ABSTRACT_INDICATORS: &[&str] = &[
1576 "concept",
1577 "idea",
1578 "theory",
1579 "principle",
1580 "philosophy",
1581 "meaning",
1582 "understanding",
1583 "knowledge",
1584 "wisdom",
1585 "truth",
1586 "beauty",
1587 "justice",
1588 ];
1589 words
1590 .iter()
1591 .any(|&word| ABSTRACT_INDICATORS.contains(&word))
1592 }
1593
1594 fn has_question_words(&self, words: &[&str]) -> bool {
1596 const QUESTION_WORDS: &[&str] = &[
1597 "what", "how", "why", "when", "where", "who", "which", "explain", "describe",
1598 ];
1599 words.iter().any(|&word| QUESTION_WORDS.contains(&word))
1600 }
1601
1602 fn create_content_signature(&self, content: &str) -> String {
1604 let prefix = Self::safe_truncate(content, 50);
1606 format!(
1607 "{}_{}",
1608 prefix
1609 .chars()
1610 .filter(|c| c.is_alphanumeric())
1611 .collect::<String>(),
1612 content.len()
1613 )
1614 }
1615
1616 fn graph_traversal_search(
1618 &self,
1619 _query_embedding: &[f32],
1620 _graph: &KnowledgeGraph,
1621 ) -> Result<Vec<SearchResult>> {
1622 Ok(Vec::new())
1630 }
1631
1632 fn find_relevant_entities(
1634 &self,
1635 query_embedding: &[f32],
1636 graph: &KnowledgeGraph,
1637 ) -> Result<Vec<(EntityId, f32)>> {
1638 let mut similarities = Vec::new();
1639
1640 for entity in graph.entities() {
1641 if let Some(embedding) = &entity.embedding {
1642 let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1643 if similarity >= self.config.similarity_threshold {
1644 similarities.push((entity.id.clone(), similarity));
1645 }
1646 }
1647 }
1648
1649 similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
1651
1652 Ok(similarities)
1653 }
1654
1655 fn expand_through_relationships(
1657 &self,
1658 start_entity: &EntityId,
1659 graph: &KnowledgeGraph,
1660 max_depth: usize,
1661 visited: &mut HashSet<EntityId>,
1662 ) -> Result<Vec<EntityId>> {
1663 let mut results = Vec::new();
1664 let mut current_level = vec![start_entity.clone()];
1665 visited.insert(start_entity.clone());
1666
1667 for _depth in 0..max_depth {
1668 let mut next_level = Vec::new();
1669
1670 for entity_id in ¤t_level {
1671 results.push(entity_id.clone());
1672
1673 let neighbors = graph.get_neighbors(entity_id);
1675 for (neighbor_entity, _relationship) in neighbors {
1676 if !visited.contains(&neighbor_entity.id) {
1677 visited.insert(neighbor_entity.id.clone());
1678 next_level.push(neighbor_entity.id.clone());
1679 }
1680 }
1681 }
1682
1683 if next_level.is_empty() {
1684 break;
1685 }
1686
1687 current_level = next_level;
1688 }
1689
1690 Ok(results)
1691 }
1692
1693 fn is_stop_word(&self, word: &str) -> bool {
1695 const STOP_WORDS: &[&str] = &[
1696 "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
1697 "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
1698 "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
1699 "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
1700 "go", "me",
1701 ];
1702 STOP_WORDS.contains(&word)
1703 }
1704
1705 fn rank_and_deduplicate(&self, mut results: Vec<SearchResult>) -> Result<Vec<SearchResult>> {
1707 results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
1709
1710 let mut seen_ids = HashSet::new();
1712 let mut deduplicated = Vec::new();
1713
1714 for result in results {
1715 if !seen_ids.contains(&result.id) {
1716 seen_ids.insert(result.id.clone());
1717 deduplicated.push(result);
1718 }
1719 }
1720
1721 Ok(deduplicated)
1722 }
1723
1724 pub async fn vector_search(
1726 &mut self,
1727 query: &str,
1728 max_results: usize,
1729 ) -> Result<Vec<SearchResult>> {
1730 let query_embedding = self.embedding_generator.generate_embedding(query);
1731 let similar_vectors = self
1732 .vector_store
1733 .search(&query_embedding, max_results)
1734 .await?;
1735
1736 let mut results = Vec::new();
1737 for store_result in similar_vectors {
1738 results.push(SearchResult {
1739 id: store_result.id.clone(),
1740 content: format!("Vector result for: {}", store_result.id),
1741 score: store_result.score,
1742 result_type: ResultType::Chunk,
1743 entities: Vec::new(),
1744 source_chunks: vec![store_result.id],
1745 });
1746 }
1747
1748 Ok(results)
1749 }
1750
1751 pub fn graph_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1753 let mut results = Vec::new();
1755 results.push(SearchResult {
1756 id: format!("graph_result_{}", query.len()),
1757 content: format!("Graph-based result for: {query}"),
1758 score: 0.7,
1759 result_type: ResultType::GraphPath,
1760 entities: Vec::new(),
1761 source_chunks: Vec::new(),
1762 });
1763
1764 Ok(results.into_iter().take(max_results).collect())
1765 }
1766
1767 pub fn public_hierarchical_search(
1769 &self,
1770 query: &str,
1771 max_results: usize,
1772 ) -> Result<Vec<SearchResult>> {
1773 let mut results = Vec::new();
1775 results.push(SearchResult {
1776 id: format!("hierarchical_result_{}", query.len()),
1777 content: format!("Hierarchical result for: {query}"),
1778 score: 0.8,
1779 result_type: ResultType::HierarchicalSummary,
1780 entities: Vec::new(),
1781 source_chunks: Vec::new(),
1782 });
1783
1784 Ok(results.into_iter().take(max_results).collect())
1785 }
1786
1787 pub fn bm25_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1789 let mut results = Vec::new();
1791 results.push(SearchResult {
1792 id: format!("bm25_result_{}", query.len()),
1793 content: format!("BM25 result for: {query}"),
1794 score: 0.75,
1795 result_type: ResultType::Chunk,
1796 entities: Vec::new(),
1797 source_chunks: Vec::new(),
1798 });
1799
1800 Ok(results.into_iter().take(max_results).collect())
1801 }
1802
1803 pub fn get_statistics(&self) -> RetrievalStatistics {
1805 RetrievalStatistics {
1808 indexed_vectors: 0, vector_dimension: 0, index_built: false, config: self.config.clone(),
1812 }
1813 }
1814
1815 fn safe_truncate(s: &str, max_bytes: usize) -> String {
1817 if s.len() <= max_bytes {
1818 return s.to_string();
1819 }
1820
1821 let mut end_idx = max_bytes;
1823 while end_idx > 0 && !s.is_char_boundary(end_idx) {
1824 end_idx -= 1;
1825 }
1826
1827 s[..end_idx].to_string()
1828 }
1829
    /// Serializes the retrieval system's configuration and component
    /// statistics as JSON and writes it to `file_path`.
    ///
    /// # Errors
    /// Returns an error if the file cannot be written.
    pub fn save_state_to_json(&self, file_path: &str) -> Result<()> {
        use std::fs;

        let mut json_data = json::JsonValue::new_object();

        // Metadata: format version, creation timestamp, and the active
        // retrieval configuration.
        json_data["metadata"] = json::object! {
            "format_version" => "1.0",
            "created_at" => chrono::Utc::now().to_rfc3339(),
            "config" => json::object! {
                "top_k" => self.config.top_k,
                "similarity_threshold" => self.config.similarity_threshold,
                "max_expansion_depth" => self.config.max_expansion_depth,
                "entity_weight" => self.config.entity_weight,
                "chunk_weight" => self.config.chunk_weight,
                "graph_weight" => self.config.graph_weight
            }
        };

        // NOTE(review): vector-index stats are hard-coded placeholders here,
        // matching get_statistics() — confirm whether real values should be
        // recorded.
        json_data["vector_index"] = json::object! {
            "vector_count" => 0, "dimension" => 0, "index_built" => false, "min_norm" => 0.0, "max_norm" => 0.0, "avg_norm" => 0.0 };

        json_data["embedding_generator"] = json::object! {
            "dimension" => self.embedding_generator.dimension(),
            "cached_words" => self.embedding_generator.cached_words()
        };

        // Record whether parallel processing is compiled in and configured.
        #[cfg(feature = "parallel-processing")]
        {
            json_data["parallel_enabled"] = self.parallel_processor.is_some().into();
        }
        #[cfg(not(feature = "parallel-processing"))]
        {
            json_data["parallel_enabled"] = false.into();
        }

        fs::write(file_path, json_data.dump())?;
        tracing::info!("Retrieval system state saved to {file_path}");

        Ok(())
    }
1883}
1884
/// Snapshot of retrieval-system state for inspection and logging.
#[derive(Debug)]
pub struct RetrievalStatistics {
    // Number of vectors currently indexed.
    pub indexed_vectors: usize,
    // Dimensionality of the indexed vectors.
    pub vector_dimension: usize,
    // Whether the vector index has been built.
    pub index_built: bool,
    // The retrieval configuration in effect when the snapshot was taken.
    pub config: RetrievalConfig,
}
1897
impl RetrievalStatistics {
    /// Emits every statistic as a series of `tracing::info!` records.
    ///
    /// Each field is logged as its own record, so restructuring these calls
    /// would change the observable log output.
    #[allow(dead_code)]
    pub fn print(&self) {
        tracing::info!("Retrieval System Statistics:");
        tracing::info!("  Indexed vectors: {}", self.indexed_vectors);
        tracing::info!("  Vector dimension: {}", self.vector_dimension);
        tracing::info!("  Index built: {}", self.index_built);
        tracing::info!("  Configuration:");
        tracing::info!("    Top K: {}", self.config.top_k);
        tracing::info!(
            "    Similarity threshold: {:.2}",
            self.config.similarity_threshold
        );
        tracing::info!(
            "    Max expansion depth: {}",
            self.config.max_expansion_depth
        );
        tracing::info!("    Entity weight: {:.2}", self.config.entity_weight);
        tracing::info!("    Chunk weight: {:.2}", self.config.chunk_weight);
        tracing::info!("    Graph weight: {:.2}", self.config.graph_weight);
    }
}
1921
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{config::Config, core::KnowledgeGraph};

    // Construction from a default Config should succeed.
    #[test]
    fn test_retrieval_system_creation() {
        let config = Config::default();
        let retrieval = RetrievalSystem::new(&config);
        assert!(retrieval.is_ok());
    }

    // The placeholder query path should echo the query text in its results.
    #[test]
    fn test_query_placeholder() {
        let config = Config::default();
        let retrieval = RetrievalSystem::new(&config).unwrap();

        let results = retrieval.query("test query");
        assert!(results.is_ok());

        let results = results.unwrap();
        assert!(!results.is_empty());
        assert!(results[0].contains("test query"));
    }

    // Indexing an empty knowledge graph should be a successful no-op.
    #[tokio::test]
    async fn test_graph_indexing() {
        let config = Config::default();
        let mut retrieval = RetrievalSystem::new(&config).unwrap();
        let graph = KnowledgeGraph::new();

        let result = retrieval.index_graph(&graph).await;
        assert!(result.is_ok());
    }

    // ExplainedAnswer built from non-empty results should populate answer,
    // confidence, sources, and reasoning steps.
    #[test]
    fn test_explained_answer_creation() {
        let search_results = vec![
            SearchResult {
                id: "chunk_1".to_string(),
                content: "This is the first relevant chunk about climate change.".to_string(),
                score: 0.85,
                result_type: ResultType::Chunk,
                entities: vec!["climate".to_string(), "environment".to_string()],
                source_chunks: vec!["doc1_chunk1".to_string()],
            },
            SearchResult {
                id: "chunk_2".to_string(),
                content: "Another chunk discussing environmental policies.".to_string(),
                score: 0.72,
                result_type: ResultType::Chunk,
                entities: vec!["policy".to_string(), "environment".to_string()],
                source_chunks: vec!["doc1_chunk2".to_string()],
            },
        ];

        let explained = ExplainedAnswer::from_results(
            "Climate change is a major environmental concern.".to_string(),
            &search_results,
            "What is climate change?",
        );

        assert!(!explained.answer.is_empty());
        assert!(explained.confidence > 0.0 && explained.confidence <= 1.0);
        assert!(!explained.sources.is_empty());
        assert!(!explained.reasoning_steps.is_empty());
    }

    // With no supporting results, confidence drops to zero and no sources
    // are reported, but reasoning steps are still produced.
    #[test]
    fn test_explained_answer_empty_results() {
        let explained = ExplainedAnswer::from_results(
            "No relevant information found.".to_string(),
            &[],
            "What is something unknown?",
        );

        assert_eq!(explained.confidence, 0.0);
        assert!(explained.sources.is_empty());
        assert!(!explained.reasoning_steps.is_empty());
    }

    // The markdown display format should contain every section header.
    #[test]
    fn test_explained_answer_format_display() {
        let search_results = vec![SearchResult {
            id: "test_chunk".to_string(),
            content: "Test content about technology.".to_string(),
            score: 0.9,
            result_type: ResultType::Chunk,
            entities: vec!["technology".to_string()],
            source_chunks: vec!["doc1_chunk1".to_string()],
        }];

        let explained = ExplainedAnswer::from_results(
            "Technology is important.".to_string(),
            &search_results,
            "Why is technology important?",
        );

        let formatted = explained.format_display();

        assert!(formatted.contains("**Answer:**"));
        assert!(formatted.contains("**Confidence:**"));
        assert!(formatted.contains("**Reasoning:**"));
        assert!(formatted.contains("**Sources:**"));
    }

    // Reasoning steps should be numbered 1..=n with non-empty descriptions
    // and confidences in [0, 1].
    #[test]
    fn test_reasoning_steps_structure() {
        let search_results = vec![SearchResult {
            id: "entity_1".to_string(),
            content: "Entity description".to_string(),
            score: 0.8,
            result_type: ResultType::Entity,
            entities: vec!["person".to_string(), "organization".to_string()],
            source_chunks: vec![],
        }];

        let explained = ExplainedAnswer::from_results(
            "Answer text".to_string(),
            &search_results,
            "Who are the key people?",
        );

        for (i, step) in explained.reasoning_steps.iter().enumerate() {
            assert_eq!(step.step_number as usize, i + 1);
            assert!(!step.description.is_empty());
            assert!(step.confidence >= 0.0 && step.confidence <= 1.0);
        }
    }

    // Each ResultType should map to its corresponding SourceType in the
    // explained answer's source references.
    #[test]
    fn test_source_reference_types() {
        let search_results = vec![
            SearchResult {
                id: "chunk".to_string(),
                content: "Chunk content".to_string(),
                score: 0.7,
                result_type: ResultType::Chunk,
                entities: vec![],
                source_chunks: vec![],
            },
            SearchResult {
                id: "entity".to_string(),
                content: "Entity content".to_string(),
                score: 0.6,
                result_type: ResultType::Entity,
                entities: vec![],
                source_chunks: vec![],
            },
            SearchResult {
                id: "path".to_string(),
                content: "Graph path content".to_string(),
                score: 0.5,
                result_type: ResultType::GraphPath,
                entities: vec![],
                source_chunks: vec![],
            },
        ];

        let explained =
            ExplainedAnswer::from_results("Answer".to_string(), &search_results, "Query");

        let source_types: Vec<_> = explained.sources.iter().map(|s| &s.source_type).collect();
        assert!(source_types.contains(&&SourceType::TextChunk));
        assert!(source_types.contains(&&SourceType::Entity));
        assert!(source_types.contains(&&SourceType::Relationship));
    }
}