1pub mod adaptive;
2pub mod bm25;
4pub mod enriched;
6pub mod hybrid;
8pub mod pagerank_retrieval;
9#[cfg(feature = "pagerank")]
11pub mod hipporag_ppr;
12
13use crate::{
14 config::Config,
15 core::{ChunkId, EntityId, KnowledgeGraph},
16 summarization::DocumentTree,
17 vector::{EmbeddingGenerator, VectorIndex, VectorUtils},
18 Result,
19};
20#[cfg(feature = "parallel-processing")]
21use crate::parallel::ParallelProcessor;
22use std::collections::{HashMap, HashSet};
23
24pub use bm25::{BM25Result, BM25Retriever, Document as BM25Document};
25pub use enriched::{EnrichedRetrievalConfig, EnrichedRetriever};
26pub use hybrid::{FusionMethod, HybridConfig, HybridRetriever, HybridSearchResult};
27
28#[cfg(feature = "pagerank")]
29pub use pagerank_retrieval::{PageRankRetrievalSystem, ScoredResult};
30
31#[cfg(feature = "pagerank")]
32pub use hipporag_ppr::{HippoRAGConfig, HippoRAGRetriever, Fact};
33
/// Core retrieval engine combining vector similarity search with
/// graph-based and hierarchical (document-tree) strategies.
pub struct RetrievalSystem {
    // Vector index over entity and chunk embeddings; ids are stored with an
    // "entity:" / "chunk:" prefix (see `index_graph`).
    vector_index: VectorIndex,
    // Generates embeddings for queries and content (dimension fixed at construction).
    embedding_generator: EmbeddingGenerator,
    // Ranking weights, similarity threshold, and expansion depth.
    config: RetrievalConfig,
    #[cfg(feature = "parallel-processing")]
    // Optional processor enabling batched code paths.
    parallel_processor: Option<ParallelProcessor>,
    #[cfg(feature = "pagerank")]
    // Lazily created by `initialize_pagerank()`.
    pagerank_retriever: Option<PageRankRetrievalSystem>,
    // Lazily created by `initialize_enriched()`; post-processes results.
    enriched_retriever: Option<EnrichedRetriever>,
}
45
/// Tuning knobs for [`RetrievalSystem`] ranking and graph expansion.
#[derive(Debug, Clone)]
pub struct RetrievalConfig {
    /// Maximum number of results returned by a query.
    pub top_k: usize,
    /// Minimum cosine similarity for a vector match to be kept.
    pub similarity_threshold: f32,
    /// Maximum hop depth when expanding entities through relationships.
    pub max_expansion_depth: usize,
    /// Score multiplier applied to entity results.
    pub entity_weight: f32,
    /// Score multiplier applied to chunk results.
    pub chunk_weight: f32,
    /// Score multiplier applied to graph-derived results.
    pub graph_weight: f32,
}
62
63impl Default for RetrievalConfig {
64 fn default() -> Self {
65 Self {
66 top_k: 10,
67 similarity_threshold: 0.7,
68 max_expansion_depth: 2,
69 entity_weight: 0.4,
70 chunk_weight: 0.4,
71 graph_weight: 0.2,
72 }
73 }
74}
75
/// A single ranked retrieval hit, regardless of which strategy produced it.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Identifier of the underlying entity / chunk / summary / path.
    pub id: String,
    /// Text surfaced to the consumer (entity name, chunk text, summary, ...).
    pub content: String,
    /// Final (possibly weighted and boosted) relevance score.
    pub score: f32,
    /// Which strategy family produced this result.
    pub result_type: ResultType,
    /// Names of entities associated with this result.
    pub entities: Vec<String>,
    /// Ids of the chunks this result is grounded in.
    pub source_chunks: Vec<String>,
}
92
/// Provenance of a [`SearchResult`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ResultType {
    /// A knowledge-graph entity.
    Entity,
    /// A text chunk.
    Chunk,
    /// A path between entities found by graph traversal.
    GraphPath,
    /// A summary node from a hierarchical document tree.
    HierarchicalSummary,
    /// A fused result backed by multiple agreeing strategies.
    Hybrid,
}
107
/// Output of [`RetrievalSystem::analyze_query`]: a coarse classification of
/// the query used to pick and weight retrieval strategies.
#[derive(Debug, Clone)]
pub struct QueryAnalysis {
    /// Broad category of the query.
    pub query_type: QueryType,
    /// Known entity names detected in the query text.
    pub key_entities: Vec<String>,
    /// Remaining significant (non-stop, non-entity) query words.
    pub concepts: Vec<String>,
    /// What kind of answer the user appears to want.
    pub intent: QueryIntent,
    /// 0.0–1.0 heuristic; > 0.7 enables extra traversal strategies.
    pub complexity_score: f32,
}
122
/// Broad query categories driving strategy selection
/// (see `analyze_query` for the classification rules).
#[derive(Debug, Clone, PartialEq)]
pub enum QueryType {
    /// Mentions exactly one known entity.
    EntityFocused,
    /// Uses abstract/conceptual vocabulary.
    Conceptual,
    /// Plain lookup with no recognized entities or question words.
    Factual,
    /// Open-ended question ("what", "how", ...).
    Exploratory,
    /// Mentions two or more known entities.
    Relationship,
}
137
/// The kind of answer a query appears to ask for
/// (keyword-matched in `analyze_query`).
#[derive(Debug, Clone, PartialEq)]
pub enum QueryIntent {
    /// High-level summary ("overview", "about", ...).
    Overview,
    /// Specific facts ("detailed", "exactly", ...); also the fallback.
    Detailed,
    /// Comparison between things ("vs", "difference", ...).
    Comparative,
    /// Cause/effect questions ("why", "because", ...).
    Causal,
    /// Time-oriented questions ("when", "before", ...).
    Temporal,
}
152
/// Richer analysis payload carrying confidence and strategy hints.
///
/// NOTE(review): not produced or consumed anywhere in this file — presumably
/// used by a sibling module (e.g. `adaptive`); confirm before changing.
#[derive(Debug, Clone)]
pub struct QueryAnalysisResult {
    /// Broad category of the query.
    pub query_type: QueryType,
    /// Confidence in the classification (range not enforced here — verify).
    pub confidence: f32,
    /// Keywords that triggered the classification.
    pub keywords_matched: Vec<String>,
    /// Names of retrieval strategies recommended for this query.
    pub suggested_strategies: Vec<String>,
    /// Heuristic complexity estimate.
    pub complexity_score: f32,
}
167
/// A complete answer package: the original query, its ranked results, an
/// optional synthesized summary, and free-form metadata.
#[derive(Debug, Clone)]
pub struct QueryResult {
    /// The query string as submitted.
    pub query: String,
    /// Ranked search results.
    pub results: Vec<SearchResult>,
    /// Optional synthesized answer/summary text.
    pub summary: Option<String>,
    /// Extra key/value diagnostics (timings, strategy info, ...).
    pub metadata: HashMap<String, String>,
}
180
181impl RetrievalSystem {
182 pub fn new(config: &Config) -> Result<Self> {
184 let retrieval_config = RetrievalConfig {
185 top_k: config.retrieval.top_k,
186 similarity_threshold: 0.35, max_expansion_depth: 2, entity_weight: 0.4, chunk_weight: 0.4, graph_weight: 0.2, };
192
193 Ok(Self {
194 vector_index: VectorIndex::new(),
195 embedding_generator: EmbeddingGenerator::new(128), config: retrieval_config,
197 #[cfg(feature = "parallel-processing")]
198 parallel_processor: None,
199 #[cfg(feature = "pagerank")]
200 pagerank_retriever: None,
201 enriched_retriever: None,
202 })
203 }
204
205 #[cfg(feature = "parallel-processing")]
207 pub fn with_parallel_processing(
208 config: &Config,
209 parallel_processor: ParallelProcessor,
210 ) -> Result<Self> {
211 let retrieval_config = RetrievalConfig {
212 top_k: config.retrieval.top_k,
213 similarity_threshold: 0.35, max_expansion_depth: 2,
215 entity_weight: 0.4,
216 chunk_weight: 0.4,
217 graph_weight: 0.2,
218 };
219
220 Ok(Self {
221 vector_index: VectorIndex::with_parallel_processing(parallel_processor.clone()),
222 embedding_generator: EmbeddingGenerator::with_parallel_processing(
223 128,
224 parallel_processor.clone(),
225 ),
226 config: retrieval_config,
227 parallel_processor: Some(parallel_processor),
228 #[cfg(feature = "pagerank")]
229 pagerank_retriever: None,
230 enriched_retriever: None,
231 })
232 }
233
234 pub fn index_graph(&mut self, graph: &KnowledgeGraph) -> Result<()> {
236 for entity in graph.entities() {
238 if let Some(embedding) = &entity.embedding {
239 let id = format!("entity:{}", entity.id);
240 self.vector_index.add_vector(id, embedding.clone())?;
241 }
242 }
243
244 for chunk in graph.chunks() {
246 if let Some(embedding) = &chunk.embedding {
247 let id = format!("chunk:{}", chunk.id);
248 self.vector_index.add_vector(id, embedding.clone())?;
249 }
250 }
251
252 if !self.vector_index.is_empty() {
254 self.vector_index.build_index()?;
255 }
256
257 Ok(())
258 }
259
260 #[cfg(feature = "pagerank")]
262 pub fn initialize_pagerank(&mut self, graph: &KnowledgeGraph) -> Result<()> {
263 use crate::graph::pagerank::{PageRankConfig, ScoreWeights};
264
265 tracing::debug!("Initializing high-performance PageRank retrieval system...");
266
267 let pagerank_config = PageRankConfig {
268 damping_factor: 0.85,
269 max_iterations: 50, tolerance: 1e-5, personalized: true,
272 #[cfg(feature = "parallel-processing")]
273 parallel_enabled: self.parallel_processor.is_some(),
274 #[cfg(not(feature = "parallel-processing"))]
275 parallel_enabled: false,
276 cache_size: 2000, sparse_threshold: 500,
278 incremental_updates: true,
279 simd_block_size: 64, };
281
282 let score_weights = ScoreWeights {
283 vector_weight: 0.3,
284 pagerank_weight: 0.5, chunk_weight: 0.15,
286 relationship_weight: 0.05,
287 };
288
289 let mut pagerank_retriever = PageRankRetrievalSystem::new(self.config.top_k)
290 .with_pagerank_config(pagerank_config)
291 .with_score_weights(score_weights)
292 .with_incremental_mode(true)
293 .with_min_threshold(0.05);
294
295 pagerank_retriever.initialize_vector_index(graph)?;
297
298 pagerank_retriever.precompute_global_pagerank(graph)?;
300
301 self.pagerank_retriever = Some(pagerank_retriever);
302
303 tracing::debug!("PageRank retrieval system initialized with 27x performance optimizations");
304 Ok(())
305 }
306
307 pub fn initialize_enriched(&mut self, config: Option<EnrichedRetrievalConfig>) -> Result<()> {
309 tracing::debug!("Initializing enriched metadata-aware retrieval system...");
310
311 let enriched_config = config.unwrap_or_default();
312 let enriched_retriever = EnrichedRetriever::with_config(enriched_config);
313
314 self.enriched_retriever = Some(enriched_retriever);
315
316 tracing::debug!("Enriched retrieval system initialized with metadata boosting");
317 Ok(())
318 }
319
320 #[cfg(feature = "pagerank")]
322 pub fn pagerank_query(
323 &self,
324 query: &str,
325 graph: &KnowledgeGraph,
326 max_results: Option<usize>,
327 ) -> Result<Vec<ScoredResult>> {
328 if let Some(pagerank_retriever) = &self.pagerank_retriever {
329 pagerank_retriever.search_with_pagerank(query, graph, max_results)
330 } else {
331 Err(crate::core::GraphRAGError::Retrieval {
332 message: "PageRank retriever not initialized. Call initialize_pagerank() first.".to_string(),
333 })
334 }
335 }
336
337 #[cfg(feature = "pagerank")]
339 pub fn pagerank_batch_query(
340 &self,
341 queries: &[&str],
342 graph: &KnowledgeGraph,
343 max_results_per_query: Option<usize>,
344 ) -> Result<Vec<Vec<ScoredResult>>> {
345 if let Some(pagerank_retriever) = &self.pagerank_retriever {
346 pagerank_retriever.batch_search(queries, graph, max_results_per_query)
347 } else {
348 Err(crate::core::GraphRAGError::Retrieval {
349 message: "PageRank retriever not initialized. Call initialize_pagerank() first.".to_string(),
350 })
351 }
352 }
353
    /// Placeholder query entry point.
    ///
    /// NOTE(review): performs no retrieval — it simply echoes the query
    /// string back. Use `hybrid_query` / `hybrid_query_with_trees` for real
    /// searches; confirm whether this stub is still needed by callers.
    pub fn query(&self, query: &str) -> Result<Vec<String>> {
        Ok(vec![format!("Results for query: {}", query)])
    }
365
    /// Convenience wrapper over [`Self::hybrid_query_with_trees`] with no
    /// hierarchical document trees (the hierarchical strategy is skipped).
    pub fn hybrid_query(
        &mut self,
        query: &str,
        graph: &KnowledgeGraph,
    ) -> Result<Vec<SearchResult>> {
        self.hybrid_query_with_trees(query, graph, &HashMap::new())
    }
374
375 pub fn hybrid_query_with_trees(
377 &mut self,
378 query: &str,
379 graph: &KnowledgeGraph,
380 document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
381 ) -> Result<Vec<SearchResult>> {
382 let analysis = self.analyze_query(query, graph)?;
384
385 let query_embedding = self.embedding_generator.generate_embedding(query);
387
388 let mut results = self.execute_adaptive_retrieval(
390 query,
391 &query_embedding,
392 graph,
393 document_trees,
394 &analysis,
395 )?;
396
397 if let Some(enriched_retriever) = &self.enriched_retriever {
399 results = enriched_retriever.boost_with_metadata(results, query, graph)?;
401
402 results = enriched_retriever.filter_by_structure(query, results, graph)?;
404 }
405
406 Ok(results)
407 }
408
409 pub fn legacy_hybrid_query(
411 &mut self,
412 query: &str,
413 graph: &KnowledgeGraph,
414 ) -> Result<Vec<SearchResult>> {
415 let query_embedding = self.embedding_generator.generate_embedding(query);
417
418 let results = self.comprehensive_search(&query_embedding, graph)?;
420
421 Ok(results)
422 }
423
    /// Ensure every chunk and entity in `graph` has an embedding, then
    /// rebuild the vector index.
    ///
    /// Routes through the parallel-capable path when a processor is
    /// configured; otherwise runs the sequential pass.
    pub fn add_embeddings_to_graph(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
        #[cfg(feature = "parallel-processing")]
        if let Some(processor) = self.parallel_processor.clone() {
            return self.add_embeddings_parallel(graph, &processor);
        }

        self.add_embeddings_sequential(graph)
    }
433
    /// Processor-aware variant of embedding generation.
    ///
    /// Collects the (id, text) pairs that still need embeddings, generates
    /// them, writes them back, and re-indexes the graph.
    ///
    /// NOTE(review): despite the name, generation currently runs
    /// sequentially — the processor is only consulted to decide whether to
    /// log (the debug message itself says "sequential approach"); confirm
    /// whether a genuinely parallel implementation is still planned.
    #[cfg(feature = "parallel-processing")]
    fn add_embeddings_parallel(
        &mut self,
        graph: &mut KnowledgeGraph,
        processor: &ParallelProcessor,
    ) -> Result<()> {
        // Collect work items first so the immutable iteration over `graph`
        // ends before the mutable writes below.
        let mut chunk_texts = Vec::new();
        let mut entity_texts = Vec::new();

        for chunk in graph.chunks() {
            if chunk.embedding.is_none() {
                chunk_texts.push((chunk.id.clone(), chunk.content.clone()));
            }
        }

        for entity in graph.entities() {
            if entity.embedding.is_none() {
                // Embed name together with type so same-named entities of
                // different kinds get distinct vectors.
                let entity_text = format!("{} {}", entity.name, entity.entity_type);
                entity_texts.push((entity.id.clone(), entity_text));
            }
        }

        let total_items = chunk_texts.len() + entity_texts.len();
        if processor.should_use_parallel(total_items) {
            tracing::debug!("Processing {total_items} embeddings with enhanced sequential approach");
        }

        for (chunk_id, text) in chunk_texts {
            let embedding = self.embedding_generator.generate_embedding(&text);
            if let Some(chunk) = graph.get_chunk_mut(&chunk_id) {
                chunk.embedding = Some(embedding);
            }
        }

        for (entity_id, text) in entity_texts {
            let embedding = self.embedding_generator.generate_embedding(&text);
            if let Some(entity) = graph.get_entity_mut(&entity_id) {
                entity.embedding = Some(embedding);
            }
        }

        // Newly embedded content must be searchable.
        self.index_graph(graph)?;

        Ok(())
    }
490
491 fn add_embeddings_sequential(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
493 let _total_chunks = graph.chunks().count();
495 let _total_entities = graph.entities().count();
496 let mut chunk_count = 0;
500 for chunk in graph.chunks_mut() {
501 if chunk.embedding.is_none() {
502 let embedding = self.embedding_generator.generate_embedding(&chunk.content);
503 chunk.embedding = Some(embedding);
504 chunk_count += 1;
505 }
506 }
507
508 let mut entity_count = 0;
510 for entity in graph.entities_mut() {
511 if entity.embedding.is_none() {
512 let entity_text = format!("{} {}", entity.name, entity.entity_type);
514 let embedding = self.embedding_generator.generate_embedding(&entity_text);
515 entity.embedding = Some(embedding);
516 entity_count += 1;
517 }
518 }
519
520 tracing::debug!("Generated embeddings for {chunk_count} chunks and {entity_count} entities");
521
522 self.index_graph(graph)?;
524
525 Ok(())
526 }
527
528 pub fn batch_query(
530 &mut self,
531 queries: &[&str],
532 graph: &KnowledgeGraph,
533 ) -> Result<Vec<Vec<SearchResult>>> {
534 #[cfg(feature = "parallel-processing")]
535 if let Some(processor) = &self.parallel_processor.clone() {
536 return self.batch_query_parallel(queries, graph, processor);
537 }
538
539 queries
541 .iter()
542 .map(|&query| self.hybrid_query(query, graph))
543 .collect()
544 }
545
546 #[cfg(feature = "parallel-processing")]
548 fn batch_query_parallel(
549 &mut self,
550 queries: &[&str],
551 graph: &KnowledgeGraph,
552 processor: &ParallelProcessor,
553 ) -> Result<Vec<Vec<SearchResult>>> {
554 if !processor.should_use_parallel(queries.len()) {
555 return queries
557 .iter()
558 .map(|&query| self.hybrid_query(query, graph))
559 .collect();
560 }
561
562 #[cfg(feature = "parallel-processing")]
563 {
564 let chunk_size = processor.config().chunk_batch_size.min(queries.len());
569 tracing::debug!(
570 "Processing {} queries with enhanced sequential approach (chunk size: {})",
571 queries.len(),
572 chunk_size
573 );
574
575 let mut all_results = Vec::new();
576 for &query in queries {
577 match self.hybrid_query(query, graph) {
578 Ok(results) => all_results.push(results),
579 Err(e) => {
580 tracing::warn!("Error processing query '{query}': {e}");
581 all_results.push(Vec::new());
582 }
583 }
584 }
585
586 Ok(all_results)
587 }
588
589 #[cfg(not(feature = "parallel-processing"))]
590 {
591 queries
593 .iter()
594 .map(|&query| self.hybrid_query(query, graph))
595 .collect()
596 }
597 }
598
    /// Classify a query into a [`QueryAnalysis`]: which known entities it
    /// names, its residual concept words, a coarse type/intent pair, and a
    /// 0.0–1.0 complexity score.
    pub fn analyze_query(&self, query: &str, graph: &KnowledgeGraph) -> Result<QueryAnalysis> {
        let query_lower = query.to_lowercase();
        let words: Vec<&str> = query_lower.split_whitespace().collect();

        // Entities whose names overlap any query word (substring match in
        // either direction, case-insensitive).
        let mut key_entities = Vec::new();
        for entity in graph.entities() {
            let entity_name_lower = entity.name.to_lowercase();
            if words
                .iter()
                .any(|&word| entity_name_lower.contains(word) || word.contains(&entity_name_lower))
            {
                key_entities.push(entity.name.clone());
            }
        }

        // Concept words: longer non-stop-words not already covered by a
        // matched entity name.
        let concepts: Vec<String> = words
            .iter()
            .filter(|&&word| word.len() > 3 && !self.is_stop_word(word))
            .filter(|&&word| {
                !key_entities.iter().any(|entity| {
                    entity.to_lowercase().contains(word) || word.contains(&entity.to_lowercase())
                })
            })
            .map(|&word| word.to_string())
            .collect();

        // Type: ordered checks — multiple entities imply a relationship
        // query, one implies entity-focused, then abstract vocabulary, then
        // question words, defaulting to factual.
        let query_type = if !key_entities.is_empty() && key_entities.len() > 1 {
            QueryType::Relationship
        } else if !key_entities.is_empty() {
            QueryType::EntityFocused
        } else if self.has_abstract_concepts(&words) {
            QueryType::Conceptual
        } else if self.has_question_words(&words) {
            QueryType::Exploratory
        } else {
            QueryType::Factual
        };

        // Intent: first matching keyword group wins; Detailed is the default.
        let intent = if words
            .iter()
            .any(|&w| ["overview", "summary", "general", "about"].contains(&w))
        {
            QueryIntent::Overview
        } else if words
            .iter()
            .any(|&w| ["detailed", "specific", "exactly", "precise"].contains(&w))
        {
            QueryIntent::Detailed
        } else if words
            .iter()
            .any(|&w| ["compare", "vs", "versus", "between", "difference"].contains(&w))
        {
            QueryIntent::Comparative
        } else if words
            .iter()
            .any(|&w| ["cause", "why", "because", "lead", "result"].contains(&w))
        {
            QueryIntent::Causal
        } else if words
            .iter()
            .any(|&w| ["when", "time", "before", "after", "during"].contains(&w))
        {
            QueryIntent::Temporal
        } else {
            QueryIntent::Detailed
        };

        // Linear blend of query length, entity count and concept count,
        // capped at 1.0.
        let complexity_score = (words.len() as f32 * 0.1
            + key_entities.len() as f32 * 0.3
            + concepts.len() as f32 * 0.2)
            .min(1.0);

        Ok(QueryAnalysis {
            query_type,
            key_entities,
            concepts,
            intent,
            complexity_score,
        })
    }
685
    /// Run the retrieval strategies selected and weighted for this query,
    /// then fuse, rank, deduplicate, and truncate to `top_k`.
    pub fn execute_adaptive_retrieval(
        &mut self,
        query: &str,
        query_embedding: &[f32],
        graph: &KnowledgeGraph,
        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
        analysis: &QueryAnalysis,
    ) -> Result<Vec<SearchResult>> {
        let mut all_results = Vec::new();

        // Per-strategy weights derived from the query type/intent pair.
        let (vector_weight, graph_weight, hierarchical_weight) =
            self.calculate_strategy_weights(analysis);

        // Strategy 1: raw vector similarity, scaled by its weight.
        if vector_weight > 0.0 {
            let mut vector_results = self.vector_similarity_search(query_embedding, graph)?;
            for result in &mut vector_results {
                result.score *= vector_weight;
            }
            all_results.extend(vector_results);
        }

        // Strategy 2: graph search — entity-centric when the query names
        // entities, generic entity expansion otherwise.
        if graph_weight > 0.0 {
            let mut graph_results = match analysis.query_type {
                QueryType::EntityFocused | QueryType::Relationship => {
                    self.entity_centric_search(query_embedding, graph, &analysis.key_entities)?
                }
                _ => self.entity_based_search(query_embedding, graph)?,
            };
            for result in &mut graph_results {
                result.score *= graph_weight;
            }
            all_results.extend(graph_results);
        }

        // Strategy 3: hierarchical summaries, when trees are available.
        if hierarchical_weight > 0.0 && !document_trees.is_empty() {
            let mut hierarchical_results =
                self.hierarchical_search(query, document_trees, analysis)?;
            for result in &mut hierarchical_results {
                result.score *= hierarchical_weight;
            }
            all_results.extend(hierarchical_results);
        }

        // Complex queries additionally get path/community traversal
        // (these results are not weight-scaled).
        if analysis.complexity_score > 0.7 {
            let traversal_results =
                self.advanced_graph_traversal(query_embedding, graph, analysis)?;
            all_results.extend(traversal_results);
        }

        // Synthesize bonus "Hybrid" results where strategies agree.
        let fusion_results = self.cross_strategy_fusion(&all_results, analysis)?;
        all_results.extend(fusion_results);

        let final_results = self.adaptive_rank_and_deduplicate(all_results, analysis)?;

        Ok(final_results.into_iter().take(self.config.top_k).collect())
    }
750
751 pub fn comprehensive_search(
753 &self,
754 query_embedding: &[f32],
755 graph: &KnowledgeGraph,
756 ) -> Result<Vec<SearchResult>> {
757 let mut all_results = Vec::new();
758
759 let vector_results = self.vector_similarity_search(query_embedding, graph)?;
761 all_results.extend(vector_results);
762
763 let entity_results = self.entity_based_search(query_embedding, graph)?;
765 all_results.extend(entity_results);
766
767 let graph_results = self.graph_traversal_search(query_embedding, graph)?;
769 all_results.extend(graph_results);
770
771 let final_results = self.rank_and_deduplicate(all_results)?;
773
774 Ok(final_results.into_iter().take(self.config.top_k).collect())
775 }
776
    /// Look up the query embedding in the vector index and convert the
    /// matches back into entity/chunk [`SearchResult`]s.
    ///
    /// Fetches `top_k * 2` candidates so that threshold filtering still
    /// leaves enough survivors.
    fn vector_similarity_search(
        &self,
        query_embedding: &[f32],
        graph: &KnowledgeGraph,
    ) -> Result<Vec<SearchResult>> {
        let mut results = Vec::new();

        let similar_vectors = self
            .vector_index
            .search(query_embedding, self.config.top_k * 2)?;

        for (id, similarity) in similar_vectors {
            if similarity >= self.config.similarity_threshold {
                // Ids were stored with an "entity:"/"chunk:" prefix by
                // `index_graph`; strip it to recover the original id.
                let result = if id.starts_with("entity:") {
                    let entity_id = EntityId::new(id.strip_prefix("entity:").unwrap().to_string());
                    graph.get_entity(&entity_id).map(|entity| SearchResult {
                        id: entity.id.to_string(),
                        content: entity.name.clone(),
                        score: similarity * self.config.entity_weight,
                        result_type: ResultType::Entity,
                        entities: vec![entity.name.clone()],
                        source_chunks: entity
                            .mentions
                            .iter()
                            .map(|m| m.chunk_id.to_string())
                            .collect(),
                    })
                } else if id.starts_with("chunk:") {
                    let chunk_id = ChunkId::new(id.strip_prefix("chunk:").unwrap().to_string());
                    if let Some(chunk) = graph.get_chunk(&chunk_id) {
                        // Resolve the chunk's entity ids to display names.
                        let entity_names: Vec<String> = chunk
                            .entities
                            .iter()
                            .filter_map(|eid| graph.get_entity(eid))
                            .map(|e| e.name.clone())
                            .collect();

                        Some(SearchResult {
                            id: chunk.id.to_string(),
                            content: chunk.content.clone(),
                            score: similarity * self.config.chunk_weight,
                            result_type: ResultType::Chunk,
                            entities: entity_names,
                            source_chunks: vec![chunk.id.to_string()],
                        })
                    } else {
                        None
                    }
                } else {
                    // Unknown prefix or stale index entry: drop silently.
                    None
                };

                if let Some(search_result) = result {
                    results.push(search_result);
                }
            }
        }

        Ok(results)
    }
839
    /// Find entities similar to the query, expand each through its
    /// relationships up to `max_expansion_depth`, and score the expansion
    /// set (indirectly reached entities are discounted).
    fn entity_based_search(
        &self,
        query_embedding: &[f32],
        graph: &KnowledgeGraph,
    ) -> Result<Vec<SearchResult>> {
        let mut results = Vec::new();
        let mut visited = HashSet::new();

        // Seeds: embedded entities above the similarity threshold, best first.
        let entity_similarities = self.find_relevant_entities(query_embedding, graph)?;

        // Only the top 5 seeds are expanded, to bound the search.
        for (entity_id, similarity) in entity_similarities.into_iter().take(5) {
            if visited.contains(&entity_id) {
                continue;
            }

            let expanded_entities = self.expand_through_relationships(
                &entity_id,
                graph,
                self.config.max_expansion_depth,
                &mut visited,
            )?;

            for expanded_entity_id in expanded_entities {
                if let Some(entity) = graph.get_entity(&expanded_entity_id) {
                    // The seed keeps full similarity; entities reached via
                    // relationships take a flat 20% discount.
                    let expansion_penalty = if expanded_entity_id == entity_id {
                        1.0
                    } else {
                        0.8
                    };

                    results.push(SearchResult {
                        id: entity.id.to_string(),
                        content: format!("{} ({})", entity.name, entity.entity_type),
                        score: similarity * expansion_penalty * self.config.entity_weight,
                        result_type: ResultType::Entity,
                        entities: vec![entity.name.clone()],
                        source_chunks: entity
                            .mentions
                            .iter()
                            .map(|m| m.chunk_id.to_string())
                            .collect(),
                    });
                }
            }
        }

        Ok(results)
    }
891
    /// Map (query type, intent) to the (vector, graph, hierarchical)
    /// strategy weights used by `execute_adaptive_retrieval`.
    /// Each tuple sums to 1.0.
    fn calculate_strategy_weights(&self, analysis: &QueryAnalysis) -> (f32, f32, f32) {
        match (&analysis.query_type, &analysis.intent) {
            // Entity lookups: vectors plus local graph context.
            (QueryType::EntityFocused, _) => (0.5, 0.4, 0.1),
            // Relationship questions are answered mostly by the graph.
            (QueryType::Relationship, _) => (0.3, 0.6, 0.1),
            // Conceptual overviews favor hierarchical summaries.
            (QueryType::Conceptual, QueryIntent::Overview) => (0.2, 0.2, 0.6),
            (QueryType::Conceptual, _) => (0.4, 0.3, 0.3),
            (QueryType::Exploratory, QueryIntent::Overview) => (0.3, 0.2, 0.5),
            (QueryType::Exploratory, _) => (0.4, 0.4, 0.2),
            // Factual lookups rely mainly on direct vector similarity.
            (QueryType::Factual, _) => (0.6, 0.3, 0.1),
        }
    }
906
    /// Graph search anchored on entities explicitly named in the query:
    /// each is emitted with a high fixed score, then its direct neighbors
    /// are added, weighted by relationship confidence and how well the
    /// relationship type matches the query embedding.
    fn entity_centric_search(
        &mut self,
        query_embedding: &[f32],
        graph: &KnowledgeGraph,
        key_entities: &[String],
    ) -> Result<Vec<SearchResult>> {
        let mut results = Vec::new();
        let mut visited = HashSet::new();

        for entity_name in key_entities {
            // Resolve the name case-insensitively against the graph.
            if let Some(entity) = graph
                .entities()
                .find(|e| e.name.eq_ignore_ascii_case(entity_name))
            {
                // Direct query mention: high fixed confidence score.
                results.push(SearchResult {
                    id: entity.id.to_string(),
                    content: format!("{} ({})", entity.name, entity.entity_type),
                    score: 0.9, result_type: ResultType::Entity,
                    entities: vec![entity.name.clone()],
                    source_chunks: entity
                        .mentions
                        .iter()
                        .map(|m| m.chunk_id.to_string())
                        .collect(),
                });

                // One-hop neighbors, deduplicated across all key entities.
                // NOTE(review): the anchor entities themselves are never
                // added to `visited`, so an anchor can reappear as another
                // anchor's neighbor — confirm whether that is intended.
                let neighbors = graph.get_neighbors(&entity.id);
                for (neighbor, relationship) in neighbors {
                    if !visited.contains(&neighbor.id) {
                        visited.insert(neighbor.id.clone());

                        // Score the relationship type's relevance to the query.
                        let rel_embedding = self
                            .embedding_generator
                            .generate_embedding(&relationship.relation_type);
                        let rel_similarity =
                            VectorUtils::cosine_similarity(query_embedding, &rel_embedding);

                        results.push(SearchResult {
                            id: neighbor.id.to_string(),
                            content: format!("{} ({})", neighbor.name, neighbor.entity_type),
                            score: 0.7 * relationship.confidence * (1.0 + rel_similarity),
                            result_type: ResultType::Entity,
                            entities: vec![neighbor.name.clone()],
                            source_chunks: neighbor
                                .mentions
                                .iter()
                                .map(|m| m.chunk_id.to_string())
                                .collect(),
                        });
                    }
                }
            }
        }

        Ok(results)
    }
969
970 fn hierarchical_search(
972 &self,
973 query: &str,
974 document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
975 analysis: &QueryAnalysis,
976 ) -> Result<Vec<SearchResult>> {
977 let mut results = Vec::new();
978 let max_results_per_tree = match analysis.intent {
979 QueryIntent::Overview => 3,
980 QueryIntent::Detailed => 8,
981 _ => 5,
982 };
983
984 for (doc_id, tree) in document_trees.iter() {
985 let tree_summaries = tree.query(query, max_results_per_tree)?;
986
987 for (idx, summary) in tree_summaries.iter().enumerate() {
988 let level_bonus = match analysis.intent {
990 QueryIntent::Overview => 0.3,
991 QueryIntent::Detailed => 0.2,
992 _ => 0.0,
993 };
994
995 results.push(SearchResult {
996 id: format!("{}:summary:{}", doc_id, idx),
997 content: summary.summary.clone(),
998 score: summary.score + level_bonus,
999 result_type: ResultType::HierarchicalSummary,
1000 entities: Vec::new(),
1001 source_chunks: vec![doc_id.to_string()],
1002 });
1003 }
1004 }
1005
1006 Ok(results)
1007 }
1008
1009 fn advanced_graph_traversal(
1011 &self,
1012 query_embedding: &[f32],
1013 graph: &KnowledgeGraph,
1014 analysis: &QueryAnalysis,
1015 ) -> Result<Vec<SearchResult>> {
1016 let mut results = Vec::new();
1017
1018 if analysis.query_type == QueryType::Relationship && analysis.key_entities.len() >= 2 {
1019 results.extend(self.find_entity_paths(graph, &analysis.key_entities)?);
1021 }
1022
1023 if analysis.complexity_score > 0.8 {
1024 results.extend(self.community_based_search(query_embedding, graph)?);
1026 }
1027
1028 Ok(results)
1029 }
1030
    /// When the same content was surfaced by more than one strategy
    /// (distinct [`ResultType`]s), emit an extra `Hybrid` result whose
    /// score is the group average plus a per-extra-strategy bonus.
    fn cross_strategy_fusion(
        &self,
        all_results: &[SearchResult],
        _analysis: &QueryAnalysis,
    ) -> Result<Vec<SearchResult>> {
        let mut fusion_results = Vec::new();

        // Group results by a 50-char content prefix (cheap near-dup key).
        let mut content_groups: HashMap<String, Vec<&SearchResult>> = HashMap::new();

        for result in all_results {
            let content_key = Self::safe_truncate(&result.content, 50);

            content_groups.entry(content_key).or_default().push(result);
        }

        for (content_key, group) in content_groups {
            if group.len() > 1 {
                // Only fuse when distinct strategies agree, not when a
                // single strategy produced duplicates.
                let types: HashSet<_> = group.iter().map(|r| &r.result_type).collect();
                if types.len() > 1 {
                    let avg_score = group.iter().map(|r| r.score).sum::<f32>() / group.len() as f32;
                    // +0.2 per additional agreeing strategy; total capped at 1.0.
                    let boost = 0.2 * (types.len() - 1) as f32;

                    // Union of entities and source chunks across the group.
                    let all_entities: HashSet<_> =
                        group.iter().flat_map(|r| r.entities.iter()).collect();

                    let all_chunks: HashSet<_> =
                        group.iter().flat_map(|r| r.source_chunks.iter()).collect();

                    fusion_results.push(SearchResult {
                        id: format!(
                            "fusion_{}",
                            content_key.chars().take(10).collect::<String>()
                        ),
                        content: group[0].content.clone(),
                        score: (avg_score + boost).min(1.0),
                        result_type: ResultType::Hybrid,
                        entities: all_entities.into_iter().cloned().collect(),
                        source_chunks: all_chunks.into_iter().cloned().collect(),
                    });
                }
            }
        }

        Ok(fusion_results)
    }
1080
1081 fn adaptive_rank_and_deduplicate(
1083 &self,
1084 mut results: Vec<SearchResult>,
1085 analysis: &QueryAnalysis,
1086 ) -> Result<Vec<SearchResult>> {
1087 for result in &mut results {
1089 match analysis.query_type {
1090 QueryType::EntityFocused => {
1091 if result.result_type == ResultType::Entity {
1092 result.score *= 1.2;
1093 }
1094 }
1095 QueryType::Conceptual => {
1096 if result.result_type == ResultType::HierarchicalSummary {
1097 result.score *= 1.1;
1098 }
1099 }
1100 QueryType::Relationship => {
1101 if result.entities.len() > 1 {
1102 result.score *= 1.15;
1103 }
1104 }
1105 _ => {}
1106 }
1107
1108 for entity in &analysis.key_entities {
1110 if result
1111 .entities
1112 .iter()
1113 .any(|e| e.eq_ignore_ascii_case(entity))
1114 {
1115 result.score *= 1.1;
1116 }
1117 }
1118 }
1119
1120 results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
1122
1123 let mut deduplicated = Vec::new();
1125 let mut seen_content = HashSet::new();
1126 let mut type_counts: HashMap<ResultType, usize> = HashMap::new();
1127
1128 for result in results {
1129 let content_signature = self.create_content_signature(&result.content);
1130
1131 if !seen_content.contains(&content_signature) {
1132 let type_count = type_counts.get(&result.result_type).unwrap_or(&0);
1133
1134 let max_per_type = match result.result_type {
1136 ResultType::Entity => self.config.top_k / 3,
1137 ResultType::Chunk => self.config.top_k / 2,
1138 ResultType::HierarchicalSummary => self.config.top_k / 4,
1139 ResultType::Hybrid => self.config.top_k / 4,
1140 ResultType::GraphPath => self.config.top_k / 5,
1141 };
1142
1143 if *type_count < max_per_type {
1144 seen_content.insert(content_signature);
1145 *type_counts.entry(result.result_type.clone()).or_insert(0) += 1;
1146 deduplicated.push(result);
1147 }
1148 }
1149 }
1150
1151 Ok(deduplicated)
1152 }
1153
    /// Look for connections between the first two key entities: a direct
    /// edge, plus two-hop paths through any neighbor shared by both.
    fn find_entity_paths(
        &self,
        graph: &KnowledgeGraph,
        key_entities: &[String],
    ) -> Result<Vec<SearchResult>> {
        let mut results = Vec::new();

        if key_entities.len() < 2 {
            return Ok(results);
        }

        // Only the first two named entities are considered; extras ignored.
        if let (Some(source), Some(target)) = (
            graph
                .entities()
                .find(|e| e.name.eq_ignore_ascii_case(&key_entities[0])),
            graph
                .entities()
                .find(|e| e.name.eq_ignore_ascii_case(&key_entities[1])),
        ) {
            let path_description =
                format!("Connection between {} and {}", source.name, target.name);
            let neighbors_source = graph.get_neighbors(&source.id);
            let neighbors_target = graph.get_neighbors(&target.id);

            // Direct edge between source and target.
            if neighbors_source
                .iter()
                .any(|(neighbor, _)| neighbor.id == target.id)
            {
                results.push(SearchResult {
                    id: format!("path_{}_{}", source.id, target.id),
                    content: format!("Direct relationship: {path_description}"),
                    score: 0.8,
                    result_type: ResultType::GraphPath,
                    entities: vec![source.name.clone(), target.name.clone()],
                    source_chunks: Vec::new(),
                });
            }

            // Two-hop paths via a shared neighbor, scored by the product of
            // the two edge confidences.
            for (neighbor_s, rel_s) in &neighbors_source {
                for (neighbor_t, rel_t) in &neighbors_target {
                    if neighbor_s.id == neighbor_t.id {
                        results.push(SearchResult {
                            id: format!("path_{}_{}_{}", source.id, neighbor_s.id, target.id),
                            content: format!(
                                "Indirect relationship via {}: {} -> {} -> {}",
                                neighbor_s.name, source.name, neighbor_s.name, target.name
                            ),
                            score: 0.6 * rel_s.confidence * rel_t.confidence,
                            result_type: ResultType::GraphPath,
                            entities: vec![
                                source.name.clone(),
                                neighbor_s.name.clone(),
                                target.name.clone(),
                            ],
                            source_chunks: Vec::new(),
                        });
                    }
                }
            }
        }

        Ok(results)
    }
1221
1222 fn community_based_search(
1224 &self,
1225 query_embedding: &[f32],
1226 graph: &KnowledgeGraph,
1227 ) -> Result<Vec<SearchResult>> {
1228 let mut results = Vec::new();
1229 let mut entity_scores: HashMap<String, f32> = HashMap::new();
1230
1231 for entity in graph.entities() {
1233 let neighbors = graph.get_neighbors(&entity.id);
1234 let centrality_score = neighbors.len() as f32 * 0.1;
1235
1236 if let Some(embedding) = &entity.embedding {
1238 let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1239 entity_scores.insert(entity.id.to_string(), centrality_score + similarity);
1240 }
1241 }
1242
1243 let mut sorted_entities: Vec<_> = entity_scores.iter().collect();
1245 sorted_entities.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());
1246
1247 for (entity_id, score) in sorted_entities.iter().take(3) {
1248 if let Some(entity) = graph.entities().find(|e| e.id.to_string() == **entity_id) {
1249 let mut entity_context = String::new();
1251 for mention in entity.mentions.iter().take(2) {
1252 if let Some(chunk) = graph.chunks().find(|c| c.id == mention.chunk_id) {
1253 let chunk_excerpt = if chunk.content.len() > 200 {
1254 format!("{}...", &chunk.content[..200])
1255 } else {
1256 chunk.content.clone()
1257 };
1258 entity_context.push_str(&chunk_excerpt);
1259 entity_context.push(' ');
1260 }
1261 }
1262
1263 if entity_context.is_empty() {
1265 entity_context = format!(
1266 "{} is a {} character in the story.",
1267 entity.name, entity.entity_type
1268 );
1269 }
1270
1271 results.push(SearchResult {
1272 id: entity.id.to_string(),
1273 content: entity_context,
1274 score: **score,
1275 result_type: ResultType::Entity,
1276 entities: vec![entity.name.clone()],
1277 source_chunks: entity
1278 .mentions
1279 .iter()
1280 .map(|m| m.chunk_id.to_string())
1281 .collect(),
1282 });
1283 }
1284 }
1285
1286 Ok(results)
1287 }
1288
1289 fn has_abstract_concepts(&self, words: &[&str]) -> bool {
1291 const ABSTRACT_INDICATORS: &[&str] = &[
1292 "concept",
1293 "idea",
1294 "theory",
1295 "principle",
1296 "philosophy",
1297 "meaning",
1298 "understanding",
1299 "knowledge",
1300 "wisdom",
1301 "truth",
1302 "beauty",
1303 "justice",
1304 ];
1305 words
1306 .iter()
1307 .any(|&word| ABSTRACT_INDICATORS.contains(&word))
1308 }
1309
1310 fn has_question_words(&self, words: &[&str]) -> bool {
1312 const QUESTION_WORDS: &[&str] = &[
1313 "what", "how", "why", "when", "where", "who", "which", "explain", "describe",
1314 ];
1315 words.iter().any(|&word| QUESTION_WORDS.contains(&word))
1316 }
1317
1318 fn create_content_signature(&self, content: &str) -> String {
1320 let prefix = Self::safe_truncate(content, 50);
1322 format!(
1323 "{}_{}",
1324 prefix
1325 .chars()
1326 .filter(|c| c.is_alphanumeric())
1327 .collect::<String>(),
1328 content.len()
1329 )
1330 }
1331
    /// Placeholder for graph-traversal-based retrieval.
    ///
    /// Currently a stub: both arguments are ignored and an empty result set is
    /// returned, so callers can unconditionally merge its output with the
    /// results of other search strategies.
    fn graph_traversal_search(
        &self,
        _query_embedding: &[f32],
        _graph: &KnowledgeGraph,
    ) -> Result<Vec<SearchResult>> {
        Ok(Vec::new())
    }
1347
1348 fn find_relevant_entities(
1350 &self,
1351 query_embedding: &[f32],
1352 graph: &KnowledgeGraph,
1353 ) -> Result<Vec<(EntityId, f32)>> {
1354 let mut similarities = Vec::new();
1355
1356 for entity in graph.entities() {
1357 if let Some(embedding) = &entity.embedding {
1358 let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1359 if similarity >= self.config.similarity_threshold {
1360 similarities.push((entity.id.clone(), similarity));
1361 }
1362 }
1363 }
1364
1365 similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
1367
1368 Ok(similarities)
1369 }
1370
1371 fn expand_through_relationships(
1373 &self,
1374 start_entity: &EntityId,
1375 graph: &KnowledgeGraph,
1376 max_depth: usize,
1377 visited: &mut HashSet<EntityId>,
1378 ) -> Result<Vec<EntityId>> {
1379 let mut results = Vec::new();
1380 let mut current_level = vec![start_entity.clone()];
1381 visited.insert(start_entity.clone());
1382
1383 for _depth in 0..max_depth {
1384 let mut next_level = Vec::new();
1385
1386 for entity_id in ¤t_level {
1387 results.push(entity_id.clone());
1388
1389 let neighbors = graph.get_neighbors(entity_id);
1391 for (neighbor_entity, _relationship) in neighbors {
1392 if !visited.contains(&neighbor_entity.id) {
1393 visited.insert(neighbor_entity.id.clone());
1394 next_level.push(neighbor_entity.id.clone());
1395 }
1396 }
1397 }
1398
1399 if next_level.is_empty() {
1400 break;
1401 }
1402
1403 current_level = next_level;
1404 }
1405
1406 Ok(results)
1407 }
1408
1409 fn is_stop_word(&self, word: &str) -> bool {
1411 const STOP_WORDS: &[&str] = &[
1412 "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
1413 "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
1414 "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
1415 "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
1416 "go", "me",
1417 ];
1418 STOP_WORDS.contains(&word)
1419 }
1420
1421 fn rank_and_deduplicate(&self, mut results: Vec<SearchResult>) -> Result<Vec<SearchResult>> {
1423 results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
1425
1426 let mut seen_ids = HashSet::new();
1428 let mut deduplicated = Vec::new();
1429
1430 for result in results {
1431 if !seen_ids.contains(&result.id) {
1432 seen_ids.insert(result.id.clone());
1433 deduplicated.push(result);
1434 }
1435 }
1436
1437 Ok(deduplicated)
1438 }
1439
1440 pub fn vector_search(&mut self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1442 let query_embedding = self.embedding_generator.generate_embedding(query);
1443 let similar_vectors = self.vector_index.search(&query_embedding, max_results)?;
1444
1445 let mut results = Vec::new();
1446 for (id, score) in similar_vectors {
1447 results.push(SearchResult {
1448 id: id.clone(),
1449 content: format!("Vector result for: {id}"),
1450 score,
1451 result_type: ResultType::Chunk,
1452 entities: Vec::new(),
1453 source_chunks: vec![id],
1454 });
1455 }
1456
1457 Ok(results)
1458 }
1459
1460 pub fn graph_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1462 let mut results = Vec::new();
1464 results.push(SearchResult {
1465 id: format!("graph_result_{}", query.len()),
1466 content: format!("Graph-based result for: {query}"),
1467 score: 0.7,
1468 result_type: ResultType::GraphPath,
1469 entities: Vec::new(),
1470 source_chunks: Vec::new(),
1471 });
1472
1473 Ok(results.into_iter().take(max_results).collect())
1474 }
1475
1476 pub fn public_hierarchical_search(
1478 &self,
1479 query: &str,
1480 max_results: usize,
1481 ) -> Result<Vec<SearchResult>> {
1482 let mut results = Vec::new();
1484 results.push(SearchResult {
1485 id: format!("hierarchical_result_{}", query.len()),
1486 content: format!("Hierarchical result for: {query}"),
1487 score: 0.8,
1488 result_type: ResultType::HierarchicalSummary,
1489 entities: Vec::new(),
1490 source_chunks: Vec::new(),
1491 });
1492
1493 Ok(results.into_iter().take(max_results).collect())
1494 }
1495
1496 pub fn bm25_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1498 let mut results = Vec::new();
1500 results.push(SearchResult {
1501 id: format!("bm25_result_{}", query.len()),
1502 content: format!("BM25 result for: {query}"),
1503 score: 0.75,
1504 result_type: ResultType::Chunk,
1505 entities: Vec::new(),
1506 source_chunks: Vec::new(),
1507 });
1508
1509 Ok(results.into_iter().take(max_results).collect())
1510 }
1511
1512 pub fn get_statistics(&self) -> RetrievalStatistics {
1514 let vector_stats = self.vector_index.statistics();
1515
1516 RetrievalStatistics {
1517 indexed_vectors: vector_stats.vector_count,
1518 vector_dimension: vector_stats.dimension,
1519 index_built: vector_stats.index_built,
1520 config: self.config.clone(),
1521 }
1522 }
1523
1524 fn safe_truncate(s: &str, max_bytes: usize) -> String {
1526 if s.len() <= max_bytes {
1527 return s.to_string();
1528 }
1529
1530 let mut end_idx = max_bytes;
1532 while end_idx > 0 && !s.is_char_boundary(end_idx) {
1533 end_idx -= 1;
1534 }
1535
1536 s[..end_idx].to_string()
1537 }
1538
    /// Serializes a summary of the retrieval system's state to a JSON file.
    ///
    /// Persists metadata (format version, creation timestamp, retrieval config),
    /// vector index statistics, embedding generator info, and whether a parallel
    /// processor is configured. Only statistics are written — not the raw
    /// vectors or the index itself.
    ///
    /// # Errors
    /// Returns an error if the file at `file_path` cannot be written.
    pub fn save_state_to_json(&self, file_path: &str) -> Result<()> {
        use std::fs;

        let mut json_data = json::JsonValue::new_object();

        // Header block: format version plus the full retrieval configuration.
        json_data["metadata"] = json::object! {
            "format_version" => "1.0",
            "created_at" => chrono::Utc::now().to_rfc3339(),
            "config" => json::object! {
                "top_k" => self.config.top_k,
                "similarity_threshold" => self.config.similarity_threshold,
                "max_expansion_depth" => self.config.max_expansion_depth,
                "entity_weight" => self.config.entity_weight,
                "chunk_weight" => self.config.chunk_weight,
                "graph_weight" => self.config.graph_weight
            }
        };

        // Vector index summary: counts and norm statistics only.
        let vector_stats = self.vector_index.statistics();
        json_data["vector_index"] = json::object! {
            "vector_count" => vector_stats.vector_count,
            "dimension" => vector_stats.dimension,
            "index_built" => vector_stats.index_built,
            "min_norm" => vector_stats.min_norm,
            "max_norm" => vector_stats.max_norm,
            "avg_norm" => vector_stats.avg_norm
        };

        json_data["embedding_generator"] = json::object! {
            "dimension" => self.embedding_generator.dimension(),
            "cached_words" => self.embedding_generator.cached_words()
        };

        // "parallel_enabled" is true only when the feature is compiled in AND
        // a processor was actually configured on this instance.
        #[cfg(feature = "parallel-processing")]
        {
            json_data["parallel_enabled"] = self.parallel_processor.is_some().into();
        }
        #[cfg(not(feature = "parallel-processing"))]
        {
            json_data["parallel_enabled"] = false.into();
        }

        fs::write(file_path, json_data.dump())?;
        tracing::info!("Retrieval system state saved to {file_path}");

        Ok(())
    }
1592}
1593
/// Snapshot of the retrieval system's index state and active configuration,
/// as produced by `RetrievalSystem::get_statistics`.
#[derive(Debug)]
pub struct RetrievalStatistics {
    /// Number of vectors currently stored in the vector index.
    pub indexed_vectors: usize,
    /// Dimensionality of the indexed embedding vectors.
    pub vector_dimension: usize,
    /// Whether the vector index has been built and is ready to serve searches.
    pub index_built: bool,
    /// Copy of the retrieval configuration in effect when the snapshot was taken.
    pub config: RetrievalConfig,
}
1606
impl RetrievalStatistics {
    /// Logs a human-readable, multi-line summary of the statistics at INFO
    /// level via `tracing`.
    #[allow(dead_code)]
    pub fn print(&self) {
        tracing::info!("Retrieval System Statistics:");
        tracing::info!("  Indexed vectors: {}", self.indexed_vectors);
        tracing::info!("  Vector dimension: {}", self.vector_dimension);
        tracing::info!("  Index built: {}", self.index_built);
        tracing::info!("  Configuration:");
        tracing::info!("    Top K: {}", self.config.top_k);
        tracing::info!(
            "    Similarity threshold: {:.2}",
            self.config.similarity_threshold
        );
        tracing::info!(
            "    Max expansion depth: {}",
            self.config.max_expansion_depth
        );
        tracing::info!("    Entity weight: {:.2}", self.config.entity_weight);
        tracing::info!("    Chunk weight: {:.2}", self.config.chunk_weight);
        tracing::info!("    Graph weight: {:.2}", self.config.graph_weight);
    }
}
1630
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{config::Config, core::KnowledgeGraph};

    /// A default configuration should always yield a working retrieval system.
    #[test]
    fn test_retrieval_system_creation() {
        assert!(RetrievalSystem::new(&Config::default()).is_ok());
    }

    /// The placeholder query path should succeed and echo the query text back.
    #[test]
    fn test_query_placeholder() {
        let retrieval = RetrievalSystem::new(&Config::default()).unwrap();

        let results = retrieval.query("test query");
        assert!(results.is_ok());

        let results = results.unwrap();
        assert!(!results.is_empty());
        assert!(results[0].contains("test query"));
    }

    /// Indexing an empty knowledge graph must not fail.
    #[test]
    fn test_graph_indexing() {
        let mut retrieval = RetrievalSystem::new(&Config::default()).unwrap();
        assert!(retrieval.index_graph(&KnowledgeGraph::new()).is_ok());
    }
}