graphrag_core/retrieval/
mod.rs

1//! Retrieval system: finds the chunks and entities relevant to a query.
2//!
3//! Combines keyword search (BM25), vector similarity, and PageRank-weighted graph
4//! traversal, and can return `explained` answers with a source/reasoning trace.
5
6pub mod adaptive;
7/// BM25 text retrieval implementation for keyword-based search
8pub mod bm25;
9/// Structured answer types with reasoning trace (`ExplainedAnswer`, `SourceReference`, `ReasoningStep`)
10pub mod explained;
11/// Core retrieval data types (`SearchResult`, `RetrievalConfig`, query analysis enums, statistics).
12mod types;
13pub use types::*;
14/// Causal chain analysis for discovering cause-effect paths (Phase 2.3)
15pub mod causal_analysis;
16/// Enriched metadata-aware retrieval
17pub mod enriched;
18/// HippoRAG Personalized PageRank retrieval
19#[cfg(feature = "pagerank")]
20pub mod hipporag_ppr;
21/// Hybrid retrieval combining multiple search strategies
22pub mod hybrid;
23pub mod pagerank_retrieval;
24/// Symbolic anchoring for conceptual queries (Phase 2.1 - CatRAG)
25pub mod symbolic_anchoring;
26
27#[cfg(feature = "parallel-processing")]
28use crate::parallel::ParallelProcessor;
29use crate::{
30    config::Config,
31    core::{ChunkId, EntityId, KnowledgeGraph},
32    summarization::DocumentTree,
33    vector::{EmbeddingGenerator, VectorUtils},
34    Result,
35};
36use std::collections::{HashMap, HashSet};
37
38pub use bm25::{BM25Result, BM25Retriever, Document as BM25Document};
39pub use enriched::{EnrichedRetrievalConfig, EnrichedRetriever};
40pub use explained::{ExplainedAnswer, ReasoningStep, SourceReference, SourceType};
41pub use hybrid::{FusionMethod, HybridConfig, HybridRetriever, HybridSearchResult};
42
43#[cfg(feature = "pagerank")]
44pub use pagerank_retrieval::{PageRankRetrievalSystem, ScoredResult};
45
46#[cfg(feature = "pagerank")]
47pub use hipporag_ppr::{Fact, HippoRAGConfig, HippoRAGRetriever};
48
49use crate::vector::store::VectorStore;
50
/// Retrieval system for querying the knowledge graph
///
/// Bundles the vector store, the embedding generator and the retrieval
/// configuration, plus optional helpers that are attached after construction.
pub struct RetrievalSystem {
    /// Shared vector store that receives the entity and chunk embeddings
    /// indexed by `index_graph` and is searched during vector retrieval.
    vector_store: std::sync::Arc<dyn VectorStore>,
    /// Generator used to embed query text and graph items that lack an embedding.
    embedding_generator: EmbeddingGenerator,
    /// Retrieval tuning parameters (result limit, similarity threshold, weights).
    config: RetrievalConfig,
    /// Optional parallel processor; `None` unless supplied via
    /// `with_parallel_processing`.
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    /// PageRank retriever; `None` until `initialize_pagerank` has been called.
    #[cfg(feature = "pagerank")]
    pagerank_retriever: Option<PageRankRetrievalSystem>,
    /// Metadata-aware retriever; `None` until `initialize_enriched` has been called.
    enriched_retriever: Option<EnrichedRetriever>,
    /// Concept-filtering flag; both constructors in this file initialise it to `false`.
    #[cfg(feature = "lazygraphrag")]
    concept_filtering_enabled: bool,
}
64
65impl RetrievalSystem {
66    /// Create a new retrieval system
67    pub fn new(config: &Config) -> Result<Self> {
68        let retrieval_config = RetrievalConfig {
69            top_k: config.retrieval.top_k,
70            similarity_threshold: 0.35,
71            max_expansion_depth: 2,
72            entity_weight: 0.4,
73            chunk_weight: 0.4,
74            graph_weight: 0.2,
75            #[cfg(feature = "lazygraphrag")]
76            use_concept_filtering: false,
77            #[cfg(feature = "lazygraphrag")]
78            concept_top_k: 20,
79        };
80
81        // Default to MemoryVectorStore for now (mimics old behavior)
82        // In the future, this will select based on Config (LanceDB, Qdrant, etc.)
83        let vector_store =
84            std::sync::Arc::new(crate::vector::memory_store::MemoryVectorStore::new());
85
86        Ok(Self {
87            vector_store,
88            embedding_generator: EmbeddingGenerator::new(128), // 128-dimensional embeddings
89            config: retrieval_config,
90            #[cfg(feature = "parallel-processing")]
91            parallel_processor: None,
92            #[cfg(feature = "pagerank")]
93            pagerank_retriever: None,
94            enriched_retriever: None,
95            #[cfg(feature = "lazygraphrag")]
96            concept_filtering_enabled: false,
97        })
98    }
99}
100
101impl RetrievalSystem {
102    /// Create a new retrieval system with parallel processing support
103    #[cfg(feature = "parallel-processing")]
104    pub fn with_parallel_processing(
105        vector_store: std::sync::Arc<dyn VectorStore>,
106        embedding_generator: EmbeddingGenerator,
107        parallel_processor: ParallelProcessor,
108    ) -> Result<Self> {
109        // VectorStore trait is already Send + Sync and wrapped in Arc
110        // Can be safely used across threads for parallel operations
111        // EmbeddingGenerator operations can be parallelized with rayon
112
113        let retrieval_config = RetrievalConfig::default();
114
115        Ok(Self {
116            vector_store,
117            embedding_generator,
118            config: retrieval_config,
119            parallel_processor: Some(parallel_processor),
120            #[cfg(feature = "pagerank")]
121            pagerank_retriever: None,
122            enriched_retriever: None,
123            #[cfg(feature = "lazygraphrag")]
124            concept_filtering_enabled: false,
125        })
126    }
127
128    /// Index a knowledge graph for retrieval
129    pub async fn index_graph(&self, graph: &KnowledgeGraph) -> Result<()> {
130        // Index entity embeddings
131        for entity in graph.entities() {
132            if let Some(embedding) = &entity.embedding {
133                let id = format!("entity:{}", entity.id);
134                // Simple empty metadata for now, could add name/type
135                self.vector_store
136                    .add_vector(&id, embedding.clone(), HashMap::new())
137                    .await?;
138            }
139        }
140
141        // Index chunk embeddings
142        for chunk in graph.chunks() {
143            if let Some(embedding) = &chunk.embedding {
144                let id = format!("chunk:{}", chunk.id);
145                self.vector_store
146                    .add_vector(&id, embedding.clone(), HashMap::new())
147                    .await?;
148            }
149        }
150
151        // Initialize/Build if needed (some stores might need explicit commit)
152        self.vector_store.initialize().await?;
153
154        Ok(())
155    }
156
157    /// Initialize PageRank retrieval system (feature-gated)
158    #[cfg(feature = "pagerank")]
159    pub fn initialize_pagerank(&mut self, graph: &KnowledgeGraph) -> Result<()> {
160        use crate::graph::pagerank::{PageRankConfig, ScoreWeights};
161
162        #[cfg(feature = "tracing")]
163        tracing::debug!("Initializing high-performance PageRank retrieval system...");
164
165        let pagerank_config = PageRankConfig {
166            damping_factor: 0.85,
167            max_iterations: 50, // Reduced for faster convergence
168            tolerance: 1e-5,    // Slightly relaxed for speed
169            personalized: true,
170            #[cfg(feature = "parallel-processing")]
171            parallel_enabled: self.parallel_processor.is_some(),
172            #[cfg(not(feature = "parallel-processing"))]
173            parallel_enabled: false,
174            cache_size: 2000, // Large cache for better performance
175            sparse_threshold: 500,
176            incremental_updates: true,
177            simd_block_size: 64, // Optimized for modern CPUs
178        };
179
180        let score_weights = ScoreWeights {
181            vector_weight: 0.3,
182            pagerank_weight: 0.5, // Higher weight for PageRank like fast-GraphRAG
183            chunk_weight: 0.15,
184            relationship_weight: 0.05,
185        };
186
187        let mut pagerank_retriever = PageRankRetrievalSystem::new(self.config.top_k)
188            .with_pagerank_config(pagerank_config)
189            .with_score_weights(score_weights)
190            .with_incremental_mode(true)
191            .with_min_threshold(0.05);
192
193        // Initialize vector index
194        // pagerank_retriever.initialize_vector_index(graph)?;
195
196        // Pre-compute global PageRank scores for faster queries
197        pagerank_retriever.precompute_global_pagerank(graph)?;
198
199        self.pagerank_retriever = Some(pagerank_retriever);
200
201        #[cfg(feature = "tracing")]
202        tracing::debug!("PageRank retrieval system initialized with 27x performance optimizations");
203        Ok(())
204    }
205
206    /// Initialize enriched metadata-aware retrieval system
207    pub fn initialize_enriched(&mut self, config: Option<EnrichedRetrievalConfig>) -> Result<()> {
208        #[cfg(feature = "tracing")]
209        tracing::debug!("Initializing enriched metadata-aware retrieval system...");
210
211        let enriched_config = config.unwrap_or_default();
212        let enriched_retriever = EnrichedRetriever::with_config(enriched_config);
213
214        self.enriched_retriever = Some(enriched_retriever);
215
216        #[cfg(feature = "tracing")]
217        tracing::debug!("Enriched retrieval system initialized with metadata boosting");
218        Ok(())
219    }
220
221    /// Query using PageRank-enhanced retrieval (feature-gated)
222    #[cfg(feature = "pagerank")]
223    pub fn pagerank_query(
224        &self,
225        query: &str,
226        graph: &KnowledgeGraph,
227        max_results: Option<usize>,
228    ) -> Result<Vec<ScoredResult>> {
229        if let Some(pagerank_retriever) = &self.pagerank_retriever {
230            pagerank_retriever.search_with_pagerank(query, graph, max_results)
231        } else {
232            Err(crate::core::GraphRAGError::Retrieval {
233                message: "PageRank retriever not initialized. Call initialize_pagerank() first."
234                    .to_string(),
235            })
236        }
237    }
238
239    /// Batch PageRank queries for high throughput (feature-gated)
240    #[cfg(feature = "pagerank")]
241    pub fn pagerank_batch_query(
242        &self,
243        queries: &[&str],
244        graph: &KnowledgeGraph,
245        max_results_per_query: Option<usize>,
246    ) -> Result<Vec<Vec<ScoredResult>>> {
247        if let Some(pagerank_retriever) = &self.pagerank_retriever {
248            pagerank_retriever.batch_search(queries, graph, max_results_per_query)
249        } else {
250            Err(crate::core::GraphRAGError::Retrieval {
251                message: "PageRank retriever not initialized. Call initialize_pagerank() first."
252                    .to_string(),
253            })
254        }
255    }
256
257    /// Query the system for relevant information
258    pub fn query(&self, query: &str) -> Result<Vec<String>> {
259        // For now, return a placeholder implementation
260        // In a real system, this would:
261        // 1. Convert query to embedding
262        // 2. Search vector index
263        // 3. Expand through graph relationships
264        // 4. Rank and return results
265
266        Ok(vec![format!("Results for query: {}", query)])
267    }
268
269    /// Advanced hybrid query with strategy selection and hierarchical integration
270    pub async fn hybrid_query(
271        &mut self,
272        query: &str,
273        graph: &KnowledgeGraph,
274    ) -> Result<Vec<SearchResult>> {
275        self.hybrid_query_with_trees(query, graph, &HashMap::new())
276            .await
277    }
278
279    /// Hybrid query with access to document trees for hierarchical retrieval
280    pub async fn hybrid_query_with_trees(
281        &mut self,
282        query: &str,
283        graph: &KnowledgeGraph,
284        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
285    ) -> Result<Vec<SearchResult>> {
286        // 1. Analyze query to determine optimal strategy
287        let analysis = self.analyze_query(query, graph)?;
288
289        // 2. Generate query embedding
290        let query_embedding = self.embedding_generator.generate_embedding(query);
291
292        // 3. Execute multi-strategy retrieval based on analysis
293        let mut results = self
294            .execute_adaptive_retrieval(query, &query_embedding, graph, document_trees, &analysis)
295            .await?;
296
297        // 4. Apply enriched metadata-aware boosting and filtering if enabled
298        if let Some(enriched_retriever) = &self.enriched_retriever {
299            // First apply metadata boosting to enhance relevance
300            results = enriched_retriever.boost_with_metadata(results, query, graph)?;
301
302            // Then apply structure filtering if query mentions chapters/sections
303            results = enriched_retriever.filter_by_structure(query, results, graph)?;
304        }
305
306        Ok(results)
307    }
308
309    /// Query the system using hybrid retrieval (vector + graph) - legacy method
310    pub async fn legacy_hybrid_query(
311        &mut self,
312        query: &str,
313        graph: &KnowledgeGraph,
314    ) -> Result<Vec<SearchResult>> {
315        // 1. Generate query embedding
316        let query_embedding = self.embedding_generator.generate_embedding(query);
317
318        // 2. Perform comprehensive search
319        let results = self.comprehensive_search(&query_embedding, graph).await?;
320
321        Ok(results)
322    }
323
324    /// Add embeddings to chunks and entities in the graph with parallel processing
325    pub async fn add_embeddings_to_graph(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
326        #[cfg(feature = "parallel-processing")]
327        if let Some(processor) = self.parallel_processor.clone() {
328            return self.add_embeddings_parallel(graph, &processor).await;
329        }
330
331        self.add_embeddings_sequential(graph).await
332    }
333
334    /// Parallel embedding generation with proper error handling and work-stealing
335    #[cfg(feature = "parallel-processing")]
336    async fn add_embeddings_parallel(
337        &mut self,
338        graph: &mut KnowledgeGraph,
339        processor: &ParallelProcessor,
340    ) -> Result<()> {
341        // Extract texts for embedding generation
342        let mut chunk_texts = Vec::new();
343        let mut entity_texts = Vec::new();
344
345        // Collect chunk texts that need embeddings
346        for chunk in graph.chunks() {
347            if chunk.embedding.is_none() {
348                chunk_texts.push((chunk.id.clone(), chunk.content.clone()));
349            }
350        }
351
352        // Collect entity texts that need embeddings
353        for entity in graph.entities() {
354            if entity.embedding.is_none() {
355                let entity_text = format!("{} {}", entity.name, entity.entity_type);
356                entity_texts.push((entity.id.clone(), entity_text));
357            }
358        }
359
360        // For parallel processing, we need to use a different approach since
361        // generate_embedding requires &mut self. We'll fall back to enhanced sequential
362        // processing with better chunking and monitoring for now.
363
364        let total_items = chunk_texts.len() + entity_texts.len();
365        if processor.should_use_parallel(total_items) {
366            #[cfg(feature = "tracing")]
367            tracing::debug!(
368                "Processing {total_items} embeddings with enhanced sequential approach"
369            );
370        }
371
372        // Process chunks
373        for (chunk_id, text) in chunk_texts {
374            let embedding = self.embedding_generator.generate_embedding(&text);
375            if let Some(chunk) = graph.get_chunk_mut(&chunk_id) {
376                chunk.embedding = Some(embedding);
377            }
378        }
379
380        // Process entities
381        for (entity_id, text) in entity_texts {
382            let embedding = self.embedding_generator.generate_embedding(&text);
383            if let Some(entity) = graph.get_entity_mut(&entity_id) {
384                entity.embedding = Some(embedding);
385            }
386        }
387
388        // Re-index the graph with new embeddings
389        self.index_graph(graph).await?;
390
391        Ok(())
392    }
393
394    /// Sequential embedding generation (fallback)
395    #[cfg_attr(not(feature = "tracing"), allow(unused_assignments, unused_variables))]
396    async fn add_embeddings_sequential(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
397        // Debug: Check total counts first (uncomment for debugging)
398        let _total_chunks = graph.chunks().count();
399        let _total_entities = graph.entities().count();
400        // println!("DEBUG: Found {} total chunks and {} total entities in graph", _total_chunks, _total_entities);
401
402        // Generate embeddings for all chunks
403        let mut chunk_count = 0;
404        for chunk in graph.chunks_mut() {
405            if chunk.embedding.is_none() {
406                let embedding = self.embedding_generator.generate_embedding(&chunk.content);
407                chunk.embedding = Some(embedding);
408                chunk_count += 1;
409            }
410        }
411
412        // Generate embeddings for all entities (using their name and context)
413        let mut entity_count = 0;
414        for entity in graph.entities_mut() {
415            if entity.embedding.is_none() {
416                // Create entity text from name and entity type
417                let entity_text = format!("{} {}", entity.name, entity.entity_type);
418                let embedding = self.embedding_generator.generate_embedding(&entity_text);
419                entity.embedding = Some(embedding);
420                entity_count += 1;
421            }
422        }
423
424        #[cfg(feature = "tracing")]
425        tracing::debug!(
426            "Generated embeddings for {chunk_count} chunks and {entity_count} entities"
427        );
428
429        // Re-index the graph with new embeddings
430        // Re-index the graph with new embeddings
431        self.index_graph(graph).await?;
432
433        Ok(())
434    }
435
436    /// Parallel batch query processing with optimized workload distribution
437    /// Batch process multiple queries efficiently
438    #[cfg(feature = "parallel-processing")]
439    pub async fn batch_query(
440        &mut self,
441        queries: &[&str],
442        graph: &KnowledgeGraph,
443    ) -> Result<Vec<Vec<SearchResult>>> {
444        let processor =
445            self.parallel_processor
446                .as_ref()
447                .ok_or_else(|| crate::core::GraphRAGError::Config {
448                    message: "Parallel processor not initialized".to_string(),
449                })?;
450
451        if !processor.should_use_parallel(queries.len()) {
452            let mut results = Vec::new();
453            for &query in queries {
454                results.push(self.hybrid_query(query, graph).await?);
455            }
456            return Ok(results);
457        }
458
459        let chunk_size = processor.config().chunk_batch_size.min(queries.len());
460        #[cfg(feature = "tracing")]
461        tracing::debug!(
462            "Processing {} queries with enhanced sequential approach (chunk size: {})",
463            queries.len(),
464            chunk_size
465        );
466
467        let mut all_results = Vec::new();
468        for &query in queries {
469            match self.hybrid_query(query, graph).await {
470                Ok(results) => all_results.push(results),
471                Err(e) => {
472                    #[cfg(feature = "tracing")]
473                    tracing::warn!("Error processing query '{query}': {e}");
474                    all_results.push(Vec::new());
475                },
476            }
477        }
478
479        Ok(all_results)
480    }
481
482    /// Sequential batch query (fallback when parallel-processing is disabled)
483    #[cfg(not(feature = "parallel-processing"))]
484    pub async fn batch_query(
485        &mut self,
486        queries: &[&str],
487        graph: &KnowledgeGraph,
488    ) -> Result<Vec<Vec<SearchResult>>> {
489        let mut results = Vec::new();
490        for &query in queries {
491            results.push(self.hybrid_query(query, graph).await?);
492        }
493        Ok(results)
494    }
495
496    /// Analyze query to determine optimal retrieval strategy
497    pub fn analyze_query(&self, query: &str, graph: &KnowledgeGraph) -> Result<QueryAnalysis> {
498        let query_lower = query.to_lowercase();
499        let words: Vec<&str> = query_lower.split_whitespace().collect();
500
501        // Detect key entities mentioned in the query
502        let mut key_entities = Vec::new();
503        for entity in graph.entities() {
504            let entity_name_lower = entity.name.to_lowercase();
505            if words
506                .iter()
507                .any(|&word| entity_name_lower.contains(word) || word.contains(&entity_name_lower))
508            {
509                key_entities.push(entity.name.clone());
510            }
511        }
512
513        // Extract concepts (non-entity meaningful words)
514        let concepts: Vec<String> = words
515            .iter()
516            .filter(|&&word| word.len() > 3 && !self.is_stop_word(word))
517            .filter(|&&word| {
518                !key_entities.iter().any(|entity| {
519                    entity.to_lowercase().contains(word) || word.contains(&entity.to_lowercase())
520                })
521            })
522            .map(|&word| word.to_string())
523            .collect();
524
525        // Determine query type
526        let query_type = if !key_entities.is_empty() && key_entities.len() > 1 {
527            QueryType::Relationship
528        } else if !key_entities.is_empty() {
529            QueryType::EntityFocused
530        } else if self.has_abstract_concepts(&words) {
531            QueryType::Conceptual
532        } else if self.has_question_words(&words) {
533            QueryType::Exploratory
534        } else {
535            QueryType::Factual
536        };
537
538        // Determine intent
539        let intent = if words
540            .iter()
541            .any(|&w| ["overview", "summary", "general", "about"].contains(&w))
542        {
543            QueryIntent::Overview
544        } else if words
545            .iter()
546            .any(|&w| ["detailed", "specific", "exactly", "precise"].contains(&w))
547        {
548            QueryIntent::Detailed
549        } else if words
550            .iter()
551            .any(|&w| ["compare", "vs", "versus", "between", "difference"].contains(&w))
552        {
553            QueryIntent::Comparative
554        } else if words
555            .iter()
556            .any(|&w| ["cause", "why", "because", "lead", "result"].contains(&w))
557        {
558            QueryIntent::Causal
559        } else if words
560            .iter()
561            .any(|&w| ["when", "time", "before", "after", "during"].contains(&w))
562        {
563            QueryIntent::Temporal
564        } else {
565            QueryIntent::Detailed
566        };
567
568        // Calculate complexity score
569        let complexity_score = (words.len() as f32 * 0.1
570            + key_entities.len() as f32 * 0.3
571            + concepts.len() as f32 * 0.2)
572            .min(1.0);
573
574        Ok(QueryAnalysis {
575            query_type,
576            key_entities,
577            concepts,
578            intent,
579            complexity_score,
580        })
581    }
582
583    /// Execute adaptive retrieval based on query analysis
584    pub async fn execute_adaptive_retrieval(
585        &mut self,
586        query: &str,
587        query_embedding: &[f32],
588        graph: &KnowledgeGraph,
589        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
590        analysis: &QueryAnalysis,
591    ) -> Result<Vec<SearchResult>> {
592        let mut all_results = Vec::new();
593
594        // Strategy weights based on query analysis
595        let (vector_weight, graph_weight, hierarchical_weight) =
596            self.calculate_strategy_weights(analysis);
597
598        // 1. Vector similarity search (always included)
599        if vector_weight > 0.0 {
600            let mut vector_results = self
601                .vector_similarity_search(query_embedding, graph)
602                .await?;
603            for result in &mut vector_results {
604                result.score *= vector_weight;
605            }
606            all_results.extend(vector_results);
607        }
608
609        // 2. Graph-based search (emphasized for entity and relationship queries)
610        if graph_weight > 0.0 {
611            let mut graph_results = match analysis.query_type {
612                QueryType::EntityFocused | QueryType::Relationship => {
613                    self.entity_centric_search(query_embedding, graph, &analysis.key_entities)?
614                },
615                _ => self.entity_based_search(query_embedding, graph)?,
616            };
617            for result in &mut graph_results {
618                result.score *= graph_weight;
619            }
620            all_results.extend(graph_results);
621        }
622
623        // 3. Hierarchical search (emphasized for overview and conceptual queries)
624        if hierarchical_weight > 0.0 && !document_trees.is_empty() {
625            let mut hierarchical_results =
626                self.hierarchical_search(query, document_trees, analysis)?;
627            for result in &mut hierarchical_results {
628                result.score *= hierarchical_weight;
629            }
630            all_results.extend(hierarchical_results);
631        }
632
633        // 4. Advanced graph traversal for complex queries
634        if analysis.complexity_score > 0.7 {
635            let traversal_results =
636                self.advanced_graph_traversal(query_embedding, graph, analysis)?;
637            all_results.extend(traversal_results);
638        }
639
640        // 5. Cross-strategy fusion for hybrid results
641        let fusion_results = self.cross_strategy_fusion(&all_results, analysis)?;
642        all_results.extend(fusion_results);
643
644        // Final ranking and deduplication
645        let final_results = self.adaptive_rank_and_deduplicate(all_results, analysis)?;
646
647        Ok(final_results.into_iter().take(self.config.top_k).collect())
648    }
649
650    /// Comprehensive search that combines multiple retrieval strategies (legacy)
651    pub async fn comprehensive_search(
652        &self,
653        query_embedding: &[f32],
654        graph: &KnowledgeGraph,
655    ) -> Result<Vec<SearchResult>> {
656        let mut all_results = Vec::new();
657
658        // 1. Vector similarity search
659        let vector_results = self
660            .vector_similarity_search(query_embedding, graph)
661            .await?;
662        all_results.extend(vector_results);
663
664        // 2. Entity-based search
665        let entity_results = self.entity_based_search(query_embedding, graph)?;
666        all_results.extend(entity_results);
667
668        // 3. Graph traversal search
669        let graph_results = self.graph_traversal_search(query_embedding, graph)?;
670        all_results.extend(graph_results);
671
672        // Deduplicate and rank results
673        let final_results = self.rank_and_deduplicate(all_results)?;
674
675        Ok(final_results.into_iter().take(self.config.top_k).collect())
676    }
677
678    /// Vector similarity search
679    async fn vector_similarity_search(
680        &self,
681        query_embedding: &[f32],
682        graph: &KnowledgeGraph,
683    ) -> Result<Vec<SearchResult>> {
684        let mut results = Vec::new();
685
686        // Search for similar vectors
687        // Note: vector_store returns SearchResult struct from store module, we need to convert or us it
688        // The store::SearchResult is slightly different from retrieval::SearchResult (metadata map vs specific fields)
689        let similar_vectors = self
690            .vector_store
691            .search(query_embedding, self.config.top_k * 2)
692            .await?;
693
694        for store_result in similar_vectors {
695            let id = store_result.id;
696            let similarity = store_result.score;
697            if similarity >= self.config.similarity_threshold {
698                let result = if id.starts_with("entity:") {
699                    let entity_id = EntityId::new(
700                        id.strip_prefix("entity:")
701                            .expect("prefix checked")
702                            .to_string(),
703                    );
704                    graph.get_entity(&entity_id).map(|entity| SearchResult {
705                        id: entity.id.to_string(),
706                        content: entity.name.clone(),
707                        score: similarity * self.config.entity_weight,
708                        result_type: ResultType::Entity,
709                        entities: vec![entity.name.clone()],
710                        source_chunks: entity
711                            .mentions
712                            .iter()
713                            .map(|m| m.chunk_id.to_string())
714                            .collect(),
715                    })
716                } else if id.starts_with("chunk:") {
717                    let chunk_id = ChunkId::new(
718                        id.strip_prefix("chunk:")
719                            .expect("prefix checked")
720                            .to_string(),
721                    );
722                    if let Some(chunk) = graph.get_chunk(&chunk_id) {
723                        let entity_names: Vec<String> = chunk
724                            .entities
725                            .iter()
726                            .filter_map(|eid| graph.get_entity(eid))
727                            .map(|e| e.name.clone())
728                            .collect();
729
730                        Some(SearchResult {
731                            id: chunk.id.to_string(),
732                            content: chunk.content.clone(),
733                            score: similarity * self.config.chunk_weight,
734                            result_type: ResultType::Chunk,
735                            entities: entity_names,
736                            source_chunks: vec![chunk.id.to_string()],
737                        })
738                    } else {
739                        None
740                    }
741                } else {
742                    None
743                };
744
745                if let Some(search_result) = result {
746                    results.push(search_result);
747                }
748            }
749        }
750
751        Ok(results)
752    }
753
754    /// Entity-based search with graph expansion
755    fn entity_based_search(
756        &self,
757        query_embedding: &[f32],
758        graph: &KnowledgeGraph,
759    ) -> Result<Vec<SearchResult>> {
760        let mut results = Vec::new();
761        let mut visited = HashSet::new();
762
763        // Find most relevant entities
764        let entity_similarities = self.find_relevant_entities(query_embedding, graph)?;
765
766        for (entity_id, similarity) in entity_similarities.into_iter().take(5) {
767            if visited.contains(&entity_id) {
768                continue;
769            }
770
771            // Expand through graph relationships
772            let expanded_entities = self.expand_through_relationships(
773                &entity_id,
774                graph,
775                self.config.max_expansion_depth,
776                &mut visited,
777            )?;
778
779            for expanded_entity_id in expanded_entities {
780                if let Some(entity) = graph.get_entity(&expanded_entity_id) {
781                    let expansion_penalty = if expanded_entity_id == entity_id {
782                        1.0
783                    } else {
784                        0.8
785                    };
786
787                    results.push(SearchResult {
788                        id: entity.id.to_string(),
789                        content: format!("{} ({})", entity.name, entity.entity_type),
790                        score: similarity * expansion_penalty * self.config.entity_weight,
791                        result_type: ResultType::Entity,
792                        entities: vec![entity.name.clone()],
793                        source_chunks: entity
794                            .mentions
795                            .iter()
796                            .map(|m| m.chunk_id.to_string())
797                            .collect(),
798                    });
799                }
800            }
801        }
802
803        Ok(results)
804    }
805
806    /// Calculate strategy weights based on query analysis
807    fn calculate_strategy_weights(&self, analysis: &QueryAnalysis) -> (f32, f32, f32) {
808        match (&analysis.query_type, &analysis.intent) {
809            // For entity-focused queries, balance vector (chunks) and graph (entities) equally
810            // This ensures we get both entity information AND contextual chunks
811            (QueryType::EntityFocused, _) => (0.5, 0.4, 0.1),
812            (QueryType::Relationship, _) => (0.3, 0.6, 0.1),
813            (QueryType::Conceptual, QueryIntent::Overview) => (0.2, 0.2, 0.6),
814            (QueryType::Conceptual, _) => (0.4, 0.3, 0.3),
815            (QueryType::Exploratory, QueryIntent::Overview) => (0.3, 0.2, 0.5),
816            (QueryType::Exploratory, _) => (0.4, 0.4, 0.2),
817            (QueryType::Factual, _) => (0.6, 0.3, 0.1),
818        }
819    }
820
821    /// Entity-centric search focusing on specific entities
822    fn entity_centric_search(
823        &mut self,
824        query_embedding: &[f32],
825        graph: &KnowledgeGraph,
826        key_entities: &[String],
827    ) -> Result<Vec<SearchResult>> {
828        let mut results = Vec::new();
829        let mut visited = HashSet::new();
830
831        for entity_name in key_entities {
832            // Find the entity in the graph
833            if let Some(entity) = graph
834                .entities()
835                .find(|e| e.name.eq_ignore_ascii_case(entity_name))
836            {
837                // Add the entity itself
838                results.push(SearchResult {
839                    id: entity.id.to_string(),
840                    content: format!("{} ({})", entity.name, entity.entity_type),
841                    score: 0.9, // High score for exact entity match
842                    result_type: ResultType::Entity,
843                    entities: vec![entity.name.clone()],
844                    source_chunks: entity
845                        .mentions
846                        .iter()
847                        .map(|m| m.chunk_id.to_string())
848                        .collect(),
849                });
850
851                // Get entity neighbors with weighted scores
852                let neighbors = graph.get_neighbors(&entity.id);
853                for (neighbor, relationship) in neighbors {
854                    if !visited.contains(&neighbor.id) {
855                        visited.insert(neighbor.id.clone());
856
857                        // Calculate relationship relevance
858                        let rel_embedding = self
859                            .embedding_generator
860                            .generate_embedding(&relationship.relation_type);
861                        let rel_similarity =
862                            VectorUtils::cosine_similarity(query_embedding, &rel_embedding);
863
864                        results.push(SearchResult {
865                            id: neighbor.id.to_string(),
866                            content: format!("{} ({})", neighbor.name, neighbor.entity_type),
867                            score: 0.7 * relationship.confidence * (1.0 + rel_similarity),
868                            result_type: ResultType::Entity,
869                            entities: vec![neighbor.name.clone()],
870                            source_chunks: neighbor
871                                .mentions
872                                .iter()
873                                .map(|m| m.chunk_id.to_string())
874                                .collect(),
875                        });
876                    }
877                }
878            }
879        }
880
881        Ok(results)
882    }
883
884    /// Hierarchical search using document trees
885    fn hierarchical_search(
886        &self,
887        query: &str,
888        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
889        analysis: &QueryAnalysis,
890    ) -> Result<Vec<SearchResult>> {
891        let mut results = Vec::new();
892        let max_results_per_tree = match analysis.intent {
893            QueryIntent::Overview => 3,
894            QueryIntent::Detailed => 8,
895            _ => 5,
896        };
897
898        for (doc_id, tree) in document_trees.iter() {
899            let tree_summaries = tree.query(query, max_results_per_tree)?;
900
901            for (idx, summary) in tree_summaries.iter().enumerate() {
902                // Convert tree query result to search result
903                let level_bonus = match analysis.intent {
904                    QueryIntent::Overview => 0.3,
905                    QueryIntent::Detailed => 0.2,
906                    _ => 0.0,
907                };
908
909                results.push(SearchResult {
910                    id: format!("{}:summary:{}", doc_id, idx),
911                    content: summary.summary.clone(),
912                    score: summary.score + level_bonus,
913                    result_type: ResultType::HierarchicalSummary,
914                    entities: Vec::new(),
915                    source_chunks: vec![doc_id.to_string()],
916                });
917            }
918        }
919
920        Ok(results)
921    }
922
923    /// Advanced graph traversal for complex queries
924    fn advanced_graph_traversal(
925        &self,
926        query_embedding: &[f32],
927        graph: &KnowledgeGraph,
928        analysis: &QueryAnalysis,
929    ) -> Result<Vec<SearchResult>> {
930        let mut results = Vec::new();
931
932        if analysis.query_type == QueryType::Relationship && analysis.key_entities.len() >= 2 {
933            // Find paths between entities
934            results.extend(self.find_entity_paths(graph, &analysis.key_entities)?);
935        }
936
937        if analysis.complexity_score > 0.8 {
938            // Community detection for exploratory queries
939            results.extend(self.community_based_search(query_embedding, graph)?);
940        }
941
942        Ok(results)
943    }
944
945    /// Cross-strategy fusion to create hybrid results
946    fn cross_strategy_fusion(
947        &self,
948        all_results: &[SearchResult],
949        _analysis: &QueryAnalysis,
950    ) -> Result<Vec<SearchResult>> {
951        let mut fusion_results = Vec::new();
952
953        // Group results by content similarity
954        let mut content_groups: HashMap<String, Vec<&SearchResult>> = HashMap::new();
955
956        for result in all_results {
957            let content_key = Self::safe_truncate(&result.content, 50);
958
959            content_groups.entry(content_key).or_default().push(result);
960        }
961
962        // Create fusion results for groups with multiple strategies
963        for (content_key, group) in content_groups {
964            if group.len() > 1 {
965                let types: HashSet<_> = group.iter().map(|r| &r.result_type).collect();
966                if types.len() > 1 {
967                    // This content was found by multiple strategies - boost confidence
968                    let avg_score = group.iter().map(|r| r.score).sum::<f32>() / group.len() as f32;
969                    let boost = 0.2 * (types.len() - 1) as f32;
970
971                    let all_entities: HashSet<_> =
972                        group.iter().flat_map(|r| r.entities.iter()).collect();
973
974                    let all_chunks: HashSet<_> =
975                        group.iter().flat_map(|r| r.source_chunks.iter()).collect();
976
977                    fusion_results.push(SearchResult {
978                        id: format!(
979                            "fusion_{}",
980                            content_key.chars().take(10).collect::<String>()
981                        ),
982                        content: group[0].content.clone(),
983                        score: (avg_score + boost).min(1.0),
984                        result_type: ResultType::Hybrid,
985                        entities: all_entities.into_iter().cloned().collect(),
986                        source_chunks: all_chunks.into_iter().cloned().collect(),
987                    });
988                }
989            }
990        }
991
992        Ok(fusion_results)
993    }
994
995    /// Adaptive ranking and deduplication based on query analysis
996    fn adaptive_rank_and_deduplicate(
997        &self,
998        mut results: Vec<SearchResult>,
999        analysis: &QueryAnalysis,
1000    ) -> Result<Vec<SearchResult>> {
1001        // Apply query-specific score adjustments
1002        for result in &mut results {
1003            match analysis.query_type {
1004                QueryType::EntityFocused if result.result_type == ResultType::Entity => {
1005                    result.score *= 1.2;
1006                },
1007                QueryType::Conceptual if result.result_type == ResultType::HierarchicalSummary => {
1008                    result.score *= 1.1;
1009                },
1010                QueryType::Relationship if result.entities.len() > 1 => {
1011                    result.score *= 1.15;
1012                },
1013                _ => {},
1014            }
1015
1016            // Boost results that contain key entities
1017            for entity in &analysis.key_entities {
1018                if result
1019                    .entities
1020                    .iter()
1021                    .any(|e| e.eq_ignore_ascii_case(entity))
1022                {
1023                    result.score *= 1.1;
1024                }
1025            }
1026        }
1027
1028        // Sort by adjusted scores
1029        results.sort_by(|a, b| {
1030            b.score
1031                .partial_cmp(&a.score)
1032                .unwrap_or(std::cmp::Ordering::Equal)
1033        });
1034
1035        // Diversity-aware deduplication
1036        let mut deduplicated = Vec::new();
1037        let mut seen_content = HashSet::new();
1038        let mut type_counts: HashMap<ResultType, usize> = HashMap::new();
1039
1040        for result in results {
1041            let content_signature = self.create_content_signature(&result.content);
1042
1043            if !seen_content.contains(&content_signature) {
1044                let type_count = type_counts.get(&result.result_type).unwrap_or(&0);
1045
1046                // Ensure diversity across result types
1047                let max_per_type = match result.result_type {
1048                    ResultType::Entity => self.config.top_k / 3,
1049                    ResultType::Chunk => self.config.top_k / 2,
1050                    ResultType::HierarchicalSummary => self.config.top_k / 4,
1051                    ResultType::Hybrid => self.config.top_k / 4,
1052                    ResultType::GraphPath => self.config.top_k / 5,
1053                };
1054
1055                if *type_count < max_per_type {
1056                    seen_content.insert(content_signature);
1057                    *type_counts.entry(result.result_type.clone()).or_insert(0) += 1;
1058                    deduplicated.push(result);
1059                }
1060            }
1061        }
1062
1063        Ok(deduplicated)
1064    }
1065
1066    /// Find paths between entities in the graph
1067    fn find_entity_paths(
1068        &self,
1069        graph: &KnowledgeGraph,
1070        key_entities: &[String],
1071    ) -> Result<Vec<SearchResult>> {
1072        let mut results = Vec::new();
1073
1074        if key_entities.len() < 2 {
1075            return Ok(results);
1076        }
1077
1078        // Simple path finding between first two entities
1079        if let (Some(source), Some(target)) = (
1080            graph
1081                .entities()
1082                .find(|e| e.name.eq_ignore_ascii_case(&key_entities[0])),
1083            graph
1084                .entities()
1085                .find(|e| e.name.eq_ignore_ascii_case(&key_entities[1])),
1086        ) {
1087            let path_description =
1088                format!("Connection between {} and {}", source.name, target.name);
1089            let neighbors_source = graph.get_neighbors(&source.id);
1090            let neighbors_target = graph.get_neighbors(&target.id);
1091
1092            // Check for direct connection
1093            if neighbors_source
1094                .iter()
1095                .any(|(neighbor, _)| neighbor.id == target.id)
1096            {
1097                results.push(SearchResult {
1098                    id: format!("path_{}_{}", source.id, target.id),
1099                    content: format!("Direct relationship: {path_description}"),
1100                    score: 0.8,
1101                    result_type: ResultType::GraphPath,
1102                    entities: vec![source.name.clone(), target.name.clone()],
1103                    source_chunks: Vec::new(),
1104                });
1105            }
1106
1107            // Check for indirect connections through common neighbors
1108            for (neighbor_s, rel_s) in &neighbors_source {
1109                for (neighbor_t, rel_t) in &neighbors_target {
1110                    if neighbor_s.id == neighbor_t.id {
1111                        results.push(SearchResult {
1112                            id: format!("path_{}_{}_{}", source.id, neighbor_s.id, target.id),
1113                            content: format!(
1114                                "Indirect relationship via {}: {} -> {} -> {}",
1115                                neighbor_s.name, source.name, neighbor_s.name, target.name
1116                            ),
1117                            score: 0.6 * rel_s.confidence * rel_t.confidence,
1118                            result_type: ResultType::GraphPath,
1119                            entities: vec![
1120                                source.name.clone(),
1121                                neighbor_s.name.clone(),
1122                                target.name.clone(),
1123                            ],
1124                            source_chunks: Vec::new(),
1125                        });
1126                    }
1127                }
1128            }
1129        }
1130
1131        Ok(results)
1132    }
1133
1134    /// Community-based search for exploratory queries
1135    fn community_based_search(
1136        &self,
1137        query_embedding: &[f32],
1138        graph: &KnowledgeGraph,
1139    ) -> Result<Vec<SearchResult>> {
1140        let mut results = Vec::new();
1141        let mut entity_scores: HashMap<String, f32> = HashMap::new();
1142
1143        // Calculate centrality-like scores for entities
1144        for entity in graph.entities() {
1145            let neighbors = graph.get_neighbors(&entity.id);
1146            let centrality_score = neighbors.len() as f32 * 0.1;
1147
1148            // Combine with embedding similarity
1149            if let Some(embedding) = &entity.embedding {
1150                let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1151                entity_scores.insert(entity.id.to_string(), centrality_score + similarity);
1152            }
1153        }
1154
1155        // Select top entities by combined score
1156        let mut sorted_entities: Vec<_> = entity_scores.iter().collect();
1157        sorted_entities.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
1158
1159        for (entity_id, score) in sorted_entities.iter().take(3) {
1160            if let Some(entity) = graph.entities().find(|e| e.id.to_string() == **entity_id) {
1161                // Get context from chunks where this entity is mentioned
1162                let mut entity_context = String::new();
1163                for mention in entity.mentions.iter().take(2) {
1164                    if let Some(chunk) = graph.chunks().find(|c| c.id == mention.chunk_id) {
1165                        let chunk_excerpt = if chunk.content.len() > 200 {
1166                            format!("{}...", &chunk.content[..200])
1167                        } else {
1168                            chunk.content.clone()
1169                        };
1170                        entity_context.push_str(&chunk_excerpt);
1171                        entity_context.push(' ');
1172                    }
1173                }
1174
1175                // If no context found, provide a meaningful description
1176                if entity_context.is_empty() {
1177                    entity_context = format!(
1178                        "{} is a {} character in the story.",
1179                        entity.name, entity.entity_type
1180                    );
1181                }
1182
1183                results.push(SearchResult {
1184                    id: entity.id.to_string(),
1185                    content: entity_context,
1186                    score: **score,
1187                    result_type: ResultType::Entity,
1188                    entities: vec![entity.name.clone()],
1189                    source_chunks: entity
1190                        .mentions
1191                        .iter()
1192                        .map(|m| m.chunk_id.to_string())
1193                        .collect(),
1194                });
1195            }
1196        }
1197
1198        Ok(results)
1199    }
1200
1201    /// Helper method to detect abstract concepts
1202    fn has_abstract_concepts(&self, words: &[&str]) -> bool {
1203        const ABSTRACT_INDICATORS: &[&str] = &[
1204            "concept",
1205            "idea",
1206            "theory",
1207            "principle",
1208            "philosophy",
1209            "meaning",
1210            "understanding",
1211            "knowledge",
1212            "wisdom",
1213            "truth",
1214            "beauty",
1215            "justice",
1216        ];
1217        words
1218            .iter()
1219            .any(|&word| ABSTRACT_INDICATORS.contains(&word))
1220    }
1221
1222    /// Helper method to detect question words
1223    fn has_question_words(&self, words: &[&str]) -> bool {
1224        const QUESTION_WORDS: &[&str] = &[
1225            "what", "how", "why", "when", "where", "who", "which", "explain", "describe",
1226        ];
1227        words.iter().any(|&word| QUESTION_WORDS.contains(&word))
1228    }
1229
1230    /// Create content signature for deduplication
1231    fn create_content_signature(&self, content: &str) -> String {
1232        // Simple signature based on first 50 characters and length
1233        let prefix = Self::safe_truncate(content, 50);
1234        format!(
1235            "{}_{}",
1236            prefix
1237                .chars()
1238                .filter(|c| c.is_alphanumeric())
1239                .collect::<String>(),
1240            content.len()
1241        )
1242    }
1243
1244    /// Graph traversal search for path-based results (legacy)
1245    fn graph_traversal_search(
1246        &self,
1247        _query_embedding: &[f32],
1248        _graph: &KnowledgeGraph,
1249    ) -> Result<Vec<SearchResult>> {
1250        // Placeholder for graph traversal algorithms
1251        // This would implement algorithms like:
1252        // - Random walks
1253        // - Shortest paths between relevant entities
1254        // - Community detection
1255        // - PageRank-style scoring
1256
1257        Ok(Vec::new())
1258    }
1259
1260    /// Find entities most relevant to the query
1261    fn find_relevant_entities(
1262        &self,
1263        query_embedding: &[f32],
1264        graph: &KnowledgeGraph,
1265    ) -> Result<Vec<(EntityId, f32)>> {
1266        let mut similarities = Vec::new();
1267
1268        for entity in graph.entities() {
1269            if let Some(embedding) = &entity.embedding {
1270                let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1271                if similarity >= self.config.similarity_threshold {
1272                    similarities.push((entity.id.clone(), similarity));
1273                }
1274            }
1275        }
1276
1277        // Sort by similarity
1278        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
1279
1280        Ok(similarities)
1281    }
1282
1283    /// Expand search through graph relationships
1284    fn expand_through_relationships(
1285        &self,
1286        start_entity: &EntityId,
1287        graph: &KnowledgeGraph,
1288        max_depth: usize,
1289        visited: &mut HashSet<EntityId>,
1290    ) -> Result<Vec<EntityId>> {
1291        let mut results = Vec::new();
1292        let mut current_level = vec![start_entity.clone()];
1293        visited.insert(start_entity.clone());
1294
1295        for _depth in 0..max_depth {
1296            let mut next_level = Vec::new();
1297
1298            for entity_id in &current_level {
1299                results.push(entity_id.clone());
1300
1301                // Get neighbors through graph relationships
1302                let neighbors = graph.get_neighbors(entity_id);
1303                for (neighbor_entity, _relationship) in neighbors {
1304                    if !visited.contains(&neighbor_entity.id) {
1305                        visited.insert(neighbor_entity.id.clone());
1306                        next_level.push(neighbor_entity.id.clone());
1307                    }
1308                }
1309            }
1310
1311            if next_level.is_empty() {
1312                break;
1313            }
1314
1315            current_level = next_level;
1316        }
1317
1318        Ok(results)
1319    }
1320
1321    /// Simple stop word detection (English)
1322    fn is_stop_word(&self, word: &str) -> bool {
1323        const STOP_WORDS: &[&str] = &[
1324            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
1325            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
1326            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
1327            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
1328            "go", "me",
1329        ];
1330        STOP_WORDS.contains(&word)
1331    }
1332
1333    /// Rank and deduplicate search results (legacy)
1334    fn rank_and_deduplicate(&self, mut results: Vec<SearchResult>) -> Result<Vec<SearchResult>> {
1335        // Sort by score descending
1336        results.sort_by(|a, b| {
1337            b.score
1338                .partial_cmp(&a.score)
1339                .unwrap_or(std::cmp::Ordering::Equal)
1340        });
1341
1342        // Deduplicate by ID
1343        let mut seen_ids = HashSet::new();
1344        let mut deduplicated = Vec::new();
1345
1346        for result in results {
1347            if !seen_ids.contains(&result.id) {
1348                seen_ids.insert(result.id.clone());
1349                deduplicated.push(result);
1350            }
1351        }
1352
1353        Ok(deduplicated)
1354    }
1355
1356    /// Vector-based search
1357    pub async fn vector_search(
1358        &mut self,
1359        query: &str,
1360        max_results: usize,
1361    ) -> Result<Vec<SearchResult>> {
1362        let query_embedding = self.embedding_generator.generate_embedding(query);
1363        let similar_vectors = self
1364            .vector_store
1365            .search(&query_embedding, max_results)
1366            .await?;
1367
1368        let mut results = Vec::new();
1369        for store_result in similar_vectors {
1370            results.push(SearchResult {
1371                id: store_result.id.clone(),
1372                content: format!("Vector result for: {}", store_result.id),
1373                score: store_result.score,
1374                result_type: ResultType::Chunk,
1375                entities: Vec::new(),
1376                source_chunks: vec![store_result.id],
1377            });
1378        }
1379
1380        Ok(results)
1381    }
1382
1383    /// Graph-based search
1384    pub fn graph_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1385        // Simplified graph search - in a real implementation this would traverse the graph
1386        let mut results = Vec::new();
1387        results.push(SearchResult {
1388            id: format!("graph_result_{}", query.len()),
1389            content: format!("Graph-based result for: {query}"),
1390            score: 0.7,
1391            result_type: ResultType::GraphPath,
1392            entities: Vec::new(),
1393            source_chunks: Vec::new(),
1394        });
1395
1396        Ok(results.into_iter().take(max_results).collect())
1397    }
1398
1399    /// Hierarchical search (public wrapper)
1400    pub fn public_hierarchical_search(
1401        &self,
1402        query: &str,
1403        max_results: usize,
1404    ) -> Result<Vec<SearchResult>> {
1405        // Simplified hierarchical search - in a real implementation this would use document trees
1406        let mut results = Vec::new();
1407        results.push(SearchResult {
1408            id: format!("hierarchical_result_{}", query.len()),
1409            content: format!("Hierarchical result for: {query}"),
1410            score: 0.8,
1411            result_type: ResultType::HierarchicalSummary,
1412            entities: Vec::new(),
1413            source_chunks: Vec::new(),
1414        });
1415
1416        Ok(results.into_iter().take(max_results).collect())
1417    }
1418
1419    /// BM25-based search
1420    pub fn bm25_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1421        // Simplified BM25 search - in a real implementation this would use proper BM25 scoring
1422        let mut results = Vec::new();
1423        results.push(SearchResult {
1424            id: format!("bm25_result_{}", query.len()),
1425            content: format!("BM25 result for: {query}"),
1426            score: 0.75,
1427            result_type: ResultType::Chunk,
1428            entities: Vec::new(),
1429            source_chunks: Vec::new(),
1430        });
1431
1432        Ok(results.into_iter().take(max_results).collect())
1433    }
1434
1435    /// Get retrieval statistics
1436    pub fn get_statistics(&self) -> RetrievalStatistics {
1437        // let vector_stats = self.vector_index.statistics();
1438
1439        RetrievalStatistics {
1440            indexed_vectors: 0,  // vector_stats.vector_count,
1441            vector_dimension: 0, // vector_stats.dimension,
1442            index_built: false,  // vector_stats.index_built,
1443            config: self.config.clone(),
1444        }
1445    }
1446
1447    /// Safely truncate a string to a maximum byte length, respecting UTF-8 character boundaries
1448    fn safe_truncate(s: &str, max_bytes: usize) -> String {
1449        if s.len() <= max_bytes {
1450            return s.to_string();
1451        }
1452
1453        // Find the largest valid character boundary <= max_bytes
1454        let mut end_idx = max_bytes;
1455        while end_idx > 0 && !s.is_char_boundary(end_idx) {
1456            end_idx -= 1;
1457        }
1458
1459        s[..end_idx].to_string()
1460    }
1461
1462    /// Save retrieval system state to JSON file
1463    pub fn save_state_to_json(&self, file_path: &str) -> Result<()> {
1464        use std::fs;
1465
1466        let mut json_data = json::JsonValue::new_object();
1467
1468        // Add metadata
1469        json_data["metadata"] = json::object! {
1470            "format_version" => "1.0",
1471            "created_at" => chrono::Utc::now().to_rfc3339(),
1472            "config" => json::object! {
1473                "top_k" => self.config.top_k,
1474                "similarity_threshold" => self.config.similarity_threshold,
1475                "max_expansion_depth" => self.config.max_expansion_depth,
1476                "entity_weight" => self.config.entity_weight,
1477                "chunk_weight" => self.config.chunk_weight,
1478                "graph_weight" => self.config.graph_weight
1479            }
1480        };
1481
1482        // Add vector index statistics
1483        // let vector_stats = self.vector_index.statistics();
1484        json_data["vector_index"] = json::object! {
1485            "vector_count" => 0, // vector_stats.vector_count,
1486            "dimension" => 0, // vector_stats.dimension,
1487            "index_built" => false, // vector_stats.index_built,
1488            "min_norm" => 0.0, // vector_stats.min_norm,
1489            "max_norm" => 0.0, // vector_stats.max_norm,
1490            "avg_norm" => 0.0 // vector_stats.avg_norm
1491        };
1492
1493        // Add embedding generator info
1494        json_data["embedding_generator"] = json::object! {
1495            "dimension" => self.embedding_generator.dimension(),
1496            "cached_words" => self.embedding_generator.cached_words()
1497        };
1498
1499        // Add parallel processing info
1500        #[cfg(feature = "parallel-processing")]
1501        {
1502            json_data["parallel_enabled"] = self.parallel_processor.is_some().into();
1503        }
1504        #[cfg(not(feature = "parallel-processing"))]
1505        {
1506            json_data["parallel_enabled"] = false.into();
1507        }
1508
1509        // Save to file
1510        fs::write(file_path, json_data.dump())?;
1511        #[cfg(feature = "tracing")]
1512        tracing::info!("Retrieval system state saved to {file_path}");
1513
1514        Ok(())
1515    }
1516}
1517
/// Unit tests for the retrieval system.
1519#[cfg(test)]
1520mod tests {
1521    use super::*;
1522    use crate::{config::Config, core::KnowledgeGraph};
1523
1524    #[test]
1525    fn test_query_placeholder() {
1526        let config = Config::default();
1527        let retrieval = RetrievalSystem::new(&config).unwrap();
1528
1529        let results = retrieval.query("test query");
1530        assert!(results.is_ok());
1531
1532        let results = results.unwrap();
1533        assert!(!results.is_empty());
1534        assert!(results[0].contains("test query"));
1535    }
1536
1537    #[tokio::test]
1538    async fn test_graph_indexing() {
1539        let config = Config::default();
1540        let retrieval = RetrievalSystem::new(&config).unwrap();
1541        let graph = KnowledgeGraph::new();
1542
1543        let result = retrieval.index_graph(&graph).await;
1544        assert!(result.is_ok());
1545    }
1546
1547    // ============================================================================
1548    // ExplainedAnswer Tests
1549    // ============================================================================
1550
1551    #[test]
1552    fn test_explained_answer_creation() {
1553        let search_results = vec![
1554            SearchResult {
1555                id: "chunk_1".to_string(),
1556                content: "This is the first relevant chunk about climate change.".to_string(),
1557                score: 0.85,
1558                result_type: ResultType::Chunk,
1559                entities: vec!["climate".to_string(), "environment".to_string()],
1560                source_chunks: vec!["doc1_chunk1".to_string()],
1561            },
1562            SearchResult {
1563                id: "chunk_2".to_string(),
1564                content: "Another chunk discussing environmental policies.".to_string(),
1565                score: 0.72,
1566                result_type: ResultType::Chunk,
1567                entities: vec!["policy".to_string(), "environment".to_string()],
1568                source_chunks: vec!["doc1_chunk2".to_string()],
1569            },
1570        ];
1571
1572        let explained = ExplainedAnswer::from_results(
1573            "Climate change is a major environmental concern.".to_string(),
1574            &search_results,
1575            "What is climate change?",
1576        );
1577
1578        assert!(!explained.answer.is_empty());
1579        assert!(explained.confidence > 0.0 && explained.confidence <= 1.0);
1580        assert!(!explained.sources.is_empty());
1581        assert!(!explained.reasoning_steps.is_empty());
1582    }
1583
1584    #[test]
1585    fn test_explained_answer_empty_results() {
1586        let explained = ExplainedAnswer::from_results(
1587            "No relevant information found.".to_string(),
1588            &[],
1589            "What is something unknown?",
1590        );
1591
1592        assert_eq!(explained.confidence, 0.0);
1593        assert!(explained.sources.is_empty());
1594        assert!(!explained.reasoning_steps.is_empty()); // Should still have query analysis step
1595    }
1596
1597    #[test]
1598    fn test_explained_answer_format_display() {
1599        let search_results = vec![SearchResult {
1600            id: "test_chunk".to_string(),
1601            content: "Test content about technology.".to_string(),
1602            score: 0.9,
1603            result_type: ResultType::Chunk,
1604            entities: vec!["technology".to_string()],
1605            source_chunks: vec!["doc1_chunk1".to_string()],
1606        }];
1607
1608        let explained = ExplainedAnswer::from_results(
1609            "Technology is important.".to_string(),
1610            &search_results,
1611            "Why is technology important?",
1612        );
1613
1614        let formatted = explained.format_display();
1615
1616        assert!(formatted.contains("**Answer:**"));
1617        assert!(formatted.contains("**Confidence:**"));
1618        assert!(formatted.contains("**Reasoning:**"));
1619        assert!(formatted.contains("**Sources:**"));
1620    }
1621
1622    #[test]
1623    fn test_reasoning_steps_structure() {
1624        let search_results = vec![SearchResult {
1625            id: "entity_1".to_string(),
1626            content: "Entity description".to_string(),
1627            score: 0.8,
1628            result_type: ResultType::Entity,
1629            entities: vec!["person".to_string(), "organization".to_string()],
1630            source_chunks: vec![],
1631        }];
1632
1633        let explained = ExplainedAnswer::from_results(
1634            "Answer text".to_string(),
1635            &search_results,
1636            "Who are the key people?",
1637        );
1638
1639        // Check reasoning steps are numbered correctly
1640        for (i, step) in explained.reasoning_steps.iter().enumerate() {
1641            assert_eq!(step.step_number as usize, i + 1);
1642            assert!(!step.description.is_empty());
1643            assert!(step.confidence >= 0.0 && step.confidence <= 1.0);
1644        }
1645    }
1646
1647    #[test]
1648    fn test_source_reference_types() {
1649        let search_results = vec![
1650            SearchResult {
1651                id: "chunk".to_string(),
1652                content: "Chunk content".to_string(),
1653                score: 0.7,
1654                result_type: ResultType::Chunk,
1655                entities: vec![],
1656                source_chunks: vec![],
1657            },
1658            SearchResult {
1659                id: "entity".to_string(),
1660                content: "Entity content".to_string(),
1661                score: 0.6,
1662                result_type: ResultType::Entity,
1663                entities: vec![],
1664                source_chunks: vec![],
1665            },
1666            SearchResult {
1667                id: "path".to_string(),
1668                content: "Graph path content".to_string(),
1669                score: 0.5,
1670                result_type: ResultType::GraphPath,
1671                entities: vec![],
1672                source_chunks: vec![],
1673            },
1674        ];
1675
1676        let explained =
1677            ExplainedAnswer::from_results("Answer".to_string(), &search_results, "Query");
1678
1679        let source_types: Vec<_> = explained.sources.iter().map(|s| &s.source_type).collect();
1680        assert!(source_types.contains(&&SourceType::TextChunk));
1681        assert!(source_types.contains(&&SourceType::Entity));
1682        assert!(source_types.contains(&&SourceType::Relationship));
1683    }
1684}
graphrag_core/retrieval/mod.rs

graphrag_core/retrieval/
mod.rs