//! Retrieval module for `graphrag_core` (graphrag_core/retrieval/mod.rs).

/// Adaptive retrieval strategy selection
pub mod adaptive;
/// BM25 text retrieval implementation for keyword-based search
pub mod bm25;
/// Enriched metadata-aware retrieval
pub mod enriched;
/// Hybrid retrieval combining multiple search strategies
pub mod hybrid;
/// PageRank-based retrieval system (see `PageRankRetrievalSystem`)
pub mod pagerank_retrieval;
/// HippoRAG Personalized PageRank retrieval
#[cfg(feature = "pagerank")]
pub mod hipporag_ppr;
12
13use crate::{
14    config::Config,
15    core::{ChunkId, EntityId, KnowledgeGraph},
16    summarization::DocumentTree,
17    vector::{EmbeddingGenerator, VectorIndex, VectorUtils},
18    Result,
19};
20#[cfg(feature = "parallel-processing")]
21use crate::parallel::ParallelProcessor;
22use std::collections::{HashMap, HashSet};
23
24pub use bm25::{BM25Result, BM25Retriever, Document as BM25Document};
25pub use enriched::{EnrichedRetrievalConfig, EnrichedRetriever};
26pub use hybrid::{FusionMethod, HybridConfig, HybridRetriever, HybridSearchResult};
27
28#[cfg(feature = "pagerank")]
29pub use pagerank_retrieval::{PageRankRetrievalSystem, ScoredResult};
30
31#[cfg(feature = "pagerank")]
32pub use hipporag_ppr::{HippoRAGConfig, HippoRAGRetriever, Fact};
33
/// Retrieval system for querying the knowledge graph
///
/// Combines vector similarity search with graph-based strategies, plus
/// optional feature-gated retrievers (PageRank, enriched metadata) that
/// are initialized lazily via `initialize_pagerank` / `initialize_enriched`.
pub struct RetrievalSystem {
    /// Vector index over entity and chunk embeddings; ids are stored with
    /// `entity:` / `chunk:` prefixes (see `index_graph`)
    vector_index: VectorIndex,
    /// Embedding generator for queries, chunks and entities (128-dim)
    embedding_generator: EmbeddingGenerator,
    /// Tuning parameters (top_k, threshold, strategy weights)
    config: RetrievalConfig,
    /// Optional processor used to decide parallel vs sequential batch work
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    /// PageRank retriever; `None` until `initialize_pagerank` is called
    #[cfg(feature = "pagerank")]
    pagerank_retriever: Option<PageRankRetrievalSystem>,
    /// Metadata-aware retriever; `None` until `initialize_enriched` is called
    enriched_retriever: Option<EnrichedRetriever>,
}
45
/// Configuration parameters for the retrieval system
///
/// See [`RetrievalConfig::default`] for the standard values; `RetrievalSystem`
/// constructors override `similarity_threshold` for hash-based embeddings.
#[derive(Debug, Clone)]
pub struct RetrievalConfig {
    /// Maximum number of results to return
    pub top_k: usize,
    /// Minimum similarity score threshold for results (typically -1.0 to 1.0)
    pub similarity_threshold: f32,
    /// Maximum depth for graph relationship expansion
    pub max_expansion_depth: usize,
    /// Weight for entity-based results in scoring (0.0 to 1.0)
    pub entity_weight: f32,
    /// Weight for chunk-based results in scoring (0.0 to 1.0)
    pub chunk_weight: f32,
    /// Weight for graph-based results in scoring (0.0 to 1.0)
    pub graph_weight: f32,
}
62
63impl Default for RetrievalConfig {
64    fn default() -> Self {
65        Self {
66            top_k: 10,
67            similarity_threshold: 0.7,
68            max_expansion_depth: 2,
69            entity_weight: 0.4,
70            chunk_weight: 0.4,
71            graph_weight: 0.2,
72        }
73    }
74}
75
/// A search result containing relevant information
///
/// Produced by all retrieval strategies; per-strategy scores are scaled by
/// the configured weights before results are merged and ranked.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Unique identifier for this result (entity or chunk id as a string)
    pub id: String,
    /// Content or description of the result
    pub content: String,
    /// Relevance score (higher is better)
    pub score: f32,
    /// Type of result (entity, chunk, graph path, etc.)
    pub result_type: ResultType,
    /// Names of entities associated with this result
    pub entities: Vec<String>,
    /// IDs of source chunks this result is derived from
    pub source_chunks: Vec<String>,
}
92
/// Type of search result indicating the retrieval strategy used
///
/// Derives `Eq`/`Hash` so result types can be used as map/set keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ResultType {
    /// Result from entity-based retrieval
    Entity,
    /// Result from text chunk retrieval
    Chunk,
    /// Result from graph path traversal
    GraphPath,
    /// Result from hierarchical document summarization
    HierarchicalSummary,
    /// Result from combining multiple retrieval strategies
    Hybrid,
}
107
/// Query analysis results to determine optimal retrieval strategy
///
/// Produced by `RetrievalSystem::analyze_query` and consumed by
/// `execute_adaptive_retrieval` to weight the individual strategies.
#[derive(Debug, Clone)]
pub struct QueryAnalysis {
    /// Type of query based on content analysis
    pub query_type: QueryType,
    /// Key entities detected in the query (matched against graph entities)
    pub key_entities: Vec<String>,
    /// Conceptual terms extracted from the query (non-entity, non-stop-words)
    pub concepts: Vec<String>,
    /// Inferred user intent from the query
    pub intent: QueryIntent,
    /// Query complexity score (0.0 to 1.0)
    pub complexity_score: f32,
}
122
/// Classification of query types for adaptive retrieval strategy selection
#[derive(Debug, Clone, PartialEq)]
pub enum QueryType {
    /// Queries focused on specific entities (exactly one entity detected)
    EntityFocused,
    /// Abstract concept queries requiring broader context
    Conceptual,
    /// Specific fact retrieval queries (fallback classification)
    Factual,
    /// Open-ended exploration queries (question-word driven)
    Exploratory,
    /// Queries about relationships between entities (two or more detected)
    Relationship,
}
137
/// User intent classification for result presentation
///
/// Detected from signal words in the query; `Detailed` is also the default
/// when no signal words match.
#[derive(Debug, Clone, PartialEq)]
pub enum QueryIntent {
    /// User wants a high-level summary or overview
    Overview,
    /// User wants detailed, specific information
    Detailed,
    /// User wants to compare multiple items
    Comparative,
    /// User wants to understand cause-effect relationships
    Causal,
    /// User wants time-based or chronological information
    Temporal,
}
152
/// Query analysis result with additional metadata for adaptive retrieval
///
/// NOTE(review): similar to [`QueryAnalysis`] but carries confidence and
/// strategy suggestions; no producer is visible in this file — presumably
/// used by the `adaptive` submodule (confirm against that module).
#[derive(Debug, Clone)]
pub struct QueryAnalysisResult {
    /// Detected query type
    pub query_type: QueryType,
    /// Confidence score for the detected query type (0.0 to 1.0)
    pub confidence: f32,
    /// Keywords extracted and matched from the query
    pub keywords_matched: Vec<String>,
    /// Recommended retrieval strategies based on analysis
    pub suggested_strategies: Vec<String>,
    /// Overall query complexity score (0.0 to 1.0)
    pub complexity_score: f32,
}
167
/// Query result with hierarchical summary
///
/// Bundles the original query, its ranked results, an optional generated
/// summary, and free-form execution metadata.
#[derive(Debug, Clone)]
pub struct QueryResult {
    /// Original query string
    pub query: String,
    /// List of search results
    pub results: Vec<SearchResult>,
    /// Optional generated summary of all results
    pub summary: Option<String>,
    /// Additional metadata about the query execution
    pub metadata: HashMap<String, String>,
}
180
181impl RetrievalSystem {
182    /// Create a new retrieval system
183    pub fn new(config: &Config) -> Result<Self> {
184        let retrieval_config = RetrievalConfig {
185            top_k: config.retrieval.top_k,
186            similarity_threshold: 0.35, // Reasonable threshold for hash-based embeddings
187            max_expansion_depth: 2,     // Default, could be added to config
188            entity_weight: 0.4,         // Default, could be added to config
189            chunk_weight: 0.4,          // Default, could be added to config
190            graph_weight: 0.2,          // Default, could be added to config
191        };
192
193        Ok(Self {
194            vector_index: VectorIndex::new(),
195            embedding_generator: EmbeddingGenerator::new(128), // 128-dimensional embeddings
196            config: retrieval_config,
197            #[cfg(feature = "parallel-processing")]
198            parallel_processor: None,
199            #[cfg(feature = "pagerank")]
200            pagerank_retriever: None,
201            enriched_retriever: None,
202        })
203    }
204
205    /// Create a new retrieval system with parallel processing support
206    #[cfg(feature = "parallel-processing")]
207    pub fn with_parallel_processing(
208        config: &Config,
209        parallel_processor: ParallelProcessor,
210    ) -> Result<Self> {
211        let retrieval_config = RetrievalConfig {
212            top_k: config.retrieval.top_k,
213            similarity_threshold: 0.35, // Reasonable threshold for hash-based embeddings
214            max_expansion_depth: 2,
215            entity_weight: 0.4,
216            chunk_weight: 0.4,
217            graph_weight: 0.2,
218        };
219
220        Ok(Self {
221            vector_index: VectorIndex::with_parallel_processing(parallel_processor.clone()),
222            embedding_generator: EmbeddingGenerator::with_parallel_processing(
223                128,
224                parallel_processor.clone(),
225            ),
226            config: retrieval_config,
227            parallel_processor: Some(parallel_processor),
228            #[cfg(feature = "pagerank")]
229            pagerank_retriever: None,
230            enriched_retriever: None,
231        })
232    }
233
234    /// Index a knowledge graph for retrieval
235    pub fn index_graph(&mut self, graph: &KnowledgeGraph) -> Result<()> {
236        // Index entity embeddings
237        for entity in graph.entities() {
238            if let Some(embedding) = &entity.embedding {
239                let id = format!("entity:{}", entity.id);
240                self.vector_index.add_vector(id, embedding.clone())?;
241            }
242        }
243
244        // Index chunk embeddings
245        for chunk in graph.chunks() {
246            if let Some(embedding) = &chunk.embedding {
247                let id = format!("chunk:{}", chunk.id);
248                self.vector_index.add_vector(id, embedding.clone())?;
249            }
250        }
251
252        // Build the vector index
253        if !self.vector_index.is_empty() {
254            self.vector_index.build_index()?;
255        }
256
257        Ok(())
258    }
259
260    /// Initialize PageRank retrieval system (feature-gated)
261    #[cfg(feature = "pagerank")]
262    pub fn initialize_pagerank(&mut self, graph: &KnowledgeGraph) -> Result<()> {
263        use crate::graph::pagerank::{PageRankConfig, ScoreWeights};
264
265        tracing::debug!("Initializing high-performance PageRank retrieval system...");
266
267        let pagerank_config = PageRankConfig {
268            damping_factor: 0.85,
269            max_iterations: 50, // Reduced for faster convergence
270            tolerance: 1e-5,    // Slightly relaxed for speed
271            personalized: true,
272            #[cfg(feature = "parallel-processing")]
273            parallel_enabled: self.parallel_processor.is_some(),
274            #[cfg(not(feature = "parallel-processing"))]
275            parallel_enabled: false,
276            cache_size: 2000,   // Large cache for better performance
277            sparse_threshold: 500,
278            incremental_updates: true,
279            simd_block_size: 64, // Optimized for modern CPUs
280        };
281
282        let score_weights = ScoreWeights {
283            vector_weight: 0.3,
284            pagerank_weight: 0.5, // Higher weight for PageRank like fast-GraphRAG
285            chunk_weight: 0.15,
286            relationship_weight: 0.05,
287        };
288
289        let mut pagerank_retriever = PageRankRetrievalSystem::new(self.config.top_k)
290            .with_pagerank_config(pagerank_config)
291            .with_score_weights(score_weights)
292            .with_incremental_mode(true)
293            .with_min_threshold(0.05);
294
295        // Initialize vector index
296        pagerank_retriever.initialize_vector_index(graph)?;
297
298        // Pre-compute global PageRank scores for faster queries
299        pagerank_retriever.precompute_global_pagerank(graph)?;
300
301        self.pagerank_retriever = Some(pagerank_retriever);
302
303        tracing::debug!("PageRank retrieval system initialized with 27x performance optimizations");
304        Ok(())
305    }
306
307    /// Initialize enriched metadata-aware retrieval system
308    pub fn initialize_enriched(&mut self, config: Option<EnrichedRetrievalConfig>) -> Result<()> {
309        tracing::debug!("Initializing enriched metadata-aware retrieval system...");
310
311        let enriched_config = config.unwrap_or_default();
312        let enriched_retriever = EnrichedRetriever::with_config(enriched_config);
313
314        self.enriched_retriever = Some(enriched_retriever);
315
316        tracing::debug!("Enriched retrieval system initialized with metadata boosting");
317        Ok(())
318    }
319
320    /// Query using PageRank-enhanced retrieval (feature-gated)
321    #[cfg(feature = "pagerank")]
322    pub fn pagerank_query(
323        &self,
324        query: &str,
325        graph: &KnowledgeGraph,
326        max_results: Option<usize>,
327    ) -> Result<Vec<ScoredResult>> {
328        if let Some(pagerank_retriever) = &self.pagerank_retriever {
329            pagerank_retriever.search_with_pagerank(query, graph, max_results)
330        } else {
331            Err(crate::core::GraphRAGError::Retrieval {
332                message: "PageRank retriever not initialized. Call initialize_pagerank() first.".to_string(),
333            })
334        }
335    }
336
337    /// Batch PageRank queries for high throughput (feature-gated)
338    #[cfg(feature = "pagerank")]
339    pub fn pagerank_batch_query(
340        &self,
341        queries: &[&str],
342        graph: &KnowledgeGraph,
343        max_results_per_query: Option<usize>,
344    ) -> Result<Vec<Vec<ScoredResult>>> {
345        if let Some(pagerank_retriever) = &self.pagerank_retriever {
346            pagerank_retriever.batch_search(queries, graph, max_results_per_query)
347        } else {
348            Err(crate::core::GraphRAGError::Retrieval {
349                message: "PageRank retriever not initialized. Call initialize_pagerank() first.".to_string(),
350            })
351        }
352    }
353
354    /// Query the system for relevant information
355    pub fn query(&self, query: &str) -> Result<Vec<String>> {
356        // For now, return a placeholder implementation
357        // In a real system, this would:
358        // 1. Convert query to embedding
359        // 2. Search vector index
360        // 3. Expand through graph relationships
361        // 4. Rank and return results
362
363        Ok(vec![format!("Results for query: {}", query)])
364    }
365
366    /// Advanced hybrid query with strategy selection and hierarchical integration
367    pub fn hybrid_query(
368        &mut self,
369        query: &str,
370        graph: &KnowledgeGraph,
371    ) -> Result<Vec<SearchResult>> {
372        self.hybrid_query_with_trees(query, graph, &HashMap::new())
373    }
374
375    /// Hybrid query with access to document trees for hierarchical retrieval
376    pub fn hybrid_query_with_trees(
377        &mut self,
378        query: &str,
379        graph: &KnowledgeGraph,
380        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
381    ) -> Result<Vec<SearchResult>> {
382        // 1. Analyze query to determine optimal strategy
383        let analysis = self.analyze_query(query, graph)?;
384
385        // 2. Generate query embedding
386        let query_embedding = self.embedding_generator.generate_embedding(query);
387
388        // 3. Execute multi-strategy retrieval based on analysis
389        let mut results = self.execute_adaptive_retrieval(
390            query,
391            &query_embedding,
392            graph,
393            document_trees,
394            &analysis,
395        )?;
396
397        // 4. Apply enriched metadata-aware boosting and filtering if enabled
398        if let Some(enriched_retriever) = &self.enriched_retriever {
399            // First apply metadata boosting to enhance relevance
400            results = enriched_retriever.boost_with_metadata(results, query, graph)?;
401
402            // Then apply structure filtering if query mentions chapters/sections
403            results = enriched_retriever.filter_by_structure(query, results, graph)?;
404        }
405
406        Ok(results)
407    }
408
409    /// Query the system using hybrid retrieval (vector + graph) - legacy method
410    pub fn legacy_hybrid_query(
411        &mut self,
412        query: &str,
413        graph: &KnowledgeGraph,
414    ) -> Result<Vec<SearchResult>> {
415        // 1. Generate query embedding
416        let query_embedding = self.embedding_generator.generate_embedding(query);
417
418        // 2. Perform comprehensive search
419        let results = self.comprehensive_search(&query_embedding, graph)?;
420
421        Ok(results)
422    }
423
424    /// Add embeddings to chunks and entities in the graph with parallel processing
425    pub fn add_embeddings_to_graph(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
426        #[cfg(feature = "parallel-processing")]
427        if let Some(processor) = self.parallel_processor.clone() {
428            return self.add_embeddings_parallel(graph, &processor);
429        }
430
431        self.add_embeddings_sequential(graph)
432    }
433
434    /// Parallel embedding generation with proper error handling and work-stealing
435    #[cfg(feature = "parallel-processing")]
436    fn add_embeddings_parallel(
437        &mut self,
438        graph: &mut KnowledgeGraph,
439        processor: &ParallelProcessor,
440    ) -> Result<()> {
441        // Extract texts for embedding generation
442        let mut chunk_texts = Vec::new();
443        let mut entity_texts = Vec::new();
444
445        // Collect chunk texts that need embeddings
446        for chunk in graph.chunks() {
447            if chunk.embedding.is_none() {
448                chunk_texts.push((chunk.id.clone(), chunk.content.clone()));
449            }
450        }
451
452        // Collect entity texts that need embeddings
453        for entity in graph.entities() {
454            if entity.embedding.is_none() {
455                let entity_text = format!("{} {}", entity.name, entity.entity_type);
456                entity_texts.push((entity.id.clone(), entity_text));
457            }
458        }
459
460        // For parallel processing, we need to use a different approach since
461        // generate_embedding requires &mut self. We'll fall back to enhanced sequential
462        // processing with better chunking and monitoring for now.
463
464        let total_items = chunk_texts.len() + entity_texts.len();
465        if processor.should_use_parallel(total_items) {
466            tracing::debug!("Processing {total_items} embeddings with enhanced sequential approach");
467        }
468
469        // Process chunks
470        for (chunk_id, text) in chunk_texts {
471            let embedding = self.embedding_generator.generate_embedding(&text);
472            if let Some(chunk) = graph.get_chunk_mut(&chunk_id) {
473                chunk.embedding = Some(embedding);
474            }
475        }
476
477        // Process entities
478        for (entity_id, text) in entity_texts {
479            let embedding = self.embedding_generator.generate_embedding(&text);
480            if let Some(entity) = graph.get_entity_mut(&entity_id) {
481                entity.embedding = Some(embedding);
482            }
483        }
484
485        // Re-index the graph with new embeddings
486        self.index_graph(graph)?;
487
488        Ok(())
489    }
490
491    /// Sequential embedding generation (fallback)
492    fn add_embeddings_sequential(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
493        // Debug: Check total counts first (uncomment for debugging)
494        let _total_chunks = graph.chunks().count();
495        let _total_entities = graph.entities().count();
496        // println!("DEBUG: Found {} total chunks and {} total entities in graph", _total_chunks, _total_entities);
497
498        // Generate embeddings for all chunks
499        let mut chunk_count = 0;
500        for chunk in graph.chunks_mut() {
501            if chunk.embedding.is_none() {
502                let embedding = self.embedding_generator.generate_embedding(&chunk.content);
503                chunk.embedding = Some(embedding);
504                chunk_count += 1;
505            }
506        }
507
508        // Generate embeddings for all entities (using their name and context)
509        let mut entity_count = 0;
510        for entity in graph.entities_mut() {
511            if entity.embedding.is_none() {
512                // Create entity text from name and entity type
513                let entity_text = format!("{} {}", entity.name, entity.entity_type);
514                let embedding = self.embedding_generator.generate_embedding(&entity_text);
515                entity.embedding = Some(embedding);
516                entity_count += 1;
517            }
518        }
519
520        tracing::debug!("Generated embeddings for {chunk_count} chunks and {entity_count} entities");
521
522        // Re-index the graph with new embeddings
523        self.index_graph(graph)?;
524
525        Ok(())
526    }
527
528    /// Parallel query processing for multiple queries
529    pub fn batch_query(
530        &mut self,
531        queries: &[&str],
532        graph: &KnowledgeGraph,
533    ) -> Result<Vec<Vec<SearchResult>>> {
534        #[cfg(feature = "parallel-processing")]
535        if let Some(processor) = &self.parallel_processor.clone() {
536            return self.batch_query_parallel(queries, graph, processor);
537        }
538
539        // Sequential fallback
540        queries
541            .iter()
542            .map(|&query| self.hybrid_query(query, graph))
543            .collect()
544    }
545
546    /// Parallel batch query processing with optimized workload distribution
547    #[cfg(feature = "parallel-processing")]
548    fn batch_query_parallel(
549        &mut self,
550        queries: &[&str],
551        graph: &KnowledgeGraph,
552        processor: &ParallelProcessor,
553    ) -> Result<Vec<Vec<SearchResult>>> {
554        if !processor.should_use_parallel(queries.len()) {
555            // Use sequential processing for small batches
556            return queries
557                .iter()
558                .map(|&query| self.hybrid_query(query, graph))
559                .collect();
560        }
561
562        #[cfg(feature = "parallel-processing")]
563        {
564            // For parallel query processing, we need to work around the borrowing limitations
565            // of the embedding generator. We'll use enhanced sequential processing with
566            // better monitoring and chunking for now.
567
568            let chunk_size = processor.config().chunk_batch_size.min(queries.len());
569            tracing::debug!(
570                "Processing {} queries with enhanced sequential approach (chunk size: {})",
571                queries.len(),
572                chunk_size
573            );
574
575            let mut all_results = Vec::new();
576            for &query in queries {
577                match self.hybrid_query(query, graph) {
578                    Ok(results) => all_results.push(results),
579                    Err(e) => {
580                        tracing::warn!("Error processing query '{query}': {e}");
581                        all_results.push(Vec::new());
582                    }
583                }
584            }
585
586            Ok(all_results)
587        }
588
589        #[cfg(not(feature = "parallel-processing"))]
590        {
591            // Sequential fallback when parallel processing is not available
592            queries
593                .iter()
594                .map(|&query| self.hybrid_query(query, graph))
595                .collect()
596        }
597    }
598
599    /// Analyze query to determine optimal retrieval strategy
600    pub fn analyze_query(&self, query: &str, graph: &KnowledgeGraph) -> Result<QueryAnalysis> {
601        let query_lower = query.to_lowercase();
602        let words: Vec<&str> = query_lower.split_whitespace().collect();
603
604        // Detect key entities mentioned in the query
605        let mut key_entities = Vec::new();
606        for entity in graph.entities() {
607            let entity_name_lower = entity.name.to_lowercase();
608            if words
609                .iter()
610                .any(|&word| entity_name_lower.contains(word) || word.contains(&entity_name_lower))
611            {
612                key_entities.push(entity.name.clone());
613            }
614        }
615
616        // Extract concepts (non-entity meaningful words)
617        let concepts: Vec<String> = words
618            .iter()
619            .filter(|&&word| word.len() > 3 && !self.is_stop_word(word))
620            .filter(|&&word| {
621                !key_entities.iter().any(|entity| {
622                    entity.to_lowercase().contains(word) || word.contains(&entity.to_lowercase())
623                })
624            })
625            .map(|&word| word.to_string())
626            .collect();
627
628        // Determine query type
629        let query_type = if !key_entities.is_empty() && key_entities.len() > 1 {
630            QueryType::Relationship
631        } else if !key_entities.is_empty() {
632            QueryType::EntityFocused
633        } else if self.has_abstract_concepts(&words) {
634            QueryType::Conceptual
635        } else if self.has_question_words(&words) {
636            QueryType::Exploratory
637        } else {
638            QueryType::Factual
639        };
640
641        // Determine intent
642        let intent = if words
643            .iter()
644            .any(|&w| ["overview", "summary", "general", "about"].contains(&w))
645        {
646            QueryIntent::Overview
647        } else if words
648            .iter()
649            .any(|&w| ["detailed", "specific", "exactly", "precise"].contains(&w))
650        {
651            QueryIntent::Detailed
652        } else if words
653            .iter()
654            .any(|&w| ["compare", "vs", "versus", "between", "difference"].contains(&w))
655        {
656            QueryIntent::Comparative
657        } else if words
658            .iter()
659            .any(|&w| ["cause", "why", "because", "lead", "result"].contains(&w))
660        {
661            QueryIntent::Causal
662        } else if words
663            .iter()
664            .any(|&w| ["when", "time", "before", "after", "during"].contains(&w))
665        {
666            QueryIntent::Temporal
667        } else {
668            QueryIntent::Detailed
669        };
670
671        // Calculate complexity score
672        let complexity_score = (words.len() as f32 * 0.1
673            + key_entities.len() as f32 * 0.3
674            + concepts.len() as f32 * 0.2)
675            .min(1.0);
676
677        Ok(QueryAnalysis {
678            query_type,
679            key_entities,
680            concepts,
681            intent,
682            complexity_score,
683        })
684    }
685
686    /// Execute adaptive retrieval based on query analysis
687    pub fn execute_adaptive_retrieval(
688        &mut self,
689        query: &str,
690        query_embedding: &[f32],
691        graph: &KnowledgeGraph,
692        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
693        analysis: &QueryAnalysis,
694    ) -> Result<Vec<SearchResult>> {
695        let mut all_results = Vec::new();
696
697        // Strategy weights based on query analysis
698        let (vector_weight, graph_weight, hierarchical_weight) =
699            self.calculate_strategy_weights(analysis);
700
701        // 1. Vector similarity search (always included)
702        if vector_weight > 0.0 {
703            let mut vector_results = self.vector_similarity_search(query_embedding, graph)?;
704            for result in &mut vector_results {
705                result.score *= vector_weight;
706            }
707            all_results.extend(vector_results);
708        }
709
710        // 2. Graph-based search (emphasized for entity and relationship queries)
711        if graph_weight > 0.0 {
712            let mut graph_results = match analysis.query_type {
713                QueryType::EntityFocused | QueryType::Relationship => {
714                    self.entity_centric_search(query_embedding, graph, &analysis.key_entities)?
715                }
716                _ => self.entity_based_search(query_embedding, graph)?,
717            };
718            for result in &mut graph_results {
719                result.score *= graph_weight;
720            }
721            all_results.extend(graph_results);
722        }
723
724        // 3. Hierarchical search (emphasized for overview and conceptual queries)
725        if hierarchical_weight > 0.0 && !document_trees.is_empty() {
726            let mut hierarchical_results =
727                self.hierarchical_search(query, document_trees, analysis)?;
728            for result in &mut hierarchical_results {
729                result.score *= hierarchical_weight;
730            }
731            all_results.extend(hierarchical_results);
732        }
733
734        // 4. Advanced graph traversal for complex queries
735        if analysis.complexity_score > 0.7 {
736            let traversal_results =
737                self.advanced_graph_traversal(query_embedding, graph, analysis)?;
738            all_results.extend(traversal_results);
739        }
740
741        // 5. Cross-strategy fusion for hybrid results
742        let fusion_results = self.cross_strategy_fusion(&all_results, analysis)?;
743        all_results.extend(fusion_results);
744
745        // Final ranking and deduplication
746        let final_results = self.adaptive_rank_and_deduplicate(all_results, analysis)?;
747
748        Ok(final_results.into_iter().take(self.config.top_k).collect())
749    }
750
751    /// Comprehensive search that combines multiple retrieval strategies (legacy)
752    pub fn comprehensive_search(
753        &self,
754        query_embedding: &[f32],
755        graph: &KnowledgeGraph,
756    ) -> Result<Vec<SearchResult>> {
757        let mut all_results = Vec::new();
758
759        // 1. Vector similarity search
760        let vector_results = self.vector_similarity_search(query_embedding, graph)?;
761        all_results.extend(vector_results);
762
763        // 2. Entity-based search
764        let entity_results = self.entity_based_search(query_embedding, graph)?;
765        all_results.extend(entity_results);
766
767        // 3. Graph traversal search
768        let graph_results = self.graph_traversal_search(query_embedding, graph)?;
769        all_results.extend(graph_results);
770
771        // Deduplicate and rank results
772        let final_results = self.rank_and_deduplicate(all_results)?;
773
774        Ok(final_results.into_iter().take(self.config.top_k).collect())
775    }
776
777    /// Vector similarity search
778    fn vector_similarity_search(
779        &self,
780        query_embedding: &[f32],
781        graph: &KnowledgeGraph,
782    ) -> Result<Vec<SearchResult>> {
783        let mut results = Vec::new();
784
785        // Search for similar vectors
786        let similar_vectors = self
787            .vector_index
788            .search(query_embedding, self.config.top_k * 2)?;
789
790        for (id, similarity) in similar_vectors {
791            if similarity >= self.config.similarity_threshold {
792                let result = if id.starts_with("entity:") {
793                    let entity_id = EntityId::new(id.strip_prefix("entity:").unwrap().to_string());
794                    graph.get_entity(&entity_id).map(|entity| SearchResult {
795                        id: entity.id.to_string(),
796                        content: entity.name.clone(),
797                        score: similarity * self.config.entity_weight,
798                        result_type: ResultType::Entity,
799                        entities: vec![entity.name.clone()],
800                        source_chunks: entity
801                            .mentions
802                            .iter()
803                            .map(|m| m.chunk_id.to_string())
804                            .collect(),
805                    })
806                } else if id.starts_with("chunk:") {
807                    let chunk_id = ChunkId::new(id.strip_prefix("chunk:").unwrap().to_string());
808                    if let Some(chunk) = graph.get_chunk(&chunk_id) {
809                        let entity_names: Vec<String> = chunk
810                            .entities
811                            .iter()
812                            .filter_map(|eid| graph.get_entity(eid))
813                            .map(|e| e.name.clone())
814                            .collect();
815
816                        Some(SearchResult {
817                            id: chunk.id.to_string(),
818                            content: chunk.content.clone(),
819                            score: similarity * self.config.chunk_weight,
820                            result_type: ResultType::Chunk,
821                            entities: entity_names,
822                            source_chunks: vec![chunk.id.to_string()],
823                        })
824                    } else {
825                        None
826                    }
827                } else {
828                    None
829                };
830
831                if let Some(search_result) = result {
832                    results.push(search_result);
833                }
834            }
835        }
836
837        Ok(results)
838    }
839
840    /// Entity-based search with graph expansion
841    fn entity_based_search(
842        &self,
843        query_embedding: &[f32],
844        graph: &KnowledgeGraph,
845    ) -> Result<Vec<SearchResult>> {
846        let mut results = Vec::new();
847        let mut visited = HashSet::new();
848
849        // Find most relevant entities
850        let entity_similarities = self.find_relevant_entities(query_embedding, graph)?;
851
852        for (entity_id, similarity) in entity_similarities.into_iter().take(5) {
853            if visited.contains(&entity_id) {
854                continue;
855            }
856
857            // Expand through graph relationships
858            let expanded_entities = self.expand_through_relationships(
859                &entity_id,
860                graph,
861                self.config.max_expansion_depth,
862                &mut visited,
863            )?;
864
865            for expanded_entity_id in expanded_entities {
866                if let Some(entity) = graph.get_entity(&expanded_entity_id) {
867                    let expansion_penalty = if expanded_entity_id == entity_id {
868                        1.0
869                    } else {
870                        0.8
871                    };
872
873                    results.push(SearchResult {
874                        id: entity.id.to_string(),
875                        content: format!("{} ({})", entity.name, entity.entity_type),
876                        score: similarity * expansion_penalty * self.config.entity_weight,
877                        result_type: ResultType::Entity,
878                        entities: vec![entity.name.clone()],
879                        source_chunks: entity
880                            .mentions
881                            .iter()
882                            .map(|m| m.chunk_id.to_string())
883                            .collect(),
884                    });
885                }
886            }
887        }
888
889        Ok(results)
890    }
891
892    /// Calculate strategy weights based on query analysis
893    fn calculate_strategy_weights(&self, analysis: &QueryAnalysis) -> (f32, f32, f32) {
894        match (&analysis.query_type, &analysis.intent) {
895            // For entity-focused queries, balance vector (chunks) and graph (entities) equally
896            // This ensures we get both entity information AND contextual chunks
897            (QueryType::EntityFocused, _) => (0.5, 0.4, 0.1),
898            (QueryType::Relationship, _) => (0.3, 0.6, 0.1),
899            (QueryType::Conceptual, QueryIntent::Overview) => (0.2, 0.2, 0.6),
900            (QueryType::Conceptual, _) => (0.4, 0.3, 0.3),
901            (QueryType::Exploratory, QueryIntent::Overview) => (0.3, 0.2, 0.5),
902            (QueryType::Exploratory, _) => (0.4, 0.4, 0.2),
903            (QueryType::Factual, _) => (0.6, 0.3, 0.1),
904        }
905    }
906
907    /// Entity-centric search focusing on specific entities
908    fn entity_centric_search(
909        &mut self,
910        query_embedding: &[f32],
911        graph: &KnowledgeGraph,
912        key_entities: &[String],
913    ) -> Result<Vec<SearchResult>> {
914        let mut results = Vec::new();
915        let mut visited = HashSet::new();
916
917        for entity_name in key_entities {
918            // Find the entity in the graph
919            if let Some(entity) = graph
920                .entities()
921                .find(|e| e.name.eq_ignore_ascii_case(entity_name))
922            {
923                // Add the entity itself
924                results.push(SearchResult {
925                    id: entity.id.to_string(),
926                    content: format!("{} ({})", entity.name, entity.entity_type),
927                    score: 0.9, // High score for exact entity match
928                    result_type: ResultType::Entity,
929                    entities: vec![entity.name.clone()],
930                    source_chunks: entity
931                        .mentions
932                        .iter()
933                        .map(|m| m.chunk_id.to_string())
934                        .collect(),
935                });
936
937                // Get entity neighbors with weighted scores
938                let neighbors = graph.get_neighbors(&entity.id);
939                for (neighbor, relationship) in neighbors {
940                    if !visited.contains(&neighbor.id) {
941                        visited.insert(neighbor.id.clone());
942
943                        // Calculate relationship relevance
944                        let rel_embedding = self
945                            .embedding_generator
946                            .generate_embedding(&relationship.relation_type);
947                        let rel_similarity =
948                            VectorUtils::cosine_similarity(query_embedding, &rel_embedding);
949
950                        results.push(SearchResult {
951                            id: neighbor.id.to_string(),
952                            content: format!("{} ({})", neighbor.name, neighbor.entity_type),
953                            score: 0.7 * relationship.confidence * (1.0 + rel_similarity),
954                            result_type: ResultType::Entity,
955                            entities: vec![neighbor.name.clone()],
956                            source_chunks: neighbor
957                                .mentions
958                                .iter()
959                                .map(|m| m.chunk_id.to_string())
960                                .collect(),
961                        });
962                    }
963                }
964            }
965        }
966
967        Ok(results)
968    }
969
970    /// Hierarchical search using document trees
971    fn hierarchical_search(
972        &self,
973        query: &str,
974        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
975        analysis: &QueryAnalysis,
976    ) -> Result<Vec<SearchResult>> {
977        let mut results = Vec::new();
978        let max_results_per_tree = match analysis.intent {
979            QueryIntent::Overview => 3,
980            QueryIntent::Detailed => 8,
981            _ => 5,
982        };
983
984        for (doc_id, tree) in document_trees.iter() {
985            let tree_summaries = tree.query(query, max_results_per_tree)?;
986
987            for (idx, summary) in tree_summaries.iter().enumerate() {
988                // Convert tree query result to search result
989                let level_bonus = match analysis.intent {
990                    QueryIntent::Overview => 0.3,
991                    QueryIntent::Detailed => 0.2,
992                    _ => 0.0,
993                };
994
995                results.push(SearchResult {
996                    id: format!("{}:summary:{}", doc_id, idx),
997                    content: summary.summary.clone(),
998                    score: summary.score + level_bonus,
999                    result_type: ResultType::HierarchicalSummary,
1000                    entities: Vec::new(),
1001                    source_chunks: vec![doc_id.to_string()],
1002                });
1003            }
1004        }
1005
1006        Ok(results)
1007    }
1008
1009    /// Advanced graph traversal for complex queries
1010    fn advanced_graph_traversal(
1011        &self,
1012        query_embedding: &[f32],
1013        graph: &KnowledgeGraph,
1014        analysis: &QueryAnalysis,
1015    ) -> Result<Vec<SearchResult>> {
1016        let mut results = Vec::new();
1017
1018        if analysis.query_type == QueryType::Relationship && analysis.key_entities.len() >= 2 {
1019            // Find paths between entities
1020            results.extend(self.find_entity_paths(graph, &analysis.key_entities)?);
1021        }
1022
1023        if analysis.complexity_score > 0.8 {
1024            // Community detection for exploratory queries
1025            results.extend(self.community_based_search(query_embedding, graph)?);
1026        }
1027
1028        Ok(results)
1029    }
1030
1031    /// Cross-strategy fusion to create hybrid results
1032    fn cross_strategy_fusion(
1033        &self,
1034        all_results: &[SearchResult],
1035        _analysis: &QueryAnalysis,
1036    ) -> Result<Vec<SearchResult>> {
1037        let mut fusion_results = Vec::new();
1038
1039        // Group results by content similarity
1040        let mut content_groups: HashMap<String, Vec<&SearchResult>> = HashMap::new();
1041
1042        for result in all_results {
1043            let content_key = Self::safe_truncate(&result.content, 50);
1044
1045            content_groups.entry(content_key).or_default().push(result);
1046        }
1047
1048        // Create fusion results for groups with multiple strategies
1049        for (content_key, group) in content_groups {
1050            if group.len() > 1 {
1051                let types: HashSet<_> = group.iter().map(|r| &r.result_type).collect();
1052                if types.len() > 1 {
1053                    // This content was found by multiple strategies - boost confidence
1054                    let avg_score = group.iter().map(|r| r.score).sum::<f32>() / group.len() as f32;
1055                    let boost = 0.2 * (types.len() - 1) as f32;
1056
1057                    let all_entities: HashSet<_> =
1058                        group.iter().flat_map(|r| r.entities.iter()).collect();
1059
1060                    let all_chunks: HashSet<_> =
1061                        group.iter().flat_map(|r| r.source_chunks.iter()).collect();
1062
1063                    fusion_results.push(SearchResult {
1064                        id: format!(
1065                            "fusion_{}",
1066                            content_key.chars().take(10).collect::<String>()
1067                        ),
1068                        content: group[0].content.clone(),
1069                        score: (avg_score + boost).min(1.0),
1070                        result_type: ResultType::Hybrid,
1071                        entities: all_entities.into_iter().cloned().collect(),
1072                        source_chunks: all_chunks.into_iter().cloned().collect(),
1073                    });
1074                }
1075            }
1076        }
1077
1078        Ok(fusion_results)
1079    }
1080
1081    /// Adaptive ranking and deduplication based on query analysis
1082    fn adaptive_rank_and_deduplicate(
1083        &self,
1084        mut results: Vec<SearchResult>,
1085        analysis: &QueryAnalysis,
1086    ) -> Result<Vec<SearchResult>> {
1087        // Apply query-specific score adjustments
1088        for result in &mut results {
1089            match analysis.query_type {
1090                QueryType::EntityFocused => {
1091                    if result.result_type == ResultType::Entity {
1092                        result.score *= 1.2;
1093                    }
1094                }
1095                QueryType::Conceptual => {
1096                    if result.result_type == ResultType::HierarchicalSummary {
1097                        result.score *= 1.1;
1098                    }
1099                }
1100                QueryType::Relationship => {
1101                    if result.entities.len() > 1 {
1102                        result.score *= 1.15;
1103                    }
1104                }
1105                _ => {}
1106            }
1107
1108            // Boost results that contain key entities
1109            for entity in &analysis.key_entities {
1110                if result
1111                    .entities
1112                    .iter()
1113                    .any(|e| e.eq_ignore_ascii_case(entity))
1114                {
1115                    result.score *= 1.1;
1116                }
1117            }
1118        }
1119
1120        // Sort by adjusted scores
1121        results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
1122
1123        // Diversity-aware deduplication
1124        let mut deduplicated = Vec::new();
1125        let mut seen_content = HashSet::new();
1126        let mut type_counts: HashMap<ResultType, usize> = HashMap::new();
1127
1128        for result in results {
1129            let content_signature = self.create_content_signature(&result.content);
1130
1131            if !seen_content.contains(&content_signature) {
1132                let type_count = type_counts.get(&result.result_type).unwrap_or(&0);
1133
1134                // Ensure diversity across result types
1135                let max_per_type = match result.result_type {
1136                    ResultType::Entity => self.config.top_k / 3,
1137                    ResultType::Chunk => self.config.top_k / 2,
1138                    ResultType::HierarchicalSummary => self.config.top_k / 4,
1139                    ResultType::Hybrid => self.config.top_k / 4,
1140                    ResultType::GraphPath => self.config.top_k / 5,
1141                };
1142
1143                if *type_count < max_per_type {
1144                    seen_content.insert(content_signature);
1145                    *type_counts.entry(result.result_type.clone()).or_insert(0) += 1;
1146                    deduplicated.push(result);
1147                }
1148            }
1149        }
1150
1151        Ok(deduplicated)
1152    }
1153
1154    /// Find paths between entities in the graph
1155    fn find_entity_paths(
1156        &self,
1157        graph: &KnowledgeGraph,
1158        key_entities: &[String],
1159    ) -> Result<Vec<SearchResult>> {
1160        let mut results = Vec::new();
1161
1162        if key_entities.len() < 2 {
1163            return Ok(results);
1164        }
1165
1166        // Simple path finding between first two entities
1167        if let (Some(source), Some(target)) = (
1168            graph
1169                .entities()
1170                .find(|e| e.name.eq_ignore_ascii_case(&key_entities[0])),
1171            graph
1172                .entities()
1173                .find(|e| e.name.eq_ignore_ascii_case(&key_entities[1])),
1174        ) {
1175            let path_description =
1176                format!("Connection between {} and {}", source.name, target.name);
1177            let neighbors_source = graph.get_neighbors(&source.id);
1178            let neighbors_target = graph.get_neighbors(&target.id);
1179
1180            // Check for direct connection
1181            if neighbors_source
1182                .iter()
1183                .any(|(neighbor, _)| neighbor.id == target.id)
1184            {
1185                results.push(SearchResult {
1186                    id: format!("path_{}_{}", source.id, target.id),
1187                    content: format!("Direct relationship: {path_description}"),
1188                    score: 0.8,
1189                    result_type: ResultType::GraphPath,
1190                    entities: vec![source.name.clone(), target.name.clone()],
1191                    source_chunks: Vec::new(),
1192                });
1193            }
1194
1195            // Check for indirect connections through common neighbors
1196            for (neighbor_s, rel_s) in &neighbors_source {
1197                for (neighbor_t, rel_t) in &neighbors_target {
1198                    if neighbor_s.id == neighbor_t.id {
1199                        results.push(SearchResult {
1200                            id: format!("path_{}_{}_{}", source.id, neighbor_s.id, target.id),
1201                            content: format!(
1202                                "Indirect relationship via {}: {} -> {} -> {}",
1203                                neighbor_s.name, source.name, neighbor_s.name, target.name
1204                            ),
1205                            score: 0.6 * rel_s.confidence * rel_t.confidence,
1206                            result_type: ResultType::GraphPath,
1207                            entities: vec![
1208                                source.name.clone(),
1209                                neighbor_s.name.clone(),
1210                                target.name.clone(),
1211                            ],
1212                            source_chunks: Vec::new(),
1213                        });
1214                    }
1215                }
1216            }
1217        }
1218
1219        Ok(results)
1220    }
1221
1222    /// Community-based search for exploratory queries
1223    fn community_based_search(
1224        &self,
1225        query_embedding: &[f32],
1226        graph: &KnowledgeGraph,
1227    ) -> Result<Vec<SearchResult>> {
1228        let mut results = Vec::new();
1229        let mut entity_scores: HashMap<String, f32> = HashMap::new();
1230
1231        // Calculate centrality-like scores for entities
1232        for entity in graph.entities() {
1233            let neighbors = graph.get_neighbors(&entity.id);
1234            let centrality_score = neighbors.len() as f32 * 0.1;
1235
1236            // Combine with embedding similarity
1237            if let Some(embedding) = &entity.embedding {
1238                let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1239                entity_scores.insert(entity.id.to_string(), centrality_score + similarity);
1240            }
1241        }
1242
1243        // Select top entities by combined score
1244        let mut sorted_entities: Vec<_> = entity_scores.iter().collect();
1245        sorted_entities.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());
1246
1247        for (entity_id, score) in sorted_entities.iter().take(3) {
1248            if let Some(entity) = graph.entities().find(|e| e.id.to_string() == **entity_id) {
1249                // Get context from chunks where this entity is mentioned
1250                let mut entity_context = String::new();
1251                for mention in entity.mentions.iter().take(2) {
1252                    if let Some(chunk) = graph.chunks().find(|c| c.id == mention.chunk_id) {
1253                        let chunk_excerpt = if chunk.content.len() > 200 {
1254                            format!("{}...", &chunk.content[..200])
1255                        } else {
1256                            chunk.content.clone()
1257                        };
1258                        entity_context.push_str(&chunk_excerpt);
1259                        entity_context.push(' ');
1260                    }
1261                }
1262
1263                // If no context found, provide a meaningful description
1264                if entity_context.is_empty() {
1265                    entity_context = format!(
1266                        "{} is a {} character in the story.",
1267                        entity.name, entity.entity_type
1268                    );
1269                }
1270
1271                results.push(SearchResult {
1272                    id: entity.id.to_string(),
1273                    content: entity_context,
1274                    score: **score,
1275                    result_type: ResultType::Entity,
1276                    entities: vec![entity.name.clone()],
1277                    source_chunks: entity
1278                        .mentions
1279                        .iter()
1280                        .map(|m| m.chunk_id.to_string())
1281                        .collect(),
1282                });
1283            }
1284        }
1285
1286        Ok(results)
1287    }
1288
1289    /// Helper method to detect abstract concepts
1290    fn has_abstract_concepts(&self, words: &[&str]) -> bool {
1291        const ABSTRACT_INDICATORS: &[&str] = &[
1292            "concept",
1293            "idea",
1294            "theory",
1295            "principle",
1296            "philosophy",
1297            "meaning",
1298            "understanding",
1299            "knowledge",
1300            "wisdom",
1301            "truth",
1302            "beauty",
1303            "justice",
1304        ];
1305        words
1306            .iter()
1307            .any(|&word| ABSTRACT_INDICATORS.contains(&word))
1308    }
1309
1310    /// Helper method to detect question words
1311    fn has_question_words(&self, words: &[&str]) -> bool {
1312        const QUESTION_WORDS: &[&str] = &[
1313            "what", "how", "why", "when", "where", "who", "which", "explain", "describe",
1314        ];
1315        words.iter().any(|&word| QUESTION_WORDS.contains(&word))
1316    }
1317
1318    /// Create content signature for deduplication
1319    fn create_content_signature(&self, content: &str) -> String {
1320        // Simple signature based on first 50 characters and length
1321        let prefix = Self::safe_truncate(content, 50);
1322        format!(
1323            "{}_{}",
1324            prefix
1325                .chars()
1326                .filter(|c| c.is_alphanumeric())
1327                .collect::<String>(),
1328            content.len()
1329        )
1330    }
1331
1332    /// Graph traversal search for path-based results (legacy)
1333    fn graph_traversal_search(
1334        &self,
1335        _query_embedding: &[f32],
1336        _graph: &KnowledgeGraph,
1337    ) -> Result<Vec<SearchResult>> {
1338        // Placeholder for graph traversal algorithms
1339        // This would implement algorithms like:
1340        // - Random walks
1341        // - Shortest paths between relevant entities
1342        // - Community detection
1343        // - PageRank-style scoring
1344
1345        Ok(Vec::new())
1346    }
1347
1348    /// Find entities most relevant to the query
1349    fn find_relevant_entities(
1350        &self,
1351        query_embedding: &[f32],
1352        graph: &KnowledgeGraph,
1353    ) -> Result<Vec<(EntityId, f32)>> {
1354        let mut similarities = Vec::new();
1355
1356        for entity in graph.entities() {
1357            if let Some(embedding) = &entity.embedding {
1358                let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1359                if similarity >= self.config.similarity_threshold {
1360                    similarities.push((entity.id.clone(), similarity));
1361                }
1362            }
1363        }
1364
1365        // Sort by similarity
1366        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
1367
1368        Ok(similarities)
1369    }
1370
1371    /// Expand search through graph relationships
1372    fn expand_through_relationships(
1373        &self,
1374        start_entity: &EntityId,
1375        graph: &KnowledgeGraph,
1376        max_depth: usize,
1377        visited: &mut HashSet<EntityId>,
1378    ) -> Result<Vec<EntityId>> {
1379        let mut results = Vec::new();
1380        let mut current_level = vec![start_entity.clone()];
1381        visited.insert(start_entity.clone());
1382
1383        for _depth in 0..max_depth {
1384            let mut next_level = Vec::new();
1385
1386            for entity_id in &current_level {
1387                results.push(entity_id.clone());
1388
1389                // Get neighbors through graph relationships
1390                let neighbors = graph.get_neighbors(entity_id);
1391                for (neighbor_entity, _relationship) in neighbors {
1392                    if !visited.contains(&neighbor_entity.id) {
1393                        visited.insert(neighbor_entity.id.clone());
1394                        next_level.push(neighbor_entity.id.clone());
1395                    }
1396                }
1397            }
1398
1399            if next_level.is_empty() {
1400                break;
1401            }
1402
1403            current_level = next_level;
1404        }
1405
1406        Ok(results)
1407    }
1408
1409    /// Simple stop word detection (English)
1410    fn is_stop_word(&self, word: &str) -> bool {
1411        const STOP_WORDS: &[&str] = &[
1412            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
1413            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
1414            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
1415            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
1416            "go", "me",
1417        ];
1418        STOP_WORDS.contains(&word)
1419    }
1420
1421    /// Rank and deduplicate search results (legacy)
1422    fn rank_and_deduplicate(&self, mut results: Vec<SearchResult>) -> Result<Vec<SearchResult>> {
1423        // Sort by score descending
1424        results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
1425
1426        // Deduplicate by ID
1427        let mut seen_ids = HashSet::new();
1428        let mut deduplicated = Vec::new();
1429
1430        for result in results {
1431            if !seen_ids.contains(&result.id) {
1432                seen_ids.insert(result.id.clone());
1433                deduplicated.push(result);
1434            }
1435        }
1436
1437        Ok(deduplicated)
1438    }
1439
1440    /// Vector-based search
1441    pub fn vector_search(&mut self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1442        let query_embedding = self.embedding_generator.generate_embedding(query);
1443        let similar_vectors = self.vector_index.search(&query_embedding, max_results)?;
1444
1445        let mut results = Vec::new();
1446        for (id, score) in similar_vectors {
1447            results.push(SearchResult {
1448                id: id.clone(),
1449                content: format!("Vector result for: {id}"),
1450                score,
1451                result_type: ResultType::Chunk,
1452                entities: Vec::new(),
1453                source_chunks: vec![id],
1454            });
1455        }
1456
1457        Ok(results)
1458    }
1459
1460    /// Graph-based search
1461    pub fn graph_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1462        // Simplified graph search - in a real implementation this would traverse the graph
1463        let mut results = Vec::new();
1464        results.push(SearchResult {
1465            id: format!("graph_result_{}", query.len()),
1466            content: format!("Graph-based result for: {query}"),
1467            score: 0.7,
1468            result_type: ResultType::GraphPath,
1469            entities: Vec::new(),
1470            source_chunks: Vec::new(),
1471        });
1472
1473        Ok(results.into_iter().take(max_results).collect())
1474    }
1475
1476    /// Hierarchical search (public wrapper)
1477    pub fn public_hierarchical_search(
1478        &self,
1479        query: &str,
1480        max_results: usize,
1481    ) -> Result<Vec<SearchResult>> {
1482        // Simplified hierarchical search - in a real implementation this would use document trees
1483        let mut results = Vec::new();
1484        results.push(SearchResult {
1485            id: format!("hierarchical_result_{}", query.len()),
1486            content: format!("Hierarchical result for: {query}"),
1487            score: 0.8,
1488            result_type: ResultType::HierarchicalSummary,
1489            entities: Vec::new(),
1490            source_chunks: Vec::new(),
1491        });
1492
1493        Ok(results.into_iter().take(max_results).collect())
1494    }
1495
1496    /// BM25-based search
1497    pub fn bm25_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1498        // Simplified BM25 search - in a real implementation this would use proper BM25 scoring
1499        let mut results = Vec::new();
1500        results.push(SearchResult {
1501            id: format!("bm25_result_{}", query.len()),
1502            content: format!("BM25 result for: {query}"),
1503            score: 0.75,
1504            result_type: ResultType::Chunk,
1505            entities: Vec::new(),
1506            source_chunks: Vec::new(),
1507        });
1508
1509        Ok(results.into_iter().take(max_results).collect())
1510    }
1511
1512    /// Get retrieval statistics
1513    pub fn get_statistics(&self) -> RetrievalStatistics {
1514        let vector_stats = self.vector_index.statistics();
1515
1516        RetrievalStatistics {
1517            indexed_vectors: vector_stats.vector_count,
1518            vector_dimension: vector_stats.dimension,
1519            index_built: vector_stats.index_built,
1520            config: self.config.clone(),
1521        }
1522    }
1523
1524    /// Safely truncate a string to a maximum byte length, respecting UTF-8 character boundaries
1525    fn safe_truncate(s: &str, max_bytes: usize) -> String {
1526        if s.len() <= max_bytes {
1527            return s.to_string();
1528        }
1529
1530        // Find the largest valid character boundary <= max_bytes
1531        let mut end_idx = max_bytes;
1532        while end_idx > 0 && !s.is_char_boundary(end_idx) {
1533            end_idx -= 1;
1534        }
1535
1536        s[..end_idx].to_string()
1537    }
1538
    /// Save retrieval system state to JSON file.
    ///
    /// Serializes a diagnostic snapshot — the retrieval config, vector-index
    /// statistics, embedding-generator info, and whether a parallel processor
    /// is configured — using the `json` crate's DOM builder, then writes it to
    /// `file_path`. Any I/O error from `fs::write` propagates via `?`.
    pub fn save_state_to_json(&self, file_path: &str) -> Result<()> {
        use std::fs;

        let mut json_data = json::JsonValue::new_object();

        // Metadata: format version, creation timestamp, and active config.
        json_data["metadata"] = json::object! {
            "format_version" => "1.0",
            "created_at" => chrono::Utc::now().to_rfc3339(),
            "config" => json::object! {
                "top_k" => self.config.top_k,
                "similarity_threshold" => self.config.similarity_threshold,
                "max_expansion_depth" => self.config.max_expansion_depth,
                "entity_weight" => self.config.entity_weight,
                "chunk_weight" => self.config.chunk_weight,
                "graph_weight" => self.config.graph_weight
            }
        };

        // Vector index statistics (counts and norm summary).
        let vector_stats = self.vector_index.statistics();
        json_data["vector_index"] = json::object! {
            "vector_count" => vector_stats.vector_count,
            "dimension" => vector_stats.dimension,
            "index_built" => vector_stats.index_built,
            "min_norm" => vector_stats.min_norm,
            "max_norm" => vector_stats.max_norm,
            "avg_norm" => vector_stats.avg_norm
        };

        // Embedding generator info.
        json_data["embedding_generator"] = json::object! {
            "dimension" => self.embedding_generator.dimension(),
            "cached_words" => self.embedding_generator.cached_words()
        };

        // Record whether a parallel processor is configured; this compiles to
        // a constant `false` when the feature is disabled.
        #[cfg(feature = "parallel-processing")]
        {
            json_data["parallel_enabled"] = self.parallel_processor.is_some().into();
        }
        #[cfg(not(feature = "parallel-processing"))]
        {
            json_data["parallel_enabled"] = false.into();
        }

        // Persist the snapshot.
        fs::write(file_path, json_data.dump())?;
        tracing::info!("Retrieval system state saved to {file_path}");

        Ok(())
    }
1592}
1593
1594/// Statistics about the retrieval system
1595#[derive(Debug)]
1596pub struct RetrievalStatistics {
1597    /// Number of vectors indexed in the system
1598    pub indexed_vectors: usize,
1599    /// Dimensionality of the vector embeddings
1600    pub vector_dimension: usize,
1601    /// Whether the vector index has been built
1602    pub index_built: bool,
1603    /// Current retrieval configuration
1604    pub config: RetrievalConfig,
1605}
1606
1607impl RetrievalStatistics {
1608    /// Print retrieval statistics
1609    #[allow(dead_code)]
1610    pub fn print(&self) {
1611        tracing::info!("Retrieval System Statistics:");
1612        tracing::info!("  Indexed vectors: {}", self.indexed_vectors);
1613        tracing::info!("  Vector dimension: {}", self.vector_dimension);
1614        tracing::info!("  Index built: {}", self.index_built);
1615        tracing::info!("  Configuration:");
1616        tracing::info!("    Top K: {}", self.config.top_k);
1617        tracing::info!(
1618            "    Similarity threshold: {:.2}",
1619            self.config.similarity_threshold
1620        );
1621        tracing::info!(
1622            "    Max expansion depth: {}",
1623            self.config.max_expansion_depth
1624        );
1625        tracing::info!("    Entity weight: {:.2}", self.config.entity_weight);
1626        tracing::info!("    Chunk weight: {:.2}", self.config.chunk_weight);
1627        tracing::info!("    Graph weight: {:.2}", self.config.graph_weight);
1628    }
1629}
1630
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{config::Config, core::KnowledgeGraph};

    /// Constructing the retrieval system from a default config succeeds.
    #[test]
    fn test_retrieval_system_creation() {
        assert!(RetrievalSystem::new(&Config::default()).is_ok());
    }

    /// A placeholder query returns a non-empty result set whose first
    /// entry echoes the query text.
    #[test]
    fn test_query_placeholder() {
        let retrieval = RetrievalSystem::new(&Config::default()).unwrap();

        let results = retrieval
            .query("test query")
            .expect("query should succeed");
        assert!(!results.is_empty());
        assert!(results[0].contains("test query"));
    }

    /// Indexing an empty knowledge graph is accepted.
    #[test]
    fn test_graph_indexing() {
        let mut retrieval = RetrievalSystem::new(&Config::default()).unwrap();
        assert!(retrieval.index_graph(&KnowledgeGraph::new()).is_ok());
    }
}