//! Retrieval subsystem for GraphRAG (graphrag_core/retrieval/mod.rs).
/// Adaptive retrieval strategy selection (see [`QueryAnalysis`] below)
pub mod adaptive;
/// BM25 text retrieval implementation for keyword-based search
pub mod bm25;
/// Causal chain analysis for discovering cause-effect paths (Phase 2.3)
pub mod causal_analysis;
/// Enriched metadata-aware retrieval
pub mod enriched;
/// HippoRAG Personalized PageRank retrieval
#[cfg(feature = "pagerank")]
pub mod hipporag_ppr;
/// Hybrid retrieval combining multiple search strategies
pub mod hybrid;
/// PageRank-based retrieval (types re-exported under the `pagerank` feature)
pub mod pagerank_retrieval;
/// Symbolic anchoring for conceptual queries (Phase 2.1 - CatRAG)
pub mod symbolic_anchoring;
16
17#[cfg(feature = "parallel-processing")]
18use crate::parallel::ParallelProcessor;
19use crate::{
20    config::Config,
21    core::{ChunkId, EntityId, KnowledgeGraph},
22    summarization::DocumentTree,
23    vector::{EmbeddingGenerator, VectorUtils},
24    Result,
25};
26use std::collections::{HashMap, HashSet};
27
28pub use bm25::{BM25Result, BM25Retriever, Document as BM25Document};
29pub use enriched::{EnrichedRetrievalConfig, EnrichedRetriever};
30pub use hybrid::{FusionMethod, HybridConfig, HybridRetriever, HybridSearchResult};
31
32#[cfg(feature = "pagerank")]
33pub use pagerank_retrieval::{PageRankRetrievalSystem, ScoredResult};
34
35#[cfg(feature = "pagerank")]
36pub use hipporag_ppr::{Fact, HippoRAGConfig, HippoRAGRetriever};
37
38use crate::vector::store::VectorStore;
39
/// Retrieval system for querying the knowledge graph
///
/// Combines a pluggable vector store with an embedding generator plus
/// optional, feature-gated retrievers that are attached after construction
/// via the `initialize_*` methods.
pub struct RetrievalSystem {
    /// Backing vector store holding entity/chunk embeddings
    vector_store: std::sync::Arc<dyn VectorStore>,
    /// Generates embeddings for queries, chunks, and entities
    embedding_generator: EmbeddingGenerator,
    /// Tuning parameters (top-k, thresholds, scoring weights)
    config: RetrievalConfig,
    /// Optional parallel work dispatcher (set via `with_parallel_processing`)
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    /// Optional PageRank retriever (set via `initialize_pagerank`)
    #[cfg(feature = "pagerank")]
    pagerank_retriever: Option<PageRankRetrievalSystem>,
    /// Optional metadata-aware retriever (set via `initialize_enriched`)
    enriched_retriever: Option<EnrichedRetriever>,
    /// Whether concept-based chunk filtering is active
    #[cfg(feature = "lazygraphrag")]
    concept_filtering_enabled: bool,
}
53
54impl RetrievalSystem {
55    /// Create a new retrieval system
56    pub fn new(config: &Config) -> Result<Self> {
57        let retrieval_config = RetrievalConfig {
58            top_k: config.retrieval.top_k,
59            similarity_threshold: 0.35,
60            max_expansion_depth: 2,
61            entity_weight: 0.4,
62            chunk_weight: 0.4,
63            graph_weight: 0.2,
64            #[cfg(feature = "lazygraphrag")]
65            use_concept_filtering: false,
66            #[cfg(feature = "lazygraphrag")]
67            concept_top_k: 20,
68        };
69
70        // Default to MemoryVectorStore for now (mimics old behavior)
71        // In the future, this will select based on Config (LanceDB, Qdrant, etc.)
72        let vector_store =
73            std::sync::Arc::new(crate::vector::memory_store::MemoryVectorStore::new());
74
75        Ok(Self {
76            vector_store,
77            embedding_generator: EmbeddingGenerator::new(128), // 128-dimensional embeddings
78            config: retrieval_config,
79            #[cfg(feature = "parallel-processing")]
80            parallel_processor: None,
81            #[cfg(feature = "pagerank")]
82            pagerank_retriever: None,
83            enriched_retriever: None,
84            #[cfg(feature = "lazygraphrag")]
85            concept_filtering_enabled: false,
86        })
87    }
88}
89
/// Configuration parameters for the retrieval system
///
/// The three scoring weights are intended to be combined; the values used
/// throughout this module sum to 1.0.
#[derive(Debug, Clone)]
pub struct RetrievalConfig {
    /// Maximum number of results to return
    pub top_k: usize,
    /// Minimum similarity score threshold for results (typically -1.0 to 1.0)
    pub similarity_threshold: f32,
    /// Maximum depth for graph relationship expansion
    pub max_expansion_depth: usize,
    /// Weight for entity-based results in scoring (0.0 to 1.0)
    pub entity_weight: f32,
    /// Weight for chunk-based results in scoring (0.0 to 1.0)
    pub chunk_weight: f32,
    /// Weight for graph-based results in scoring (0.0 to 1.0)
    pub graph_weight: f32,
    /// Enable concept-based chunk filtering (requires lazygraphrag feature)
    #[cfg(feature = "lazygraphrag")]
    pub use_concept_filtering: bool,
    /// Top-K concepts to select for filtering (requires lazygraphrag feature)
    #[cfg(feature = "lazygraphrag")]
    pub concept_top_k: usize,
}
112
impl Default for RetrievalConfig {
    // NOTE(review): similarity_threshold defaults to 0.7 here but
    // RetrievalSystem::new() overrides it to 0.35 — confirm which value
    // is the intended baseline.
    fn default() -> Self {
        Self {
            top_k: 10,
            similarity_threshold: 0.7,
            max_expansion_depth: 2,
            entity_weight: 0.4,
            chunk_weight: 0.4,
            graph_weight: 0.2,
            #[cfg(feature = "lazygraphrag")]
            use_concept_filtering: false,
            #[cfg(feature = "lazygraphrag")]
            concept_top_k: 20,
        }
    }
}
129
/// A search result containing relevant information
///
/// Produced by the various retrieval strategies and consumed by
/// [`ExplainedAnswer::from_results`] to build source references.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Unique identifier for this result
    pub id: String,
    /// Content or description of the result
    pub content: String,
    /// Relevance score (higher is better)
    pub score: f32,
    /// Type of result (entity, chunk, graph path, etc.)
    pub result_type: ResultType,
    /// Names of entities associated with this result
    pub entities: Vec<String>,
    /// IDs of source chunks this result is derived from
    pub source_chunks: Vec<String>,
}
146
/// Type of search result indicating the retrieval strategy used
///
/// Mapped to a [`SourceType`] when building explained answers.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ResultType {
    /// Result from entity-based retrieval
    Entity,
    /// Result from text chunk retrieval
    Chunk,
    /// Result from graph path traversal
    GraphPath,
    /// Result from hierarchical document summarization
    HierarchicalSummary,
    /// Result from combining multiple retrieval strategies
    Hybrid,
}
161
162// ============================================================================
163// EXPLAINED ANSWER - Structured answer with reasoning trace
164// ============================================================================
165
/// An answer with detailed explanation of the reasoning process
///
/// This struct provides transparency into how the GraphRAG system
/// arrived at its answer, including confidence scores, source references,
/// and step-by-step reasoning. Instances are typically built from search
/// results via [`ExplainedAnswer::from_results`].
///
/// # Example
/// ```no_run
/// use graphrag_core::prelude::*;
///
/// # async fn example() -> graphrag_core::Result<()> {
/// let mut graphrag = GraphRAG::quick_start("Your document").await?;
/// let explained = graphrag.ask_explained("What is the main topic?").await?;
///
/// println!("Answer: {}", explained.answer);
/// println!("Confidence: {:.0}%", explained.confidence * 100.0);
///
/// for step in &explained.reasoning_steps {
///     println!("Step {}: {} (confidence: {:.0}%)",
///         step.step_number, step.description, step.confidence * 100.0);
/// }
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct ExplainedAnswer {
    /// The answer text
    pub answer: String,
    /// Confidence score (0.0 to 1.0)
    pub confidence: f32,
    /// Sources used to generate the answer
    pub sources: Vec<SourceReference>,
    /// Step-by-step reasoning trace
    pub reasoning_steps: Vec<ReasoningStep>,
    /// Entities that were key to the answer
    pub key_entities: Vec<String>,
    /// Query analysis that guided retrieval
    pub query_analysis: Option<QueryAnalysis>,
}
205
/// Reference to a source document or chunk used in the answer
#[derive(Debug, Clone)]
pub struct SourceReference {
    /// Identifier of the source (chunk ID, document ID, or entity ID)
    pub id: String,
    /// Type of source
    pub source_type: SourceType,
    /// Relevant excerpt from the source (truncated for long content)
    pub excerpt: String,
    /// Relevance score to the query
    pub relevance_score: f32,
}
218
/// Type of source reference
///
/// Derived from [`ResultType`] when building an [`ExplainedAnswer`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceType {
    /// A text chunk from a document
    TextChunk,
    /// An entity in the knowledge graph
    Entity,
    /// A relationship between entities
    Relationship,
    /// A document-level summary
    Summary,
}
231
/// A single step in the reasoning process
#[derive(Debug, Clone)]
pub struct ReasoningStep {
    /// Step number (1-indexed)
    pub step_number: u8,
    /// Description of what was done in this step
    pub description: String,
    /// IDs of entities involved in this step
    pub entities_used: Vec<String>,
    /// Evidence snippet that supports this step
    pub evidence_snippet: Option<String>,
    /// Confidence for this specific step
    pub confidence: f32,
}
246
247impl ExplainedAnswer {
248    /// Create a new explained answer from search results
249    pub fn from_results(answer: String, search_results: &[SearchResult], query: &str) -> Self {
250        // Calculate overall confidence from result scores
251        let confidence = if search_results.is_empty() {
252            0.0
253        } else {
254            let total_score: f32 = search_results.iter().map(|r| r.score).sum();
255            let avg_score = total_score / search_results.len() as f32;
256            // Normalize to 0-1 range (assuming scores are already somewhat normalized)
257            (avg_score * 0.7 + 0.3).min(1.0).max(0.0)
258        };
259
260        // Build source references
261        let sources: Vec<SourceReference> = search_results
262            .iter()
263            .take(5) // Top 5 sources
264            .map(|r| SourceReference {
265                id: r.id.clone(),
266                source_type: match r.result_type {
267                    ResultType::Entity => SourceType::Entity,
268                    ResultType::Chunk => SourceType::TextChunk,
269                    ResultType::GraphPath => SourceType::Relationship,
270                    ResultType::HierarchicalSummary => SourceType::Summary,
271                    ResultType::Hybrid => SourceType::TextChunk,
272                },
273                excerpt: if r.content.len() > 200 {
274                    format!("{}...", &r.content[..200])
275                } else {
276                    r.content.clone()
277                },
278                relevance_score: r.score,
279            })
280            .collect();
281
282        // Build reasoning steps
283        let mut reasoning_steps = Vec::new();
284        let mut step_num = 1u8;
285
286        // Step 1: Query analysis
287        reasoning_steps.push(ReasoningStep {
288            step_number: step_num,
289            description: format!("Analyzed query: \"{}\"", query),
290            entities_used: vec![],
291            evidence_snippet: None,
292            confidence: 0.95,
293        });
294        step_num += 1;
295
296        // Step 2: Entity retrieval
297        let unique_entities: HashSet<_> = search_results
298            .iter()
299            .flat_map(|r| r.entities.iter().cloned())
300            .collect();
301        if !unique_entities.is_empty() {
302            reasoning_steps.push(ReasoningStep {
303                step_number: step_num,
304                description: format!("Found {} relevant entities", unique_entities.len()),
305                entities_used: unique_entities.iter().take(5).cloned().collect(),
306                evidence_snippet: None,
307                confidence: 0.85,
308            });
309            step_num += 1;
310        }
311
312        // Step 3: Chunk retrieval
313        let chunk_count = search_results
314            .iter()
315            .filter(|r| r.result_type == ResultType::Chunk || r.result_type == ResultType::Hybrid)
316            .count();
317        if chunk_count > 0 {
318            reasoning_steps.push(ReasoningStep {
319                step_number: step_num,
320                description: format!("Retrieved {} relevant text chunks", chunk_count),
321                entities_used: vec![],
322                evidence_snippet: search_results.first().map(|r| {
323                    if r.content.len() > 100 {
324                        format!("{}...", &r.content[..100])
325                    } else {
326                        r.content.clone()
327                    }
328                }),
329                confidence,
330            });
331            step_num += 1;
332        }
333
334        // Step 4: Answer synthesis
335        reasoning_steps.push(ReasoningStep {
336            step_number: step_num,
337            description: "Synthesized answer from retrieved information".to_string(),
338            entities_used: unique_entities.into_iter().take(3).collect(),
339            evidence_snippet: None,
340            confidence,
341        });
342
343        // Collect key entities
344        let key_entities: Vec<String> = search_results
345            .iter()
346            .flat_map(|r| r.entities.iter().cloned())
347            .take(10)
348            .collect();
349
350        Self {
351            answer,
352            confidence,
353            sources,
354            reasoning_steps,
355            key_entities,
356            query_analysis: None,
357        }
358    }
359
360    /// Format the explained answer for display
361    pub fn format_display(&self) -> String {
362        let mut output = String::new();
363
364        // Answer
365        output.push_str(&format!("**Answer:** {}\n\n", self.answer));
366
367        // Confidence
368        output.push_str(&format!(
369            "**Confidence:** {:.0}%\n\n",
370            self.confidence * 100.0
371        ));
372
373        // Reasoning steps
374        if !self.reasoning_steps.is_empty() {
375            output.push_str("**Reasoning:**\n");
376            for step in &self.reasoning_steps {
377                output.push_str(&format!(
378                    "{}. {} (confidence: {:.0}%)\n",
379                    step.step_number,
380                    step.description,
381                    step.confidence * 100.0
382                ));
383                if let Some(evidence) = &step.evidence_snippet {
384                    output.push_str(&format!("   Evidence: \"{}\"\n", evidence));
385                }
386            }
387            output.push('\n');
388        }
389
390        // Sources
391        if !self.sources.is_empty() {
392            output.push_str("**Sources:**\n");
393            for (i, source) in self.sources.iter().enumerate() {
394                output.push_str(&format!(
395                    "{}. [{:?}] {} (relevance: {:.0}%)\n",
396                    i + 1,
397                    source.source_type,
398                    source.id,
399                    source.relevance_score * 100.0
400                ));
401            }
402        }
403
404        output
405    }
406}
407
408// ============================================================================
409// QUERY ANALYSIS - Adaptive retrieval strategy
410// ============================================================================
411
/// Query analysis results to determine optimal retrieval strategy
#[derive(Debug, Clone)]
pub struct QueryAnalysis {
    /// Type of query based on content analysis
    pub query_type: QueryType,
    /// Key entities detected in the query
    pub key_entities: Vec<String>,
    /// Conceptual terms extracted from the query
    pub concepts: Vec<String>,
    /// Inferred user intent from the query
    pub intent: QueryIntent,
    /// Query complexity score (0.0 to 1.0)
    pub complexity_score: f32,
}
426
/// Classification of query types for adaptive retrieval strategy selection
#[derive(Debug, Clone, PartialEq)]
pub enum QueryType {
    /// Queries focused on specific entities
    EntityFocused,
    /// Abstract concept queries requiring broader context
    Conceptual,
    /// Specific fact retrieval queries
    Factual,
    /// Open-ended exploration queries
    Exploratory,
    /// Queries about relationships between entities
    Relationship,
}
441
/// User intent classification for result presentation
#[derive(Debug, Clone, PartialEq)]
pub enum QueryIntent {
    /// User wants a high-level summary or overview
    Overview,
    /// User wants detailed, specific information
    Detailed,
    /// User wants to compare multiple items
    Comparative,
    /// User wants to understand cause-effect relationships
    Causal,
    /// User wants time-based or chronological information
    Temporal,
}
456
/// Query analysis result with additional metadata for adaptive retrieval
#[derive(Debug, Clone)]
pub struct QueryAnalysisResult {
    /// Detected query type
    pub query_type: QueryType,
    /// Confidence score for the detected query type (0.0 to 1.0)
    pub confidence: f32,
    /// Keywords extracted and matched from the query
    pub keywords_matched: Vec<String>,
    /// Recommended retrieval strategies based on analysis
    pub suggested_strategies: Vec<String>,
    /// Overall query complexity score (0.0 to 1.0)
    pub complexity_score: f32,
}
471
/// Query result with hierarchical summary
#[derive(Debug, Clone)]
pub struct QueryResult {
    /// Original query string
    pub query: String,
    /// List of search results
    pub results: Vec<SearchResult>,
    /// Optional generated summary of all results
    pub summary: Option<String>,
    /// Additional metadata about the query execution
    pub metadata: HashMap<String, String>,
}
484
485impl RetrievalSystem {
486    /// Create a new retrieval system with parallel processing support
487    #[cfg(feature = "parallel-processing")]
488    pub fn with_parallel_processing(
489        vector_store: std::sync::Arc<dyn VectorStore>,
490        embedding_generator: EmbeddingGenerator,
491        parallel_processor: ParallelProcessor,
492    ) -> Result<Self> {
493        // VectorStore trait is already Send + Sync and wrapped in Arc
494        // Can be safely used across threads for parallel operations
495        // EmbeddingGenerator operations can be parallelized with rayon
496
497        let retrieval_config = RetrievalConfig::default();
498
499        Ok(Self {
500            vector_store,
501            embedding_generator,
502            config: retrieval_config,
503            parallel_processor: Some(parallel_processor),
504            #[cfg(feature = "pagerank")]
505            pagerank_retriever: None,
506            enriched_retriever: None,
507            #[cfg(feature = "lazygraphrag")]
508            concept_filtering_enabled: false,
509        })
510    }
511
512    /// Index a knowledge graph for retrieval
513    pub async fn index_graph(&self, graph: &KnowledgeGraph) -> Result<()> {
514        // Index entity embeddings
515        for entity in graph.entities() {
516            if let Some(embedding) = &entity.embedding {
517                let id = format!("entity:{}", entity.id);
518                // Simple empty metadata for now, could add name/type
519                self.vector_store
520                    .add_vector(&id, embedding.clone(), HashMap::new())
521                    .await?;
522            }
523        }
524
525        // Index chunk embeddings
526        for chunk in graph.chunks() {
527            if let Some(embedding) = &chunk.embedding {
528                let id = format!("chunk:{}", chunk.id);
529                self.vector_store
530                    .add_vector(&id, embedding.clone(), HashMap::new())
531                    .await?;
532            }
533        }
534
535        // Initialize/Build if needed (some stores might need explicit commit)
536        self.vector_store.initialize().await?;
537
538        Ok(())
539    }
540
    /// Initialize PageRank retrieval system (feature-gated)
    ///
    /// Builds a [`PageRankRetrievalSystem`] with hand-tuned configuration,
    /// pre-computes global PageRank scores for the given graph, and stores
    /// the retriever on `self` for use by `pagerank_query`/`pagerank_batch_query`.
    #[cfg(feature = "pagerank")]
    pub fn initialize_pagerank(&mut self, graph: &KnowledgeGraph) -> Result<()> {
        use crate::graph::pagerank::{PageRankConfig, ScoreWeights};

        tracing::debug!("Initializing high-performance PageRank retrieval system...");

        let pagerank_config = PageRankConfig {
            damping_factor: 0.85,
            max_iterations: 50, // Reduced for faster convergence
            tolerance: 1e-5,    // Slightly relaxed for speed
            personalized: true,
            // Parallelism follows whether a processor was configured.
            #[cfg(feature = "parallel-processing")]
            parallel_enabled: self.parallel_processor.is_some(),
            #[cfg(not(feature = "parallel-processing"))]
            parallel_enabled: false,
            cache_size: 2000, // Large cache for better performance
            sparse_threshold: 500,
            incremental_updates: true,
            simd_block_size: 64, // Optimized for modern CPUs
        };

        let score_weights = ScoreWeights {
            vector_weight: 0.3,
            pagerank_weight: 0.5, // Higher weight for PageRank like fast-GraphRAG
            chunk_weight: 0.15,
            relationship_weight: 0.05,
        };

        let mut pagerank_retriever = PageRankRetrievalSystem::new(self.config.top_k)
            .with_pagerank_config(pagerank_config)
            .with_score_weights(score_weights)
            .with_incremental_mode(true)
            .with_min_threshold(0.05);

        // NOTE(review): vector index initialization is disabled — confirm
        // whether it is still required for this retriever.
        // pagerank_retriever.initialize_vector_index(graph)?;

        // Pre-compute global PageRank scores for faster queries
        pagerank_retriever.precompute_global_pagerank(graph)?;

        self.pagerank_retriever = Some(pagerank_retriever);

        tracing::debug!("PageRank retrieval system initialized with 27x performance optimizations");
        Ok(())
    }
587
588    /// Initialize enriched metadata-aware retrieval system
589    pub fn initialize_enriched(&mut self, config: Option<EnrichedRetrievalConfig>) -> Result<()> {
590        tracing::debug!("Initializing enriched metadata-aware retrieval system...");
591
592        let enriched_config = config.unwrap_or_default();
593        let enriched_retriever = EnrichedRetriever::with_config(enriched_config);
594
595        self.enriched_retriever = Some(enriched_retriever);
596
597        tracing::debug!("Enriched retrieval system initialized with metadata boosting");
598        Ok(())
599    }
600
601    /// Query using PageRank-enhanced retrieval (feature-gated)
602    #[cfg(feature = "pagerank")]
603    pub fn pagerank_query(
604        &self,
605        query: &str,
606        graph: &KnowledgeGraph,
607        max_results: Option<usize>,
608    ) -> Result<Vec<ScoredResult>> {
609        if let Some(pagerank_retriever) = &self.pagerank_retriever {
610            pagerank_retriever.search_with_pagerank(query, graph, max_results)
611        } else {
612            Err(crate::core::GraphRAGError::Retrieval {
613                message: "PageRank retriever not initialized. Call initialize_pagerank() first."
614                    .to_string(),
615            })
616        }
617    }
618
619    /// Batch PageRank queries for high throughput (feature-gated)
620    #[cfg(feature = "pagerank")]
621    pub fn pagerank_batch_query(
622        &self,
623        queries: &[&str],
624        graph: &KnowledgeGraph,
625        max_results_per_query: Option<usize>,
626    ) -> Result<Vec<Vec<ScoredResult>>> {
627        if let Some(pagerank_retriever) = &self.pagerank_retriever {
628            pagerank_retriever.batch_search(queries, graph, max_results_per_query)
629        } else {
630            Err(crate::core::GraphRAGError::Retrieval {
631                message: "PageRank retriever not initialized. Call initialize_pagerank() first."
632                    .to_string(),
633            })
634        }
635    }
636
637    /// Query the system for relevant information
638    pub fn query(&self, query: &str) -> Result<Vec<String>> {
639        // For now, return a placeholder implementation
640        // In a real system, this would:
641        // 1. Convert query to embedding
642        // 2. Search vector index
643        // 3. Expand through graph relationships
644        // 4. Rank and return results
645
646        Ok(vec![format!("Results for query: {}", query)])
647    }
648
649    /// Advanced hybrid query with strategy selection and hierarchical integration
650    pub async fn hybrid_query(
651        &mut self,
652        query: &str,
653        graph: &KnowledgeGraph,
654    ) -> Result<Vec<SearchResult>> {
655        self.hybrid_query_with_trees(query, graph, &HashMap::new())
656            .await
657    }
658
    /// Hybrid query with access to document trees for hierarchical retrieval
    ///
    /// Pipeline: (1) analyze the query, (2) embed it, (3) run adaptive
    /// multi-strategy retrieval, then (4) — if an enriched retriever was set
    /// up via `initialize_enriched` — apply metadata boosting followed by
    /// structure filtering. The boost/filter order is significant.
    pub async fn hybrid_query_with_trees(
        &mut self,
        query: &str,
        graph: &KnowledgeGraph,
        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
    ) -> Result<Vec<SearchResult>> {
        // 1. Analyze query to determine optimal strategy
        let analysis = self.analyze_query(query, graph)?;

        // 2. Generate query embedding
        let query_embedding = self.embedding_generator.generate_embedding(query);

        // 3. Execute multi-strategy retrieval based on analysis
        let mut results = self
            .execute_adaptive_retrieval(query, &query_embedding, graph, document_trees, &analysis)
            .await?;

        // 4. Apply enriched metadata-aware boosting and filtering if enabled
        if let Some(enriched_retriever) = &self.enriched_retriever {
            // First apply metadata boosting to enhance relevance
            results = enriched_retriever.boost_with_metadata(results, query, graph)?;

            // Then apply structure filtering if query mentions chapters/sections
            results = enriched_retriever.filter_by_structure(query, results, graph)?;
        }

        Ok(results)
    }
688
689    /// Query the system using hybrid retrieval (vector + graph) - legacy method
690    pub async fn legacy_hybrid_query(
691        &mut self,
692        query: &str,
693        graph: &KnowledgeGraph,
694    ) -> Result<Vec<SearchResult>> {
695        // 1. Generate query embedding
696        let query_embedding = self.embedding_generator.generate_embedding(query);
697
698        // 2. Perform comprehensive search
699        let results = self.comprehensive_search(&query_embedding, graph).await?;
700
701        Ok(results)
702    }
703
704    /// Add embeddings to chunks and entities in the graph with parallel processing
705    pub async fn add_embeddings_to_graph(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
706        #[cfg(feature = "parallel-processing")]
707        if let Some(processor) = self.parallel_processor.clone() {
708            return self.add_embeddings_parallel(graph, &processor).await;
709        }
710
711        self.add_embeddings_sequential(graph).await
712    }
713
    /// Embedding generation for the parallel-processing path
    ///
    /// Despite the name, embeddings are currently generated sequentially:
    /// `generate_embedding` needs `&mut self`, which prevents sharing the
    /// generator across threads. Texts are first collected, then embedded
    /// one by one, and finally the graph is re-indexed.
    #[cfg(feature = "parallel-processing")]
    async fn add_embeddings_parallel(
        &mut self,
        graph: &mut KnowledgeGraph,
        processor: &ParallelProcessor,
    ) -> Result<()> {
        // Extract texts for embedding generation
        let mut chunk_texts = Vec::new();
        let mut entity_texts = Vec::new();

        // Collect chunk texts that need embeddings
        for chunk in graph.chunks() {
            if chunk.embedding.is_none() {
                chunk_texts.push((chunk.id.clone(), chunk.content.clone()));
            }
        }

        // Collect entity texts that need embeddings (name + type, since
        // entities have no standalone content)
        for entity in graph.entities() {
            if entity.embedding.is_none() {
                let entity_text = format!("{} {}", entity.name, entity.entity_type);
                entity_texts.push((entity.id.clone(), entity_text));
            }
        }

        // For parallel processing, we need to use a different approach since
        // generate_embedding requires &mut self. We'll fall back to enhanced sequential
        // processing with better chunking and monitoring for now.

        let total_items = chunk_texts.len() + entity_texts.len();
        if processor.should_use_parallel(total_items) {
            tracing::debug!(
                "Processing {total_items} embeddings with enhanced sequential approach"
            );
        }

        // Process chunks
        for (chunk_id, text) in chunk_texts {
            let embedding = self.embedding_generator.generate_embedding(&text);
            if let Some(chunk) = graph.get_chunk_mut(&chunk_id) {
                chunk.embedding = Some(embedding);
            }
        }

        // Process entities
        for (entity_id, text) in entity_texts {
            let embedding = self.embedding_generator.generate_embedding(&text);
            if let Some(entity) = graph.get_entity_mut(&entity_id) {
                entity.embedding = Some(embedding);
            }
        }

        // Re-index the graph with new embeddings
        self.index_graph(graph).await?;

        Ok(())
    }
772
773    /// Sequential embedding generation (fallback)
774    async fn add_embeddings_sequential(&mut self, graph: &mut KnowledgeGraph) -> Result<()> {
775        // Debug: Check total counts first (uncomment for debugging)
776        let _total_chunks = graph.chunks().count();
777        let _total_entities = graph.entities().count();
778        // println!("DEBUG: Found {} total chunks and {} total entities in graph", _total_chunks, _total_entities);
779
780        // Generate embeddings for all chunks
781        let mut chunk_count = 0;
782        for chunk in graph.chunks_mut() {
783            if chunk.embedding.is_none() {
784                let embedding = self.embedding_generator.generate_embedding(&chunk.content);
785                chunk.embedding = Some(embedding);
786                chunk_count += 1;
787            }
788        }
789
790        // Generate embeddings for all entities (using their name and context)
791        let mut entity_count = 0;
792        for entity in graph.entities_mut() {
793            if entity.embedding.is_none() {
794                // Create entity text from name and entity type
795                let entity_text = format!("{} {}", entity.name, entity.entity_type);
796                let embedding = self.embedding_generator.generate_embedding(&entity_text);
797                entity.embedding = Some(embedding);
798                entity_count += 1;
799            }
800        }
801
802        tracing::debug!(
803            "Generated embeddings for {chunk_count} chunks and {entity_count} entities"
804        );
805
806        // Re-index the graph with new embeddings
807        // Re-index the graph with new embeddings
808        self.index_graph(graph).await?;
809
810        Ok(())
811    }
812
813    /// Parallel batch query processing with optimized workload distribution
814    /// Batch process multiple queries efficiently
815    pub async fn batch_query(
816        &mut self,
817        queries: &[&str],
818        graph: &KnowledgeGraph,
819    ) -> Result<Vec<Vec<SearchResult>>> {
820        let processor =
821            self.parallel_processor
822                .as_ref()
823                .ok_or_else(|| crate::core::GraphRAGError::Config {
824                    message: "Parallel processor not initialized".to_string(),
825                })?;
826
827        if !processor.should_use_parallel(queries.len()) {
828            // Use sequential processing for small batches
829            let mut results = Vec::new();
830            for &query in queries {
831                results.push(self.hybrid_query(query, graph).await?);
832            }
833            return Ok(results);
834        }
835
836        #[cfg(feature = "parallel-processing")]
837        {
838            // For parallel query processing, we need to work around the borrowing limitations
839            // of the embedding generator. We'll use enhanced sequential processing with
840            // better monitoring and chunking for now.
841
842            let chunk_size = processor.config().chunk_batch_size.min(queries.len());
843            tracing::debug!(
844                "Processing {} queries with enhanced sequential approach (chunk size: {})",
845                queries.len(),
846                chunk_size
847            );
848
849            let mut all_results = Vec::new();
850            for &query in queries {
851                match self.hybrid_query(query, graph).await {
852                    Ok(results) => all_results.push(results),
853                    Err(e) => {
854                        tracing::warn!("Error processing query '{query}': {e}");
855                        all_results.push(Vec::new());
856                    },
857                }
858            }
859
860            Ok(all_results)
861        }
862
863        #[cfg(not(feature = "parallel-processing"))]
864        {
865            // Sequential fallback when parallel processing is not available
866            let mut results = Vec::new();
867            for &query in queries {
868                results.push(self.hybrid_query(query, graph).await?);
869            }
870            Ok(results)
871        }
872    }
873
874    /// Analyze query to determine optimal retrieval strategy
875    pub fn analyze_query(&self, query: &str, graph: &KnowledgeGraph) -> Result<QueryAnalysis> {
876        let query_lower = query.to_lowercase();
877        let words: Vec<&str> = query_lower.split_whitespace().collect();
878
879        // Detect key entities mentioned in the query
880        let mut key_entities = Vec::new();
881        for entity in graph.entities() {
882            let entity_name_lower = entity.name.to_lowercase();
883            if words
884                .iter()
885                .any(|&word| entity_name_lower.contains(word) || word.contains(&entity_name_lower))
886            {
887                key_entities.push(entity.name.clone());
888            }
889        }
890
891        // Extract concepts (non-entity meaningful words)
892        let concepts: Vec<String> = words
893            .iter()
894            .filter(|&&word| word.len() > 3 && !self.is_stop_word(word))
895            .filter(|&&word| {
896                !key_entities.iter().any(|entity| {
897                    entity.to_lowercase().contains(word) || word.contains(&entity.to_lowercase())
898                })
899            })
900            .map(|&word| word.to_string())
901            .collect();
902
903        // Determine query type
904        let query_type = if !key_entities.is_empty() && key_entities.len() > 1 {
905            QueryType::Relationship
906        } else if !key_entities.is_empty() {
907            QueryType::EntityFocused
908        } else if self.has_abstract_concepts(&words) {
909            QueryType::Conceptual
910        } else if self.has_question_words(&words) {
911            QueryType::Exploratory
912        } else {
913            QueryType::Factual
914        };
915
916        // Determine intent
917        let intent = if words
918            .iter()
919            .any(|&w| ["overview", "summary", "general", "about"].contains(&w))
920        {
921            QueryIntent::Overview
922        } else if words
923            .iter()
924            .any(|&w| ["detailed", "specific", "exactly", "precise"].contains(&w))
925        {
926            QueryIntent::Detailed
927        } else if words
928            .iter()
929            .any(|&w| ["compare", "vs", "versus", "between", "difference"].contains(&w))
930        {
931            QueryIntent::Comparative
932        } else if words
933            .iter()
934            .any(|&w| ["cause", "why", "because", "lead", "result"].contains(&w))
935        {
936            QueryIntent::Causal
937        } else if words
938            .iter()
939            .any(|&w| ["when", "time", "before", "after", "during"].contains(&w))
940        {
941            QueryIntent::Temporal
942        } else {
943            QueryIntent::Detailed
944        };
945
946        // Calculate complexity score
947        let complexity_score = (words.len() as f32 * 0.1
948            + key_entities.len() as f32 * 0.3
949            + concepts.len() as f32 * 0.2)
950            .min(1.0);
951
952        Ok(QueryAnalysis {
953            query_type,
954            key_entities,
955            concepts,
956            intent,
957            complexity_score,
958        })
959    }
960
961    /// Execute adaptive retrieval based on query analysis
962    pub async fn execute_adaptive_retrieval(
963        &mut self,
964        query: &str,
965        query_embedding: &[f32],
966        graph: &KnowledgeGraph,
967        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
968        analysis: &QueryAnalysis,
969    ) -> Result<Vec<SearchResult>> {
970        let mut all_results = Vec::new();
971
972        // Strategy weights based on query analysis
973        let (vector_weight, graph_weight, hierarchical_weight) =
974            self.calculate_strategy_weights(analysis);
975
976        // 1. Vector similarity search (always included)
977        if vector_weight > 0.0 {
978            let mut vector_results = self
979                .vector_similarity_search(query_embedding, graph)
980                .await?;
981            for result in &mut vector_results {
982                result.score *= vector_weight;
983            }
984            all_results.extend(vector_results);
985        }
986
987        // 2. Graph-based search (emphasized for entity and relationship queries)
988        if graph_weight > 0.0 {
989            let mut graph_results = match analysis.query_type {
990                QueryType::EntityFocused | QueryType::Relationship => {
991                    self.entity_centric_search(query_embedding, graph, &analysis.key_entities)?
992                },
993                _ => self.entity_based_search(query_embedding, graph)?,
994            };
995            for result in &mut graph_results {
996                result.score *= graph_weight;
997            }
998            all_results.extend(graph_results);
999        }
1000
1001        // 3. Hierarchical search (emphasized for overview and conceptual queries)
1002        if hierarchical_weight > 0.0 && !document_trees.is_empty() {
1003            let mut hierarchical_results =
1004                self.hierarchical_search(query, document_trees, analysis)?;
1005            for result in &mut hierarchical_results {
1006                result.score *= hierarchical_weight;
1007            }
1008            all_results.extend(hierarchical_results);
1009        }
1010
1011        // 4. Advanced graph traversal for complex queries
1012        if analysis.complexity_score > 0.7 {
1013            let traversal_results =
1014                self.advanced_graph_traversal(query_embedding, graph, analysis)?;
1015            all_results.extend(traversal_results);
1016        }
1017
1018        // 5. Cross-strategy fusion for hybrid results
1019        let fusion_results = self.cross_strategy_fusion(&all_results, analysis)?;
1020        all_results.extend(fusion_results);
1021
1022        // Final ranking and deduplication
1023        let final_results = self.adaptive_rank_and_deduplicate(all_results, analysis)?;
1024
1025        Ok(final_results.into_iter().take(self.config.top_k).collect())
1026    }
1027
1028    /// Comprehensive search that combines multiple retrieval strategies (legacy)
1029    pub async fn comprehensive_search(
1030        &self,
1031        query_embedding: &[f32],
1032        graph: &KnowledgeGraph,
1033    ) -> Result<Vec<SearchResult>> {
1034        let mut all_results = Vec::new();
1035
1036        // 1. Vector similarity search
1037        let vector_results = self
1038            .vector_similarity_search(query_embedding, graph)
1039            .await?;
1040        all_results.extend(vector_results);
1041
1042        // 2. Entity-based search
1043        let entity_results = self.entity_based_search(query_embedding, graph)?;
1044        all_results.extend(entity_results);
1045
1046        // 3. Graph traversal search
1047        let graph_results = self.graph_traversal_search(query_embedding, graph)?;
1048        all_results.extend(graph_results);
1049
1050        // Deduplicate and rank results
1051        let final_results = self.rank_and_deduplicate(all_results)?;
1052
1053        Ok(final_results.into_iter().take(self.config.top_k).collect())
1054    }
1055
1056    /// Vector similarity search
1057    async fn vector_similarity_search(
1058        &self,
1059        query_embedding: &[f32],
1060        graph: &KnowledgeGraph,
1061    ) -> Result<Vec<SearchResult>> {
1062        let mut results = Vec::new();
1063
1064        // Search for similar vectors
1065        // Note: vector_store returns SearchResult struct from store module, we need to convert or us it
1066        // The store::SearchResult is slightly different from retrieval::SearchResult (metadata map vs specific fields)
1067        let similar_vectors = self
1068            .vector_store
1069            .search(query_embedding, self.config.top_k * 2)
1070            .await?;
1071
1072        for store_result in similar_vectors {
1073            let id = store_result.id;
1074            let similarity = store_result.score;
1075            if similarity >= self.config.similarity_threshold {
1076                let result = if id.starts_with("entity:") {
1077                    let entity_id = EntityId::new(id.strip_prefix("entity:").unwrap().to_string());
1078                    graph.get_entity(&entity_id).map(|entity| SearchResult {
1079                        id: entity.id.to_string(),
1080                        content: entity.name.clone(),
1081                        score: similarity * self.config.entity_weight,
1082                        result_type: ResultType::Entity,
1083                        entities: vec![entity.name.clone()],
1084                        source_chunks: entity
1085                            .mentions
1086                            .iter()
1087                            .map(|m| m.chunk_id.to_string())
1088                            .collect(),
1089                    })
1090                } else if id.starts_with("chunk:") {
1091                    let chunk_id = ChunkId::new(id.strip_prefix("chunk:").unwrap().to_string());
1092                    if let Some(chunk) = graph.get_chunk(&chunk_id) {
1093                        let entity_names: Vec<String> = chunk
1094                            .entities
1095                            .iter()
1096                            .filter_map(|eid| graph.get_entity(eid))
1097                            .map(|e| e.name.clone())
1098                            .collect();
1099
1100                        Some(SearchResult {
1101                            id: chunk.id.to_string(),
1102                            content: chunk.content.clone(),
1103                            score: similarity * self.config.chunk_weight,
1104                            result_type: ResultType::Chunk,
1105                            entities: entity_names,
1106                            source_chunks: vec![chunk.id.to_string()],
1107                        })
1108                    } else {
1109                        None
1110                    }
1111                } else {
1112                    None
1113                };
1114
1115                if let Some(search_result) = result {
1116                    results.push(search_result);
1117                }
1118            }
1119        }
1120
1121        Ok(results)
1122    }
1123
1124    /// Entity-based search with graph expansion
1125    fn entity_based_search(
1126        &self,
1127        query_embedding: &[f32],
1128        graph: &KnowledgeGraph,
1129    ) -> Result<Vec<SearchResult>> {
1130        let mut results = Vec::new();
1131        let mut visited = HashSet::new();
1132
1133        // Find most relevant entities
1134        let entity_similarities = self.find_relevant_entities(query_embedding, graph)?;
1135
1136        for (entity_id, similarity) in entity_similarities.into_iter().take(5) {
1137            if visited.contains(&entity_id) {
1138                continue;
1139            }
1140
1141            // Expand through graph relationships
1142            let expanded_entities = self.expand_through_relationships(
1143                &entity_id,
1144                graph,
1145                self.config.max_expansion_depth,
1146                &mut visited,
1147            )?;
1148
1149            for expanded_entity_id in expanded_entities {
1150                if let Some(entity) = graph.get_entity(&expanded_entity_id) {
1151                    let expansion_penalty = if expanded_entity_id == entity_id {
1152                        1.0
1153                    } else {
1154                        0.8
1155                    };
1156
1157                    results.push(SearchResult {
1158                        id: entity.id.to_string(),
1159                        content: format!("{} ({})", entity.name, entity.entity_type),
1160                        score: similarity * expansion_penalty * self.config.entity_weight,
1161                        result_type: ResultType::Entity,
1162                        entities: vec![entity.name.clone()],
1163                        source_chunks: entity
1164                            .mentions
1165                            .iter()
1166                            .map(|m| m.chunk_id.to_string())
1167                            .collect(),
1168                    });
1169                }
1170            }
1171        }
1172
1173        Ok(results)
1174    }
1175
1176    /// Calculate strategy weights based on query analysis
1177    fn calculate_strategy_weights(&self, analysis: &QueryAnalysis) -> (f32, f32, f32) {
1178        match (&analysis.query_type, &analysis.intent) {
1179            // For entity-focused queries, balance vector (chunks) and graph (entities) equally
1180            // This ensures we get both entity information AND contextual chunks
1181            (QueryType::EntityFocused, _) => (0.5, 0.4, 0.1),
1182            (QueryType::Relationship, _) => (0.3, 0.6, 0.1),
1183            (QueryType::Conceptual, QueryIntent::Overview) => (0.2, 0.2, 0.6),
1184            (QueryType::Conceptual, _) => (0.4, 0.3, 0.3),
1185            (QueryType::Exploratory, QueryIntent::Overview) => (0.3, 0.2, 0.5),
1186            (QueryType::Exploratory, _) => (0.4, 0.4, 0.2),
1187            (QueryType::Factual, _) => (0.6, 0.3, 0.1),
1188        }
1189    }
1190
1191    /// Entity-centric search focusing on specific entities
1192    fn entity_centric_search(
1193        &mut self,
1194        query_embedding: &[f32],
1195        graph: &KnowledgeGraph,
1196        key_entities: &[String],
1197    ) -> Result<Vec<SearchResult>> {
1198        let mut results = Vec::new();
1199        let mut visited = HashSet::new();
1200
1201        for entity_name in key_entities {
1202            // Find the entity in the graph
1203            if let Some(entity) = graph
1204                .entities()
1205                .find(|e| e.name.eq_ignore_ascii_case(entity_name))
1206            {
1207                // Add the entity itself
1208                results.push(SearchResult {
1209                    id: entity.id.to_string(),
1210                    content: format!("{} ({})", entity.name, entity.entity_type),
1211                    score: 0.9, // High score for exact entity match
1212                    result_type: ResultType::Entity,
1213                    entities: vec![entity.name.clone()],
1214                    source_chunks: entity
1215                        .mentions
1216                        .iter()
1217                        .map(|m| m.chunk_id.to_string())
1218                        .collect(),
1219                });
1220
1221                // Get entity neighbors with weighted scores
1222                let neighbors = graph.get_neighbors(&entity.id);
1223                for (neighbor, relationship) in neighbors {
1224                    if !visited.contains(&neighbor.id) {
1225                        visited.insert(neighbor.id.clone());
1226
1227                        // Calculate relationship relevance
1228                        let rel_embedding = self
1229                            .embedding_generator
1230                            .generate_embedding(&relationship.relation_type);
1231                        let rel_similarity =
1232                            VectorUtils::cosine_similarity(query_embedding, &rel_embedding);
1233
1234                        results.push(SearchResult {
1235                            id: neighbor.id.to_string(),
1236                            content: format!("{} ({})", neighbor.name, neighbor.entity_type),
1237                            score: 0.7 * relationship.confidence * (1.0 + rel_similarity),
1238                            result_type: ResultType::Entity,
1239                            entities: vec![neighbor.name.clone()],
1240                            source_chunks: neighbor
1241                                .mentions
1242                                .iter()
1243                                .map(|m| m.chunk_id.to_string())
1244                                .collect(),
1245                        });
1246                    }
1247                }
1248            }
1249        }
1250
1251        Ok(results)
1252    }
1253
1254    /// Hierarchical search using document trees
1255    fn hierarchical_search(
1256        &self,
1257        query: &str,
1258        document_trees: &HashMap<crate::core::DocumentId, DocumentTree>,
1259        analysis: &QueryAnalysis,
1260    ) -> Result<Vec<SearchResult>> {
1261        let mut results = Vec::new();
1262        let max_results_per_tree = match analysis.intent {
1263            QueryIntent::Overview => 3,
1264            QueryIntent::Detailed => 8,
1265            _ => 5,
1266        };
1267
1268        for (doc_id, tree) in document_trees.iter() {
1269            let tree_summaries = tree.query(query, max_results_per_tree)?;
1270
1271            for (idx, summary) in tree_summaries.iter().enumerate() {
1272                // Convert tree query result to search result
1273                let level_bonus = match analysis.intent {
1274                    QueryIntent::Overview => 0.3,
1275                    QueryIntent::Detailed => 0.2,
1276                    _ => 0.0,
1277                };
1278
1279                results.push(SearchResult {
1280                    id: format!("{}:summary:{}", doc_id, idx),
1281                    content: summary.summary.clone(),
1282                    score: summary.score + level_bonus,
1283                    result_type: ResultType::HierarchicalSummary,
1284                    entities: Vec::new(),
1285                    source_chunks: vec![doc_id.to_string()],
1286                });
1287            }
1288        }
1289
1290        Ok(results)
1291    }
1292
1293    /// Advanced graph traversal for complex queries
1294    fn advanced_graph_traversal(
1295        &self,
1296        query_embedding: &[f32],
1297        graph: &KnowledgeGraph,
1298        analysis: &QueryAnalysis,
1299    ) -> Result<Vec<SearchResult>> {
1300        let mut results = Vec::new();
1301
1302        if analysis.query_type == QueryType::Relationship && analysis.key_entities.len() >= 2 {
1303            // Find paths between entities
1304            results.extend(self.find_entity_paths(graph, &analysis.key_entities)?);
1305        }
1306
1307        if analysis.complexity_score > 0.8 {
1308            // Community detection for exploratory queries
1309            results.extend(self.community_based_search(query_embedding, graph)?);
1310        }
1311
1312        Ok(results)
1313    }
1314
1315    /// Cross-strategy fusion to create hybrid results
1316    fn cross_strategy_fusion(
1317        &self,
1318        all_results: &[SearchResult],
1319        _analysis: &QueryAnalysis,
1320    ) -> Result<Vec<SearchResult>> {
1321        let mut fusion_results = Vec::new();
1322
1323        // Group results by content similarity
1324        let mut content_groups: HashMap<String, Vec<&SearchResult>> = HashMap::new();
1325
1326        for result in all_results {
1327            let content_key = Self::safe_truncate(&result.content, 50);
1328
1329            content_groups.entry(content_key).or_default().push(result);
1330        }
1331
1332        // Create fusion results for groups with multiple strategies
1333        for (content_key, group) in content_groups {
1334            if group.len() > 1 {
1335                let types: HashSet<_> = group.iter().map(|r| &r.result_type).collect();
1336                if types.len() > 1 {
1337                    // This content was found by multiple strategies - boost confidence
1338                    let avg_score = group.iter().map(|r| r.score).sum::<f32>() / group.len() as f32;
1339                    let boost = 0.2 * (types.len() - 1) as f32;
1340
1341                    let all_entities: HashSet<_> =
1342                        group.iter().flat_map(|r| r.entities.iter()).collect();
1343
1344                    let all_chunks: HashSet<_> =
1345                        group.iter().flat_map(|r| r.source_chunks.iter()).collect();
1346
1347                    fusion_results.push(SearchResult {
1348                        id: format!(
1349                            "fusion_{}",
1350                            content_key.chars().take(10).collect::<String>()
1351                        ),
1352                        content: group[0].content.clone(),
1353                        score: (avg_score + boost).min(1.0),
1354                        result_type: ResultType::Hybrid,
1355                        entities: all_entities.into_iter().cloned().collect(),
1356                        source_chunks: all_chunks.into_iter().cloned().collect(),
1357                    });
1358                }
1359            }
1360        }
1361
1362        Ok(fusion_results)
1363    }
1364
1365    /// Adaptive ranking and deduplication based on query analysis
1366    fn adaptive_rank_and_deduplicate(
1367        &self,
1368        mut results: Vec<SearchResult>,
1369        analysis: &QueryAnalysis,
1370    ) -> Result<Vec<SearchResult>> {
1371        // Apply query-specific score adjustments
1372        for result in &mut results {
1373            match analysis.query_type {
1374                QueryType::EntityFocused => {
1375                    if result.result_type == ResultType::Entity {
1376                        result.score *= 1.2;
1377                    }
1378                },
1379                QueryType::Conceptual => {
1380                    if result.result_type == ResultType::HierarchicalSummary {
1381                        result.score *= 1.1;
1382                    }
1383                },
1384                QueryType::Relationship => {
1385                    if result.entities.len() > 1 {
1386                        result.score *= 1.15;
1387                    }
1388                },
1389                _ => {},
1390            }
1391
1392            // Boost results that contain key entities
1393            for entity in &analysis.key_entities {
1394                if result
1395                    .entities
1396                    .iter()
1397                    .any(|e| e.eq_ignore_ascii_case(entity))
1398                {
1399                    result.score *= 1.1;
1400                }
1401            }
1402        }
1403
1404        // Sort by adjusted scores
1405        results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
1406
1407        // Diversity-aware deduplication
1408        let mut deduplicated = Vec::new();
1409        let mut seen_content = HashSet::new();
1410        let mut type_counts: HashMap<ResultType, usize> = HashMap::new();
1411
1412        for result in results {
1413            let content_signature = self.create_content_signature(&result.content);
1414
1415            if !seen_content.contains(&content_signature) {
1416                let type_count = type_counts.get(&result.result_type).unwrap_or(&0);
1417
1418                // Ensure diversity across result types
1419                let max_per_type = match result.result_type {
1420                    ResultType::Entity => self.config.top_k / 3,
1421                    ResultType::Chunk => self.config.top_k / 2,
1422                    ResultType::HierarchicalSummary => self.config.top_k / 4,
1423                    ResultType::Hybrid => self.config.top_k / 4,
1424                    ResultType::GraphPath => self.config.top_k / 5,
1425                };
1426
1427                if *type_count < max_per_type {
1428                    seen_content.insert(content_signature);
1429                    *type_counts.entry(result.result_type.clone()).or_insert(0) += 1;
1430                    deduplicated.push(result);
1431                }
1432            }
1433        }
1434
1435        Ok(deduplicated)
1436    }
1437
1438    /// Find paths between entities in the graph
1439    fn find_entity_paths(
1440        &self,
1441        graph: &KnowledgeGraph,
1442        key_entities: &[String],
1443    ) -> Result<Vec<SearchResult>> {
1444        let mut results = Vec::new();
1445
1446        if key_entities.len() < 2 {
1447            return Ok(results);
1448        }
1449
1450        // Simple path finding between first two entities
1451        if let (Some(source), Some(target)) = (
1452            graph
1453                .entities()
1454                .find(|e| e.name.eq_ignore_ascii_case(&key_entities[0])),
1455            graph
1456                .entities()
1457                .find(|e| e.name.eq_ignore_ascii_case(&key_entities[1])),
1458        ) {
1459            let path_description =
1460                format!("Connection between {} and {}", source.name, target.name);
1461            let neighbors_source = graph.get_neighbors(&source.id);
1462            let neighbors_target = graph.get_neighbors(&target.id);
1463
1464            // Check for direct connection
1465            if neighbors_source
1466                .iter()
1467                .any(|(neighbor, _)| neighbor.id == target.id)
1468            {
1469                results.push(SearchResult {
1470                    id: format!("path_{}_{}", source.id, target.id),
1471                    content: format!("Direct relationship: {path_description}"),
1472                    score: 0.8,
1473                    result_type: ResultType::GraphPath,
1474                    entities: vec![source.name.clone(), target.name.clone()],
1475                    source_chunks: Vec::new(),
1476                });
1477            }
1478
1479            // Check for indirect connections through common neighbors
1480            for (neighbor_s, rel_s) in &neighbors_source {
1481                for (neighbor_t, rel_t) in &neighbors_target {
1482                    if neighbor_s.id == neighbor_t.id {
1483                        results.push(SearchResult {
1484                            id: format!("path_{}_{}_{}", source.id, neighbor_s.id, target.id),
1485                            content: format!(
1486                                "Indirect relationship via {}: {} -> {} -> {}",
1487                                neighbor_s.name, source.name, neighbor_s.name, target.name
1488                            ),
1489                            score: 0.6 * rel_s.confidence * rel_t.confidence,
1490                            result_type: ResultType::GraphPath,
1491                            entities: vec![
1492                                source.name.clone(),
1493                                neighbor_s.name.clone(),
1494                                target.name.clone(),
1495                            ],
1496                            source_chunks: Vec::new(),
1497                        });
1498                    }
1499                }
1500            }
1501        }
1502
1503        Ok(results)
1504    }
1505
1506    /// Community-based search for exploratory queries
1507    fn community_based_search(
1508        &self,
1509        query_embedding: &[f32],
1510        graph: &KnowledgeGraph,
1511    ) -> Result<Vec<SearchResult>> {
1512        let mut results = Vec::new();
1513        let mut entity_scores: HashMap<String, f32> = HashMap::new();
1514
1515        // Calculate centrality-like scores for entities
1516        for entity in graph.entities() {
1517            let neighbors = graph.get_neighbors(&entity.id);
1518            let centrality_score = neighbors.len() as f32 * 0.1;
1519
1520            // Combine with embedding similarity
1521            if let Some(embedding) = &entity.embedding {
1522                let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1523                entity_scores.insert(entity.id.to_string(), centrality_score + similarity);
1524            }
1525        }
1526
1527        // Select top entities by combined score
1528        let mut sorted_entities: Vec<_> = entity_scores.iter().collect();
1529        sorted_entities.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());
1530
1531        for (entity_id, score) in sorted_entities.iter().take(3) {
1532            if let Some(entity) = graph.entities().find(|e| e.id.to_string() == **entity_id) {
1533                // Get context from chunks where this entity is mentioned
1534                let mut entity_context = String::new();
1535                for mention in entity.mentions.iter().take(2) {
1536                    if let Some(chunk) = graph.chunks().find(|c| c.id == mention.chunk_id) {
1537                        let chunk_excerpt = if chunk.content.len() > 200 {
1538                            format!("{}...", &chunk.content[..200])
1539                        } else {
1540                            chunk.content.clone()
1541                        };
1542                        entity_context.push_str(&chunk_excerpt);
1543                        entity_context.push(' ');
1544                    }
1545                }
1546
1547                // If no context found, provide a meaningful description
1548                if entity_context.is_empty() {
1549                    entity_context = format!(
1550                        "{} is a {} character in the story.",
1551                        entity.name, entity.entity_type
1552                    );
1553                }
1554
1555                results.push(SearchResult {
1556                    id: entity.id.to_string(),
1557                    content: entity_context,
1558                    score: **score,
1559                    result_type: ResultType::Entity,
1560                    entities: vec![entity.name.clone()],
1561                    source_chunks: entity
1562                        .mentions
1563                        .iter()
1564                        .map(|m| m.chunk_id.to_string())
1565                        .collect(),
1566                });
1567            }
1568        }
1569
1570        Ok(results)
1571    }
1572
1573    /// Helper method to detect abstract concepts
1574    fn has_abstract_concepts(&self, words: &[&str]) -> bool {
1575        const ABSTRACT_INDICATORS: &[&str] = &[
1576            "concept",
1577            "idea",
1578            "theory",
1579            "principle",
1580            "philosophy",
1581            "meaning",
1582            "understanding",
1583            "knowledge",
1584            "wisdom",
1585            "truth",
1586            "beauty",
1587            "justice",
1588        ];
1589        words
1590            .iter()
1591            .any(|&word| ABSTRACT_INDICATORS.contains(&word))
1592    }
1593
1594    /// Helper method to detect question words
1595    fn has_question_words(&self, words: &[&str]) -> bool {
1596        const QUESTION_WORDS: &[&str] = &[
1597            "what", "how", "why", "when", "where", "who", "which", "explain", "describe",
1598        ];
1599        words.iter().any(|&word| QUESTION_WORDS.contains(&word))
1600    }
1601
1602    /// Create content signature for deduplication
1603    fn create_content_signature(&self, content: &str) -> String {
1604        // Simple signature based on first 50 characters and length
1605        let prefix = Self::safe_truncate(content, 50);
1606        format!(
1607            "{}_{}",
1608            prefix
1609                .chars()
1610                .filter(|c| c.is_alphanumeric())
1611                .collect::<String>(),
1612            content.len()
1613        )
1614    }
1615
1616    /// Graph traversal search for path-based results (legacy)
1617    fn graph_traversal_search(
1618        &self,
1619        _query_embedding: &[f32],
1620        _graph: &KnowledgeGraph,
1621    ) -> Result<Vec<SearchResult>> {
1622        // Placeholder for graph traversal algorithms
1623        // This would implement algorithms like:
1624        // - Random walks
1625        // - Shortest paths between relevant entities
1626        // - Community detection
1627        // - PageRank-style scoring
1628
1629        Ok(Vec::new())
1630    }
1631
1632    /// Find entities most relevant to the query
1633    fn find_relevant_entities(
1634        &self,
1635        query_embedding: &[f32],
1636        graph: &KnowledgeGraph,
1637    ) -> Result<Vec<(EntityId, f32)>> {
1638        let mut similarities = Vec::new();
1639
1640        for entity in graph.entities() {
1641            if let Some(embedding) = &entity.embedding {
1642                let similarity = VectorUtils::cosine_similarity(query_embedding, embedding);
1643                if similarity >= self.config.similarity_threshold {
1644                    similarities.push((entity.id.clone(), similarity));
1645                }
1646            }
1647        }
1648
1649        // Sort by similarity
1650        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
1651
1652        Ok(similarities)
1653    }
1654
1655    /// Expand search through graph relationships
1656    fn expand_through_relationships(
1657        &self,
1658        start_entity: &EntityId,
1659        graph: &KnowledgeGraph,
1660        max_depth: usize,
1661        visited: &mut HashSet<EntityId>,
1662    ) -> Result<Vec<EntityId>> {
1663        let mut results = Vec::new();
1664        let mut current_level = vec![start_entity.clone()];
1665        visited.insert(start_entity.clone());
1666
1667        for _depth in 0..max_depth {
1668            let mut next_level = Vec::new();
1669
1670            for entity_id in &current_level {
1671                results.push(entity_id.clone());
1672
1673                // Get neighbors through graph relationships
1674                let neighbors = graph.get_neighbors(entity_id);
1675                for (neighbor_entity, _relationship) in neighbors {
1676                    if !visited.contains(&neighbor_entity.id) {
1677                        visited.insert(neighbor_entity.id.clone());
1678                        next_level.push(neighbor_entity.id.clone());
1679                    }
1680                }
1681            }
1682
1683            if next_level.is_empty() {
1684                break;
1685            }
1686
1687            current_level = next_level;
1688        }
1689
1690        Ok(results)
1691    }
1692
1693    /// Simple stop word detection (English)
1694    fn is_stop_word(&self, word: &str) -> bool {
1695        const STOP_WORDS: &[&str] = &[
1696            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
1697            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
1698            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
1699            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
1700            "go", "me",
1701        ];
1702        STOP_WORDS.contains(&word)
1703    }
1704
1705    /// Rank and deduplicate search results (legacy)
1706    fn rank_and_deduplicate(&self, mut results: Vec<SearchResult>) -> Result<Vec<SearchResult>> {
1707        // Sort by score descending
1708        results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
1709
1710        // Deduplicate by ID
1711        let mut seen_ids = HashSet::new();
1712        let mut deduplicated = Vec::new();
1713
1714        for result in results {
1715            if !seen_ids.contains(&result.id) {
1716                seen_ids.insert(result.id.clone());
1717                deduplicated.push(result);
1718            }
1719        }
1720
1721        Ok(deduplicated)
1722    }
1723
1724    /// Vector-based search
1725    pub async fn vector_search(
1726        &mut self,
1727        query: &str,
1728        max_results: usize,
1729    ) -> Result<Vec<SearchResult>> {
1730        let query_embedding = self.embedding_generator.generate_embedding(query);
1731        let similar_vectors = self
1732            .vector_store
1733            .search(&query_embedding, max_results)
1734            .await?;
1735
1736        let mut results = Vec::new();
1737        for store_result in similar_vectors {
1738            results.push(SearchResult {
1739                id: store_result.id.clone(),
1740                content: format!("Vector result for: {}", store_result.id),
1741                score: store_result.score,
1742                result_type: ResultType::Chunk,
1743                entities: Vec::new(),
1744                source_chunks: vec![store_result.id],
1745            });
1746        }
1747
1748        Ok(results)
1749    }
1750
1751    /// Graph-based search
1752    pub fn graph_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1753        // Simplified graph search - in a real implementation this would traverse the graph
1754        let mut results = Vec::new();
1755        results.push(SearchResult {
1756            id: format!("graph_result_{}", query.len()),
1757            content: format!("Graph-based result for: {query}"),
1758            score: 0.7,
1759            result_type: ResultType::GraphPath,
1760            entities: Vec::new(),
1761            source_chunks: Vec::new(),
1762        });
1763
1764        Ok(results.into_iter().take(max_results).collect())
1765    }
1766
1767    /// Hierarchical search (public wrapper)
1768    pub fn public_hierarchical_search(
1769        &self,
1770        query: &str,
1771        max_results: usize,
1772    ) -> Result<Vec<SearchResult>> {
1773        // Simplified hierarchical search - in a real implementation this would use document trees
1774        let mut results = Vec::new();
1775        results.push(SearchResult {
1776            id: format!("hierarchical_result_{}", query.len()),
1777            content: format!("Hierarchical result for: {query}"),
1778            score: 0.8,
1779            result_type: ResultType::HierarchicalSummary,
1780            entities: Vec::new(),
1781            source_chunks: Vec::new(),
1782        });
1783
1784        Ok(results.into_iter().take(max_results).collect())
1785    }
1786
1787    /// BM25-based search
1788    pub fn bm25_search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
1789        // Simplified BM25 search - in a real implementation this would use proper BM25 scoring
1790        let mut results = Vec::new();
1791        results.push(SearchResult {
1792            id: format!("bm25_result_{}", query.len()),
1793            content: format!("BM25 result for: {query}"),
1794            score: 0.75,
1795            result_type: ResultType::Chunk,
1796            entities: Vec::new(),
1797            source_chunks: Vec::new(),
1798        });
1799
1800        Ok(results.into_iter().take(max_results).collect())
1801    }
1802
1803    /// Get retrieval statistics
1804    pub fn get_statistics(&self) -> RetrievalStatistics {
1805        // let vector_stats = self.vector_index.statistics();
1806
1807        RetrievalStatistics {
1808            indexed_vectors: 0,  // vector_stats.vector_count,
1809            vector_dimension: 0, // vector_stats.dimension,
1810            index_built: false,  // vector_stats.index_built,
1811            config: self.config.clone(),
1812        }
1813    }
1814
1815    /// Safely truncate a string to a maximum byte length, respecting UTF-8 character boundaries
1816    fn safe_truncate(s: &str, max_bytes: usize) -> String {
1817        if s.len() <= max_bytes {
1818            return s.to_string();
1819        }
1820
1821        // Find the largest valid character boundary <= max_bytes
1822        let mut end_idx = max_bytes;
1823        while end_idx > 0 && !s.is_char_boundary(end_idx) {
1824            end_idx -= 1;
1825        }
1826
1827        s[..end_idx].to_string()
1828    }
1829
1830    /// Save retrieval system state to JSON file
1831    pub fn save_state_to_json(&self, file_path: &str) -> Result<()> {
1832        use std::fs;
1833
1834        let mut json_data = json::JsonValue::new_object();
1835
1836        // Add metadata
1837        json_data["metadata"] = json::object! {
1838            "format_version" => "1.0",
1839            "created_at" => chrono::Utc::now().to_rfc3339(),
1840            "config" => json::object! {
1841                "top_k" => self.config.top_k,
1842                "similarity_threshold" => self.config.similarity_threshold,
1843                "max_expansion_depth" => self.config.max_expansion_depth,
1844                "entity_weight" => self.config.entity_weight,
1845                "chunk_weight" => self.config.chunk_weight,
1846                "graph_weight" => self.config.graph_weight
1847            }
1848        };
1849
1850        // Add vector index statistics
1851        // let vector_stats = self.vector_index.statistics();
1852        json_data["vector_index"] = json::object! {
1853            "vector_count" => 0, // vector_stats.vector_count,
1854            "dimension" => 0, // vector_stats.dimension,
1855            "index_built" => false, // vector_stats.index_built,
1856            "min_norm" => 0.0, // vector_stats.min_norm,
1857            "max_norm" => 0.0, // vector_stats.max_norm,
1858            "avg_norm" => 0.0 // vector_stats.avg_norm
1859        };
1860
1861        // Add embedding generator info
1862        json_data["embedding_generator"] = json::object! {
1863            "dimension" => self.embedding_generator.dimension(),
1864            "cached_words" => self.embedding_generator.cached_words()
1865        };
1866
1867        // Add parallel processing info
1868        #[cfg(feature = "parallel-processing")]
1869        {
1870            json_data["parallel_enabled"] = self.parallel_processor.is_some().into();
1871        }
1872        #[cfg(not(feature = "parallel-processing"))]
1873        {
1874            json_data["parallel_enabled"] = false.into();
1875        }
1876
1877        // Save to file
1878        fs::write(file_path, json_data.dump())?;
1879        tracing::info!("Retrieval system state saved to {file_path}");
1880
1881        Ok(())
1882    }
1883}
1884
/// Statistics about the retrieval system
///
/// Snapshot produced by `RetrievalSystem::get_statistics`. The vector-index
/// fields are currently placeholders (always 0 / false) because the index
/// statistics are not wired up yet; `config` reflects live configuration.
#[derive(Debug)]
pub struct RetrievalStatistics {
    /// Number of vectors indexed in the system
    pub indexed_vectors: usize,
    /// Dimensionality of the vector embeddings
    pub vector_dimension: usize,
    /// Whether the vector index has been built
    pub index_built: bool,
    /// Current retrieval configuration
    pub config: RetrievalConfig,
}
1897
1898impl RetrievalStatistics {
1899    /// Print retrieval statistics
1900    #[allow(dead_code)]
1901    pub fn print(&self) {
1902        tracing::info!("Retrieval System Statistics:");
1903        tracing::info!("  Indexed vectors: {}", self.indexed_vectors);
1904        tracing::info!("  Vector dimension: {}", self.vector_dimension);
1905        tracing::info!("  Index built: {}", self.index_built);
1906        tracing::info!("  Configuration:");
1907        tracing::info!("    Top K: {}", self.config.top_k);
1908        tracing::info!(
1909            "    Similarity threshold: {:.2}",
1910            self.config.similarity_threshold
1911        );
1912        tracing::info!(
1913            "    Max expansion depth: {}",
1914            self.config.max_expansion_depth
1915        );
1916        tracing::info!("    Entity weight: {:.2}", self.config.entity_weight);
1917        tracing::info!("    Chunk weight: {:.2}", self.config.chunk_weight);
1918        tracing::info!("    Graph weight: {:.2}", self.config.graph_weight);
1919    }
1920}
1921
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{config::Config, core::KnowledgeGraph};

    // Construction with default config must succeed.
    #[test]
    fn test_retrieval_system_creation() {
        let config = Config::default();
        let retrieval = RetrievalSystem::new(&config);
        assert!(retrieval.is_ok());
    }

    // The placeholder query path should echo the query text back in at
    // least one result.
    #[test]
    fn test_query_placeholder() {
        let config = Config::default();
        let retrieval = RetrievalSystem::new(&config).unwrap();

        let results = retrieval.query("test query");
        assert!(results.is_ok());

        let results = results.unwrap();
        assert!(!results.is_empty());
        assert!(results[0].contains("test query"));
    }

    // Indexing an empty graph is a no-op that must not error.
    #[tokio::test]
    async fn test_graph_indexing() {
        let config = Config::default();
        let mut retrieval = RetrievalSystem::new(&config).unwrap();
        let graph = KnowledgeGraph::new();

        let result = retrieval.index_graph(&graph).await;
        assert!(result.is_ok());
    }

    // ============================================================================
    // ExplainedAnswer Tests
    // ============================================================================

    // With non-empty results, an ExplainedAnswer must carry a confidence in
    // (0, 1], at least one source, and at least one reasoning step.
    #[test]
    fn test_explained_answer_creation() {
        let search_results = vec![
            SearchResult {
                id: "chunk_1".to_string(),
                content: "This is the first relevant chunk about climate change.".to_string(),
                score: 0.85,
                result_type: ResultType::Chunk,
                entities: vec!["climate".to_string(), "environment".to_string()],
                source_chunks: vec!["doc1_chunk1".to_string()],
            },
            SearchResult {
                id: "chunk_2".to_string(),
                content: "Another chunk discussing environmental policies.".to_string(),
                score: 0.72,
                result_type: ResultType::Chunk,
                entities: vec!["policy".to_string(), "environment".to_string()],
                source_chunks: vec!["doc1_chunk2".to_string()],
            },
        ];

        let explained = ExplainedAnswer::from_results(
            "Climate change is a major environmental concern.".to_string(),
            &search_results,
            "What is climate change?",
        );

        assert!(!explained.answer.is_empty());
        assert!(explained.confidence > 0.0 && explained.confidence <= 1.0);
        assert!(!explained.sources.is_empty());
        assert!(!explained.reasoning_steps.is_empty());
    }

    // With no results, confidence must collapse to 0 and sources be empty,
    // but the query-analysis reasoning step is still produced.
    #[test]
    fn test_explained_answer_empty_results() {
        let explained = ExplainedAnswer::from_results(
            "No relevant information found.".to_string(),
            &[],
            "What is something unknown?",
        );

        assert_eq!(explained.confidence, 0.0);
        assert!(explained.sources.is_empty());
        assert!(!explained.reasoning_steps.is_empty()); // Should still have query analysis step
    }

    // format_display() must render all four markdown section headers.
    #[test]
    fn test_explained_answer_format_display() {
        let search_results = vec![SearchResult {
            id: "test_chunk".to_string(),
            content: "Test content about technology.".to_string(),
            score: 0.9,
            result_type: ResultType::Chunk,
            entities: vec!["technology".to_string()],
            source_chunks: vec!["doc1_chunk1".to_string()],
        }];

        let explained = ExplainedAnswer::from_results(
            "Technology is important.".to_string(),
            &search_results,
            "Why is technology important?",
        );

        let formatted = explained.format_display();

        assert!(formatted.contains("**Answer:**"));
        assert!(formatted.contains("**Confidence:**"));
        assert!(formatted.contains("**Reasoning:**"));
        assert!(formatted.contains("**Sources:**"));
    }

    // Reasoning steps must be numbered 1..=n in order, with non-empty
    // descriptions and confidences in [0, 1].
    #[test]
    fn test_reasoning_steps_structure() {
        let search_results = vec![SearchResult {
            id: "entity_1".to_string(),
            content: "Entity description".to_string(),
            score: 0.8,
            result_type: ResultType::Entity,
            entities: vec!["person".to_string(), "organization".to_string()],
            source_chunks: vec![],
        }];

        let explained = ExplainedAnswer::from_results(
            "Answer text".to_string(),
            &search_results,
            "Who are the key people?",
        );

        // Check reasoning steps are numbered correctly
        for (i, step) in explained.reasoning_steps.iter().enumerate() {
            assert_eq!(step.step_number as usize, i + 1);
            assert!(!step.description.is_empty());
            assert!(step.confidence >= 0.0 && step.confidence <= 1.0);
        }
    }

    // Each ResultType must map to its corresponding SourceType
    // (Chunk -> TextChunk, Entity -> Entity, GraphPath -> Relationship).
    #[test]
    fn test_source_reference_types() {
        let search_results = vec![
            SearchResult {
                id: "chunk".to_string(),
                content: "Chunk content".to_string(),
                score: 0.7,
                result_type: ResultType::Chunk,
                entities: vec![],
                source_chunks: vec![],
            },
            SearchResult {
                id: "entity".to_string(),
                content: "Entity content".to_string(),
                score: 0.6,
                result_type: ResultType::Entity,
                entities: vec![],
                source_chunks: vec![],
            },
            SearchResult {
                id: "path".to_string(),
                content: "Graph path content".to_string(),
                score: 0.5,
                result_type: ResultType::GraphPath,
                entities: vec![],
                source_chunks: vec![],
            },
        ];

        let explained =
            ExplainedAnswer::from_results("Answer".to_string(), &search_results, "Query");

        let source_types: Vec<_> = explained.sources.iter().map(|s| &s.source_type).collect();
        assert!(source_types.contains(&&SourceType::TextChunk));
        assert!(source_types.contains(&&SourceType::Entity));
        assert!(source_types.contains(&&SourceType::Relationship));
    }
}