//! Evaluation framework for the GraphRAG system.
//!
//! This module provides two complementary evaluation approaches:
//!
//! ## 1. LLM-based Query Result Evaluation
//! Evaluate GraphRAG query results using LLM-based metrics:
//! - Relevance: How relevant is the answer to the query?
//! - Faithfulness: Is the answer grounded in the retrieved context?
//! - Completeness: Does the answer address all aspects of the query?
//! - Coherence: Is the answer well-structured and easy to understand?
//! - Groundedness: Are entity names and relationships correctly mentioned?
//!
//! ## 2. Pipeline Phase Validation
//! Validate each phase of the GraphRAG pipeline:
//! - Document Processing: Chunking and enrichment validation
//! - Entity Extraction: Entity quality and coverage checks
//! - Relationship Extraction: Relationship validity and connectivity
//! - Graph Construction: Overall graph structure validation

// Pipeline phase validators (document processing, entity/relationship
// extraction, graph construction).
pub mod pipeline_validation;

// Re-export the validation types so callers can use
// `evaluation::PhaseValidation` etc. without the extra path segment.
pub use pipeline_validation::{
    PhaseValidation, ValidationCheck, PipelineValidationReport,
    DocumentProcessingValidator, EntityExtractionValidator,
    RelationshipExtractionValidator, GraphConstructionValidator,
};

use crate::{Entity, Relationship, Result, GraphRAGError};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// A query result from GraphRAG that can be evaluated.
///
/// Bundles the user query, the generated answer, and the retrieval context
/// (entities, relationships, text chunks) that an LLM judge needs in order
/// to score the answer. Construct via [`EvaluableQueryResultBuilder`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluableQueryResult {
    /// The original user query
    pub query: String,
    /// The generated answer/response
    pub answer: String,
    /// Retrieved entities used in the answer
    pub retrieved_entities: Vec<Entity>,
    /// Retrieved relationships used in the answer
    pub retrieved_relationships: Vec<Relationship>,
    /// Relevant text chunks/context
    pub context_chunks: Vec<String>,
    /// Metadata about the retrieval process
    pub metadata: ResultMetadata,
}

/// Metadata about how the result was generated.
///
/// Counts here mirror the lengths of the corresponding vectors on
/// [`EvaluableQueryResult`] (the builder fills them from `Vec::len()`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResultMetadata {
    /// Number of entities retrieved
    pub entities_count: usize,
    /// Number of relationships retrieved
    pub relationships_count: usize,
    /// Number of context chunks used
    pub chunks_count: usize,
    /// Retrieval strategy used (semantic, keyword, hybrid)
    pub retrieval_strategy: String,
    /// Processing time in milliseconds
    pub processing_time_ms: u64,
    /// Additional custom fields
    pub custom: HashMap<String, String>,
}

/// LLM evaluation prompt template for query results.
///
/// The template contains `{query}`, `{answer}`, `{entities}`, etc.
/// placeholders that `generate` fills via plain string replacement.
#[derive(Debug, Clone)]
pub struct LLMEvaluationPrompt {
    /// Template for the evaluation prompt
    pub template: String,
}

73impl Default for LLMEvaluationPrompt {
74    fn default() -> Self {
75        Self {
76            template: Self::default_template(),
77        }
78    }
79}
80
81impl LLMEvaluationPrompt {
82    /// Default evaluation prompt template
83    fn default_template() -> String {
84        r#"You are an expert evaluator for question-answering systems. Evaluate the following GraphRAG query result.
85
86## Query
87{query}
88
89## Generated Answer
90{answer}
91
92## Retrieved Context
93### Entities ({entities_count} total)
94{entities}
95
96### Relationships ({relationships_count} total)
97{relationships}
98
99### Text Chunks ({chunks_count} total)
100{chunks}
101
102## Evaluation Criteria
103Please evaluate the answer on the following dimensions (score 1-5, where 5 is best):
104
1051. **Relevance**: How well does the answer address the query?
106   - 5: Perfectly addresses the query
107   - 3: Partially addresses the query
108   - 1: Not relevant to the query
109
1102. **Faithfulness**: Is the answer grounded in the provided context?
111   - 5: Fully supported by context, no hallucination
112   - 3: Mostly supported, minor extrapolation
113   - 1: Contains unsupported claims
114
1153. **Completeness**: Does the answer cover all aspects of the query?
116   - 5: Comprehensive, addresses all aspects
117   - 3: Covers main points, misses some details
118   - 1: Incomplete, misses key information
119
1204. **Coherence**: Is the answer well-structured and clear?
121   - 5: Excellent structure, very clear
122   - 3: Adequate structure, somewhat clear
123   - 1: Poor structure, confusing
124
1255. **Groundedness**: Are entity names and relationships correctly mentioned?
126   - 5: All entities/relationships accurate
127   - 3: Minor inaccuracies
128   - 1: Significant errors in entity/relationship mentions
129
130## Output Format
131Provide your evaluation in the following JSON format:
132
133```json
134{{
135  "relevance": {{
136    "score": <1-5>,
137    "reasoning": "<brief explanation>"
138  }},
139  "faithfulness": {{
140    "score": <1-5>,
141    "reasoning": "<brief explanation>"
142  }},
143  "completeness": {{
144    "score": <1-5>,
145    "reasoning": "<brief explanation>"
146  }},
147  "coherence": {{
148    "score": <1-5>,
149    "reasoning": "<brief explanation>"
150  }},
151  "groundedness": {{
152    "score": <1-5>,
153    "reasoning": "<brief explanation>"
154  }},
155  "overall_score": <average of all scores>,
156  "summary": "<overall assessment in 2-3 sentences>"
157}}
158```
159
160Evaluate now:"#.to_string()
161    }
162
163    /// Generate evaluation prompt for a query result
164    pub fn generate(&self, result: &EvaluableQueryResult) -> String {
165        let entities_str = self.format_entities(&result.retrieved_entities);
166        let relationships_str = self.format_relationships(&result.retrieved_relationships);
167        let chunks_str = self.format_chunks(&result.context_chunks);
168
169        self.template
170            .replace("{query}", &result.query)
171            .replace("{answer}", &result.answer)
172            .replace("{entities_count}", &result.metadata.entities_count.to_string())
173            .replace("{relationships_count}", &result.metadata.relationships_count.to_string())
174            .replace("{chunks_count}", &result.metadata.chunks_count.to_string())
175            .replace("{entities}", &entities_str)
176            .replace("{relationships}", &relationships_str)
177            .replace("{chunks}", &chunks_str)
178    }
179
180    fn format_entities(&self, entities: &[Entity]) -> String {
181        if entities.is_empty() {
182            return "No entities retrieved.".to_string();
183        }
184
185        entities
186            .iter()
187            .take(10) // Limit to top 10 for prompt length
188            .map(|e| format!("- {} (type: {}, confidence: {:.2})", e.name, e.entity_type, e.confidence))
189            .collect::<Vec<_>>()
190            .join("\n")
191    }
192
193    fn format_relationships(&self, relationships: &[Relationship]) -> String {
194        if relationships.is_empty() {
195            return "No relationships retrieved.".to_string();
196        }
197
198        relationships
199            .iter()
200            .take(10) // Limit to top 10 for prompt length
201            .map(|r| format!("- {} --[{}]--> {} (confidence: {:.2})",
202                r.source, r.relation_type, r.target, r.confidence))
203            .collect::<Vec<_>>()
204            .join("\n")
205    }
206
207    fn format_chunks(&self, chunks: &[String]) -> String {
208        if chunks.is_empty() {
209            return "No context chunks retrieved.".to_string();
210        }
211
212        chunks
213            .iter()
214            .take(5) // Limit to top 5 chunks
215            .enumerate()
216            .map(|(i, chunk)| {
217                let preview = if chunk.len() > 200 {
218                    format!("{}...", &chunk[..200])
219                } else {
220                    chunk.clone()
221                };
222                format!("Chunk {}:\n{}\n", i + 1, preview)
223            })
224            .collect::<Vec<_>>()
225            .join("\n")
226    }
227}
228
/// Parsed LLM evaluation result.
///
/// Deserialized from the JSON the judge model returns (the shape requested
/// by the default prompt template); see [`LLMEvaluation::from_json`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMEvaluation {
    /// Relevance score and reasoning
    pub relevance: DimensionScore,
    /// Faithfulness score and reasoning
    pub faithfulness: DimensionScore,
    /// Completeness score and reasoning
    pub completeness: DimensionScore,
    /// Coherence score and reasoning
    pub coherence: DimensionScore,
    /// Groundedness score and reasoning
    pub groundedness: DimensionScore,
    /// Overall average score
    pub overall_score: f32,
    /// Summary assessment
    pub summary: String,
}

/// Score for a single evaluation dimension.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DimensionScore {
    /// Score 1-5 (5 is best, per the prompt's rubric)
    pub score: u8,
    /// Reasoning for the score
    pub reasoning: String,
}

257impl LLMEvaluation {
258    /// Parse LLM response JSON into structured evaluation
259    pub fn from_json(json_str: &str) -> Result<Self> {
260        serde_json::from_str(json_str).map_err(|e| GraphRAGError::Serialization {
261            message: format!("Failed to parse LLM evaluation JSON: {}", e),
262        })
263    }
264
265    /// Check if the evaluation passes a minimum quality threshold
266    pub fn passes_threshold(&self, min_score: f32) -> bool {
267        self.overall_score >= min_score
268    }
269
270    /// Get the dimension with the lowest score
271    pub fn weakest_dimension(&self) -> (&str, &DimensionScore) {
272        let dimensions = [
273            ("relevance", &self.relevance),
274            ("faithfulness", &self.faithfulness),
275            ("completeness", &self.completeness),
276            ("coherence", &self.coherence),
277            ("groundedness", &self.groundedness),
278        ];
279
280        dimensions
281            .iter()
282            .min_by_key(|(_, score)| score.score)
283            .map(|(name, score)| (*name, *score))
284            .unwrap_or(("unknown", &self.relevance))
285    }
286
287    /// Generate a report string
288    pub fn report(&self) -> String {
289        format!(
290            r#"## LLM Evaluation Report
291
292**Overall Score**: {:.2}/5.0
293
294### Dimension Scores
295- Relevance:     {}/5 - {}
296- Faithfulness:  {}/5 - {}
297- Completeness:  {}/5 - {}
298- Coherence:     {}/5 - {}
299- Groundedness:  {}/5 - {}
300
301### Summary
302{}
303
304### Weakest Dimension
305{}: {} (score {}/5)
306"#,
307            self.overall_score,
308            self.relevance.score, self.relevance.reasoning,
309            self.faithfulness.score, self.faithfulness.reasoning,
310            self.completeness.score, self.completeness.reasoning,
311            self.coherence.score, self.coherence.reasoning,
312            self.groundedness.score, self.groundedness.reasoning,
313            self.summary,
314            self.weakest_dimension().0,
315            self.weakest_dimension().1.reasoning,
316            self.weakest_dimension().1.score
317        )
318    }
319}
320
/// Builder for creating evaluable query results.
///
/// `query` and `answer` are mandatory (build fails without them); all other
/// fields default to empty collections / "unknown" strategy / zero time.
pub struct EvaluableQueryResultBuilder {
    // Required: checked at build() time.
    query: Option<String>,
    answer: Option<String>,
    // Optional retrieval context; counts are derived from these at build().
    entities: Vec<Entity>,
    relationships: Vec<Relationship>,
    chunks: Vec<String>,
    retrieval_strategy: String,
    processing_time_ms: u64,
    custom: HashMap<String, String>,
}

333impl EvaluableQueryResultBuilder {
334    /// Create a new builder
335    pub fn new() -> Self {
336        Self {
337            query: None,
338            answer: None,
339            entities: Vec::new(),
340            relationships: Vec::new(),
341            chunks: Vec::new(),
342            retrieval_strategy: "unknown".to_string(),
343            processing_time_ms: 0,
344            custom: HashMap::new(),
345        }
346    }
347
348    /// Set the query
349    pub fn query(mut self, query: impl Into<String>) -> Self {
350        self.query = Some(query.into());
351        self
352    }
353
354    /// Set the answer
355    pub fn answer(mut self, answer: impl Into<String>) -> Self {
356        self.answer = Some(answer.into());
357        self
358    }
359
360    /// Add retrieved entities
361    pub fn entities(mut self, entities: Vec<Entity>) -> Self {
362        self.entities = entities;
363        self
364    }
365
366    /// Add retrieved relationships
367    pub fn relationships(mut self, relationships: Vec<Relationship>) -> Self {
368        self.relationships = relationships;
369        self
370    }
371
372    /// Add context chunks
373    pub fn chunks(mut self, chunks: Vec<String>) -> Self {
374        self.chunks = chunks;
375        self
376    }
377
378    /// Set retrieval strategy
379    pub fn retrieval_strategy(mut self, strategy: impl Into<String>) -> Self {
380        self.retrieval_strategy = strategy.into();
381        self
382    }
383
384    /// Set processing time
385    pub fn processing_time_ms(mut self, time_ms: u64) -> Self {
386        self.processing_time_ms = time_ms;
387        self
388    }
389
390    /// Add custom metadata
391    pub fn custom_metadata(mut self, key: String, value: String) -> Self {
392        self.custom.insert(key, value);
393        self
394    }
395
396    /// Build the evaluable query result
397    pub fn build(self) -> Result<EvaluableQueryResult> {
398        let query = self.query.ok_or_else(|| GraphRAGError::Config {
399            message: "Query is required".to_string(),
400        })?;
401        let answer = self.answer.ok_or_else(|| GraphRAGError::Config {
402            message: "Answer is required".to_string(),
403        })?;
404
405        Ok(EvaluableQueryResult {
406            query,
407            answer,
408            metadata: ResultMetadata {
409                entities_count: self.entities.len(),
410                relationships_count: self.relationships.len(),
411                chunks_count: self.chunks.len(),
412                retrieval_strategy: self.retrieval_strategy,
413                processing_time_ms: self.processing_time_ms,
414                custom: self.custom,
415            },
416            retrieved_entities: self.entities,
417            retrieved_relationships: self.relationships,
418            context_chunks: self.chunks,
419        })
420    }
421}
422
423impl Default for EvaluableQueryResultBuilder {
424    fn default() -> Self {
425        Self::new()
426    }
427}
428
#[cfg(test)]
mod tests {
    use super::*;
    // NOTE: the previous `use crate::{EntityId, ChunkId}` imported ChunkId,
    // which no test uses — dropped to avoid an unused-import warning.
    use crate::EntityId;

    /// The default prompt should embed the query, answer, entity list,
    /// and the closing instruction via placeholder substitution.
    #[test]
    fn test_prompt_generation() {
        let entity = Entity {
            id: EntityId::new("e1".to_string()),
            name: "Alice".to_string(),
            entity_type: "person".to_string(),
            confidence: 0.9,
            mentions: vec![],
            embedding: None,
        };

        let result = EvaluableQueryResultBuilder::new()
            .query("Who is Alice?")
            .answer("Alice is a person mentioned in the context.")
            .entities(vec![entity])
            .chunks(vec!["Alice works at Stanford.".to_string()])
            .retrieval_strategy("semantic")
            .build()
            .unwrap();

        let prompt = LLMEvaluationPrompt::default();
        let generated = prompt.generate(&result);

        assert!(generated.contains("Who is Alice?"));
        assert!(generated.contains("Alice is a person"));
        assert!(generated.contains("Alice (type: person"));
        assert!(generated.contains("Evaluate now:"));
    }

    /// Well-formed judge JSON should round-trip into LLMEvaluation and
    /// threshold checks should compare against overall_score.
    #[test]
    fn test_evaluation_parsing() {
        let json = r#"{
            "relevance": {
                "score": 5,
                "reasoning": "Perfectly answers the question"
            },
            "faithfulness": {
                "score": 4,
                "reasoning": "Mostly grounded in context"
            },
            "completeness": {
                "score": 4,
                "reasoning": "Covers main points"
            },
            "coherence": {
                "score": 5,
                "reasoning": "Well structured"
            },
            "groundedness": {
                "score": 5,
                "reasoning": "All entities accurate"
            },
            "overall_score": 4.6,
            "summary": "High quality answer"
        }"#;

        let eval = LLMEvaluation::from_json(json).unwrap();
        assert_eq!(eval.relevance.score, 5);
        assert_eq!(eval.faithfulness.score, 4);
        assert!(eval.passes_threshold(4.0));
        assert!(!eval.passes_threshold(5.0));
    }

    /// weakest_dimension should return the lowest-scoring dimension.
    #[test]
    fn test_weakest_dimension() {
        let json = r#"{
            "relevance": {"score": 5, "reasoning": "Perfect"},
            "faithfulness": {"score": 3, "reasoning": "Some issues"},
            "completeness": {"score": 4, "reasoning": "Good"},
            "coherence": {"score": 5, "reasoning": "Excellent"},
            "groundedness": {"score": 4, "reasoning": "Accurate"},
            "overall_score": 4.2,
            "summary": "Good overall"
        }"#;

        let eval = LLMEvaluation::from_json(json).unwrap();
        let (name, score) = eval.weakest_dimension();
        assert_eq!(name, "faithfulness");
        assert_eq!(score.score, 3);
    }

    /// report() should include the overall score, per-dimension lines,
    /// and the summary text.
    #[test]
    fn test_report_generation() {
        let json = r#"{
            "relevance": {"score": 5, "reasoning": "Perfect"},
            "faithfulness": {"score": 4, "reasoning": "Good"},
            "completeness": {"score": 4, "reasoning": "Complete"},
            "coherence": {"score": 5, "reasoning": "Clear"},
            "groundedness": {"score": 4, "reasoning": "Accurate"},
            "overall_score": 4.4,
            "summary": "Excellent answer"
        }"#;

        let eval = LLMEvaluation::from_json(json).unwrap();
        let report = eval.report();

        // Check for numeric score (format may vary: 4.40 or 4.4)
        assert!(report.contains("4.4") || report.contains("4.40"),
                "Expected score 4.4 not found in report: {}", report);
        assert!(report.contains("5/5") && report.contains("Relevance"),
                "Expected 'Relevance: 5/5' not found");
        assert!(report.contains("Excellent answer"));

        // Verify the actual overall_score value
        assert!((eval.overall_score - 4.4).abs() < 0.01);
    }
}