Skip to main content

graphrag_core/evaluation/
mod.rs

1//! Evaluation framework for GraphRAG system
2//!
3//! This module provides two complementary evaluation approaches:
4//!
5//! ## 1. LLM-based Query Result Evaluation
6//! Evaluate GraphRAG query results using LLM-based metrics:
7//! - Relevance: How relevant is the answer to the query?
8//! - Faithfulness: Is the answer grounded in the retrieved context?
9//! - Completeness: Does the answer address all aspects of the query?
10//! - Coherence: Is the answer well-structured and easy to understand?
11//! - Groundedness: Are entity names and relationships correctly mentioned?
12//!
13//! ## 2. Pipeline Phase Validation
14//! Validate each phase of the GraphRAG pipeline:
15//! - Document Processing: Chunking and enrichment validation
16//! - Entity Extraction: Entity quality and coverage checks
17//! - Relationship Extraction: Relationship validity and connectivity
18//! - Graph Construction: Overall graph structure validation
19
20pub mod pipeline_validation;
21
22pub use pipeline_validation::{
23    DocumentProcessingValidator, EntityExtractionValidator, GraphConstructionValidator,
24    PhaseValidation, PipelineValidationReport, RelationshipExtractionValidator, ValidationCheck,
25};
26
27use crate::{Entity, GraphRAGError, Relationship, Result};
28use serde::{Deserialize, Serialize};
29use std::collections::HashMap;
30
/// A query result from GraphRAG that can be evaluated
///
/// Prefer building instances with [`EvaluableQueryResultBuilder`], which
/// derives the counts in [`ResultMetadata`] from the collections stored here;
/// the fields are public, so hand-built values may have inconsistent counts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluableQueryResult {
    /// The original user query
    pub query: String,
    /// The generated answer/response
    pub answer: String,
    /// Retrieved entities used in the answer
    pub retrieved_entities: Vec<Entity>,
    /// Retrieved relationships used in the answer
    pub retrieved_relationships: Vec<Relationship>,
    /// Relevant text chunks/context
    pub context_chunks: Vec<String>,
    /// Metadata about the retrieval process
    pub metadata: ResultMetadata,
}
47
/// Metadata about how the result was generated
///
/// The three `*_count` fields duplicate the lengths of the corresponding
/// collections in [`EvaluableQueryResult`] so that prompt generation can
/// report totals without re-measuring (the builder keeps them in sync).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResultMetadata {
    /// Number of entities retrieved
    pub entities_count: usize,
    /// Number of relationships retrieved
    pub relationships_count: usize,
    /// Number of context chunks used
    pub chunks_count: usize,
    /// Retrieval strategy used (semantic, keyword, hybrid)
    pub retrieval_strategy: String,
    /// Processing time in milliseconds
    pub processing_time_ms: u64,
    /// Additional custom fields
    pub custom: HashMap<String, String>,
}
64
/// LLM evaluation prompt template for query results
///
/// The template contains `{name}`-style markers (e.g. `{query}`, `{answer}`,
/// `{entities}`) that are filled in with plain string replacement by
/// `generate` — no `format!` machinery is involved.
#[derive(Debug, Clone)]
pub struct LLMEvaluationPrompt {
    /// Template for the evaluation prompt; see `generate` for the
    /// placeholders that are substituted.
    pub template: String,
}
71
72impl Default for LLMEvaluationPrompt {
73    fn default() -> Self {
74        Self {
75            template: Self::default_template(),
76        }
77    }
78}
79
80impl LLMEvaluationPrompt {
81    /// Default evaluation prompt template
82    fn default_template() -> String {
83        r#"You are an expert evaluator for question-answering systems. Evaluate the following GraphRAG query result.
84
85## Query
86{query}
87
88## Generated Answer
89{answer}
90
91## Retrieved Context
92### Entities ({entities_count} total)
93{entities}
94
95### Relationships ({relationships_count} total)
96{relationships}
97
98### Text Chunks ({chunks_count} total)
99{chunks}
100
101## Evaluation Criteria
102Please evaluate the answer on the following dimensions (score 1-5, where 5 is best):
103
1041. **Relevance**: How well does the answer address the query?
105   - 5: Perfectly addresses the query
106   - 3: Partially addresses the query
107   - 1: Not relevant to the query
108
1092. **Faithfulness**: Is the answer grounded in the provided context?
110   - 5: Fully supported by context, no hallucination
111   - 3: Mostly supported, minor extrapolation
112   - 1: Contains unsupported claims
113
1143. **Completeness**: Does the answer cover all aspects of the query?
115   - 5: Comprehensive, addresses all aspects
116   - 3: Covers main points, misses some details
117   - 1: Incomplete, misses key information
118
1194. **Coherence**: Is the answer well-structured and clear?
120   - 5: Excellent structure, very clear
121   - 3: Adequate structure, somewhat clear
122   - 1: Poor structure, confusing
123
1245. **Groundedness**: Are entity names and relationships correctly mentioned?
125   - 5: All entities/relationships accurate
126   - 3: Minor inaccuracies
127   - 1: Significant errors in entity/relationship mentions
128
129## Output Format
130Provide your evaluation in the following JSON format:
131
132```json
133{{
134  "relevance": {{
135    "score": <1-5>,
136    "reasoning": "<brief explanation>"
137  }},
138  "faithfulness": {{
139    "score": <1-5>,
140    "reasoning": "<brief explanation>"
141  }},
142  "completeness": {{
143    "score": <1-5>,
144    "reasoning": "<brief explanation>"
145  }},
146  "coherence": {{
147    "score": <1-5>,
148    "reasoning": "<brief explanation>"
149  }},
150  "groundedness": {{
151    "score": <1-5>,
152    "reasoning": "<brief explanation>"
153  }},
154  "overall_score": <average of all scores>,
155  "summary": "<overall assessment in 2-3 sentences>"
156}}
157```
158
159Evaluate now:"#.to_string()
160    }
161
162    /// Generate evaluation prompt for a query result
163    pub fn generate(&self, result: &EvaluableQueryResult) -> String {
164        let entities_str = self.format_entities(&result.retrieved_entities);
165        let relationships_str = self.format_relationships(&result.retrieved_relationships);
166        let chunks_str = self.format_chunks(&result.context_chunks);
167
168        self.template
169            .replace("{query}", &result.query)
170            .replace("{answer}", &result.answer)
171            .replace(
172                "{entities_count}",
173                &result.metadata.entities_count.to_string(),
174            )
175            .replace(
176                "{relationships_count}",
177                &result.metadata.relationships_count.to_string(),
178            )
179            .replace("{chunks_count}", &result.metadata.chunks_count.to_string())
180            .replace("{entities}", &entities_str)
181            .replace("{relationships}", &relationships_str)
182            .replace("{chunks}", &chunks_str)
183    }
184
185    fn format_entities(&self, entities: &[Entity]) -> String {
186        if entities.is_empty() {
187            return "No entities retrieved.".to_string();
188        }
189
190        entities
191            .iter()
192            .take(10) // Limit to top 10 for prompt length
193            .map(|e| format!("- {} (type: {}, confidence: {:.2})", e.name, e.entity_type, e.confidence))
194            .collect::<Vec<_>>()
195            .join("\n")
196    }
197
198    fn format_relationships(&self, relationships: &[Relationship]) -> String {
199        if relationships.is_empty() {
200            return "No relationships retrieved.".to_string();
201        }
202
203        relationships
204            .iter()
205            .take(10) // Limit to top 10 for prompt length
206            .map(|r| format!("- {} --[{}]--> {} (confidence: {:.2})",
207                r.source, r.relation_type, r.target, r.confidence))
208            .collect::<Vec<_>>()
209            .join("\n")
210    }
211
212    fn format_chunks(&self, chunks: &[String]) -> String {
213        if chunks.is_empty() {
214            return "No context chunks retrieved.".to_string();
215        }
216
217        chunks
218            .iter()
219            .take(5) // Limit to top 5 chunks
220            .enumerate()
221            .map(|(i, chunk)| {
222                let preview = if chunk.len() > 200 {
223                    format!("{}...", &chunk[..200])
224                } else {
225                    chunk.clone()
226                };
227                format!("Chunk {}:\n{}\n", i + 1, preview)
228            })
229            .collect::<Vec<_>>()
230            .join("\n")
231    }
232}
233
/// Parsed LLM evaluation result
///
/// Deserialized directly from the JSON the LLM was asked to emit; in
/// particular `overall_score` is taken verbatim from that JSON and is NOT
/// recomputed from the five dimension scores.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMEvaluation {
    /// Relevance score and reasoning
    pub relevance: DimensionScore,
    /// Faithfulness score and reasoning
    pub faithfulness: DimensionScore,
    /// Completeness score and reasoning
    pub completeness: DimensionScore,
    /// Coherence score and reasoning
    pub coherence: DimensionScore,
    /// Groundedness score and reasoning
    pub groundedness: DimensionScore,
    /// Overall average score (as reported by the LLM)
    pub overall_score: f32,
    /// Summary assessment
    pub summary: String,
}
252
/// Score for a single evaluation dimension
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DimensionScore {
    /// Score 1-5 (the range is requested in the prompt but not enforced
    /// during deserialization)
    pub score: u8,
    /// Reasoning for the score
    pub reasoning: String,
}
261
262impl LLMEvaluation {
263    /// Parse LLM response JSON into structured evaluation
264    pub fn from_json(json_str: &str) -> Result<Self> {
265        serde_json::from_str(json_str).map_err(|e| GraphRAGError::Serialization {
266            message: format!("Failed to parse LLM evaluation JSON: {}", e),
267        })
268    }
269
270    /// Check if the evaluation passes a minimum quality threshold
271    pub fn passes_threshold(&self, min_score: f32) -> bool {
272        self.overall_score >= min_score
273    }
274
275    /// Get the dimension with the lowest score
276    pub fn weakest_dimension(&self) -> (&str, &DimensionScore) {
277        let dimensions = [
278            ("relevance", &self.relevance),
279            ("faithfulness", &self.faithfulness),
280            ("completeness", &self.completeness),
281            ("coherence", &self.coherence),
282            ("groundedness", &self.groundedness),
283        ];
284
285        dimensions
286            .iter()
287            .min_by_key(|(_, score)| score.score)
288            .map(|(name, score)| (*name, *score))
289            .unwrap_or(("unknown", &self.relevance))
290    }
291
292    /// Generate a report string
293    pub fn report(&self) -> String {
294        format!(
295            r#"## LLM Evaluation Report
296
297**Overall Score**: {:.2}/5.0
298
299### Dimension Scores
300- Relevance:     {}/5 - {}
301- Faithfulness:  {}/5 - {}
302- Completeness:  {}/5 - {}
303- Coherence:     {}/5 - {}
304- Groundedness:  {}/5 - {}
305
306### Summary
307{}
308
309### Weakest Dimension
310{}: {} (score {}/5)
311"#,
312            self.overall_score,
313            self.relevance.score,
314            self.relevance.reasoning,
315            self.faithfulness.score,
316            self.faithfulness.reasoning,
317            self.completeness.score,
318            self.completeness.reasoning,
319            self.coherence.score,
320            self.coherence.reasoning,
321            self.groundedness.score,
322            self.groundedness.reasoning,
323            self.summary,
324            self.weakest_dimension().0,
325            self.weakest_dimension().1.reasoning,
326            self.weakest_dimension().1.score
327        )
328    }
329}
330
/// Builder for creating evaluable query results
///
/// `query` and `answer` are mandatory (`build` errors if either is unset);
/// every other field has a default. The counts stored in the resulting
/// [`ResultMetadata`] are derived from the supplied collections at build time.
pub struct EvaluableQueryResultBuilder {
    /// Required: the user query (`build` fails without it).
    query: Option<String>,
    /// Required: the generated answer (`build` fails without it).
    answer: Option<String>,
    /// Entities retrieved for the answer; defaults to empty.
    entities: Vec<Entity>,
    /// Relationships retrieved for the answer; defaults to empty.
    relationships: Vec<Relationship>,
    /// Context chunks used for the answer; defaults to empty.
    chunks: Vec<String>,
    /// Retrieval strategy label; defaults to "unknown".
    retrieval_strategy: String,
    /// Processing time in milliseconds; defaults to 0.
    processing_time_ms: u64,
    /// Free-form extra metadata; defaults to empty.
    custom: HashMap<String, String>,
}
342
343impl EvaluableQueryResultBuilder {
344    /// Create a new builder
345    pub fn new() -> Self {
346        Self {
347            query: None,
348            answer: None,
349            entities: Vec::new(),
350            relationships: Vec::new(),
351            chunks: Vec::new(),
352            retrieval_strategy: "unknown".to_string(),
353            processing_time_ms: 0,
354            custom: HashMap::new(),
355        }
356    }
357
358    /// Set the query
359    pub fn query(mut self, query: impl Into<String>) -> Self {
360        self.query = Some(query.into());
361        self
362    }
363
364    /// Set the answer
365    pub fn answer(mut self, answer: impl Into<String>) -> Self {
366        self.answer = Some(answer.into());
367        self
368    }
369
370    /// Add retrieved entities
371    pub fn entities(mut self, entities: Vec<Entity>) -> Self {
372        self.entities = entities;
373        self
374    }
375
376    /// Add retrieved relationships
377    pub fn relationships(mut self, relationships: Vec<Relationship>) -> Self {
378        self.relationships = relationships;
379        self
380    }
381
382    /// Add context chunks
383    pub fn chunks(mut self, chunks: Vec<String>) -> Self {
384        self.chunks = chunks;
385        self
386    }
387
388    /// Set retrieval strategy
389    pub fn retrieval_strategy(mut self, strategy: impl Into<String>) -> Self {
390        self.retrieval_strategy = strategy.into();
391        self
392    }
393
394    /// Set processing time
395    pub fn processing_time_ms(mut self, time_ms: u64) -> Self {
396        self.processing_time_ms = time_ms;
397        self
398    }
399
400    /// Add custom metadata
401    pub fn custom_metadata(mut self, key: String, value: String) -> Self {
402        self.custom.insert(key, value);
403        self
404    }
405
406    /// Build the evaluable query result
407    pub fn build(self) -> Result<EvaluableQueryResult> {
408        let query = self.query.ok_or_else(|| GraphRAGError::Config {
409            message: "Query is required".to_string(),
410        })?;
411        let answer = self.answer.ok_or_else(|| GraphRAGError::Config {
412            message: "Answer is required".to_string(),
413        })?;
414
415        Ok(EvaluableQueryResult {
416            query,
417            answer,
418            metadata: ResultMetadata {
419                entities_count: self.entities.len(),
420                relationships_count: self.relationships.len(),
421                chunks_count: self.chunks.len(),
422                retrieval_strategy: self.retrieval_strategy,
423                processing_time_ms: self.processing_time_ms,
424                custom: self.custom,
425            },
426            retrieved_entities: self.entities,
427            retrieved_relationships: self.relationships,
428            context_chunks: self.chunks,
429        })
430    }
431}
432
433impl Default for EvaluableQueryResultBuilder {
434    fn default() -> Self {
435        Self::new()
436    }
437}
438
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{ChunkId, EntityId};

    // Smoke test: placeholder substitution wires the query, answer, entity
    // listing, and the template's trailing instruction into the prompt.
    #[test]
    fn test_prompt_generation() {
        let entity = Entity {
            id: EntityId::new("e1".to_string()),
            name: "Alice".to_string(),
            entity_type: "person".to_string(),
            confidence: 0.9,
            mentions: vec![],
            embedding: None,
            first_mentioned: None,
            last_mentioned: None,
            temporal_validity: None,
        };

        let result = EvaluableQueryResultBuilder::new()
            .query("Who is Alice?")
            .answer("Alice is a person mentioned in the context.")
            .entities(vec![entity])
            .chunks(vec!["Alice works at Stanford.".to_string()])
            .retrieval_strategy("semantic")
            .build()
            .unwrap();

        let prompt = LLMEvaluationPrompt::default();
        let generated = prompt.generate(&result);

        assert!(generated.contains("Who is Alice?"));
        assert!(generated.contains("Alice is a person"));
        assert!(generated.contains("Alice (type: person"));
        assert!(generated.contains("Evaluate now:"));
    }

    // A well-formed LLM JSON response deserializes into LLMEvaluation, and
    // the threshold check honors the reported overall_score (inclusive).
    #[test]
    fn test_evaluation_parsing() {
        let json = r#"{
            "relevance": {
                "score": 5,
                "reasoning": "Perfectly answers the question"
            },
            "faithfulness": {
                "score": 4,
                "reasoning": "Mostly grounded in context"
            },
            "completeness": {
                "score": 4,
                "reasoning": "Covers main points"
            },
            "coherence": {
                "score": 5,
                "reasoning": "Well structured"
            },
            "groundedness": {
                "score": 5,
                "reasoning": "All entities accurate"
            },
            "overall_score": 4.6,
            "summary": "High quality answer"
        }"#;

        let eval = LLMEvaluation::from_json(json).unwrap();
        assert_eq!(eval.relevance.score, 5);
        assert_eq!(eval.faithfulness.score, 4);
        assert!(eval.passes_threshold(4.0));
        assert!(!eval.passes_threshold(5.0));
    }

    // The dimension with the strictly lowest score is reported by name.
    #[test]
    fn test_weakest_dimension() {
        let json = r#"{
            "relevance": {"score": 5, "reasoning": "Perfect"},
            "faithfulness": {"score": 3, "reasoning": "Some issues"},
            "completeness": {"score": 4, "reasoning": "Good"},
            "coherence": {"score": 5, "reasoning": "Excellent"},
            "groundedness": {"score": 4, "reasoning": "Accurate"},
            "overall_score": 4.2,
            "summary": "Good overall"
        }"#;

        let eval = LLMEvaluation::from_json(json).unwrap();
        let (name, score) = eval.weakest_dimension();
        assert_eq!(name, "faithfulness");
        assert_eq!(score.score, 3);
    }

    // The rendered report contains the overall score, at least one
    // per-dimension line, and the summary text.
    #[test]
    fn test_report_generation() {
        let json = r#"{
            "relevance": {"score": 5, "reasoning": "Perfect"},
            "faithfulness": {"score": 4, "reasoning": "Good"},
            "completeness": {"score": 4, "reasoning": "Complete"},
            "coherence": {"score": 5, "reasoning": "Clear"},
            "groundedness": {"score": 4, "reasoning": "Accurate"},
            "overall_score": 4.4,
            "summary": "Excellent answer"
        }"#;

        let eval = LLMEvaluation::from_json(json).unwrap();
        let report = eval.report();

        // Check for numeric score (format may vary: 4.40 or 4.4)
        assert!(
            report.contains("4.4") || report.contains("4.40"),
            "Expected score 4.4 not found in report: {}",
            report
        );
        assert!(
            report.contains("5/5") && report.contains("Relevance"),
            "Expected 'Relevance: 5/5' not found"
        );
        assert!(report.contains("Excellent answer"));

        // Verify the actual overall_score value
        assert!((eval.overall_score - 4.4).abs() < 0.01);
    }
}