// rexis_rag/evaluation/mod.rs
1//! # RRAG Evaluation Framework
2//!
3//! Enterprise-grade evaluation framework for RAG systems providing comprehensive
4//! assessment capabilities based on RAGAS metrics, custom evaluation methods,
5//! and industry-standard benchmarks.
6//!
7//! This module offers a complete evaluation suite for RAG systems, enabling
8//! both component-level analysis (retrieval, generation) and end-to-end system
9//! evaluation. It supports multiple evaluation methodologies, automated benchmarking,
10//! and detailed performance analytics.
11//!
12//! ## Key Features
13//!
14//! - **RAGAS Integration**: Industry-standard RAG evaluation metrics
15//! - **Multi-Level Evaluation**: Component and system-level assessments
16//! - **Automated Benchmarking**: Built-in benchmark datasets and evaluation
17//! - **Custom Metrics**: Extensible framework for domain-specific evaluation
18//! - **Performance Analytics**: Detailed insights and recommendations
19//! - **Export Capabilities**: Multiple output formats (JSON, CSV, HTML, Markdown)
20//! - **Real-time Monitoring**: Live evaluation during system operation
21//!
22//! ## Evaluation Types
23//!
24//! 1. **RAGAS Metrics**: Faithfulness, Answer Relevancy, Context Precision, Context Recall
25//! 2. **Retrieval Evaluation**: Precision@K, Recall@K, MRR, NDCG
26//! 3. **Generation Evaluation**: BLEU, ROUGE, BERTScore, Semantic similarity
27//! 4. **End-to-End Evaluation**: Complete pipeline assessment
28//! 5. **Benchmark Evaluation**: Performance on standard datasets
29//!
30//! ## Examples
31//!
32//! ### Basic Evaluation Setup
33//! ```rust
34//! use rrag::evaluation::{
35//! EvaluationService, EvaluationConfig, EvaluationType,
36//! EvaluationData, TestQuery, GroundTruth
37//! };
38//!
39//! # async fn example() -> rrag::RragResult<()> {
40//! let config = EvaluationConfig {
41//! enabled_evaluations: vec![
42//! EvaluationType::Ragas,
43//! EvaluationType::Retrieval,
44//! EvaluationType::Generation,
45//! ],
46//! ..Default::default()
47//! };
48//!
49//! let mut evaluator = EvaluationService::new(config);
50//! tracing::debug!("📊 Evaluation service initialized with {} evaluators", 3);
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Running Comprehensive Evaluation
56//! ```rust
57//! use std::collections::HashMap;
58//!
59//! # async fn example() -> rrag::RragResult<()> {
60//! # let mut evaluator = rrag::evaluation::EvaluationService::new(rrag::evaluation::EvaluationConfig::default());
61//! // Prepare test data
62//! let test_queries = vec![
63//! rrag::evaluation::TestQuery {
64//! id: "q1".to_string(),
65//! query: "What is machine learning?".to_string(),
66//! query_type: Some("factual".to_string()),
67//! metadata: HashMap::new(),
68//! },
69//! rrag::evaluation::TestQuery {
70//! id: "q2".to_string(),
71//! query: "Explain neural networks in detail".to_string(),
72//! query_type: Some("conceptual".to_string()),
73//! metadata: HashMap::new(),
74//! },
75//! ];
76//!
77//! let ground_truth = vec![
78//! rrag::evaluation::GroundTruth {
79//! query_id: "q1".to_string(),
80//! relevant_docs: vec!["doc_ml_intro".to_string(), "doc_ml_basics".to_string()],
81//! expected_answer: Some(
82//! "Machine learning is a subset of AI that enables computers to learn...".to_string()
83//! ),
84//! relevance_judgments: [("doc_ml_intro".to_string(), 1.0)].iter().cloned().collect(),
85//! metadata: HashMap::new(),
86//! },
87//! ];
88//!
89//! let evaluation_data = rrag::evaluation::EvaluationData {
90//! queries: test_queries,
91//! ground_truth,
92//! system_responses: vec![], // Would be populated with actual system responses
93//! context: HashMap::new(),
94//! };
95//!
96//! // Run evaluation
97//! let results = evaluator.evaluate(evaluation_data).await?;
98//!
99//! for (eval_type, result) in results {
100//! tracing::debug!("🏆 {:?} Evaluation Results:", eval_type);
101//! for (metric, score) in result.overall_scores {
102//! tracing::debug!(" {}: {:.4}", metric, score);
103//! }
104//! }
105//! # Ok(())
106//! # }
107//! ```
108//!
109//! ### RAGAS Evaluation
110//! ```rust
//! use rrag::evaluation::{
//!     ragas::{RagasEvaluator, RagasConfig, RagasMetric},
//!     SystemResponse, RetrievedDocument, SystemTiming
//! };
//! use std::collections::HashMap;
115//!
116//! # async fn example() -> rrag::RragResult<()> {
117//! let ragas_config = RagasConfig {
118//! enabled_metrics: vec![
119//! RagasMetric::Faithfulness,
120//! RagasMetric::AnswerRelevancy,
121//! RagasMetric::ContextPrecision,
122//! RagasMetric::ContextRecall,
123//! ],
124//! ..Default::default()
125//! };
126//!
127//! let ragas_evaluator = RagasEvaluator::new(ragas_config);
128//!
129//! // Prepare system response for evaluation
130//! let system_response = SystemResponse {
131//! query_id: "q1".to_string(),
132//! retrieved_docs: vec![
133//! RetrievedDocument {
134//! doc_id: "doc_1".to_string(),
135//! content: "Machine learning is a method of data analysis...".to_string(),
136//! score: 0.95,
137//! rank: 1,
138//! metadata: HashMap::new(),
139//! }
140//! ],
141//! generated_answer: Some(
142//! "Machine learning is a subset of artificial intelligence...".to_string()
143//! ),
144//! timing: SystemTiming {
145//! total_time_ms: 250.0,
146//! retrieval_time_ms: 120.0,
147//! generation_time_ms: Some(130.0),
148//! reranking_time_ms: None,
149//! },
150//! metadata: HashMap::new(),
151//! };
152//!
153//! tracing::debug!("📈 RAGAS evaluation completed with {} metrics", 4);
154//! # Ok(())
155//! # }
156//! ```
157//!
158//! ### Retrieval-Specific Evaluation
159//! ```rust
160//! use rrag::evaluation::retrieval_eval::{
161//! RetrievalEvaluator, RetrievalEvalConfig, RetrievalMetric
162//! };
163//!
164//! # async fn example() -> rrag::RragResult<()> {
165//! let retrieval_config = RetrievalEvalConfig {
166//! metrics: vec![
167//! RetrievalMetric::PrecisionAtK(10),
168//! RetrievalMetric::RecallAtK(10),
169//! RetrievalMetric::MeanReciprocalRank,
170//! RetrievalMetric::NDCG(10),
171//! ],
172//! k_values: vec![1, 5, 10, 20],
173//! ..Default::default()
174//! };
175//!
176//! let retrieval_evaluator = RetrievalEvaluator::new(retrieval_config);
177//!
178//! // Results will include:
179//! // - Precision@1, @5, @10, @20
180//! // - Recall@1, @5, @10, @20
181//! // - Mean Reciprocal Rank
182//! // - Normalized Discounted Cumulative Gain
183//!
184//! tracing::debug!("🏁 Retrieval evaluation configured for multiple K values");
185//! # Ok(())
186//! # }
187//! ```
188//!
189//! ### Generation Quality Evaluation
190//! ```rust
191//! use rrag::evaluation::generation_eval::{
192//! GenerationEvaluator, GenerationEvalConfig, GenerationMetric
193//! };
194//!
195//! # async fn example() -> rrag::RragResult<()> {
196//! let generation_config = GenerationEvalConfig {
197//! metrics: vec![
198//! GenerationMetric::BLEU,
199//! GenerationMetric::ROUGE("rouge-l".to_string()),
200//! GenerationMetric::BERTScore,
201//! GenerationMetric::SemanticSimilarity,
202//! ],
203//! reference_free: false,
204//! ..Default::default()
205//! };
206//!
207//! let generation_evaluator = GenerationEvaluator::new(generation_config);
208//!
209//! // Evaluates generated answers against reference answers
210//! // Provides detailed analysis of:
211//! // - Lexical similarity (BLEU, ROUGE)
212//! // - Semantic similarity (BERTScore, embeddings)
213//! // - Factual accuracy
214//! // - Fluency and coherence
215//!
216//! tracing::debug!("✍️ Generation evaluation ready for quality assessment");
217//! # Ok(())
218//! # }
219//! ```
220//!
221//! ### End-to-End System Evaluation
222//! ```rust
223//! use rrag::evaluation::end_to_end::{
224//! EndToEndEvaluator, EndToEndConfig, E2EMetric
225//! };
226//!
227//! # async fn example() -> rrag::RragResult<()> {
228//! let e2e_config = EndToEndConfig {
229//! metrics: vec![
230//! E2EMetric::OverallAccuracy,
231//! E2EMetric::ResponseTime,
232//! E2EMetric::UserSatisfaction,
233//! E2EMetric::CostEfficiency,
234//! ],
235//! include_ablation_study: true,
236//! ..Default::default()
237//! };
238//!
239//! let e2e_evaluator = EndToEndEvaluator::new(e2e_config);
240//!
241//! // Comprehensive system evaluation including:
242//! // - End-to-end accuracy
243//! // - Performance benchmarks
244//! // - Resource utilization
245//! // - Error analysis
246//! // - Component contribution analysis
247//!
248//! tracing::debug!("🎆 End-to-end evaluation configured for complete system assessment");
249//! # Ok(())
250//! # }
251//! ```
252//!
253//! ### Automated Benchmarking
254//! ```rust
255//! use rrag::evaluation::benchmarks::{
256//! BenchmarkEvaluator, BenchmarkSuite, BenchmarkDataset
257//! };
258//!
259//! # async fn example() -> rrag::RragResult<()> {
260//! let benchmark_evaluator = BenchmarkEvaluator::new();
261//!
262//! let benchmark_suite = BenchmarkSuite {
263//! datasets: vec![
264//! BenchmarkDataset::MS_MARCO,
265//! BenchmarkDataset::Natural_Questions,
266//! BenchmarkDataset::SQuAD_2_0,
267//! BenchmarkDataset::BEIR,
268//! ],
269//! custom_datasets: vec![], // Add domain-specific datasets
270//! evaluation_mode: "comprehensive".to_string(),
271//! };
272//!
273//! // Run against standard benchmarks
274//! // let results = benchmark_evaluator.run_benchmark_suite(benchmark_suite).await?;
275//!
276//! tracing::debug!("📅 Benchmark evaluation ready with {} standard datasets", 4);
277//! # Ok(())
278//! # }
279//! ```
280//!
281//! ### Exporting Evaluation Results
282//! ```rust
283//! use rrag::evaluation::{ExportFormat, OutputConfig};
284//!
285//! # async fn example() -> rrag::RragResult<()> {
286//! # let evaluator = rrag::evaluation::EvaluationService::new(rrag::evaluation::EvaluationConfig::default());
287//! # let results = std::collections::HashMap::new(); // Mock results
288//! // Configure export options
289//! let output_config = OutputConfig {
290//! export_formats: vec![
291//! ExportFormat::Json, // Machine-readable results
292//! ExportFormat::Html, // Interactive reports
293//! ExportFormat::Csv, // Spreadsheet analysis
294//! ExportFormat::Markdown // Documentation
295//! ],
296//! output_dir: "./evaluation_results".to_string(),
297//! include_detailed_logs: true,
298//! generate_visualizations: true,
299//! };
300//!
301//! // Export comprehensive results
302//! evaluator.export_results(&results).await?;
303//!
304//! tracing::debug!("📊 Results exported in multiple formats:");
305//! tracing::debug!(" • evaluation_results.json - Complete data");
306//! tracing::debug!(" • evaluation_report.html - Interactive dashboard");
307//! tracing::debug!(" • evaluation_summary.csv - Quick analysis");
308//! tracing::debug!(" • evaluation_report.md - Documentation");
309//! # Ok(())
310//! # }
311//! ```
312//!
313//! ### Real-time Evaluation Monitoring
314//! ```rust
315//! # async fn example() -> rrag::RragResult<()> {
316//! # let evaluator = rrag::evaluation::EvaluationService::new(rrag::evaluation::EvaluationConfig::default());
317//! // Monitor evaluation metrics in real-time
318//! let metrics = evaluator.get_metrics()?;
319//!
320//! for (metric_name, records) in metrics {
321//! let latest = records.last().unwrap();
322//! match metric_name.as_str() {
323//! "evaluation_time_ms" => {
324//! if latest.value > 5000.0 {
//!                 tracing::warn!(" Evaluation taking longer than expected: {:.1}ms", latest.value);
326//! }
327//! }
328//! "evaluation_errors" => {
329//! if latest.value > 0.0 {
//!                 tracing::error!(" Evaluation errors detected: {}", latest.value);
331//! }
332//! }
333//! _ => {
334//! tracing::debug!("📈 {}: {:.3}", metric_name, latest.value);
335//! }
336//! }
337//! }
338//! # Ok(())
339//! # }
340//! ```
341//!
342//! ## Evaluation Best Practices
343//!
344//! ### Dataset Preparation
345//! - Use diverse, representative test queries
346//! - Include edge cases and challenging examples
347//! - Ensure high-quality ground truth annotations
348//! - Balance different query types and complexities
349//!
350//! ### Metric Selection
351//! - Choose metrics aligned with your use case
352//! - Combine automatic and human evaluation
353//! - Consider both accuracy and efficiency metrics
354//! - Include domain-specific evaluation criteria
355//!
356//! ### Performance Optimization
357//! - Run evaluations in batch for efficiency
358//! - Use parallel evaluation when possible
359//! - Cache expensive computations
360//! - Monitor resource usage during evaluation
361//!
362//! ### Result Interpretation
363//! - Consider statistical significance
364//! - Analyze results by query type and complexity
365//! - Look for systematic errors and patterns
366//! - Compare against established baselines
367//!
368//! ## Integration with RAG Systems
369//!
370//! ```rust
371//! use rrag::{RragSystemBuilder, evaluation::EvaluationConfig};
372//!
373//! # async fn example() -> rrag::RragResult<()> {
374//! let rag_system = RragSystemBuilder::new()
375//! .with_evaluation(
376//! EvaluationConfig::production()
377//! .with_ragas_metrics(true)
378//! .with_real_time_monitoring(true)
379//! .with_automated_benchmarking(true)
380//! )
381//! .build()
382//! .await?;
383//!
384//! // System automatically evaluates performance and provides insights
385//! let results = rag_system.search_with_evaluation("query", Some(10)).await?;
386//! # Ok(())
387//! # }
388//! ```
389
390pub mod benchmarks;
391pub mod end_to_end;
392pub mod generation_eval;
393pub mod metrics;
394pub mod ragas;
395pub mod retrieval_eval;
396
397use crate::{RragError, RragResult};
398use serde::{Deserialize, Serialize};
399use std::collections::HashMap;
400use tracing::{error, info};
401
402/// Main evaluation service
403pub struct EvaluationService {
404 /// Configuration
405 config: EvaluationConfig,
406
407 /// Evaluators for different components
408 evaluators: HashMap<EvaluationType, Box<dyn Evaluator>>,
409
410 /// Metrics collection
411 metrics_collector: Box<dyn MetricsCollector>,
412}
413
414/// Configuration for evaluation service
415#[derive(Debug, Clone)]
416pub struct EvaluationConfig {
417 /// Enabled evaluation types
418 pub enabled_evaluations: Vec<EvaluationType>,
419
420 /// RAGAS configuration
421 pub ragas_config: ragas::RagasConfig,
422
423 /// Retrieval evaluation config
424 pub retrieval_config: retrieval_eval::RetrievalEvalConfig,
425
426 /// Generation evaluation config
427 pub generation_config: generation_eval::GenerationEvalConfig,
428
429 /// End-to-end evaluation config
430 pub e2e_config: end_to_end::EndToEndConfig,
431
432 /// Output configuration
433 pub output_config: OutputConfig,
434}
435
436impl Default for EvaluationConfig {
437 fn default() -> Self {
438 Self {
439 enabled_evaluations: vec![
440 EvaluationType::Ragas,
441 EvaluationType::Retrieval,
442 EvaluationType::Generation,
443 ],
444 ragas_config: ragas::RagasConfig::default(),
445 retrieval_config: retrieval_eval::RetrievalEvalConfig::default(),
446 generation_config: generation_eval::GenerationEvalConfig::default(),
447 e2e_config: end_to_end::EndToEndConfig::default(),
448 output_config: OutputConfig::default(),
449 }
450 }
451}
452
453/// Types of evaluation
454#[derive(Debug, Clone, Hash, PartialEq, Eq, Serialize, Deserialize)]
455pub enum EvaluationType {
456 /// RAGAS metrics evaluation
457 Ragas,
458 /// Retrieval-specific evaluation
459 Retrieval,
460 /// Generation-specific evaluation
461 Generation,
462 /// End-to-end system evaluation
463 EndToEnd,
464 /// Benchmark evaluation
465 Benchmark,
466}
467
468/// Output configuration for evaluation results
469#[derive(Debug, Clone)]
470pub struct OutputConfig {
471 /// Export formats
472 pub export_formats: Vec<ExportFormat>,
473
474 /// Output directory
475 pub output_dir: String,
476
477 /// Include detailed logs
478 pub include_detailed_logs: bool,
479
480 /// Generate visualizations
481 pub generate_visualizations: bool,
482}
483
484impl Default for OutputConfig {
485 fn default() -> Self {
486 Self {
487 export_formats: vec![ExportFormat::Json, ExportFormat::Csv],
488 output_dir: "./evaluation_results".to_string(),
489 include_detailed_logs: true,
490 generate_visualizations: false,
491 }
492 }
493}
494
495/// Export formats for evaluation results
496#[derive(Debug, Clone)]
497pub enum ExportFormat {
498 Json,
499 Csv,
500 Html,
501 Markdown,
502}
503
504/// Main trait for evaluators
505pub trait Evaluator: Send + Sync {
506 /// Evaluator name
507 fn name(&self) -> &str;
508
509 /// Run evaluation
510 fn evaluate(&self, evaluation_data: &EvaluationData) -> RragResult<EvaluationResult>;
511
512 /// Get supported metrics
513 fn supported_metrics(&self) -> Vec<String>;
514
515 /// Get evaluator configuration
516 fn get_config(&self) -> EvaluatorConfig;
517}
518
519/// Configuration for individual evaluators
520#[derive(Debug, Clone)]
521pub struct EvaluatorConfig {
522 /// Evaluator name
523 pub name: String,
524
525 /// Version
526 pub version: String,
527
528 /// Supported metrics
529 pub metrics: Vec<String>,
530
531 /// Performance characteristics
532 pub performance: EvaluatorPerformance,
533}
534
535/// Performance characteristics of evaluators
536#[derive(Debug, Clone)]
537pub struct EvaluatorPerformance {
538 /// Average evaluation time per sample (ms)
539 pub avg_time_per_sample_ms: f32,
540
541 /// Memory usage (MB)
542 pub memory_usage_mb: f32,
543
544 /// Accuracy of evaluation
545 pub accuracy: f32,
546}
547
548/// Input data for evaluation
549#[derive(Debug, Clone, Serialize, Deserialize)]
550pub struct EvaluationData {
551 /// Test queries
552 pub queries: Vec<TestQuery>,
553
554 /// Ground truth data
555 pub ground_truth: Vec<GroundTruth>,
556
557 /// System responses
558 pub system_responses: Vec<SystemResponse>,
559
560 /// Additional context
561 pub context: HashMap<String, serde_json::Value>,
562}
563
564/// Test query
565#[derive(Debug, Clone, Serialize, Deserialize)]
566pub struct TestQuery {
567 /// Query ID
568 pub id: String,
569
570 /// Query text
571 pub query: String,
572
573 /// Expected query type/intent
574 pub query_type: Option<String>,
575
576 /// Query metadata
577 pub metadata: HashMap<String, serde_json::Value>,
578}
579
580/// Ground truth data for evaluation
581#[derive(Debug, Clone, Serialize, Deserialize)]
582pub struct GroundTruth {
583 /// Query ID
584 pub query_id: String,
585
586 /// Relevant document IDs
587 pub relevant_docs: Vec<String>,
588
589 /// Expected answer/response
590 pub expected_answer: Option<String>,
591
592 /// Relevance judgments (document_id -> relevance_score)
593 pub relevance_judgments: HashMap<String, f32>,
594
595 /// Additional ground truth data
596 pub metadata: HashMap<String, serde_json::Value>,
597}
598
599/// System response for evaluation
600#[derive(Debug, Clone, Serialize, Deserialize)]
601pub struct SystemResponse {
602 /// Query ID
603 pub query_id: String,
604
605 /// Retrieved documents
606 pub retrieved_docs: Vec<RetrievedDocument>,
607
608 /// Generated answer (if applicable)
609 pub generated_answer: Option<String>,
610
611 /// System timing information
612 pub timing: SystemTiming,
613
614 /// Response metadata
615 pub metadata: HashMap<String, serde_json::Value>,
616}
617
618/// Retrieved document information
619#[derive(Debug, Clone, Serialize, Deserialize)]
620pub struct RetrievedDocument {
621 /// Document ID
622 pub doc_id: String,
623
624 /// Document content
625 pub content: String,
626
627 /// Retrieval score
628 pub score: f32,
629
630 /// Rank in retrieval results
631 pub rank: usize,
632
633 /// Document metadata
634 pub metadata: HashMap<String, serde_json::Value>,
635}
636
637/// System timing information
638#[derive(Debug, Clone, Serialize, Deserialize)]
639pub struct SystemTiming {
640 /// Total response time (ms)
641 pub total_time_ms: f32,
642
643 /// Retrieval time (ms)
644 pub retrieval_time_ms: f32,
645
646 /// Generation time (ms)
647 pub generation_time_ms: Option<f32>,
648
649 /// Reranking time (ms)
650 pub reranking_time_ms: Option<f32>,
651}
652
653/// Evaluation result
654#[derive(Debug, Clone, Serialize, Deserialize)]
655pub struct EvaluationResult {
656 /// Evaluation ID
657 pub id: String,
658
659 /// Evaluation type
660 pub evaluation_type: String,
661
662 /// Overall scores
663 pub overall_scores: HashMap<String, f32>,
664
665 /// Per-query results
666 pub per_query_results: Vec<QueryEvaluationResult>,
667
668 /// Summary statistics
669 pub summary: EvaluationSummary,
670
671 /// Evaluation metadata
672 pub metadata: EvaluationMetadata,
673}
674
675/// Per-query evaluation result
676#[derive(Debug, Clone, Serialize, Deserialize)]
677pub struct QueryEvaluationResult {
678 /// Query ID
679 pub query_id: String,
680
681 /// Metric scores
682 pub scores: HashMap<String, f32>,
683
684 /// Error analysis
685 pub errors: Vec<EvaluationError>,
686
687 /// Additional details
688 pub details: HashMap<String, serde_json::Value>,
689}
690
691/// Evaluation error
692#[derive(Debug, Clone, Serialize, Deserialize)]
693pub struct EvaluationError {
694 /// Error type
695 pub error_type: String,
696
697 /// Error message
698 pub message: String,
699
700 /// Error severity
701 pub severity: ErrorSeverity,
702
703 /// Suggested fixes
704 pub suggestions: Vec<String>,
705}
706
707/// Error severity levels
708#[derive(Debug, Clone, Serialize, Deserialize)]
709pub enum ErrorSeverity {
710 Low,
711 Medium,
712 High,
713 Critical,
714}
715
716/// Evaluation summary
717#[derive(Debug, Clone, Serialize, Deserialize)]
718pub struct EvaluationSummary {
719 /// Number of queries evaluated
720 pub total_queries: usize,
721
722 /// Average scores across all metrics
723 pub avg_scores: HashMap<String, f32>,
724
725 /// Standard deviations
726 pub std_deviations: HashMap<String, f32>,
727
728 /// Performance statistics
729 pub performance_stats: PerformanceStats,
730
731 /// Key insights
732 pub insights: Vec<String>,
733
734 /// Recommendations
735 pub recommendations: Vec<String>,
736}
737
738/// Performance statistics
739#[derive(Debug, Clone, Serialize, Deserialize)]
740pub struct PerformanceStats {
741 /// Average evaluation time per query
742 pub avg_eval_time_ms: f32,
743
744 /// Total evaluation time
745 pub total_eval_time_ms: f32,
746
747 /// Memory usage during evaluation
748 pub peak_memory_usage_mb: f32,
749
750 /// Throughput (queries per second)
751 pub throughput_qps: f32,
752}
753
754/// Evaluation metadata
755#[derive(Debug, Clone, Serialize, Deserialize)]
756pub struct EvaluationMetadata {
757 /// Evaluation timestamp
758 pub timestamp: chrono::DateTime<chrono::Utc>,
759
760 /// Evaluation version
761 pub evaluation_version: String,
762
763 /// System configuration
764 pub system_config: HashMap<String, serde_json::Value>,
765
766 /// Environment information
767 pub environment: HashMap<String, String>,
768
769 /// Git commit hash (if available)
770 pub git_commit: Option<String>,
771}
772
773/// Trait for collecting metrics during evaluation
774pub trait MetricsCollector: Send + Sync {
775 /// Start collecting metrics
776 fn start_collection(&mut self) -> RragResult<()>;
777
778 /// Stop collecting metrics
779 fn stop_collection(&mut self) -> RragResult<()>;
780
781 /// Record a metric
782 fn record_metric(
783 &mut self,
784 name: &str,
785 value: f32,
786 labels: Option<&HashMap<String, String>>,
787 ) -> RragResult<()>;
788
789 /// Get collected metrics
790 fn get_metrics(&self) -> RragResult<HashMap<String, Vec<MetricRecord>>>;
791
792 /// Export metrics to file
793 fn export_metrics(&self, format: &ExportFormat, output_path: &str) -> RragResult<()>;
794}
795
796/// Individual metric record
797#[derive(Debug, Clone, Serialize, Deserialize)]
798pub struct MetricRecord {
799 /// Metric name
800 pub name: String,
801
802 /// Metric value
803 pub value: f32,
804
805 /// Timestamp
806 pub timestamp: chrono::DateTime<chrono::Utc>,
807
808 /// Labels/tags
809 pub labels: HashMap<String, String>,
810}
811
812impl EvaluationService {
813 /// Create new evaluation service
814 pub fn new(config: EvaluationConfig) -> Self {
815 let mut service = Self {
816 config: config.clone(),
817 evaluators: HashMap::new(),
818 metrics_collector: Box::new(DefaultMetricsCollector::new()),
819 };
820
821 // Initialize evaluators
822 service.initialize_evaluators();
823
824 service
825 }
826
827 /// Initialize evaluators based on configuration
828 fn initialize_evaluators(&mut self) {
829 for eval_type in &self.config.enabled_evaluations {
830 let evaluator: Box<dyn Evaluator> = match eval_type {
831 EvaluationType::Ragas => {
832 Box::new(ragas::RagasEvaluator::new(self.config.ragas_config.clone()))
833 }
834 EvaluationType::Retrieval => Box::new(retrieval_eval::RetrievalEvaluator::new(
835 self.config.retrieval_config.clone(),
836 )),
837 EvaluationType::Generation => Box::new(generation_eval::GenerationEvaluator::new(
838 self.config.generation_config.clone(),
839 )),
840 EvaluationType::EndToEnd => Box::new(end_to_end::EndToEndEvaluator::new(
841 self.config.e2e_config.clone(),
842 )),
843 EvaluationType::Benchmark => Box::new(benchmarks::BenchmarkEvaluator::new()),
844 };
845
846 self.evaluators.insert(eval_type.clone(), evaluator);
847 }
848 }
849
850 /// Run evaluation on provided data
851 pub async fn evaluate(
852 &mut self,
853 data: EvaluationData,
854 ) -> RragResult<HashMap<EvaluationType, EvaluationResult>> {
855 let mut results = HashMap::new();
856
857 // Start metrics collection
858 self.metrics_collector.start_collection()?;
859
860 let start_time = std::time::Instant::now();
861
862 // Run each enabled evaluation
863 for (eval_type, evaluator) in &self.evaluators {
864 tracing::debug!("Running {} evaluation...", evaluator.name());
865
866 let eval_start = std::time::Instant::now();
867
868 match evaluator.evaluate(&data) {
869 Ok(result) => {
870 let eval_time = eval_start.elapsed().as_millis() as f32;
871 self.metrics_collector.record_metric(
872 "evaluation_time_ms",
873 eval_time,
874 Some(
875 &[("evaluator".to_string(), evaluator.name().to_string())]
876 .iter()
877 .cloned()
878 .collect(),
879 ),
880 )?;
881
882 results.insert(eval_type.clone(), result);
883 tracing::debug!(
884 "✅ {} evaluation completed in {:.2}ms",
885 evaluator.name(),
886 eval_time
887 );
888 }
889 Err(e) => {
890 error!(" {} evaluation failed: {}", evaluator.name(), e);
891 self.metrics_collector.record_metric(
892 "evaluation_errors",
893 1.0,
894 Some(
895 &[("evaluator".to_string(), evaluator.name().to_string())]
896 .iter()
897 .cloned()
898 .collect(),
899 ),
900 )?;
901 }
902 }
903 }
904
905 let total_time = start_time.elapsed().as_millis() as f32;
906 self.metrics_collector
907 .record_metric("total_evaluation_time_ms", total_time, None)?;
908
909 // Stop metrics collection
910 self.metrics_collector.stop_collection()?;
911
912 Ok(results)
913 }
914
915 /// Export evaluation results
916 pub async fn export_results(
917 &self,
918 results: &HashMap<EvaluationType, EvaluationResult>,
919 ) -> RragResult<()> {
920 // Create output directory
921 std::fs::create_dir_all(&self.config.output_config.output_dir).map_err(|e| {
922 RragError::evaluation(format!("Failed to create output directory: {}", e))
923 })?;
924
925 for format in &self.config.output_config.export_formats {
926 match format {
927 ExportFormat::Json => self.export_json(results).await?,
928 ExportFormat::Csv => self.export_csv(results).await?,
929 ExportFormat::Html => self.export_html(results).await?,
930 ExportFormat::Markdown => self.export_markdown(results).await?,
931 }
932 }
933
934 Ok(())
935 }
936
937 /// Export results as JSON
938 async fn export_json(
939 &self,
940 results: &HashMap<EvaluationType, EvaluationResult>,
941 ) -> RragResult<()> {
942 let json_path = format!(
943 "{}/evaluation_results.json",
944 self.config.output_config.output_dir
945 );
946 let json_content = serde_json::to_string_pretty(results)
947 .map_err(|e| RragError::evaluation(format!("Failed to serialize results: {}", e)))?;
948
949 std::fs::write(&json_path, json_content)
950 .map_err(|e| RragError::evaluation(format!("Failed to write JSON file: {}", e)))?;
951
952 info!(" Results exported to {}", json_path);
953 Ok(())
954 }
955
956 /// Export results as CSV
957 async fn export_csv(
958 &self,
959 results: &HashMap<EvaluationType, EvaluationResult>,
960 ) -> RragResult<()> {
961 let csv_path = format!(
962 "{}/evaluation_summary.csv",
963 self.config.output_config.output_dir
964 );
965 let mut csv_content = String::new();
966
967 // Header
968 csv_content.push_str("evaluator,metric,value\n");
969
970 // Data
971 for (eval_type, result) in results {
972 for (metric, value) in &result.overall_scores {
973 csv_content.push_str(&format!("{:?},{},{}\n", eval_type, metric, value));
974 }
975 }
976
977 std::fs::write(&csv_path, csv_content)
978 .map_err(|e| RragError::evaluation(format!("Failed to write CSV file: {}", e)))?;
979
980 info!(" Summary exported to {}", csv_path);
981 Ok(())
982 }
983
984 /// Export results as HTML
985 async fn export_html(
986 &self,
987 results: &HashMap<EvaluationType, EvaluationResult>,
988 ) -> RragResult<()> {
989 let html_path = format!(
990 "{}/evaluation_report.html",
991 self.config.output_config.output_dir
992 );
993 let mut html_content = String::from(
994 r#"
995<!DOCTYPE html>
996<html>
997<head>
998 <title>RRAG Evaluation Report</title>
999 <style>
1000 body { font-family: Arial, sans-serif; margin: 40px; }
1001 .header { border-bottom: 2px solid #333; margin-bottom: 30px; }
1002 .evaluator { margin-bottom: 40px; border: 1px solid #ddd; padding: 20px; }
1003 .metric { margin: 10px 0; }
1004 .score { font-weight: bold; color: #2196F3; }
1005 table { border-collapse: collapse; width: 100%; }
1006 th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
1007 th { background-color: #f2f2f2; }
1008 </style>
1009</head>
1010<body>
1011 <div class="header">
1012 <h1>🎯 RRAG Evaluation Report</h1>
1013 <p>Generated on: "#,
1014 );
1015
1016 html_content.push_str(
1017 &chrono::Utc::now()
1018 .format("%Y-%m-%d %H:%M:%S UTC")
1019 .to_string(),
1020 );
1021 html_content.push_str("</p>\n </div>\n");
1022
1023 for (eval_type, result) in results {
1024 html_content.push_str(&format!(
1025 r#"
1026 <div class="evaluator">
1027 <h2>📊 {:?} Evaluation</h2>
1028 <h3>Overall Scores</h3>
1029 <table>
1030 <tr><th>Metric</th><th>Score</th></tr>"#,
1031 eval_type
1032 ));
1033
1034 for (metric, score) in &result.overall_scores {
1035 html_content.push_str(&format!(
1036 "<tr><td>{}</td><td class=\"score\">{:.4}</td></tr>",
1037 metric, score
1038 ));
1039 }
1040
1041 html_content.push_str("</table>\n");
1042
1043 if !result.summary.insights.is_empty() {
1044 html_content.push_str("<h3>Key Insights</h3><ul>");
1045 for insight in &result.summary.insights {
1046 html_content.push_str(&format!("<li>{}</li>", insight));
1047 }
1048 html_content.push_str("</ul>");
1049 }
1050
1051 html_content.push_str(" </div>\n");
1052 }
1053
1054 html_content.push_str("</body>\n</html>");
1055
1056 std::fs::write(&html_path, html_content)
1057 .map_err(|e| RragError::evaluation(format!("Failed to write HTML file: {}", e)))?;
1058
1059 info!(" Report exported to {}", html_path);
1060 Ok(())
1061 }
1062
1063 /// Export results as Markdown
1064 async fn export_markdown(
1065 &self,
1066 results: &HashMap<EvaluationType, EvaluationResult>,
1067 ) -> RragResult<()> {
1068 let md_path = format!(
1069 "{}/evaluation_report.md",
1070 self.config.output_config.output_dir
1071 );
1072 let mut md_content = String::from("# 🎯 RRAG Evaluation Report\n\n");
1073
1074 md_content.push_str(&format!(
1075 "**Generated on:** {}\n\n",
1076 chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
1077 ));
1078
1079 for (eval_type, result) in results {
1080 md_content.push_str(&format!("## 📊 {:?} Evaluation\n\n", eval_type));
1081
1082 md_content.push_str("### Overall Scores\n\n");
1083 md_content.push_str("| Metric | Score |\n|--------|-------|\n");
1084
1085 for (metric, score) in &result.overall_scores {
1086 md_content.push_str(&format!("| {} | {:.4} |\n", metric, score));
1087 }
1088
1089 if !result.summary.insights.is_empty() {
1090 md_content.push_str("\n### Key Insights\n\n");
1091 for insight in &result.summary.insights {
1092 md_content.push_str(&format!("- {}\n", insight));
1093 }
1094 }
1095
1096 if !result.summary.recommendations.is_empty() {
1097 md_content.push_str("\n### Recommendations\n\n");
1098 for recommendation in &result.summary.recommendations {
1099 md_content.push_str(&format!("- {}\n", recommendation));
1100 }
1101 }
1102
1103 md_content.push_str("\n---\n\n");
1104 }
1105
1106 std::fs::write(&md_path, md_content)
1107 .map_err(|e| RragError::evaluation(format!("Failed to write Markdown file: {}", e)))?;
1108
1109 info!(" Markdown report exported to {}", md_path);
1110 Ok(())
1111 }
1112
    /// Returns all metric records collected so far, keyed by metric name.
    ///
    /// Delegates to the service's configured metrics collector; the result
    /// is whatever snapshot that collector produces.
    pub fn get_metrics(&self) -> RragResult<HashMap<String, Vec<MetricRecord>>> {
        self.metrics_collector.get_metrics()
    }
1117}
1118
/// Default in-memory metrics collector implementation.
///
/// Stores every recorded sample in a `HashMap` keyed by metric name and
/// only accepts new records while a collection session is active.
pub struct DefaultMetricsCollector {
    // All recorded samples, grouped by metric name in insertion order.
    metrics: HashMap<String, Vec<MetricRecord>>,
    // True between `start_collection` and `stop_collection`; when false,
    // `record_metric` is a no-op.
    collecting: bool,
}
1124
1125impl DefaultMetricsCollector {
1126 pub fn new() -> Self {
1127 Self {
1128 metrics: HashMap::new(),
1129 collecting: false,
1130 }
1131 }
1132}
1133
1134impl MetricsCollector for DefaultMetricsCollector {
1135 fn start_collection(&mut self) -> RragResult<()> {
1136 self.collecting = true;
1137 self.metrics.clear();
1138 Ok(())
1139 }
1140
1141 fn stop_collection(&mut self) -> RragResult<()> {
1142 self.collecting = false;
1143 Ok(())
1144 }
1145
1146 fn record_metric(
1147 &mut self,
1148 name: &str,
1149 value: f32,
1150 labels: Option<&HashMap<String, String>>,
1151 ) -> RragResult<()> {
1152 if !self.collecting {
1153 return Ok(());
1154 }
1155
1156 let record = MetricRecord {
1157 name: name.to_string(),
1158 value,
1159 timestamp: chrono::Utc::now(),
1160 labels: labels.cloned().unwrap_or_default(),
1161 };
1162
1163 self.metrics
1164 .entry(name.to_string())
1165 .or_insert_with(Vec::new)
1166 .push(record);
1167 Ok(())
1168 }
1169
1170 fn get_metrics(&self) -> RragResult<HashMap<String, Vec<MetricRecord>>> {
1171 Ok(self.metrics.clone())
1172 }
1173
1174 fn export_metrics(&self, format: &ExportFormat, output_path: &str) -> RragResult<()> {
1175 match format {
1176 ExportFormat::Json => {
1177 let json_content = serde_json::to_string_pretty(&self.metrics).map_err(|e| {
1178 RragError::evaluation(format!("Failed to serialize metrics: {}", e))
1179 })?;
1180 std::fs::write(output_path, json_content).map_err(|e| {
1181 RragError::evaluation(format!("Failed to write metrics file: {}", e))
1182 })?;
1183 }
1184 _ => {
1185 return Err(RragError::evaluation(
1186 "Unsupported export format for metrics".to_string(),
1187 ));
1188 }
1189 }
1190 Ok(())
1191 }
1192}
1193
#[cfg(test)]
mod tests {
    use super::*;

    /// The default config enables the three core evaluation types.
    #[test]
    fn test_evaluation_config_creation() {
        let config = EvaluationConfig::default();
        let enabled = &config.enabled_evaluations;
        assert!(enabled.contains(&EvaluationType::Ragas));
        assert!(enabled.contains(&EvaluationType::Retrieval));
        assert!(enabled.contains(&EvaluationType::Generation));
    }

    /// An `EvaluationData` bundle can be built from one query and its
    /// matching ground truth.
    #[test]
    fn test_evaluation_data_creation() {
        let data = EvaluationData {
            queries: vec![TestQuery {
                id: "test_1".to_string(),
                query: "What is machine learning?".to_string(),
                query_type: Some("factual".to_string()),
                metadata: HashMap::new(),
            }],
            ground_truth: vec![GroundTruth {
                query_id: "test_1".to_string(),
                relevant_docs: vec!["doc_1".to_string(), "doc_2".to_string()],
                expected_answer: Some("Machine learning is...".to_string()),
                relevance_judgments: HashMap::new(),
                metadata: HashMap::new(),
            }],
            system_responses: Vec::new(),
            context: HashMap::new(),
        };

        assert_eq!(data.queries.len(), 1);
        assert_eq!(data.ground_truth.len(), 1);
    }

    /// A sample recorded during an active session is retrievable afterwards.
    #[test]
    fn test_metrics_collector() {
        let mut collector = DefaultMetricsCollector::new();
        collector.start_collection().unwrap();
        collector.record_metric("test_metric", 0.85, None).unwrap();
        collector.stop_collection().unwrap();

        let recorded = collector.get_metrics().unwrap();
        let samples = recorded.get("test_metric").expect("metric was recorded");
        assert_eq!(samples.len(), 1);
        assert_eq!(samples[0].value, 0.85);
    }
}