rexis_rag/evaluation/
mod.rs

1//! # RRAG Evaluation Framework
2//!
3//! Enterprise-grade evaluation framework for RAG systems providing comprehensive
4//! assessment capabilities based on RAGAS metrics, custom evaluation methods,
5//! and industry-standard benchmarks.
6//!
7//! This module offers a complete evaluation suite for RAG systems, enabling
8//! both component-level analysis (retrieval, generation) and end-to-end system
9//! evaluation. It supports multiple evaluation methodologies, automated benchmarking,
10//! and detailed performance analytics.
11//!
12//! ## Key Features
13//!
14//! - **RAGAS Integration**: Industry-standard RAG evaluation metrics
15//! - **Multi-Level Evaluation**: Component and system-level assessments
16//! - **Automated Benchmarking**: Built-in benchmark datasets and evaluation
17//! - **Custom Metrics**: Extensible framework for domain-specific evaluation
18//! - **Performance Analytics**: Detailed insights and recommendations
19//! - **Export Capabilities**: Multiple output formats (JSON, CSV, HTML, Markdown)
20//! - **Real-time Monitoring**: Live evaluation during system operation
21//!
22//! ## Evaluation Types
23//!
24//! 1. **RAGAS Metrics**: Faithfulness, Answer Relevancy, Context Precision, Context Recall
25//! 2. **Retrieval Evaluation**: Precision@K, Recall@K, MRR, NDCG
26//! 3. **Generation Evaluation**: BLEU, ROUGE, BERTScore, Semantic similarity
27//! 4. **End-to-End Evaluation**: Complete pipeline assessment
28//! 5. **Benchmark Evaluation**: Performance on standard datasets
29//!
30//! ## Examples
31//!
32//! ### Basic Evaluation Setup
33//! ```rust
34//! use rrag::evaluation::{
35//!     EvaluationService, EvaluationConfig, EvaluationType,
36//!     EvaluationData, TestQuery, GroundTruth
37//! };
38//!
39//! # async fn example() -> rrag::RragResult<()> {
40//! let config = EvaluationConfig {
41//!     enabled_evaluations: vec![
42//!         EvaluationType::Ragas,
43//!         EvaluationType::Retrieval,
44//!         EvaluationType::Generation,
45//!     ],
46//!     ..Default::default()
47//! };
48//!
49//! let mut evaluator = EvaluationService::new(config);
50//! tracing::debug!("📊 Evaluation service initialized with {} evaluators", 3);
51//! # Ok(())
52//! # }
53//! ```
54//!
55//! ### Running Comprehensive Evaluation
56//! ```rust
57//! use std::collections::HashMap;
58//!
59//! # async fn example() -> rrag::RragResult<()> {
60//! # let mut evaluator = rrag::evaluation::EvaluationService::new(rrag::evaluation::EvaluationConfig::default());
61//! // Prepare test data
62//! let test_queries = vec![
63//!     rrag::evaluation::TestQuery {
64//!         id: "q1".to_string(),
65//!         query: "What is machine learning?".to_string(),
66//!         query_type: Some("factual".to_string()),
67//!         metadata: HashMap::new(),
68//!     },
69//!     rrag::evaluation::TestQuery {
70//!         id: "q2".to_string(),
71//!         query: "Explain neural networks in detail".to_string(),
72//!         query_type: Some("conceptual".to_string()),
73//!         metadata: HashMap::new(),
74//!     },
75//! ];
76//!
77//! let ground_truth = vec![
78//!     rrag::evaluation::GroundTruth {
79//!         query_id: "q1".to_string(),
80//!         relevant_docs: vec!["doc_ml_intro".to_string(), "doc_ml_basics".to_string()],
81//!         expected_answer: Some(
82//!             "Machine learning is a subset of AI that enables computers to learn...".to_string()
83//!         ),
84//!         relevance_judgments: [("doc_ml_intro".to_string(), 1.0)].iter().cloned().collect(),
85//!         metadata: HashMap::new(),
86//!     },
87//! ];
88//!
89//! let evaluation_data = rrag::evaluation::EvaluationData {
90//!     queries: test_queries,
91//!     ground_truth,
92//!     system_responses: vec![], // Would be populated with actual system responses
93//!     context: HashMap::new(),
94//! };
95//!
96//! // Run evaluation
97//! let results = evaluator.evaluate(evaluation_data).await?;
98//!
99//! for (eval_type, result) in results {
100//!     tracing::debug!("🏆 {:?} Evaluation Results:", eval_type);
101//!     for (metric, score) in result.overall_scores {
102//!         tracing::debug!("  {}: {:.4}", metric, score);
103//!     }
104//! }
105//! # Ok(())
106//! # }
107//! ```
108//!
109//! ### RAGAS Evaluation
110//! ```rust
//! use rrag::evaluation::{
//!     ragas::{RagasEvaluator, RagasConfig, RagasMetric},
//!     SystemResponse, RetrievedDocument, SystemTiming
//! };
//! use std::collections::HashMap;
115//!
116//! # async fn example() -> rrag::RragResult<()> {
117//! let ragas_config = RagasConfig {
118//!     enabled_metrics: vec![
119//!         RagasMetric::Faithfulness,
120//!         RagasMetric::AnswerRelevancy,
121//!         RagasMetric::ContextPrecision,
122//!         RagasMetric::ContextRecall,
123//!     ],
124//!     ..Default::default()
125//! };
126//!
127//! let ragas_evaluator = RagasEvaluator::new(ragas_config);
128//!
129//! // Prepare system response for evaluation
130//! let system_response = SystemResponse {
131//!     query_id: "q1".to_string(),
132//!     retrieved_docs: vec![
133//!         RetrievedDocument {
134//!             doc_id: "doc_1".to_string(),
135//!             content: "Machine learning is a method of data analysis...".to_string(),
136//!             score: 0.95,
137//!             rank: 1,
138//!             metadata: HashMap::new(),
139//!         }
140//!     ],
141//!     generated_answer: Some(
142//!         "Machine learning is a subset of artificial intelligence...".to_string()
143//!     ),
144//!     timing: SystemTiming {
145//!         total_time_ms: 250.0,
146//!         retrieval_time_ms: 120.0,
147//!         generation_time_ms: Some(130.0),
148//!         reranking_time_ms: None,
149//!     },
150//!     metadata: HashMap::new(),
151//! };
152//!
153//! tracing::debug!("📈 RAGAS evaluation completed with {} metrics", 4);
154//! # Ok(())
155//! # }
156//! ```
157//!
158//! ### Retrieval-Specific Evaluation
159//! ```rust
160//! use rrag::evaluation::retrieval_eval::{
161//!     RetrievalEvaluator, RetrievalEvalConfig, RetrievalMetric
162//! };
163//!
164//! # async fn example() -> rrag::RragResult<()> {
165//! let retrieval_config = RetrievalEvalConfig {
166//!     metrics: vec![
167//!         RetrievalMetric::PrecisionAtK(10),
168//!         RetrievalMetric::RecallAtK(10),
169//!         RetrievalMetric::MeanReciprocalRank,
170//!         RetrievalMetric::NDCG(10),
171//!     ],
172//!     k_values: vec![1, 5, 10, 20],
173//!     ..Default::default()
174//! };
175//!
176//! let retrieval_evaluator = RetrievalEvaluator::new(retrieval_config);
177//!
178//! // Results will include:
179//! // - Precision@1, @5, @10, @20
180//! // - Recall@1, @5, @10, @20  
181//! // - Mean Reciprocal Rank
182//! // - Normalized Discounted Cumulative Gain
183//!
184//! tracing::debug!("🏁 Retrieval evaluation configured for multiple K values");
185//! # Ok(())
186//! # }
187//! ```
188//!
189//! ### Generation Quality Evaluation
190//! ```rust
191//! use rrag::evaluation::generation_eval::{
192//!     GenerationEvaluator, GenerationEvalConfig, GenerationMetric
193//! };
194//!
195//! # async fn example() -> rrag::RragResult<()> {
196//! let generation_config = GenerationEvalConfig {
197//!     metrics: vec![
198//!         GenerationMetric::BLEU,
199//!         GenerationMetric::ROUGE("rouge-l".to_string()),
200//!         GenerationMetric::BERTScore,
201//!         GenerationMetric::SemanticSimilarity,
202//!     ],
203//!     reference_free: false,
204//!     ..Default::default()
205//! };
206//!
207//! let generation_evaluator = GenerationEvaluator::new(generation_config);
208//!
209//! // Evaluates generated answers against reference answers
210//! // Provides detailed analysis of:
211//! // - Lexical similarity (BLEU, ROUGE)
212//! // - Semantic similarity (BERTScore, embeddings)
213//! // - Factual accuracy
214//! // - Fluency and coherence
215//!
216//! tracing::debug!("✍️ Generation evaluation ready for quality assessment");
217//! # Ok(())
218//! # }
219//! ```
220//!
221//! ### End-to-End System Evaluation
222//! ```rust
223//! use rrag::evaluation::end_to_end::{
224//!     EndToEndEvaluator, EndToEndConfig, E2EMetric
225//! };
226//!
227//! # async fn example() -> rrag::RragResult<()> {
228//! let e2e_config = EndToEndConfig {
229//!     metrics: vec![
230//!         E2EMetric::OverallAccuracy,
231//!         E2EMetric::ResponseTime,
232//!         E2EMetric::UserSatisfaction,
233//!         E2EMetric::CostEfficiency,
234//!     ],
235//!     include_ablation_study: true,
236//!     ..Default::default()
237//! };
238//!
239//! let e2e_evaluator = EndToEndEvaluator::new(e2e_config);
240//!
241//! // Comprehensive system evaluation including:
242//! // - End-to-end accuracy
243//! // - Performance benchmarks
244//! // - Resource utilization
245//! // - Error analysis
246//! // - Component contribution analysis
247//!
248//! tracing::debug!("🎆 End-to-end evaluation configured for complete system assessment");
249//! # Ok(())
250//! # }
251//! ```
252//!
253//! ### Automated Benchmarking
254//! ```rust
255//! use rrag::evaluation::benchmarks::{
256//!     BenchmarkEvaluator, BenchmarkSuite, BenchmarkDataset
257//! };
258//!
259//! # async fn example() -> rrag::RragResult<()> {
260//! let benchmark_evaluator = BenchmarkEvaluator::new();
261//!
262//! let benchmark_suite = BenchmarkSuite {
263//!     datasets: vec![
264//!         BenchmarkDataset::MS_MARCO,
265//!         BenchmarkDataset::Natural_Questions,
266//!         BenchmarkDataset::SQuAD_2_0,
267//!         BenchmarkDataset::BEIR,
268//!     ],
269//!     custom_datasets: vec![], // Add domain-specific datasets
270//!     evaluation_mode: "comprehensive".to_string(),
271//! };
272//!
273//! // Run against standard benchmarks
274//! // let results = benchmark_evaluator.run_benchmark_suite(benchmark_suite).await?;
275//!
276//! tracing::debug!("📅 Benchmark evaluation ready with {} standard datasets", 4);
277//! # Ok(())
278//! # }
279//! ```
280//!
281//! ### Exporting Evaluation Results
282//! ```rust
283//! use rrag::evaluation::{ExportFormat, OutputConfig};
284//!
285//! # async fn example() -> rrag::RragResult<()> {
286//! # let evaluator = rrag::evaluation::EvaluationService::new(rrag::evaluation::EvaluationConfig::default());
287//! # let results = std::collections::HashMap::new(); // Mock results
288//! // Configure export options
289//! let output_config = OutputConfig {
290//!     export_formats: vec![
291//!         ExportFormat::Json,    // Machine-readable results
292//!         ExportFormat::Html,    // Interactive reports
293//!         ExportFormat::Csv,     // Spreadsheet analysis
294//!         ExportFormat::Markdown // Documentation
295//!     ],
296//!     output_dir: "./evaluation_results".to_string(),
297//!     include_detailed_logs: true,
298//!     generate_visualizations: true,
299//! };
300//!
301//! // Export comprehensive results
302//! evaluator.export_results(&results).await?;
303//!
304//! tracing::debug!("📊 Results exported in multiple formats:");
305//! tracing::debug!("  • evaluation_results.json - Complete data");
306//! tracing::debug!("  • evaluation_report.html - Interactive dashboard");
307//! tracing::debug!("  • evaluation_summary.csv - Quick analysis");
308//! tracing::debug!("  • evaluation_report.md - Documentation");
309//! # Ok(())
310//! # }
311//! ```
312//!
313//! ### Real-time Evaluation Monitoring
//! ```rust
//! use tracing::{error, warn};
//!
//! # async fn example() -> rrag::RragResult<()> {
316//! # let evaluator = rrag::evaluation::EvaluationService::new(rrag::evaluation::EvaluationConfig::default());
317//! // Monitor evaluation metrics in real-time
318//! let metrics = evaluator.get_metrics()?;
319//!
320//! for (metric_name, records) in metrics {
321//!     let latest = records.last().unwrap();
322//!     match metric_name.as_str() {
323//!         "evaluation_time_ms" => {
324//!             if latest.value > 5000.0 {
325//!                 warn!("  Evaluation taking longer than expected: {:.1}ms", latest.value);
326//!             }
327//!         }
328//!         "evaluation_errors" => {
329//!             if latest.value > 0.0 {
330//!                 error!(" Evaluation errors detected: {}", latest.value);
331//!             }
332//!         }
333//!         _ => {
334//!             tracing::debug!("📈 {}: {:.3}", metric_name, latest.value);
335//!         }
336//!     }
337//! }
338//! # Ok(())
339//! # }
340//! ```
341//!
342//! ## Evaluation Best Practices
343//!
344//! ### Dataset Preparation
345//! - Use diverse, representative test queries
346//! - Include edge cases and challenging examples
347//! - Ensure high-quality ground truth annotations
348//! - Balance different query types and complexities
349//!
350//! ### Metric Selection
351//! - Choose metrics aligned with your use case
352//! - Combine automatic and human evaluation
353//! - Consider both accuracy and efficiency metrics
354//! - Include domain-specific evaluation criteria
355//!
356//! ### Performance Optimization
357//! - Run evaluations in batch for efficiency
358//! - Use parallel evaluation when possible
359//! - Cache expensive computations
360//! - Monitor resource usage during evaluation
361//!
362//! ### Result Interpretation
363//! - Consider statistical significance
364//! - Analyze results by query type and complexity
365//! - Look for systematic errors and patterns
366//! - Compare against established baselines
367//!
368//! ## Integration with RAG Systems
369//!
370//! ```rust
371//! use rrag::{RragSystemBuilder, evaluation::EvaluationConfig};
372//!
373//! # async fn example() -> rrag::RragResult<()> {
374//! let rag_system = RragSystemBuilder::new()
375//!     .with_evaluation(
376//!         EvaluationConfig::production()
377//!             .with_ragas_metrics(true)
378//!             .with_real_time_monitoring(true)
379//!             .with_automated_benchmarking(true)
380//!     )
381//!     .build()
382//!     .await?;
383//!
384//! // System automatically evaluates performance and provides insights
385//! let results = rag_system.search_with_evaluation("query", Some(10)).await?;
386//! # Ok(())
387//! # }
388//! ```
389
390pub mod benchmarks;
391pub mod end_to_end;
392pub mod generation_eval;
393pub mod metrics;
394pub mod ragas;
395pub mod retrieval_eval;
396
397use crate::{RragError, RragResult};
398use serde::{Deserialize, Serialize};
399use std::collections::HashMap;
400use tracing::{error, info};
401
/// Main evaluation service
///
/// Owns one evaluator per enabled [`EvaluationType`] plus a metrics
/// collector, and orchestrates running every enabled evaluation over a
/// dataset via [`EvaluationService::evaluate`].
pub struct EvaluationService {
    /// Configuration the service was built with
    config: EvaluationConfig,

    /// Evaluators for different components, keyed by the evaluation type
    /// they implement (populated from `config.enabled_evaluations`)
    evaluators: HashMap<EvaluationType, Box<dyn Evaluator>>,

    /// Metrics collection sink for timing/error metrics recorded while
    /// evaluations run
    metrics_collector: Box<dyn MetricsCollector>,
}
413
/// Configuration for evaluation service
///
/// Each enabled [`EvaluationType`] is paired with its own sub-configuration
/// below; sub-configs for evaluation types that are not enabled are ignored.
#[derive(Debug, Clone)]
pub struct EvaluationConfig {
    /// Enabled evaluation types (one evaluator is instantiated per entry)
    pub enabled_evaluations: Vec<EvaluationType>,

    /// RAGAS configuration (used when `EvaluationType::Ragas` is enabled)
    pub ragas_config: ragas::RagasConfig,

    /// Retrieval evaluation config (Precision@K, Recall@K, MRR, NDCG, …)
    pub retrieval_config: retrieval_eval::RetrievalEvalConfig,

    /// Generation evaluation config (BLEU, ROUGE, semantic similarity, …)
    pub generation_config: generation_eval::GenerationEvalConfig,

    /// End-to-end evaluation config
    pub e2e_config: end_to_end::EndToEndConfig,

    /// Output configuration controlling where and how results are exported
    pub output_config: OutputConfig,
}
435
436impl Default for EvaluationConfig {
437    fn default() -> Self {
438        Self {
439            enabled_evaluations: vec![
440                EvaluationType::Ragas,
441                EvaluationType::Retrieval,
442                EvaluationType::Generation,
443            ],
444            ragas_config: ragas::RagasConfig::default(),
445            retrieval_config: retrieval_eval::RetrievalEvalConfig::default(),
446            generation_config: generation_eval::GenerationEvalConfig::default(),
447            e2e_config: end_to_end::EndToEndConfig::default(),
448            output_config: OutputConfig::default(),
449        }
450    }
451}
452
/// Types of evaluation
///
/// Used both to select which evaluators run (see
/// [`EvaluationConfig::enabled_evaluations`]) and as the key of the result
/// map returned by `EvaluationService::evaluate`.
#[derive(Debug, Clone, Hash, PartialEq, Eq, Serialize, Deserialize)]
pub enum EvaluationType {
    /// RAGAS metrics evaluation (faithfulness, answer relevancy,
    /// context precision/recall)
    Ragas,
    /// Retrieval-specific evaluation (ranking metrics such as
    /// Precision@K, Recall@K, MRR, NDCG)
    Retrieval,
    /// Generation-specific evaluation (answer quality metrics)
    Generation,
    /// End-to-end system evaluation of the whole pipeline
    EndToEnd,
    /// Benchmark evaluation against standard datasets
    Benchmark,
}
467
/// Output configuration for evaluation results
///
/// Consumed by `EvaluationService::export_results` to decide which report
/// files to write and where.
#[derive(Debug, Clone)]
pub struct OutputConfig {
    /// Export formats to produce (one report file per format)
    pub export_formats: Vec<ExportFormat>,

    /// Output directory; created on demand when exporting
    pub output_dir: String,

    /// Include detailed logs in exported reports
    pub include_detailed_logs: bool,

    /// Generate visualizations alongside the reports
    pub generate_visualizations: bool,
}
483
484impl Default for OutputConfig {
485    fn default() -> Self {
486        Self {
487            export_formats: vec![ExportFormat::Json, ExportFormat::Csv],
488            output_dir: "./evaluation_results".to_string(),
489            include_detailed_logs: true,
490            generate_visualizations: false,
491        }
492    }
493}
494
/// Export formats for evaluation results
///
/// Controls which report files `EvaluationService::export_results` writes.
/// `PartialEq`/`Eq`/`Hash` are derived so callers can test for or dedupe a
/// specific format, and `Copy` because the enum is a plain tag. (Sibling
/// enums in this module carry equality derives; this one was missing them.)
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ExportFormat {
    /// Machine-readable `evaluation_results.json`
    Json,
    /// Flat `evaluation_summary.csv` (`evaluator,metric,value` rows)
    Csv,
    /// Human-readable `evaluation_report.html` report
    Html,
    /// Markdown documentation report
    Markdown,
}
503
/// Main trait for evaluators
///
/// Implemented by every concrete evaluator in this module's submodules
/// (RAGAS, retrieval, generation, end-to-end, benchmarks). `Send + Sync`
/// is required so the service can hold implementations behind
/// `Box<dyn Evaluator>`.
pub trait Evaluator: Send + Sync {
    /// Evaluator name, used in logs and as a metric label
    fn name(&self) -> &str;

    /// Run evaluation over the full dataset, returning aggregate scores
    /// and per-query results
    fn evaluate(&self, evaluation_data: &EvaluationData) -> RragResult<EvaluationResult>;

    /// Get the names of the metrics this evaluator can produce
    fn supported_metrics(&self) -> Vec<String>;

    /// Get evaluator configuration (name, version, metrics, performance)
    fn get_config(&self) -> EvaluatorConfig;
}
518
/// Configuration for individual evaluators
///
/// Descriptive metadata returned by [`Evaluator::get_config`]; it does not
/// control evaluator behavior.
#[derive(Debug, Clone)]
pub struct EvaluatorConfig {
    /// Evaluator name
    pub name: String,

    /// Version of the evaluator implementation
    pub version: String,

    /// Names of the metrics this evaluator produces
    pub metrics: Vec<String>,

    /// Performance characteristics (timing, memory, accuracy)
    pub performance: EvaluatorPerformance,
}
534
/// Performance characteristics of evaluators
#[derive(Debug, Clone)]
pub struct EvaluatorPerformance {
    /// Average evaluation time per sample (ms)
    pub avg_time_per_sample_ms: f32,

    /// Memory usage (MB)
    pub memory_usage_mb: f32,

    /// Accuracy of evaluation
    // NOTE(review): presumably a 0.0–1.0 fraction describing how well the
    // automated metric tracks human judgment — confirm with implementors.
    pub accuracy: f32,
}
547
/// Input data for evaluation
///
/// The three vectors are correlated by query id: each [`GroundTruth`] and
/// [`SystemResponse`] refers back to a [`TestQuery`] through `query_id`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationData {
    /// Test queries to evaluate against
    pub queries: Vec<TestQuery>,

    /// Ground truth data, matched to queries via `GroundTruth::query_id`
    pub ground_truth: Vec<GroundTruth>,

    /// System responses, matched to queries via `SystemResponse::query_id`
    pub system_responses: Vec<SystemResponse>,

    /// Additional free-form context shared across all evaluations
    pub context: HashMap<String, serde_json::Value>,
}
563
/// Test query
///
/// A single query in the evaluation dataset; `id` is the join key used by
/// [`GroundTruth`] and [`SystemResponse`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestQuery {
    /// Query ID, unique within the dataset
    pub id: String,

    /// Query text
    pub query: String,

    /// Expected query type/intent (e.g. "factual", "conceptual")
    pub query_type: Option<String>,

    /// Free-form query metadata
    pub metadata: HashMap<String, serde_json::Value>,
}
579
/// Ground truth data for evaluation
///
/// Reference labels for one query, used to score retrieval (via
/// `relevant_docs`/`relevance_judgments`) and generation (via
/// `expected_answer`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundTruth {
    /// ID of the [`TestQuery`] these labels belong to
    pub query_id: String,

    /// IDs of documents considered relevant to the query
    pub relevant_docs: Vec<String>,

    /// Expected answer/response, if generation should be scored
    pub expected_answer: Option<String>,

    /// Graded relevance judgments (document_id -> relevance_score)
    pub relevance_judgments: HashMap<String, f32>,

    /// Additional ground truth data
    pub metadata: HashMap<String, serde_json::Value>,
}
598
/// System response for evaluation
///
/// Captures what the RAG system actually produced for one query, to be
/// compared against the matching [`GroundTruth`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemResponse {
    /// ID of the [`TestQuery`] this response answers
    pub query_id: String,

    /// Retrieved documents, in rank order
    pub retrieved_docs: Vec<RetrievedDocument>,

    /// Generated answer (None for retrieval-only pipelines)
    pub generated_answer: Option<String>,

    /// System timing information for this response
    pub timing: SystemTiming,

    /// Response metadata
    pub metadata: HashMap<String, serde_json::Value>,
}
617
/// Retrieved document information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievedDocument {
    /// Document ID, matched against [`GroundTruth::relevant_docs`]
    pub doc_id: String,

    /// Document content as returned by the retriever
    pub content: String,

    /// Retrieval score assigned by the retriever (higher = more relevant)
    pub score: f32,

    /// Rank in retrieval results
    // NOTE(review): the module doc example uses rank 1 for the top hit —
    // presumably 1-based; confirm against the retriever implementations.
    pub rank: usize,

    /// Document metadata
    pub metadata: HashMap<String, serde_json::Value>,
}
636
/// System timing information
///
/// Per-stage latency breakdown of one system response. All values are in
/// milliseconds; optional stages are `None` when the pipeline skipped them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemTiming {
    /// Total response time (ms)
    pub total_time_ms: f32,

    /// Retrieval time (ms)
    pub retrieval_time_ms: f32,

    /// Generation time (ms), if generation ran
    pub generation_time_ms: Option<f32>,

    /// Reranking time (ms), if a reranker ran
    pub reranking_time_ms: Option<f32>,
}
652
/// Evaluation result
///
/// The output of one [`Evaluator::evaluate`] run: aggregate scores plus a
/// per-query breakdown, summary statistics, and provenance metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationResult {
    /// Evaluation ID
    pub id: String,

    /// Evaluation type, as a human-readable string
    pub evaluation_type: String,

    /// Overall scores, keyed by metric name
    pub overall_scores: HashMap<String, f32>,

    /// Per-query results
    pub per_query_results: Vec<QueryEvaluationResult>,

    /// Summary statistics across all queries
    pub summary: EvaluationSummary,

    /// Evaluation metadata (timestamp, versions, environment)
    pub metadata: EvaluationMetadata,
}
674
/// Per-query evaluation result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryEvaluationResult {
    /// ID of the evaluated query
    pub query_id: String,

    /// Metric scores for this query, keyed by metric name
    pub scores: HashMap<String, f32>,

    /// Errors encountered while evaluating this query (empty on success)
    pub errors: Vec<EvaluationError>,

    /// Additional evaluator-specific details
    pub details: HashMap<String, serde_json::Value>,
}
690
/// Evaluation error
///
/// A non-fatal problem detected while scoring a single query; collected in
/// [`QueryEvaluationResult::errors`] rather than aborting the evaluation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationError {
    /// Error type/category
    pub error_type: String,

    /// Human-readable error message
    pub message: String,

    /// Error severity
    pub severity: ErrorSeverity,

    /// Suggested fixes for resolving the error
    pub suggestions: Vec<String>,
}
706
707/// Error severity levels
708#[derive(Debug, Clone, Serialize, Deserialize)]
709pub enum ErrorSeverity {
710    Low,
711    Medium,
712    High,
713    Critical,
714}
715
/// Evaluation summary
///
/// Aggregate statistics computed over all per-query results of one
/// evaluation run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationSummary {
    /// Number of queries evaluated
    pub total_queries: usize,

    /// Average scores across all queries, keyed by metric name
    pub avg_scores: HashMap<String, f32>,

    /// Standard deviations of the scores, keyed by metric name
    pub std_deviations: HashMap<String, f32>,

    /// Performance statistics for the evaluation run itself
    pub performance_stats: PerformanceStats,

    /// Key insights derived from the results
    pub insights: Vec<String>,

    /// Recommendations for improving the system under evaluation
    pub recommendations: Vec<String>,
}
737
/// Performance statistics
///
/// Resource/timing figures describing the evaluation run itself (not the
/// system under evaluation).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceStats {
    /// Average evaluation time per query (ms)
    pub avg_eval_time_ms: f32,

    /// Total evaluation time (ms)
    pub total_eval_time_ms: f32,

    /// Peak memory usage during evaluation (MB)
    pub peak_memory_usage_mb: f32,

    /// Throughput (queries per second)
    pub throughput_qps: f32,
}
753
/// Evaluation metadata
///
/// Provenance information attached to every [`EvaluationResult`] so runs
/// can be reproduced and compared over time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationMetadata {
    /// Evaluation timestamp (UTC)
    pub timestamp: chrono::DateTime<chrono::Utc>,

    /// Version of the evaluation framework/code used
    pub evaluation_version: String,

    /// Snapshot of the system configuration under evaluation
    pub system_config: HashMap<String, serde_json::Value>,

    /// Environment information (e.g. host, OS, runtime details)
    pub environment: HashMap<String, String>,

    /// Git commit hash of the evaluated code (if available)
    pub git_commit: Option<String>,
}
772
/// Trait for collecting metrics during evaluation
///
/// Lifecycle: `start_collection` → any number of `record_metric` calls →
/// `stop_collection`; collected records can then be read back via
/// `get_metrics` or written out via `export_metrics`.
pub trait MetricsCollector: Send + Sync {
    /// Start collecting metrics
    fn start_collection(&mut self) -> RragResult<()>;

    /// Stop collecting metrics
    fn stop_collection(&mut self) -> RragResult<()>;

    /// Record a single metric observation, optionally tagged with labels
    /// (e.g. which evaluator produced it)
    fn record_metric(
        &mut self,
        name: &str,
        value: f32,
        labels: Option<&HashMap<String, String>>,
    ) -> RragResult<()>;

    /// Get collected metrics, keyed by metric name
    fn get_metrics(&self) -> RragResult<HashMap<String, Vec<MetricRecord>>>;

    /// Export metrics to a file at `output_path` in the given format
    fn export_metrics(&self, format: &ExportFormat, output_path: &str) -> RragResult<()>;
}
795
/// Individual metric record
///
/// One timestamped observation produced through
/// [`MetricsCollector::record_metric`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricRecord {
    /// Metric name
    pub name: String,

    /// Metric value
    pub value: f32,

    /// Timestamp of the observation (UTC)
    pub timestamp: chrono::DateTime<chrono::Utc>,

    /// Labels/tags attached when the metric was recorded
    pub labels: HashMap<String, String>,
}
811
812impl EvaluationService {
813    /// Create new evaluation service
814    pub fn new(config: EvaluationConfig) -> Self {
815        let mut service = Self {
816            config: config.clone(),
817            evaluators: HashMap::new(),
818            metrics_collector: Box::new(DefaultMetricsCollector::new()),
819        };
820
821        // Initialize evaluators
822        service.initialize_evaluators();
823
824        service
825    }
826
827    /// Initialize evaluators based on configuration
828    fn initialize_evaluators(&mut self) {
829        for eval_type in &self.config.enabled_evaluations {
830            let evaluator: Box<dyn Evaluator> = match eval_type {
831                EvaluationType::Ragas => {
832                    Box::new(ragas::RagasEvaluator::new(self.config.ragas_config.clone()))
833                }
834                EvaluationType::Retrieval => Box::new(retrieval_eval::RetrievalEvaluator::new(
835                    self.config.retrieval_config.clone(),
836                )),
837                EvaluationType::Generation => Box::new(generation_eval::GenerationEvaluator::new(
838                    self.config.generation_config.clone(),
839                )),
840                EvaluationType::EndToEnd => Box::new(end_to_end::EndToEndEvaluator::new(
841                    self.config.e2e_config.clone(),
842                )),
843                EvaluationType::Benchmark => Box::new(benchmarks::BenchmarkEvaluator::new()),
844            };
845
846            self.evaluators.insert(eval_type.clone(), evaluator);
847        }
848    }
849
850    /// Run evaluation on provided data
851    pub async fn evaluate(
852        &mut self,
853        data: EvaluationData,
854    ) -> RragResult<HashMap<EvaluationType, EvaluationResult>> {
855        let mut results = HashMap::new();
856
857        // Start metrics collection
858        self.metrics_collector.start_collection()?;
859
860        let start_time = std::time::Instant::now();
861
862        // Run each enabled evaluation
863        for (eval_type, evaluator) in &self.evaluators {
864            tracing::debug!("Running {} evaluation...", evaluator.name());
865
866            let eval_start = std::time::Instant::now();
867
868            match evaluator.evaluate(&data) {
869                Ok(result) => {
870                    let eval_time = eval_start.elapsed().as_millis() as f32;
871                    self.metrics_collector.record_metric(
872                        "evaluation_time_ms",
873                        eval_time,
874                        Some(
875                            &[("evaluator".to_string(), evaluator.name().to_string())]
876                                .iter()
877                                .cloned()
878                                .collect(),
879                        ),
880                    )?;
881
882                    results.insert(eval_type.clone(), result);
883                    tracing::debug!(
884                        "✅ {} evaluation completed in {:.2}ms",
885                        evaluator.name(),
886                        eval_time
887                    );
888                }
889                Err(e) => {
890                    error!(" {} evaluation failed: {}", evaluator.name(), e);
891                    self.metrics_collector.record_metric(
892                        "evaluation_errors",
893                        1.0,
894                        Some(
895                            &[("evaluator".to_string(), evaluator.name().to_string())]
896                                .iter()
897                                .cloned()
898                                .collect(),
899                        ),
900                    )?;
901                }
902            }
903        }
904
905        let total_time = start_time.elapsed().as_millis() as f32;
906        self.metrics_collector
907            .record_metric("total_evaluation_time_ms", total_time, None)?;
908
909        // Stop metrics collection
910        self.metrics_collector.stop_collection()?;
911
912        Ok(results)
913    }
914
915    /// Export evaluation results
916    pub async fn export_results(
917        &self,
918        results: &HashMap<EvaluationType, EvaluationResult>,
919    ) -> RragResult<()> {
920        // Create output directory
921        std::fs::create_dir_all(&self.config.output_config.output_dir).map_err(|e| {
922            RragError::evaluation(format!("Failed to create output directory: {}", e))
923        })?;
924
925        for format in &self.config.output_config.export_formats {
926            match format {
927                ExportFormat::Json => self.export_json(results).await?,
928                ExportFormat::Csv => self.export_csv(results).await?,
929                ExportFormat::Html => self.export_html(results).await?,
930                ExportFormat::Markdown => self.export_markdown(results).await?,
931            }
932        }
933
934        Ok(())
935    }
936
937    /// Export results as JSON
938    async fn export_json(
939        &self,
940        results: &HashMap<EvaluationType, EvaluationResult>,
941    ) -> RragResult<()> {
942        let json_path = format!(
943            "{}/evaluation_results.json",
944            self.config.output_config.output_dir
945        );
946        let json_content = serde_json::to_string_pretty(results)
947            .map_err(|e| RragError::evaluation(format!("Failed to serialize results: {}", e)))?;
948
949        std::fs::write(&json_path, json_content)
950            .map_err(|e| RragError::evaluation(format!("Failed to write JSON file: {}", e)))?;
951
952        info!(" Results exported to {}", json_path);
953        Ok(())
954    }
955
956    /// Export results as CSV
957    async fn export_csv(
958        &self,
959        results: &HashMap<EvaluationType, EvaluationResult>,
960    ) -> RragResult<()> {
961        let csv_path = format!(
962            "{}/evaluation_summary.csv",
963            self.config.output_config.output_dir
964        );
965        let mut csv_content = String::new();
966
967        // Header
968        csv_content.push_str("evaluator,metric,value\n");
969
970        // Data
971        for (eval_type, result) in results {
972            for (metric, value) in &result.overall_scores {
973                csv_content.push_str(&format!("{:?},{},{}\n", eval_type, metric, value));
974            }
975        }
976
977        std::fs::write(&csv_path, csv_content)
978            .map_err(|e| RragError::evaluation(format!("Failed to write CSV file: {}", e)))?;
979
980        info!(" Summary exported to {}", csv_path);
981        Ok(())
982    }
983
984    /// Export results as HTML
985    async fn export_html(
986        &self,
987        results: &HashMap<EvaluationType, EvaluationResult>,
988    ) -> RragResult<()> {
989        let html_path = format!(
990            "{}/evaluation_report.html",
991            self.config.output_config.output_dir
992        );
993        let mut html_content = String::from(
994            r#"
995<!DOCTYPE html>
996<html>
997<head>
998    <title>RRAG Evaluation Report</title>
999    <style>
1000        body { font-family: Arial, sans-serif; margin: 40px; }
1001        .header { border-bottom: 2px solid #333; margin-bottom: 30px; }
1002        .evaluator { margin-bottom: 40px; border: 1px solid #ddd; padding: 20px; }
1003        .metric { margin: 10px 0; }
1004        .score { font-weight: bold; color: #2196F3; }
1005        table { border-collapse: collapse; width: 100%; }
1006        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
1007        th { background-color: #f2f2f2; }
1008    </style>
1009</head>
1010<body>
1011    <div class="header">
1012        <h1>🎯 RRAG Evaluation Report</h1>
1013        <p>Generated on: "#,
1014        );
1015
1016        html_content.push_str(
1017            &chrono::Utc::now()
1018                .format("%Y-%m-%d %H:%M:%S UTC")
1019                .to_string(),
1020        );
1021        html_content.push_str("</p>\n    </div>\n");
1022
1023        for (eval_type, result) in results {
1024            html_content.push_str(&format!(
1025                r#"
1026    <div class="evaluator">
1027        <h2>📊 {:?} Evaluation</h2>
1028        <h3>Overall Scores</h3>
1029        <table>
1030            <tr><th>Metric</th><th>Score</th></tr>"#,
1031                eval_type
1032            ));
1033
1034            for (metric, score) in &result.overall_scores {
1035                html_content.push_str(&format!(
1036                    "<tr><td>{}</td><td class=\"score\">{:.4}</td></tr>",
1037                    metric, score
1038                ));
1039            }
1040
1041            html_content.push_str("</table>\n");
1042
1043            if !result.summary.insights.is_empty() {
1044                html_content.push_str("<h3>Key Insights</h3><ul>");
1045                for insight in &result.summary.insights {
1046                    html_content.push_str(&format!("<li>{}</li>", insight));
1047                }
1048                html_content.push_str("</ul>");
1049            }
1050
1051            html_content.push_str("    </div>\n");
1052        }
1053
1054        html_content.push_str("</body>\n</html>");
1055
1056        std::fs::write(&html_path, html_content)
1057            .map_err(|e| RragError::evaluation(format!("Failed to write HTML file: {}", e)))?;
1058
1059        info!(" Report exported to {}", html_path);
1060        Ok(())
1061    }
1062
1063    /// Export results as Markdown
1064    async fn export_markdown(
1065        &self,
1066        results: &HashMap<EvaluationType, EvaluationResult>,
1067    ) -> RragResult<()> {
1068        let md_path = format!(
1069            "{}/evaluation_report.md",
1070            self.config.output_config.output_dir
1071        );
1072        let mut md_content = String::from("# 🎯 RRAG Evaluation Report\n\n");
1073
1074        md_content.push_str(&format!(
1075            "**Generated on:** {}\n\n",
1076            chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
1077        ));
1078
1079        for (eval_type, result) in results {
1080            md_content.push_str(&format!("## 📊 {:?} Evaluation\n\n", eval_type));
1081
1082            md_content.push_str("### Overall Scores\n\n");
1083            md_content.push_str("| Metric | Score |\n|--------|-------|\n");
1084
1085            for (metric, score) in &result.overall_scores {
1086                md_content.push_str(&format!("| {} | {:.4} |\n", metric, score));
1087            }
1088
1089            if !result.summary.insights.is_empty() {
1090                md_content.push_str("\n### Key Insights\n\n");
1091                for insight in &result.summary.insights {
1092                    md_content.push_str(&format!("- {}\n", insight));
1093                }
1094            }
1095
1096            if !result.summary.recommendations.is_empty() {
1097                md_content.push_str("\n### Recommendations\n\n");
1098                for recommendation in &result.summary.recommendations {
1099                    md_content.push_str(&format!("- {}\n", recommendation));
1100                }
1101            }
1102
1103            md_content.push_str("\n---\n\n");
1104        }
1105
1106        std::fs::write(&md_path, md_content)
1107            .map_err(|e| RragError::evaluation(format!("Failed to write Markdown file: {}", e)))?;
1108
1109        info!(" Markdown report exported to {}", md_path);
1110        Ok(())
1111    }
1112
    /// Get evaluation metrics.
    ///
    /// Returns a snapshot of all metric records gathered by the underlying
    /// metrics collector, keyed by metric name. Delegates directly to the
    /// configured `MetricsCollector` implementation.
    pub fn get_metrics(&self) -> RragResult<HashMap<String, Vec<MetricRecord>>> {
        self.metrics_collector.get_metrics()
    }
1117}
1118
/// Default metrics collector implementation.
///
/// In-memory collector that buffers metric records grouped by metric name.
/// Recording is gated by the `collecting` flag: samples submitted outside an
/// active collection session are silently dropped.
pub struct DefaultMetricsCollector {
    // Recorded samples, keyed by metric name; per-name order is insertion order.
    metrics: HashMap<String, Vec<MetricRecord>>,
    // True while a collection session (start_collection .. stop_collection) is active.
    collecting: bool,
}
1124
1125impl DefaultMetricsCollector {
1126    pub fn new() -> Self {
1127        Self {
1128            metrics: HashMap::new(),
1129            collecting: false,
1130        }
1131    }
1132}
1133
1134impl MetricsCollector for DefaultMetricsCollector {
1135    fn start_collection(&mut self) -> RragResult<()> {
1136        self.collecting = true;
1137        self.metrics.clear();
1138        Ok(())
1139    }
1140
1141    fn stop_collection(&mut self) -> RragResult<()> {
1142        self.collecting = false;
1143        Ok(())
1144    }
1145
1146    fn record_metric(
1147        &mut self,
1148        name: &str,
1149        value: f32,
1150        labels: Option<&HashMap<String, String>>,
1151    ) -> RragResult<()> {
1152        if !self.collecting {
1153            return Ok(());
1154        }
1155
1156        let record = MetricRecord {
1157            name: name.to_string(),
1158            value,
1159            timestamp: chrono::Utc::now(),
1160            labels: labels.cloned().unwrap_or_default(),
1161        };
1162
1163        self.metrics
1164            .entry(name.to_string())
1165            .or_insert_with(Vec::new)
1166            .push(record);
1167        Ok(())
1168    }
1169
1170    fn get_metrics(&self) -> RragResult<HashMap<String, Vec<MetricRecord>>> {
1171        Ok(self.metrics.clone())
1172    }
1173
1174    fn export_metrics(&self, format: &ExportFormat, output_path: &str) -> RragResult<()> {
1175        match format {
1176            ExportFormat::Json => {
1177                let json_content = serde_json::to_string_pretty(&self.metrics).map_err(|e| {
1178                    RragError::evaluation(format!("Failed to serialize metrics: {}", e))
1179                })?;
1180                std::fs::write(output_path, json_content).map_err(|e| {
1181                    RragError::evaluation(format!("Failed to write metrics file: {}", e))
1182                })?;
1183            }
1184            _ => {
1185                return Err(RragError::evaluation(
1186                    "Unsupported export format for metrics".to_string(),
1187                ));
1188            }
1189        }
1190        Ok(())
1191    }
1192}
1193
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration must ship with the three core evaluation
    /// types enabled.
    #[test]
    fn test_evaluation_config_creation() {
        let cfg = EvaluationConfig::default();
        for required in [
            EvaluationType::Ragas,
            EvaluationType::Retrieval,
            EvaluationType::Generation,
        ] {
            assert!(cfg.enabled_evaluations.contains(&required));
        }
    }

    /// A minimal dataset — one query paired with one ground-truth entry —
    /// can be assembled and holds its contents.
    #[test]
    fn test_evaluation_data_creation() {
        let test_query = TestQuery {
            id: "test_1".to_string(),
            query: "What is machine learning?".to_string(),
            query_type: Some("factual".to_string()),
            metadata: HashMap::new(),
        };

        let truth = GroundTruth {
            query_id: "test_1".to_string(),
            relevant_docs: vec!["doc_1".to_string(), "doc_2".to_string()],
            expected_answer: Some("Machine learning is...".to_string()),
            relevance_judgments: HashMap::new(),
            metadata: HashMap::new(),
        };

        let dataset = EvaluationData {
            queries: vec![test_query],
            ground_truth: vec![truth],
            system_responses: vec![],
            context: HashMap::new(),
        };

        assert_eq!(dataset.queries.len(), 1);
        assert_eq!(dataset.ground_truth.len(), 1);
    }

    /// A sample recorded during an active session is retrievable afterwards.
    #[test]
    fn test_metrics_collector() {
        let mut mc = DefaultMetricsCollector::new();

        mc.start_collection().unwrap();
        mc.record_metric("test_metric", 0.85, None).unwrap();
        mc.stop_collection().unwrap();

        let recorded = mc.get_metrics().unwrap();
        assert!(recorded.contains_key("test_metric"));
        assert_eq!(recorded["test_metric"].len(), 1);
        assert_eq!(recorded["test_metric"][0].value, 0.85);
    }
}