rexis_rag/evaluation/
end_to_end.rs

1//! # End-to-End Evaluation Module
2//!
3//! Comprehensive evaluation that considers the entire RAG pipeline
4//! including user experience, system performance, and holistic quality.
5
6use super::{
7    EvaluationData, EvaluationMetadata, EvaluationResult, EvaluationSummary, Evaluator,
8    EvaluatorConfig, EvaluatorPerformance, PerformanceStats,
9};
10use crate::RragResult;
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use tracing::warn;
14
/// End-to-end evaluator: aggregates system-level RAG quality, performance,
/// and reliability signals into a single weighted score.
pub struct EndToEndEvaluator {
    // Weighting, threshold, and per-metric configuration.
    config: EndToEndConfig,
    // Metric implementations instantiated from `config.enabled_metrics`.
    metrics: Vec<Box<dyn E2EMetric>>,
}
20
/// Configuration for end-to-end evaluation.
///
/// The four category weights are normalized by their sum at scoring time
/// (see `calculate_overall_score`), so they do not strictly need to sum to 1.0.
#[derive(Debug, Clone)]
pub struct EndToEndConfig {
    /// Enabled metrics; each is instantiated by `initialize_metrics`.
    pub enabled_metrics: Vec<E2EMetricType>,

    /// Weight of user-experience metrics in the overall score
    pub user_experience_weight: f32,

    /// Weight of system-performance metrics (latency/throughput/resources)
    pub system_performance_weight: f32,

    /// Weight of the overall-quality metric
    pub quality_weight: f32,

    /// Weight of the robustness metric
    pub robustness_weight: f32,

    /// Performance thresholds used by latency/throughput/resource metrics
    pub performance_thresholds: PerformanceThresholds,

    /// Component weights for the user-satisfaction metric
    pub user_satisfaction_config: UserSatisfactionConfig,

    /// Reliability parameters for consistency/error-rate metrics
    pub system_reliability_config: SystemReliabilityConfig,
}
48
impl Default for EndToEndConfig {
    fn default() -> Self {
        Self {
            // Default metric set; ResourceEfficiency/ErrorRate exist but are
            // opt-in, and some variants are not yet implemented.
            enabled_metrics: vec![
                E2EMetricType::UserSatisfaction,
                E2EMetricType::SystemLatency,
                E2EMetricType::SystemThroughput,
                E2EMetricType::OverallQuality,
                E2EMetricType::Robustness,
                E2EMetricType::Consistency,
                E2EMetricType::Usability,
            ],
            // Category weights sum to 1.0; user experience dominates by design.
            user_experience_weight: 0.4,
            system_performance_weight: 0.3,
            quality_weight: 0.2,
            robustness_weight: 0.1,
            performance_thresholds: PerformanceThresholds::default(),
            user_satisfaction_config: UserSatisfactionConfig::default(),
            system_reliability_config: SystemReliabilityConfig::default(),
        }
    }
}
71
/// Types of end-to-end metrics.
///
/// Note: `initialize_metrics` currently instantiates implementations only for
/// `UserSatisfaction` through `ErrorRate`; the remaining variants
/// (`Scalability`, `UserEngagement`, `TrustScore`) are silently skipped.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum E2EMetricType {
    /// Overall user satisfaction
    UserSatisfaction,
    /// System response latency
    SystemLatency,
    /// System throughput
    SystemThroughput,
    /// Overall quality (combination of all quality metrics)
    OverallQuality,
    /// System robustness to different inputs
    Robustness,
    /// Consistency across similar queries
    Consistency,
    /// System usability and user experience
    Usability,
    /// Resource efficiency
    ResourceEfficiency,
    /// Error rate and reliability
    ErrorRate,
    /// Scalability under load (not yet implemented)
    Scalability,
    /// User engagement metrics (not yet implemented)
    UserEngagement,
    /// Trust and credibility (not yet implemented)
    TrustScore,
}
100
/// Performance thresholds for evaluation; exceeding them lowers metric scores
/// and triggers insight/recommendation messages.
#[derive(Debug, Clone)]
pub struct PerformanceThresholds {
    /// Maximum acceptable latency (milliseconds)
    pub max_latency_ms: f32,

    /// Minimum throughput (queries per second)
    pub min_throughput_qps: f32,

    /// Maximum error rate (percent, 0–100)
    pub max_error_rate: f32,

    /// Minimum quality score (0.0–1.0)
    pub min_quality_score: f32,

    /// Maximum memory usage (megabytes)
    pub max_memory_usage_mb: f32,
}
119
impl Default for PerformanceThresholds {
    /// Conservative defaults: 2s latency budget, 10 QPS floor, 5% error
    /// ceiling, 0.7 minimum quality, 1 GB memory cap.
    fn default() -> Self {
        Self {
            max_latency_ms: 2000.0,
            min_throughput_qps: 10.0,
            max_error_rate: 5.0,
            min_quality_score: 0.7,
            max_memory_usage_mb: 1000.0,
        }
    }
}
131
/// User satisfaction configuration: weights for the component signals blended
/// by `UserSatisfactionMetric`. The weighted sum is clamped to 1.0.
#[derive(Debug, Clone)]
pub struct UserSatisfactionConfig {
    /// Weight for answer quality (fraction of queries answered)
    pub answer_quality_weight: f32,

    /// Weight for response time
    pub response_time_weight: f32,

    /// Weight for relevance
    pub relevance_weight: f32,

    /// Weight for completeness (retrieved-document coverage)
    pub completeness_weight: f32,

    /// Weight for clarity
    pub clarity_weight: f32,
}
150
impl Default for UserSatisfactionConfig {
    /// Default component weights (sum to 1.0).
    fn default() -> Self {
        Self {
            answer_quality_weight: 0.3,
            response_time_weight: 0.2,
            relevance_weight: 0.25,
            completeness_weight: 0.15,
            clarity_weight: 0.1,
        }
    }
}
162
/// System reliability configuration, shared by consistency and error-rate
/// metrics.
#[derive(Debug, Clone)]
pub struct SystemReliabilityConfig {
    /// Acceptable failure rate (fraction, e.g. 0.01 = 1%)
    pub acceptable_failure_rate: f32,

    /// Recovery time threshold (milliseconds)
    pub recovery_time_threshold_ms: f32,

    /// Consistency threshold (0.0–1.0)
    pub consistency_threshold: f32,
}
175
impl Default for SystemReliabilityConfig {
    /// Defaults: 1% acceptable failures, 5s recovery budget, 0.9 consistency.
    fn default() -> Self {
        Self {
            acceptable_failure_rate: 0.01,
            recovery_time_threshold_ms: 5000.0,
            consistency_threshold: 0.9,
        }
    }
}
185
/// Trait for end-to-end metrics evaluated at the system level.
pub trait E2EMetric: Send + Sync {
    /// Metric name, used as the key in the overall-scores map.
    fn name(&self) -> &str;

    /// Metric type
    fn metric_type(&self) -> E2EMetricType;

    /// Evaluate metric across all queries, returning a single score
    /// (by convention within `get_config().score_range`, typically 0.0–1.0).
    fn evaluate_system(
        &self,
        evaluation_data: &EvaluationData,
        system_metrics: &SystemMetrics,
    ) -> RragResult<f32>;

    /// Get metric configuration (score range, data requirements, level).
    fn get_config(&self) -> E2EMetricConfig;
}
204
/// Configuration/metadata describing an E2E metric.
#[derive(Debug, Clone)]
pub struct E2EMetricConfig {
    /// Metric name
    pub name: String,

    /// Requires system performance data (a populated `SystemMetrics`)
    pub requires_performance_data: bool,

    /// Requires user feedback
    pub requires_user_feedback: bool,

    /// Score range as (min, max)
    pub score_range: (f32, f32),

    /// Whether a higher score indicates a better result
    pub higher_is_better: bool,

    /// Granularity at which the metric is computed
    pub evaluation_level: EvaluationLevel,
}
226
/// Granularity level at which a metric is evaluated.
#[derive(Debug, Clone)]
pub enum EvaluationLevel {
    /// Query-level evaluation
    Query,
    /// Session-level evaluation
    Session,
    /// System-level evaluation (all E2E metrics in this module use this)
    System,
}
237
/// System performance metrics, partly measured from evaluation data and
/// partly estimated (see `calculate_system_metrics`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    /// Average response time (milliseconds)
    pub avg_response_time_ms: f32,

    /// Throughput (queries per second)
    pub throughput_qps: f32,

    /// Error rate (percent, 0–100)
    pub error_rate: f32,

    /// Memory usage (megabytes)
    pub memory_usage_mb: f32,

    /// CPU usage (percent, 0–100)
    pub cpu_usage_percent: f32,

    /// System availability (percent, 0–100)
    pub availability_percent: f32,

    /// Cache hit rate (fraction, 0.0–1.0)
    pub cache_hit_rate: f32,
}
262
impl Default for SystemMetrics {
    /// Placeholder figures representing a nominally healthy system; used when
    /// no measured data is available.
    fn default() -> Self {
        Self {
            avg_response_time_ms: 1000.0,
            throughput_qps: 50.0,
            error_rate: 1.0,
            memory_usage_mb: 512.0,
            cpu_usage_percent: 60.0,
            availability_percent: 99.5,
            cache_hit_rate: 0.8,
        }
    }
}
276
277impl EndToEndEvaluator {
278    /// Create new end-to-end evaluator
279    pub fn new(config: EndToEndConfig) -> Self {
280        let mut evaluator = Self {
281            config: config.clone(),
282            metrics: Vec::new(),
283        };
284
285        // Initialize metrics based on configuration
286        evaluator.initialize_metrics();
287
288        evaluator
289    }
290
291    /// Initialize metrics based on configuration
292    fn initialize_metrics(&mut self) {
293        for metric_type in &self.config.enabled_metrics {
294            let metric: Box<dyn E2EMetric> = match metric_type {
295                E2EMetricType::UserSatisfaction => Box::new(UserSatisfactionMetric::new(
296                    self.config.user_satisfaction_config.clone(),
297                )),
298                E2EMetricType::SystemLatency => Box::new(SystemLatencyMetric::new(
299                    self.config.performance_thresholds.clone(),
300                )),
301                E2EMetricType::SystemThroughput => Box::new(SystemThroughputMetric::new(
302                    self.config.performance_thresholds.clone(),
303                )),
304                E2EMetricType::OverallQuality => Box::new(OverallQualityMetric::new()),
305                E2EMetricType::Robustness => Box::new(RobustnessMetric::new()),
306                E2EMetricType::Consistency => Box::new(ConsistencyMetric::new(
307                    self.config.system_reliability_config.clone(),
308                )),
309                E2EMetricType::Usability => Box::new(UsabilityMetric::new()),
310                E2EMetricType::ResourceEfficiency => Box::new(ResourceEfficiencyMetric::new(
311                    self.config.performance_thresholds.clone(),
312                )),
313                E2EMetricType::ErrorRate => Box::new(ErrorRateMetric::new(
314                    self.config.system_reliability_config.clone(),
315                )),
316                _ => continue, // Skip unsupported metrics
317            };
318
319            self.metrics.push(metric);
320        }
321    }
322}
323
impl Evaluator for EndToEndEvaluator {
    fn name(&self) -> &str {
        "EndToEnd"
    }

    /// Run every configured E2E metric over `data` and assemble a single
    /// system-level `EvaluationResult` with a weighted overall score,
    /// insights, and recommendations.
    fn evaluate(&self, data: &EvaluationData) -> RragResult<EvaluationResult> {
        let start_time = std::time::Instant::now();
        let mut overall_scores = HashMap::new();
        let per_query_results = Vec::new(); // E2E metrics are typically system-level

        // Calculate system metrics from evaluation data
        let system_metrics = self.calculate_system_metrics(data);

        // Evaluate each metric; a failing metric is logged and skipped so one
        // error cannot abort the whole evaluation.
        for metric in &self.metrics {
            match metric.evaluate_system(data, &system_metrics) {
                Ok(score) => {
                    overall_scores.insert(metric.name().to_string(), score);
                }
                Err(e) => {
                    warn!(" Failed to evaluate {}: {}", metric.name(), e);
                }
            }
        }

        // Calculate weighted overall score
        let overall_score = self.calculate_overall_score(&overall_scores);
        overall_scores.insert("overall_e2e_score".to_string(), overall_score);

        let total_time = start_time.elapsed().as_millis() as f32;

        // Generate insights and recommendations
        let insights = self.generate_insights(&overall_scores, &system_metrics);
        let recommendations = self.generate_recommendations(&overall_scores, &system_metrics);

        Ok(EvaluationResult {
            id: uuid::Uuid::new_v4().to_string(),
            evaluation_type: "EndToEnd".to_string(),
            overall_scores: overall_scores.clone(),
            per_query_results,
            summary: EvaluationSummary {
                total_queries: data.queries.len(),
                avg_scores: overall_scores.clone(),
                std_deviations: HashMap::new(), // Not applicable for system-level metrics
                performance_stats: PerformanceStats {
                    avg_eval_time_ms: total_time,
                    total_eval_time_ms: total_time,
                    peak_memory_usage_mb: system_metrics.memory_usage_mb,
                    throughput_qps: system_metrics.throughput_qps,
                },
                insights,
                recommendations,
            },
            metadata: EvaluationMetadata {
                timestamp: chrono::Utc::now(),
                evaluation_version: "1.0.0".to_string(),
                system_config: HashMap::new(),
                // NOTE(review): this captures the entire process environment;
                // if results are persisted or shared, secrets in env vars may
                // leak — consider filtering to an allow-list.
                environment: std::env::vars().collect(),
                git_commit: None,
            },
        })
    }

    /// Names of the metrics this evaluator will run, in registration order.
    fn supported_metrics(&self) -> Vec<String> {
        self.metrics.iter().map(|m| m.name().to_string()).collect()
    }

    /// Static evaluator metadata; the performance figures are estimates.
    fn get_config(&self) -> EvaluatorConfig {
        EvaluatorConfig {
            name: "EndToEnd".to_string(),
            version: "1.0.0".to_string(),
            metrics: self.supported_metrics(),
            performance: EvaluatorPerformance {
                avg_time_per_sample_ms: 200.0,
                memory_usage_mb: 100.0,
                accuracy: 0.9,
            },
        }
    }
}
404
405impl EndToEndEvaluator {
406    /// Calculate system metrics from evaluation data
407    fn calculate_system_metrics(&self, data: &EvaluationData) -> SystemMetrics {
408        let mut total_time = 0.0;
409        let mut error_count = 0;
410        let mut valid_responses = 0;
411
412        // Aggregate timing and error information
413        for response in &data.system_responses {
414            total_time += response.timing.total_time_ms;
415            valid_responses += 1;
416
417            // Check for errors (simplified)
418            if response.generated_answer.is_none() || response.retrieved_docs.is_empty() {
419                error_count += 1;
420            }
421        }
422
423        let avg_response_time = if valid_responses > 0 {
424            total_time / valid_responses as f32
425        } else {
426            0.0
427        };
428
429        let error_rate = if data.queries.len() > 0 {
430            (error_count as f32 / data.queries.len() as f32) * 100.0
431        } else {
432            0.0
433        };
434
435        let throughput = if total_time > 0.0 {
436            (valid_responses as f32 * 1000.0) / total_time // Convert to QPS
437        } else {
438            0.0
439        };
440
441        SystemMetrics {
442            avg_response_time_ms: avg_response_time,
443            throughput_qps: throughput,
444            error_rate,
445            memory_usage_mb: 256.0,  // Estimated
446            cpu_usage_percent: 45.0, // Estimated
447            availability_percent: 99.0,
448            cache_hit_rate: 0.7,
449        }
450    }
451
452    /// Calculate overall weighted score
453    fn calculate_overall_score(&self, scores: &HashMap<String, f32>) -> f32 {
454        let mut weighted_sum = 0.0;
455        let mut total_weight = 0.0;
456
457        // User experience metrics
458        if let Some(&user_satisfaction) = scores.get("user_satisfaction") {
459            weighted_sum += user_satisfaction * self.config.user_experience_weight;
460            total_weight += self.config.user_experience_weight;
461        }
462
463        // System performance metrics
464        let performance_metrics = ["system_latency", "system_throughput", "resource_efficiency"];
465        let mut performance_score = 0.0;
466        let mut performance_count = 0;
467
468        for metric in &performance_metrics {
469            if let Some(&score) = scores.get(*metric) {
470                performance_score += score;
471                performance_count += 1;
472            }
473        }
474
475        if performance_count > 0 {
476            performance_score /= performance_count as f32;
477            weighted_sum += performance_score * self.config.system_performance_weight;
478            total_weight += self.config.system_performance_weight;
479        }
480
481        // Quality metrics
482        if let Some(&quality) = scores.get("overall_quality") {
483            weighted_sum += quality * self.config.quality_weight;
484            total_weight += self.config.quality_weight;
485        }
486
487        // Robustness metrics
488        if let Some(&robustness) = scores.get("robustness") {
489            weighted_sum += robustness * self.config.robustness_weight;
490            total_weight += self.config.robustness_weight;
491        }
492
493        if total_weight > 0.0 {
494            weighted_sum / total_weight
495        } else {
496            0.0
497        }
498    }
499
500    /// Generate insights based on evaluation results
501    fn generate_insights(
502        &self,
503        scores: &HashMap<String, f32>,
504        metrics: &SystemMetrics,
505    ) -> Vec<String> {
506        let mut insights = Vec::new();
507
508        // Overall performance insights
509        if let Some(&overall_score) = scores.get("overall_e2e_score") {
510            if overall_score > 0.8 {
511                insights.push("🎯 Excellent end-to-end system performance".to_string());
512            } else if overall_score < 0.6 {
513                insights.push("⚠️ End-to-end system performance needs improvement".to_string());
514            }
515        }
516
517        // Latency insights
518        if metrics.avg_response_time_ms > self.config.performance_thresholds.max_latency_ms {
519            insights.push(format!(
520                "🐌 High latency detected: {:.1}ms (threshold: {:.1}ms)",
521                metrics.avg_response_time_ms, self.config.performance_thresholds.max_latency_ms
522            ));
523        }
524
525        // Throughput insights
526        if metrics.throughput_qps < self.config.performance_thresholds.min_throughput_qps {
527            insights.push(format!(
528                "📊 Low throughput: {:.1} QPS (minimum: {:.1} QPS)",
529                metrics.throughput_qps, self.config.performance_thresholds.min_throughput_qps
530            ));
531        }
532
533        // Error rate insights
534        if metrics.error_rate > self.config.performance_thresholds.max_error_rate {
535            insights.push(format!(
536                "🚨 High error rate: {:.1}% (threshold: {:.1}%)",
537                metrics.error_rate, self.config.performance_thresholds.max_error_rate
538            ));
539        }
540
541        // Resource efficiency insights
542        if metrics.memory_usage_mb > self.config.performance_thresholds.max_memory_usage_mb {
543            insights.push(format!(
544                "💾 High memory usage: {:.1}MB (threshold: {:.1}MB)",
545                metrics.memory_usage_mb, self.config.performance_thresholds.max_memory_usage_mb
546            ));
547        }
548
549        // User satisfaction insights
550        if let Some(&user_satisfaction) = scores.get("user_satisfaction") {
551            if user_satisfaction < 0.7 {
552                insights.push(
553                    "👥 User satisfaction below expectations - focus on UX improvements"
554                        .to_string(),
555                );
556            }
557        }
558
559        insights
560    }
561
562    /// Generate recommendations based on evaluation results
563    fn generate_recommendations(
564        &self,
565        scores: &HashMap<String, f32>,
566        metrics: &SystemMetrics,
567    ) -> Vec<String> {
568        let mut recommendations = Vec::new();
569
570        // Performance recommendations
571        if metrics.avg_response_time_ms > self.config.performance_thresholds.max_latency_ms {
572            recommendations
573                .push("⚡ Optimize response time with caching and parallel processing".to_string());
574            recommendations
575                .push("🔧 Consider upgrading hardware or scaling horizontally".to_string());
576        }
577
578        if metrics.throughput_qps < self.config.performance_thresholds.min_throughput_qps {
579            recommendations.push("📈 Implement load balancing and connection pooling".to_string());
580            recommendations.push("🚀 Consider async processing for better throughput".to_string());
581        }
582
583        if metrics.error_rate > self.config.performance_thresholds.max_error_rate {
584            recommendations
585                .push("🛡️ Implement better error handling and retry mechanisms".to_string());
586            recommendations.push("📊 Add comprehensive monitoring and alerting".to_string());
587        }
588
589        // User experience recommendations
590        if let Some(&user_satisfaction) = scores.get("user_satisfaction") {
591            if user_satisfaction < 0.7 {
592                recommendations
593                    .push("👤 Conduct user research to identify pain points".to_string());
594                recommendations
595                    .push("🎨 Improve user interface and interaction design".to_string());
596            }
597        }
598
599        // Quality recommendations
600        if let Some(&quality) = scores.get("overall_quality") {
601            if quality < 0.7 {
602                recommendations
603                    .push("📚 Improve training data quality and model fine-tuning".to_string());
604                recommendations
605                    .push("🔍 Implement better content filtering and validation".to_string());
606            }
607        }
608
609        // System reliability recommendations
610        if let Some(&consistency) = scores.get("consistency") {
611            if consistency < 0.8 {
612                recommendations.push(
613                    "🎯 Improve system consistency with better configuration management"
614                        .to_string(),
615                );
616                recommendations
617                    .push("🔄 Implement chaos engineering to test system resilience".to_string());
618            }
619        }
620
621        recommendations
622    }
623}
624
// Individual E2E metric implementations

/// Estimates user satisfaction from proxy signals (latency, answer coverage,
/// retrieved-document counts) — no real user feedback is consumed.
struct UserSatisfactionMetric {
    // Weights for the individual satisfaction components.
    config: UserSatisfactionConfig,
}

impl UserSatisfactionMetric {
    /// Create the metric with the given component weights.
    fn new(config: UserSatisfactionConfig) -> Self {
        Self { config }
    }
}
635
636impl E2EMetric for UserSatisfactionMetric {
637    fn name(&self) -> &str {
638        "user_satisfaction"
639    }
640
641    fn metric_type(&self) -> E2EMetricType {
642        E2EMetricType::UserSatisfaction
643    }
644
645    fn evaluate_system(&self, data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
646        // Simulate user satisfaction based on various factors
647        let response_time_score = if metrics.avg_response_time_ms < 1000.0 {
648            1.0
649        } else if metrics.avg_response_time_ms < 3000.0 {
650            0.8 - (metrics.avg_response_time_ms - 1000.0) / 2000.0 * 0.3
651        } else {
652            0.5
653        };
654
655        // Quality score (based on having answers)
656        let answered_queries = data
657            .system_responses
658            .iter()
659            .filter(|r| r.generated_answer.is_some())
660            .count();
661        let answer_quality_score = answered_queries as f32 / data.queries.len() as f32;
662
663        // Relevance score (simplified)
664        let relevance_score = 0.8; // Placeholder
665
666        // Completeness score (based on retrieved documents)
667        let avg_docs = data
668            .system_responses
669            .iter()
670            .map(|r| r.retrieved_docs.len())
671            .sum::<usize>() as f32
672            / data.system_responses.len() as f32;
673        let completeness_score = (avg_docs / 5.0).min(1.0); // Normalize to 5 docs = 1.0
674
675        // Clarity score (simplified)
676        let clarity_score = 0.75; // Placeholder
677
678        // Weighted combination
679        let satisfaction = response_time_score * self.config.response_time_weight
680            + answer_quality_score * self.config.answer_quality_weight
681            + relevance_score * self.config.relevance_weight
682            + completeness_score * self.config.completeness_weight
683            + clarity_score * self.config.clarity_weight;
684
685        Ok(satisfaction.min(1.0))
686    }
687
688    fn get_config(&self) -> E2EMetricConfig {
689        E2EMetricConfig {
690            name: "user_satisfaction".to_string(),
691            requires_performance_data: true,
692            requires_user_feedback: false,
693            score_range: (0.0, 1.0),
694            higher_is_better: true,
695            evaluation_level: EvaluationLevel::System,
696        }
697    }
698}
699
/// Scores average response latency against the configured maximum.
struct SystemLatencyMetric {
    // Provides `max_latency_ms`, the latency budget.
    thresholds: PerformanceThresholds,
}

impl SystemLatencyMetric {
    /// Create the metric with the given thresholds.
    fn new(thresholds: PerformanceThresholds) -> Self {
        Self { thresholds }
    }
}
709
710impl E2EMetric for SystemLatencyMetric {
711    fn name(&self) -> &str {
712        "system_latency"
713    }
714
715    fn metric_type(&self) -> E2EMetricType {
716        E2EMetricType::SystemLatency
717    }
718
719    fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
720        // Score based on how well latency meets thresholds
721        let score = if metrics.avg_response_time_ms <= self.thresholds.max_latency_ms {
722            1.0 - (metrics.avg_response_time_ms / self.thresholds.max_latency_ms) * 0.2
723        } else {
724            // Penalty for exceeding threshold
725            let excess = metrics.avg_response_time_ms - self.thresholds.max_latency_ms;
726            let penalty = excess / self.thresholds.max_latency_ms;
727            (0.8 - penalty * 0.5).max(0.0)
728        };
729
730        Ok(score)
731    }
732
733    fn get_config(&self) -> E2EMetricConfig {
734        E2EMetricConfig {
735            name: "system_latency".to_string(),
736            requires_performance_data: true,
737            requires_user_feedback: false,
738            score_range: (0.0, 1.0),
739            higher_is_better: true,
740            evaluation_level: EvaluationLevel::System,
741        }
742    }
743}
744
/// Scores measured throughput relative to the configured minimum.
struct SystemThroughputMetric {
    // Provides `min_throughput_qps`, the throughput floor.
    thresholds: PerformanceThresholds,
}

impl SystemThroughputMetric {
    /// Create the metric with the given thresholds.
    fn new(thresholds: PerformanceThresholds) -> Self {
        Self { thresholds }
    }
}
754
755impl E2EMetric for SystemThroughputMetric {
756    fn name(&self) -> &str {
757        "system_throughput"
758    }
759
760    fn metric_type(&self) -> E2EMetricType {
761        E2EMetricType::SystemThroughput
762    }
763
764    fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
765        // Score based on throughput relative to minimum threshold
766        let score = if metrics.throughput_qps >= self.thresholds.min_throughput_qps {
767            (metrics.throughput_qps / self.thresholds.min_throughput_qps).min(2.0) / 2.0
768        } else {
769            metrics.throughput_qps / self.thresholds.min_throughput_qps
770        };
771
772        Ok(score.min(1.0))
773    }
774
775    fn get_config(&self) -> E2EMetricConfig {
776        E2EMetricConfig {
777            name: "system_throughput".to_string(),
778            requires_performance_data: true,
779            requires_user_feedback: false,
780            score_range: (0.0, 1.0),
781            higher_is_better: true,
782            evaluation_level: EvaluationLevel::System,
783        }
784    }
785}
786
// Placeholder implementations for other metrics

/// Generates a stub `E2EMetric` implementation that always returns a fixed
/// score. `$name` is the struct identifier, `$metric_name` the string name,
/// `$metric_type` the `E2EMetricType` variant, and `$default_score` the
/// constant returned by `evaluate_system`.
///
/// NOTE(review): not invoked within this portion of the file — presumably
/// used further down to define `RobustnessMetric`/`UsabilityMetric`
/// (referenced by `initialize_metrics`); verify against the rest of the file.
macro_rules! impl_simple_e2e_metric {
    ($name:ident, $metric_name:literal, $metric_type:expr, $default_score:expr) => {
        struct $name;

        impl $name {
            fn new() -> Self {
                Self
            }
        }

        impl E2EMetric for $name {
            fn name(&self) -> &str {
                $metric_name
            }

            fn metric_type(&self) -> E2EMetricType {
                $metric_type
            }

            // Stub: ignores all inputs and returns the fixed score.
            fn evaluate_system(
                &self,
                _data: &EvaluationData,
                _metrics: &SystemMetrics,
            ) -> RragResult<f32> {
                Ok($default_score)
            }

            fn get_config(&self) -> E2EMetricConfig {
                E2EMetricConfig {
                    name: $metric_name.to_string(),
                    requires_performance_data: false,
                    requires_user_feedback: false,
                    score_range: (0.0, 1.0),
                    higher_is_better: true,
                    evaluation_level: EvaluationLevel::System,
                }
            }
        }
    };
}
828
/// Scores overall quality as the fraction of queries that produced a
/// successful response (answer present and documents retrieved).
struct OverallQualityMetric;

impl OverallQualityMetric {
    /// Create the (stateless) metric.
    fn new() -> Self {
        Self
    }
}
836
837impl E2EMetric for OverallQualityMetric {
838    fn name(&self) -> &str {
839        "overall_quality"
840    }
841
842    fn metric_type(&self) -> E2EMetricType {
843        E2EMetricType::OverallQuality
844    }
845
846    fn evaluate_system(&self, data: &EvaluationData, _metrics: &SystemMetrics) -> RragResult<f32> {
847        // Aggregate quality score based on successful responses
848        let successful_responses = data
849            .system_responses
850            .iter()
851            .filter(|r| r.generated_answer.is_some() && !r.retrieved_docs.is_empty())
852            .count();
853
854        let quality_score = successful_responses as f32 / data.queries.len() as f32;
855        Ok(quality_score)
856    }
857
858    fn get_config(&self) -> E2EMetricConfig {
859        E2EMetricConfig {
860            name: "overall_quality".to_string(),
861            requires_performance_data: false,
862            requires_user_feedback: false,
863            score_range: (0.0, 1.0),
864            higher_is_better: true,
865            evaluation_level: EvaluationLevel::System,
866        }
867    }
868}
869
/// Scores consistency of response times (low variance = high consistency).
struct ConsistencyMetric {
    // NOTE(review): not read by the visible `evaluate_system` implementation;
    // `consistency_threshold` appears unused here — confirm before removing.
    config: SystemReliabilityConfig,
}

impl ConsistencyMetric {
    /// Create the metric with the given reliability configuration.
    fn new(config: SystemReliabilityConfig) -> Self {
        Self { config }
    }
}
879
880impl E2EMetric for ConsistencyMetric {
881    fn name(&self) -> &str {
882        "consistency"
883    }
884
885    fn metric_type(&self) -> E2EMetricType {
886        E2EMetricType::Consistency
887    }
888
889    fn evaluate_system(&self, data: &EvaluationData, _metrics: &SystemMetrics) -> RragResult<f32> {
890        // Measure consistency in response times and quality
891        let response_times: Vec<f32> = data
892            .system_responses
893            .iter()
894            .map(|r| r.timing.total_time_ms)
895            .collect();
896
897        if response_times.is_empty() {
898            return Ok(0.0);
899        }
900
901        let mean_time = response_times.iter().sum::<f32>() / response_times.len() as f32;
902        let variance = response_times
903            .iter()
904            .map(|t| (t - mean_time).powi(2))
905            .sum::<f32>()
906            / response_times.len() as f32;
907        let std_dev = variance.sqrt();
908
909        // Consistency score based on coefficient of variation
910        let cv = if mean_time > 0.0 {
911            std_dev / mean_time
912        } else {
913            0.0
914        };
915        let consistency = (1.0 - cv).max(0.0);
916
917        Ok(consistency)
918    }
919
920    fn get_config(&self) -> E2EMetricConfig {
921        E2EMetricConfig {
922            name: "consistency".to_string(),
923            requires_performance_data: true,
924            requires_user_feedback: false,
925            score_range: (0.0, 1.0),
926            higher_is_better: true,
927            evaluation_level: EvaluationLevel::System,
928        }
929    }
930}
931
/// Scores memory and CPU usage efficiency against configured limits.
struct ResourceEfficiencyMetric {
    // Provides `max_memory_usage_mb`, the memory cap.
    thresholds: PerformanceThresholds,
}

impl ResourceEfficiencyMetric {
    /// Create the metric with the given thresholds.
    fn new(thresholds: PerformanceThresholds) -> Self {
        Self { thresholds }
    }
}
941
942impl E2EMetric for ResourceEfficiencyMetric {
943    fn name(&self) -> &str {
944        "resource_efficiency"
945    }
946
947    fn metric_type(&self) -> E2EMetricType {
948        E2EMetricType::ResourceEfficiency
949    }
950
951    fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
952        // Score based on resource usage efficiency
953        let memory_score = if metrics.memory_usage_mb <= self.thresholds.max_memory_usage_mb {
954            1.0 - (metrics.memory_usage_mb / self.thresholds.max_memory_usage_mb) * 0.3
955        } else {
956            0.7 * (self.thresholds.max_memory_usage_mb / metrics.memory_usage_mb)
957        };
958
959        let cpu_score = if metrics.cpu_usage_percent <= 80.0 {
960            1.0 - (metrics.cpu_usage_percent / 100.0) * 0.2
961        } else {
962            0.8 * (80.0 / metrics.cpu_usage_percent)
963        };
964
965        let efficiency = (memory_score + cpu_score) / 2.0;
966        Ok(efficiency.min(1.0))
967    }
968
969    fn get_config(&self) -> E2EMetricConfig {
970        E2EMetricConfig {
971            name: "resource_efficiency".to_string(),
972            requires_performance_data: true,
973            requires_user_feedback: false,
974            score_range: (0.0, 1.0),
975            higher_is_better: true,
976            evaluation_level: EvaluationLevel::System,
977        }
978    }
979}
980
981struct ErrorRateMetric {
982    config: SystemReliabilityConfig,
983}
984
985impl ErrorRateMetric {
986    fn new(config: SystemReliabilityConfig) -> Self {
987        Self { config }
988    }
989}
990
991impl E2EMetric for ErrorRateMetric {
992    fn name(&self) -> &str {
993        "error_rate"
994    }
995
996    fn metric_type(&self) -> E2EMetricType {
997        E2EMetricType::ErrorRate
998    }
999
1000    fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
1001        // Score based on error rate (lower error rate = higher score)
1002        let score = if metrics.error_rate <= self.config.acceptable_failure_rate * 100.0 {
1003            1.0 - (metrics.error_rate / 100.0) * 0.1
1004        } else {
1005            let excess = metrics.error_rate - (self.config.acceptable_failure_rate * 100.0);
1006            (0.9 - excess / 100.0 * 2.0).max(0.0)
1007        };
1008
1009        Ok(score)
1010    }
1011
1012    fn get_config(&self) -> E2EMetricConfig {
1013        E2EMetricConfig {
1014            name: "error_rate".to_string(),
1015            requires_performance_data: true,
1016            requires_user_feedback: false,
1017            score_range: (0.0, 1.0),
1018            higher_is_better: true,
1019            evaluation_level: EvaluationLevel::System,
1020        }
1021    }
1022}
1023
// Simple metrics generated by the `impl_simple_e2e_metric!` macro (defined
// elsewhere in this file/module). The trailing numeric argument is
// presumably a fixed baseline score returned by the generated
// `evaluate_system` — TODO(review): confirm against the macro definition.
impl_simple_e2e_metric!(
    RobustnessMetric,
    "robustness",
    E2EMetricType::Robustness,
    0.8
);
impl_simple_e2e_metric!(UsabilityMetric, "usability", E2EMetricType::Usability, 0.85);
1031
#[cfg(test)]
mod tests {
    use super::*;
    use crate::evaluation::{
        GroundTruth, RetrievedDocument, SystemResponse, SystemTiming, TestQuery,
    };

    #[test]
    fn test_user_satisfaction_metric() {
        let metric = UserSatisfactionMetric::new(UserSatisfactionConfig::default());

        let score = metric
            .evaluate_system(&create_test_data(), &SystemMetrics::default())
            .unwrap();

        // The score must stay within the declared [0, 1] range.
        assert!((0.0..=1.0).contains(&score));
    }

    #[test]
    fn test_system_latency_metric() {
        let metric = SystemLatencyMetric::new(PerformanceThresholds::default());

        let mut system_metrics = SystemMetrics::default();
        system_metrics.avg_response_time_ms = 1500.0; // Within threshold

        let score = metric
            .evaluate_system(&create_test_data(), &system_metrics)
            .unwrap();

        // Reasonable latency should earn a good score.
        assert!(score > 0.5);
    }

    #[test]
    fn test_end_to_end_evaluator() {
        let evaluator = EndToEndEvaluator::new(EndToEndConfig::default());

        assert_eq!(evaluator.name(), "EndToEnd");
        assert!(!evaluator.supported_metrics().is_empty());
    }

    /// Builds a minimal single-query fixture: one test query, its ground
    /// truth, and one system response with a retrieved document and timings.
    fn create_test_data() -> EvaluationData {
        use super::super::*;

        let query = TestQuery {
            id: "q1".to_string(),
            query: "What is machine learning?".to_string(),
            query_type: None,
            metadata: HashMap::new(),
        };

        let truth = GroundTruth {
            query_id: "q1".to_string(),
            relevant_docs: vec!["doc1".to_string()],
            expected_answer: Some("ML is AI subset".to_string()),
            relevance_judgments: HashMap::new(),
            metadata: HashMap::new(),
        };

        let doc = RetrievedDocument {
            doc_id: "doc1".to_string(),
            content: "Machine learning content".to_string(),
            score: 0.9,
            rank: 0,
            metadata: HashMap::new(),
        };

        let response = SystemResponse {
            query_id: "q1".to_string(),
            retrieved_docs: vec![doc],
            generated_answer: Some("Machine learning is...".to_string()),
            timing: SystemTiming {
                total_time_ms: 1000.0,
                retrieval_time_ms: 500.0,
                generation_time_ms: Some(400.0),
                reranking_time_ms: Some(100.0),
            },
            metadata: HashMap::new(),
        };

        EvaluationData {
            queries: vec![query],
            ground_truth: vec![truth],
            system_responses: vec![response],
            context: HashMap::new(),
        }
    }
}