1use super::{
7 EvaluationData, EvaluationMetadata, EvaluationResult, EvaluationSummary, Evaluator,
8 EvaluatorConfig, EvaluatorPerformance, PerformanceStats,
9};
10use crate::RragResult;
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use tracing::warn;
14
/// End-to-end evaluator that scores a RAG system as a whole, combining
/// user-experience, system-performance, quality, and robustness signals
/// into a single weighted score.
pub struct EndToEndEvaluator {
    // Weights, thresholds, and the list of enabled metric types.
    config: EndToEndConfig,
    // Concrete metric implementations instantiated from `config.enabled_metrics`.
    metrics: Vec<Box<dyn E2EMetric>>,
}
20
/// Configuration for [`EndToEndEvaluator`].
#[derive(Debug, Clone)]
pub struct EndToEndConfig {
    /// Which end-to-end metrics to instantiate and run.
    pub enabled_metrics: Vec<E2EMetricType>,

    /// Weight of the user-satisfaction score in the combined score.
    pub user_experience_weight: f32,

    /// Weight of the averaged performance scores (latency, throughput,
    /// resource efficiency) in the combined score.
    pub system_performance_weight: f32,

    /// Weight of the overall-quality score in the combined score.
    pub quality_weight: f32,

    /// Weight of the robustness score in the combined score.
    pub robustness_weight: f32,

    /// Latency/throughput/error/memory limits used by threshold-based metrics.
    pub performance_thresholds: PerformanceThresholds,

    /// Sub-weights used by the user-satisfaction metric.
    pub user_satisfaction_config: UserSatisfactionConfig,

    /// Reliability limits used by the consistency and error-rate metrics.
    pub system_reliability_config: SystemReliabilityConfig,
}
48
impl Default for EndToEndConfig {
    /// Defaults: most implemented metrics enabled, with weights that sum to
    /// 1.0 and user experience given the largest share.
    fn default() -> Self {
        Self {
            enabled_metrics: vec![
                E2EMetricType::UserSatisfaction,
                E2EMetricType::SystemLatency,
                E2EMetricType::SystemThroughput,
                E2EMetricType::OverallQuality,
                E2EMetricType::Robustness,
                E2EMetricType::Consistency,
                E2EMetricType::Usability,
            ],
            // Combination weights (sum to 1.0).
            user_experience_weight: 0.4,
            system_performance_weight: 0.3,
            quality_weight: 0.2,
            robustness_weight: 0.1,
            performance_thresholds: PerformanceThresholds::default(),
            user_satisfaction_config: UserSatisfactionConfig::default(),
            system_reliability_config: SystemReliabilityConfig::default(),
        }
    }
}
71
/// The kinds of end-to-end metrics the evaluator can run.
///
/// `Scalability`, `UserEngagement`, and `TrustScore` currently have no
/// implementation and are skipped during metric initialization.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum E2EMetricType {
    /// Heuristic estimate of end-user satisfaction.
    UserSatisfaction,
    /// Average response latency scored against a threshold.
    SystemLatency,
    /// Queries-per-second scored against a minimum threshold.
    SystemThroughput,
    /// Fraction of queries with complete responses.
    OverallQuality,
    /// System resilience (fixed-score placeholder implementation).
    Robustness,
    /// Stability of response times across queries.
    Consistency,
    /// Ease of use (fixed-score placeholder implementation).
    Usability,
    /// Memory/CPU usage scored against budgets.
    ResourceEfficiency,
    /// Observed error rate scored against the acceptable failure rate.
    ErrorRate,
    /// Not yet implemented.
    Scalability,
    /// Not yet implemented.
    UserEngagement,
    /// Not yet implemented.
    TrustScore,
}
100
/// Performance limits used by the latency, throughput, resource-efficiency,
/// and memory-related scoring and insight generation.
#[derive(Debug, Clone)]
pub struct PerformanceThresholds {
    /// Maximum acceptable average response time, in milliseconds.
    pub max_latency_ms: f32,

    /// Minimum acceptable throughput, in queries per second.
    pub min_throughput_qps: f32,

    /// Maximum acceptable error rate, as a percentage (e.g. 5.0 = 5%).
    pub max_error_rate: f32,

    /// Minimum acceptable quality score, in [0, 1].
    pub min_quality_score: f32,

    /// Maximum acceptable memory usage, in megabytes.
    pub max_memory_usage_mb: f32,
}
119
impl Default for PerformanceThresholds {
    /// Defaults: 2s latency budget, 10 QPS minimum, 5% error ceiling,
    /// 0.7 quality floor, 1 GB memory budget.
    fn default() -> Self {
        Self {
            max_latency_ms: 2000.0,
            min_throughput_qps: 10.0,
            max_error_rate: 5.0,
            min_quality_score: 0.7,
            max_memory_usage_mb: 1000.0,
        }
    }
}
131
/// Component weights for the user-satisfaction metric. The five weights are
/// blended linearly; defaults sum to 1.0.
#[derive(Debug, Clone)]
pub struct UserSatisfactionConfig {
    /// Weight of the answer-quality component (answered-query coverage).
    pub answer_quality_weight: f32,

    /// Weight of the response-time component.
    pub response_time_weight: f32,

    /// Weight of the relevance component.
    pub relevance_weight: f32,

    /// Weight of the completeness component (retrieved-doc coverage).
    pub completeness_weight: f32,

    /// Weight of the clarity component.
    pub clarity_weight: f32,
}
150
impl Default for UserSatisfactionConfig {
    /// Default component weights; they sum to 1.0 with answer quality and
    /// relevance weighted highest.
    fn default() -> Self {
        Self {
            answer_quality_weight: 0.3,
            response_time_weight: 0.2,
            relevance_weight: 0.25,
            completeness_weight: 0.15,
            clarity_weight: 0.1,
        }
    }
}
162
/// Reliability limits used by the consistency and error-rate metrics.
#[derive(Debug, Clone)]
pub struct SystemReliabilityConfig {
    /// Acceptable failure rate as a fraction (e.g. 0.01 = 1%); the
    /// error-rate metric multiplies this by 100 to compare percentages.
    pub acceptable_failure_rate: f32,

    /// Maximum acceptable recovery time after a failure, in milliseconds.
    pub recovery_time_threshold_ms: f32,

    /// Minimum acceptable consistency score, in [0, 1].
    pub consistency_threshold: f32,
}
175
impl Default for SystemReliabilityConfig {
    /// Defaults: 1% acceptable failure rate, 5s recovery budget, and a 0.9
    /// consistency floor.
    fn default() -> Self {
        Self {
            acceptable_failure_rate: 0.01,
            recovery_time_threshold_ms: 5000.0,
            consistency_threshold: 0.9,
        }
    }
}
185
/// A single end-to-end metric. Implementations must be `Send + Sync` so the
/// evaluator can hold them as boxed trait objects.
pub trait E2EMetric: Send + Sync {
    /// Stable snake_case identifier, used as the key in the score map.
    fn name(&self) -> &str;

    /// The [`E2EMetricType`] variant this implementation corresponds to.
    fn metric_type(&self) -> E2EMetricType;

    /// Computes a score (expected to lie in [0, 1]) from the recorded
    /// evaluation data and the aggregate system metrics.
    fn evaluate_system(
        &self,
        evaluation_data: &EvaluationData,
        system_metrics: &SystemMetrics,
    ) -> RragResult<f32>;

    /// Static metadata describing this metric (score range, data
    /// requirements, evaluation level).
    fn get_config(&self) -> E2EMetricConfig;
}
204
/// Static metadata describing an end-to-end metric.
#[derive(Debug, Clone)]
pub struct E2EMetricConfig {
    /// Metric identifier (matches [`E2EMetric::name`]).
    pub name: String,

    /// Whether the metric reads aggregate performance data (`SystemMetrics`).
    pub requires_performance_data: bool,

    /// Whether the metric needs explicit user feedback to be meaningful.
    pub requires_user_feedback: bool,

    /// Inclusive (min, max) range of the produced score.
    pub score_range: (f32, f32),

    /// Whether larger scores indicate better outcomes.
    pub higher_is_better: bool,

    /// Granularity at which the metric is evaluated.
    pub evaluation_level: EvaluationLevel,
}
226
/// Granularity at which a metric is computed.
#[derive(Debug, Clone)]
pub enum EvaluationLevel {
    /// Evaluated per individual query.
    Query,
    /// Evaluated per user session.
    Session,
    /// Evaluated over the whole system run.
    System,
}
237
/// Aggregate system-level measurements derived from (or supplied alongside)
/// an evaluation run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    /// Mean response time across recorded responses, in milliseconds.
    pub avg_response_time_ms: f32,

    /// Observed throughput, in queries per second.
    pub throughput_qps: f32,

    /// Error rate as a percentage (e.g. 1.0 = 1%).
    pub error_rate: f32,

    /// Memory usage, in megabytes.
    pub memory_usage_mb: f32,

    /// CPU usage, as a percentage of capacity.
    pub cpu_usage_percent: f32,

    /// Service availability, as a percentage.
    pub availability_percent: f32,

    /// Cache hit rate as a fraction in [0, 1].
    pub cache_hit_rate: f32,
}
262
impl Default for SystemMetrics {
    /// Nominal mid-range figures, useful as a stand-in when no real
    /// measurements are available (e.g. in tests).
    fn default() -> Self {
        Self {
            avg_response_time_ms: 1000.0,
            throughput_qps: 50.0,
            error_rate: 1.0,
            memory_usage_mb: 512.0,
            cpu_usage_percent: 60.0,
            availability_percent: 99.5,
            cache_hit_rate: 0.8,
        }
    }
}
276
277impl EndToEndEvaluator {
278 pub fn new(config: EndToEndConfig) -> Self {
280 let mut evaluator = Self {
281 config: config.clone(),
282 metrics: Vec::new(),
283 };
284
285 evaluator.initialize_metrics();
287
288 evaluator
289 }
290
291 fn initialize_metrics(&mut self) {
293 for metric_type in &self.config.enabled_metrics {
294 let metric: Box<dyn E2EMetric> = match metric_type {
295 E2EMetricType::UserSatisfaction => Box::new(UserSatisfactionMetric::new(
296 self.config.user_satisfaction_config.clone(),
297 )),
298 E2EMetricType::SystemLatency => Box::new(SystemLatencyMetric::new(
299 self.config.performance_thresholds.clone(),
300 )),
301 E2EMetricType::SystemThroughput => Box::new(SystemThroughputMetric::new(
302 self.config.performance_thresholds.clone(),
303 )),
304 E2EMetricType::OverallQuality => Box::new(OverallQualityMetric::new()),
305 E2EMetricType::Robustness => Box::new(RobustnessMetric::new()),
306 E2EMetricType::Consistency => Box::new(ConsistencyMetric::new(
307 self.config.system_reliability_config.clone(),
308 )),
309 E2EMetricType::Usability => Box::new(UsabilityMetric::new()),
310 E2EMetricType::ResourceEfficiency => Box::new(ResourceEfficiencyMetric::new(
311 self.config.performance_thresholds.clone(),
312 )),
313 E2EMetricType::ErrorRate => Box::new(ErrorRateMetric::new(
314 self.config.system_reliability_config.clone(),
315 )),
316 _ => continue, };
318
319 self.metrics.push(metric);
320 }
321 }
322}
323
impl Evaluator for EndToEndEvaluator {
    fn name(&self) -> &str {
        "EndToEnd"
    }

    /// Runs every enabled metric over `data`, combines the scores into a
    /// weighted overall score (stored under "overall_e2e_score"), and
    /// packages insights and recommendations into an `EvaluationResult`.
    /// Metrics that fail are logged and skipped rather than aborting the run.
    fn evaluate(&self, data: &EvaluationData) -> RragResult<EvaluationResult> {
        let start_time = std::time::Instant::now();
        let mut overall_scores = HashMap::new();
        // End-to-end evaluation is system-level only, so no per-query results.
        let per_query_results = Vec::new(); let system_metrics = self.calculate_system_metrics(data);

        for metric in &self.metrics {
            match metric.evaluate_system(data, &system_metrics) {
                Ok(score) => {
                    overall_scores.insert(metric.name().to_string(), score);
                }
                Err(e) => {
                    // A failing metric should not abort the whole evaluation.
                    warn!(" Failed to evaluate {}: {}", metric.name(), e);
                }
            }
        }

        let overall_score = self.calculate_overall_score(&overall_scores);
        overall_scores.insert("overall_e2e_score".to_string(), overall_score);

        let total_time = start_time.elapsed().as_millis() as f32;

        let insights = self.generate_insights(&overall_scores, &system_metrics);
        let recommendations = self.generate_recommendations(&overall_scores, &system_metrics);

        Ok(EvaluationResult {
            id: uuid::Uuid::new_v4().to_string(),
            evaluation_type: "EndToEnd".to_string(),
            overall_scores: overall_scores.clone(),
            per_query_results,
            summary: EvaluationSummary {
                total_queries: data.queries.len(),
                // Each system-level metric produces a single value, so the
                // "averages" are the scores themselves and std-devs are omitted.
                avg_scores: overall_scores.clone(),
                std_deviations: HashMap::new(), performance_stats: PerformanceStats {
                    avg_eval_time_ms: total_time,
                    total_eval_time_ms: total_time,
                    peak_memory_usage_mb: system_metrics.memory_usage_mb,
                    throughput_qps: system_metrics.throughput_qps,
                },
                insights,
                recommendations,
            },
            metadata: EvaluationMetadata {
                timestamp: chrono::Utc::now(),
                evaluation_version: "1.0.0".to_string(),
                system_config: HashMap::new(),
                // NOTE(review): this captures the entire process environment,
                // which may include secrets — confirm this is intended.
                environment: std::env::vars().collect(),
                git_commit: None,
            },
        })
    }

    fn supported_metrics(&self) -> Vec<String> {
        self.metrics.iter().map(|m| m.name().to_string()).collect()
    }

    fn get_config(&self) -> EvaluatorConfig {
        EvaluatorConfig {
            name: "EndToEnd".to_string(),
            version: "1.0.0".to_string(),
            metrics: self.supported_metrics(),
            // Rough, static performance expectations for this evaluator.
            performance: EvaluatorPerformance {
                avg_time_per_sample_ms: 200.0,
                memory_usage_mb: 100.0,
                accuracy: 0.9,
            },
        }
    }
}
404
impl EndToEndEvaluator {
    /// Derives aggregate [`SystemMetrics`] from the recorded responses.
    ///
    /// A response counts as an error when it has no generated answer or no
    /// retrieved documents. Memory, CPU, availability, and cache figures
    /// are hard-coded placeholders, not measured values.
    fn calculate_system_metrics(&self, data: &EvaluationData) -> SystemMetrics {
        let mut total_time = 0.0;
        let mut error_count = 0;
        let mut valid_responses = 0;

        for response in &data.system_responses {
            total_time += response.timing.total_time_ms;
            valid_responses += 1;

            if response.generated_answer.is_none() || response.retrieved_docs.is_empty() {
                error_count += 1;
            }
        }

        let avg_response_time = if valid_responses > 0 {
            total_time / valid_responses as f32
        } else {
            0.0
        };

        // Error rate as a percentage of *queries* (errors are counted over
        // responses, so a query/response count mismatch skews this).
        let error_rate = if data.queries.len() > 0 {
            (error_count as f32 / data.queries.len() as f32) * 100.0
        } else {
            0.0
        };

        // Queries per second, assuming responses were processed sequentially
        // (total_time is the sum of per-response times).
        let throughput = if total_time > 0.0 {
            (valid_responses as f32 * 1000.0) / total_time } else {
            0.0
        };

        SystemMetrics {
            avg_response_time_ms: avg_response_time,
            throughput_qps: throughput,
            error_rate,
            // Placeholder resource/availability figures — not measured.
            memory_usage_mb: 256.0, cpu_usage_percent: 45.0, availability_percent: 99.0,
            cache_hit_rate: 0.7,
        }
    }

    /// Combines individual metric scores into a weighted average using the
    /// configured weights. Only the weights of metrics that actually
    /// produced a score contribute to the normalization, so a missing
    /// metric does not drag the overall score down.
    ///
    /// Note: consistency, usability, and error-rate scores are computed by
    /// their metrics but are not folded into this overall score.
    fn calculate_overall_score(&self, scores: &HashMap<String, f32>) -> f32 {
        let mut weighted_sum = 0.0;
        let mut total_weight = 0.0;

        if let Some(&user_satisfaction) = scores.get("user_satisfaction") {
            weighted_sum += user_satisfaction * self.config.user_experience_weight;
            total_weight += self.config.user_experience_weight;
        }

        // The performance component is the plain average of whichever
        // performance metrics produced a score.
        let performance_metrics = ["system_latency", "system_throughput", "resource_efficiency"];
        let mut performance_score = 0.0;
        let mut performance_count = 0;

        for metric in &performance_metrics {
            if let Some(&score) = scores.get(*metric) {
                performance_score += score;
                performance_count += 1;
            }
        }

        if performance_count > 0 {
            performance_score /= performance_count as f32;
            weighted_sum += performance_score * self.config.system_performance_weight;
            total_weight += self.config.system_performance_weight;
        }

        if let Some(&quality) = scores.get("overall_quality") {
            weighted_sum += quality * self.config.quality_weight;
            total_weight += self.config.quality_weight;
        }

        if let Some(&robustness) = scores.get("robustness") {
            weighted_sum += robustness * self.config.robustness_weight;
            total_weight += self.config.robustness_weight;
        }

        // Normalize by the weights actually used; 0.0 when nothing scored.
        if total_weight > 0.0 {
            weighted_sum / total_weight
        } else {
            0.0
        }
    }

    /// Produces human-readable observations by comparing scores and system
    /// metrics against the configured thresholds.
    fn generate_insights(
        &self,
        scores: &HashMap<String, f32>,
        metrics: &SystemMetrics,
    ) -> Vec<String> {
        let mut insights = Vec::new();

        if let Some(&overall_score) = scores.get("overall_e2e_score") {
            if overall_score > 0.8 {
                insights.push("🎯 Excellent end-to-end system performance".to_string());
            } else if overall_score < 0.6 {
                insights.push("⚠️ End-to-end system performance needs improvement".to_string());
            }
        }

        if metrics.avg_response_time_ms > self.config.performance_thresholds.max_latency_ms {
            insights.push(format!(
                "🐌 High latency detected: {:.1}ms (threshold: {:.1}ms)",
                metrics.avg_response_time_ms, self.config.performance_thresholds.max_latency_ms
            ));
        }

        if metrics.throughput_qps < self.config.performance_thresholds.min_throughput_qps {
            insights.push(format!(
                "📊 Low throughput: {:.1} QPS (minimum: {:.1} QPS)",
                metrics.throughput_qps, self.config.performance_thresholds.min_throughput_qps
            ));
        }

        if metrics.error_rate > self.config.performance_thresholds.max_error_rate {
            insights.push(format!(
                "🚨 High error rate: {:.1}% (threshold: {:.1}%)",
                metrics.error_rate, self.config.performance_thresholds.max_error_rate
            ));
        }

        if metrics.memory_usage_mb > self.config.performance_thresholds.max_memory_usage_mb {
            insights.push(format!(
                "💾 High memory usage: {:.1}MB (threshold: {:.1}MB)",
                metrics.memory_usage_mb, self.config.performance_thresholds.max_memory_usage_mb
            ));
        }

        if let Some(&user_satisfaction) = scores.get("user_satisfaction") {
            if user_satisfaction < 0.7 {
                insights.push(
                    "👥 User satisfaction below expectations - focus on UX improvements"
                        .to_string(),
                );
            }
        }

        insights
    }

    /// Produces actionable suggestions for any score or system metric that
    /// falls outside its acceptable range. Thresholds here (0.7, 0.8) are
    /// fixed heuristics, independent of the configured limits.
    fn generate_recommendations(
        &self,
        scores: &HashMap<String, f32>,
        metrics: &SystemMetrics,
    ) -> Vec<String> {
        let mut recommendations = Vec::new();

        if metrics.avg_response_time_ms > self.config.performance_thresholds.max_latency_ms {
            recommendations
                .push("⚡ Optimize response time with caching and parallel processing".to_string());
            recommendations
                .push("🔧 Consider upgrading hardware or scaling horizontally".to_string());
        }

        if metrics.throughput_qps < self.config.performance_thresholds.min_throughput_qps {
            recommendations.push("📈 Implement load balancing and connection pooling".to_string());
            recommendations.push("🚀 Consider async processing for better throughput".to_string());
        }

        if metrics.error_rate > self.config.performance_thresholds.max_error_rate {
            recommendations
                .push("🛡️ Implement better error handling and retry mechanisms".to_string());
            recommendations.push("📊 Add comprehensive monitoring and alerting".to_string());
        }

        if let Some(&user_satisfaction) = scores.get("user_satisfaction") {
            if user_satisfaction < 0.7 {
                recommendations
                    .push("👤 Conduct user research to identify pain points".to_string());
                recommendations
                    .push("🎨 Improve user interface and interaction design".to_string());
            }
        }

        if let Some(&quality) = scores.get("overall_quality") {
            if quality < 0.7 {
                recommendations
                    .push("📚 Improve training data quality and model fine-tuning".to_string());
                recommendations
                    .push("🔍 Implement better content filtering and validation".to_string());
            }
        }

        if let Some(&consistency) = scores.get("consistency") {
            if consistency < 0.8 {
                recommendations.push(
                    "🎯 Improve system consistency with better configuration management"
                        .to_string(),
                );
                recommendations
                    .push("🔄 Implement chaos engineering to test system resilience".to_string());
            }
        }

        recommendations
    }
}
624
/// Heuristic estimate of end-user satisfaction, blending response-time,
/// answer-coverage, and placeholder relevance/completeness/clarity scores.
struct UserSatisfactionMetric {
    // Component weights for the blended satisfaction score.
    config: UserSatisfactionConfig,
}

impl UserSatisfactionMetric {
    fn new(config: UserSatisfactionConfig) -> Self {
        Self { config }
    }
}
635
636impl E2EMetric for UserSatisfactionMetric {
637 fn name(&self) -> &str {
638 "user_satisfaction"
639 }
640
641 fn metric_type(&self) -> E2EMetricType {
642 E2EMetricType::UserSatisfaction
643 }
644
645 fn evaluate_system(&self, data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
646 let response_time_score = if metrics.avg_response_time_ms < 1000.0 {
648 1.0
649 } else if metrics.avg_response_time_ms < 3000.0 {
650 0.8 - (metrics.avg_response_time_ms - 1000.0) / 2000.0 * 0.3
651 } else {
652 0.5
653 };
654
655 let answered_queries = data
657 .system_responses
658 .iter()
659 .filter(|r| r.generated_answer.is_some())
660 .count();
661 let answer_quality_score = answered_queries as f32 / data.queries.len() as f32;
662
663 let relevance_score = 0.8; let avg_docs = data
668 .system_responses
669 .iter()
670 .map(|r| r.retrieved_docs.len())
671 .sum::<usize>() as f32
672 / data.system_responses.len() as f32;
673 let completeness_score = (avg_docs / 5.0).min(1.0); let clarity_score = 0.75; let satisfaction = response_time_score * self.config.response_time_weight
680 + answer_quality_score * self.config.answer_quality_weight
681 + relevance_score * self.config.relevance_weight
682 + completeness_score * self.config.completeness_weight
683 + clarity_score * self.config.clarity_weight;
684
685 Ok(satisfaction.min(1.0))
686 }
687
688 fn get_config(&self) -> E2EMetricConfig {
689 E2EMetricConfig {
690 name: "user_satisfaction".to_string(),
691 requires_performance_data: true,
692 requires_user_feedback: false,
693 score_range: (0.0, 1.0),
694 higher_is_better: true,
695 evaluation_level: EvaluationLevel::System,
696 }
697 }
698}
699
/// Scores average response latency against the configured maximum.
struct SystemLatencyMetric {
    // Provides `max_latency_ms`, the latency budget.
    thresholds: PerformanceThresholds,
}

impl SystemLatencyMetric {
    fn new(thresholds: PerformanceThresholds) -> Self {
        Self { thresholds }
    }
}
709
710impl E2EMetric for SystemLatencyMetric {
711 fn name(&self) -> &str {
712 "system_latency"
713 }
714
715 fn metric_type(&self) -> E2EMetricType {
716 E2EMetricType::SystemLatency
717 }
718
719 fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
720 let score = if metrics.avg_response_time_ms <= self.thresholds.max_latency_ms {
722 1.0 - (metrics.avg_response_time_ms / self.thresholds.max_latency_ms) * 0.2
723 } else {
724 let excess = metrics.avg_response_time_ms - self.thresholds.max_latency_ms;
726 let penalty = excess / self.thresholds.max_latency_ms;
727 (0.8 - penalty * 0.5).max(0.0)
728 };
729
730 Ok(score)
731 }
732
733 fn get_config(&self) -> E2EMetricConfig {
734 E2EMetricConfig {
735 name: "system_latency".to_string(),
736 requires_performance_data: true,
737 requires_user_feedback: false,
738 score_range: (0.0, 1.0),
739 higher_is_better: true,
740 evaluation_level: EvaluationLevel::System,
741 }
742 }
743}
744
/// Scores observed throughput against the configured minimum QPS.
struct SystemThroughputMetric {
    // Provides `min_throughput_qps`, the throughput floor.
    thresholds: PerformanceThresholds,
}

impl SystemThroughputMetric {
    fn new(thresholds: PerformanceThresholds) -> Self {
        Self { thresholds }
    }
}
754
755impl E2EMetric for SystemThroughputMetric {
756 fn name(&self) -> &str {
757 "system_throughput"
758 }
759
760 fn metric_type(&self) -> E2EMetricType {
761 E2EMetricType::SystemThroughput
762 }
763
764 fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
765 let score = if metrics.throughput_qps >= self.thresholds.min_throughput_qps {
767 (metrics.throughput_qps / self.thresholds.min_throughput_qps).min(2.0) / 2.0
768 } else {
769 metrics.throughput_qps / self.thresholds.min_throughput_qps
770 };
771
772 Ok(score.min(1.0))
773 }
774
775 fn get_config(&self) -> E2EMetricConfig {
776 E2EMetricConfig {
777 name: "system_throughput".to_string(),
778 requires_performance_data: true,
779 requires_user_feedback: false,
780 score_range: (0.0, 1.0),
781 higher_is_better: true,
782 evaluation_level: EvaluationLevel::System,
783 }
784 }
785}
786
/// Generates a stateless E2E metric that always returns a fixed score.
/// Used for metric types (robustness, usability) that do not yet have a
/// real evaluation strategy.
macro_rules! impl_simple_e2e_metric {
    ($name:ident, $metric_name:literal, $metric_type:expr, $default_score:expr) => {
        struct $name;

        impl $name {
            fn new() -> Self {
                Self
            }
        }

        impl E2EMetric for $name {
            fn name(&self) -> &str {
                $metric_name
            }

            fn metric_type(&self) -> E2EMetricType {
                $metric_type
            }

            // Ignores all inputs and returns the configured constant.
            fn evaluate_system(
                &self,
                _data: &EvaluationData,
                _metrics: &SystemMetrics,
            ) -> RragResult<f32> {
                Ok($default_score)
            }

            fn get_config(&self) -> E2EMetricConfig {
                E2EMetricConfig {
                    name: $metric_name.to_string(),
                    requires_performance_data: false,
                    requires_user_feedback: false,
                    score_range: (0.0, 1.0),
                    higher_is_better: true,
                    evaluation_level: EvaluationLevel::System,
                }
            }
        }
    };
}
828
/// Scores overall quality as the fraction of queries that produced a
/// complete response.
struct OverallQualityMetric;

impl OverallQualityMetric {
    fn new() -> Self {
        Self
    }
}
836
837impl E2EMetric for OverallQualityMetric {
838 fn name(&self) -> &str {
839 "overall_quality"
840 }
841
842 fn metric_type(&self) -> E2EMetricType {
843 E2EMetricType::OverallQuality
844 }
845
846 fn evaluate_system(&self, data: &EvaluationData, _metrics: &SystemMetrics) -> RragResult<f32> {
847 let successful_responses = data
849 .system_responses
850 .iter()
851 .filter(|r| r.generated_answer.is_some() && !r.retrieved_docs.is_empty())
852 .count();
853
854 let quality_score = successful_responses as f32 / data.queries.len() as f32;
855 Ok(quality_score)
856 }
857
858 fn get_config(&self) -> E2EMetricConfig {
859 E2EMetricConfig {
860 name: "overall_quality".to_string(),
861 requires_performance_data: false,
862 requires_user_feedback: false,
863 score_range: (0.0, 1.0),
864 higher_is_better: true,
865 evaluation_level: EvaluationLevel::System,
866 }
867 }
868}
869
/// Scores the stability of response times across queries.
struct ConsistencyMetric {
    // Reliability limits; currently not read by the scoring logic.
    config: SystemReliabilityConfig,
}

impl ConsistencyMetric {
    fn new(config: SystemReliabilityConfig) -> Self {
        Self { config }
    }
}
879
880impl E2EMetric for ConsistencyMetric {
881 fn name(&self) -> &str {
882 "consistency"
883 }
884
885 fn metric_type(&self) -> E2EMetricType {
886 E2EMetricType::Consistency
887 }
888
889 fn evaluate_system(&self, data: &EvaluationData, _metrics: &SystemMetrics) -> RragResult<f32> {
890 let response_times: Vec<f32> = data
892 .system_responses
893 .iter()
894 .map(|r| r.timing.total_time_ms)
895 .collect();
896
897 if response_times.is_empty() {
898 return Ok(0.0);
899 }
900
901 let mean_time = response_times.iter().sum::<f32>() / response_times.len() as f32;
902 let variance = response_times
903 .iter()
904 .map(|t| (t - mean_time).powi(2))
905 .sum::<f32>()
906 / response_times.len() as f32;
907 let std_dev = variance.sqrt();
908
909 let cv = if mean_time > 0.0 {
911 std_dev / mean_time
912 } else {
913 0.0
914 };
915 let consistency = (1.0 - cv).max(0.0);
916
917 Ok(consistency)
918 }
919
920 fn get_config(&self) -> E2EMetricConfig {
921 E2EMetricConfig {
922 name: "consistency".to_string(),
923 requires_performance_data: true,
924 requires_user_feedback: false,
925 score_range: (0.0, 1.0),
926 higher_is_better: true,
927 evaluation_level: EvaluationLevel::System,
928 }
929 }
930}
931
/// Scores memory and CPU usage against their budgets.
struct ResourceEfficiencyMetric {
    // Provides `max_memory_usage_mb`, the memory budget.
    thresholds: PerformanceThresholds,
}

impl ResourceEfficiencyMetric {
    fn new(thresholds: PerformanceThresholds) -> Self {
        Self { thresholds }
    }
}
941
942impl E2EMetric for ResourceEfficiencyMetric {
943 fn name(&self) -> &str {
944 "resource_efficiency"
945 }
946
947 fn metric_type(&self) -> E2EMetricType {
948 E2EMetricType::ResourceEfficiency
949 }
950
951 fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
952 let memory_score = if metrics.memory_usage_mb <= self.thresholds.max_memory_usage_mb {
954 1.0 - (metrics.memory_usage_mb / self.thresholds.max_memory_usage_mb) * 0.3
955 } else {
956 0.7 * (self.thresholds.max_memory_usage_mb / metrics.memory_usage_mb)
957 };
958
959 let cpu_score = if metrics.cpu_usage_percent <= 80.0 {
960 1.0 - (metrics.cpu_usage_percent / 100.0) * 0.2
961 } else {
962 0.8 * (80.0 / metrics.cpu_usage_percent)
963 };
964
965 let efficiency = (memory_score + cpu_score) / 2.0;
966 Ok(efficiency.min(1.0))
967 }
968
969 fn get_config(&self) -> E2EMetricConfig {
970 E2EMetricConfig {
971 name: "resource_efficiency".to_string(),
972 requires_performance_data: true,
973 requires_user_feedback: false,
974 score_range: (0.0, 1.0),
975 higher_is_better: true,
976 evaluation_level: EvaluationLevel::System,
977 }
978 }
979}
980
/// Scores the observed error rate against the acceptable failure rate.
struct ErrorRateMetric {
    // Provides `acceptable_failure_rate` (a fraction; compared as a percent).
    config: SystemReliabilityConfig,
}

impl ErrorRateMetric {
    fn new(config: SystemReliabilityConfig) -> Self {
        Self { config }
    }
}
990
991impl E2EMetric for ErrorRateMetric {
992 fn name(&self) -> &str {
993 "error_rate"
994 }
995
996 fn metric_type(&self) -> E2EMetricType {
997 E2EMetricType::ErrorRate
998 }
999
1000 fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
1001 let score = if metrics.error_rate <= self.config.acceptable_failure_rate * 100.0 {
1003 1.0 - (metrics.error_rate / 100.0) * 0.1
1004 } else {
1005 let excess = metrics.error_rate - (self.config.acceptable_failure_rate * 100.0);
1006 (0.9 - excess / 100.0 * 2.0).max(0.0)
1007 };
1008
1009 Ok(score)
1010 }
1011
1012 fn get_config(&self) -> E2EMetricConfig {
1013 E2EMetricConfig {
1014 name: "error_rate".to_string(),
1015 requires_performance_data: true,
1016 requires_user_feedback: false,
1017 score_range: (0.0, 1.0),
1018 higher_is_better: true,
1019 evaluation_level: EvaluationLevel::System,
1020 }
1021 }
1022}
1023
// Fixed-score placeholder implementations for metric types that do not yet
// have a real evaluation strategy.
impl_simple_e2e_metric!(
    RobustnessMetric,
    "robustness",
    E2EMetricType::Robustness,
    0.8
);
impl_simple_e2e_metric!(UsabilityMetric, "usability", E2EMetricType::Usability, 0.85);
1031
#[cfg(test)]
mod tests {
    use super::*;
    use crate::evaluation::{
        GroundTruth, RetrievedDocument, SystemResponse, SystemTiming, TestQuery,
    };

    // The satisfaction score must stay within the declared [0, 1] range.
    #[test]
    fn test_user_satisfaction_metric() {
        let config = UserSatisfactionConfig::default();
        let metric = UserSatisfactionMetric::new(config);

        let data = create_test_data();
        let system_metrics = SystemMetrics::default();

        let score = metric.evaluate_system(&data, &system_metrics).unwrap();
        assert!(score >= 0.0 && score <= 1.0);
    }

    // 1500ms is within the default 2000ms budget, so the latency score
    // should land in the in-budget band (above 0.5).
    #[test]
    fn test_system_latency_metric() {
        let thresholds = PerformanceThresholds::default();
        let metric = SystemLatencyMetric::new(thresholds);

        let data = create_test_data();
        let mut system_metrics = SystemMetrics::default();
        system_metrics.avg_response_time_ms = 1500.0; let score = metric.evaluate_system(&data, &system_metrics).unwrap();
        assert!(score > 0.5); }

    // The default config should register at least one metric.
    #[test]
    fn test_end_to_end_evaluator() {
        let config = EndToEndConfig::default();
        let evaluator = EndToEndEvaluator::new(config);

        assert_eq!(evaluator.name(), "EndToEnd");
        assert!(!evaluator.supported_metrics().is_empty());
    }

    // Builds a minimal single-query fixture with one complete response.
    fn create_test_data() -> EvaluationData {
        use super::super::*;

        EvaluationData {
            queries: vec![TestQuery {
                id: "q1".to_string(),
                query: "What is machine learning?".to_string(),
                query_type: None,
                metadata: HashMap::new(),
            }],
            ground_truth: vec![GroundTruth {
                query_id: "q1".to_string(),
                relevant_docs: vec!["doc1".to_string()],
                expected_answer: Some("ML is AI subset".to_string()),
                relevance_judgments: HashMap::new(),
                metadata: HashMap::new(),
            }],
            system_responses: vec![SystemResponse {
                query_id: "q1".to_string(),
                retrieved_docs: vec![RetrievedDocument {
                    doc_id: "doc1".to_string(),
                    content: "Machine learning content".to_string(),
                    score: 0.9,
                    rank: 0,
                    metadata: HashMap::new(),
                }],
                generated_answer: Some("Machine learning is...".to_string()),
                timing: SystemTiming {
                    total_time_ms: 1000.0,
                    retrieval_time_ms: 500.0,
                    generation_time_ms: Some(400.0),
                    reranking_time_ms: Some(100.0),
                },
                metadata: HashMap::new(),
            }],
            context: HashMap::new(),
        }
    }
}