use super::{
EvaluationData, EvaluationMetadata, EvaluationResult, EvaluationSummary, Evaluator,
EvaluatorConfig, EvaluatorPerformance, PerformanceStats,
};
use crate::RragResult;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
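/// End-to-end evaluator that scores a RAG system as a whole by combining user
/// experience, system performance, answer quality, and robustness signals.
///
/// A minimal usage sketch (assumes an `EvaluationData` instance has already been
/// assembled elsewhere):
///
/// ```ignore
/// let evaluator = EndToEndEvaluator::new(EndToEndConfig::default());
/// let result = evaluator.evaluate(&data)?;
/// if let Some(score) = result.overall_scores.get("overall_e2e_score") {
///     println!("overall end-to-end score: {score:.3}");
/// }
/// ```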
pub struct EndToEndEvaluator {
config: EndToEndConfig,
metrics: Vec<Box<dyn E2EMetric>>,
}
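/// Configuration controlling which metrics run and how their scores are weighted.
///
/// A sketch of overriding the component weights while keeping the remaining
/// defaults; the overall score is normalized by the total weight used, so the
/// weights need not sum exactly to 1.0:
///
/// ```ignore
/// let config = EndToEndConfig {
///     user_experience_weight: 0.5,
///     quality_weight: 0.3,
///     system_performance_weight: 0.15,
///     robustness_weight: 0.05,
///     ..EndToEndConfig::default()
/// };
/// ```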
#[derive(Debug, Clone)]
pub struct EndToEndConfig {
pub enabled_metrics: Vec<E2EMetricType>,
pub user_experience_weight: f32,
pub system_performance_weight: f32,
pub quality_weight: f32,
pub robustness_weight: f32,
pub performance_thresholds: PerformanceThresholds,
pub user_satisfaction_config: UserSatisfactionConfig,
pub system_reliability_config: SystemReliabilityConfig,
}
impl Default for EndToEndConfig {
fn default() -> Self {
Self {
enabled_metrics: vec![
E2EMetricType::UserSatisfaction,
E2EMetricType::SystemLatency,
E2EMetricType::SystemThroughput,
E2EMetricType::OverallQuality,
E2EMetricType::Robustness,
E2EMetricType::Consistency,
E2EMetricType::Usability,
],
user_experience_weight: 0.4,
system_performance_weight: 0.3,
quality_weight: 0.2,
robustness_weight: 0.1,
performance_thresholds: PerformanceThresholds::default(),
user_satisfaction_config: UserSatisfactionConfig::default(),
system_reliability_config: SystemReliabilityConfig::default(),
}
}
}
/// Metric families that the end-to-end evaluator can compute.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum E2EMetricType {
UserSatisfaction,
SystemLatency,
SystemThroughput,
OverallQuality,
Robustness,
Consistency,
Usability,
ResourceEfficiency,
ErrorRate,
Scalability,
UserEngagement,
TrustScore,
}
/// Performance thresholds against which latency, throughput, error rate, and
/// memory usage are scored.
#[derive(Debug, Clone)]
pub struct PerformanceThresholds {
pub max_latency_ms: f32,
pub min_throughput_qps: f32,
pub max_error_rate: f32,
pub min_quality_score: f32,
pub max_memory_usage_mb: f32,
}
impl Default for PerformanceThresholds {
fn default() -> Self {
Self {
max_latency_ms: 2000.0,
min_throughput_qps: 10.0,
max_error_rate: 5.0,
min_quality_score: 0.7,
max_memory_usage_mb: 1000.0,
}
}
}
/// Component weights for the user-satisfaction score.
#[derive(Debug, Clone)]
pub struct UserSatisfactionConfig {
pub answer_quality_weight: f32,
pub response_time_weight: f32,
pub relevance_weight: f32,
pub completeness_weight: f32,
pub clarity_weight: f32,
}
impl Default for UserSatisfactionConfig {
fn default() -> Self {
Self {
answer_quality_weight: 0.3,
response_time_weight: 0.2,
relevance_weight: 0.25,
completeness_weight: 0.15,
clarity_weight: 0.1,
}
}
}
/// Reliability expectations used by the consistency and error-rate metrics.
#[derive(Debug, Clone)]
pub struct SystemReliabilityConfig {
pub acceptable_failure_rate: f32,
pub recovery_time_threshold_ms: f32,
pub consistency_threshold: f32,
}
impl Default for SystemReliabilityConfig {
fn default() -> Self {
Self {
acceptable_failure_rate: 0.01,
recovery_time_threshold_ms: 5000.0,
consistency_threshold: 0.9,
}
}
}
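/// Trait implemented by every end-to-end metric. Implementations receive the full
/// evaluation data plus aggregated system metrics and return a score, typically
/// in [0.0, 1.0] as declared by `score_range`.
///
/// A sketch of a custom metric, assuming a hypothetical cache-oriented check
/// (`CacheHitMetric` is illustrative, not part of this crate):
///
/// ```ignore
/// struct CacheHitMetric;
///
/// impl E2EMetric for CacheHitMetric {
///     fn name(&self) -> &str {
///         "cache_hit_rate"
///     }
///     fn metric_type(&self) -> E2EMetricType {
///         E2EMetricType::ResourceEfficiency
///     }
///     fn evaluate_system(
///         &self,
///         _data: &EvaluationData,
///         metrics: &SystemMetrics,
///     ) -> RragResult<f32> {
///         Ok(metrics.cache_hit_rate.min(1.0))
///     }
///     fn get_config(&self) -> E2EMetricConfig {
///         E2EMetricConfig {
///             name: "cache_hit_rate".to_string(),
///             requires_performance_data: true,
///             requires_user_feedback: false,
///             score_range: (0.0, 1.0),
///             higher_is_better: true,
///             evaluation_level: EvaluationLevel::System,
///         }
///     }
/// }
/// ```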
pub trait E2EMetric: Send + Sync {
fn name(&self) -> &str;
fn metric_type(&self) -> E2EMetricType;
fn evaluate_system(
&self,
evaluation_data: &EvaluationData,
system_metrics: &SystemMetrics,
) -> RragResult<f32>;
fn get_config(&self) -> E2EMetricConfig;
}
/// Static, descriptive configuration reported by each end-to-end metric.
#[derive(Debug, Clone)]
pub struct E2EMetricConfig {
pub name: String,
pub requires_performance_data: bool,
pub requires_user_feedback: bool,
pub score_range: (f32, f32),
pub higher_is_better: bool,
pub evaluation_level: EvaluationLevel,
}
/// Granularity at which a metric is evaluated.
#[derive(Debug, Clone)]
pub enum EvaluationLevel {
Query,
Session,
System,
}
/// Aggregated runtime metrics describing the system under evaluation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
pub avg_response_time_ms: f32,
pub throughput_qps: f32,
pub error_rate: f32,
pub memory_usage_mb: f32,
pub cpu_usage_percent: f32,
pub availability_percent: f32,
pub cache_hit_rate: f32,
}
impl Default for SystemMetrics {
fn default() -> Self {
Self {
avg_response_time_ms: 1000.0,
throughput_qps: 50.0,
error_rate: 1.0,
memory_usage_mb: 512.0,
cpu_usage_percent: 60.0,
availability_percent: 99.5,
cache_hit_rate: 0.8,
}
}
}
impl EndToEndEvaluator {
pub fn new(config: EndToEndConfig) -> Self {
let mut evaluator = Self {
            config,
metrics: Vec::new(),
};
evaluator.initialize_metrics();
evaluator
}
fn initialize_metrics(&mut self) {
for metric_type in &self.config.enabled_metrics {
let metric: Box<dyn E2EMetric> = match metric_type {
E2EMetricType::UserSatisfaction => Box::new(UserSatisfactionMetric::new(
self.config.user_satisfaction_config.clone(),
)),
E2EMetricType::SystemLatency => Box::new(SystemLatencyMetric::new(
self.config.performance_thresholds.clone(),
)),
E2EMetricType::SystemThroughput => Box::new(SystemThroughputMetric::new(
self.config.performance_thresholds.clone(),
)),
E2EMetricType::OverallQuality => Box::new(OverallQualityMetric::new()),
E2EMetricType::Robustness => Box::new(RobustnessMetric::new()),
E2EMetricType::Consistency => Box::new(ConsistencyMetric::new(
self.config.system_reliability_config.clone(),
)),
E2EMetricType::Usability => Box::new(UsabilityMetric::new()),
E2EMetricType::ResourceEfficiency => Box::new(ResourceEfficiencyMetric::new(
self.config.performance_thresholds.clone(),
)),
E2EMetricType::ErrorRate => Box::new(ErrorRateMetric::new(
self.config.system_reliability_config.clone(),
)),
                // Metric types without implementations yet (e.g. scalability,
                // user engagement, trust score) are skipped.
                _ => continue,
            };
self.metrics.push(metric);
}
}
}
impl Evaluator for EndToEndEvaluator {
fn name(&self) -> &str {
"EndToEnd"
}
fn evaluate(&self, data: &EvaluationData) -> RragResult<EvaluationResult> {
let start_time = std::time::Instant::now();
let mut overall_scores = HashMap::new();
        // End-to-end evaluation is system-level, so no per-query results are produced.
        let per_query_results = Vec::new();
let system_metrics = self.calculate_system_metrics(data);
for metric in &self.metrics {
match metric.evaluate_system(data, &system_metrics) {
Ok(score) => {
overall_scores.insert(metric.name().to_string(), score);
}
Err(e) => {
eprintln!("Warning: Failed to evaluate {}: {}", metric.name(), e);
}
}
}
let overall_score = self.calculate_overall_score(&overall_scores);
overall_scores.insert("overall_e2e_score".to_string(), overall_score);
let total_time = start_time.elapsed().as_millis() as f32;
let insights = self.generate_insights(&overall_scores, &system_metrics);
let recommendations = self.generate_recommendations(&overall_scores, &system_metrics);
Ok(EvaluationResult {
id: uuid::Uuid::new_v4().to_string(),
evaluation_type: "EndToEnd".to_string(),
overall_scores: overall_scores.clone(),
per_query_results,
summary: EvaluationSummary {
total_queries: data.queries.len(),
avg_scores: overall_scores.clone(),
                std_deviations: HashMap::new(),
                performance_stats: PerformanceStats {
avg_eval_time_ms: total_time,
total_eval_time_ms: total_time,
peak_memory_usage_mb: system_metrics.memory_usage_mb,
throughput_qps: system_metrics.throughput_qps,
},
insights,
recommendations,
},
metadata: EvaluationMetadata {
timestamp: chrono::Utc::now(),
evaluation_version: "1.0.0".to_string(),
system_config: HashMap::new(),
environment: std::env::vars().collect(),
git_commit: None,
},
})
}
fn supported_metrics(&self) -> Vec<String> {
self.metrics.iter().map(|m| m.name().to_string()).collect()
}
fn get_config(&self) -> EvaluatorConfig {
EvaluatorConfig {
name: "EndToEnd".to_string(),
version: "1.0.0".to_string(),
metrics: self.supported_metrics(),
performance: EvaluatorPerformance {
avg_time_per_sample_ms: 200.0,
memory_usage_mb: 100.0,
accuracy: 0.9,
},
}
}
}
impl EndToEndEvaluator {
fn calculate_system_metrics(&self, data: &EvaluationData) -> SystemMetrics {
let mut total_time = 0.0;
let mut error_count = 0;
let mut valid_responses = 0;
for response in &data.system_responses {
total_time += response.timing.total_time_ms;
valid_responses += 1;
if response.generated_answer.is_none() || response.retrieved_docs.is_empty() {
error_count += 1;
}
}
let avg_response_time = if valid_responses > 0 {
total_time / valid_responses as f32
} else {
0.0
};
        let error_rate = if !data.queries.is_empty() {
(error_count as f32 / data.queries.len() as f32) * 100.0
} else {
0.0
};
        let throughput = if total_time > 0.0 {
            // Convert total milliseconds into queries per second.
            (valid_responses as f32 * 1000.0) / total_time
        } else {
0.0
};
SystemMetrics {
avg_response_time_ms: avg_response_time,
throughput_qps: throughput,
error_rate,
            // Resource and availability figures below are placeholder estimates;
            // production deployments should populate them from system monitoring.
            memory_usage_mb: 256.0,
            cpu_usage_percent: 45.0,
            availability_percent: 99.0,
cache_hit_rate: 0.7,
}
}
fn calculate_overall_score(&self, scores: &HashMap<String, f32>) -> f32 {
let mut weighted_sum = 0.0;
let mut total_weight = 0.0;
if let Some(&user_satisfaction) = scores.get("user_satisfaction") {
weighted_sum += user_satisfaction * self.config.user_experience_weight;
total_weight += self.config.user_experience_weight;
}
let performance_metrics = ["system_latency", "system_throughput", "resource_efficiency"];
let mut performance_score = 0.0;
let mut performance_count = 0;
for metric in &performance_metrics {
if let Some(&score) = scores.get(*metric) {
performance_score += score;
performance_count += 1;
}
}
if performance_count > 0 {
performance_score /= performance_count as f32;
weighted_sum += performance_score * self.config.system_performance_weight;
total_weight += self.config.system_performance_weight;
}
if let Some(&quality) = scores.get("overall_quality") {
weighted_sum += quality * self.config.quality_weight;
total_weight += self.config.quality_weight;
}
if let Some(&robustness) = scores.get("robustness") {
weighted_sum += robustness * self.config.robustness_weight;
total_weight += self.config.robustness_weight;
}
if total_weight > 0.0 {
weighted_sum / total_weight
} else {
0.0
}
}
fn generate_insights(
&self,
scores: &HashMap<String, f32>,
metrics: &SystemMetrics,
) -> Vec<String> {
let mut insights = Vec::new();
if let Some(&overall_score) = scores.get("overall_e2e_score") {
if overall_score > 0.8 {
insights.push("🎯 Excellent end-to-end system performance".to_string());
} else if overall_score < 0.6 {
insights.push("⚠️ End-to-end system performance needs improvement".to_string());
}
}
if metrics.avg_response_time_ms > self.config.performance_thresholds.max_latency_ms {
insights.push(format!(
"🐌 High latency detected: {:.1}ms (threshold: {:.1}ms)",
metrics.avg_response_time_ms, self.config.performance_thresholds.max_latency_ms
));
}
if metrics.throughput_qps < self.config.performance_thresholds.min_throughput_qps {
insights.push(format!(
"📊 Low throughput: {:.1} QPS (minimum: {:.1} QPS)",
metrics.throughput_qps, self.config.performance_thresholds.min_throughput_qps
));
}
if metrics.error_rate > self.config.performance_thresholds.max_error_rate {
insights.push(format!(
"🚨 High error rate: {:.1}% (threshold: {:.1}%)",
metrics.error_rate, self.config.performance_thresholds.max_error_rate
));
}
if metrics.memory_usage_mb > self.config.performance_thresholds.max_memory_usage_mb {
insights.push(format!(
"💾 High memory usage: {:.1}MB (threshold: {:.1}MB)",
metrics.memory_usage_mb, self.config.performance_thresholds.max_memory_usage_mb
));
}
if let Some(&user_satisfaction) = scores.get("user_satisfaction") {
if user_satisfaction < 0.7 {
insights.push(
"👥 User satisfaction below expectations - focus on UX improvements"
.to_string(),
);
}
}
insights
}
fn generate_recommendations(
&self,
scores: &HashMap<String, f32>,
metrics: &SystemMetrics,
) -> Vec<String> {
let mut recommendations = Vec::new();
if metrics.avg_response_time_ms > self.config.performance_thresholds.max_latency_ms {
recommendations
.push("⚡ Optimize response time with caching and parallel processing".to_string());
recommendations
.push("🔧 Consider upgrading hardware or scaling horizontally".to_string());
}
if metrics.throughput_qps < self.config.performance_thresholds.min_throughput_qps {
recommendations.push("📈 Implement load balancing and connection pooling".to_string());
recommendations.push("🚀 Consider async processing for better throughput".to_string());
}
if metrics.error_rate > self.config.performance_thresholds.max_error_rate {
recommendations
.push("🛡️ Implement better error handling and retry mechanisms".to_string());
recommendations.push("📊 Add comprehensive monitoring and alerting".to_string());
}
if let Some(&user_satisfaction) = scores.get("user_satisfaction") {
if user_satisfaction < 0.7 {
recommendations
.push("👤 Conduct user research to identify pain points".to_string());
recommendations
.push("🎨 Improve user interface and interaction design".to_string());
}
}
if let Some(&quality) = scores.get("overall_quality") {
if quality < 0.7 {
recommendations
.push("📚 Improve training data quality and model fine-tuning".to_string());
recommendations
.push("🔍 Implement better content filtering and validation".to_string());
}
}
if let Some(&consistency) = scores.get("consistency") {
if consistency < 0.8 {
recommendations.push(
"🎯 Improve system consistency with better configuration management"
.to_string(),
);
recommendations
.push("🔄 Implement chaos engineering to test system resilience".to_string());
}
}
recommendations
}
}
struct UserSatisfactionMetric {
config: UserSatisfactionConfig,
}
impl UserSatisfactionMetric {
fn new(config: UserSatisfactionConfig) -> Self {
Self { config }
}
}
impl E2EMetric for UserSatisfactionMetric {
fn name(&self) -> &str {
"user_satisfaction"
}
fn metric_type(&self) -> E2EMetricType {
E2EMetricType::UserSatisfaction
}
    fn evaluate_system(&self, data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
        // Guard against empty evaluation data to avoid division by zero below.
        if data.queries.is_empty() || data.system_responses.is_empty() {
            return Ok(0.0);
        }
        let response_time_score = if metrics.avg_response_time_ms < 1000.0 {
1.0
} else if metrics.avg_response_time_ms < 3000.0 {
0.8 - (metrics.avg_response_time_ms - 1000.0) / 2000.0 * 0.3
} else {
0.5
};
let answered_queries = data
.system_responses
.iter()
.filter(|r| r.generated_answer.is_some())
.count();
let answer_quality_score = answered_queries as f32 / data.queries.len() as f32;
        // Placeholder relevance estimate; a fuller implementation would derive this
        // from relevance judgments in the ground truth.
        let relevance_score = 0.8;
let avg_docs = data
.system_responses
.iter()
.map(|r| r.retrieved_docs.len())
.sum::<usize>() as f32
/ data.system_responses.len() as f32;
let completeness_score = (avg_docs / 5.0).min(1.0);
        // Placeholder clarity estimate; would require answer-level analysis.
        let clarity_score = 0.75;
let satisfaction = response_time_score * self.config.response_time_weight
+ answer_quality_score * self.config.answer_quality_weight
+ relevance_score * self.config.relevance_weight
+ completeness_score * self.config.completeness_weight
+ clarity_score * self.config.clarity_weight;
Ok(satisfaction.min(1.0))
}
fn get_config(&self) -> E2EMetricConfig {
E2EMetricConfig {
name: "user_satisfaction".to_string(),
requires_performance_data: true,
requires_user_feedback: false,
score_range: (0.0, 1.0),
higher_is_better: true,
evaluation_level: EvaluationLevel::System,
}
}
}
struct SystemLatencyMetric {
thresholds: PerformanceThresholds,
}
impl SystemLatencyMetric {
fn new(thresholds: PerformanceThresholds) -> Self {
Self { thresholds }
}
}
impl E2EMetric for SystemLatencyMetric {
fn name(&self) -> &str {
"system_latency"
}
fn metric_type(&self) -> E2EMetricType {
E2EMetricType::SystemLatency
}
fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
let score = if metrics.avg_response_time_ms <= self.thresholds.max_latency_ms {
1.0 - (metrics.avg_response_time_ms / self.thresholds.max_latency_ms) * 0.2
} else {
let excess = metrics.avg_response_time_ms - self.thresholds.max_latency_ms;
let penalty = excess / self.thresholds.max_latency_ms;
(0.8 - penalty * 0.5).max(0.0)
};
Ok(score)
}
fn get_config(&self) -> E2EMetricConfig {
E2EMetricConfig {
name: "system_latency".to_string(),
requires_performance_data: true,
requires_user_feedback: false,
score_range: (0.0, 1.0),
higher_is_better: true,
evaluation_level: EvaluationLevel::System,
}
}
}
struct SystemThroughputMetric {
thresholds: PerformanceThresholds,
}
impl SystemThroughputMetric {
fn new(thresholds: PerformanceThresholds) -> Self {
Self { thresholds }
}
}
impl E2EMetric for SystemThroughputMetric {
fn name(&self) -> &str {
"system_throughput"
}
fn metric_type(&self) -> E2EMetricType {
E2EMetricType::SystemThroughput
}
fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
        // Score 0.5 at the minimum threshold, rising to 1.0 at twice the threshold;
        // below the threshold the score falls off proportionally.
        let score = if metrics.throughput_qps >= self.thresholds.min_throughput_qps {
            (metrics.throughput_qps / self.thresholds.min_throughput_qps).min(2.0) / 2.0
        } else {
            (metrics.throughput_qps / self.thresholds.min_throughput_qps) * 0.5
        };
Ok(score.min(1.0))
}
fn get_config(&self) -> E2EMetricConfig {
E2EMetricConfig {
name: "system_throughput".to_string(),
requires_performance_data: true,
requires_user_feedback: false,
score_range: (0.0, 1.0),
higher_is_better: true,
evaluation_level: EvaluationLevel::System,
}
}
}
/// Generates a stub metric that returns a fixed placeholder score, for metric
/// types that do not yet have a data-driven implementation.
macro_rules! impl_simple_e2e_metric {
($name:ident, $metric_name:literal, $metric_type:expr, $default_score:expr) => {
struct $name;
impl $name {
fn new() -> Self {
Self
}
}
impl E2EMetric for $name {
fn name(&self) -> &str {
$metric_name
}
fn metric_type(&self) -> E2EMetricType {
$metric_type
}
fn evaluate_system(
&self,
_data: &EvaluationData,
_metrics: &SystemMetrics,
) -> RragResult<f32> {
Ok($default_score)
}
fn get_config(&self) -> E2EMetricConfig {
E2EMetricConfig {
name: $metric_name.to_string(),
requires_performance_data: false,
requires_user_feedback: false,
score_range: (0.0, 1.0),
higher_is_better: true,
evaluation_level: EvaluationLevel::System,
}
}
}
};
}
struct OverallQualityMetric;
impl OverallQualityMetric {
fn new() -> Self {
Self
}
}
impl E2EMetric for OverallQualityMetric {
fn name(&self) -> &str {
"overall_quality"
}
fn metric_type(&self) -> E2EMetricType {
E2EMetricType::OverallQuality
}
    fn evaluate_system(&self, data: &EvaluationData, _metrics: &SystemMetrics) -> RragResult<f32> {
        // Guard against empty query sets to avoid division by zero.
        if data.queries.is_empty() {
            return Ok(0.0);
        }
        let successful_responses = data
            .system_responses
            .iter()
            .filter(|r| r.generated_answer.is_some() && !r.retrieved_docs.is_empty())
            .count();
        let quality_score = successful_responses as f32 / data.queries.len() as f32;
Ok(quality_score)
}
fn get_config(&self) -> E2EMetricConfig {
E2EMetricConfig {
name: "overall_quality".to_string(),
requires_performance_data: false,
requires_user_feedback: false,
score_range: (0.0, 1.0),
higher_is_better: true,
evaluation_level: EvaluationLevel::System,
}
}
}
struct ConsistencyMetric {
config: SystemReliabilityConfig,
}
impl ConsistencyMetric {
fn new(config: SystemReliabilityConfig) -> Self {
Self { config }
}
}
impl E2EMetric for ConsistencyMetric {
fn name(&self) -> &str {
"consistency"
}
fn metric_type(&self) -> E2EMetricType {
E2EMetricType::Consistency
}
fn evaluate_system(&self, data: &EvaluationData, _metrics: &SystemMetrics) -> RragResult<f32> {
let response_times: Vec<f32> = data
.system_responses
.iter()
.map(|r| r.timing.total_time_ms)
.collect();
if response_times.is_empty() {
return Ok(0.0);
}
let mean_time = response_times.iter().sum::<f32>() / response_times.len() as f32;
let variance = response_times
.iter()
.map(|t| (t - mean_time).powi(2))
.sum::<f32>()
/ response_times.len() as f32;
let std_dev = variance.sqrt();
let cv = if mean_time > 0.0 {
std_dev / mean_time
} else {
0.0
};
let consistency = (1.0 - cv).max(0.0);
Ok(consistency)
}
fn get_config(&self) -> E2EMetricConfig {
E2EMetricConfig {
name: "consistency".to_string(),
requires_performance_data: true,
requires_user_feedback: false,
score_range: (0.0, 1.0),
higher_is_better: true,
evaluation_level: EvaluationLevel::System,
}
}
}
struct ResourceEfficiencyMetric {
thresholds: PerformanceThresholds,
}
impl ResourceEfficiencyMetric {
fn new(thresholds: PerformanceThresholds) -> Self {
Self { thresholds }
}
}
impl E2EMetric for ResourceEfficiencyMetric {
fn name(&self) -> &str {
"resource_efficiency"
}
fn metric_type(&self) -> E2EMetricType {
E2EMetricType::ResourceEfficiency
}
fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
let memory_score = if metrics.memory_usage_mb <= self.thresholds.max_memory_usage_mb {
1.0 - (metrics.memory_usage_mb / self.thresholds.max_memory_usage_mb) * 0.3
} else {
0.7 * (self.thresholds.max_memory_usage_mb / metrics.memory_usage_mb)
};
let cpu_score = if metrics.cpu_usage_percent <= 80.0 {
1.0 - (metrics.cpu_usage_percent / 100.0) * 0.2
} else {
0.8 * (80.0 / metrics.cpu_usage_percent)
};
let efficiency = (memory_score + cpu_score) / 2.0;
Ok(efficiency.min(1.0))
}
fn get_config(&self) -> E2EMetricConfig {
E2EMetricConfig {
name: "resource_efficiency".to_string(),
requires_performance_data: true,
requires_user_feedback: false,
score_range: (0.0, 1.0),
higher_is_better: true,
evaluation_level: EvaluationLevel::System,
}
}
}
struct ErrorRateMetric {
config: SystemReliabilityConfig,
}
impl ErrorRateMetric {
fn new(config: SystemReliabilityConfig) -> Self {
Self { config }
}
}
impl E2EMetric for ErrorRateMetric {
fn name(&self) -> &str {
"error_rate"
}
fn metric_type(&self) -> E2EMetricType {
E2EMetricType::ErrorRate
}
fn evaluate_system(&self, _data: &EvaluationData, metrics: &SystemMetrics) -> RragResult<f32> {
let score = if metrics.error_rate <= self.config.acceptable_failure_rate * 100.0 {
1.0 - (metrics.error_rate / 100.0) * 0.1
} else {
let excess = metrics.error_rate - (self.config.acceptable_failure_rate * 100.0);
(0.9 - excess / 100.0 * 2.0).max(0.0)
};
Ok(score)
}
fn get_config(&self) -> E2EMetricConfig {
E2EMetricConfig {
name: "error_rate".to_string(),
requires_performance_data: true,
requires_user_feedback: false,
score_range: (0.0, 1.0),
higher_is_better: true,
evaluation_level: EvaluationLevel::System,
}
}
}
impl_simple_e2e_metric!(
RobustnessMetric,
"robustness",
E2EMetricType::Robustness,
0.8
);
impl_simple_e2e_metric!(UsabilityMetric, "usability", E2EMetricType::Usability, 0.85);
#[cfg(test)]
mod tests {
use super::*;
use crate::evaluation::{
GroundTruth, RetrievedDocument, SystemResponse, SystemTiming, TestQuery,
};
#[test]
fn test_user_satisfaction_metric() {
let config = UserSatisfactionConfig::default();
let metric = UserSatisfactionMetric::new(config);
let data = create_test_data();
let system_metrics = SystemMetrics::default();
let score = metric.evaluate_system(&data, &system_metrics).unwrap();
assert!(score >= 0.0 && score <= 1.0);
}
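    // A sketch of the consistency check: the fixture contains a single response, so
    // the response-time variance is zero and the metric should report full consistency.
    #[test]
    fn test_consistency_metric() {
        let metric = ConsistencyMetric::new(SystemReliabilityConfig::default());
        let data = create_test_data();
        let system_metrics = SystemMetrics::default();
        let score = metric.evaluate_system(&data, &system_metrics).unwrap();
        assert!((score - 1.0).abs() < 1e-6);
    }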
#[test]
fn test_system_latency_metric() {
let thresholds = PerformanceThresholds::default();
let metric = SystemLatencyMetric::new(thresholds);
let data = create_test_data();
let mut system_metrics = SystemMetrics::default();
system_metrics.avg_response_time_ms = 1500.0;
let score = metric.evaluate_system(&data, &system_metrics).unwrap();
        assert!(score > 0.5);
    }
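    // A sketch of the throughput scoring: the default system metrics report 50 QPS,
    // well above the 10 QPS minimum, so the score should saturate at 1.0.
    #[test]
    fn test_system_throughput_metric() {
        let metric = SystemThroughputMetric::new(PerformanceThresholds::default());
        let data = create_test_data();
        let system_metrics = SystemMetrics::default();
        let score = metric.evaluate_system(&data, &system_metrics).unwrap();
        assert!((score - 1.0).abs() < 1e-6);
    }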
#[test]
fn test_end_to_end_evaluator() {
let config = EndToEndConfig::default();
let evaluator = EndToEndEvaluator::new(config);
assert_eq!(evaluator.name(), "EndToEnd");
assert!(!evaluator.supported_metrics().is_empty());
}
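    // A sketch of the weighting behaviour: the overall score is the weighted mean
    // of whichever component scores are present, normalized by the weights used.
    #[test]
    fn test_overall_score_weighting() {
        let evaluator = EndToEndEvaluator::new(EndToEndConfig::default());
        let mut scores = HashMap::new();
        scores.insert("user_satisfaction".to_string(), 0.8);
        scores.insert("overall_quality".to_string(), 0.6);
        let overall = evaluator.calculate_overall_score(&scores);
        // Weighted mean of 0.8 (weight 0.4) and 0.6 (weight 0.2) is roughly 0.733.
        assert!(overall > 0.6 && overall < 0.8);
    }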
fn create_test_data() -> EvaluationData {
use super::super::*;
EvaluationData {
queries: vec![TestQuery {
id: "q1".to_string(),
query: "What is machine learning?".to_string(),
query_type: None,
metadata: HashMap::new(),
}],
ground_truth: vec![GroundTruth {
query_id: "q1".to_string(),
relevant_docs: vec!["doc1".to_string()],
expected_answer: Some("ML is AI subset".to_string()),
relevance_judgments: HashMap::new(),
metadata: HashMap::new(),
}],
system_responses: vec![SystemResponse {
query_id: "q1".to_string(),
retrieved_docs: vec![RetrievedDocument {
doc_id: "doc1".to_string(),
content: "Machine learning content".to_string(),
score: 0.9,
rank: 0,
metadata: HashMap::new(),
}],
generated_answer: Some("Machine learning is...".to_string()),
timing: SystemTiming {
total_time_ms: 1000.0,
retrieval_time_ms: 500.0,
generation_time_ms: Some(400.0),
reranking_time_ms: Some(100.0),
},
metadata: HashMap::new(),
}],
context: HashMap::new(),
}
}
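    // A sketch of the error-rate scoring: the default system metrics report a 1%
    // error rate, which matches the default acceptable failure rate, so the score
    // should remain close to 1.0.
    #[test]
    fn test_error_rate_metric() {
        let metric = ErrorRateMetric::new(SystemReliabilityConfig::default());
        let data = create_test_data();
        let system_metrics = SystemMetrics::default();
        let score = metric.evaluate_system(&data, &system_metrics).unwrap();
        assert!(score > 0.9);
    }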
}