pub mod benchmarks;
pub mod end_to_end;
pub mod generation_eval;
pub mod metrics;
pub mod ragas;
pub mod retrieval_eval;
use crate::{RragError, RragResult};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Orchestrates evaluation runs over a RAG system: owns the configuration,
/// one boxed evaluator per enabled [`EvaluationType`], and a metrics
/// collector that records timing/error counters while `evaluate` runs.
pub struct EvaluationService {
    // Full evaluation settings; per-evaluator sub-configs are cloned into
    // the evaluators during initialization.
    config: EvaluationConfig,
    // One evaluator instance per enabled evaluation type
    // (populated by `initialize_evaluators`).
    evaluators: HashMap<EvaluationType, Box<dyn Evaluator>>,
    // Sink for runtime metrics (per-evaluator timings, error counts).
    metrics_collector: Box<dyn MetricsCollector>,
}
/// Top-level configuration for an evaluation run: which evaluation types to
/// run, the per-evaluator sub-configurations, and how results are exported.
#[derive(Debug, Clone)]
pub struct EvaluationConfig {
    /// Evaluation types to instantiate and run (see `EvaluationType`).
    pub enabled_evaluations: Vec<EvaluationType>,
    /// Settings for the RAGAS evaluator.
    pub ragas_config: ragas::RagasConfig,
    /// Settings for the retrieval-quality evaluator.
    pub retrieval_config: retrieval_eval::RetrievalEvalConfig,
    /// Settings for the generation-quality evaluator.
    pub generation_config: generation_eval::GenerationEvalConfig,
    /// Settings for the end-to-end evaluator.
    pub e2e_config: end_to_end::EndToEndConfig,
    /// Output directory and export formats for results/reports.
    pub output_config: OutputConfig,
}
impl Default for EvaluationConfig {
    /// Enables the RAGAS, retrieval, and generation evaluations out of the
    /// box; end-to-end and benchmark runs are opt-in. All sub-configurations
    /// take their own defaults.
    fn default() -> Self {
        let enabled_evaluations = vec![
            EvaluationType::Ragas,
            EvaluationType::Retrieval,
            EvaluationType::Generation,
        ];
        Self {
            enabled_evaluations,
            ragas_config: Default::default(),
            retrieval_config: Default::default(),
            generation_config: Default::default(),
            e2e_config: Default::default(),
            output_config: Default::default(),
        }
    }
}
/// Kinds of evaluation the service can run; also used as the key type for
/// the per-evaluator maps, hence the `Hash`/`Eq` derives.
#[derive(Debug, Clone, Hash, PartialEq, Eq, Serialize, Deserialize)]
pub enum EvaluationType {
    /// RAGAS-style metrics (faithfulness, relevance, etc. — see `ragas`).
    Ragas,
    /// Retrieval quality against ground-truth relevance judgments.
    Retrieval,
    /// Quality of the generated answers.
    Generation,
    /// Whole-pipeline (retrieval + generation) evaluation.
    EndToEnd,
    /// Performance benchmarking.
    Benchmark,
}
/// Controls where and in which formats evaluation results are written.
#[derive(Debug, Clone)]
pub struct OutputConfig {
    /// Formats to export (each produces one file in `output_dir`).
    pub export_formats: Vec<ExportFormat>,
    /// Directory for all generated files; created if missing.
    pub output_dir: String,
    /// Whether to include detailed per-query logs in the output.
    pub include_detailed_logs: bool,
    /// Whether to generate visualization artifacts (off by default).
    pub generate_visualizations: bool,
}
impl Default for OutputConfig {
    /// Exports JSON and CSV into `./evaluation_results`, with detailed logs
    /// on and visualizations off.
    fn default() -> Self {
        let export_formats = vec![ExportFormat::Json, ExportFormat::Csv];
        Self {
            export_formats,
            output_dir: String::from("./evaluation_results"),
            include_detailed_logs: true,
            generate_visualizations: false,
        }
    }
}
/// Supported export formats for results and reports.
/// Note: `DefaultMetricsCollector::export_metrics` supports only `Json`.
#[derive(Debug, Clone)]
pub enum ExportFormat {
    Json,
    Csv,
    Html,
    Markdown,
}
/// A single evaluation strategy (RAGAS, retrieval, generation, ...).
/// `Send + Sync` so boxed evaluators can be shared across threads.
pub trait Evaluator: Send + Sync {
    /// Human-readable evaluator name, used in logs and metric labels.
    fn name(&self) -> &str;
    /// Runs the evaluation over the full dataset and returns aggregate +
    /// per-query results.
    fn evaluate(&self, evaluation_data: &EvaluationData) -> RragResult<EvaluationResult>;
    /// Names of the metrics this evaluator can produce.
    fn supported_metrics(&self) -> Vec<String>;
    /// Static description of the evaluator (name, version, metrics, perf).
    fn get_config(&self) -> EvaluatorConfig;
}
/// Static, self-reported description of an evaluator.
#[derive(Debug, Clone)]
pub struct EvaluatorConfig {
    pub name: String,
    pub version: String,
    /// Metric names this evaluator produces.
    pub metrics: Vec<String>,
    /// Expected runtime characteristics.
    pub performance: EvaluatorPerformance,
}
/// Expected performance characteristics of an evaluator.
#[derive(Debug, Clone)]
pub struct EvaluatorPerformance {
    /// Average wall-clock time to evaluate one sample, in milliseconds.
    pub avg_time_per_sample_ms: f32,
    /// Approximate memory footprint in megabytes.
    pub memory_usage_mb: f32,
    /// Self-reported accuracy of the evaluator itself
    /// (presumably in [0, 1] — TODO confirm against evaluator impls).
    pub accuracy: f32,
}
/// Complete input for an evaluation run: the test queries, the expected
/// (ground-truth) outcomes, and the system's actual responses.
/// Entries are correlated across the three vectors via `query_id`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationData {
    pub queries: Vec<TestQuery>,
    pub ground_truth: Vec<GroundTruth>,
    pub system_responses: Vec<SystemResponse>,
    /// Free-form run-level context available to evaluators.
    pub context: HashMap<String, serde_json::Value>,
}
/// A single test query in the evaluation dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestQuery {
    /// Unique id linking this query to its `GroundTruth`/`SystemResponse`.
    pub id: String,
    /// The query text itself.
    pub query: String,
    /// Optional category tag (e.g. "factual") — free-form string.
    pub query_type: Option<String>,
    pub metadata: HashMap<String, serde_json::Value>,
}
/// Expected outcome for one query: which documents are relevant and,
/// optionally, the reference answer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundTruth {
    /// Matches `TestQuery::id`.
    pub query_id: String,
    /// Ids of documents considered relevant to the query.
    pub relevant_docs: Vec<String>,
    /// Reference answer, when one exists.
    pub expected_answer: Option<String>,
    /// Per-document graded relevance (doc id -> score).
    pub relevance_judgments: HashMap<String, f32>,
    pub metadata: HashMap<String, serde_json::Value>,
}
/// What the system under test actually produced for one query.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemResponse {
    /// Matches `TestQuery::id`.
    pub query_id: String,
    /// Documents returned by retrieval, with scores and ranks.
    pub retrieved_docs: Vec<RetrievedDocument>,
    /// Generated answer, if the pipeline includes generation.
    pub generated_answer: Option<String>,
    /// Wall-clock timings for the pipeline stages.
    pub timing: SystemTiming,
    pub metadata: HashMap<String, serde_json::Value>,
}
/// One document returned by the retriever.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievedDocument {
    pub doc_id: String,
    pub content: String,
    /// Retrieval score as reported by the retriever.
    pub score: f32,
    /// Position in the result list (presumably 0- or 1-based — TODO
    /// confirm against the retriever that populates this).
    pub rank: usize,
    pub metadata: HashMap<String, serde_json::Value>,
}
/// Wall-clock timing breakdown for one system response, in milliseconds.
/// Optional stages are `None` when the pipeline does not include them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemTiming {
    pub total_time_ms: f32,
    pub retrieval_time_ms: f32,
    pub generation_time_ms: Option<f32>,
    pub reranking_time_ms: Option<f32>,
}
/// Full output of one evaluator: aggregate scores, per-query breakdown,
/// summary statistics, and provenance metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationResult {
    /// Identifier for this result set.
    pub id: String,
    /// Name of the evaluation that produced this result.
    pub evaluation_type: String,
    /// Aggregate score per metric name.
    pub overall_scores: HashMap<String, f32>,
    pub per_query_results: Vec<QueryEvaluationResult>,
    pub summary: EvaluationSummary,
    pub metadata: EvaluationMetadata,
}
/// Per-query scores and diagnostics within an `EvaluationResult`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryEvaluationResult {
    /// Matches `TestQuery::id`.
    pub query_id: String,
    /// Score per metric name for this query.
    pub scores: HashMap<String, f32>,
    /// Problems encountered while evaluating this query.
    pub errors: Vec<EvaluationError>,
    /// Evaluator-specific detail payload.
    pub details: HashMap<String, serde_json::Value>,
}
/// A non-fatal problem recorded while evaluating a single query.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationError {
    /// Machine-readable error category.
    pub error_type: String,
    /// Human-readable description.
    pub message: String,
    pub severity: ErrorSeverity,
    /// Suggested remediations, if any.
    pub suggestions: Vec<String>,
}
/// Severity of an `EvaluationError`, from least to most serious.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ErrorSeverity {
    Low,
    Medium,
    High,
    Critical,
}
/// Aggregate statistics and narrative findings for one evaluation run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationSummary {
    pub total_queries: usize,
    /// Mean score per metric name.
    pub avg_scores: HashMap<String, f32>,
    /// Standard deviation per metric name.
    pub std_deviations: HashMap<String, f32>,
    pub performance_stats: PerformanceStats,
    /// Human-readable observations (rendered in HTML/Markdown reports).
    pub insights: Vec<String>,
    /// Suggested improvements (rendered in the Markdown report).
    pub recommendations: Vec<String>,
}
/// Runtime statistics for the evaluation itself (not the system under test).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceStats {
    pub avg_eval_time_ms: f32,
    pub total_eval_time_ms: f32,
    pub peak_memory_usage_mb: f32,
    /// Evaluation throughput in queries per second.
    pub throughput_qps: f32,
}
/// Provenance for an evaluation result: when it ran, with what versions,
/// and under which system/environment configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationMetadata {
    pub timestamp: chrono::DateTime<chrono::Utc>,
    /// Version of the evaluation code/framework.
    pub evaluation_version: String,
    /// Snapshot of the evaluated system's configuration.
    pub system_config: HashMap<String, serde_json::Value>,
    /// Relevant environment variables / runtime environment description.
    pub environment: HashMap<String, String>,
    /// Git commit of the system under test, when known.
    pub git_commit: Option<String>,
}
/// Sink for runtime metrics recorded during an evaluation run.
/// Implementations decide how samples are stored and exported.
pub trait MetricsCollector: Send + Sync {
    /// Begins a collection session; may reset previously recorded data
    /// (the default implementation does).
    fn start_collection(&mut self) -> RragResult<()>;
    /// Ends the collection session.
    fn stop_collection(&mut self) -> RragResult<()>;
    /// Records one sample of metric `name`, optionally tagged with labels.
    fn record_metric(
        &mut self,
        name: &str,
        value: f32,
        labels: Option<&HashMap<String, String>>,
    ) -> RragResult<()>;
    /// Returns all recorded samples, grouped by metric name.
    fn get_metrics(&self) -> RragResult<HashMap<String, Vec<MetricRecord>>>;
    /// Writes the recorded metrics to `output_path` in the given format.
    fn export_metrics(&self, format: &ExportFormat, output_path: &str) -> RragResult<()>;
}
/// One timestamped metric sample.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricRecord {
    pub name: String,
    pub value: f32,
    /// When the sample was recorded (UTC).
    pub timestamp: chrono::DateTime<chrono::Utc>,
    /// Dimension labels (e.g. which evaluator produced the sample).
    pub labels: HashMap<String, String>,
}
impl EvaluationService {
    /// Creates a service with one evaluator instantiated per enabled
    /// evaluation type and the default in-memory metrics collector.
    pub fn new(config: EvaluationConfig) -> Self {
        let mut service = Self {
            // `config` is owned here, so storing it needs no clone.
            config,
            evaluators: HashMap::new(),
            metrics_collector: Box::new(DefaultMetricsCollector::new()),
        };
        service.initialize_evaluators();
        service
    }

    /// Builds the `{"evaluator": <name>}` label map attached to
    /// per-evaluator metric samples.
    fn evaluator_labels(name: &str) -> HashMap<String, String> {
        HashMap::from([("evaluator".to_string(), name.to_string())])
    }

    /// Instantiates a boxed evaluator for each enabled `EvaluationType`,
    /// cloning the matching sub-configuration into it.
    fn initialize_evaluators(&mut self) {
        for eval_type in &self.config.enabled_evaluations {
            let evaluator: Box<dyn Evaluator> = match eval_type {
                EvaluationType::Ragas => {
                    Box::new(ragas::RagasEvaluator::new(self.config.ragas_config.clone()))
                }
                EvaluationType::Retrieval => Box::new(retrieval_eval::RetrievalEvaluator::new(
                    self.config.retrieval_config.clone(),
                )),
                EvaluationType::Generation => Box::new(generation_eval::GenerationEvaluator::new(
                    self.config.generation_config.clone(),
                )),
                EvaluationType::EndToEnd => Box::new(end_to_end::EndToEndEvaluator::new(
                    self.config.e2e_config.clone(),
                )),
                EvaluationType::Benchmark => Box::new(benchmarks::BenchmarkEvaluator::new()),
            };
            self.evaluators.insert(eval_type.clone(), evaluator);
        }
    }

    /// Runs every configured evaluator against `data`, recording
    /// per-evaluator timing and error metrics.
    ///
    /// A failing evaluator is logged to stderr and counted in the
    /// `evaluation_errors` metric; the remaining evaluators still run and
    /// the failed one is simply absent from the returned map.
    ///
    /// NOTE(review): declared `async` for API compatibility but contains no
    /// `.await` — evaluators currently run synchronously on this task.
    pub async fn evaluate(
        &mut self,
        data: EvaluationData,
    ) -> RragResult<HashMap<EvaluationType, EvaluationResult>> {
        let mut results = HashMap::new();
        self.metrics_collector.start_collection()?;
        let start_time = std::time::Instant::now();
        for (eval_type, evaluator) in &self.evaluators {
            println!("Running {} evaluation...", evaluator.name());
            let eval_start = std::time::Instant::now();
            let labels = Self::evaluator_labels(evaluator.name());
            match evaluator.evaluate(&data) {
                Ok(result) => {
                    let eval_time = eval_start.elapsed().as_millis() as f32;
                    self.metrics_collector.record_metric(
                        "evaluation_time_ms",
                        eval_time,
                        Some(&labels),
                    )?;
                    results.insert(eval_type.clone(), result);
                    println!(
                        "✅ {} evaluation completed in {:.2}ms",
                        evaluator.name(),
                        eval_time
                    );
                }
                Err(e) => {
                    eprintln!("❌ {} evaluation failed: {}", evaluator.name(), e);
                    // Count the failure but keep going with the others.
                    self.metrics_collector.record_metric(
                        "evaluation_errors",
                        1.0,
                        Some(&labels),
                    )?;
                }
            }
        }
        let total_time = start_time.elapsed().as_millis() as f32;
        self.metrics_collector
            .record_metric("total_evaluation_time_ms", total_time, None)?;
        self.metrics_collector.stop_collection()?;
        Ok(results)
    }

    /// Exports `results` to the output directory in every configured format,
    /// creating the directory if needed.
    pub async fn export_results(
        &self,
        results: &HashMap<EvaluationType, EvaluationResult>,
    ) -> RragResult<()> {
        std::fs::create_dir_all(&self.config.output_config.output_dir).map_err(|e| {
            RragError::evaluation(format!("Failed to create output directory: {}", e))
        })?;
        for format in &self.config.output_config.export_formats {
            match format {
                ExportFormat::Json => self.export_json(results).await?,
                ExportFormat::Csv => self.export_csv(results).await?,
                ExportFormat::Html => self.export_html(results).await?,
                ExportFormat::Markdown => self.export_markdown(results).await?,
            }
        }
        Ok(())
    }

    /// Writes the full result map as pretty-printed JSON to
    /// `<output_dir>/evaluation_results.json`.
    async fn export_json(
        &self,
        results: &HashMap<EvaluationType, EvaluationResult>,
    ) -> RragResult<()> {
        let json_path = format!(
            "{}/evaluation_results.json",
            self.config.output_config.output_dir
        );
        let json_content = serde_json::to_string_pretty(results)
            .map_err(|e| RragError::evaluation(format!("Failed to serialize results: {}", e)))?;
        std::fs::write(&json_path, json_content)
            .map_err(|e| RragError::evaluation(format!("Failed to write JSON file: {}", e)))?;
        println!("✅ Results exported to {}", json_path);
        Ok(())
    }

    /// Writes one `evaluator,metric,value` row per overall score to
    /// `<output_dir>/evaluation_summary.csv`.
    ///
    /// NOTE(review): fields are not CSV-quoted; this is fine as long as
    /// metric names contain no commas — confirm against the evaluators.
    async fn export_csv(
        &self,
        results: &HashMap<EvaluationType, EvaluationResult>,
    ) -> RragResult<()> {
        let csv_path = format!(
            "{}/evaluation_summary.csv",
            self.config.output_config.output_dir
        );
        let mut csv_content = String::new();
        csv_content.push_str("evaluator,metric,value\n");
        for (eval_type, result) in results {
            for (metric, value) in &result.overall_scores {
                csv_content.push_str(&format!("{:?},{},{}\n", eval_type, metric, value));
            }
        }
        std::fs::write(&csv_path, csv_content)
            .map_err(|e| RragError::evaluation(format!("Failed to write CSV file: {}", e)))?;
        println!("✅ Summary exported to {}", csv_path);
        Ok(())
    }

    /// Renders a standalone HTML report (scores table + insights per
    /// evaluator) to `<output_dir>/evaluation_report.html`.
    ///
    /// NOTE(review): metric names and insights are interpolated without HTML
    /// escaping; safe only while they are internally generated strings.
    async fn export_html(
        &self,
        results: &HashMap<EvaluationType, EvaluationResult>,
    ) -> RragResult<()> {
        let html_path = format!(
            "{}/evaluation_report.html",
            self.config.output_config.output_dir
        );
        // Static document head and styles; the generation timestamp and the
        // per-evaluator sections are appended below.
        let mut html_content = String::from(
            r#"
<!DOCTYPE html>
<html>
<head>
    <title>RRAG Evaluation Report</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; }
        .header { border-bottom: 2px solid #333; margin-bottom: 30px; }
        .evaluator { margin-bottom: 40px; border: 1px solid #ddd; padding: 20px; }
        .metric { margin: 10px 0; }
        .score { font-weight: bold; color: #2196F3; }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
    </style>
</head>
<body>
    <div class="header">
        <h1>🎯 RRAG Evaluation Report</h1>
        <p>Generated on: "#,
        );
        html_content.push_str(
            &chrono::Utc::now()
                .format("%Y-%m-%d %H:%M:%S UTC")
                .to_string(),
        );
        html_content.push_str("</p>\n    </div>\n");
        for (eval_type, result) in results {
            html_content.push_str(&format!(
                r#"
    <div class="evaluator">
        <h2>📊 {:?} Evaluation</h2>
        <h3>Overall Scores</h3>
        <table>
            <tr><th>Metric</th><th>Score</th></tr>"#,
                eval_type
            ));
            for (metric, score) in &result.overall_scores {
                html_content.push_str(&format!(
                    "<tr><td>{}</td><td class=\"score\">{:.4}</td></tr>",
                    metric, score
                ));
            }
            html_content.push_str("</table>\n");
            if !result.summary.insights.is_empty() {
                html_content.push_str("<h3>Key Insights</h3><ul>");
                for insight in &result.summary.insights {
                    html_content.push_str(&format!("<li>{}</li>", insight));
                }
                html_content.push_str("</ul>");
            }
            html_content.push_str("    </div>\n");
        }
        html_content.push_str("</body>\n</html>");
        std::fs::write(&html_path, html_content)
            .map_err(|e| RragError::evaluation(format!("Failed to write HTML file: {}", e)))?;
        println!("✅ Report exported to {}", html_path);
        Ok(())
    }

    /// Renders a Markdown report (scores, insights, recommendations per
    /// evaluator) to `<output_dir>/evaluation_report.md`.
    async fn export_markdown(
        &self,
        results: &HashMap<EvaluationType, EvaluationResult>,
    ) -> RragResult<()> {
        let md_path = format!(
            "{}/evaluation_report.md",
            self.config.output_config.output_dir
        );
        let mut md_content = String::from("# 🎯 RRAG Evaluation Report\n\n");
        md_content.push_str(&format!(
            "**Generated on:** {}\n\n",
            chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
        ));
        for (eval_type, result) in results {
            md_content.push_str(&format!("## 📊 {:?} Evaluation\n\n", eval_type));
            md_content.push_str("### Overall Scores\n\n");
            md_content.push_str("| Metric | Score |\n|--------|-------|\n");
            for (metric, score) in &result.overall_scores {
                md_content.push_str(&format!("| {} | {:.4} |\n", metric, score));
            }
            if !result.summary.insights.is_empty() {
                md_content.push_str("\n### Key Insights\n\n");
                for insight in &result.summary.insights {
                    md_content.push_str(&format!("- {}\n", insight));
                }
            }
            if !result.summary.recommendations.is_empty() {
                md_content.push_str("\n### Recommendations\n\n");
                for recommendation in &result.summary.recommendations {
                    md_content.push_str(&format!("- {}\n", recommendation));
                }
            }
            md_content.push_str("\n---\n\n");
        }
        std::fs::write(&md_path, md_content)
            .map_err(|e| RragError::evaluation(format!("Failed to write Markdown file: {}", e)))?;
        println!("✅ Markdown report exported to {}", md_path);
        Ok(())
    }

    /// Returns a snapshot of all metrics recorded by the collector.
    pub fn get_metrics(&self) -> RragResult<HashMap<String, Vec<MetricRecord>>> {
        self.metrics_collector.get_metrics()
    }
}
/// In-memory `MetricsCollector`: stores samples in a map keyed by metric
/// name and only accepts samples between `start_collection` and
/// `stop_collection`.
pub struct DefaultMetricsCollector {
    // Recorded samples, grouped by metric name.
    metrics: HashMap<String, Vec<MetricRecord>>,
    // Whether a collection session is active; samples recorded while
    // false are dropped.
    collecting: bool,
}
impl DefaultMetricsCollector {
pub fn new() -> Self {
Self {
metrics: HashMap::new(),
collecting: false,
}
}
}
impl MetricsCollector for DefaultMetricsCollector {
    /// Starts a session, discarding any previously recorded samples.
    fn start_collection(&mut self) -> RragResult<()> {
        self.collecting = true;
        self.metrics.clear();
        Ok(())
    }

    /// Ends the session; recorded samples remain readable via `get_metrics`.
    fn stop_collection(&mut self) -> RragResult<()> {
        self.collecting = false;
        Ok(())
    }

    /// Appends one timestamped sample under `name`.
    /// Samples arriving while no session is active are silently dropped
    /// (deliberate best-effort behavior, not an error).
    fn record_metric(
        &mut self,
        name: &str,
        value: f32,
        labels: Option<&HashMap<String, String>>,
    ) -> RragResult<()> {
        if !self.collecting {
            return Ok(());
        }
        let record = MetricRecord {
            name: name.to_string(),
            value,
            timestamp: chrono::Utc::now(),
            labels: labels.cloned().unwrap_or_default(),
        };
        // `or_default()` is the idiomatic form of `or_insert_with(Vec::new)`.
        self.metrics.entry(name.to_string()).or_default().push(record);
        Ok(())
    }

    /// Returns a snapshot (clone) of all recorded samples.
    fn get_metrics(&self) -> RragResult<HashMap<String, Vec<MetricRecord>>> {
        Ok(self.metrics.clone())
    }

    /// Writes the recorded metrics to `output_path`.
    /// Only `ExportFormat::Json` is supported; other formats return an
    /// evaluation error.
    fn export_metrics(&self, format: &ExportFormat, output_path: &str) -> RragResult<()> {
        match format {
            ExportFormat::Json => {
                let json_content = serde_json::to_string_pretty(&self.metrics).map_err(|e| {
                    RragError::evaluation(format!("Failed to serialize metrics: {}", e))
                })?;
                std::fs::write(output_path, json_content).map_err(|e| {
                    RragError::evaluation(format!("Failed to write metrics file: {}", e))
                })?;
            }
            _ => {
                return Err(RragError::evaluation(
                    "Unsupported export format for metrics".to_string(),
                ));
            }
        }
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default config must enable the three core evaluation types.
    #[test]
    fn test_evaluation_config_creation() {
        let config = EvaluationConfig::default();
        for expected in [
            EvaluationType::Ragas,
            EvaluationType::Retrieval,
            EvaluationType::Generation,
        ] {
            assert!(config.enabled_evaluations.contains(&expected));
        }
    }

    /// `EvaluationData` can be assembled from queries and ground truth.
    #[test]
    fn test_evaluation_data_creation() {
        let data = EvaluationData {
            queries: vec![TestQuery {
                id: "test_1".to_string(),
                query: "What is machine learning?".to_string(),
                query_type: Some("factual".to_string()),
                metadata: HashMap::new(),
            }],
            ground_truth: vec![GroundTruth {
                query_id: "test_1".to_string(),
                relevant_docs: vec!["doc_1".to_string(), "doc_2".to_string()],
                expected_answer: Some("Machine learning is...".to_string()),
                relevance_judgments: HashMap::new(),
                metadata: HashMap::new(),
            }],
            system_responses: vec![],
            context: HashMap::new(),
        };
        assert_eq!(data.queries.len(), 1);
        assert_eq!(data.ground_truth.len(), 1);
    }

    /// A sample recorded during an active session is stored and retrievable.
    #[test]
    fn test_metrics_collector() {
        let mut collector = DefaultMetricsCollector::new();
        collector.start_collection().unwrap();
        collector.record_metric("test_metric", 0.85, None).unwrap();
        collector.stop_collection().unwrap();

        let metrics = collector.get_metrics().unwrap();
        let records = metrics
            .get("test_metric")
            .expect("metric should have been recorded");
        assert_eq!(records.len(), 1);
        assert_eq!(records[0].value, 0.85);
    }
}