#![deny(missing_docs)]
#![allow(clippy::redundant_closure)]
pub mod export;
pub mod metrics;
pub mod runner;
pub use export::EvalReportExport;
pub use metrics::{JudgeProvider, LlmJudgeMetric, SchemaValidationMetric, ToolUsageMetric};
pub use runner::{AsyncMetric, EvalAgent, EvalRunner, SyncMetricAdapter};
use serde::{Deserialize, Serialize};
/// A named, ordered collection of [`TestCase`]s to be executed together.
///
/// Built fluently: [`EvalSuite::new`] followed by chained [`EvalSuite::add_case`] calls.
// Debug/Clone added for consistency with the other public types in this file
// (TestCase, TestResult, EvalReport all derive them) and because
// `#![deny(missing_docs)]`-grade public API should at minimum be Debug.
#[derive(Debug, Clone)]
pub struct EvalSuite {
    // Human-readable suite name, surfaced via `name()`.
    name: String,
    // Cases in insertion order, surfaced via `cases()`.
    cases: Vec<TestCase>,
}

impl EvalSuite {
    /// Creates an empty suite with the given name.
    #[must_use]
    pub fn new(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            cases: Vec::new(),
        }
    }

    /// Appends a test case and returns the suite (builder style).
    #[must_use]
    pub fn add_case(mut self, case: TestCase) -> Self {
        self.cases.push(case);
        self
    }

    /// Returns the suite name.
    #[must_use]
    pub fn name(&self) -> &str {
        &self.name
    }

    /// Returns the registered test cases in insertion order.
    #[must_use]
    pub fn cases(&self) -> &[TestCase] {
        &self.cases
    }
}
/// A single evaluation case: an input prompt plus the expectations used to
/// score the agent's output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestCase {
    /// Identifier echoed back in the corresponding [`TestResult::case_id`].
    pub id: String,
    /// The prompt handed to the agent under evaluation.
    pub input: String,
    /// Keywords the output is expected to contain.
    pub expected_keywords: Vec<String>,
    /// Optional exact expected output.
    pub expected_output: Option<String>,
}

impl TestCase {
    /// Creates a case with the given id and input and no expectations yet.
    #[must_use]
    pub fn new(id: impl Into<String>, input: impl Into<String>) -> Self {
        Self {
            id: id.into(),
            input: input.into(),
            expected_keywords: vec![],
            expected_output: None,
        }
    }

    /// Builder style: adds a keyword expected to appear in the output.
    #[must_use]
    pub fn expect_contains(self, keyword: impl Into<String>) -> Self {
        let mut case = self;
        case.expected_keywords.push(keyword.into());
        case
    }

    /// Builder style: sets the exact expected output.
    #[must_use]
    pub fn expect_output(self, output: impl Into<String>) -> Self {
        Self {
            expected_output: Some(output.into()),
            ..self
        }
    }
}
/// The recorded outcome of running a single test case.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestResult {
    /// Id of the [`TestCase`] this result belongs to.
    pub case_id: String,
    /// Raw output produced by the agent for the case's input.
    pub actual_output: String,
    /// Score per metric — presumably keyed by [`Metric::name`]; populated by
    /// the runner (not visible in this file), so confirm against `runner`.
    pub scores: std::collections::HashMap<String, f64>,
    /// Overall pass/fail verdict for the case (criteria decided by the runner).
    pub passed: bool,
}
/// Aggregated results for one completed suite run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
    /// Name of the suite that produced this report.
    pub suite_name: String,
    /// Per-case results.
    pub results: Vec<TestResult>,
    /// Mean score across the run.
    pub average_score: f64,
    /// Number of cases that passed.
    pub passed: usize,
    /// Total number of cases run.
    pub total: usize,
}

impl EvalReport {
    /// Renders a short, human-readable summary of the run.
    #[must_use]
    pub fn summary(&self) -> String {
        // Guard against division by zero for an empty suite.
        let pass_pct = if self.total == 0 {
            0.0
        } else {
            self.passed as f64 / self.total as f64 * 100.0
        };
        format!(
            "Eval Report: {}\n Passed: {}/{} ({:.1}%)\n Average Score: {:.2}",
            self.suite_name, self.passed, self.total, pass_pct, self.average_score,
        )
    }
}
/// A scoring function applied to each test case's output.
///
/// `Send + Sync + 'static` so implementations can be shared across threads by
/// a runner. The built-in implementations below return values in `[0.0, 1.0]`.
pub trait Metric: Send + Sync + 'static {
    /// Stable identifier for this metric (used as a score key, e.g. in
    /// [`TestResult::scores`]).
    fn name(&self) -> &'static str;

    /// Scores `actual_output` produced for `input` against `expected_keywords`.
    fn score(&self, input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64;
}

/// Scores the fraction of expected keywords found (case-insensitively) in the
/// output; `1.0` when there are no keywords to check.
pub struct KeywordMetric;

impl Metric for KeywordMetric {
    fn name(&self) -> &'static str {
        "keyword_match"
    }

    fn score(&self, _input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64 {
        // Nothing expected: vacuously perfect.
        if expected_keywords.is_empty() {
            return 1.0;
        }
        // Case-insensitive containment: lowercase both sides before matching.
        let haystack = actual_output.to_lowercase();
        let mut hits = 0usize;
        for keyword in expected_keywords {
            if haystack.contains(&keyword.to_lowercase()) {
                hits += 1;
            }
        }
        hits as f64 / expected_keywords.len() as f64
    }
}

/// Scores how reasonable the output length is relative to the input length:
/// full marks when the output is 2x-10x the input, tapering off outside that band.
pub struct LengthRelevancyMetric;

impl Metric for LengthRelevancyMetric {
    fn name(&self) -> &'static str {
        "length_relevancy"
    }

    fn score(&self, input: &str, actual_output: &str, _expected_keywords: &[&str]) -> f64 {
        if actual_output.is_empty() {
            return 0.0;
        }
        // `.max(1.0)` guards against division by zero for empty input.
        let ratio = actual_output.len() as f64 / (input.len() as f64).max(1.0);
        if ratio < 2.0 {
            // Too short: linear ramp from 0.0 (ratio 0) up to 1.0 (ratio 2).
            ratio / 2.0
        } else if ratio <= 10.0 {
            // Ideal band.
            1.0
        } else {
            // Too long: decays as 10/ratio (min is defensive; 10/ratio < 1 here).
            (10.0 / ratio).min(1.0)
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Shared helper: asserts a score equals the expected exact fraction within
    // floating-point tolerance, replacing the repeated epsilon idiom.
    fn assert_score(actual: f64, expected: f64) {
        assert!(
            (actual - expected).abs() < f64::EPSILON,
            "score {} != expected {}",
            actual,
            expected
        );
    }

    #[test]
    fn test_eval_suite_builder() {
        let suite = EvalSuite::new("test_suite")
            .add_case(TestCase::new("t1", "Hello"))
            .add_case(TestCase::new("t2", "World"));
        assert_eq!(suite.name(), "test_suite");
        assert_eq!(suite.cases().len(), 2);
    }

    #[test]
    fn test_test_case_builder() {
        let case = TestCase::new("t1", "prompt")
            .expect_contains("keyword1")
            .expect_contains("keyword2")
            .expect_output("exact output");
        assert_eq!(case.expected_keywords.len(), 2);
        assert_eq!(case.expected_output.as_deref(), Some("exact output"));
    }

    #[test]
    fn test_keyword_metric_all_match() {
        assert_score(KeywordMetric.score("input", "hello world foo", &["hello", "world"]), 1.0);
    }

    #[test]
    fn test_keyword_metric_partial_match() {
        assert_score(KeywordMetric.score("input", "hello there", &["hello", "world"]), 0.5);
    }

    #[test]
    fn test_keyword_metric_no_match() {
        assert_score(KeywordMetric.score("input", "nothing here", &["hello", "world"]), 0.0);
    }

    #[test]
    fn test_keyword_metric_empty_keywords() {
        assert_score(KeywordMetric.score("input", "anything", &[]), 1.0);
    }

    #[test]
    fn test_length_relevancy_ideal() {
        // 5-char input, 41-char output -> ratio ~8.2, inside the ideal band.
        let score = LengthRelevancyMetric.score("hello", "hello world this is a response text here!", &[]);
        assert!(score > 0.5);
    }

    #[test]
    fn test_length_relevancy_empty_output() {
        assert_score(LengthRelevancyMetric.score("hello", "", &[]), 0.0);
    }

    #[test]
    fn test_eval_report_summary() {
        let report = EvalReport {
            suite_name: "test".into(),
            results: Vec::new(),
            average_score: 0.85,
            passed: 8,
            total: 10,
        };
        let summary = report.summary();
        assert!(summary.contains("8/10"));
        assert!(summary.contains("80.0%"));
    }
}