1#![deny(warnings)]
25#![deny(missing_docs)]
26#![warn(clippy::pedantic)]
27#![allow(clippy::module_name_repetitions)]
28#![allow(clippy::cast_precision_loss)] #![allow(clippy::doc_markdown)]
30
31pub mod export;
32pub mod metrics;
33pub mod runner;
34
35pub use export::EvalReportExport;
36pub use metrics::{JudgeProvider, LlmJudgeMetric, SchemaValidationMetric, ToolUsageMetric};
37pub use runner::{AsyncMetric, EvalAgent, EvalRunner, SyncMetricAdapter};
38
39use serde::{Deserialize, Serialize};
40
41pub struct EvalSuite {
43 name: String,
44 cases: Vec<TestCase>,
45}
46
47impl EvalSuite {
48 #[must_use]
50 pub fn new(name: impl Into<String>) -> Self {
51 Self {
52 name: name.into(),
53 cases: Vec::new(),
54 }
55 }
56
57 #[must_use]
59 pub fn add_case(mut self, case: TestCase) -> Self {
60 self.cases.push(case);
61 self
62 }
63
64 #[must_use]
66 pub fn name(&self) -> &str {
67 &self.name
68 }
69
70 #[must_use]
72 pub fn cases(&self) -> &[TestCase] {
73 &self.cases
74 }
75}
76
/// A single evaluation scenario: an input prompt plus the expectations
/// that metrics use to score the agent's output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestCase {
    /// Identifier for this case (expected to be unique within a suite).
    pub id: String,
    /// The prompt/input given to the agent under evaluation.
    pub input: String,
    /// Keywords the output is expected to contain.
    pub expected_keywords: Vec<String>,
    /// Optional exact reference output, when one exists.
    pub expected_output: Option<String>,
}
89
90impl TestCase {
91 #[must_use]
93 pub fn new(id: impl Into<String>, input: impl Into<String>) -> Self {
94 Self {
95 id: id.into(),
96 input: input.into(),
97 expected_keywords: Vec::new(),
98 expected_output: None,
99 }
100 }
101
102 #[must_use]
104 pub fn expect_contains(mut self, keyword: impl Into<String>) -> Self {
105 self.expected_keywords.push(keyword.into());
106 self
107 }
108
109 #[must_use]
111 pub fn expect_output(mut self, output: impl Into<String>) -> Self {
112 self.expected_output = Some(output.into());
113 self
114 }
115}
116
/// The outcome of running a single [`TestCase`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestResult {
    /// Id of the [`TestCase`] this result belongs to.
    pub case_id: String,
    /// The raw output the agent produced.
    pub actual_output: String,
    /// Per-metric scores, keyed by metric name.
    // NOTE(review): the bundled metrics score in 0.0..=1.0, but custom
    // metrics are not constrained to that range by anything visible here.
    pub scores: std::collections::HashMap<String, f64>,
    /// Whether this case is considered passed.
    // Pass criteria are decided by whoever constructs this (presumably the
    // runner) — not visible in this file.
    pub passed: bool,
}
129
/// Aggregated results for a full [`EvalSuite`] run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
    /// Name of the suite that produced this report.
    pub suite_name: String,
    /// One [`TestResult`] per executed case.
    pub results: Vec<TestResult>,
    /// Mean score across the run's results.
    pub average_score: f64,
    /// Number of cases that passed.
    pub passed: usize,
    /// Total number of cases run.
    pub total: usize,
}
144
145impl EvalReport {
146 #[must_use]
148 pub fn summary(&self) -> String {
149 format!(
150 "Eval Report: {}\n Passed: {}/{} ({:.1}%)\n Average Score: {:.2}",
151 self.suite_name,
152 self.passed,
153 self.total,
154 if self.total > 0 {
155 self.passed as f64 / self.total as f64 * 100.0
156 } else {
157 0.0
158 },
159 self.average_score,
160 )
161 }
162}
163
/// A scoring function applied to a single eval case.
pub trait Metric: Send + Sync + 'static {
    /// Stable identifier for this metric, used as the key under which its
    /// score is recorded.
    fn name(&self) -> &'static str;

    /// Scores `actual_output` for `input` against `expected_keywords`,
    /// returning a value in `0.0..=1.0` (1.0 is best).
    fn score(&self, input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64;
}

/// Scores the fraction of expected keywords found (case-insensitively) in
/// the output; an empty keyword list scores 1.0.
#[derive(Debug, Clone, Copy, Default)]
pub struct KeywordMetric;

impl Metric for KeywordMetric {
    fn name(&self) -> &'static str {
        "keyword_match"
    }

    fn score(&self, _input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64 {
        // No expectations means nothing can fail to match.
        if expected_keywords.is_empty() {
            return 1.0;
        }
        let output_lower = actual_output.to_lowercase();
        let matched = expected_keywords
            .iter()
            .filter(|kw| output_lower.contains(&kw.to_lowercase()))
            .count();
        matched as f64 / expected_keywords.len() as f64
    }
}

/// Heuristic metric rewarding outputs whose length is 2x-10x the input
/// length; shorter or longer outputs are penalized proportionally.
#[derive(Debug, Clone, Copy, Default)]
pub struct LengthRelevancyMetric;

impl Metric for LengthRelevancyMetric {
    fn name(&self) -> &'static str {
        "length_relevancy"
    }

    fn score(&self, input: &str, actual_output: &str, _expected_keywords: &[&str]) -> f64 {
        // Byte lengths; for ASCII text this equals the character count.
        let input_len = input.len() as f64;
        let output_len = actual_output.len() as f64;

        if output_len == 0.0 {
            return 0.0;
        }

        // max(1.0) guards against division by zero on empty input.
        let ratio = output_len / input_len.max(1.0);
        if (2.0..=10.0).contains(&ratio) {
            1.0
        } else if ratio < 2.0 {
            // Too short: scale linearly up to the ideal band.
            ratio / 2.0
        } else {
            // Too long: decay inversely with the overshoot.
            (10.0 / ratio).min(1.0)
        }
    }
}
227
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute-difference float comparison shared by the metric tests.
    fn close_to(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < f64::EPSILON
    }

    #[test]
    fn test_eval_suite_builder() {
        let suite = EvalSuite::new("test_suite")
            .add_case(TestCase::new("t1", "Hello"))
            .add_case(TestCase::new("t2", "World"));
        assert_eq!(suite.name(), "test_suite");
        assert_eq!(suite.cases().len(), 2);
    }

    #[test]
    fn test_test_case_builder() {
        let case = TestCase::new("t1", "prompt")
            .expect_contains("keyword1")
            .expect_contains("keyword2")
            .expect_output("exact output");
        assert_eq!(case.expected_keywords.len(), 2);
        assert_eq!(case.expected_output.as_deref(), Some("exact output"));
    }

    #[test]
    fn test_keyword_metric_all_match() {
        let score = KeywordMetric.score("input", "hello world foo", &["hello", "world"]);
        assert!(close_to(score, 1.0));
    }

    #[test]
    fn test_keyword_metric_partial_match() {
        let score = KeywordMetric.score("input", "hello there", &["hello", "world"]);
        assert!(close_to(score, 0.5));
    }

    #[test]
    fn test_keyword_metric_no_match() {
        let score = KeywordMetric.score("input", "nothing here", &["hello", "world"]);
        assert!(close_to(score, 0.0));
    }

    #[test]
    fn test_keyword_metric_empty_keywords() {
        assert!(close_to(KeywordMetric.score("input", "anything", &[]), 1.0));
    }

    #[test]
    fn test_length_relevancy_ideal() {
        let score =
            LengthRelevancyMetric.score("hello", "hello world this is a response text here!", &[]);
        assert!(score > 0.5);
    }

    #[test]
    fn test_length_relevancy_empty_output() {
        assert!(close_to(LengthRelevancyMetric.score("hello", "", &[]), 0.0));
    }

    #[test]
    fn test_eval_report_summary() {
        let report = EvalReport {
            suite_name: String::from("test"),
            results: Vec::new(),
            average_score: 0.85,
            passed: 8,
            total: 10,
        };
        let summary = report.summary();
        assert!(summary.contains("8/10"));
        assert!(summary.contains("80.0%"));
    }
}