//! Lightweight evaluation harness: suites of test cases, scoring metrics,
//! and aggregated reports.
//!
//! NOTE(review): module one-liners below are inferred from the re-exports —
//! confirm against the `export`, `metrics`, and `runner` modules themselves.
#![deny(missing_docs)]
#![allow(clippy::redundant_closure)]

/// Report export helpers (see [`EvalReportExport`]).
pub mod export;
/// Additional metric implementations (LLM judge, schema validation, tool usage).
pub mod metrics;
/// Suite execution (see [`EvalRunner`], [`EvalAgent`], [`AsyncMetric`]).
pub mod runner;

pub use export::EvalReportExport;
pub use metrics::{JudgeProvider, LlmJudgeMetric, SchemaValidationMetric, ToolUsageMetric};
pub use runner::{AsyncMetric, EvalAgent, EvalRunner, SyncMetricAdapter};

use serde::{Deserialize, Serialize};
36
37pub struct EvalSuite {
39 name: String,
40 cases: Vec<TestCase>,
41}
42
43impl EvalSuite {
44 #[must_use]
46 pub fn new(name: impl Into<String>) -> Self {
47 Self {
48 name: name.into(),
49 cases: Vec::new(),
50 }
51 }
52
53 #[must_use]
55 pub fn add_case(mut self, case: TestCase) -> Self {
56 self.cases.push(case);
57 self
58 }
59
60 #[must_use]
62 pub fn name(&self) -> &str {
63 &self.name
64 }
65
66 #[must_use]
68 pub fn cases(&self) -> &[TestCase] {
69 &self.cases
70 }
71}
72
73#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct TestCase {
76 pub id: String,
78 pub input: String,
80 pub expected_keywords: Vec<String>,
82 pub expected_output: Option<String>,
84}
85
86impl TestCase {
87 #[must_use]
89 pub fn new(id: impl Into<String>, input: impl Into<String>) -> Self {
90 Self {
91 id: id.into(),
92 input: input.into(),
93 expected_keywords: Vec::new(),
94 expected_output: None,
95 }
96 }
97
98 #[must_use]
100 pub fn expect_contains(mut self, keyword: impl Into<String>) -> Self {
101 self.expected_keywords.push(keyword.into());
102 self
103 }
104
105 #[must_use]
107 pub fn expect_output(mut self, output: impl Into<String>) -> Self {
108 self.expected_output = Some(output.into());
109 self
110 }
111}
112
113#[derive(Debug, Clone, Serialize, Deserialize)]
115pub struct TestResult {
116 pub case_id: String,
118 pub actual_output: String,
120 pub scores: std::collections::HashMap<String, f64>,
122 pub passed: bool,
124}
125
126#[derive(Debug, Clone, Serialize, Deserialize)]
128pub struct EvalReport {
129 pub suite_name: String,
131 pub results: Vec<TestResult>,
133 pub average_score: f64,
135 pub passed: usize,
137 pub total: usize,
139}
140
141impl EvalReport {
142 #[must_use]
144 pub fn summary(&self) -> String {
145 format!(
146 "Eval Report: {}\n Passed: {}/{} ({:.1}%)\n Average Score: {:.2}",
147 self.suite_name,
148 self.passed,
149 self.total,
150 if self.total > 0 {
151 self.passed as f64 / self.total as f64 * 100.0
152 } else {
153 0.0
154 },
155 self.average_score,
156 )
157 }
158}
159
/// A synchronous scoring function applied to a single case's output.
pub trait Metric: Send + Sync + 'static {
    /// Stable identifier for this metric.
    fn name(&self) -> &'static str;

    /// Scores `actual_output` for the given `input` and `expected_keywords`.
    /// Both built-in implementations below return values in `0.0..=1.0`.
    fn score(&self, input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64;
}

/// Fraction of `expected_keywords` found (case-insensitively) in the output.
pub struct KeywordMetric;

impl Metric for KeywordMetric {
    fn name(&self) -> &'static str {
        "keyword_match"
    }

    fn score(&self, _input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64 {
        // No expectations means nothing can fail: perfect score.
        if expected_keywords.is_empty() {
            return 1.0;
        }
        let haystack = actual_output.to_lowercase();
        let mut hits = 0usize;
        for keyword in expected_keywords {
            if haystack.contains(&keyword.to_lowercase()) {
                hits += 1;
            }
        }
        hits as f64 / expected_keywords.len() as f64
    }
}

/// Heuristic that rewards outputs whose length is 2x-10x the input length.
///
/// NOTE(review): lengths are measured in bytes (`str::len`), not characters —
/// multi-byte UTF-8 text skews the ratio; confirm this is intended.
pub struct LengthRelevancyMetric;

impl Metric for LengthRelevancyMetric {
    fn name(&self) -> &'static str {
        "length_relevancy"
    }

    fn score(&self, input: &str, actual_output: &str, _expected_keywords: &[&str]) -> f64 {
        let out_len = actual_output.len() as f64;
        if out_len == 0.0 {
            return 0.0;
        }
        // Clamp the denominator so an empty input cannot divide by zero.
        let in_len = (input.len() as f64).max(1.0);
        let ratio = out_len / in_len;
        if ratio < 2.0 {
            // Too short: scale linearly up to the 2x sweet spot.
            ratio / 2.0
        } else if ratio <= 10.0 {
            // Inside the 2x..10x sweet spot.
            1.0
        } else {
            // Too long: decay with the overshoot.
            (10.0 / ratio).min(1.0)
        }
    }
}
223
#[cfg(test)]
mod tests {
    use super::*;

    // True when `a` and `b` differ by less than machine epsilon.
    fn approx(a: f64, b: f64) -> bool {
        (a - b).abs() < f64::EPSILON
    }

    #[test]
    fn test_eval_suite_builder() {
        let first = TestCase::new("t1", "Hello");
        let second = TestCase::new("t2", "World");
        let suite = EvalSuite::new("test_suite").add_case(first).add_case(second);
        assert_eq!(suite.name(), "test_suite");
        assert_eq!(suite.cases().len(), 2);
    }

    #[test]
    fn test_test_case_builder() {
        let case = TestCase::new("t1", "prompt")
            .expect_contains("keyword1")
            .expect_contains("keyword2")
            .expect_output("exact output");
        assert_eq!(case.expected_keywords.len(), 2);
        assert_eq!(case.expected_output, Some("exact output".into()));
    }

    #[test]
    fn test_keyword_metric_all_match() {
        let score = KeywordMetric.score("input", "hello world foo", &["hello", "world"]);
        assert!(approx(score, 1.0));
    }

    #[test]
    fn test_keyword_metric_partial_match() {
        let score = KeywordMetric.score("input", "hello there", &["hello", "world"]);
        assert!(approx(score, 0.5));
    }

    #[test]
    fn test_keyword_metric_no_match() {
        let score = KeywordMetric.score("input", "nothing here", &["hello", "world"]);
        assert!(approx(score, 0.0));
    }

    #[test]
    fn test_keyword_metric_empty_keywords() {
        // No expectations: the metric reports a perfect score.
        let score = KeywordMetric.score("input", "anything", &[]);
        assert!(approx(score, 1.0));
    }

    #[test]
    fn test_length_relevancy_ideal() {
        let output = "hello world this is a response text here!";
        let score = LengthRelevancyMetric.score("hello", output, &[]);
        assert!(score > 0.5);
    }

    #[test]
    fn test_length_relevancy_empty_output() {
        let score = LengthRelevancyMetric.score("hello", "", &[]);
        assert!(approx(score, 0.0));
    }

    #[test]
    fn test_eval_report_summary() {
        let report = EvalReport {
            suite_name: "test".into(),
            results: vec![],
            average_score: 0.85,
            passed: 8,
            total: 10,
        };
        let summary = report.summary();
        assert!(summary.contains("8/10"));
        assert!(summary.contains("80.0%"));
    }
}