1#![deny(warnings)]
25#![deny(missing_docs)]
26#![warn(clippy::pedantic)]
27#![allow(clippy::module_name_repetitions)]
28#![allow(clippy::cast_precision_loss)] #![allow(clippy::doc_markdown)]
30
31use serde::{Deserialize, Serialize};
32
/// A named, ordered collection of [`TestCase`]s built up via the
/// chainable [`EvalSuite::add_case`] method.
pub struct EvalSuite {
    // Human-readable suite identifier; exposed read-only via `name()`.
    name: String,
    // Cases in insertion order; exposed read-only via `cases()`.
    cases: Vec<TestCase>,
}
38
39impl EvalSuite {
40 #[must_use]
42 pub fn new(name: impl Into<String>) -> Self {
43 Self {
44 name: name.into(),
45 cases: Vec::new(),
46 }
47 }
48
49 #[must_use]
51 pub fn add_case(mut self, case: TestCase) -> Self {
52 self.cases.push(case);
53 self
54 }
55
56 #[must_use]
58 pub fn name(&self) -> &str {
59 &self.name
60 }
61
62 #[must_use]
64 pub fn cases(&self) -> &[TestCase] {
65 &self.cases
66 }
67}
68
/// A single evaluation case: an input plus expectations about the output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestCase {
    /// Identifier for this case (e.g. for reporting results per case).
    pub id: String,
    /// The input text fed to the system under evaluation.
    pub input: String,
    /// Keywords the output is expected to contain; matched
    /// case-insensitively by [`KeywordMetric`].
    pub expected_keywords: Vec<String>,
    /// The exact output expected, if any.
    // NOTE(review): not consulted by the metrics visible in this file —
    // presumably used by an evaluator elsewhere; confirm against callers.
    pub expected_output: Option<String>,
}
81
82impl TestCase {
83 #[must_use]
85 pub fn new(id: impl Into<String>, input: impl Into<String>) -> Self {
86 Self {
87 id: id.into(),
88 input: input.into(),
89 expected_keywords: Vec::new(),
90 expected_output: None,
91 }
92 }
93
94 #[must_use]
96 pub fn expect_contains(mut self, keyword: impl Into<String>) -> Self {
97 self.expected_keywords.push(keyword.into());
98 self
99 }
100
101 #[must_use]
103 pub fn expect_output(mut self, output: impl Into<String>) -> Self {
104 self.expected_output = Some(output.into());
105 self
106 }
107}
108
/// The outcome of running a single [`TestCase`].
#[derive(Debug, Clone, Serialize)]
pub struct TestResult {
    /// Id of the [`TestCase`] this result belongs to.
    pub case_id: String,
    /// The output actually produced by the system under evaluation.
    pub actual_output: String,
    /// Per-metric scores, keyed by metric name.
    pub scores: std::collections::HashMap<String, f64>,
    /// Whether the case passed overall.
    pub passed: bool,
}
121
/// Aggregated results for one run of an [`EvalSuite`].
#[derive(Debug, Clone, Serialize)]
pub struct EvalReport {
    /// Name of the suite that produced this report.
    pub suite_name: String,
    /// Per-case results.
    pub results: Vec<TestResult>,
    /// Average score for the run (how it is aggregated is up to the
    /// evaluator that builds this report).
    pub average_score: f64,
    /// Number of cases that passed.
    pub passed: usize,
    /// Total number of cases run.
    pub total: usize,
}
136
137impl EvalReport {
138 #[must_use]
140 pub fn summary(&self) -> String {
141 format!(
142 "Eval Report: {}\n Passed: {}/{} ({:.1}%)\n Average Score: {:.2}",
143 self.suite_name,
144 self.passed,
145 self.total,
146 if self.total > 0 {
147 self.passed as f64 / self.total as f64 * 100.0
148 } else {
149 0.0
150 },
151 self.average_score,
152 )
153 }
154}
155
/// A scoring strategy applied to a test case's output.
///
/// Bounded by `Send + Sync + 'static` so implementations can be stored
/// and shared freely (e.g. behind trait objects).
pub trait Metric: Send + Sync + 'static {
    /// A stable name identifying this metric, suitable as a score key.
    fn name(&self) -> &'static str;

    /// Scores `actual_output` for the given `input` and expectations.
    /// The metrics provided in this file return values in `0.0..=1.0`.
    fn score(&self, input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64;
}
166
167pub struct KeywordMetric;
171
172impl Metric for KeywordMetric {
173 fn name(&self) -> &'static str {
174 "keyword_match"
175 }
176
177 fn score(&self, _input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64 {
178 if expected_keywords.is_empty() {
179 return 1.0;
180 }
181 let output_lower = actual_output.to_lowercase();
182 let matched = expected_keywords
183 .iter()
184 .filter(|kw| output_lower.contains(&kw.to_lowercase()))
185 .count();
186 matched as f64 / expected_keywords.len() as f64
187 }
188}
189
190pub struct LengthRelevancyMetric;
194
195impl Metric for LengthRelevancyMetric {
196 fn name(&self) -> &'static str {
197 "length_relevancy"
198 }
199
200 fn score(&self, input: &str, actual_output: &str, _expected_keywords: &[&str]) -> f64 {
201 let input_len = input.len() as f64;
202 let output_len = actual_output.len() as f64;
203
204 if output_len == 0.0 {
205 return 0.0;
206 }
207
208 let ratio = output_len / input_len.max(1.0);
210 if (2.0..=10.0).contains(&ratio) {
211 1.0
212 } else if ratio < 2.0 {
213 ratio / 2.0
214 } else {
215 (10.0 / ratio).min(1.0)
216 }
217 }
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    // Absolute-difference float comparison shared by the metric tests.
    fn approx(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < f64::EPSILON
    }

    #[test]
    fn test_eval_suite_builder() {
        let suite = EvalSuite::new("test_suite")
            .add_case(TestCase::new("t1", "Hello"))
            .add_case(TestCase::new("t2", "World"));
        assert_eq!("test_suite", suite.name());
        assert_eq!(2, suite.cases().len());
    }

    #[test]
    fn test_test_case_builder() {
        let case = TestCase::new("t1", "prompt")
            .expect_contains("keyword1")
            .expect_contains("keyword2")
            .expect_output("exact output");
        assert_eq!(2, case.expected_keywords.len());
        assert_eq!(Some("exact output".into()), case.expected_output);
    }

    #[test]
    fn test_keyword_metric_all_match() {
        let metric = KeywordMetric;
        assert!(approx(
            metric.score("input", "hello world foo", &["hello", "world"]),
            1.0
        ));
    }

    #[test]
    fn test_keyword_metric_partial_match() {
        let metric = KeywordMetric;
        assert!(approx(
            metric.score("input", "hello there", &["hello", "world"]),
            0.5
        ));
    }

    #[test]
    fn test_keyword_metric_no_match() {
        let metric = KeywordMetric;
        assert!(approx(
            metric.score("input", "nothing here", &["hello", "world"]),
            0.0
        ));
    }

    #[test]
    fn test_keyword_metric_empty_keywords() {
        let metric = KeywordMetric;
        assert!(approx(metric.score("input", "anything", &[]), 1.0));
    }

    #[test]
    fn test_length_relevancy_ideal() {
        let metric = LengthRelevancyMetric;
        let score = metric.score("hello", "hello world this is a response text here!", &[]);
        assert!(score > 0.5);
    }

    #[test]
    fn test_length_relevancy_empty_output() {
        let metric = LengthRelevancyMetric;
        assert!(approx(metric.score("hello", "", &[]), 0.0));
    }

    #[test]
    fn test_eval_report_summary() {
        let report = EvalReport {
            suite_name: "test".into(),
            results: vec![],
            average_score: 0.85,
            passed: 8,
            total: 10,
        };
        let text = report.summary();
        assert!(text.contains("8/10"));
        assert!(text.contains("80.0%"));
    }
}