// traitclaw_eval/lib.rs
//! Evaluation framework for the `TraitClaw` AI agent framework.
//!
//! Provides `EvalSuite`, `TestCase`, and `Metric` abstractions for
//! measuring agent quality. Includes built-in metrics for relevancy
//! and keyword matching.
//!
//! # Quick Start
//!
//! ```rust
//! use traitclaw_eval::{EvalSuite, TestCase, KeywordMetric, Metric};
//!
//! let suite = EvalSuite::new("quality_tests")
//!     .add_case(TestCase::new("greeting", "Say hello")
//!         .expect_contains("hello"));
//!
//! assert_eq!(suite.name(), "quality_tests");
//! assert_eq!(suite.cases().len(), 1);
//!
//! let metric = KeywordMetric;
//! let score = metric.score("Say hello", "Hello! How can I help?", &["hello"]);
//! assert!(score > 0.0);
//! ```

#![deny(warnings)]
#![deny(missing_docs)]
#![warn(clippy::pedantic)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::cast_precision_loss)] // usize→f64 for scoring is acceptable
#![allow(clippy::doc_markdown)]

use serde::{Deserialize, Serialize};

33/// A suite of evaluation test cases.
34pub struct EvalSuite {
35    name: String,
36    cases: Vec<TestCase>,
37}
38
39impl EvalSuite {
40    /// Create a new evaluation suite.
41    #[must_use]
42    pub fn new(name: impl Into<String>) -> Self {
43        Self {
44            name: name.into(),
45            cases: Vec::new(),
46        }
47    }
48
49    /// Add a test case to the suite.
50    #[must_use]
51    pub fn add_case(mut self, case: TestCase) -> Self {
52        self.cases.push(case);
53        self
54    }
55
56    /// Get the suite name.
57    #[must_use]
58    pub fn name(&self) -> &str {
59        &self.name
60    }
61
62    /// Get all test cases.
63    #[must_use]
64    pub fn cases(&self) -> &[TestCase] {
65        &self.cases
66    }
67}
68
69/// A single evaluation test case.
70#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct TestCase {
72    /// Test case identifier.
73    pub id: String,
74    /// Input prompt for the agent.
75    pub input: String,
76    /// Expected output keywords (for keyword matching).
77    pub expected_keywords: Vec<String>,
78    /// Optional expected exact output.
79    pub expected_output: Option<String>,
80}
81
82impl TestCase {
83    /// Create a new test case.
84    #[must_use]
85    pub fn new(id: impl Into<String>, input: impl Into<String>) -> Self {
86        Self {
87            id: id.into(),
88            input: input.into(),
89            expected_keywords: Vec::new(),
90            expected_output: None,
91        }
92    }
93
94    /// Add an expected keyword to match in the output.
95    #[must_use]
96    pub fn expect_contains(mut self, keyword: impl Into<String>) -> Self {
97        self.expected_keywords.push(keyword.into());
98        self
99    }
100
101    /// Set the expected exact output.
102    #[must_use]
103    pub fn expect_output(mut self, output: impl Into<String>) -> Self {
104        self.expected_output = Some(output.into());
105        self
106    }
107}
108
109/// A single test case result.
110#[derive(Debug, Clone, Serialize)]
111pub struct TestResult {
112    /// Test case ID.
113    pub case_id: String,
114    /// The actual output from the agent.
115    pub actual_output: String,
116    /// Metric scores (metric_name to score 0.0-1.0).
117    pub scores: std::collections::HashMap<String, f64>,
118    /// Whether the test passed (all scores above threshold).
119    pub passed: bool,
120}
121
122/// An evaluation report.
123#[derive(Debug, Clone, Serialize)]
124pub struct EvalReport {
125    /// Suite name.
126    pub suite_name: String,
127    /// Individual test results.
128    pub results: Vec<TestResult>,
129    /// Average score across all tests and metrics.
130    pub average_score: f64,
131    /// Number of tests that passed.
132    pub passed: usize,
133    /// Total number of tests.
134    pub total: usize,
135}
136
137impl EvalReport {
138    /// Generate a human-readable summary.
139    #[must_use]
140    pub fn summary(&self) -> String {
141        format!(
142            "Eval Report: {}\n  Passed: {}/{} ({:.1}%)\n  Average Score: {:.2}",
143            self.suite_name,
144            self.passed,
145            self.total,
146            if self.total > 0 {
147                self.passed as f64 / self.total as f64 * 100.0
148            } else {
149                0.0
150            },
151            self.average_score,
152        )
153    }
154}
155
/// Trait for evaluation metrics.
pub trait Metric: Send + Sync + 'static {
    /// Metric name, usable as a stable key for reporting scores.
    fn name(&self) -> &'static str;

    /// Score the actual output against the expected criteria.
    ///
    /// Returns a score from 0.0 (worst) to 1.0 (best).
    fn score(&self, input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64;
}

/// Built-in keyword matching metric.
///
/// Scores based on the fraction of expected keywords found
/// (case-insensitively) in the output.
#[derive(Debug, Clone, Copy, Default)]
pub struct KeywordMetric;

impl Metric for KeywordMetric {
    fn name(&self) -> &'static str {
        "keyword_match"
    }

    fn score(&self, _input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64 {
        // No expectations means nothing can fail: perfect score.
        if expected_keywords.is_empty() {
            return 1.0;
        }
        // Lowercase once; compare each keyword case-insensitively.
        let output_lower = actual_output.to_lowercase();
        let matched = expected_keywords
            .iter()
            .filter(|kw| output_lower.contains(&kw.to_lowercase()))
            .count();
        matched as f64 / expected_keywords.len() as f64
    }
}

/// Built-in length-based relevancy metric.
///
/// Penalizes very short or very long responses relative to input length.
#[derive(Debug, Clone, Copy, Default)]
pub struct LengthRelevancyMetric;

impl Metric for LengthRelevancyMetric {
    fn name(&self) -> &'static str {
        "length_relevancy"
    }

    fn score(&self, input: &str, actual_output: &str, _expected_keywords: &[&str]) -> f64 {
        // An empty response can never be relevant.
        if actual_output.is_empty() {
            return 0.0;
        }
        // Clamp the denominator to 1 so an empty input cannot divide by zero.
        let input_len = (input.len() as f64).max(1.0);
        let ratio = actual_output.len() as f64 / input_len;

        // Ideal ratio: output is 2-10x the input length.
        if (2.0..=10.0).contains(&ratio) {
            1.0
        } else if ratio < 2.0 {
            // Too short: scale linearly up toward the ideal band.
            ratio / 2.0
        } else {
            // Too long: decay inversely past the ideal band.
            (10.0 / ratio).min(1.0)
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_eval_suite_builder() {
        let suite = EvalSuite::new("test_suite")
            .add_case(TestCase::new("t1", "Hello"))
            .add_case(TestCase::new("t2", "World"));
        assert_eq!(suite.name(), "test_suite");
        assert_eq!(suite.cases().len(), 2);
    }

    #[test]
    fn test_test_case_builder() {
        let tc = TestCase::new("t1", "prompt")
            .expect_contains("keyword1")
            .expect_contains("keyword2")
            .expect_output("exact output");
        assert_eq!(tc.expected_keywords.len(), 2);
        assert_eq!(tc.expected_output, Some("exact output".into()));
    }

    #[test]
    fn test_keyword_metric_all_match() {
        let m = KeywordMetric;
        let score = m.score("input", "hello world foo", &["hello", "world"]);
        assert!((score - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_keyword_metric_partial_match() {
        let m = KeywordMetric;
        let score = m.score("input", "hello there", &["hello", "world"]);
        assert!((score - 0.5).abs() < f64::EPSILON);
    }

    #[test]
    fn test_keyword_metric_no_match() {
        let m = KeywordMetric;
        let score = m.score("input", "nothing here", &["hello", "world"]);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_keyword_metric_empty_keywords() {
        let m = KeywordMetric;
        let score = m.score("input", "anything", &[]);
        assert!((score - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_keyword_metric_case_insensitive() {
        // Matching must ignore case on both sides.
        let m = KeywordMetric;
        let score = m.score("input", "HELLO World", &["hello", "world"]);
        assert!((score - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_length_relevancy_ideal() {
        let m = LengthRelevancyMetric;
        // Output 5x input = ideal range
        let score = m.score("hello", "hello world this is a response text here!", &[]);
        assert!(score > 0.5);
    }

    #[test]
    fn test_length_relevancy_empty_output() {
        let m = LengthRelevancyMetric;
        let score = m.score("hello", "", &[]);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_length_relevancy_empty_input() {
        // Empty input must not divide by zero; denominator clamps to 1.
        let m = LengthRelevancyMetric;
        let score = m.score("", "abcd", &[]);
        assert!((score - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_eval_report_summary() {
        let report = EvalReport {
            suite_name: "test".into(),
            results: vec![],
            average_score: 0.85,
            passed: 8,
            total: 10,
        };
        let s = report.summary();
        assert!(s.contains("8/10"));
        assert!(s.contains("80.0%"));
    }
}