// traitclaw_eval — crate root (lib.rs)
//! Evaluation framework for the `TraitClaw` AI agent framework.
//!
//! Provides `EvalSuite`, `TestCase`, and `Metric` abstractions for
//! measuring agent quality. Includes built-in metrics for relevancy
//! and keyword matching.
//!
//! # Quick Start
//!
//! ```rust
//! use traitclaw_eval::{EvalSuite, TestCase, KeywordMetric, Metric};
//!
//! let suite = EvalSuite::new("quality_tests")
//!     .add_case(TestCase::new("greeting", "Say hello")
//!         .expect_contains("hello"));
//!
//! assert_eq!(suite.name(), "quality_tests");
//! assert_eq!(suite.cases().len(), 1);
//!
//! let metric = KeywordMetric;
//! let score = metric.score("Say hello", "Hello! How can I help?", &["hello"]);
//! assert!(score > 0.0);
//! ```

#![deny(missing_docs)]
#![allow(clippy::redundant_closure)]

pub mod export;
pub mod metrics;
pub mod runner;

pub use export::EvalReportExport;
pub use metrics::{JudgeProvider, LlmJudgeMetric, SchemaValidationMetric, ToolUsageMetric};
pub use runner::{AsyncMetric, EvalAgent, EvalRunner, SyncMetricAdapter};

use serde::{Deserialize, Serialize};

37/// A suite of evaluation test cases.
38pub struct EvalSuite {
39    name: String,
40    cases: Vec<TestCase>,
41}
42
43impl EvalSuite {
44    /// Create a new evaluation suite.
45    #[must_use]
46    pub fn new(name: impl Into<String>) -> Self {
47        Self {
48            name: name.into(),
49            cases: Vec::new(),
50        }
51    }
52
53    /// Add a test case to the suite.
54    #[must_use]
55    pub fn add_case(mut self, case: TestCase) -> Self {
56        self.cases.push(case);
57        self
58    }
59
60    /// Get the suite name.
61    #[must_use]
62    pub fn name(&self) -> &str {
63        &self.name
64    }
65
66    /// Get all test cases.
67    #[must_use]
68    pub fn cases(&self) -> &[TestCase] {
69        &self.cases
70    }
71}
72
/// A single evaluation test case.
///
/// Constructed with [`TestCase::new`] and refined via the builder
/// methods `expect_contains` / `expect_output`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestCase {
    /// Test case identifier.
    pub id: String,
    /// Input prompt for the agent.
    pub input: String,
    /// Expected output keywords (for keyword matching); empty when unused.
    pub expected_keywords: Vec<String>,
    /// Optional expected exact output; `None` when only keywords are checked.
    pub expected_output: Option<String>,
}

86impl TestCase {
87    /// Create a new test case.
88    #[must_use]
89    pub fn new(id: impl Into<String>, input: impl Into<String>) -> Self {
90        Self {
91            id: id.into(),
92            input: input.into(),
93            expected_keywords: Vec::new(),
94            expected_output: None,
95        }
96    }
97
98    /// Add an expected keyword to match in the output.
99    #[must_use]
100    pub fn expect_contains(mut self, keyword: impl Into<String>) -> Self {
101        self.expected_keywords.push(keyword.into());
102        self
103    }
104
105    /// Set the expected exact output.
106    #[must_use]
107    pub fn expect_output(mut self, output: impl Into<String>) -> Self {
108        self.expected_output = Some(output.into());
109        self
110    }
111}
112
/// A single test case result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestResult {
    /// ID of the test case this result belongs to.
    pub case_id: String,
    /// The actual output from the agent.
    pub actual_output: String,
    /// Metric scores, keyed by metric name; each score is in 0.0-1.0.
    pub scores: std::collections::HashMap<String, f64>,
    /// Whether the test passed (all scores above threshold).
    pub passed: bool,
}

/// An evaluation report.
///
/// Aggregate of the results produced by running one suite.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
    /// Name of the suite this report was produced from.
    pub suite_name: String,
    /// Individual test results.
    pub results: Vec<TestResult>,
    /// Average score across all tests and metrics.
    pub average_score: f64,
    /// Number of tests that passed.
    pub passed: usize,
    /// Total number of tests.
    pub total: usize,
}

141impl EvalReport {
142    /// Generate a human-readable summary.
143    #[must_use]
144    pub fn summary(&self) -> String {
145        format!(
146            "Eval Report: {}\n  Passed: {}/{} ({:.1}%)\n  Average Score: {:.2}",
147            self.suite_name,
148            self.passed,
149            self.total,
150            if self.total > 0 {
151                self.passed as f64 / self.total as f64 * 100.0
152            } else {
153                0.0
154            },
155            self.average_score,
156        )
157    }
158}
159
/// Trait implemented by all evaluation metrics.
pub trait Metric: Send + Sync + 'static {
    /// Metric name.
    fn name(&self) -> &'static str;

    /// Score the actual output against the expected criteria.
    ///
    /// Returns a score from 0.0 (worst) to 1.0 (best).
    fn score(&self, input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64;
}

/// Built-in keyword matching metric.
///
/// The score is the fraction of expected keywords that appear
/// (case-insensitively) in the output; no keywords scores 1.0.
pub struct KeywordMetric;

impl Metric for KeywordMetric {
    fn name(&self) -> &'static str {
        "keyword_match"
    }

    fn score(&self, _input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64 {
        // Nothing expected: trivially satisfied.
        if expected_keywords.is_empty() {
            return 1.0;
        }
        // Lowercase both sides once so matching is case-insensitive.
        let haystack = actual_output.to_lowercase();
        let mut hits = 0usize;
        for needle in expected_keywords {
            if haystack.contains(&needle.to_lowercase()) {
                hits += 1;
            }
        }
        hits as f64 / expected_keywords.len() as f64
    }
}

/// Built-in length-based relevancy metric.
///
/// Penalizes very short or very long responses relative to input length.
pub struct LengthRelevancyMetric;

impl Metric for LengthRelevancyMetric {
    fn name(&self) -> &'static str {
        "length_relevancy"
    }

    fn score(&self, input: &str, actual_output: &str, _expected_keywords: &[&str]) -> f64 {
        // NOTE: lengths are byte lengths, not character counts.
        let out_len = actual_output.len() as f64;
        if out_len == 0.0 {
            return 0.0;
        }
        // Clamp the denominator so an empty input can't divide by zero.
        let in_len = (input.len() as f64).max(1.0);
        let ratio = out_len / in_len;
        // Ideal band: output 2-10x the input length; taper linearly below,
        // hyperbolically above.
        match ratio {
            r if r < 2.0 => r / 2.0,
            r if r <= 10.0 => 1.0,
            r => (10.0 / r).min(1.0),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Builder API: chained add_case calls accumulate cases in order.
    #[test]
    fn test_eval_suite_builder() {
        let suite = EvalSuite::new("test_suite")
            .add_case(TestCase::new("t1", "Hello"))
            .add_case(TestCase::new("t2", "World"));
        assert_eq!(suite.name(), "test_suite");
        assert_eq!(suite.cases().len(), 2);
    }

    // Builder API: expect_contains appends keywords; expect_output sets Some.
    #[test]
    fn test_test_case_builder() {
        let tc = TestCase::new("t1", "prompt")
            .expect_contains("keyword1")
            .expect_contains("keyword2")
            .expect_output("exact output");
        assert_eq!(tc.expected_keywords.len(), 2);
        assert_eq!(tc.expected_output, Some("exact output".into()));
    }

    // KeywordMetric: every keyword present -> score 1.0.
    #[test]
    fn test_keyword_metric_all_match() {
        let m = KeywordMetric;
        let score = m.score("input", "hello world foo", &["hello", "world"]);
        assert!((score - 1.0).abs() < f64::EPSILON);
    }

    // KeywordMetric: half the keywords present -> score 0.5.
    #[test]
    fn test_keyword_metric_partial_match() {
        let m = KeywordMetric;
        let score = m.score("input", "hello there", &["hello", "world"]);
        assert!((score - 0.5).abs() < f64::EPSILON);
    }

    // KeywordMetric: no keywords present -> score 0.0.
    #[test]
    fn test_keyword_metric_no_match() {
        let m = KeywordMetric;
        let score = m.score("input", "nothing here", &["hello", "world"]);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    // KeywordMetric: empty keyword list is trivially satisfied (1.0).
    #[test]
    fn test_keyword_metric_empty_keywords() {
        let m = KeywordMetric;
        let score = m.score("input", "anything", &[]);
        assert!((score - 1.0).abs() < f64::EPSILON);
    }

    // LengthRelevancyMetric: output within the 2-10x band scores high.
    #[test]
    fn test_length_relevancy_ideal() {
        let m = LengthRelevancyMetric;
        // Output 5x input = ideal range
        let score = m.score("hello", "hello world this is a response text here!", &[]);
        assert!(score > 0.5);
    }

    // LengthRelevancyMetric: empty output is worthless (0.0).
    #[test]
    fn test_length_relevancy_empty_output() {
        let m = LengthRelevancyMetric;
        let score = m.score("hello", "", &[]);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    // EvalReport::summary renders pass ratio and percentage.
    #[test]
    fn test_eval_report_summary() {
        let report = EvalReport {
            suite_name: "test".into(),
            results: vec![],
            average_score: 0.85,
            passed: 8,
            total: 10,
        };
        let s = report.summary();
        assert!(s.contains("8/10"));
        assert!(s.contains("80.0%"));
    }
}