// traitclaw_eval/lib.rs
//! Evaluation framework for the `TraitClaw` AI agent framework.
//!
//! Provides `EvalSuite`, `TestCase`, and `Metric` abstractions for
//! measuring agent quality. Includes built-in metrics for relevancy
//! and keyword matching.
//!
//! # Quick Start
//!
//! ```rust
//! use traitclaw_eval::{EvalSuite, TestCase, KeywordMetric, Metric};
//!
//! let suite = EvalSuite::new("quality_tests")
//!     .add_case(TestCase::new("greeting", "Say hello")
//!         .expect_contains("hello"));
//!
//! assert_eq!(suite.name(), "quality_tests");
//! assert_eq!(suite.cases().len(), 1);
//!
//! let metric = KeywordMetric;
//! let score = metric.score("Say hello", "Hello! How can I help?", &["hello"]);
//! assert!(score > 0.0);
//! ```

#![deny(warnings)]
#![deny(missing_docs)]
#![warn(clippy::pedantic)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::cast_precision_loss)] // usize→f64 for scoring is acceptable
#![allow(clippy::doc_markdown)]

pub mod export;
pub mod metrics;
pub mod runner;

pub use export::EvalReportExport;
pub use metrics::{JudgeProvider, LlmJudgeMetric, SchemaValidationMetric, ToolUsageMetric};
pub use runner::{AsyncMetric, EvalAgent, EvalRunner, SyncMetricAdapter};

use serde::{Deserialize, Serialize};

41/// A suite of evaluation test cases.
42pub struct EvalSuite {
43    name: String,
44    cases: Vec<TestCase>,
45}
46
47impl EvalSuite {
48    /// Create a new evaluation suite.
49    #[must_use]
50    pub fn new(name: impl Into<String>) -> Self {
51        Self {
52            name: name.into(),
53            cases: Vec::new(),
54        }
55    }
56
57    /// Add a test case to the suite.
58    #[must_use]
59    pub fn add_case(mut self, case: TestCase) -> Self {
60        self.cases.push(case);
61        self
62    }
63
64    /// Get the suite name.
65    #[must_use]
66    pub fn name(&self) -> &str {
67        &self.name
68    }
69
70    /// Get all test cases.
71    #[must_use]
72    pub fn cases(&self) -> &[TestCase] {
73        &self.cases
74    }
75}
76
77/// A single evaluation test case.
78#[derive(Debug, Clone, Serialize, Deserialize)]
79pub struct TestCase {
80    /// Test case identifier.
81    pub id: String,
82    /// Input prompt for the agent.
83    pub input: String,
84    /// Expected output keywords (for keyword matching).
85    pub expected_keywords: Vec<String>,
86    /// Optional expected exact output.
87    pub expected_output: Option<String>,
88}
89
90impl TestCase {
91    /// Create a new test case.
92    #[must_use]
93    pub fn new(id: impl Into<String>, input: impl Into<String>) -> Self {
94        Self {
95            id: id.into(),
96            input: input.into(),
97            expected_keywords: Vec::new(),
98            expected_output: None,
99        }
100    }
101
102    /// Add an expected keyword to match in the output.
103    #[must_use]
104    pub fn expect_contains(mut self, keyword: impl Into<String>) -> Self {
105        self.expected_keywords.push(keyword.into());
106        self
107    }
108
109    /// Set the expected exact output.
110    #[must_use]
111    pub fn expect_output(mut self, output: impl Into<String>) -> Self {
112        self.expected_output = Some(output.into());
113        self
114    }
115}
116
/// The outcome of running a single test case against an agent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestResult {
    /// ID of the `TestCase` this result belongs to.
    pub case_id: String,
    /// The actual output produced by the agent for the case's input.
    pub actual_output: String,
    /// Per-metric scores, keyed by metric name; each score is in 0.0-1.0.
    pub scores: std::collections::HashMap<String, f64>,
    /// Whether the test passed (all scores above threshold).
    pub passed: bool,
}

130/// An evaluation report.
131#[derive(Debug, Clone, Serialize, Deserialize)]
132pub struct EvalReport {
133    /// Suite name.
134    pub suite_name: String,
135    /// Individual test results.
136    pub results: Vec<TestResult>,
137    /// Average score across all tests and metrics.
138    pub average_score: f64,
139    /// Number of tests that passed.
140    pub passed: usize,
141    /// Total number of tests.
142    pub total: usize,
143}
144
145impl EvalReport {
146    /// Generate a human-readable summary.
147    #[must_use]
148    pub fn summary(&self) -> String {
149        format!(
150            "Eval Report: {}\n  Passed: {}/{} ({:.1}%)\n  Average Score: {:.2}",
151            self.suite_name,
152            self.passed,
153            self.total,
154            if self.total > 0 {
155                self.passed as f64 / self.total as f64 * 100.0
156            } else {
157                0.0
158            },
159            self.average_score,
160        )
161    }
162}
163
/// Trait implemented by every evaluation metric.
pub trait Metric: Send + Sync + 'static {
    /// Metric name.
    fn name(&self) -> &'static str;

    /// Score the actual output against the expected criteria.
    ///
    /// Returns a score from 0.0 (worst) to 1.0 (best).
    fn score(&self, input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64;
}

/// Built-in keyword matching metric.
///
/// Scores the fraction of expected keywords found in the output.
/// Matching is case-insensitive; an empty keyword list scores 1.0.
pub struct KeywordMetric;

impl Metric for KeywordMetric {
    fn name(&self) -> &'static str {
        "keyword_match"
    }

    fn score(&self, _input: &str, actual_output: &str, expected_keywords: &[&str]) -> f64 {
        // No expectations means nothing can fail: perfect score.
        if expected_keywords.is_empty() {
            return 1.0;
        }
        // Lowercase once, then count keywords present in the output.
        let haystack = actual_output.to_lowercase();
        let mut hits = 0_usize;
        for keyword in expected_keywords {
            if haystack.contains(&keyword.to_lowercase()) {
                hits += 1;
            }
        }
        hits as f64 / expected_keywords.len() as f64
    }
}

198/// Built-in length-based relevancy metric.
199///
200/// Penalizes very short or very long responses relative to input length.
201pub struct LengthRelevancyMetric;
202
203impl Metric for LengthRelevancyMetric {
204    fn name(&self) -> &'static str {
205        "length_relevancy"
206    }
207
208    fn score(&self, input: &str, actual_output: &str, _expected_keywords: &[&str]) -> f64 {
209        let input_len = input.len() as f64;
210        let output_len = actual_output.len() as f64;
211
212        if output_len == 0.0 {
213            return 0.0;
214        }
215
216        // Ideal ratio: output is 2-10x the input length
217        let ratio = output_len / input_len.max(1.0);
218        if (2.0..=10.0).contains(&ratio) {
219            1.0
220        } else if ratio < 2.0 {
221            ratio / 2.0
222        } else {
223            (10.0 / ratio).min(1.0)
224        }
225    }
226}
227
#[cfg(test)]
mod tests {
    use super::*;

    // Builder returns Self, so cases accumulate across chained calls.
    #[test]
    fn test_eval_suite_builder() {
        let suite = EvalSuite::new("test_suite")
            .add_case(TestCase::new("t1", "Hello"))
            .add_case(TestCase::new("t2", "World"));
        assert_eq!(suite.name(), "test_suite");
        assert_eq!(suite.cases().len(), 2);
    }

    // Keywords accumulate; expected_output holds the last value set.
    #[test]
    fn test_test_case_builder() {
        let tc = TestCase::new("t1", "prompt")
            .expect_contains("keyword1")
            .expect_contains("keyword2")
            .expect_output("exact output");
        assert_eq!(tc.expected_keywords.len(), 2);
        assert_eq!(tc.expected_output, Some("exact output".into()));
    }

    // All keywords present (case-insensitive) => score 1.0.
    #[test]
    fn test_keyword_metric_all_match() {
        let m = KeywordMetric;
        let score = m.score("input", "hello world foo", &["hello", "world"]);
        assert!((score - 1.0).abs() < f64::EPSILON);
    }

    // One of two keywords present => score 0.5.
    #[test]
    fn test_keyword_metric_partial_match() {
        let m = KeywordMetric;
        let score = m.score("input", "hello there", &["hello", "world"]);
        assert!((score - 0.5).abs() < f64::EPSILON);
    }

    // No keywords present => score 0.0.
    #[test]
    fn test_keyword_metric_no_match() {
        let m = KeywordMetric;
        let score = m.score("input", "nothing here", &["hello", "world"]);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    // Empty keyword list is vacuously satisfied => score 1.0.
    #[test]
    fn test_keyword_metric_empty_keywords() {
        let m = KeywordMetric;
        let score = m.score("input", "anything", &[]);
        assert!((score - 1.0).abs() < f64::EPSILON);
    }

    // Output inside the ideal 2x-10x length ratio scores well.
    #[test]
    fn test_length_relevancy_ideal() {
        let m = LengthRelevancyMetric;
        // Output 5x input = ideal range
        let score = m.score("hello", "hello world this is a response text here!", &[]);
        assert!(score > 0.5);
    }

    // Empty output always scores 0.0.
    #[test]
    fn test_length_relevancy_empty_output() {
        let m = LengthRelevancyMetric;
        let score = m.score("hello", "", &[]);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    // Summary embeds the pass count and the pass percentage.
    #[test]
    fn test_eval_report_summary() {
        let report = EvalReport {
            suite_name: "test".into(),
            results: vec![],
            average_score: 0.85,
            passed: 8,
            total: 10,
        };
        let s = report.summary();
        assert!(s.contains("8/10"));
        assert!(s.contains("80.0%"));
    }
}