1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
//! Evaluation case and set types — define test scenarios for agents.
use serde::{Deserialize, Serialize};
/// A single turn in a conversation for evaluation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InvocationTurn {
/// The role of this turn (e.g., "user", "model").
pub role: String,
/// The text content of this turn.
pub content: String,
/// Tool calls made during this turn (if any).
#[serde(default)]
pub tool_calls: Vec<serde_json::Value>,
/// Tool results returned during this turn (if any).
#[serde(default)]
pub tool_results: Vec<serde_json::Value>,
}
/// A single invocation (conversation) for evaluation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Invocation {
/// Unique identifier for this invocation.
#[serde(default)]
pub id: String,
/// The turns of conversation in this invocation.
pub turns: Vec<InvocationTurn>,
/// Optional metadata about this invocation.
#[serde(default)]
pub metadata: serde_json::Value,
}
/// A single evaluation case — pairs actual invocations with optional expected results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalCase {
/// Name of the eval case.
pub name: String,
/// The actual agent invocations to evaluate.
pub actual: Vec<Invocation>,
/// The expected (golden) invocations for comparison.
#[serde(default)]
pub expected: Vec<Invocation>,
/// Optional conversation scenario description.
#[serde(default)]
pub scenario: Option<String>,
}
/// A collection of evaluation cases.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSet {
/// Name of this evaluation set.
pub name: String,
/// The evaluation cases in this set.
pub cases: Vec<EvalCase>,
/// Optional description.
#[serde(default)]
pub description: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn eval_case_serde_roundtrip() {
let case = EvalCase {
name: "test-case".into(),
actual: vec![Invocation {
id: "inv-1".into(),
turns: vec![InvocationTurn {
role: "user".into(),
content: "What is the weather?".into(),
tool_calls: vec![],
tool_results: vec![],
}],
metadata: serde_json::Value::Null,
}],
expected: vec![],
scenario: Some("Weather query".into()),
};
let json = serde_json::to_string(&case).unwrap();
let deserialized: EvalCase = serde_json::from_str(&json).unwrap();
assert_eq!(deserialized.name, "test-case");
assert_eq!(deserialized.actual.len(), 1);
}
#[test]
fn eval_set_construction() {
let set = EvalSet {
name: "suite-1".into(),
cases: vec![],
description: Some("Test suite".into()),
};
assert_eq!(set.name, "suite-1");
assert!(set.cases.is_empty());
}
}