swink_agent_eval/evaluators/agent.rs
1//! Agent/trajectory-family evaluators (T069, T070).
2//!
3//! Nine judge-backed evaluators score agent behavior across trajectory
4//! accuracy, task completion, user satisfaction, tone, knowledge retention,
5//! language detection, perceived error, and multi-agent interactions.
6//!
7//! * [`TrajectoryAccuracyEvaluator`] — trajectory quality without a reference
8//! (prompt: `trajectory_accuracy_v0`).
9//! * [`TrajectoryAccuracyWithRefEvaluator`] — trajectory accuracy against an
10//! expected trajectory (prompt: `trajectory_accuracy_with_ref_v0`).
11//! * [`TaskCompletionEvaluator`] — whether the declared assertion was met
12//! (prompt: `task_completion_v0`).
13//! * [`UserSatisfactionEvaluator`] — projected user satisfaction
14//! (prompt: `user_satisfaction_v0`).
15//! * [`AgentToneEvaluator`] — response tone quality (prompt: `agent_tone_v0`).
16//! * [`KnowledgeRetentionEvaluator`] — context retention across turns
17//! (prompt: `knowledge_retention_v0`).
18//! * [`LanguageDetectionEvaluator`] — language-match between prompt and response
19//! (prompt: `language_detection_v0`).
20//! * [`PerceivedErrorEvaluator`] — user-visible error signals in the response
21//! (prompt: `perceived_error_v0`).
22//! * [`InteractionsEvaluator`] — multi-agent interaction topology accuracy
23//! (prompt: `interactions_v0`).
24
25#![forbid(unsafe_code)]
26#![cfg(feature = "evaluator-agent")]
27
28use std::sync::Arc;
29
30use crate::evaluator::Evaluator;
31use crate::types::{EvalCase, EvalMetricResult, Invocation};
32
33use super::{JudgeEvaluatorConfig, build_prompt_context, evaluate_with_builtin};
34
35fn has_final_response(invocation: &Invocation) -> bool {
36 invocation
37 .final_response
38 .as_deref()
39 .is_some_and(|s| !s.trim().is_empty())
40}
41
42fn has_user_prompt(case: &EvalCase) -> bool {
43 !case.user_messages.is_empty()
44}
45
/// Generates a single-rubric, judge-backed agent evaluator.
///
/// An invocation supplies, in order: optional attributes / doc comments, the
/// struct name, the evaluator name literal, the built-in prompt template
/// literal, and the FR-020 criterion. The criterion is bound to a
/// `fn(&EvalCase, &Invocation) -> bool` pointer, so it must be a
/// non-capturing closure or a plain function. The generated `evaluate`
/// returns `None` when the criterion is false and otherwise dispatches via
/// [`evaluate_with_builtin`].
macro_rules! agent_evaluator {
    (
        $(#[$attr:meta])*
        $evaluator:ident, $metric:literal, $prompt_id:literal, $gate:expr
    ) => {
        $(#[$attr])*
        pub struct $evaluator {
            config: JudgeEvaluatorConfig,
        }

        impl $evaluator {
            /// Build the evaluator around the supplied judge config.
            #[must_use]
            pub const fn new(config: JudgeEvaluatorConfig) -> Self {
                Self { config }
            }

            /// Borrow the underlying config (e.g., to inspect the judge
            /// registry or feedback key).
            #[must_use]
            pub const fn config(&self) -> &JudgeEvaluatorConfig {
                &self.config
            }

            /// Replace the prompt template used by this evaluator.
            #[must_use]
            pub fn with_prompt(
                self,
                template: Arc<dyn crate::prompt::JudgePromptTemplate>,
            ) -> Self {
                Self {
                    config: self.config.with_prompt(template),
                }
            }

            /// Attach evaluator-level few-shot examples that render before
            /// any case-level examples.
            #[must_use]
            pub fn with_few_shot(self, examples: Vec<crate::types::FewShotExample>) -> Self {
                Self {
                    config: self.config.with_few_shot(examples),
                }
            }

            /// Replace the system prompt visible to the template render.
            #[must_use]
            pub fn with_system_prompt(self, prompt: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_system_prompt(prompt),
                }
            }

            /// Attach an output schema for custom prompt templates.
            #[must_use]
            pub fn with_output_schema(self, schema: serde_json::Value) -> Self {
                Self {
                    config: self.config.with_output_schema(schema),
                }
            }

            /// Enable or disable judge reasoning capture.
            #[must_use]
            pub fn with_use_reasoning(self, flag: bool) -> Self {
                Self {
                    config: self.config.with_use_reasoning(flag),
                }
            }

            /// Replace the feedback key used by downstream exporters.
            #[must_use]
            pub fn with_feedback_key(self, key: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_feedback_key(key),
                }
            }
        }

        impl $crate::evaluators::JudgeEvaluatorBuilder for $evaluator {
            fn judge_config_mut(&mut self) -> &mut JudgeEvaluatorConfig {
                &mut self.config
            }
        }

        impl Evaluator for $evaluator {
            fn name(&self) -> &'static str {
                $metric
            }

            fn evaluate(
                &self,
                case: &EvalCase,
                invocation: &Invocation,
            ) -> Option<EvalMetricResult> {
                // FR-020: yield `None` when the criterion is absent. Binding
                // to a fn pointer fixes the argument types of the closure
                // passed in by the macro invocation.
                let applies: fn(&EvalCase, &Invocation) -> bool = $gate;

                // The prompt context is only built once the criterion holds.
                applies(case, invocation).then(|| {
                    let context = build_prompt_context(&self.config, case, invocation);
                    evaluate_with_builtin($metric, $prompt_id, &self.config, &context)
                })
            }
        }
    };
}
148
149agent_evaluator! {
150 /// Trajectory accuracy without a reference trajectory
151 /// (prompt: `trajectory_accuracy_v0`).
152 ///
153 /// Criterion: the case must have a user prompt and the invocation must
154 /// have a non-empty final response.
155 TrajectoryAccuracyEvaluator,
156 "trajectory_accuracy",
157 "trajectory_accuracy_v0",
158 |case, invocation| has_user_prompt(case) && has_final_response(invocation)
159}
160
161agent_evaluator! {
162 /// Trajectory accuracy against an expected reference trajectory
163 /// (prompt: `trajectory_accuracy_with_ref_v0`).
164 ///
165 /// Criterion: the case must declare an `expected_trajectory`, have a user
166 /// prompt, and the invocation must have a non-empty final response.
167 TrajectoryAccuracyWithRefEvaluator,
168 "trajectory_accuracy_with_ref",
169 "trajectory_accuracy_with_ref_v0",
170 |case, invocation| case.expected_trajectory.is_some()
171 && has_user_prompt(case)
172 && has_final_response(invocation)
173}
174
175agent_evaluator! {
176 /// Task completion against a declared assertion
177 /// (prompt: `task_completion_v0`).
178 ///
179 /// Criterion: the case must declare an `expected_assertion` and the
180 /// invocation must have a non-empty final response.
181 TaskCompletionEvaluator,
182 "task_completion",
183 "task_completion_v0",
184 |case, invocation| case.expected_assertion.is_some() && has_final_response(invocation)
185}
186
187agent_evaluator! {
188 /// Projected user satisfaction with the response
189 /// (prompt: `user_satisfaction_v0`).
190 ///
191 /// Criterion: the case must have a user prompt and the invocation must
192 /// have a non-empty final response.
193 UserSatisfactionEvaluator,
194 "user_satisfaction",
195 "user_satisfaction_v0",
196 |case, invocation| has_user_prompt(case) && has_final_response(invocation)
197}
198
199agent_evaluator! {
200 /// Response tone quality — professional, helpful register
201 /// (prompt: `agent_tone_v0`).
202 ///
203 /// Criterion: the invocation must have a non-empty final response.
204 /// A user prompt is not required because tone is scored on the response
205 /// alone.
206 AgentToneEvaluator,
207 "agent_tone",
208 "agent_tone_v0",
209 |_case, invocation| has_final_response(invocation)
210}
211
212agent_evaluator! {
213 /// Knowledge retention across conversation turns
214 /// (prompt: `knowledge_retention_v0`).
215 ///
216 /// Criterion: the case must have a user prompt and the invocation must
217 /// have a non-empty final response.
218 KnowledgeRetentionEvaluator,
219 "knowledge_retention",
220 "knowledge_retention_v0",
221 |case, invocation| has_user_prompt(case) && has_final_response(invocation)
222}
223
224agent_evaluator! {
225 /// Language-match between prompt and response
226 /// (prompt: `language_detection_v0`).
227 ///
228 /// Criterion: the case must have a user prompt and the invocation must
229 /// have a non-empty final response.
230 LanguageDetectionEvaluator,
231 "language_detection",
232 "language_detection_v0",
233 |case, invocation| has_user_prompt(case) && has_final_response(invocation)
234}
235
236agent_evaluator! {
237 /// User-visible error signals in the response
238 /// (prompt: `perceived_error_v0`).
239 ///
240 /// Criterion: the invocation must have a non-empty final response.
241 /// A user prompt is not required because error signals are scored on the
242 /// response alone.
243 PerceivedErrorEvaluator,
244 "perceived_error",
245 "perceived_error_v0",
246 |_case, invocation| has_final_response(invocation)
247}
248
249agent_evaluator! {
250 /// Multi-agent interaction topology accuracy against declared expected
251 /// interactions (prompt: `interactions_v0`).
252 ///
253 /// Criterion: the case must declare `expected_interactions`, have a user
254 /// prompt, and the invocation must have a non-empty final response.
255 InteractionsEvaluator,
256 "interactions",
257 "interactions_v0",
258 |case, invocation| case.expected_interactions.is_some()
259 && has_user_prompt(case)
260 && has_final_response(invocation)
261}