Skip to main content

swink_agent_eval/evaluators/
agent.rs

1//! Agent/trajectory-family evaluators (T069, T070).
2//!
3//! Nine judge-backed evaluators score agent behavior across trajectory
4//! accuracy, task completion, user satisfaction, tone, knowledge retention,
5//! language detection, perceived error, and multi-agent interactions.
6//!
7//! * [`TrajectoryAccuracyEvaluator`] — trajectory quality without a reference
8//!   (prompt: `trajectory_accuracy_v0`).
9//! * [`TrajectoryAccuracyWithRefEvaluator`] — trajectory accuracy against an
10//!   expected trajectory (prompt: `trajectory_accuracy_with_ref_v0`).
11//! * [`TaskCompletionEvaluator`] — whether the declared assertion was met
12//!   (prompt: `task_completion_v0`).
13//! * [`UserSatisfactionEvaluator`] — projected user satisfaction
14//!   (prompt: `user_satisfaction_v0`).
15//! * [`AgentToneEvaluator`] — response tone quality (prompt: `agent_tone_v0`).
16//! * [`KnowledgeRetentionEvaluator`] — context retention across turns
17//!   (prompt: `knowledge_retention_v0`).
18//! * [`LanguageDetectionEvaluator`] — language-match between prompt and response
19//!   (prompt: `language_detection_v0`).
20//! * [`PerceivedErrorEvaluator`] — user-visible error signals in the response
21//!   (prompt: `perceived_error_v0`).
22//! * [`InteractionsEvaluator`] — multi-agent interaction topology accuracy
23//!   (prompt: `interactions_v0`).
24
25#![forbid(unsafe_code)]
26#![cfg(feature = "evaluator-agent")]
27
28use std::sync::Arc;
29
30use crate::evaluator::Evaluator;
31use crate::types::{EvalCase, EvalMetricResult, Invocation};
32
33use super::{JudgeEvaluatorConfig, build_prompt_context, evaluate_with_builtin};
34
35fn has_final_response(invocation: &Invocation) -> bool {
36    invocation
37        .final_response
38        .as_deref()
39        .is_some_and(|s| !s.trim().is_empty())
40}
41
42fn has_user_prompt(case: &EvalCase) -> bool {
43    !case.user_messages.is_empty()
44}
45
/// Generates a single-rubric agent evaluator.
///
/// Arguments, in order: optional attributes (including doc comments) that
/// are attached to the generated struct, the struct name, the evaluator
/// name literal returned by `Evaluator::name`, the prompt template literal
/// handed to [`evaluate_with_builtin`], and the FR-020 criterion — an
/// expression that coerces to `fn(&EvalCase, &Invocation) -> bool`
/// (typically a non-capturing closure).
macro_rules! agent_evaluator {
    (
        $(#[$meta:meta])*
        $name:ident, $eval_name:literal, $template:literal, $criterion:expr
    ) => {
        $(#[$meta])*
        pub struct $name {
            config: JudgeEvaluatorConfig,
        }

        impl $name {
            /// Build the evaluator around the supplied judge config.
            #[must_use]
            pub const fn new(config: JudgeEvaluatorConfig) -> Self {
                Self { config }
            }

            /// Replace the prompt template this evaluator uses; forwards to
            /// the config's `with_prompt`.
            #[must_use]
            pub fn with_prompt(
                self,
                template: Arc<dyn crate::prompt::JudgePromptTemplate>,
            ) -> Self {
                Self {
                    config: self.config.with_prompt(template),
                }
            }

            /// Attach evaluator-level few-shot examples, which render ahead
            /// of any case-level examples; forwards to the config's
            /// `with_few_shot`.
            #[must_use]
            pub fn with_few_shot(self, examples: Vec<crate::types::FewShotExample>) -> Self {
                Self {
                    config: self.config.with_few_shot(examples),
                }
            }

            /// Replace the system prompt visible to the template render;
            /// forwards to the config's `with_system_prompt`.
            #[must_use]
            pub fn with_system_prompt(self, prompt: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_system_prompt(prompt),
                }
            }

            /// Attach an output schema for custom prompt templates; forwards
            /// to the config's `with_output_schema`.
            #[must_use]
            pub fn with_output_schema(self, schema: serde_json::Value) -> Self {
                Self {
                    config: self.config.with_output_schema(schema),
                }
            }

            /// Turn judge reasoning capture on or off; forwards to the
            /// config's `with_use_reasoning`.
            #[must_use]
            pub fn with_use_reasoning(self, flag: bool) -> Self {
                Self {
                    config: self.config.with_use_reasoning(flag),
                }
            }

            /// Replace the feedback key consumed by downstream exporters;
            /// forwards to the config's `with_feedback_key`.
            #[must_use]
            pub fn with_feedback_key(self, key: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_feedback_key(key),
                }
            }

            /// Shared access to the wrapped config (e.g., to inspect the
            /// judge registry or feedback key).
            #[must_use]
            pub const fn config(&self) -> &JudgeEvaluatorConfig {
                &self.config
            }
        }

        impl $crate::evaluators::JudgeEvaluatorBuilder for $name {
            fn judge_config_mut(&mut self) -> &mut JudgeEvaluatorConfig {
                &mut self.config
            }
        }

        impl Evaluator for $name {
            fn name(&self) -> &'static str {
                $eval_name
            }

            fn evaluate(
                &self,
                case: &EvalCase,
                invocation: &Invocation,
            ) -> Option<EvalMetricResult> {
                // FR-020: when the criterion does not hold, no metric is
                // produced at all (`None`), and the judge path is not
                // entered.
                let applies: fn(&EvalCase, &Invocation) -> bool = $criterion;
                applies(case, invocation).then(|| {
                    let context = build_prompt_context(&self.config, case, invocation);
                    evaluate_with_builtin($eval_name, $template, &self.config, &context)
                })
            }
        }
    };
}
148
149agent_evaluator! {
150    /// Trajectory accuracy without a reference trajectory
151    /// (prompt: `trajectory_accuracy_v0`).
152    ///
153    /// Criterion: the case must have a user prompt and the invocation must
154    /// have a non-empty final response.
155    TrajectoryAccuracyEvaluator,
156    "trajectory_accuracy",
157    "trajectory_accuracy_v0",
158    |case, invocation| has_user_prompt(case) && has_final_response(invocation)
159}
160
161agent_evaluator! {
162    /// Trajectory accuracy against an expected reference trajectory
163    /// (prompt: `trajectory_accuracy_with_ref_v0`).
164    ///
165    /// Criterion: the case must declare an `expected_trajectory`, have a user
166    /// prompt, and the invocation must have a non-empty final response.
167    TrajectoryAccuracyWithRefEvaluator,
168    "trajectory_accuracy_with_ref",
169    "trajectory_accuracy_with_ref_v0",
170    |case, invocation| case.expected_trajectory.is_some()
171        && has_user_prompt(case)
172        && has_final_response(invocation)
173}
174
175agent_evaluator! {
176    /// Task completion against a declared assertion
177    /// (prompt: `task_completion_v0`).
178    ///
179    /// Criterion: the case must declare an `expected_assertion` and the
180    /// invocation must have a non-empty final response.
181    TaskCompletionEvaluator,
182    "task_completion",
183    "task_completion_v0",
184    |case, invocation| case.expected_assertion.is_some() && has_final_response(invocation)
185}
186
187agent_evaluator! {
188    /// Projected user satisfaction with the response
189    /// (prompt: `user_satisfaction_v0`).
190    ///
191    /// Criterion: the case must have a user prompt and the invocation must
192    /// have a non-empty final response.
193    UserSatisfactionEvaluator,
194    "user_satisfaction",
195    "user_satisfaction_v0",
196    |case, invocation| has_user_prompt(case) && has_final_response(invocation)
197}
198
199agent_evaluator! {
200    /// Response tone quality — professional, helpful register
201    /// (prompt: `agent_tone_v0`).
202    ///
203    /// Criterion: the invocation must have a non-empty final response.
204    /// A user prompt is not required because tone is scored on the response
205    /// alone.
206    AgentToneEvaluator,
207    "agent_tone",
208    "agent_tone_v0",
209    |_case, invocation| has_final_response(invocation)
210}
211
212agent_evaluator! {
213    /// Knowledge retention across conversation turns
214    /// (prompt: `knowledge_retention_v0`).
215    ///
216    /// Criterion: the case must have a user prompt and the invocation must
217    /// have a non-empty final response.
218    KnowledgeRetentionEvaluator,
219    "knowledge_retention",
220    "knowledge_retention_v0",
221    |case, invocation| has_user_prompt(case) && has_final_response(invocation)
222}
223
224agent_evaluator! {
225    /// Language-match between prompt and response
226    /// (prompt: `language_detection_v0`).
227    ///
228    /// Criterion: the case must have a user prompt and the invocation must
229    /// have a non-empty final response.
230    LanguageDetectionEvaluator,
231    "language_detection",
232    "language_detection_v0",
233    |case, invocation| has_user_prompt(case) && has_final_response(invocation)
234}
235
236agent_evaluator! {
237    /// User-visible error signals in the response
238    /// (prompt: `perceived_error_v0`).
239    ///
240    /// Criterion: the invocation must have a non-empty final response.
241    /// A user prompt is not required because error signals are scored on the
242    /// response alone.
243    PerceivedErrorEvaluator,
244    "perceived_error",
245    "perceived_error_v0",
246    |_case, invocation| has_final_response(invocation)
247}
248
249agent_evaluator! {
250    /// Multi-agent interaction topology accuracy against declared expected
251    /// interactions (prompt: `interactions_v0`).
252    ///
253    /// Criterion: the case must declare `expected_interactions`, have a user
254    /// prompt, and the invocation must have a non-empty final response.
255    InteractionsEvaluator,
256    "interactions",
257    "interactions_v0",
258    |case, invocation| case.expected_interactions.is_some()
259        && has_user_prompt(case)
260        && has_final_response(invocation)
261}