Skip to main content

swink_agent_eval/evaluators/
safety.rs

//! Safety-family judge-backed evaluators (T062, T063, T064).
//!
//! Every evaluator in this module produces a binary pass/fail score and
//! explicitly sets the default aggregator to [`crate::AllPass`] in its
//! constructor (data-model §3 safety family).
//!
//! The [`HarmfulnessEvaluator`] and [`ToxicityEvaluator`] ship distinct
//! rubrics (spec 043 clarification Q1):
//!
//! * `HarmfulnessEvaluator` is the broad rubric — self-harm, weapons,
//!   illegal-activity, large-scale societal risk.
//! * `ToxicityEvaluator` is the narrow rubric — hate speech, harassment,
//!   slurs, directed insults.
14
15#![forbid(unsafe_code)]
16#![cfg(feature = "evaluator-safety")]
17
18use std::sync::Arc;
19
20use serde::{Deserialize, Serialize};
21
22use crate::aggregator::AllPass;
23use crate::evaluator::Evaluator;
24use crate::types::{EvalCase, EvalMetricResult, Invocation};
25
26use super::{JudgeEvaluatorConfig, build_prompt_context, evaluate_with_builtin};
27
28fn has_final_response(_case: &EvalCase, invocation: &Invocation) -> bool {
29    invocation
30        .final_response
31        .as_deref()
32        .is_some_and(|s| !s.trim().is_empty())
33}
34
35fn has_user_prompt(case: &EvalCase, _invocation: &Invocation) -> bool {
36    !case.user_messages.is_empty()
37}
38
39/// Set the AllPass aggregator on a config unless the caller already picked one.
40fn with_safety_default(config: JudgeEvaluatorConfig) -> JudgeEvaluatorConfig {
41    if config.aggregator.is_some() {
42        config
43    } else {
44        config.with_aggregator(Arc::new(AllPass))
45    }
46}
47
// Generates one judge-backed safety evaluator.
//
// Arguments, in order:
//   * optional attributes / doc comments, forwarded onto the generated struct;
//   * `$name`      - identifier of the generated struct;
//   * `$eval_name` - string literal returned by `Evaluator::name` and passed
//                    to `evaluate_with_builtin`;
//   * `$template`  - string literal naming the built-in prompt template;
//   * `$criterion` - expression coercible to
//                    `fn(&EvalCase, &Invocation) -> bool`; when it returns
//                    `false` the evaluator yields `None` for that invocation.
macro_rules! safety_evaluator {
    (
        $(#[$meta:meta])*
        $name:ident, $eval_name:literal, $template:literal, $criterion:expr
    ) => {
        $(#[$meta])*
        pub struct $name {
            config: JudgeEvaluatorConfig,
        }

        impl $name {
            /// Build the evaluator from a judge config. When the config has
            /// no aggregator configured, [`crate::AllPass`] is installed as
            /// the default (data-model §3 safety family); an explicit
            /// aggregator is left as-is.
            #[must_use]
            pub fn new(config: JudgeEvaluatorConfig) -> Self {
                let config = with_safety_default(config);
                Self { config }
            }

            /// Replace the prompt template this evaluator's config carries.
            #[must_use]
            pub fn with_prompt(self, template: Arc<dyn crate::prompt::JudgePromptTemplate>) -> Self {
                Self {
                    config: self.config.with_prompt(template),
                }
            }

            /// Attach evaluator-level few-shot examples; these render ahead
            /// of any case-level examples.
            #[must_use]
            pub fn with_few_shot(self, examples: Vec<crate::types::FewShotExample>) -> Self {
                Self {
                    config: self.config.with_few_shot(examples),
                }
            }

            /// Replace the system prompt made visible to the template render.
            #[must_use]
            pub fn with_system_prompt(self, prompt: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_system_prompt(prompt),
                }
            }

            /// Attach an output schema, for use with custom prompt templates.
            #[must_use]
            pub fn with_output_schema(self, schema: serde_json::Value) -> Self {
                Self {
                    config: self.config.with_output_schema(schema),
                }
            }

            /// Turn judge reasoning capture on or off.
            #[must_use]
            pub fn with_use_reasoning(self, flag: bool) -> Self {
                Self {
                    config: self.config.with_use_reasoning(flag),
                }
            }

            /// Replace the feedback key consumed by downstream exporters.
            #[must_use]
            pub fn with_feedback_key(self, key: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_feedback_key(key),
                }
            }

            /// Shared view of the judge config held by this evaluator.
            #[must_use]
            pub const fn config(&self) -> &JudgeEvaluatorConfig {
                &self.config
            }
        }

        impl $crate::evaluators::JudgeEvaluatorBuilder for $name {
            /// Mutable access to the judge config held by this evaluator.
            fn judge_config_mut(&mut self) -> &mut JudgeEvaluatorConfig {
                &mut self.config
            }
        }

        impl Evaluator for $name {
            /// The evaluator name supplied at macro invocation.
            fn name(&self) -> &'static str {
                $eval_name
            }

            /// Run the judge for this invocation, or return `None` when the
            /// evaluator's criterion does not hold.
            fn evaluate(
                &self,
                case: &EvalCase,
                invocation: &Invocation,
            ) -> Option<EvalMetricResult> {
                // Pin the criterion to a plain fn-pointer type so every
                // invocation of the macro is held to the same signature.
                let applies: fn(&EvalCase, &Invocation) -> bool = $criterion;

                applies(case, invocation).then(|| {
                    let ctx = build_prompt_context(&self.config, case, invocation);
                    evaluate_with_builtin($eval_name, $template, &self.config, &ctx)
                })
            }
        }
    };
}
150
151safety_evaluator! {
152    /// Broad-spectrum harmful-content check (prompt: `harmfulness_v0`).
153    HarmfulnessEvaluator,
154    "harmfulness",
155    "harmfulness_v0",
156    has_final_response
157}
158
159safety_evaluator! {
160    /// Narrow rubric targeting hate speech, harassment, and slurs (prompt:
161    /// `toxicity_v0`). Distinct from [`HarmfulnessEvaluator`].
162    ToxicityEvaluator,
163    "toxicity",
164    "toxicity_v0",
165    has_final_response
166}
167
168safety_evaluator! {
169    /// Fairness / disparate-treatment check (prompt: `fairness_v0`).
170    FairnessEvaluator,
171    "fairness",
172    "fairness_v0",
173    has_final_response
174}
175
176safety_evaluator! {
177    /// Prompt-injection detector evaluated against the user prompt (prompt:
178    /// `prompt_injection_v0`). Criterion: the case must include at least one
179    /// user message.
180    PromptInjectionEvaluator,
181    "prompt_injection",
182    "prompt_injection_v0",
183    has_user_prompt
184}
185
186safety_evaluator! {
187    /// Code-injection detector evaluated against the user prompt (prompt:
188    /// `code_injection_v0`).
189    CodeInjectionEvaluator,
190    "code_injection",
191    "code_injection_v0",
192    has_user_prompt
193}
194
195/// PII categories recognised by [`PIILeakageEvaluator`].
196///
197/// `Other(String)` lets consumers add custom entity classes (e.g.,
198/// `"MedicalRecordNumber"`) without forking the evaluator.
199#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
200#[serde(rename_all = "snake_case")]
201pub enum PIIClass {
202    Email,
203    Phone,
204    /// Social Security Number (US).
205    Ssn,
206    CreditCard,
207    IpAddress,
208    ApiKey,
209    PersonalName,
210    Address,
211    /// Free-form class label; callers supply the class name.
212    Other(String),
213}
214
215impl PIIClass {
216    /// Canonical name used in prompt rendering and telemetry.
217    #[must_use]
218    pub fn canonical_name(&self) -> String {
219        match self {
220            Self::Email => "email".into(),
221            Self::Phone => "phone".into(),
222            Self::Ssn => "ssn".into(),
223            Self::CreditCard => "credit_card".into(),
224            Self::IpAddress => "ip_address".into(),
225            Self::ApiKey => "api_key".into(),
226            Self::PersonalName => "personal_name".into(),
227            Self::Address => "address".into(),
228            Self::Other(name) => name.clone(),
229        }
230    }
231
232    /// All built-in PII classes in stable registration order. `Other` is
233    /// intentionally excluded — it is a user-supplied extension.
234    #[must_use]
235    pub fn all_builtin() -> Vec<Self> {
236        vec![
237            Self::Email,
238            Self::Phone,
239            Self::Ssn,
240            Self::CreditCard,
241            Self::IpAddress,
242            Self::ApiKey,
243            Self::PersonalName,
244            Self::Address,
245        ]
246    }
247}
248
249/// PII-leakage detector (prompt: `pii_leakage_v0`).
250///
251/// Consumers pick which [`PIIClass`] variants participate in detection.
252/// The default constructor enables every built-in class.
253pub struct PIILeakageEvaluator {
254    config: JudgeEvaluatorConfig,
255    entity_classes: Vec<PIIClass>,
256}
257
258impl PIILeakageEvaluator {
259    /// Construct with every built-in PII class enabled.
260    #[must_use]
261    pub fn new(config: JudgeEvaluatorConfig) -> Self {
262        Self {
263            config: with_safety_default(config),
264            entity_classes: PIIClass::all_builtin(),
265        }
266    }
267
268    /// Construct with an explicit subset of classes. An empty `entity_classes`
269    /// is accepted but will always return a passing score (the evaluator has
270    /// nothing to look for).
271    #[must_use]
272    pub fn with_classes(config: JudgeEvaluatorConfig, entity_classes: Vec<PIIClass>) -> Self {
273        Self {
274            config: with_safety_default(config),
275            entity_classes,
276        }
277    }
278
279    /// Override the prompt template used by this evaluator.
280    #[must_use]
281    pub fn with_prompt(mut self, template: Arc<dyn crate::prompt::JudgePromptTemplate>) -> Self {
282        self.config = self.config.with_prompt(template);
283        self
284    }
285
286    /// Attach evaluator-level few-shot examples that render before any
287    /// case-level examples.
288    #[must_use]
289    pub fn with_few_shot(mut self, examples: Vec<crate::types::FewShotExample>) -> Self {
290        self.config = self.config.with_few_shot(examples);
291        self
292    }
293
294    /// Override the system prompt visible to the template render.
295    #[must_use]
296    pub fn with_system_prompt(mut self, prompt: impl Into<String>) -> Self {
297        self.config = self.config.with_system_prompt(prompt);
298        self
299    }
300
301    /// Attach an output schema for custom prompt templates.
302    #[must_use]
303    pub fn with_output_schema(mut self, schema: serde_json::Value) -> Self {
304        self.config = self.config.with_output_schema(schema);
305        self
306    }
307
308    /// Toggle judge reasoning capture.
309    #[must_use]
310    pub fn with_use_reasoning(mut self, flag: bool) -> Self {
311        self.config = self.config.with_use_reasoning(flag);
312        self
313    }
314
315    /// Override the feedback key used by downstream exporters.
316    #[must_use]
317    pub fn with_feedback_key(mut self, key: impl Into<String>) -> Self {
318        self.config = self.config.with_feedback_key(key);
319        self
320    }
321
322    /// Borrow the configured class list.
323    #[must_use]
324    pub fn entity_classes(&self) -> &[PIIClass] {
325        &self.entity_classes
326    }
327
328    /// Borrow the underlying config.
329    #[must_use]
330    pub const fn config(&self) -> &JudgeEvaluatorConfig {
331        &self.config
332    }
333}
334
335impl crate::evaluators::JudgeEvaluatorBuilder for PIILeakageEvaluator {
336    fn judge_config_mut(&mut self) -> &mut JudgeEvaluatorConfig {
337        &mut self.config
338    }
339}
340
341impl Evaluator for PIILeakageEvaluator {
342    fn name(&self) -> &'static str {
343        "pii_leakage"
344    }
345
346    fn evaluate(&self, case: &EvalCase, invocation: &Invocation) -> Option<EvalMetricResult> {
347        if !has_final_response(case, invocation) {
348            return None;
349        }
350
351        // Render the active class list into the prompt's custom namespace so
352        // the `pii_leakage_v0` template can surface it if consumers override
353        // the rubric. Built-in template ignores the custom field today.
354        let mut ctx = build_prompt_context(&self.config, case, invocation);
355        let classes: Vec<serde_json::Value> = self
356            .entity_classes
357            .iter()
358            .map(|c| serde_json::Value::String(c.canonical_name()))
359            .collect();
360        ctx = ctx.with_custom(std::collections::HashMap::from([(
361            "pii_entity_classes".to_string(),
362            serde_json::Value::Array(classes),
363        )]));
364
365        Some(evaluate_with_builtin(
366            "pii_leakage",
367            "pii_leakage_v0",
368            &self.config,
369            &ctx,
370        ))
371    }
372}