swink_agent_eval/evaluators/
mod.rs

1//! Extended evaluator families for advanced eval features.
2//!
3//! This module owns the shared [`JudgeEvaluatorConfig`] (T055) and the shared
4//! [`dispatch_judge`] helper (T056) used by every judge-backed evaluator that
5//! ships in spec 043.
6//!
7//! Concrete per-family evaluators land in follow-up tasks (T057–T086); this
8//! file only provides the building blocks so those evaluators can be authored
9//! independently without duplicating dispatch logic.
10
11#![cfg(feature = "judge-core")]
12
13use std::path::Path;
14use std::sync::Arc;
15
16use serde::{Deserialize, Serialize};
17use serde_json::{Map, Value};
18
19use crate::aggregator::{Aggregator, Average};
20use crate::judge::{JudgeError, JudgeRegistry, JudgeVerdict};
21use crate::prompt::{JudgePromptTemplate, PromptContext, PromptError};
22use crate::score::Score;
23use crate::types::{AttachmentError, EvalMetricResult, MaterializedAttachment};
24use crate::url_filter::UrlFilter;
25
26// ─── US1d (deterministic) ───────────────────────────────────────────────────
27//
// The module list below is owned by the US1d (deterministic / code / sandbox /
// multimodal) slice of spec 043. Keep additions inside this block so the US1c
// (judge-family) slice can land its independent module declarations in its own
// block further down without a mechanical merge conflict.
32
33#[cfg(feature = "evaluator-simple")]
34pub mod simple;
35#[cfg(feature = "evaluator-structured")]
36pub mod structured;
37
38#[cfg(feature = "evaluator-code")]
39pub mod code;
40
41#[cfg(feature = "multimodal")]
42pub mod multimodal;
43
44// ─── Judge-family evaluator submodules (T057–T070) ──────────────────────────
45//
46// Each family file re-exports concrete evaluators gated behind its own cargo
47// feature flag so consumers only pay for the families they opt into.
48//
// Quality and Safety families landed with US1c; the RAG and Agent families,
// originally deferred to a US1c follow-up PR, are declared below as well.
51
52#[cfg(feature = "evaluator-agent")]
53pub mod agent;
54#[cfg(feature = "evaluator-quality")]
55pub mod quality;
56#[cfg(feature = "evaluator-rag")]
57pub mod rag;
58#[cfg(feature = "evaluator-safety")]
59pub mod safety;
60
61/// Per-instance configuration shared by every judge-backed evaluator (T055).
62///
63/// A `None` template means "use the evaluator's built-in `_v0` template".
64/// Builder methods on each concrete evaluator surface the individual knobs
65/// (see data-model §3 "Base Evaluator extensions").
66pub struct JudgeEvaluatorConfig {
67    /// Prompt template override. When `None`, the evaluator uses its built-in
68    /// `_v0` template from `PromptTemplateRegistry::builtin()`.
69    pub template: Option<Arc<dyn JudgePromptTemplate>>,
70    /// Few-shot examples injected ahead of the rendered prompt.
71    pub few_shot_examples: Vec<crate::types::FewShotExample>,
72    /// Optional system-prompt override applied ahead of the rendered prompt.
73    pub system_prompt: Option<String>,
74    /// Optional output-schema identifier used by structured-output evaluators.
75    pub output_schema: Option<serde_json::Value>,
76    /// Whether the judge should emit a reasoning field. Defaults to `true`.
77    pub use_reasoning: bool,
78    /// Optional feedback key forwarded to telemetry/reporter backends
79    /// (e.g., LangSmith).
80    pub feedback_key: Option<String>,
81    /// Optional aggregator override. When `None`, callers use `Average`.
82    pub aggregator: Option<Arc<dyn Aggregator>>,
83    /// Required judge registry — the evaluator has no default judge model.
84    pub judge_registry: Arc<JudgeRegistry>,
85}
86
87impl std::fmt::Debug for JudgeEvaluatorConfig {
88    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
89        f.debug_struct("JudgeEvaluatorConfig")
90            .field("template", &self.template.as_ref().map(|t| t.version()))
91            .field("few_shot_examples", &self.few_shot_examples.len())
92            .field("system_prompt", &self.system_prompt.is_some())
93            .field("output_schema", &self.output_schema.is_some())
94            .field("use_reasoning", &self.use_reasoning)
95            .field("feedback_key", &self.feedback_key)
96            .field("aggregator", &self.aggregator.is_some())
97            .field("judge_registry", &self.judge_registry)
98            .finish()
99    }
100}
101
102impl JudgeEvaluatorConfig {
103    /// Construct a default config bound to the given judge registry (T055).
104    ///
105    /// Named `default_with` because [`Default`] can't take arguments; the
106    /// config has no sensible default without a judge registry (FR-007/FR-010).
107    #[must_use]
108    pub fn default_with(judge_registry: Arc<JudgeRegistry>) -> Self {
109        Self {
110            template: None,
111            few_shot_examples: Vec::new(),
112            system_prompt: None,
113            output_schema: None,
114            use_reasoning: true,
115            feedback_key: None,
116            aggregator: None,
117            judge_registry,
118        }
119    }
120
121    /// Override the prompt template.
122    #[must_use]
123    pub fn with_prompt(mut self, template: Arc<dyn JudgePromptTemplate>) -> Self {
124        self.template = Some(template);
125        self
126    }
127
128    /// Backward-compatible alias for [`Self::with_prompt`].
129    #[must_use]
130    pub fn with_template(self, template: Arc<dyn JudgePromptTemplate>) -> Self {
131        self.with_prompt(template)
132    }
133
134    /// Attach few-shot examples.
135    #[must_use]
136    pub fn with_few_shot(mut self, examples: Vec<crate::types::FewShotExample>) -> Self {
137        self.few_shot_examples = examples;
138        self
139    }
140
141    /// Override the system prompt.
142    #[must_use]
143    pub fn with_system_prompt(mut self, prompt: impl Into<String>) -> Self {
144        self.system_prompt = Some(prompt.into());
145        self
146    }
147
148    /// Override the output schema.
149    #[must_use]
150    pub fn with_output_schema(mut self, schema: serde_json::Value) -> Self {
151        self.output_schema = Some(schema);
152        self
153    }
154
155    /// Toggle the use-reasoning flag.
156    #[must_use]
157    pub const fn with_use_reasoning(mut self, flag: bool) -> Self {
158        self.use_reasoning = flag;
159        self
160    }
161
162    /// Override the feedback key.
163    #[must_use]
164    pub fn with_feedback_key(mut self, key: impl Into<String>) -> Self {
165        self.feedback_key = Some(key.into());
166        self
167    }
168
169    /// Override the aggregator.
170    #[must_use]
171    pub fn with_aggregator(mut self, aggregator: Arc<dyn Aggregator>) -> Self {
172        self.aggregator = Some(aggregator);
173        self
174    }
175
176    /// Effective aggregator: the configured override or the default (`Average`).
177    #[must_use]
178    pub fn effective_aggregator(&self) -> Arc<dyn Aggregator> {
179        self.aggregator.clone().unwrap_or_else(|| Arc::new(Average))
180    }
181}
182
183/// Build the merged prompt context shared by every judge-backed evaluator.
184///
185/// The shared config can override the case's system prompt, prepend evaluator-
186/// level few-shot examples, and expose additional per-dispatch metadata through
187/// the `custom.*` namespace for custom templates.
188#[must_use]
189pub fn build_prompt_context(
190    config: &JudgeEvaluatorConfig,
191    case: &crate::types::EvalCase,
192    invocation: &crate::types::Invocation,
193) -> PromptContext {
194    let mut case = case.clone();
195    if let Some(system_prompt) = &config.system_prompt {
196        case.system_prompt.clone_from(system_prompt);
197    }
198    let case_few_shot_examples = case.few_shot_examples.clone();
199
200    let mut ctx = PromptContext::new(Arc::new(case), Arc::new(invocation.clone()));
201
202    let mut few_shot_examples =
203        Vec::with_capacity(config.few_shot_examples.len() + case_few_shot_examples.len());
204    few_shot_examples.extend(config.few_shot_examples.iter().cloned());
205    few_shot_examples.extend(case_few_shot_examples);
206    if !few_shot_examples.is_empty() {
207        ctx = ctx.with_few_shot_examples(few_shot_examples);
208    }
209
210    let mut custom = Map::new();
211    custom.insert("use_reasoning".into(), Value::Bool(config.use_reasoning));
212    if let Some(system_prompt) = &config.system_prompt {
213        custom.insert("system_prompt".into(), Value::String(system_prompt.clone()));
214    }
215    if let Some(output_schema) = &config.output_schema {
216        custom.insert("output_schema".into(), output_schema.clone());
217    }
218    if let Some(feedback_key) = &config.feedback_key {
219        custom.insert("feedback_key".into(), Value::String(feedback_key.clone()));
220    }
221    if !custom.is_empty() {
222        ctx = ctx.with_custom(custom.into_iter().collect());
223    }
224
225    ctx
226}
227
228/// Fluent builder surface exposed on every judge-backed evaluator (T105).
229///
230/// Complements the per-evaluator inherent `with_prompt` / `with_few_shot`
231/// methods: implementors own a [`JudgeEvaluatorConfig`] and return `&mut`
232/// access via [`Self::judge_config_mut`]. Default method implementations
233/// route each customisation knob through the shared config so downstream
234/// users can write generic code that customises any judge-backed evaluator:
235///
236/// ```rust,ignore
237/// use std::sync::Arc;
238/// use swink_agent_eval::{
239///     CorrectnessEvaluator, JudgeEvaluatorBuilder, JudgeEvaluatorConfig,
240///     JudgePromptTemplate,
241/// };
242///
243/// fn customise<E: JudgeEvaluatorBuilder>(eval: E, t: Arc<dyn JudgePromptTemplate>) -> E {
244///     eval.with_prompt(t).with_use_reasoning(false)
245/// }
246/// ```
247///
248/// The inherent methods on each evaluator struct shadow these defaults for
249/// callers who don't need the generic trait surface — both paths route
250/// through the same [`JudgeEvaluatorConfig`].
251pub trait JudgeEvaluatorBuilder: Sized {
252    /// Borrow the evaluator's underlying [`JudgeEvaluatorConfig`] for
253    /// mutation by the default builder methods.
254    fn judge_config_mut(&mut self) -> &mut JudgeEvaluatorConfig;
255
256    /// Override the built-in prompt template.
257    #[must_use]
258    fn with_prompt(mut self, template: Arc<dyn JudgePromptTemplate>) -> Self {
259        self.judge_config_mut().template = Some(template);
260        self
261    }
262
263    /// Attach few-shot examples.
264    #[must_use]
265    fn with_few_shot(mut self, examples: Vec<crate::types::FewShotExample>) -> Self {
266        self.judge_config_mut().few_shot_examples = examples;
267        self
268    }
269
270    /// Override the system prompt applied ahead of the rendered prompt.
271    #[must_use]
272    fn with_system_prompt(mut self, prompt: impl Into<String>) -> Self {
273        self.judge_config_mut().system_prompt = Some(prompt.into());
274        self
275    }
276
277    /// Override the output-schema identifier used by structured-output
278    /// evaluators.
279    #[must_use]
280    fn with_output_schema(mut self, schema: serde_json::Value) -> Self {
281        self.judge_config_mut().output_schema = Some(schema);
282        self
283    }
284
285    /// Toggle the `use_reasoning` flag.
286    #[must_use]
287    fn with_use_reasoning(mut self, flag: bool) -> Self {
288        self.judge_config_mut().use_reasoning = flag;
289        self
290    }
291
292    /// Override the feedback key forwarded to telemetry / reporter backends.
293    #[must_use]
294    fn with_feedback_key(mut self, key: impl Into<String>) -> Self {
295        self.judge_config_mut().feedback_key = Some(key.into());
296        self
297    }
298
299    /// Override the aggregator applied to per-sample judge scores.
300    #[must_use]
301    fn with_aggregator(mut self, aggregator: Arc<dyn Aggregator>) -> Self {
302        self.judge_config_mut().aggregator = Some(aggregator);
303        self
304    }
305}
306
/// Implements [`JudgeEvaluatorBuilder`] for a type that stores its
/// [`JudgeEvaluatorConfig`] in a field named `config`.
///
/// The generated `judge_config_mut` simply returns `&mut self.config`, so the
/// target type must have a field with exactly that name.
#[macro_export]
macro_rules! impl_judge_evaluator_builder {
    ($evaluator:ty) => {
        impl $crate::evaluators::JudgeEvaluatorBuilder for $evaluator {
            fn judge_config_mut(&mut self) -> &mut $crate::evaluators::JudgeEvaluatorConfig {
                &mut self.config
            }
        }
    };
}
319
320/// Structured detail record attached to [`EvalMetricResult::details`] (T056).
321///
322/// The existing `details: Option<String>` field retains its historical shape;
323/// structured detail variants are serialized as JSON and surfaced through the
324/// string. Helpers on this type render the canonical representation.
325///
326/// **Note**: this enum is the "Detail" surface referenced by FR-021 — the
327/// `ScoreClamped` variant is authored here for the first time. PR body notes
328/// that `EvalMetricResult::details` remains `Option<String>` for
329/// serde-compat.
330#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
331#[serde(tag = "kind", rename_all = "snake_case")]
332pub enum Detail {
333    /// Judge-returned score was outside `[0.0, 1.0]` and has been clamped.
334    ScoreClamped { original: f64, clamped: f64 },
335    /// The prompt template version used for this dispatch.
336    PromptVersion { version: String },
337    /// Feedback key consumed by downstream exporters such as LangSmith.
338    FeedbackKey { key: String },
339    /// Human-readable note carried verbatim.
340    Note { text: String },
341}
342
343impl Detail {
344    /// Render the detail as a single canonical JSON line.
345    #[must_use]
346    pub fn to_json_line(&self) -> String {
347        serde_json::to_string(self).unwrap_or_else(|_| "{}".to_string())
348    }
349}
350
351/// Helper that assembles structured detail lines into the free-form
352/// `EvalMetricResult::details: Option<String>` field.
353///
354/// Each detail is serialized as one JSON line; this keeps the existing
355/// `Option<String>` type shape while giving downstream consumers a
356/// deterministic parse path.
357#[derive(Debug, Default, Clone)]
358pub struct DetailBuffer {
359    entries: Vec<Detail>,
360}
361
362impl DetailBuffer {
363    /// Empty buffer.
364    #[must_use]
365    pub fn new() -> Self {
366        Self::default()
367    }
368
369    /// Append a detail.
370    pub fn push(&mut self, detail: Detail) {
371        self.entries.push(detail);
372    }
373
374    /// Number of buffered detail entries.
375    #[must_use]
376    pub fn len(&self) -> usize {
377        self.entries.len()
378    }
379
380    /// Whether the buffer holds no detail entries.
381    #[must_use]
382    pub fn is_empty(&self) -> bool {
383        self.entries.is_empty()
384    }
385
386    /// Borrow the buffered detail entries.
387    #[must_use]
388    pub fn entries(&self) -> &[Detail] {
389        &self.entries
390    }
391
392    /// Render to the `Option<String>` shape of `EvalMetricResult::details`.
393    #[must_use]
394    pub fn into_details_string(self) -> Option<String> {
395        if self.entries.is_empty() {
396            return None;
397        }
398        let lines: Vec<String> = self.entries.iter().map(Detail::to_json_line).collect();
399        Some(lines.join("\n"))
400    }
401}
402
/// Errors produced by [`dispatch_judge`] (T056).
///
/// Every variant wraps its source error and is marked `#[from]`, so the `?`
/// operator converts the underlying error into a `DispatchError`
/// automatically. The `Display` output is the variant's short prefix
/// followed by the source error's own message.
#[derive(Debug, thiserror::Error)]
pub enum DispatchError {
    /// Prompt render or template lookup failed.
    #[error("prompt: {0}")]
    Prompt(#[from] PromptError),
    /// Judge call failed.
    #[error("judge: {0}")]
    Judge(#[from] JudgeError),
    /// Attachment materialization failed.
    // NOTE(review): `dispatch_judge` as written in this file only produces the
    // `Prompt` and `Judge` variants; presumably this one serves callers that
    // also materialize attachments — confirm.
    #[error("attachment: {0}")]
    Attachment(#[from] AttachmentError),
}
416
417/// Structured errors surfaced by concrete evaluators in this module tree (T080–T082).
418///
419/// Evaluators fold these into [`EvalMetricResult`] via `Score::fail()` with the
420/// error message copied into `details`; the type exists primarily so callers
421/// (tests, reporters) can reason about the failure mode programmatically.
422#[derive(Debug, thiserror::Error)]
423pub enum EvaluatorError {
424    /// The current platform cannot run this evaluator (e.g. Windows sandbox).
425    #[error("evaluator unsupported on this platform: {reason}")]
426    UnsupportedPlatform {
427        /// Free-form explanation of the missing platform capability.
428        reason: String,
429    },
430    /// A sandbox resource-limit cap was exceeded at evaluation time (T081).
431    #[error("sandbox limit exceeded: {limit}")]
432    SandboxLimitExceeded {
433        /// Name of the exceeded limit (`wall_clock`, `cpu`, `memory`, `fds`, `network`).
434        limit: String,
435    },
436    /// The evaluator could not carry out a deterministic operation.
437    #[error("evaluator execution error: {reason}")]
438    Execution {
439        /// Human-readable explanation of the failure.
440        reason: String,
441    },
442}
443
444impl EvaluatorError {
445    /// Convenience: render the error as the `details` string paired with `Score::fail()`.
446    #[must_use]
447    pub fn into_metric_details(self) -> String {
448        self.to_string()
449    }
450}
451
/// Outcome of a [`dispatch_judge`] call (T056).
///
/// Bundles the normalised score with the raw verdict so callers can both
/// report a metric and inspect what the judge actually returned.
#[derive(Debug, Clone)]
pub struct DispatchOutcome {
    /// Clamped score in `[0.0, 1.0]`.
    pub score: Score,
    /// The judge's own pass/fail determination, copied from the verdict.
    pub pass: bool,
    /// Structured detail entries (prompt_version, optional feedback key,
    /// optional ScoreClamped).
    pub details: DetailBuffer,
    /// Raw verdict for downstream evaluators that need label/reason.
    /// Its `score` field is the unclamped value the judge returned.
    pub verdict: JudgeVerdict,
}
464
465/// Shared judge-dispatch helper (T056).
466///
467/// Responsibilities:
468///
469/// * Render the supplied prompt template (or the config override) via
470///   `MinijinjaTemplate`.
471/// * Dispatch the rendered prompt through the config's [`JudgeRegistry`].
472/// * Record `prompt_version` as a structured [`Detail::PromptVersion`] entry
473///   (FR-011).
474/// * Clamp the returned score to `[0.0, 1.0]` and, when the raw score was
475///   outside that range, push a [`Detail::ScoreClamped { original, clamped }`]
476///   entry (FR-021 extension).
477///
478/// `dispatch_judge` does NOT itself encode FR-020 `None`-return semantics —
479/// that is the concrete evaluator's responsibility because only the evaluator
480/// knows which case fields are its criterion. Callers typically short-circuit
481/// before calling `dispatch_judge` and return `None` from their `evaluate`
482/// implementation.
483pub async fn dispatch_judge(
484    config: &JudgeEvaluatorConfig,
485    builtin_template: Arc<dyn JudgePromptTemplate>,
486    context: &PromptContext,
487) -> Result<DispatchOutcome, DispatchError> {
488    let template = config.template.clone().unwrap_or(builtin_template);
489    let prompt_version = template.version().to_string();
490
491    // Per `build_prompt_context` the incoming `context` already has every
492    // config-level customisation merged in (system prompt override,
493    // config-level few-shot examples prepended to case-level ones, and the
494    // `custom.*` namespace populated). Render verbatim.
495    let rendered = template.render(context)?;
496    let verdict = config.judge_registry.client().judge(&rendered).await?;
497
498    let mut details = DetailBuffer::new();
499    details.push(Detail::PromptVersion {
500        version: prompt_version,
501    });
502    if let Some(feedback_key) = config.feedback_key.clone() {
503        details.push(Detail::FeedbackKey { key: feedback_key });
504    }
505
506    let raw = verdict.score;
507    let clamped = raw.clamp(0.0, 1.0);
508    if (raw - clamped).abs() > f64::EPSILON {
509        details.push(Detail::ScoreClamped {
510            original: raw,
511            clamped,
512        });
513    }
514
515    let score = Score::new(clamped, 0.5);
516
517    Ok(DispatchOutcome {
518        score,
519        pass: verdict.pass,
520        details,
521        verdict,
522    })
523}
524
525/// Drive an async future to completion from the sync `Evaluator::evaluate`
526/// entry point, regardless of the caller's Tokio runtime state.
527///
528/// Multi-thread runtime active → `block_in_place` + the ambient
529/// `Handle::block_on` so the host runtime keeps scheduling other tasks.
530/// Otherwise → build an ephemeral current-thread runtime and `block_on` it.
531///
532/// ## Known limitation
533/// Running from *inside* a single-threaded Tokio runtime will panic with
534/// "Cannot start a runtime from within a runtime". This is an inherent
535/// Tokio constraint — use a multi-thread runtime or call from sync context.
536pub fn block_on<F, T>(future: F) -> T
537where
538    F: std::future::Future<Output = T>,
539{
540    use tokio::runtime::{Handle, RuntimeFlavor};
541
542    if let Ok(handle) = Handle::try_current()
543        && handle.runtime_flavor() == RuntimeFlavor::MultiThread
544    {
545        return tokio::task::block_in_place(|| handle.block_on(future));
546    }
547
548    let rt = tokio::runtime::Builder::new_current_thread()
549        .enable_all()
550        .build()
551        .expect("build ephemeral current-thread runtime");
552    rt.block_on(future)
553}
554
555/// Materialize every attachment on the case through the shared attachment
556/// pipeline (T086).
557///
558/// This is the narrow wiring point for FR-019: any judge-backed evaluator can
559/// call [`materialize_case_attachments`] to get a `Vec<MaterializedAttachment>`
560/// without re-implementing path resolution, base64 handling, or SSRF-filtered
561/// URL fetching. The helper lives next to [`dispatch_judge`] so every caller
562/// sees the same wiring.
563///
564/// Returns an empty vector when the case has no attachments; the
565/// [`PromptContext`] passed downstream remains cheap to clone.
566pub async fn materialize_case_attachments(
567    case: &crate::types::EvalCase,
568    eval_set_root: &Path,
569    filter: &dyn UrlFilter,
570) -> Result<Vec<MaterializedAttachment>, AttachmentError> {
571    let mut out = Vec::with_capacity(case.attachments.len());
572    for attachment in &case.attachments {
573        let materialized = attachment.materialize(eval_set_root, filter).await?;
574        out.push(materialized);
575    }
576    Ok(out)
577}
578
579/// Convenience: finalize a [`DispatchOutcome`] (plus optional judge reason)
580/// into an [`EvalMetricResult`], preserving the `Option<String>` shape of
581/// `details`.
582#[must_use]
583pub fn finish_metric_result(
584    evaluator_name: impl Into<String>,
585    outcome: DispatchOutcome,
586) -> EvalMetricResult {
587    let mut buffer = outcome.details;
588    if let Some(reason) = outcome.verdict.reason.as_ref() {
589        buffer.push(Detail::Note {
590            text: reason.clone(),
591        });
592    }
593    EvalMetricResult {
594        evaluator_name: evaluator_name.into(),
595        score: outcome.score,
596        details: buffer.into_details_string(),
597    }
598}
599
600/// Drive an async workload to completion from the sync [`crate::Evaluator::evaluate`]
601/// entry point, regardless of the caller's Tokio runtime state.
602///
603/// Mirrors the pattern documented on
604/// [`crate::SemanticToolSelectionEvaluator`] (spec 023): when a multi-thread
605/// Tokio runtime is active we use `block_in_place` + the ambient
606/// `Handle::block_on`; otherwise we build an ephemeral current-thread runtime.
607/// Calling this from inside a single-threaded runtime will panic — an
608/// inherent Tokio constraint, not a bug.
609pub fn drive_judge_call<F, Fut, T>(make_future: F) -> T
610where
611    F: FnOnce() -> Fut,
612    Fut: std::future::Future<Output = T>,
613{
614    use tokio::runtime::{Handle, RuntimeFlavor};
615
616    if let Ok(handle) = Handle::try_current()
617        && handle.runtime_flavor() == RuntimeFlavor::MultiThread
618    {
619        return tokio::task::block_in_place(|| handle.block_on(make_future()));
620    }
621
622    let rt = tokio::runtime::Builder::new_current_thread()
623        .enable_all()
624        .build()
625        .expect("build current-thread runtime for judge calls");
626    rt.block_on(make_future())
627}
628
629/// Sync helper for judge-backed evaluators.
630///
631/// Locates the built-in template by version, dispatches via
632/// [`dispatch_judge`], and finalises the [`EvalMetricResult`] via
633/// [`finish_metric_result`]. Dispatch errors map to `Score::fail()` with the
634/// error captured in `details` (FR-014 / FR-021).
635///
636/// Concrete evaluators are responsible for deciding whether their criterion
637/// is set before calling this helper (FR-020). The helper itself never
638/// returns `None`; once invoked, it always produces a metric result.
639#[must_use]
640pub fn evaluate_with_builtin(
641    evaluator_name: &'static str,
642    template_version: &'static str,
643    config: &JudgeEvaluatorConfig,
644    context: &PromptContext,
645) -> EvalMetricResult {
646    let builtin = crate::prompt::PromptTemplateRegistry::builtin()
647        .get(template_version)
648        .unwrap_or_else(|| panic!("built-in template {template_version} is missing"));
649
650    let dispatch = drive_judge_call(|| async { dispatch_judge(config, builtin, context).await });
651
652    match dispatch {
653        Ok(outcome) => finish_metric_result(evaluator_name.to_string(), outcome),
654        Err(err) => EvalMetricResult {
655            evaluator_name: evaluator_name.to_string(),
656            score: Score::fail(),
657            details: Some(format!("{evaluator_name}: dispatch error — {err}")),
658        },
659    }
660}
661
662#[cfg(test)]
663mod tests {
664    use super::*;
665    use crate::judge::{JudgeClient, JudgeRegistry};
666    use crate::prompt::{MinijinjaTemplate, PromptContext, PromptFamily};
667    use crate::types::{EvalCase, Invocation};
668    use std::sync::Arc;
669    use std::sync::Mutex;
670    use std::time::Duration;
671
672    use swink_agent::{Cost, ModelSpec, StopReason, Usage};
673
674    struct FixedJudge {
675        score: f64,
676        reason: Option<String>,
677        last_prompt: Mutex<Option<String>>,
678    }
679
680    impl JudgeClient for FixedJudge {
681        fn judge<'a>(&'a self, prompt: &'a str) -> crate::judge::JudgeFuture<'a> {
682            Box::pin(async move {
683                *self.last_prompt.lock().unwrap() = Some(prompt.to_string());
684                Ok(JudgeVerdict {
685                    score: self.score,
686                    pass: (0.5..=1.0).contains(&self.score),
687                    reason: self.reason.clone(),
688                    label: None,
689                })
690            })
691        }
692    }
693
694    fn make_case() -> EvalCase {
695        EvalCase {
696            id: "case-1".into(),
697            name: "Case One".into(),
698            description: None,
699            system_prompt: "answer".into(),
700            user_messages: vec!["hi".into()],
701            expected_trajectory: None,
702            expected_response: None,
703            expected_assertion: None,
704            expected_interactions: None,
705            few_shot_examples: vec![],
706            budget: None,
707            evaluators: vec![],
708            metadata: serde_json::Value::Null,
709            attachments: vec![],
710            session_id: None,
711            expected_environment_state: None,
712            expected_tool_intent: None,
713            semantic_tool_selection: false,
714            state_capture: None,
715        }
716    }
717
718    fn make_invocation() -> Invocation {
719        Invocation {
720            turns: vec![],
721            total_usage: Usage::default(),
722            total_cost: Cost::default(),
723            total_duration: Duration::from_millis(1),
724            final_response: Some("42".into()),
725            stop_reason: StopReason::Stop,
726            model: ModelSpec::new("test", "judge-target"),
727        }
728    }
729
730    fn make_registry(score: f64) -> (Arc<JudgeRegistry>, Arc<FixedJudge>) {
731        let judge = Arc::new(FixedJudge {
732            score,
733            reason: Some("ok".into()),
734            last_prompt: Mutex::new(None),
735        });
736        let registry = JudgeRegistry::builder(judge.clone() as Arc<dyn JudgeClient>, "mock-model")
737            .build()
738            .expect("registry builds");
739        (Arc::new(registry), judge)
740    }
741
742    fn make_template() -> Arc<dyn JudgePromptTemplate> {
743        Arc::new(
744            MinijinjaTemplate::new(
745                "mock_v0",
746                PromptFamily::Quality,
747                "Case={{ case.name }} Actual={{ invocation.final_response }}",
748            )
749            .expect("template compiles"),
750        )
751    }
752
753    fn make_context(case: &EvalCase, invocation: &Invocation) -> PromptContext {
754        PromptContext::new(Arc::new(case.clone()), Arc::new(invocation.clone()))
755    }
756
757    #[tokio::test]
758    async fn dispatch_records_prompt_version() {
759        let (registry, _) = make_registry(0.8);
760        let config = JudgeEvaluatorConfig::default_with(registry);
761        let template = make_template();
762        let case = make_case();
763        let invocation = make_invocation();
764        let ctx = make_context(&case, &invocation);
765
766        let outcome = dispatch_judge(&config, template, &ctx).await.unwrap();
767
768        assert!(
769            outcome
770                .details
771                .entries()
772                .iter()
773                .any(|d| matches!(d, Detail::PromptVersion { version } if version == "mock_v0"))
774        );
775        assert!(
776            !outcome
777                .details
778                .entries()
779                .iter()
780                .any(|d| matches!(d, Detail::ScoreClamped { .. }))
781        );
782        assert!((outcome.score.value - 0.8).abs() < f64::EPSILON);
783    }
784
785    #[tokio::test]
786    async fn dispatch_clamps_out_of_range_scores() {
787        let (registry, _) = make_registry(1.3);
788        let config = JudgeEvaluatorConfig::default_with(registry);
789        let template = make_template();
790        let case = make_case();
791        let invocation = make_invocation();
792        let ctx = make_context(&case, &invocation);
793
794        let outcome = dispatch_judge(&config, template, &ctx).await.unwrap();
795
796        // Score clamped to 1.0.
797        assert!((outcome.score.value - 1.0).abs() < f64::EPSILON);
798        // ScoreClamped detail present with original 1.3 and clamped 1.0.
799        let clamp = outcome
800            .details
801            .entries()
802            .iter()
803            .find_map(|d| match d {
804                Detail::ScoreClamped { original, clamped } => Some((*original, *clamped)),
805                _ => None,
806            })
807            .expect("ScoreClamped detail present");
808        assert!((clamp.0 - 1.3).abs() < f64::EPSILON);
809        assert!((clamp.1 - 1.0).abs() < f64::EPSILON);
810    }
811
812    #[tokio::test]
813    async fn dispatch_clamps_negative_scores() {
814        let (registry, _) = make_registry(-0.2);
815        let config = JudgeEvaluatorConfig::default_with(registry);
816        let template = make_template();
817        let case = make_case();
818        let invocation = make_invocation();
819        let ctx = make_context(&case, &invocation);
820
821        let outcome = dispatch_judge(&config, template, &ctx).await.unwrap();
822
823        assert!((outcome.score.value - 0.0).abs() < f64::EPSILON);
824        assert!(
825            outcome
826                .details
827                .entries()
828                .iter()
829                .any(|d| matches!(d, Detail::ScoreClamped { .. }))
830        );
831    }
832
833    #[tokio::test]
834    async fn dispatch_uses_config_override_when_present() {
835        let (registry, judge) = make_registry(0.5);
836        let custom: Arc<dyn JudgePromptTemplate> = Arc::new(
837            MinijinjaTemplate::new(
838                "mock_v1",
839                PromptFamily::Quality,
840                "override Case={{ case.id }}",
841            )
842            .unwrap(),
843        );
844        let config = JudgeEvaluatorConfig::default_with(registry).with_template(custom);
845        let builtin = make_template(); // would render "mock_v0" but override wins
846        let case = make_case();
847        let invocation = make_invocation();
848        let ctx = make_context(&case, &invocation);
849
850        let outcome = dispatch_judge(&config, builtin, &ctx).await.unwrap();
851
852        // The recorded prompt_version must come from the override, not the builtin.
853        let recorded_version = outcome
854            .details
855            .entries()
856            .iter()
857            .find_map(|d| match d {
858                Detail::PromptVersion { version } => Some(version.clone()),
859                _ => None,
860            })
861            .expect("prompt version recorded");
862        assert_eq!(recorded_version, "mock_v1");
863
864        // The judge must have seen the override prompt.
865        let seen = judge.last_prompt.lock().unwrap().clone().unwrap();
866        assert!(seen.starts_with("override Case=case-1"));
867    }
868
869    #[test]
870    fn detail_buffer_round_trips_through_details_string() {
871        let mut buffer = DetailBuffer::new();
872        buffer.push(Detail::PromptVersion {
873            version: "v0".into(),
874        });
875        buffer.push(Detail::ScoreClamped {
876            original: 1.2,
877            clamped: 1.0,
878        });
879        let rendered = buffer.into_details_string().expect("some");
880        // Two JSON lines, parseable.
881        let parsed: Vec<Detail> = rendered
882            .lines()
883            .map(|line| serde_json::from_str::<Detail>(line).unwrap())
884            .collect();
885        assert_eq!(parsed.len(), 2);
886        assert!(matches!(parsed[0], Detail::PromptVersion { .. }));
887        assert!(matches!(parsed[1], Detail::ScoreClamped { .. }));
888    }
889
890    #[test]
891    fn empty_detail_buffer_renders_none() {
892        assert!(DetailBuffer::new().into_details_string().is_none());
893    }
894
895    #[test]
896    fn config_builder_surface() {
897        let (registry, _) = make_registry(0.5);
898        let config = JudgeEvaluatorConfig::default_with(registry)
899            .with_system_prompt("sys")
900            .with_use_reasoning(false)
901            .with_feedback_key("fb");
902        assert_eq!(config.system_prompt.as_deref(), Some("sys"));
903        assert!(!config.use_reasoning);
904        assert_eq!(config.feedback_key.as_deref(), Some("fb"));
905    }
906
907    #[tokio::test]
908    async fn dispatch_records_feedback_key_when_configured() {
909        let (registry, _) = make_registry(0.8);
910        let config =
911            JudgeEvaluatorConfig::default_with(registry).with_feedback_key("quality.score");
912        let template = make_template();
913        let case = make_case();
914        let invocation = make_invocation();
915        let ctx = make_context(&case, &invocation);
916
917        let outcome = dispatch_judge(&config, template, &ctx).await.unwrap();
918
919        assert!(
920            outcome
921                .details
922                .entries()
923                .iter()
924                .any(|d| matches!(d, Detail::FeedbackKey { key } if key == "quality.score"))
925        );
926    }
927}
swink_agent_eval/evaluators/mod.rs

swink_agent_eval/evaluators/
mod.rs