#![cfg(feature = "judge-core")]
use std::path::Path;
use std::sync::Arc;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
use crate::aggregator::{Aggregator, Average};
use crate::judge::{JudgeError, JudgeRegistry, JudgeVerdict};
use crate::prompt::{JudgePromptTemplate, PromptContext, PromptError};
use crate::score::Score;
use crate::types::{AttachmentError, EvalMetricResult, MaterializedAttachment};
use crate::url_filter::UrlFilter;
#[cfg(feature = "evaluator-simple")]
pub mod simple;
#[cfg(feature = "evaluator-structured")]
pub mod structured;
#[cfg(feature = "evaluator-code")]
pub mod code;
#[cfg(feature = "multimodal")]
pub mod multimodal;
#[cfg(feature = "evaluator-agent")]
pub mod agent;
#[cfg(feature = "evaluator-quality")]
pub mod quality;
#[cfg(feature = "evaluator-rag")]
pub mod rag;
#[cfg(feature = "evaluator-safety")]
pub mod safety;
/// Shared configuration for judge-backed evaluators.
///
/// Start from [`JudgeEvaluatorConfig::default_with`] and refine with the
/// `with_*` builder methods.
pub struct JudgeEvaluatorConfig {
/// Prompt template override. When `None`, `dispatch_judge` uses the built-in
/// template handed to it by the caller.
pub template: Option<Arc<dyn JudgePromptTemplate>>,
/// Few-shot examples supplied by the evaluator. `build_prompt_context` places
/// them ahead of the examples declared on the case itself.
pub few_shot_examples: Vec<crate::types::FewShotExample>,
/// Optional system prompt. When set, `build_prompt_context` overwrites the
/// case's system prompt with it and also exposes it as the `system_prompt`
/// custom value.
pub system_prompt: Option<String>,
/// Optional JSON value forwarded to the prompt context as the
/// `output_schema` custom value.
pub output_schema: Option<serde_json::Value>,
/// Forwarded to the prompt context as the `use_reasoning` custom value.
/// `default_with` sets it to `true`.
pub use_reasoning: bool,
/// Optional key recorded as a `Detail::FeedbackKey` entry by `dispatch_judge`
/// and forwarded to the prompt context as the `feedback_key` custom value.
pub feedback_key: Option<String>,
/// Aggregator override; `effective_aggregator` falls back to `Average` when
/// this is `None`.
pub aggregator: Option<Arc<dyn Aggregator>>,
/// Registry whose client `dispatch_judge` calls to obtain a verdict.
pub judge_registry: Arc<JudgeRegistry>,
}
/// Compact `Debug` output for [`JudgeEvaluatorConfig`].
///
/// Bulky or trait-object fields are summarised instead of printed in full: the
/// template is shown by its version, the few-shot examples by their count, and
/// the system prompt, output schema and aggregator only as presence flags.
impl std::fmt::Debug for JudgeEvaluatorConfig {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Compute the summaries up front so the builder calls below stay flat.
        let template_version = self.template.as_ref().map(|template| template.version());
        let example_count = self.few_shot_examples.len();
        let has_system_prompt = self.system_prompt.is_some();
        let has_output_schema = self.output_schema.is_some();
        let has_aggregator = self.aggregator.is_some();

        let mut out = f.debug_struct("JudgeEvaluatorConfig");
        out.field("template", &template_version);
        out.field("few_shot_examples", &example_count);
        out.field("system_prompt", &has_system_prompt);
        out.field("output_schema", &has_output_schema);
        out.field("use_reasoning", &self.use_reasoning);
        out.field("feedback_key", &self.feedback_key);
        out.field("aggregator", &has_aggregator);
        out.field("judge_registry", &self.judge_registry);
        out.finish()
    }
}
impl JudgeEvaluatorConfig {
    /// Creates a configuration bound to `judge_registry` with every optional
    /// setting unset and `use_reasoning` switched on.
    #[must_use]
    pub fn default_with(judge_registry: Arc<JudgeRegistry>) -> Self {
        Self {
            judge_registry,
            use_reasoning: true,
            template: None,
            system_prompt: None,
            output_schema: None,
            feedback_key: None,
            aggregator: None,
            few_shot_examples: Vec::new(),
        }
    }

    /// Sets the prompt template override.
    #[must_use]
    pub fn with_prompt(self, template: Arc<dyn JudgePromptTemplate>) -> Self {
        Self {
            template: Some(template),
            ..self
        }
    }

    /// Alias for [`Self::with_prompt`].
    #[must_use]
    pub fn with_template(self, template: Arc<dyn JudgePromptTemplate>) -> Self {
        self.with_prompt(template)
    }

    /// Replaces the evaluator-level few-shot examples.
    #[must_use]
    pub fn with_few_shot(self, examples: Vec<crate::types::FewShotExample>) -> Self {
        Self {
            few_shot_examples: examples,
            ..self
        }
    }

    /// Sets the system prompt override.
    #[must_use]
    pub fn with_system_prompt(self, prompt: impl Into<String>) -> Self {
        Self {
            system_prompt: Some(prompt.into()),
            ..self
        }
    }

    /// Sets the output schema value.
    #[must_use]
    pub fn with_output_schema(self, schema: serde_json::Value) -> Self {
        Self {
            output_schema: Some(schema),
            ..self
        }
    }

    /// Sets the `use_reasoning` flag.
    #[must_use]
    pub const fn with_use_reasoning(mut self, flag: bool) -> Self {
        // Plain field assignment keeps this usable as a `const fn`.
        self.use_reasoning = flag;
        self
    }

    /// Sets the feedback key.
    #[must_use]
    pub fn with_feedback_key(self, key: impl Into<String>) -> Self {
        Self {
            feedback_key: Some(key.into()),
            ..self
        }
    }

    /// Sets the aggregator override.
    #[must_use]
    pub fn with_aggregator(self, aggregator: Arc<dyn Aggregator>) -> Self {
        Self {
            aggregator: Some(aggregator),
            ..self
        }
    }

    /// Returns the configured aggregator, or a fresh [`Average`] when none was set.
    #[must_use]
    pub fn effective_aggregator(&self) -> Arc<dyn Aggregator> {
        match &self.aggregator {
            Some(configured) => Arc::clone(configured),
            None => Arc::new(Average),
        }
    }
}
#[must_use]
pub fn build_prompt_context(
config: &JudgeEvaluatorConfig,
case: &crate::types::EvalCase,
invocation: &crate::types::Invocation,
) -> PromptContext {
let mut case = case.clone();
if let Some(system_prompt) = &config.system_prompt {
case.system_prompt.clone_from(system_prompt);
}
let case_few_shot_examples = case.few_shot_examples.clone();
let mut ctx = PromptContext::new(Arc::new(case), Arc::new(invocation.clone()));
let mut few_shot_examples =
Vec::with_capacity(config.few_shot_examples.len() + case_few_shot_examples.len());
few_shot_examples.extend(config.few_shot_examples.iter().cloned());
few_shot_examples.extend(case_few_shot_examples);
if !few_shot_examples.is_empty() {
ctx = ctx.with_few_shot_examples(few_shot_examples);
}
let mut custom = Map::new();
custom.insert("use_reasoning".into(), Value::Bool(config.use_reasoning));
if let Some(system_prompt) = &config.system_prompt {
custom.insert("system_prompt".into(), Value::String(system_prompt.clone()));
}
if let Some(output_schema) = &config.output_schema {
custom.insert("output_schema".into(), output_schema.clone());
}
if let Some(feedback_key) = &config.feedback_key {
custom.insert("feedback_key".into(), Value::String(feedback_key.clone()));
}
if !custom.is_empty() {
ctx = ctx.with_custom(custom.into_iter().collect());
}
ctx
}
/// Builder mix-in for evaluator types that embed a [`JudgeEvaluatorConfig`].
///
/// Implementors only provide [`JudgeEvaluatorBuilder::judge_config_mut`]; every
/// `with_*` method is a default method that writes one field of that config and
/// hands the evaluator back for chaining.
pub trait JudgeEvaluatorBuilder: Sized {
    /// Mutable access to the embedded judge configuration.
    fn judge_config_mut(&mut self) -> &mut JudgeEvaluatorConfig;

    /// Sets the prompt template override.
    #[must_use]
    fn with_prompt(mut self, template: Arc<dyn JudgePromptTemplate>) -> Self {
        let cfg = self.judge_config_mut();
        cfg.template = Some(template);
        self
    }

    /// Replaces the evaluator-level few-shot examples.
    #[must_use]
    fn with_few_shot(mut self, examples: Vec<crate::types::FewShotExample>) -> Self {
        let cfg = self.judge_config_mut();
        cfg.few_shot_examples = examples;
        self
    }

    /// Sets the system prompt override.
    #[must_use]
    fn with_system_prompt(mut self, prompt: impl Into<String>) -> Self {
        let cfg = self.judge_config_mut();
        cfg.system_prompt = Some(prompt.into());
        self
    }

    /// Sets the output schema value.
    #[must_use]
    fn with_output_schema(mut self, schema: serde_json::Value) -> Self {
        let cfg = self.judge_config_mut();
        cfg.output_schema = Some(schema);
        self
    }

    /// Sets the `use_reasoning` flag.
    #[must_use]
    fn with_use_reasoning(mut self, flag: bool) -> Self {
        let cfg = self.judge_config_mut();
        cfg.use_reasoning = flag;
        self
    }

    /// Sets the feedback key.
    #[must_use]
    fn with_feedback_key(mut self, key: impl Into<String>) -> Self {
        let cfg = self.judge_config_mut();
        cfg.feedback_key = Some(key.into());
        self
    }

    /// Sets the aggregator override.
    #[must_use]
    fn with_aggregator(mut self, aggregator: Arc<dyn Aggregator>) -> Self {
        let cfg = self.judge_config_mut();
        cfg.aggregator = Some(aggregator);
        self
    }
}
/// Implements `JudgeEvaluatorBuilder` for `$ty`.
///
/// The generated `judge_config_mut` returns `&mut self.config`, so `$ty` must
/// have a field named `config` of type `JudgeEvaluatorConfig`.
#[macro_export]
macro_rules! impl_judge_evaluator_builder {
($ty:ty) => {
impl $crate::evaluators::JudgeEvaluatorBuilder for $ty {
fn judge_config_mut(&mut self) -> &mut $crate::evaluators::JudgeEvaluatorConfig {
// Relies on the implementing type exposing its judge config as `config`.
&mut self.config
}
}
};
}
/// One structured entry in an evaluator's details output.
///
/// Serialised by serde as an internally tagged object: the variant name goes in
/// a `kind` field, rendered in `snake_case`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum Detail {
/// The judge's raw score (`original`) was clamped to `clamped`.
ScoreClamped { original: f64, clamped: f64 },
/// Version of the prompt template that was rendered.
PromptVersion { version: String },
/// Feedback key taken from the evaluator configuration.
FeedbackKey { key: String },
/// Free-form text, e.g. the judge's reason appended by `finish_metric_result`.
Note { text: String },
}
impl Detail {
    /// Serialises this entry as a single JSON string.
    ///
    /// If serialisation fails, the literal `{}` is returned instead of an error.
    #[must_use]
    pub fn to_json_line(&self) -> String {
        match serde_json::to_string(self) {
            Ok(json) => json,
            Err(_) => "{}".to_string(),
        }
    }
}
/// Ordered collection of [`Detail`] entries gathered while evaluating.
///
/// `into_details_string` renders it as one JSON object per line.
#[derive(Debug, Default, Clone)]
pub struct DetailBuffer {
// Entries in insertion order.
entries: Vec<Detail>,
}
impl DetailBuffer {
    /// Creates an empty buffer.
    #[must_use]
    pub fn new() -> Self {
        Self {
            entries: Vec::new(),
        }
    }

    /// Appends `detail` at the end of the buffer.
    pub fn push(&mut self, detail: Detail) {
        self.entries.push(detail);
    }

    /// Number of entries currently stored.
    #[must_use]
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Returns `true` when no entries are stored.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Borrows the entries in insertion order.
    #[must_use]
    pub fn entries(&self) -> &[Detail] {
        self.entries.as_slice()
    }

    /// Renders the buffer as newline-separated JSON lines.
    ///
    /// Returns `None` for an empty buffer; otherwise each entry contributes one
    /// line produced by [`Detail::to_json_line`], joined with `\n` and without a
    /// trailing newline.
    #[must_use]
    pub fn into_details_string(self) -> Option<String> {
        let mut lines = self.entries.iter().map(Detail::to_json_line);
        // An empty buffer has no first line, which short-circuits to `None`.
        let first = lines.next()?;
        let rendered = lines.fold(first, |mut acc, line| {
            acc.push('\n');
            acc.push_str(&line);
            acc
        });
        Some(rendered)
    }
}
/// Failure modes surfaced while dispatching a judge call.
///
/// Each variant wraps an underlying error type; `#[from]` lets `?` convert the
/// source error into this enum automatically.
#[derive(Debug, thiserror::Error)]
pub enum DispatchError {
/// Wraps a [`PromptError`].
#[error("prompt: {0}")]
Prompt(#[from] PromptError),
/// Wraps a [`JudgeError`].
#[error("judge: {0}")]
Judge(#[from] JudgeError),
/// Wraps an [`AttachmentError`].
#[error("attachment: {0}")]
Attachment(#[from] AttachmentError),
}
/// Errors an evaluator can report about its own execution.
///
/// Every variant carries a free-form string that is interpolated into the
/// `Display` message.
#[derive(Debug, thiserror::Error)]
pub enum EvaluatorError {
/// The evaluator cannot run on the current platform.
#[error("evaluator unsupported on this platform: {reason}")]
UnsupportedPlatform {
/// Why the platform is unsupported.
reason: String,
},
/// A sandbox limit was exceeded.
#[error("sandbox limit exceeded: {limit}")]
SandboxLimitExceeded {
/// Description of the limit that was exceeded.
limit: String,
},
/// The evaluator failed while executing.
#[error("evaluator execution error: {reason}")]
Execution {
/// Description of the failure.
reason: String,
},
}
impl EvaluatorError {
    /// Consumes the error and returns its `Display` rendering, suitable for use
    /// as a metric's details text.
    #[must_use]
    pub fn into_metric_details(self) -> String {
        ToString::to_string(&self)
    }
}
/// Result of a successful `dispatch_judge` call.
#[derive(Debug, Clone)]
pub struct DispatchOutcome {
/// Score built by `dispatch_judge` from the verdict's score after clamping to
/// `0.0..=1.0`.
pub score: Score,
/// Pass flag copied from the verdict as-is.
pub pass: bool,
/// Structured details collected during dispatch (prompt version, optional
/// feedback key, clamping record).
pub details: DetailBuffer,
/// The verdict returned by the judge client, unmodified.
pub verdict: JudgeVerdict,
}
/// Renders the judge prompt, calls the judge client and packages the verdict.
///
/// The template is `config.template` when set, otherwise `builtin_template`.
/// The returned [`DispatchOutcome`] carries:
///
/// * a `PromptVersion` detail with the version of the template actually used;
/// * a `FeedbackKey` detail when `config.feedback_key` is set;
/// * the judge's score clamped to `0.0..=1.0`, with a `ScoreClamped` detail
///   when clamping changed the value by more than `f64::EPSILON`;
/// * the verdict's `pass` flag and the verdict itself, both unmodified.
///
/// A NaN score is treated as `0.0` and recorded as a `Note` detail. Without
/// this, `f64::clamp` would hand the NaN straight back and the clamping check
/// (a comparison against NaN) would never fire, so the NaN would reach
/// `Score::new` silently.
///
/// # Errors
///
/// Returns a [`DispatchError`] when rendering the template or calling the
/// judge fails; both are propagated with `?`.
pub async fn dispatch_judge(
    config: &JudgeEvaluatorConfig,
    builtin_template: Arc<dyn JudgePromptTemplate>,
    context: &PromptContext,
) -> Result<DispatchOutcome, DispatchError> {
    let template = config.template.clone().unwrap_or(builtin_template);
    let prompt_version = template.version().to_string();
    let rendered = template.render(context)?;
    let verdict = config.judge_registry.client().judge(&rendered).await?;

    let mut details = DetailBuffer::new();
    details.push(Detail::PromptVersion {
        version: prompt_version,
    });
    if let Some(key) = config.feedback_key.clone() {
        details.push(Detail::FeedbackKey { key });
    }

    let raw = verdict.score;
    let clamped = if raw.is_nan() {
        // NaN survives `clamp` and defeats the difference check below, so it
        // is handled explicitly and mapped to the lowest score.
        details.push(Detail::Note {
            text: "judge returned a NaN score; treated as 0.0".to_string(),
        });
        0.0
    } else {
        let bounded = raw.clamp(0.0, 1.0);
        if (raw - bounded).abs() > f64::EPSILON {
            details.push(Detail::ScoreClamped {
                original: raw,
                clamped: bounded,
            });
        }
        bounded
    };

    Ok(DispatchOutcome {
        score: Score::new(clamped, 0.5),
        pass: verdict.pass,
        details,
        verdict,
    })
}
pub fn block_on<F, T>(future: F) -> T
where
F: std::future::Future<Output = T>,
{
use tokio::runtime::{Handle, RuntimeFlavor};
if let Ok(handle) = Handle::try_current()
&& handle.runtime_flavor() == RuntimeFlavor::MultiThread
{
return tokio::task::block_in_place(|| handle.block_on(future));
}
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("build ephemeral current-thread runtime");
rt.block_on(future)
}
/// Materializes every attachment of `case`, one after another, in declaration
/// order.
///
/// `eval_set_root` and `filter` are passed through unchanged to each
/// attachment's `materialize` call.
///
/// # Errors
///
/// Stops at the first attachment that fails to materialize and returns its
/// [`AttachmentError`]; attachments after it are not processed.
pub async fn materialize_case_attachments(
    case: &crate::types::EvalCase,
    eval_set_root: &Path,
    filter: &dyn UrlFilter,
) -> Result<Vec<MaterializedAttachment>, AttachmentError> {
    let attachments = &case.attachments;
    let mut materialized = Vec::with_capacity(attachments.len());
    for attachment in attachments.iter() {
        materialized.push(attachment.materialize(eval_set_root, filter).await?);
    }
    Ok(materialized)
}
#[must_use]
pub fn finish_metric_result(
evaluator_name: impl Into<String>,
outcome: DispatchOutcome,
) -> EvalMetricResult {
let mut buffer = outcome.details;
if let Some(reason) = outcome.verdict.reason.as_ref() {
buffer.push(Detail::Note {
text: reason.clone(),
});
}
EvalMetricResult {
evaluator_name: evaluator_name.into(),
score: outcome.score,
details: buffer.into_details_string(),
}
}
pub fn drive_judge_call<F, Fut, T>(make_future: F) -> T
where
F: FnOnce() -> Fut,
Fut: std::future::Future<Output = T>,
{
use tokio::runtime::{Handle, RuntimeFlavor};
if let Ok(handle) = Handle::try_current()
&& handle.runtime_flavor() == RuntimeFlavor::MultiThread
{
return tokio::task::block_in_place(|| handle.block_on(make_future()));
}
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("build current-thread runtime for judge calls");
rt.block_on(make_future())
}
/// Synchronously evaluates `context` with the built-in template registered
/// under `template_version`.
///
/// The template is looked up in the built-in prompt registry, the judge call is
/// driven to completion via `drive_judge_call`, and the outcome is converted
/// with `finish_metric_result`. A dispatch failure does not propagate: it
/// yields a result with `Score::fail()` whose details describe the error.
///
/// # Panics
///
/// Panics when no built-in template is registered under `template_version`.
#[must_use]
pub fn evaluate_with_builtin(
    evaluator_name: &'static str,
    template_version: &'static str,
    config: &JudgeEvaluatorConfig,
    context: &PromptContext,
) -> EvalMetricResult {
    let Some(builtin) = crate::prompt::PromptTemplateRegistry::builtin().get(template_version)
    else {
        panic!("built-in template {template_version} is missing")
    };

    drive_judge_call(|| dispatch_judge(config, builtin, context))
        .map(|outcome| finish_metric_result(evaluator_name, outcome))
        .unwrap_or_else(|err| EvalMetricResult {
            evaluator_name: evaluator_name.to_string(),
            score: Score::fail(),
            details: Some(format!("{evaluator_name}: dispatch error — {err}")),
        })
}
#[cfg(test)]
mod tests {
// Unit tests for the judge dispatch pipeline, the detail buffer and the
// config builder, driven by an in-memory judge stub.
use super::*;
use crate::judge::{JudgeClient, JudgeRegistry};
use crate::prompt::{MinijinjaTemplate, PromptContext, PromptFamily};
use crate::types::{EvalCase, Invocation};
use std::sync::Arc;
use std::sync::Mutex;
use std::time::Duration;
use swink_agent::{Cost, ModelSpec, StopReason, Usage};
// Judge stub that always answers with a fixed score and reason, and remembers
// the last prompt it was asked to judge.
struct FixedJudge {
score: f64,
reason: Option<String>,
last_prompt: Mutex<Option<String>>,
}
impl JudgeClient for FixedJudge {
fn judge<'a>(&'a self, prompt: &'a str) -> crate::judge::JudgeFuture<'a> {
Box::pin(async move {
// Record the rendered prompt so tests can assert on what was sent.
*self.last_prompt.lock().unwrap() = Some(prompt.to_string());
Ok(JudgeVerdict {
score: self.score,
// The stub passes only for scores within 0.5..=1.0.
pass: (0.5..=1.0).contains(&self.score),
reason: self.reason.clone(),
label: None,
})
})
}
}
// Minimal eval case: one user message, no expectations, no attachments.
fn make_case() -> EvalCase {
EvalCase {
id: "case-1".into(),
name: "Case One".into(),
description: None,
system_prompt: "answer".into(),
user_messages: vec!["hi".into()],
expected_trajectory: None,
expected_response: None,
expected_assertion: None,
expected_interactions: None,
few_shot_examples: vec![],
budget: None,
evaluators: vec![],
metadata: serde_json::Value::Null,
attachments: vec![],
session_id: None,
expected_environment_state: None,
expected_tool_intent: None,
semantic_tool_selection: false,
state_capture: None,
}
}
// Invocation with no turns whose final response is "42".
fn make_invocation() -> Invocation {
Invocation {
turns: vec![],
total_usage: Usage::default(),
total_cost: Cost::default(),
total_duration: Duration::from_millis(1),
final_response: Some("42".into()),
stop_reason: StopReason::Stop,
model: ModelSpec::new("test", "judge-target"),
}
}
// Builds a registry backed by a `FixedJudge` returning `score`; the judge is
// returned too so tests can inspect the recorded prompt.
fn make_registry(score: f64) -> (Arc<JudgeRegistry>, Arc<FixedJudge>) {
let judge = Arc::new(FixedJudge {
score,
reason: Some("ok".into()),
last_prompt: Mutex::new(None),
});
let registry = JudgeRegistry::builder(judge.clone() as Arc<dyn JudgeClient>, "mock-model")
.build()
.expect("registry builds");
(Arc::new(registry), judge)
}
// Template with version "mock_v0" that renders the case name and final response.
fn make_template() -> Arc<dyn JudgePromptTemplate> {
Arc::new(
MinijinjaTemplate::new(
"mock_v0",
PromptFamily::Quality,
"Case={{ case.name }} Actual={{ invocation.final_response }}",
)
.expect("template compiles"),
)
}
// Plain prompt context without few-shot examples or custom values.
fn make_context(case: &EvalCase, invocation: &Invocation) -> PromptContext {
PromptContext::new(Arc::new(case.clone()), Arc::new(invocation.clone()))
}
// An in-range score is passed through unchanged: the prompt version is recorded
// and no clamping detail is emitted.
#[tokio::test]
async fn dispatch_records_prompt_version() {
let (registry, _) = make_registry(0.8);
let config = JudgeEvaluatorConfig::default_with(registry);
let template = make_template();
let case = make_case();
let invocation = make_invocation();
let ctx = make_context(&case, &invocation);
let outcome = dispatch_judge(&config, template, &ctx).await.unwrap();
assert!(
outcome
.details
.entries()
.iter()
.any(|d| matches!(d, Detail::PromptVersion { version } if version == "mock_v0"))
);
assert!(
!outcome
.details
.entries()
.iter()
.any(|d| matches!(d, Detail::ScoreClamped { .. }))
);
assert!((outcome.score.value - 0.8).abs() < f64::EPSILON);
}
// A score above 1.0 is clamped to 1.0 and the original value is recorded.
#[tokio::test]
async fn dispatch_clamps_out_of_range_scores() {
let (registry, _) = make_registry(1.3);
let config = JudgeEvaluatorConfig::default_with(registry);
let template = make_template();
let case = make_case();
let invocation = make_invocation();
let ctx = make_context(&case, &invocation);
let outcome = dispatch_judge(&config, template, &ctx).await.unwrap();
assert!((outcome.score.value - 1.0).abs() < f64::EPSILON);
let clamp = outcome
.details
.entries()
.iter()
.find_map(|d| match d {
Detail::ScoreClamped { original, clamped } => Some((*original, *clamped)),
_ => None,
})
.expect("ScoreClamped detail present");
assert!((clamp.0 - 1.3).abs() < f64::EPSILON);
assert!((clamp.1 - 1.0).abs() < f64::EPSILON);
}
// A negative score is clamped to 0.0 and a clamping detail is emitted.
#[tokio::test]
async fn dispatch_clamps_negative_scores() {
let (registry, _) = make_registry(-0.2);
let config = JudgeEvaluatorConfig::default_with(registry);
let template = make_template();
let case = make_case();
let invocation = make_invocation();
let ctx = make_context(&case, &invocation);
let outcome = dispatch_judge(&config, template, &ctx).await.unwrap();
assert!((outcome.score.value - 0.0).abs() < f64::EPSILON);
assert!(
outcome
.details
.entries()
.iter()
.any(|d| matches!(d, Detail::ScoreClamped { .. }))
);
}
// A template set on the config wins over the built-in one passed to
// `dispatch_judge`, both for the recorded version and the rendered prompt.
#[tokio::test]
async fn dispatch_uses_config_override_when_present() {
let (registry, judge) = make_registry(0.5);
let custom: Arc<dyn JudgePromptTemplate> = Arc::new(
MinijinjaTemplate::new(
"mock_v1",
PromptFamily::Quality,
"override Case={{ case.id }}",
)
.unwrap(),
);
let config = JudgeEvaluatorConfig::default_with(registry).with_template(custom);
let builtin = make_template(); let case = make_case();
let invocation = make_invocation();
let ctx = make_context(&case, &invocation);
let outcome = dispatch_judge(&config, builtin, &ctx).await.unwrap();
let recorded_version = outcome
.details
.entries()
.iter()
.find_map(|d| match d {
Detail::PromptVersion { version } => Some(version.clone()),
_ => None,
})
.expect("prompt version recorded");
assert_eq!(recorded_version, "mock_v1");
// The stub saw the prompt rendered from the override template.
let seen = judge.last_prompt.lock().unwrap().clone().unwrap();
assert!(seen.starts_with("override Case=case-1"));
}
// Each line of the rendered details string parses back into a `Detail`, in
// the order the entries were pushed.
#[test]
fn detail_buffer_round_trips_through_details_string() {
let mut buffer = DetailBuffer::new();
buffer.push(Detail::PromptVersion {
version: "v0".into(),
});
buffer.push(Detail::ScoreClamped {
original: 1.2,
clamped: 1.0,
});
let rendered = buffer.into_details_string().expect("some");
let parsed: Vec<Detail> = rendered
.lines()
.map(|line| serde_json::from_str::<Detail>(line).unwrap())
.collect();
assert_eq!(parsed.len(), 2);
assert!(matches!(parsed[0], Detail::PromptVersion { .. }));
assert!(matches!(parsed[1], Detail::ScoreClamped { .. }));
}
// An empty buffer renders to `None` rather than an empty string.
#[test]
fn empty_detail_buffer_renders_none() {
assert!(DetailBuffer::new().into_details_string().is_none());
}
// The inherent builder methods store the values they are given.
#[test]
fn config_builder_surface() {
let (registry, _) = make_registry(0.5);
let config = JudgeEvaluatorConfig::default_with(registry)
.with_system_prompt("sys")
.with_use_reasoning(false)
.with_feedback_key("fb");
assert_eq!(config.system_prompt.as_deref(), Some("sys"));
assert!(!config.use_reasoning);
assert_eq!(config.feedback_key.as_deref(), Some("fb"));
}
// A configured feedback key shows up as a `FeedbackKey` detail.
#[tokio::test]
async fn dispatch_records_feedback_key_when_configured() {
let (registry, _) = make_registry(0.8);
let config =
JudgeEvaluatorConfig::default_with(registry).with_feedback_key("quality.score");
let template = make_template();
let case = make_case();
let invocation = make_invocation();
let ctx = make_context(&case, &invocation);
let outcome = dispatch_judge(&config, template, &ctx).await.unwrap();
assert!(
outcome
.details
.entries()
.iter()
.any(|d| matches!(d, Detail::FeedbackKey { key } if key == "quality.score"))
);
}
}