1#![cfg(feature = "judge-core")]
12
13use std::path::Path;
14use std::sync::Arc;
15
16use serde::{Deserialize, Serialize};
17use serde_json::{Map, Value};
18
19use crate::aggregator::{Aggregator, Average};
20use crate::judge::{JudgeError, JudgeRegistry, JudgeVerdict};
21use crate::prompt::{JudgePromptTemplate, PromptContext, PromptError};
22use crate::score::Score;
23use crate::types::{AttachmentError, EvalMetricResult, MaterializedAttachment};
24use crate::url_filter::UrlFilter;
25
26#[cfg(feature = "evaluator-simple")]
34pub mod simple;
35#[cfg(feature = "evaluator-structured")]
36pub mod structured;
37
38#[cfg(feature = "evaluator-code")]
39pub mod code;
40
41#[cfg(feature = "multimodal")]
42pub mod multimodal;
43
44#[cfg(feature = "evaluator-agent")]
53pub mod agent;
54#[cfg(feature = "evaluator-quality")]
55pub mod quality;
56#[cfg(feature = "evaluator-rag")]
57pub mod rag;
58#[cfg(feature = "evaluator-safety")]
59pub mod safety;
60
61pub struct JudgeEvaluatorConfig {
67 pub template: Option<Arc<dyn JudgePromptTemplate>>,
70 pub few_shot_examples: Vec<crate::types::FewShotExample>,
72 pub system_prompt: Option<String>,
74 pub output_schema: Option<serde_json::Value>,
76 pub use_reasoning: bool,
78 pub feedback_key: Option<String>,
81 pub aggregator: Option<Arc<dyn Aggregator>>,
83 pub judge_registry: Arc<JudgeRegistry>,
85}
86
87impl std::fmt::Debug for JudgeEvaluatorConfig {
88 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
89 f.debug_struct("JudgeEvaluatorConfig")
90 .field("template", &self.template.as_ref().map(|t| t.version()))
91 .field("few_shot_examples", &self.few_shot_examples.len())
92 .field("system_prompt", &self.system_prompt.is_some())
93 .field("output_schema", &self.output_schema.is_some())
94 .field("use_reasoning", &self.use_reasoning)
95 .field("feedback_key", &self.feedback_key)
96 .field("aggregator", &self.aggregator.is_some())
97 .field("judge_registry", &self.judge_registry)
98 .finish()
99 }
100}
101
102impl JudgeEvaluatorConfig {
103 #[must_use]
108 pub fn default_with(judge_registry: Arc<JudgeRegistry>) -> Self {
109 Self {
110 template: None,
111 few_shot_examples: Vec::new(),
112 system_prompt: None,
113 output_schema: None,
114 use_reasoning: true,
115 feedback_key: None,
116 aggregator: None,
117 judge_registry,
118 }
119 }
120
121 #[must_use]
123 pub fn with_prompt(mut self, template: Arc<dyn JudgePromptTemplate>) -> Self {
124 self.template = Some(template);
125 self
126 }
127
128 #[must_use]
130 pub fn with_template(self, template: Arc<dyn JudgePromptTemplate>) -> Self {
131 self.with_prompt(template)
132 }
133
134 #[must_use]
136 pub fn with_few_shot(mut self, examples: Vec<crate::types::FewShotExample>) -> Self {
137 self.few_shot_examples = examples;
138 self
139 }
140
141 #[must_use]
143 pub fn with_system_prompt(mut self, prompt: impl Into<String>) -> Self {
144 self.system_prompt = Some(prompt.into());
145 self
146 }
147
148 #[must_use]
150 pub fn with_output_schema(mut self, schema: serde_json::Value) -> Self {
151 self.output_schema = Some(schema);
152 self
153 }
154
155 #[must_use]
157 pub const fn with_use_reasoning(mut self, flag: bool) -> Self {
158 self.use_reasoning = flag;
159 self
160 }
161
162 #[must_use]
164 pub fn with_feedback_key(mut self, key: impl Into<String>) -> Self {
165 self.feedback_key = Some(key.into());
166 self
167 }
168
169 #[must_use]
171 pub fn with_aggregator(mut self, aggregator: Arc<dyn Aggregator>) -> Self {
172 self.aggregator = Some(aggregator);
173 self
174 }
175
176 #[must_use]
178 pub fn effective_aggregator(&self) -> Arc<dyn Aggregator> {
179 self.aggregator.clone().unwrap_or_else(|| Arc::new(Average))
180 }
181}
182
183#[must_use]
189pub fn build_prompt_context(
190 config: &JudgeEvaluatorConfig,
191 case: &crate::types::EvalCase,
192 invocation: &crate::types::Invocation,
193) -> PromptContext {
194 let mut case = case.clone();
195 if let Some(system_prompt) = &config.system_prompt {
196 case.system_prompt.clone_from(system_prompt);
197 }
198 let case_few_shot_examples = case.few_shot_examples.clone();
199
200 let mut ctx = PromptContext::new(Arc::new(case), Arc::new(invocation.clone()));
201
202 let mut few_shot_examples =
203 Vec::with_capacity(config.few_shot_examples.len() + case_few_shot_examples.len());
204 few_shot_examples.extend(config.few_shot_examples.iter().cloned());
205 few_shot_examples.extend(case_few_shot_examples);
206 if !few_shot_examples.is_empty() {
207 ctx = ctx.with_few_shot_examples(few_shot_examples);
208 }
209
210 let mut custom = Map::new();
211 custom.insert("use_reasoning".into(), Value::Bool(config.use_reasoning));
212 if let Some(system_prompt) = &config.system_prompt {
213 custom.insert("system_prompt".into(), Value::String(system_prompt.clone()));
214 }
215 if let Some(output_schema) = &config.output_schema {
216 custom.insert("output_schema".into(), output_schema.clone());
217 }
218 if let Some(feedback_key) = &config.feedback_key {
219 custom.insert("feedback_key".into(), Value::String(feedback_key.clone()));
220 }
221 if !custom.is_empty() {
222 ctx = ctx.with_custom(custom.into_iter().collect());
223 }
224
225 ctx
226}
227
228pub trait JudgeEvaluatorBuilder: Sized {
252 fn judge_config_mut(&mut self) -> &mut JudgeEvaluatorConfig;
255
256 #[must_use]
258 fn with_prompt(mut self, template: Arc<dyn JudgePromptTemplate>) -> Self {
259 self.judge_config_mut().template = Some(template);
260 self
261 }
262
263 #[must_use]
265 fn with_few_shot(mut self, examples: Vec<crate::types::FewShotExample>) -> Self {
266 self.judge_config_mut().few_shot_examples = examples;
267 self
268 }
269
270 #[must_use]
272 fn with_system_prompt(mut self, prompt: impl Into<String>) -> Self {
273 self.judge_config_mut().system_prompt = Some(prompt.into());
274 self
275 }
276
277 #[must_use]
280 fn with_output_schema(mut self, schema: serde_json::Value) -> Self {
281 self.judge_config_mut().output_schema = Some(schema);
282 self
283 }
284
285 #[must_use]
287 fn with_use_reasoning(mut self, flag: bool) -> Self {
288 self.judge_config_mut().use_reasoning = flag;
289 self
290 }
291
292 #[must_use]
294 fn with_feedback_key(mut self, key: impl Into<String>) -> Self {
295 self.judge_config_mut().feedback_key = Some(key.into());
296 self
297 }
298
299 #[must_use]
301 fn with_aggregator(mut self, aggregator: Arc<dyn Aggregator>) -> Self {
302 self.judge_config_mut().aggregator = Some(aggregator);
303 self
304 }
305}
306
/// Implements `JudgeEvaluatorBuilder` for a type by returning `&mut self.config`.
///
/// The target type must therefore have a field named `config` of type
/// `JudgeEvaluatorConfig`.
#[macro_export]
macro_rules! impl_judge_evaluator_builder {
    ($evaluator:ty) => {
        impl $crate::evaluators::JudgeEvaluatorBuilder for $evaluator {
            fn judge_config_mut(&mut self) -> &mut $crate::evaluators::JudgeEvaluatorConfig {
                &mut self.config
            }
        }
    };
}
319
320#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
331#[serde(tag = "kind", rename_all = "snake_case")]
332pub enum Detail {
333 ScoreClamped { original: f64, clamped: f64 },
335 PromptVersion { version: String },
337 FeedbackKey { key: String },
339 Note { text: String },
341}
342
343impl Detail {
344 #[must_use]
346 pub fn to_json_line(&self) -> String {
347 serde_json::to_string(self).unwrap_or_else(|_| "{}".to_string())
348 }
349}
350
351#[derive(Debug, Default, Clone)]
358pub struct DetailBuffer {
359 entries: Vec<Detail>,
360}
361
362impl DetailBuffer {
363 #[must_use]
365 pub fn new() -> Self {
366 Self::default()
367 }
368
369 pub fn push(&mut self, detail: Detail) {
371 self.entries.push(detail);
372 }
373
374 #[must_use]
376 pub fn len(&self) -> usize {
377 self.entries.len()
378 }
379
380 #[must_use]
382 pub fn is_empty(&self) -> bool {
383 self.entries.is_empty()
384 }
385
386 #[must_use]
388 pub fn entries(&self) -> &[Detail] {
389 &self.entries
390 }
391
392 #[must_use]
394 pub fn into_details_string(self) -> Option<String> {
395 if self.entries.is_empty() {
396 return None;
397 }
398 let lines: Vec<String> = self.entries.iter().map(Detail::to_json_line).collect();
399 Some(lines.join("\n"))
400 }
401}
402
403#[derive(Debug, thiserror::Error)]
405pub enum DispatchError {
406 #[error("prompt: {0}")]
408 Prompt(#[from] PromptError),
409 #[error("judge: {0}")]
411 Judge(#[from] JudgeError),
412 #[error("attachment: {0}")]
414 Attachment(#[from] AttachmentError),
415}
416
417#[derive(Debug, thiserror::Error)]
423pub enum EvaluatorError {
424 #[error("evaluator unsupported on this platform: {reason}")]
426 UnsupportedPlatform {
427 reason: String,
429 },
430 #[error("sandbox limit exceeded: {limit}")]
432 SandboxLimitExceeded {
433 limit: String,
435 },
436 #[error("evaluator execution error: {reason}")]
438 Execution {
439 reason: String,
441 },
442}
443
444impl EvaluatorError {
445 #[must_use]
447 pub fn into_metric_details(self) -> String {
448 self.to_string()
449 }
450}
451
452#[derive(Debug, Clone)]
454pub struct DispatchOutcome {
455 pub score: Score,
457 pub pass: bool,
459 pub details: DetailBuffer,
461 pub verdict: JudgeVerdict,
463}
464
465pub async fn dispatch_judge(
484 config: &JudgeEvaluatorConfig,
485 builtin_template: Arc<dyn JudgePromptTemplate>,
486 context: &PromptContext,
487) -> Result<DispatchOutcome, DispatchError> {
488 let template = config.template.clone().unwrap_or(builtin_template);
489 let prompt_version = template.version().to_string();
490
491 let rendered = template.render(context)?;
496 let verdict = config.judge_registry.client().judge(&rendered).await?;
497
498 let mut details = DetailBuffer::new();
499 details.push(Detail::PromptVersion {
500 version: prompt_version,
501 });
502 if let Some(feedback_key) = config.feedback_key.clone() {
503 details.push(Detail::FeedbackKey { key: feedback_key });
504 }
505
506 let raw = verdict.score;
507 let clamped = raw.clamp(0.0, 1.0);
508 if (raw - clamped).abs() > f64::EPSILON {
509 details.push(Detail::ScoreClamped {
510 original: raw,
511 clamped,
512 });
513 }
514
515 let score = Score::new(clamped, 0.5);
516
517 Ok(DispatchOutcome {
518 score,
519 pass: verdict.pass,
520 details,
521 verdict,
522 })
523}
524
525pub fn block_on<F, T>(future: F) -> T
537where
538 F: std::future::Future<Output = T>,
539{
540 use tokio::runtime::{Handle, RuntimeFlavor};
541
542 if let Ok(handle) = Handle::try_current()
543 && handle.runtime_flavor() == RuntimeFlavor::MultiThread
544 {
545 return tokio::task::block_in_place(|| handle.block_on(future));
546 }
547
548 let rt = tokio::runtime::Builder::new_current_thread()
549 .enable_all()
550 .build()
551 .expect("build ephemeral current-thread runtime");
552 rt.block_on(future)
553}
554
555pub async fn materialize_case_attachments(
567 case: &crate::types::EvalCase,
568 eval_set_root: &Path,
569 filter: &dyn UrlFilter,
570) -> Result<Vec<MaterializedAttachment>, AttachmentError> {
571 let mut out = Vec::with_capacity(case.attachments.len());
572 for attachment in &case.attachments {
573 let materialized = attachment.materialize(eval_set_root, filter).await?;
574 out.push(materialized);
575 }
576 Ok(out)
577}
578
579#[must_use]
583pub fn finish_metric_result(
584 evaluator_name: impl Into<String>,
585 outcome: DispatchOutcome,
586) -> EvalMetricResult {
587 let mut buffer = outcome.details;
588 if let Some(reason) = outcome.verdict.reason.as_ref() {
589 buffer.push(Detail::Note {
590 text: reason.clone(),
591 });
592 }
593 EvalMetricResult {
594 evaluator_name: evaluator_name.into(),
595 score: outcome.score,
596 details: buffer.into_details_string(),
597 }
598}
599
600pub fn drive_judge_call<F, Fut, T>(make_future: F) -> T
610where
611 F: FnOnce() -> Fut,
612 Fut: std::future::Future<Output = T>,
613{
614 use tokio::runtime::{Handle, RuntimeFlavor};
615
616 if let Ok(handle) = Handle::try_current()
617 && handle.runtime_flavor() == RuntimeFlavor::MultiThread
618 {
619 return tokio::task::block_in_place(|| handle.block_on(make_future()));
620 }
621
622 let rt = tokio::runtime::Builder::new_current_thread()
623 .enable_all()
624 .build()
625 .expect("build current-thread runtime for judge calls");
626 rt.block_on(make_future())
627}
628
629#[must_use]
640pub fn evaluate_with_builtin(
641 evaluator_name: &'static str,
642 template_version: &'static str,
643 config: &JudgeEvaluatorConfig,
644 context: &PromptContext,
645) -> EvalMetricResult {
646 let builtin = crate::prompt::PromptTemplateRegistry::builtin()
647 .get(template_version)
648 .unwrap_or_else(|| panic!("built-in template {template_version} is missing"));
649
650 let dispatch = drive_judge_call(|| async { dispatch_judge(config, builtin, context).await });
651
652 match dispatch {
653 Ok(outcome) => finish_metric_result(evaluator_name.to_string(), outcome),
654 Err(err) => EvalMetricResult {
655 evaluator_name: evaluator_name.to_string(),
656 score: Score::fail(),
657 details: Some(format!("{evaluator_name}: dispatch error — {err}")),
658 },
659 }
660}
661
#[cfg(test)]
mod tests {
    use super::*;
    use crate::judge::{JudgeClient, JudgeRegistry};
    use crate::prompt::{MinijinjaTemplate, PromptContext, PromptFamily};
    use crate::types::{EvalCase, Invocation};
    use std::sync::Arc;
    use std::sync::Mutex;
    use std::time::Duration;

    use swink_agent::{Cost, ModelSpec, StopReason, Usage};

    /// Judge double that always answers with a fixed score/reason and
    /// remembers the last prompt it was given.
    struct FixedJudge {
        score: f64,
        reason: Option<String>,
        last_prompt: Mutex<Option<String>>,
    }

    impl JudgeClient for FixedJudge {
        fn judge<'a>(&'a self, prompt: &'a str) -> crate::judge::JudgeFuture<'a> {
            Box::pin(async move {
                *self.last_prompt.lock().unwrap() = Some(prompt.to_owned());
                let verdict = JudgeVerdict {
                    score: self.score,
                    pass: (0.5..=1.0).contains(&self.score),
                    reason: self.reason.clone(),
                    label: None,
                };
                Ok(verdict)
            })
        }
    }

    fn make_case() -> EvalCase {
        EvalCase {
            id: "case-1".into(),
            name: "Case One".into(),
            description: None,
            system_prompt: "answer".into(),
            user_messages: vec!["hi".into()],
            expected_trajectory: None,
            expected_response: None,
            expected_assertion: None,
            expected_interactions: None,
            few_shot_examples: vec![],
            budget: None,
            evaluators: vec![],
            metadata: serde_json::Value::Null,
            attachments: vec![],
            session_id: None,
            expected_environment_state: None,
            expected_tool_intent: None,
            semantic_tool_selection: false,
            state_capture: None,
        }
    }

    fn make_invocation() -> Invocation {
        Invocation {
            turns: vec![],
            total_usage: Usage::default(),
            total_cost: Cost::default(),
            total_duration: Duration::from_millis(1),
            final_response: Some("42".into()),
            stop_reason: StopReason::Stop,
            model: ModelSpec::new("test", "judge-target"),
        }
    }

    /// Registry backed by a `FixedJudge` returning `score`; the judge handle
    /// is returned too so tests can inspect the recorded prompt.
    fn make_registry(score: f64) -> (Arc<JudgeRegistry>, Arc<FixedJudge>) {
        let judge = Arc::new(FixedJudge {
            score,
            reason: Some("ok".into()),
            last_prompt: Mutex::new(None),
        });
        let client: Arc<dyn JudgeClient> = judge.clone();
        let registry = JudgeRegistry::builder(client, "mock-model")
            .build()
            .expect("registry builds");
        (Arc::new(registry), judge)
    }

    fn make_template() -> Arc<dyn JudgePromptTemplate> {
        let template = MinijinjaTemplate::new(
            "mock_v0",
            PromptFamily::Quality,
            "Case={{ case.name }} Actual={{ invocation.final_response }}",
        )
        .expect("template compiles");
        Arc::new(template)
    }

    fn make_context(case: &EvalCase, invocation: &Invocation) -> PromptContext {
        PromptContext::new(Arc::new(case.clone()), Arc::new(invocation.clone()))
    }

    /// Dispatches against the standard case/invocation fixtures and unwraps
    /// the outcome.
    async fn dispatch_with(
        config: &JudgeEvaluatorConfig,
        template: Arc<dyn JudgePromptTemplate>,
    ) -> DispatchOutcome {
        let case = make_case();
        let invocation = make_invocation();
        let ctx = make_context(&case, &invocation);
        dispatch_judge(config, template, &ctx).await.unwrap()
    }

    /// First `ScoreClamped` entry as `(original, clamped)`, if any.
    fn clamp_detail(outcome: &DispatchOutcome) -> Option<(f64, f64)> {
        outcome.details.entries().iter().find_map(|detail| match detail {
            Detail::ScoreClamped { original, clamped } => Some((*original, *clamped)),
            _ => None,
        })
    }

    /// First recorded prompt version, if any.
    fn prompt_version(outcome: &DispatchOutcome) -> Option<String> {
        outcome.details.entries().iter().find_map(|detail| match detail {
            Detail::PromptVersion { version } => Some(version.clone()),
            _ => None,
        })
    }

    #[tokio::test]
    async fn dispatch_records_prompt_version() {
        let (registry, _) = make_registry(0.8);
        let config = JudgeEvaluatorConfig::default_with(registry);

        let outcome = dispatch_with(&config, make_template()).await;

        let has_expected_version = outcome
            .details
            .entries()
            .iter()
            .any(|d| matches!(d, Detail::PromptVersion { version } if version == "mock_v0"));
        assert!(has_expected_version);
        assert!(clamp_detail(&outcome).is_none());
        assert!((outcome.score.value - 0.8).abs() < f64::EPSILON);
    }

    #[tokio::test]
    async fn dispatch_clamps_out_of_range_scores() {
        let (registry, _) = make_registry(1.3);
        let config = JudgeEvaluatorConfig::default_with(registry);

        let outcome = dispatch_with(&config, make_template()).await;

        assert!((outcome.score.value - 1.0).abs() < f64::EPSILON);
        let (original, clamped) = clamp_detail(&outcome).expect("ScoreClamped detail present");
        assert!((original - 1.3).abs() < f64::EPSILON);
        assert!((clamped - 1.0).abs() < f64::EPSILON);
    }

    #[tokio::test]
    async fn dispatch_clamps_negative_scores() {
        let (registry, _) = make_registry(-0.2);
        let config = JudgeEvaluatorConfig::default_with(registry);

        let outcome = dispatch_with(&config, make_template()).await;

        assert!((outcome.score.value - 0.0).abs() < f64::EPSILON);
        assert!(clamp_detail(&outcome).is_some());
    }

    #[tokio::test]
    async fn dispatch_uses_config_override_when_present() {
        let (registry, judge) = make_registry(0.5);
        let custom: Arc<dyn JudgePromptTemplate> = Arc::new(
            MinijinjaTemplate::new(
                "mock_v1",
                PromptFamily::Quality,
                "override Case={{ case.id }}",
            )
            .unwrap(),
        );
        let config = JudgeEvaluatorConfig::default_with(registry).with_template(custom);
        let builtin = make_template();

        let outcome = dispatch_with(&config, builtin).await;

        let recorded_version = prompt_version(&outcome).expect("prompt version recorded");
        assert_eq!(recorded_version, "mock_v1");

        // The judge must have been shown the override's rendering, not the
        // built-in template's.
        let seen = judge.last_prompt.lock().unwrap().clone().unwrap();
        assert!(seen.starts_with("override Case=case-1"));
    }

    #[test]
    fn detail_buffer_round_trips_through_details_string() {
        let mut buffer = DetailBuffer::new();
        buffer.push(Detail::PromptVersion {
            version: "v0".into(),
        });
        buffer.push(Detail::ScoreClamped {
            original: 1.2,
            clamped: 1.0,
        });

        let rendered = buffer.into_details_string().expect("some");
        let parsed: Vec<Detail> = rendered
            .lines()
            .map(|line| serde_json::from_str::<Detail>(line).unwrap())
            .collect();

        assert_eq!(parsed.len(), 2);
        assert!(matches!(parsed[0], Detail::PromptVersion { .. }));
        assert!(matches!(parsed[1], Detail::ScoreClamped { .. }));
    }

    #[test]
    fn empty_detail_buffer_renders_none() {
        let rendered = DetailBuffer::new().into_details_string();
        assert!(rendered.is_none());
    }

    #[test]
    fn config_builder_surface() {
        let (registry, _) = make_registry(0.5);
        let config = JudgeEvaluatorConfig::default_with(registry)
            .with_system_prompt("sys")
            .with_use_reasoning(false)
            .with_feedback_key("fb");

        assert_eq!(config.system_prompt.as_deref(), Some("sys"));
        assert!(!config.use_reasoning);
        assert_eq!(config.feedback_key.as_deref(), Some("fb"));
    }

    #[tokio::test]
    async fn dispatch_records_feedback_key_when_configured() {
        let (registry, _) = make_registry(0.8);
        let config =
            JudgeEvaluatorConfig::default_with(registry).with_feedback_key("quality.score");

        let outcome = dispatch_with(&config, make_template()).await;

        let has_feedback_key = outcome
            .details
            .entries()
            .iter()
            .any(|d| matches!(d, Detail::FeedbackKey { key } if key == "quality.score"));
        assert!(has_feedback_key);
    }
}