1use std::collections::{BTreeMap, HashSet};
4use std::path::{Component, Path, PathBuf};
5use std::sync::Arc;
6use std::time::Duration;
7
8use serde::{Deserialize, Deserializer, Serialize, Serializer};
9use sha2::{Digest, Sha256};
10use swink_agent::{AssistantMessage, Cost, ModelSpec, StopReason, ToolResultMessage, Usage};
11use swink_agent_policies::{BudgetPolicy, MaxTurnsPolicy};
12use thiserror::Error;
13use url::Url;
14use uuid::Uuid;
15
16use crate::error::EvalError;
17use crate::score::{Score, Verdict};
18use crate::url_filter::UrlFilter;
19
20#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct RecordedToolCall {
25 pub id: String,
27 pub name: String,
29 pub arguments: serde_json::Value,
31}
32
33#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct TurnRecord {
36 pub turn_index: usize,
38 pub assistant_message: AssistantMessage,
40 pub tool_calls: Vec<RecordedToolCall>,
42 pub tool_results: Vec<ToolResultMessage>,
44 pub duration: Duration,
46}
47
48#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct Invocation {
51 pub turns: Vec<TurnRecord>,
53 pub total_usage: Usage,
55 pub total_cost: Cost,
57 pub total_duration: Duration,
59 pub final_response: Option<String>,
61 pub stop_reason: StopReason,
63 pub model: ModelSpec,
65}
66
67#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct ExpectedToolCall {
72 pub tool_name: String,
74 #[serde(default, skip_serializing_if = "Option::is_none")]
76 pub arguments: Option<serde_json::Value>,
77}
78
79#[derive(Clone, Serialize, Deserialize)]
81#[serde(tag = "mode", rename_all = "snake_case")]
82pub enum ResponseCriteria {
83 Exact { expected: String },
85 Contains { substring: String },
87 Regex { pattern: String },
89 #[serde(skip)]
91 Custom(#[serde(skip)] Arc<dyn Fn(&str) -> Score + Send + Sync>),
92}
93
94impl std::fmt::Debug for ResponseCriteria {
95 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96 match self {
97 Self::Exact { expected } => {
98 f.debug_struct("Exact").field("expected", expected).finish()
99 }
100 Self::Contains { substring } => f
101 .debug_struct("Contains")
102 .field("substring", substring)
103 .finish(),
104 Self::Regex { pattern } => f.debug_struct("Regex").field("pattern", pattern).finish(),
105 Self::Custom(_) => f.debug_tuple("Custom").field(&"<fn>").finish(),
106 }
107 }
108}
109
110#[derive(Debug, Clone, Serialize, Deserialize)]
116pub struct EnvironmentState {
117 pub name: String,
121 pub state: serde_json::Value,
123}
124
125#[derive(Debug, Clone, Serialize, Deserialize)]
130pub struct ToolIntent {
131 pub intent: String,
133 #[serde(default, skip_serializing_if = "Option::is_none")]
135 pub tool_name: Option<String>,
136}
137
138pub type StateCapture = Arc<dyn Fn(&Invocation) -> Vec<EnvironmentState> + Send + Sync>;
147
148#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
150pub struct Assertion {
151 pub description: String,
153 pub kind: AssertionKind,
155}
156
157#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
159#[serde(rename_all = "snake_case")]
160pub enum AssertionKind {
161 GoalCompleted,
163 UserSatisfied,
165 ToolInvoked(String),
167 Custom { predicate: String },
169}
170
171#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
173pub struct InteractionExpectation {
174 pub from: String,
176 pub to: String,
178 pub description: String,
180}
181
182#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
184pub struct FewShotExample {
185 pub input: String,
187 pub expected: String,
189 #[serde(default, skip_serializing_if = "Option::is_none")]
191 pub reasoning: Option<String>,
192}
193
194#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
196#[serde(rename_all = "snake_case")]
197pub enum Attachment {
198 Path(PathBuf),
200 Base64 { mime: String, bytes: Vec<u8> },
202 Url(String),
204}
205
/// An attachment resolved to its MIME type and content bytes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MaterializedAttachment {
    /// MIME type of the content.
    pub mime: String,
    /// The content bytes.
    pub bytes: Vec<u8>,
}
212
213#[derive(Debug, Error)]
215pub enum AttachmentError {
216 #[error("attachment path not found: {0}")]
217 PathNotFound(PathBuf),
218 #[error("attachment decode failed: {0}")]
219 DecodeError(String),
220 #[error("attachment URL blocked: {url}: {reason}")]
221 UrlBlocked { url: String, reason: String },
222 #[error("attachment fetch failed: {url}: status {status}")]
223 FetchFailed { url: String, status: u16 },
224 #[error("unsupported attachment MIME type: {mime}")]
225 UnsupportedMime { mime: String },
226}
227
228impl Attachment {
229 pub async fn materialize(
233 &self,
234 eval_set_root: &Path,
235 filter: &dyn UrlFilter,
236 ) -> Result<MaterializedAttachment, AttachmentError> {
237 match self {
238 Self::Path(path) => materialize_path(eval_set_root, path).await,
239 Self::Base64 { mime, bytes } => {
240 validate_attachment_mime(mime)?;
241 Ok(MaterializedAttachment {
242 mime: normalize_mime(mime),
243 bytes: bytes.clone(),
244 })
245 }
246 Self::Url(url) => materialize_url(url, filter).await,
247 }
248 }
249}
250
251async fn materialize_path(
252 eval_set_root: &Path,
253 path: &Path,
254) -> Result<MaterializedAttachment, AttachmentError> {
255 if path.is_absolute()
256 || path
257 .components()
258 .any(|component| component == Component::ParentDir)
259 {
260 return Err(AttachmentError::PathNotFound(path.to_path_buf()));
261 }
262
263 let full_path = eval_set_root.join(path);
264 let bytes = tokio::fs::read(&full_path)
265 .await
266 .map_err(|_| AttachmentError::PathNotFound(path.to_path_buf()))?;
267 let mime = mime_from_path(path)?;
268
269 Ok(MaterializedAttachment { mime, bytes })
270}
271
272async fn materialize_url(
273 url: &str,
274 filter: &dyn UrlFilter,
275) -> Result<MaterializedAttachment, AttachmentError> {
276 let parsed = Url::parse(url).map_err(|err| AttachmentError::UrlBlocked {
277 url: url.to_string(),
278 reason: err.to_string(),
279 })?;
280
281 validate_remote_url(&parsed, filter)?;
282
283 materialize_checked_url(parsed, filter).await
284}
285
/// Fetches an already-validated URL, following redirects manually.
///
/// Automatic redirects are disabled on the client so that every redirect
/// target is re-validated (scheme and `filter`) before it is requested.
/// At most `MAX_REQUESTS` requests are issued in total.
///
/// The MIME type comes from the `Content-Type` header when present (and must
/// be a supported type); otherwise it is inferred from the URL path
/// extension.
///
/// # Errors
///
/// * [`AttachmentError::FetchFailed`] with `status: 0` for client/transport
///   failures, body read failures, or when the request cap is exhausted;
///   with the HTTP status for non-success responses and for redirects that
///   lack a usable `Location` header.
/// * [`AttachmentError::UrlBlocked`] when a redirect target is rejected.
/// * [`AttachmentError::UnsupportedMime`] when the content type is not
///   supported.
#[cfg(feature = "multimodal")]
async fn materialize_checked_url(
    parsed: Url,
    filter: &dyn UrlFilter,
) -> Result<MaterializedAttachment, AttachmentError> {
    /// Upper bound on requests issued for one attachment (initial request
    /// plus redirect hops).
    const MAX_REQUESTS: usize = 10;

    let client = reqwest::Client::builder()
        .redirect(reqwest::redirect::Policy::none())
        .build()
        .map_err(|_| AttachmentError::FetchFailed {
            url: parsed.as_str().to_string(),
            status: 0,
        })?;
    let mut current = parsed;

    for _ in 0..MAX_REQUESTS {
        let url = current.as_str().to_string();
        let response = client
            .get(current.clone())
            .send()
            .await
            .map_err(|_| AttachmentError::FetchFailed {
                url: url.clone(),
                status: 0,
            })?;
        let status = response.status();

        if status.is_redirection() {
            let location = response
                .headers()
                .get(reqwest::header::LOCATION)
                .and_then(|value| value.to_str().ok())
                .ok_or_else(|| AttachmentError::FetchFailed {
                    url: url.clone(),
                    status: status.as_u16(),
                })?;
            // Resolve relative to the URL just requested and re-validate
            // before following.
            current = resolve_redirect_target(&current, location, filter)?;
            continue;
        }

        if !status.is_success() {
            return Err(AttachmentError::FetchFailed {
                url,
                status: status.as_u16(),
            });
        }

        let content_type = response
            .headers()
            .get(reqwest::header::CONTENT_TYPE)
            .and_then(|value| value.to_str().ok())
            .map(normalize_mime);
        let mime = match content_type {
            Some(mime) => {
                validate_attachment_mime(&mime)?;
                mime
            }
            None => mime_from_url_path(&url)?,
        };
        let bytes = response
            .bytes()
            .await
            .map_err(|_| AttachmentError::FetchFailed { url, status: 0 })?
            .to_vec();

        return Ok(MaterializedAttachment { mime, bytes });
    }

    // Request cap exhausted while still being redirected.
    Err(AttachmentError::FetchFailed {
        url: current.as_str().to_string(),
        status: 0,
    })
}
359
360#[cfg(not(feature = "multimodal"))]
361#[allow(clippy::unused_async)]
362async fn materialize_checked_url(
363 parsed: Url,
364 _filter: &dyn UrlFilter,
365) -> Result<MaterializedAttachment, AttachmentError> {
366 Err(AttachmentError::FetchFailed {
367 url: parsed.as_str().to_string(),
368 status: 0,
369 })
370}
371
372fn validate_remote_url(url: &Url, filter: &dyn UrlFilter) -> Result<(), AttachmentError> {
373 if url.scheme() != "https" {
374 return Err(AttachmentError::UrlBlocked {
375 url: url.as_str().to_string(),
376 reason: "only https URLs are supported".to_string(),
377 });
378 }
379
380 if !filter.allows(url) {
381 return Err(AttachmentError::UrlBlocked {
382 url: url.as_str().to_string(),
383 reason: "blocked by URL filter".to_string(),
384 });
385 }
386
387 Ok(())
388}
389
/// Resolves a redirect `location` against `current` and validates the result
/// with the same rules as the original URL.
///
/// # Errors
///
/// Returns [`AttachmentError::UrlBlocked`] if `location` cannot be joined to
/// `current` (the error reports `current` as the URL) or if the resolved
/// target fails validation.
#[cfg(feature = "multimodal")]
fn resolve_redirect_target(
    current: &Url,
    location: &str,
    filter: &dyn UrlFilter,
) -> Result<Url, AttachmentError> {
    let target = match current.join(location) {
        Ok(target) => target,
        Err(err) => {
            return Err(AttachmentError::UrlBlocked {
                url: current.as_str().to_string(),
                reason: format!("invalid redirect target: {err}"),
            });
        }
    };
    validate_remote_url(&target, filter).map(|()| target)
}
405
406fn mime_from_path(path: &Path) -> Result<String, AttachmentError> {
407 let extension = path
408 .extension()
409 .and_then(|extension| extension.to_str())
410 .unwrap_or_default()
411 .to_ascii_lowercase();
412 let mime = match extension.as_str() {
413 "png" => "image/png",
414 "jpg" | "jpeg" => "image/jpeg",
415 "gif" => "image/gif",
416 "webp" => "image/webp",
417 _ => {
418 return Err(AttachmentError::UnsupportedMime {
419 mime: "application/octet-stream".to_string(),
420 });
421 }
422 };
423 Ok(mime.to_string())
424}
425
/// Infers an image MIME type from the extension in the path part of `url`.
///
/// # Errors
///
/// Returns [`AttachmentError::UnsupportedMime`] (reported as
/// `application/octet-stream`) if `url` does not parse or its path extension
/// is not supported.
#[cfg(feature = "multimodal")]
fn mime_from_url_path(url: &str) -> Result<String, AttachmentError> {
    match Url::parse(url) {
        Ok(parsed) => mime_from_path(Path::new(parsed.path())),
        Err(_) => Err(AttachmentError::UnsupportedMime {
            mime: "application/octet-stream".to_string(),
        }),
    }
}
433
/// Reduces a MIME string to its bare type: everything from the first `;`
/// onward (parameters such as `charset`) is dropped, surrounding whitespace
/// is trimmed, and ASCII letters are lowercased.
fn normalize_mime(mime: &str) -> String {
    let essence = match mime.split_once(';') {
        Some((before_params, _)) => before_params,
        None => mime,
    };
    essence.trim().to_ascii_lowercase()
}
441
442fn validate_attachment_mime(mime: &str) -> Result<(), AttachmentError> {
443 let mime = normalize_mime(mime);
444 match mime.as_str() {
445 "image/png" | "image/jpeg" | "image/gif" | "image/webp" => Ok(()),
446 _ => Err(AttachmentError::UnsupportedMime { mime }),
447 }
448}
449
450pub const CASE_NAMESPACE: Uuid = Uuid::from_bytes([
455 37, 101, 28, 203, 118, 231, 87, 244, 147, 248, 152, 59, 222, 174, 80, 226,
456]);
457
458#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
461pub struct CaseFingerprint {
462 pub id: String,
463 pub name: String,
464 pub description: Option<String>,
465 pub system_prompt: String,
466 pub user_messages: Vec<String>,
467 pub expected_trajectory: Option<Vec<ExpectedToolCallFingerprint>>,
468 pub expected_response: Option<ResponseCriteriaFingerprint>,
469 pub expected_assertion: Option<Assertion>,
470 pub expected_interactions: Option<Vec<InteractionExpectation>>,
471 pub few_shot_examples: Vec<FewShotExample>,
472 pub budget: Option<BudgetConstraintsFingerprint>,
473 pub evaluators: Vec<String>,
474 pub metadata: CanonicalJsonValue,
475 pub attachments: Vec<AttachmentFingerprint>,
476 pub expected_environment_state: Option<Vec<EnvironmentStateFingerprint>>,
477 pub expected_tool_intent: Option<ToolIntentFingerprint>,
478 pub semantic_tool_selection: bool,
479}
480
481#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
482pub struct ExpectedToolCallFingerprint {
483 pub tool_name: String,
484 pub arguments: Option<CanonicalJsonValue>,
485}
486
487#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
488pub enum ResponseCriteriaFingerprint {
489 Exact { expected: String },
490 Contains { substring: String },
491 Regex { pattern: String },
492 Custom,
493}
494
495#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
496pub struct BudgetConstraintsFingerprint {
497 pub cost_limit_bits: Option<u64>,
498 pub input_limit: Option<u64>,
499 pub output_limit: Option<u64>,
500 pub turn_limit: Option<usize>,
501}
502
503#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
504pub struct EnvironmentStateFingerprint {
505 pub name: String,
506 pub state: CanonicalJsonValue,
507}
508
509#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
510pub struct ToolIntentFingerprint {
511 pub intent: String,
512 pub tool_name: Option<String>,
513}
514
515#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
516pub enum AttachmentFingerprint {
517 Path(String),
518 Base64 { mime: String, sha256: String },
519 Url(String),
520}
521
522#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
523#[serde(tag = "kind", content = "value", rename_all = "snake_case")]
524pub enum CanonicalJsonValue {
525 Null,
526 Bool(bool),
527 Number(String),
528 String(String),
529 Array(Vec<Self>),
530 Object(BTreeMap<String, Self>),
531}
532
533impl From<&serde_json::Value> for CanonicalJsonValue {
534 fn from(value: &serde_json::Value) -> Self {
535 match value {
536 serde_json::Value::Null => Self::Null,
537 serde_json::Value::Bool(value) => Self::Bool(*value),
538 serde_json::Value::Number(value) => Self::Number(value.to_string()),
539 serde_json::Value::String(value) => Self::String(value.clone()),
540 serde_json::Value::Array(values) => {
541 Self::Array(values.iter().map(Self::from).collect())
542 }
543 serde_json::Value::Object(values) => Self::Object(
544 values
545 .iter()
546 .map(|(key, value)| (key.clone(), Self::from(value)))
547 .collect(),
548 ),
549 }
550 }
551}
552
553impl From<&ExpectedToolCall> for ExpectedToolCallFingerprint {
554 fn from(call: &ExpectedToolCall) -> Self {
555 Self {
556 tool_name: call.tool_name.clone(),
557 arguments: call.arguments.as_ref().map(CanonicalJsonValue::from),
558 }
559 }
560}
561
562impl From<&ResponseCriteria> for ResponseCriteriaFingerprint {
563 fn from(criteria: &ResponseCriteria) -> Self {
564 match criteria {
565 ResponseCriteria::Exact { expected } => Self::Exact {
566 expected: expected.clone(),
567 },
568 ResponseCriteria::Contains { substring } => Self::Contains {
569 substring: substring.clone(),
570 },
571 ResponseCriteria::Regex { pattern } => Self::Regex {
572 pattern: pattern.clone(),
573 },
574 ResponseCriteria::Custom(_) => Self::Custom,
575 }
576 }
577}
578
579impl From<&BudgetConstraints> for BudgetConstraintsFingerprint {
580 fn from(budget: &BudgetConstraints) -> Self {
581 Self {
582 cost_limit_bits: budget.max_cost.map(f64::to_bits),
583 input_limit: budget.max_input,
584 output_limit: budget.max_output,
585 turn_limit: budget.max_turns,
586 }
587 }
588}
589
590impl From<&EnvironmentState> for EnvironmentStateFingerprint {
591 fn from(state: &EnvironmentState) -> Self {
592 Self {
593 name: state.name.clone(),
594 state: CanonicalJsonValue::from(&state.state),
595 }
596 }
597}
598
599impl From<&ToolIntent> for ToolIntentFingerprint {
600 fn from(intent: &ToolIntent) -> Self {
601 Self {
602 intent: intent.intent.clone(),
603 tool_name: intent.tool_name.clone(),
604 }
605 }
606}
607
608impl From<&Attachment> for AttachmentFingerprint {
609 fn from(attachment: &Attachment) -> Self {
610 match attachment {
611 Attachment::Path(path) => Self::Path(path.to_string_lossy().replace('\\', "/")),
612 Attachment::Base64 { mime, bytes } => {
613 let digest = Sha256::digest(bytes);
614 Self::Base64 {
615 mime: normalize_mime(mime),
616 sha256: hex_lower(&digest),
617 }
618 }
619 Attachment::Url(url) => Self::Url(url.clone()),
620 }
621 }
622}
623
/// Encodes `bytes` as lowercase hexadecimal, two digits per byte.
fn hex_lower(bytes: &[u8]) -> String {
    const DIGITS: [char; 16] = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
    ];

    let mut encoded = String::with_capacity(bytes.len() * 2);
    // High nibble first, then low nibble.
    encoded.extend(bytes.iter().flat_map(|&byte| {
        [
            DIGITS[usize::from(byte >> 4)],
            DIGITS[usize::from(byte & 0x0f)],
        ]
    }));
    encoded
}
633
634#[derive(Debug, Clone, Serialize, Deserialize)]
636pub struct BudgetConstraints {
637 #[serde(default, skip_serializing_if = "Option::is_none")]
639 pub max_cost: Option<f64>,
640 #[serde(default, skip_serializing_if = "Option::is_none")]
642 pub max_input: Option<u64>,
643 #[serde(default, skip_serializing_if = "Option::is_none")]
645 pub max_output: Option<u64>,
646 #[serde(default, skip_serializing_if = "Option::is_none")]
648 pub max_turns: Option<usize>,
649}
650
651impl BudgetConstraints {
652 #[must_use]
654 pub fn to_policies(&self) -> (Option<BudgetPolicy>, Option<MaxTurnsPolicy>) {
655 let budget_policy =
656 if self.max_cost.is_none() && self.max_input.is_none() && self.max_output.is_none() {
657 None
658 } else {
659 let mut policy = BudgetPolicy::new();
660 if let Some(max_cost) = self.max_cost {
661 policy = policy.max_cost(max_cost);
662 }
663 if let Some(max_input) = self.max_input {
664 policy = policy.max_input(max_input);
665 }
666 if let Some(max_output) = self.max_output {
667 policy = policy.max_output(max_output);
668 }
669 Some(policy)
670 };
671
672 let max_turns_policy = self.max_turns.map(MaxTurnsPolicy::new);
673
674 (budget_policy, max_turns_policy)
675 }
676}
677
678#[derive(Clone, Serialize, Deserialize)]
684pub struct EvalCase {
685 pub id: String,
687 pub name: String,
689 #[serde(default, skip_serializing_if = "Option::is_none")]
691 pub description: Option<String>,
692 pub system_prompt: String,
694 pub user_messages: Vec<String>,
696 #[serde(default, skip_serializing_if = "Option::is_none")]
698 pub expected_trajectory: Option<Vec<ExpectedToolCall>>,
699 #[serde(default, skip_serializing_if = "Option::is_none")]
701 pub expected_response: Option<ResponseCriteria>,
702 #[serde(default, skip_serializing_if = "Option::is_none")]
704 pub expected_assertion: Option<Assertion>,
705 #[serde(default, skip_serializing_if = "Option::is_none")]
707 pub expected_interactions: Option<Vec<InteractionExpectation>>,
708 #[serde(default, skip_serializing_if = "Vec::is_empty")]
710 pub few_shot_examples: Vec<FewShotExample>,
711 #[serde(default, skip_serializing_if = "Option::is_none")]
713 pub budget: Option<BudgetConstraints>,
714 #[serde(default, skip_serializing_if = "Vec::is_empty")]
716 pub evaluators: Vec<String>,
717 #[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
719 pub metadata: serde_json::Value,
720 #[serde(default, skip_serializing_if = "Vec::is_empty")]
722 pub attachments: Vec<Attachment>,
723 #[serde(
726 default,
727 skip_serializing_if = "Option::is_none",
728 serialize_with = "serialize_optional_uuid",
729 deserialize_with = "deserialize_optional_uuid"
730 )]
731 pub session_id: Option<Uuid>,
732 #[serde(default, skip_serializing_if = "Option::is_none")]
737 pub expected_environment_state: Option<Vec<EnvironmentState>>,
738 #[serde(default, skip_serializing_if = "Option::is_none")]
740 pub expected_tool_intent: Option<ToolIntent>,
741 #[serde(default, skip_serializing_if = "is_false")]
743 pub semantic_tool_selection: bool,
744 #[serde(skip)]
747 pub state_capture: Option<StateCapture>,
748}
749
750impl std::fmt::Debug for EvalCase {
751 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
752 f.debug_struct("EvalCase")
753 .field("id", &self.id)
754 .field("name", &self.name)
755 .field("description", &self.description)
756 .field("system_prompt", &self.system_prompt)
757 .field("user_messages", &self.user_messages)
758 .field("expected_trajectory", &self.expected_trajectory)
759 .field("expected_response", &self.expected_response)
760 .field("expected_assertion", &self.expected_assertion)
761 .field("expected_interactions", &self.expected_interactions)
762 .field("few_shot_examples", &self.few_shot_examples)
763 .field("budget", &self.budget)
764 .field("evaluators", &self.evaluators)
765 .field("metadata", &self.metadata)
766 .field("attachments", &self.attachments)
767 .field("session_id", &self.session_id)
768 .field(
769 "expected_environment_state",
770 &self.expected_environment_state,
771 )
772 .field("expected_tool_intent", &self.expected_tool_intent)
773 .field("semantic_tool_selection", &self.semantic_tool_selection)
774 .field(
775 "state_capture",
776 &self.state_capture.as_ref().map(|_| "<fn>"),
777 )
778 .finish()
779 }
780}
781
782impl From<&EvalCase> for CaseFingerprint {
783 fn from(case: &EvalCase) -> Self {
784 Self {
785 id: case.id.clone(),
786 name: case.name.clone(),
787 description: case.description.clone(),
788 system_prompt: case.system_prompt.clone(),
789 user_messages: case.user_messages.clone(),
790 expected_trajectory: case.expected_trajectory.as_ref().map(|calls| {
791 calls
792 .iter()
793 .map(ExpectedToolCallFingerprint::from)
794 .collect()
795 }),
796 expected_response: case
797 .expected_response
798 .as_ref()
799 .map(ResponseCriteriaFingerprint::from),
800 expected_assertion: case.expected_assertion.clone(),
801 expected_interactions: case.expected_interactions.clone(),
802 few_shot_examples: case.few_shot_examples.clone(),
803 budget: case.budget.as_ref().map(BudgetConstraintsFingerprint::from),
804 evaluators: case.evaluators.clone(),
805 metadata: CanonicalJsonValue::from(&case.metadata),
806 attachments: case
807 .attachments
808 .iter()
809 .map(AttachmentFingerprint::from)
810 .collect(),
811 expected_environment_state: case.expected_environment_state.as_ref().map(|states| {
812 states
813 .iter()
814 .map(EnvironmentStateFingerprint::from)
815 .collect()
816 }),
817 expected_tool_intent: case
818 .expected_tool_intent
819 .as_ref()
820 .map(ToolIntentFingerprint::from),
821 semantic_tool_selection: case.semantic_tool_selection,
822 }
823 }
824}
825
826impl EvalCase {
827 #[must_use]
830 pub fn content_fingerprint(&self) -> CaseFingerprint {
831 CaseFingerprint::from(self)
832 }
833
834 #[must_use]
841 pub fn default_session_id(&self) -> Uuid {
842 let canonical =
843 serde_json::to_vec(&self.content_fingerprint()).expect("case fingerprint serializes");
844 let digest = Sha256::digest(canonical);
845 Uuid::new_v5(&CASE_NAMESPACE, digest.as_slice())
846 }
847
848 pub fn validate(&self) -> Result<(), EvalError> {
850 if let Some(assertion) = &self.expected_assertion {
851 validate_non_empty_field(
852 &self.id,
853 "expected_assertion.description",
854 &assertion.description,
855 )?;
856 match &assertion.kind {
857 AssertionKind::GoalCompleted | AssertionKind::UserSatisfied => {}
858 AssertionKind::ToolInvoked(tool_name) => {
859 validate_non_empty_field(
860 &self.id,
861 "expected_assertion.kind.tool_name",
862 tool_name,
863 )?;
864 }
865 AssertionKind::Custom { predicate } => {
866 validate_non_empty_field(
867 &self.id,
868 "expected_assertion.kind.predicate",
869 predicate,
870 )?;
871 }
872 }
873 }
874
875 if let Some(interactions) = &self.expected_interactions {
876 for (index, interaction) in interactions.iter().enumerate() {
877 let field_prefix = format!("expected_interactions[{index}]");
878 validate_non_empty_field(
879 &self.id,
880 &format!("{field_prefix}.from"),
881 &interaction.from,
882 )?;
883 validate_non_empty_field(&self.id, &format!("{field_prefix}.to"), &interaction.to)?;
884 validate_non_empty_field(
885 &self.id,
886 &format!("{field_prefix}.description"),
887 &interaction.description,
888 )?;
889 }
890 }
891
892 for (index, example) in self.few_shot_examples.iter().enumerate() {
893 let field_prefix = format!("few_shot_examples[{index}]");
894 validate_non_empty_field(&self.id, &format!("{field_prefix}.input"), &example.input)?;
895 validate_non_empty_field(
896 &self.id,
897 &format!("{field_prefix}.expected"),
898 &example.expected,
899 )?;
900 if let Some(reasoning) = &example.reasoning {
901 validate_non_empty_field(
902 &self.id,
903 &format!("{field_prefix}.reasoning"),
904 reasoning,
905 )?;
906 }
907 }
908
909 for (index, attachment) in self.attachments.iter().enumerate() {
910 validate_attachment_declaration(&self.id, index, attachment)?;
911 }
912
913 if let Some(states) = &self.expected_environment_state {
914 let mut seen: HashSet<&str> = HashSet::with_capacity(states.len());
915 for state in states {
916 if !seen.insert(state.name.as_str()) {
917 return Err(EvalError::invalid_case(format!(
918 "case `{case_id}`: duplicate expected_environment_state name `{name}`",
919 case_id = self.id,
920 name = state.name,
921 )));
922 }
923 }
924 }
925
926 Ok(())
927 }
928}
929
/// Serde `skip_serializing_if` predicate: true when the flag is `false`.
///
/// Takes `&bool` because serde passes the field by reference.
#[allow(clippy::trivially_copy_pass_by_ref)]
const fn is_false(b: &bool) -> bool {
    matches!(*b, false)
}
934
935#[derive(Debug, Clone, Serialize, Deserialize)]
937pub struct EvalSet {
938 pub id: String,
940 pub name: String,
942 #[serde(default, skip_serializing_if = "Option::is_none")]
944 pub description: Option<String>,
945 pub cases: Vec<EvalCase>,
947}
948
949#[derive(Debug, Clone, Serialize, Deserialize)]
953pub struct EvalMetricResult {
954 pub evaluator_name: String,
956 pub score: Score,
958 #[serde(default, skip_serializing_if = "Option::is_none")]
960 pub details: Option<String>,
961}
962
963#[derive(Debug, Clone, Serialize, Deserialize)]
965pub struct EvalCaseResult {
966 pub case_id: String,
968 pub invocation: Invocation,
970 pub metric_results: Vec<EvalMetricResult>,
972 pub verdict: Verdict,
974}
975
976#[derive(Debug, Clone, Serialize, Deserialize)]
978pub struct EvalSetResult {
979 pub eval_set_id: String,
981 pub case_results: Vec<EvalCaseResult>,
983 pub summary: EvalSummary,
985 pub timestamp: u64,
987}
988
989#[derive(Debug, Clone, Serialize, Deserialize)]
991pub struct EvalSummary {
992 pub total_cases: usize,
994 pub passed: usize,
996 pub failed: usize,
998 pub total_cost: Cost,
1000 pub total_usage: Usage,
1002 pub total_duration: Duration,
1004}
1005
1006pub fn validate_eval_case(case: &EvalCase) -> Result<(), EvalError> {
1019 case.validate()
1020}
1021
1022pub fn validate_eval_set(set: &EvalSet) -> Result<(), EvalError> {
1024 let mut seen_case_ids: HashSet<&str> = HashSet::with_capacity(set.cases.len());
1025 for case in &set.cases {
1026 if !seen_case_ids.insert(case.id.as_str()) {
1027 return Err(EvalError::invalid_case(format!(
1028 "eval set `{set_id}`: duplicate case id `{case_id}`",
1029 set_id = set.id,
1030 case_id = case.id,
1031 )));
1032 }
1033 case.validate()?;
1034 }
1035 Ok(())
1036}
1037
1038fn validate_non_empty_field(case_id: &str, field: &str, value: &str) -> Result<(), EvalError> {
1039 if value.trim().is_empty() {
1040 return Err(EvalError::invalid_case(format!(
1041 "case `{case_id}`: `{field}` must not be blank"
1042 )));
1043 }
1044 Ok(())
1045}
1046
1047fn validate_attachment_declaration(
1048 case_id: &str,
1049 index: usize,
1050 attachment: &Attachment,
1051) -> Result<(), EvalError> {
1052 match attachment {
1053 Attachment::Path(path) => {
1054 if path.as_os_str().is_empty()
1055 || path.is_absolute()
1056 || path
1057 .components()
1058 .any(|component| component == Component::ParentDir)
1059 {
1060 return Err(EvalError::invalid_case(format!(
1061 "case `{case_id}`: attachments[{index}] path must stay relative to the eval-set root"
1062 )));
1063 }
1064 }
1065 Attachment::Base64 { mime, .. } => {
1066 validate_attachment_mime(mime).map_err(|err| {
1067 EvalError::invalid_case(format!(
1068 "case `{case_id}`: attachments[{index}] invalid MIME: {err}"
1069 ))
1070 })?;
1071 }
1072 Attachment::Url(url) => {
1073 let parsed = Url::parse(url).map_err(|err| {
1074 EvalError::invalid_case(format!(
1075 "case `{case_id}`: attachments[{index}] invalid URL: {err}"
1076 ))
1077 })?;
1078 if parsed.scheme() != "https" {
1079 return Err(EvalError::invalid_case(format!(
1080 "case `{case_id}`: attachments[{index}] URL must use https"
1081 )));
1082 }
1083 }
1084 }
1085
1086 Ok(())
1087}
1088
1089#[allow(clippy::ref_option)]
1090fn serialize_optional_uuid<S>(value: &Option<Uuid>, serializer: S) -> Result<S::Ok, S::Error>
1091where
1092 S: Serializer,
1093{
1094 match value {
1095 Some(uuid) => serializer.serialize_some(&uuid.to_string()),
1096 None => serializer.serialize_none(),
1097 }
1098}
1099
1100fn deserialize_optional_uuid<'de, D>(deserializer: D) -> Result<Option<Uuid>, D::Error>
1101where
1102 D: Deserializer<'de>,
1103{
1104 let value = Option::<String>::deserialize(deserializer)?;
1105 value
1106 .map(|value| {
1107 Uuid::parse_str(&value).map_err(|err| serde::de::Error::custom(err.to_string()))
1108 })
1109 .transpose()
1110}
1111
1112#[cfg(test)]
1113mod validation_tests {
1114 use super::*;
1115
1116 fn base_case(id: &str) -> EvalCase {
1117 EvalCase {
1118 id: id.to_string(),
1119 name: id.to_string(),
1120 description: None,
1121 system_prompt: String::new(),
1122 user_messages: vec!["hi".to_string()],
1123 expected_trajectory: None,
1124 expected_response: None,
1125 expected_assertion: None,
1126 expected_interactions: None,
1127 few_shot_examples: vec![],
1128 budget: None,
1129 evaluators: vec![],
1130 metadata: serde_json::Value::Null,
1131 attachments: vec![],
1132 session_id: None,
1133 expected_environment_state: None,
1134 expected_tool_intent: None,
1135 semantic_tool_selection: false,
1136 state_capture: None,
1137 }
1138 }
1139
1140 #[test]
1141 fn validate_accepts_unique_environment_state_names() {
1142 let mut case = base_case("c1");
1143 case.expected_environment_state = Some(vec![
1144 EnvironmentState {
1145 name: "alpha".into(),
1146 state: serde_json::json!({"v": 1}),
1147 },
1148 EnvironmentState {
1149 name: "beta".into(),
1150 state: serde_json::json!({"v": 2}),
1151 },
1152 ]);
1153 assert!(validate_eval_case(&case).is_ok());
1154 }
1155
1156 #[test]
1157 fn validate_rejects_duplicate_environment_state_names() {
1158 let mut case = base_case("dup");
1159 case.expected_environment_state = Some(vec![
1160 EnvironmentState {
1161 name: "alpha".into(),
1162 state: serde_json::json!({"v": 1}),
1163 },
1164 EnvironmentState {
1165 name: "alpha".into(),
1166 state: serde_json::json!({"v": 2}),
1167 },
1168 ]);
1169 let err = validate_eval_case(&case).expect_err("duplicate should be rejected");
1170 match err {
1171 EvalError::InvalidCase { reason } => {
1172 assert!(reason.contains("alpha"), "reason: {reason}");
1173 assert!(reason.contains("dup"), "reason mentions case id: {reason}");
1174 }
1175 other => panic!("expected InvalidCase, got {other:?}"),
1176 }
1177 }
1178
1179 #[test]
1180 fn validate_none_environment_state_is_ok() {
1181 let case = base_case("none");
1182 assert!(validate_eval_case(&case).is_ok());
1183 }
1184
1185 #[test]
1186 fn validate_eval_set_propagates_case_errors() {
1187 let mut case = base_case("bad");
1188 case.expected_environment_state = Some(vec![
1189 EnvironmentState {
1190 name: "x".into(),
1191 state: serde_json::Value::Null,
1192 },
1193 EnvironmentState {
1194 name: "x".into(),
1195 state: serde_json::Value::Null,
1196 },
1197 ]);
1198 let set = EvalSet {
1199 id: "set".into(),
1200 name: "Set".into(),
1201 description: None,
1202 cases: vec![case],
1203 };
1204 assert!(validate_eval_set(&set).is_err());
1205 }
1206
1207 #[test]
1208 fn environment_state_serde_round_trip() {
1209 let state = EnvironmentState {
1210 name: "db".into(),
1211 state: serde_json::json!({"rows": 3, "schema": "public"}),
1212 };
1213 let json = serde_json::to_string(&state).unwrap();
1214 let back: EnvironmentState = serde_json::from_str(&json).unwrap();
1215 assert_eq!(back.name, state.name);
1216 assert_eq!(back.state, state.state);
1217 }
1218
/// An `EvalCase` with the optional "v2" fields populated survives a JSON round trip.
#[test]
fn eval_case_serde_round_trip_with_v2_fields() {
    // Start from the shared fixture and fill in the fields under test.
    let mut original = base_case("v2");
    original.expected_environment_state = Some(vec![EnvironmentState {
        name: "alpha".into(),
        state: serde_json::json!({"n": 1}),
    }]);
    original.expected_tool_intent = Some(ToolIntent {
        intent: "read config".into(),
        tool_name: Some("read_file".into()),
    });
    original.expected_assertion = Some(Assertion {
        description: "goal completed".into(),
        kind: AssertionKind::GoalCompleted,
    });
    original.expected_interactions = Some(vec![InteractionExpectation {
        from: "planner".into(),
        to: "worker".into(),
        description: "delegates the task".into(),
    }]);
    original.few_shot_examples = vec![FewShotExample {
        input: "hello".into(),
        expected: "world".into(),
        reasoning: Some("example".into()),
    }];
    original.session_id = Some(Uuid::nil());
    original.semantic_tool_selection = true;

    // Round trip through JSON text.
    let encoded = serde_json::to_string(&original).unwrap();
    let decoded: EvalCase = serde_json::from_str(&encoded).unwrap();

    // Every populated field must come back with the content that went in.
    let environment_states = decoded.expected_environment_state.as_ref().unwrap();
    assert_eq!(environment_states.len(), 1);

    let tool_intent = decoded.expected_tool_intent.as_ref().unwrap();
    assert_eq!(tool_intent.intent, "read config");

    let assertion = decoded.expected_assertion.as_ref().unwrap();
    assert_eq!(assertion.description, "goal completed");

    let interactions = decoded.expected_interactions.as_ref().unwrap();
    assert_eq!(interactions.len(), 1);

    assert_eq!(decoded.few_shot_examples.len(), 1);
    assert_eq!(decoded.session_id, Some(Uuid::nil()));
    assert!(decoded.semantic_tool_selection);

    // Fields this test never assigns are asserted to deserialize as empty / absent.
    assert!(decoded.attachments.is_empty());
    assert!(decoded.state_capture.is_none());
}
1264
/// `CASE_NAMESPACE` must equal the v5 UUID derived from the OID namespace and the
/// literal name `swink-agent-eval.case`.
#[test]
fn case_namespace_matches_oid_derived_value() {
    let derived = Uuid::new_v5(&Uuid::NAMESPACE_OID, b"swink-agent-eval.case");
    assert_eq!(CASE_NAMESPACE, derived);
}
1272
/// Calling `default_session_id` twice on the same, unmodified case yields the same id.
#[test]
fn default_session_id_is_deterministic_for_same_case() {
    let mut fixture = base_case("stable");

    // Nested objects and arrays in the metadata, plus response and trajectory
    // expectations, so the id is computed over a non-trivial case.
    fixture.metadata = serde_json::json!({
        "beta": [2, {"y": true, "x": false}],
        "alpha": {"nested_b": 2, "nested_a": 1}
    });
    fixture.expected_response = Some(ResponseCriteria::Contains {
        substring: "ok".into(),
    });
    let read_config_call = ExpectedToolCall {
        tool_name: "read_file".into(),
        arguments: Some(serde_json::json!({"path": "./project-alpha/config.toml"})),
    };
    fixture.expected_trajectory = Some(vec![read_config_call]);

    assert_eq!(fixture.default_session_id(), fixture.default_session_id());
}
1292
/// Two cases whose JSON payloads differ only in object key order share a session id.
#[test]
fn default_session_id_is_stable_across_json_key_order() {
    let mut left = base_case("ordered");
    left.metadata = serde_json::json!({
        "alpha": {"x": 1, "y": 2},
        "beta": [3, 4]
    });
    left.expected_environment_state = Some(vec![EnvironmentState {
        name: "workspace".into(),
        state: serde_json::json!({"files": {"b": 2, "a": 1}}),
    }]);

    // Same content as `left`, but parsed from text whose object keys appear
    // in a different order.
    let reordered_metadata =
        serde_json::from_str(r#"{"beta":[3,4],"alpha":{"y":2,"x":1}}"#).expect("valid metadata json");
    let reordered_state =
        serde_json::from_str(r#"{"files":{"a":1,"b":2}}"#).expect("valid state json");
    let right = EvalCase {
        metadata: reordered_metadata,
        expected_environment_state: Some(vec![EnvironmentState {
            name: "workspace".into(),
            state: reordered_state,
        }]),
        ..left.clone()
    };

    let left_id = left.default_session_id();
    let right_id = right.default_session_id();
    assert_eq!(left_id, right_id);
}
1315
/// Appending a user message to a case must produce a different default session id.
#[test]
fn default_session_id_changes_when_case_content_changes() {
    let mut fixture = base_case("mutates");

    let id_before = fixture.default_session_id();
    fixture.user_messages.push("follow-up".into());
    let id_after = fixture.default_session_id();

    assert_ne!(id_before, id_after);
}
1323}
1324
#[cfg(test)]
mod budget_policy_tests {
    //! Tests for `BudgetConstraints::to_policies`: which policies are produced for
    //! which limits, and that the produced policies stop a turn once a limit is reached.

    use super::*;
    use swink_agent::{Cost, PolicyContext, PolicyVerdict, PreTurnPolicy, SessionState, Usage};

    /// Builds a `PolicyContext` for the given turn index, usage and cost.
    ///
    /// All other fields are fixed: zero messages, no overflow signal, no new messages.
    /// The `SessionState` is leaked on purpose so the returned context can hold a
    /// reference to it after this function returns; each call leaks one state.
    fn make_ctx<'a>(turn_index: usize, usage: &'a Usage, cost: &'a Cost) -> PolicyContext<'a> {
        let leaked_state = Box::leak(Box::new(SessionState::new()));
        PolicyContext {
            turn_index,
            accumulated_usage: usage,
            accumulated_cost: cost,
            message_count: 0,
            overflow_signal: false,
            new_messages: &[],
            state: leaked_state,
        }
    }

    /// With every limit unset, neither policy is created.
    #[test]
    fn budget_constraints_to_policies_none_when_unset() {
        let unlimited = BudgetConstraints {
            max_cost: None,
            max_input: None,
            max_output: None,
            max_turns: None,
        };

        let (budget, max_turns) = unlimited.to_policies();

        assert!(budget.is_none());
        assert!(max_turns.is_none());
    }

    /// A cost limit alone yields only a budget policy, which stops once the
    /// accumulated cost equals the limit.
    #[test]
    fn budget_constraints_to_policies_builds_budget_only_for_cost() {
        let cost_limited = BudgetConstraints {
            max_cost: Some(1.0),
            max_input: None,
            max_output: None,
            max_turns: None,
        };
        let (budget, max_turns) = cost_limited.to_policies();

        // Accumulated cost sits exactly at the configured limit.
        let spent = Cost {
            total: 1.0,
            ..Default::default()
        };
        let no_usage = Usage::default();
        let ctx = make_ctx(0, &no_usage, &spent);

        let policy = budget.unwrap();
        let verdict = PreTurnPolicy::evaluate(&policy, &ctx);
        assert!(matches!(verdict, PolicyVerdict::Stop(_)));
        assert!(max_turns.is_none());
    }

    /// Input/output limits alone yield only a budget policy, which stops once
    /// the accumulated usage equals those limits.
    #[test]
    fn budget_constraints_to_policies_builds_budget_only_for_input_output() {
        let token_limited = BudgetConstraints {
            max_cost: None,
            max_input: Some(10),
            max_output: Some(20),
            max_turns: None,
        };
        let (budget, max_turns) = token_limited.to_policies();

        // Accumulated usage sits exactly at the configured input/output limits.
        let consumed = Usage {
            input: 10,
            output: 20,
            total: 30,
            ..Default::default()
        };
        let no_cost = Cost::default();
        let ctx = make_ctx(0, &consumed, &no_cost);

        let policy = budget.unwrap();
        let verdict = PreTurnPolicy::evaluate(&policy, &ctx);
        assert!(matches!(verdict, PolicyVerdict::Stop(_)));
        assert!(max_turns.is_none());
    }

    /// A cost limit together with a turn limit yields both policies, and each
    /// one stops when its own limit is reached.
    #[test]
    fn budget_constraints_to_policies_builds_both_policies_when_needed() {
        let doubly_limited = BudgetConstraints {
            max_cost: Some(2.0),
            max_input: None,
            max_output: None,
            max_turns: Some(3),
        };
        let (budget, max_turns) = doubly_limited.to_policies();

        let no_usage = Usage::default();

        // Context for the budget policy: cost at the limit, turn index 0.
        let spent = Cost {
            total: 2.0,
            ..Default::default()
        };
        let budget_ctx = make_ctx(0, &no_usage, &spent);

        // Context for the turn policy: no cost, turn index equal to the turn limit.
        let nothing_spent = Cost::default();
        let turn_ctx = make_ctx(3, &no_usage, &nothing_spent);

        let budget_policy = budget.unwrap();
        let budget_verdict = PreTurnPolicy::evaluate(&budget_policy, &budget_ctx);
        assert!(matches!(budget_verdict, PolicyVerdict::Stop(_)));

        let turns_policy = max_turns.unwrap();
        let turns_verdict = PreTurnPolicy::evaluate(&turns_policy, &turn_ctx);
        assert!(matches!(turns_verdict, PolicyVerdict::Stop(_)));
    }
}
1437
#[cfg(all(test, feature = "multimodal"))]
mod attachment_url_tests {
    //! Tests for `resolve_redirect_target`: every redirect hop is checked against
    //! the URL filter and the https-only rule before it is followed.

    use super::*;

    /// Test filter that admits exactly two hosts and rejects everything else.
    struct AllowListedFilter;

    impl UrlFilter for AllowListedFilter {
        fn allows(&self, url: &Url) -> bool {
            matches!(
                url.host_str(),
                Some("assets.example.com" | "cdn.example.com")
            )
        }
    }

    /// A redirect from an allowed host to a host outside the allow list is
    /// rejected with `UrlBlocked`, naming the target and the filter as the reason.
    #[test]
    fn resolve_redirect_target_revalidates_each_hop_against_filter() {
        let current = Url::parse("https://assets.example.com/image.png").unwrap();
        let err = resolve_redirect_target(
            &current,
            "https://169.254.169.254/latest/meta-data",
            &AllowListedFilter,
        )
        .expect_err("redirect target should be revalidated");

        match err {
            AttachmentError::UrlBlocked { url, reason } => {
                assert_eq!(url, "https://169.254.169.254/latest/meta-data");
                assert!(reason.contains("blocked by URL filter"));
            }
            other => panic!("expected UrlBlocked, got {other:?}"),
        }
    }

    /// A redirect to an `http://` URL is rejected even though its host is on
    /// the allow list.
    #[test]
    fn resolve_redirect_target_rejects_http_downgrades() {
        let current = Url::parse("https://assets.example.com/image.png").unwrap();
        let err = resolve_redirect_target(
            &current,
            "http://cdn.example.com/image.png",
            &AllowListedFilter,
        )
        .expect_err("http redirect should be rejected");

        match err {
            AttachmentError::UrlBlocked { url, reason } => {
                assert_eq!(url, "http://cdn.example.com/image.png");
                assert!(reason.contains("only https URLs are supported"));
            }
            other => panic!("expected UrlBlocked, got {other:?}"),
        }
    }

    /// A relative redirect is resolved against the current URL and accepted
    /// when the resulting https URL passes the filter.
    #[test]
    fn resolve_redirect_target_allows_relative_https_redirects_when_filter_passes() {
        let current = Url::parse("https://assets.example.com/path/start.png").unwrap();
        let redirected =
            resolve_redirect_target(&current, "../final.webp", &AllowListedFilter).unwrap();

        // "../final.webp" relative to "/path/start.png" resolves to "/final.webp".
        assert_eq!(redirected.as_str(), "https://assets.example.com/final.webp");
    }
}