swink_agent_eval/
types.rs

1//! Data types for evaluation cases, invocations, and results.
2
3use std::collections::{BTreeMap, HashSet};
4use std::path::{Component, Path, PathBuf};
5use std::sync::Arc;
6use std::time::Duration;
7
8use serde::{Deserialize, Deserializer, Serialize, Serializer};
9use sha2::{Digest, Sha256};
10use swink_agent::{AssistantMessage, Cost, ModelSpec, StopReason, ToolResultMessage, Usage};
11use swink_agent_policies::{BudgetPolicy, MaxTurnsPolicy};
12use thiserror::Error;
13use url::Url;
14use uuid::Uuid;
15
16use crate::error::EvalError;
17use crate::score::{Score, Verdict};
18use crate::url_filter::UrlFilter;
19
20// ─── Recorded Data ──────────────────────────────────────────────────────────
21
/// A tool call as captured from the agent event stream.
///
/// Plain data record holding the call's identifier, the tool's name, and the
/// JSON arguments the tool was given.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecordedToolCall {
    /// Provider-assigned tool call ID.
    pub id: String,
    /// Name of the tool that was invoked.
    pub name: String,
    /// Parsed JSON arguments passed to the tool.
    pub arguments: serde_json::Value,
}
32
/// A single recorded turn from an agent run.
///
/// Groups the assistant message of one turn with the tool calls and tool
/// results recorded alongside it, plus how long the turn took.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TurnRecord {
    /// Zero-based index of this turn within the run.
    pub turn_index: usize,
    /// The assistant message produced during this turn.
    pub assistant_message: AssistantMessage,
    /// Tool calls made during this turn (in execution order).
    pub tool_calls: Vec<RecordedToolCall>,
    /// Tool results returned during this turn.
    pub tool_results: Vec<ToolResultMessage>,
    /// Wall-clock duration of this turn.
    pub duration: Duration,
}
47
/// Complete trace of an agent run, built by [`TrajectoryCollector`](crate::TrajectoryCollector).
///
/// Carries the per-turn records together with run-level aggregates (usage,
/// cost, duration) and the final outcome of the run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Invocation {
    /// All turns in execution order.
    pub turns: Vec<TurnRecord>,
    /// Aggregated token usage across all turns.
    pub total_usage: Usage,
    /// Aggregated cost across all turns.
    pub total_cost: Cost,
    /// Wall-clock duration of the entire run.
    pub total_duration: Duration,
    /// Extracted text from the final assistant message, if any.
    pub final_response: Option<String>,
    /// Stop reason from the final turn.
    pub stop_reason: StopReason,
    /// Model used for this run.
    pub model: ModelSpec,
}
66
67// ─── Expected Data ──────────────────────────────────────────────────────────
68
/// A single expected tool invocation in a golden path.
///
/// `arguments` is optional: it defaults to `None` when missing from the
/// serialized form and is omitted from output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExpectedToolCall {
    /// The tool name that should be called.
    pub tool_name: String,
    /// If present, the arguments must match exactly (JSON equality).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub arguments: Option<serde_json::Value>,
}
78
/// Criteria for matching the final response text.
///
/// Serialized as an internally tagged enum: the variant is carried in a
/// `mode` field using snake_case names. The `Custom` variant is skipped by
/// serde, so it can only be constructed in code. `Debug` is implemented by
/// hand because the custom closure has no `Debug` representation.
#[derive(Clone, Serialize, Deserialize)]
#[serde(tag = "mode", rename_all = "snake_case")]
pub enum ResponseCriteria {
    /// Response must match exactly.
    Exact { expected: String },
    /// Response must contain the given substring.
    Contains { substring: String },
    /// Response must match the given regex pattern.
    Regex { pattern: String },
    /// Custom scoring function (not serializable — set programmatically).
    #[serde(skip)]
    Custom(#[serde(skip)] Arc<dyn Fn(&str) -> Score + Send + Sync>),
}
93
94impl std::fmt::Debug for ResponseCriteria {
95    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96        match self {
97            Self::Exact { expected } => {
98                f.debug_struct("Exact").field("expected", expected).finish()
99            }
100            Self::Contains { substring } => f
101                .debug_struct("Contains")
102                .field("substring", substring)
103                .finish(),
104            Self::Regex { pattern } => f.debug_struct("Regex").field("pattern", pattern).finish(),
105            Self::Custom(_) => f.debug_tuple("Custom").field(&"<fn>").finish(),
106        }
107    }
108}
109
/// Named snapshot of an environment state produced by a [`StateCapture`].
///
/// Used with `EvalCase::expected_environment_state` to assert that after the
/// agent completes, the captured environment matches the expected values via
/// full JSON equality (FR-013, FR-015).
///
/// The same type represents both sides of the comparison: the expected
/// entries stored on a case and the entries returned by a capture callback.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentState {
    /// Identifier for this state entry. Duplicate names within a single
    /// `expected_environment_state` are rejected at case-load time
    /// (FR-015, SC-009).
    pub name: String,
    /// Expected (or captured) JSON value; compared for full JSON equality.
    pub state: serde_json::Value,
}
124
/// Expected semantic tool intent used by the tool-parameter semantic evaluator.
///
/// When `tool_name` is `Some`, only tool calls whose name matches are judged;
/// other calls are skipped (not Pass, not Fail).
///
/// `tool_name` defaults to `None` when missing from the serialized form and
/// is omitted from output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolIntent {
    /// Natural-language description of what the tool call should accomplish.
    pub intent: String,
    /// When `Some`, restrict judging to tool calls with this exact name.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tool_name: Option<String>,
}
137
/// Callback that captures the environment state after an agent run completes.
///
/// Registered programmatically on an [`EvalCase`] (or supplied by the
/// `AgentFactory`). The callback is invoked once after the agent finishes; its
/// output populates the "actual" side for the `EnvironmentStateEvaluator`.
///
/// Panics are caught by the evaluator and surfaced as `Score::fail()` with the
/// panic message (FR-014).
///
/// The alias is a shared (`Arc`) closure that is `Send + Sync`, takes the
/// finished [`Invocation`] by reference, and returns the captured entries.
pub type StateCapture = Arc<dyn Fn(&Invocation) -> Vec<EnvironmentState> + Send + Sync>;
147
/// Judge-evaluated assertion expected to hold after an agent invocation.
///
/// Pairs a human-readable description with a machine-readable
/// [`AssertionKind`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Assertion {
    /// Natural-language assertion description.
    pub description: String,
    /// Machine-readable assertion category.
    pub kind: AssertionKind,
}
156
/// Assertion categories used by judge-backed evaluators.
///
/// Variant names are serialized in snake_case.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AssertionKind {
    /// The user's goal was completed.
    GoalCompleted,
    /// The user appears satisfied with the outcome.
    UserSatisfied,
    /// A named tool must be invoked; the payload is the tool name.
    ToolInvoked(String),
    /// Free-form predicate evaluated by a judge-backed evaluator.
    Custom { predicate: String },
}
170
/// Expected interaction between agents, tools, or hand-off participants.
///
/// All three fields are free-form strings; this type itself imposes no
/// structure on them.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct InteractionExpectation {
    /// Source participant or component.
    pub from: String,
    /// Target participant or component.
    pub to: String,
    /// Expected interaction description.
    pub description: String,
}
181
/// Example shown to a judge prompt before the case being evaluated.
///
/// `reasoning` defaults to `None` when missing from the serialized form and
/// is omitted from output when `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FewShotExample {
    /// Example input.
    pub input: String,
    /// Expected output or verdict.
    pub expected: String,
    /// Optional reasoning to include with the example.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub reasoning: Option<String>,
}
193
/// Multimodal attachment reference attached to an evaluation case.
///
/// Variant names are serialized in snake_case. Use [`Attachment::materialize`]
/// to turn a reference into bytes plus a MIME type.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Attachment {
    /// File path resolved relative to the eval-set root at materialization time.
    Path(PathBuf),
    /// Self-contained bytes with an explicit MIME type.
    Base64 { mime: String, bytes: Vec<u8> },
    /// Remote HTTPS resource guarded by a [`UrlFilter`].
    Url(String),
}
205
/// Bytes ready for judge-client payload construction.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MaterializedAttachment {
    /// MIME type of the payload.
    pub mime: String,
    /// Raw attachment content.
    pub bytes: Vec<u8>,
}
212
/// Structured attachment materialization errors.
#[derive(Debug, Error)]
pub enum AttachmentError {
    /// The attachment path was rejected or the file could not be read.
    #[error("attachment path not found: {0}")]
    PathNotFound(PathBuf),
    /// Attachment content could not be decoded; the payload is a message.
    // NOTE(review): no producer of this variant is visible in this part of the file.
    #[error("attachment decode failed: {0}")]
    DecodeError(String),
    /// The URL was refused before fetching, with the reason it was refused.
    #[error("attachment URL blocked: {url}: {reason}")]
    UrlBlocked { url: String, reason: String },
    /// Fetching the URL failed. `status` is `0` when no HTTP status applies.
    #[error("attachment fetch failed: {url}: status {status}")]
    FetchFailed { url: String, status: u16 },
    /// The attachment's MIME type is not one of the supported types.
    #[error("unsupported attachment MIME type: {mime}")]
    UnsupportedMime { mime: String },
}
227
228impl Attachment {
229    /// Materialize an attachment into bytes suitable for judge dispatch.
230    ///
231    /// URL fetching is available when the `multimodal` feature is enabled.
232    pub async fn materialize(
233        &self,
234        eval_set_root: &Path,
235        filter: &dyn UrlFilter,
236    ) -> Result<MaterializedAttachment, AttachmentError> {
237        match self {
238            Self::Path(path) => materialize_path(eval_set_root, path).await,
239            Self::Base64 { mime, bytes } => {
240                validate_attachment_mime(mime)?;
241                Ok(MaterializedAttachment {
242                    mime: normalize_mime(mime),
243                    bytes: bytes.clone(),
244                })
245            }
246            Self::Url(url) => materialize_url(url, filter).await,
247        }
248    }
249}
250
251async fn materialize_path(
252    eval_set_root: &Path,
253    path: &Path,
254) -> Result<MaterializedAttachment, AttachmentError> {
255    if path.is_absolute()
256        || path
257            .components()
258            .any(|component| component == Component::ParentDir)
259    {
260        return Err(AttachmentError::PathNotFound(path.to_path_buf()));
261    }
262
263    let full_path = eval_set_root.join(path);
264    let bytes = tokio::fs::read(&full_path)
265        .await
266        .map_err(|_| AttachmentError::PathNotFound(path.to_path_buf()))?;
267    let mime = mime_from_path(path)?;
268
269    Ok(MaterializedAttachment { mime, bytes })
270}
271
272async fn materialize_url(
273    url: &str,
274    filter: &dyn UrlFilter,
275) -> Result<MaterializedAttachment, AttachmentError> {
276    let parsed = Url::parse(url).map_err(|err| AttachmentError::UrlBlocked {
277        url: url.to_string(),
278        reason: err.to_string(),
279    })?;
280
281    validate_remote_url(&parsed, filter)?;
282
283    materialize_checked_url(parsed, filter).await
284}
285
/// Fetch an already-validated URL, following redirects by hand.
///
/// Automatic redirects are disabled on the client so that every `Location`
/// target goes through `resolve_redirect_target` before it is requested.
/// At most ten requests are issued.
///
/// The MIME type is taken from the `Content-Type` header when it is present
/// and readable, otherwise it is derived from the extension of the URL path.
///
/// # Errors
///
/// Returns [`AttachmentError::FetchFailed`] with `status: 0` when the client
/// cannot be built, a request cannot be sent, the body cannot be read, or the
/// request limit is exhausted. The HTTP status is reported when a redirect
/// has no readable `Location` header or the response is not successful.
/// Errors from redirect validation and MIME validation are propagated
/// unchanged.
#[cfg(feature = "multimodal")]
async fn materialize_checked_url(
    parsed: Url,
    filter: &dyn UrlFilter,
) -> Result<MaterializedAttachment, AttachmentError> {
    // Redirects are handled manually below so each hop can be re-validated.
    let client = reqwest::Client::builder()
        .redirect(reqwest::redirect::Policy::none())
        .build()
        .map_err(|_| AttachmentError::FetchFailed {
            url: parsed.as_str().to_string(),
            status: 0,
        })?;
    let mut current = parsed;

    // One request per iteration; redirects consume iterations too.
    for _ in 0..10 {
        // Owned copy of the URL for use in error values.
        let url = current.as_str().to_string();
        let response =
            client
                .get(current.clone())
                .send()
                .await
                .map_err(|_| AttachmentError::FetchFailed {
                    url: url.clone(),
                    status: 0,
                })?;
        let status = response.status();

        if status.is_redirection() {
            // A redirect status without a usable `Location` header is a failure.
            let location = response
                .headers()
                .get(reqwest::header::LOCATION)
                .and_then(|value| value.to_str().ok())
                .ok_or_else(|| AttachmentError::FetchFailed {
                    url: url.clone(),
                    status: status.as_u16(),
                })?;
            // Resolves relative targets against the current URL and re-checks them.
            current = resolve_redirect_target(&current, location, filter)?;
            continue;
        }

        if !status.is_success() {
            return Err(AttachmentError::FetchFailed {
                url,
                status: status.as_u16(),
            });
        }

        let content_type = response
            .headers()
            .get(reqwest::header::CONTENT_TYPE)
            .and_then(|value| value.to_str().ok())
            .map(normalize_mime);
        let mime = match content_type {
            Some(mime) => {
                validate_attachment_mime(&mime)?;
                mime
            }
            // No usable header: fall back to the URL path's file extension.
            None => mime_from_url_path(&url)?,
        };
        let bytes = response
            .bytes()
            .await
            .map_err(|_| AttachmentError::FetchFailed { url, status: 0 })?
            .to_vec();

        return Ok(MaterializedAttachment { mime, bytes });
    }

    // Every iteration ended in a redirect: give up on the last target.
    Err(AttachmentError::FetchFailed {
        url: current.as_str().to_string(),
        status: 0,
    })
}
359
360#[cfg(not(feature = "multimodal"))]
361#[allow(clippy::unused_async)]
362async fn materialize_checked_url(
363    parsed: Url,
364    _filter: &dyn UrlFilter,
365) -> Result<MaterializedAttachment, AttachmentError> {
366    Err(AttachmentError::FetchFailed {
367        url: parsed.as_str().to_string(),
368        status: 0,
369    })
370}
371
372fn validate_remote_url(url: &Url, filter: &dyn UrlFilter) -> Result<(), AttachmentError> {
373    if url.scheme() != "https" {
374        return Err(AttachmentError::UrlBlocked {
375            url: url.as_str().to_string(),
376            reason: "only https URLs are supported".to_string(),
377        });
378    }
379
380    if !filter.allows(url) {
381        return Err(AttachmentError::UrlBlocked {
382            url: url.as_str().to_string(),
383            reason: "blocked by URL filter".to_string(),
384        });
385    }
386
387    Ok(())
388}
389
/// Resolve a redirect `location` against `current` and vet the result with
/// the same checks applied to the initial URL.
///
/// # Errors
///
/// Returns [`AttachmentError::UrlBlocked`] when `location` cannot be joined
/// onto `current` (reported against `current`) or when the resolved URL fails
/// validation.
#[cfg(feature = "multimodal")]
fn resolve_redirect_target(
    current: &Url,
    location: &str,
    filter: &dyn UrlFilter,
) -> Result<Url, AttachmentError> {
    match current.join(location) {
        Ok(target) => validate_remote_url(&target, filter).map(|()| target),
        Err(err) => Err(AttachmentError::UrlBlocked {
            url: current.as_str().to_string(),
            reason: format!("invalid redirect target: {err}"),
        }),
    }
}
405
406fn mime_from_path(path: &Path) -> Result<String, AttachmentError> {
407    let extension = path
408        .extension()
409        .and_then(|extension| extension.to_str())
410        .unwrap_or_default()
411        .to_ascii_lowercase();
412    let mime = match extension.as_str() {
413        "png" => "image/png",
414        "jpg" | "jpeg" => "image/jpeg",
415        "gif" => "image/gif",
416        "webp" => "image/webp",
417        _ => {
418            return Err(AttachmentError::UnsupportedMime {
419                mime: "application/octet-stream".to_string(),
420            });
421        }
422    };
423    Ok(mime.to_string())
424}
425
/// Derive a MIME type from the file extension of a URL's path component.
///
/// # Errors
///
/// Returns [`AttachmentError::UnsupportedMime`] reporting
/// `application/octet-stream` when `url` does not parse, and otherwise
/// whatever `mime_from_path` returns for the URL path.
#[cfg(feature = "multimodal")]
fn mime_from_url_path(url: &str) -> Result<String, AttachmentError> {
    match Url::parse(url) {
        Ok(target) => mime_from_path(Path::new(target.path())),
        Err(_) => Err(AttachmentError::UnsupportedMime {
            mime: "application/octet-stream".to_string(),
        }),
    }
}
433
/// Reduce a MIME string to its bare type: everything from the first `;`
/// onward is dropped, surrounding whitespace is trimmed, and ASCII letters
/// are lowercased.
fn normalize_mime(mime: &str) -> String {
    let essence = match mime.find(';') {
        Some(separator) => &mime[..separator],
        None => mime,
    };
    essence.trim().to_ascii_lowercase()
}
441
442fn validate_attachment_mime(mime: &str) -> Result<(), AttachmentError> {
443    let mime = normalize_mime(mime);
444    match mime.as_str() {
445        "image/png" | "image/jpeg" | "image/gif" | "image/webp" => Ok(()),
446        _ => Err(AttachmentError::UnsupportedMime { mime }),
447    }
448}
449
/// Stable namespace for deterministic case-derived session IDs.
///
/// Pinned to `Uuid::new_v5(&Uuid::NAMESPACE_OID, b"swink-agent-eval.case")`
/// per spec 043 research R-014.
///
/// Stored as literal bytes so the value is a compile-time constant. Changing
/// any byte changes every session ID derived from this namespace.
pub const CASE_NAMESPACE: Uuid = Uuid::from_bytes([
    37, 101, 28, 203, 118, 231, 87, 244, 147, 248, 152, 59, 222, 174, 80, 226,
]);
457
/// Canonical serializable projection of an [`EvalCase`] used for deterministic
/// session IDs and future cache keys.
///
/// Each field mirrors the `EvalCase` field of the same name. JSON values,
/// attachments, budgets, and response criteria are replaced by their
/// fingerprint counterparts. `session_id` and `state_capture` have no
/// counterpart here.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct CaseFingerprint {
    pub id: String,
    pub name: String,
    pub description: Option<String>,
    pub system_prompt: String,
    pub user_messages: Vec<String>,
    pub expected_trajectory: Option<Vec<ExpectedToolCallFingerprint>>,
    pub expected_response: Option<ResponseCriteriaFingerprint>,
    pub expected_assertion: Option<Assertion>,
    pub expected_interactions: Option<Vec<InteractionExpectation>>,
    pub few_shot_examples: Vec<FewShotExample>,
    pub budget: Option<BudgetConstraintsFingerprint>,
    pub evaluators: Vec<String>,
    /// Case metadata in canonical form.
    pub metadata: CanonicalJsonValue,
    pub attachments: Vec<AttachmentFingerprint>,
    pub expected_environment_state: Option<Vec<EnvironmentStateFingerprint>>,
    pub expected_tool_intent: Option<ToolIntentFingerprint>,
    pub semantic_tool_selection: bool,
}
480
/// Fingerprint form of an [`ExpectedToolCall`]: the same tool name, with the
/// optional arguments converted to [`CanonicalJsonValue`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct ExpectedToolCallFingerprint {
    pub tool_name: String,
    pub arguments: Option<CanonicalJsonValue>,
}
486
/// Fingerprint form of [`ResponseCriteria`].
///
/// The data-carrying variants keep their string payload. `Custom` is a unit
/// variant that records only that a custom criterion is present.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum ResponseCriteriaFingerprint {
    Exact { expected: String },
    Contains { substring: String },
    Regex { pattern: String },
    Custom,
}
494
/// Fingerprint form of [`BudgetConstraints`].
///
/// The cost limit is held as raw `u64` bits rather than `f64`, which lets the
/// type derive `Eq`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct BudgetConstraintsFingerprint {
    /// Bit pattern of the `f64` cost limit, if one is set.
    pub cost_limit_bits: Option<u64>,
    pub input_limit: Option<u64>,
    pub output_limit: Option<u64>,
    pub turn_limit: Option<usize>,
}
502
/// Fingerprint form of an [`EnvironmentState`]: the same name, with the state
/// converted to [`CanonicalJsonValue`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct EnvironmentStateFingerprint {
    pub name: String,
    pub state: CanonicalJsonValue,
}
508
/// Fingerprint form of a [`ToolIntent`]; carries the same two fields.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct ToolIntentFingerprint {
    pub intent: String,
    pub tool_name: Option<String>,
}
514
/// Fingerprint form of an [`Attachment`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum AttachmentFingerprint {
    /// Path rendered as a string.
    Path(String),
    /// MIME type plus a SHA-256 digest string standing in for the raw bytes.
    Base64 { mime: String, sha256: String },
    /// URL string.
    Url(String),
}
521
/// JSON value in a canonical, order-stable shape.
///
/// Objects are held in a `BTreeMap`, so their keys iterate in sorted order.
/// Numbers are held as strings. The value is serialized as an adjacently
/// tagged enum with a snake_case `kind` tag and the payload under `value`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(tag = "kind", content = "value", rename_all = "snake_case")]
pub enum CanonicalJsonValue {
    Null,
    Bool(bool),
    /// Textual form of the number.
    Number(String),
    String(String),
    Array(Vec<Self>),
    /// Members keyed by name, in sorted key order.
    Object(BTreeMap<String, Self>),
}
532
533impl From<&serde_json::Value> for CanonicalJsonValue {
534    fn from(value: &serde_json::Value) -> Self {
535        match value {
536            serde_json::Value::Null => Self::Null,
537            serde_json::Value::Bool(value) => Self::Bool(*value),
538            serde_json::Value::Number(value) => Self::Number(value.to_string()),
539            serde_json::Value::String(value) => Self::String(value.clone()),
540            serde_json::Value::Array(values) => {
541                Self::Array(values.iter().map(Self::from).collect())
542            }
543            serde_json::Value::Object(values) => Self::Object(
544                values
545                    .iter()
546                    .map(|(key, value)| (key.clone(), Self::from(value)))
547                    .collect(),
548            ),
549        }
550    }
551}
552
553impl From<&ExpectedToolCall> for ExpectedToolCallFingerprint {
554    fn from(call: &ExpectedToolCall) -> Self {
555        Self {
556            tool_name: call.tool_name.clone(),
557            arguments: call.arguments.as_ref().map(CanonicalJsonValue::from),
558        }
559    }
560}
561
562impl From<&ResponseCriteria> for ResponseCriteriaFingerprint {
563    fn from(criteria: &ResponseCriteria) -> Self {
564        match criteria {
565            ResponseCriteria::Exact { expected } => Self::Exact {
566                expected: expected.clone(),
567            },
568            ResponseCriteria::Contains { substring } => Self::Contains {
569                substring: substring.clone(),
570            },
571            ResponseCriteria::Regex { pattern } => Self::Regex {
572                pattern: pattern.clone(),
573            },
574            ResponseCriteria::Custom(_) => Self::Custom,
575        }
576    }
577}
578
579impl From<&BudgetConstraints> for BudgetConstraintsFingerprint {
580    fn from(budget: &BudgetConstraints) -> Self {
581        Self {
582            cost_limit_bits: budget.max_cost.map(f64::to_bits),
583            input_limit: budget.max_input,
584            output_limit: budget.max_output,
585            turn_limit: budget.max_turns,
586        }
587    }
588}
589
590impl From<&EnvironmentState> for EnvironmentStateFingerprint {
591    fn from(state: &EnvironmentState) -> Self {
592        Self {
593            name: state.name.clone(),
594            state: CanonicalJsonValue::from(&state.state),
595        }
596    }
597}
598
599impl From<&ToolIntent> for ToolIntentFingerprint {
600    fn from(intent: &ToolIntent) -> Self {
601        Self {
602            intent: intent.intent.clone(),
603            tool_name: intent.tool_name.clone(),
604        }
605    }
606}
607
608impl From<&Attachment> for AttachmentFingerprint {
609    fn from(attachment: &Attachment) -> Self {
610        match attachment {
611            Attachment::Path(path) => Self::Path(path.to_string_lossy().replace('\\', "/")),
612            Attachment::Base64 { mime, bytes } => {
613                let digest = Sha256::digest(bytes);
614                Self::Base64 {
615                    mime: normalize_mime(mime),
616                    sha256: hex_lower(&digest),
617                }
618            }
619            Attachment::Url(url) => Self::Url(url.clone()),
620        }
621    }
622}
623
/// Encode `bytes` as lowercase hexadecimal, two digits per byte with the high
/// nibble first.
fn hex_lower(bytes: &[u8]) -> String {
    // `char::from_digit` yields lowercase letters for digits 10..=15.
    let digit = |nibble: u8| char::from_digit(u32::from(nibble), 16).expect("nibble is below 16");
    bytes
        .iter()
        .flat_map(|&byte| [digit(byte >> 4), digit(byte & 0x0f)])
        .collect()
}
633
/// Budget constraints for cost and latency governance.
///
/// Every limit is optional: each defaults to `None` when missing from the
/// serialized form and is omitted from output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BudgetConstraints {
    /// Maximum allowed cost in dollars.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_cost: Option<f64>,
    /// Maximum allowed input tokens.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_input: Option<u64>,
    /// Maximum allowed output tokens.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_output: Option<u64>,
    /// Maximum allowed number of turns.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_turns: Option<usize>,
}
650
651impl BudgetConstraints {
652    /// Convert budget constraints into loop policies for agent construction.
653    #[must_use]
654    pub fn to_policies(&self) -> (Option<BudgetPolicy>, Option<MaxTurnsPolicy>) {
655        let budget_policy =
656            if self.max_cost.is_none() && self.max_input.is_none() && self.max_output.is_none() {
657                None
658            } else {
659                let mut policy = BudgetPolicy::new();
660                if let Some(max_cost) = self.max_cost {
661                    policy = policy.max_cost(max_cost);
662                }
663                if let Some(max_input) = self.max_input {
664                    policy = policy.max_input(max_input);
665                }
666                if let Some(max_output) = self.max_output {
667                    policy = policy.max_output(max_output);
668                }
669                Some(policy)
670            };
671
672        let max_turns_policy = self.max_turns.map(MaxTurnsPolicy::new);
673
674        (budget_policy, max_turns_policy)
675    }
676}
677
678// ─── Eval Case & Set ────────────────────────────────────────────────────────
679
/// A single evaluation scenario.
///
/// Defines the agent prompt, expected outcomes, and which evaluators to run.
///
/// Only `id`, `name`, `system_prompt`, and `user_messages` are required in the
/// serialized form; every other serialized field has a default and is omitted
/// from output when it holds that default. `state_capture` is never
/// serialized. `Debug` is implemented by hand because the capture callback has
/// no `Debug` representation.
#[derive(Clone, Serialize, Deserialize)]
pub struct EvalCase {
    /// Unique identifier for this case.
    pub id: String,
    /// Human-readable name.
    pub name: String,
    /// Optional description of what this case tests.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// System prompt for the agent.
    pub system_prompt: String,
    /// Initial user messages (the prompt).
    pub user_messages: Vec<String>,
    /// Expected tool call trajectory (golden path).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected_trajectory: Option<Vec<ExpectedToolCall>>,
    /// Expected final response criteria.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected_response: Option<ResponseCriteria>,
    /// Judge-evaluated assertion expected to hold after the run.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected_assertion: Option<Assertion>,
    /// Expected interactions or hand-offs within the run.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected_interactions: Option<Vec<InteractionExpectation>>,
    /// Prompt examples injected ahead of judge-backed evaluations.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub few_shot_examples: Vec<FewShotExample>,
    /// Cost/budget governance constraints.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub budget: Option<BudgetConstraints>,
    /// Names of evaluators to run. Empty means all registered evaluators.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub evaluators: Vec<String>,
    /// Arbitrary metadata for user-defined extensions and filtering.
    /// Defaults to JSON `null`, which is also the value omitted on output.
    #[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
    pub metadata: serde_json::Value,
    /// Multimodal data references consumed by multimodal evaluators.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub attachments: Vec<Attachment>,
    /// Stable case/session identifier. When absent, callers may derive one
    /// deterministically via [`Self::default_session_id`].
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        serialize_with = "serialize_optional_uuid",
        deserialize_with = "deserialize_optional_uuid"
    )]
    pub session_id: Option<Uuid>,
    /// Expected environment-state snapshots keyed by name (FR-013).
    ///
    /// Compared against the output of `state_capture` via full JSON equality.
    /// Duplicate names are rejected at case-load time (FR-015, SC-009).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected_environment_state: Option<Vec<EnvironmentState>>,
    /// Expected semantic tool intent for the tool-parameter evaluator (FR-012).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected_tool_intent: Option<ToolIntent>,
    /// Enable semantic tool-selection scoring for this case (FR-011).
    #[serde(default, skip_serializing_if = "is_false")]
    pub semantic_tool_selection: bool,
    /// Callback that produces the actual environment state after the agent
    /// completes. Programmatic only — mirrors `ResponseCriteria::Custom`.
    #[serde(skip)]
    pub state_capture: Option<StateCapture>,
}
749
750impl std::fmt::Debug for EvalCase {
751    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
752        f.debug_struct("EvalCase")
753            .field("id", &self.id)
754            .field("name", &self.name)
755            .field("description", &self.description)
756            .field("system_prompt", &self.system_prompt)
757            .field("user_messages", &self.user_messages)
758            .field("expected_trajectory", &self.expected_trajectory)
759            .field("expected_response", &self.expected_response)
760            .field("expected_assertion", &self.expected_assertion)
761            .field("expected_interactions", &self.expected_interactions)
762            .field("few_shot_examples", &self.few_shot_examples)
763            .field("budget", &self.budget)
764            .field("evaluators", &self.evaluators)
765            .field("metadata", &self.metadata)
766            .field("attachments", &self.attachments)
767            .field("session_id", &self.session_id)
768            .field(
769                "expected_environment_state",
770                &self.expected_environment_state,
771            )
772            .field("expected_tool_intent", &self.expected_tool_intent)
773            .field("semantic_tool_selection", &self.semantic_tool_selection)
774            .field(
775                "state_capture",
776                &self.state_capture.as_ref().map(|_| "<fn>"),
777            )
778            .finish()
779    }
780}
781
782impl From<&EvalCase> for CaseFingerprint {
783    fn from(case: &EvalCase) -> Self {
784        Self {
785            id: case.id.clone(),
786            name: case.name.clone(),
787            description: case.description.clone(),
788            system_prompt: case.system_prompt.clone(),
789            user_messages: case.user_messages.clone(),
790            expected_trajectory: case.expected_trajectory.as_ref().map(|calls| {
791                calls
792                    .iter()
793                    .map(ExpectedToolCallFingerprint::from)
794                    .collect()
795            }),
796            expected_response: case
797                .expected_response
798                .as_ref()
799                .map(ResponseCriteriaFingerprint::from),
800            expected_assertion: case.expected_assertion.clone(),
801            expected_interactions: case.expected_interactions.clone(),
802            few_shot_examples: case.few_shot_examples.clone(),
803            budget: case.budget.as_ref().map(BudgetConstraintsFingerprint::from),
804            evaluators: case.evaluators.clone(),
805            metadata: CanonicalJsonValue::from(&case.metadata),
806            attachments: case
807                .attachments
808                .iter()
809                .map(AttachmentFingerprint::from)
810                .collect(),
811            expected_environment_state: case.expected_environment_state.as_ref().map(|states| {
812                states
813                    .iter()
814                    .map(EnvironmentStateFingerprint::from)
815                    .collect()
816            }),
817            expected_tool_intent: case
818                .expected_tool_intent
819                .as_ref()
820                .map(ToolIntentFingerprint::from),
821            semantic_tool_selection: case.semantic_tool_selection,
822        }
823    }
824}
825
826impl EvalCase {
827    /// Canonical serializable projection used by deterministic ID and cache-key
828    /// derivation.
829    #[must_use]
830    pub fn content_fingerprint(&self) -> CaseFingerprint {
831        CaseFingerprint::from(self)
832    }
833
834    /// Deterministically derive the default session ID for this case.
835    ///
836    /// Programmatic-only closures such as `state_capture` and
837    /// `ResponseCriteria::Custom` bodies are never serialized directly.
838    /// Instead, this hashes a stable canonical fingerprint that preserves the
839    /// presence of custom criteria while avoiding pointer-address instability.
840    #[must_use]
841    pub fn default_session_id(&self) -> Uuid {
842        let canonical =
843            serde_json::to_vec(&self.content_fingerprint()).expect("case fingerprint serializes");
844        let digest = Sha256::digest(canonical);
845        Uuid::new_v5(&CASE_NAMESPACE, digest.as_slice())
846    }
847
848    /// Validate this case's static configuration.
849    pub fn validate(&self) -> Result<(), EvalError> {
850        if let Some(assertion) = &self.expected_assertion {
851            validate_non_empty_field(
852                &self.id,
853                "expected_assertion.description",
854                &assertion.description,
855            )?;
856            match &assertion.kind {
857                AssertionKind::GoalCompleted | AssertionKind::UserSatisfied => {}
858                AssertionKind::ToolInvoked(tool_name) => {
859                    validate_non_empty_field(
860                        &self.id,
861                        "expected_assertion.kind.tool_name",
862                        tool_name,
863                    )?;
864                }
865                AssertionKind::Custom { predicate } => {
866                    validate_non_empty_field(
867                        &self.id,
868                        "expected_assertion.kind.predicate",
869                        predicate,
870                    )?;
871                }
872            }
873        }
874
875        if let Some(interactions) = &self.expected_interactions {
876            for (index, interaction) in interactions.iter().enumerate() {
877                let field_prefix = format!("expected_interactions[{index}]");
878                validate_non_empty_field(
879                    &self.id,
880                    &format!("{field_prefix}.from"),
881                    &interaction.from,
882                )?;
883                validate_non_empty_field(&self.id, &format!("{field_prefix}.to"), &interaction.to)?;
884                validate_non_empty_field(
885                    &self.id,
886                    &format!("{field_prefix}.description"),
887                    &interaction.description,
888                )?;
889            }
890        }
891
892        for (index, example) in self.few_shot_examples.iter().enumerate() {
893            let field_prefix = format!("few_shot_examples[{index}]");
894            validate_non_empty_field(&self.id, &format!("{field_prefix}.input"), &example.input)?;
895            validate_non_empty_field(
896                &self.id,
897                &format!("{field_prefix}.expected"),
898                &example.expected,
899            )?;
900            if let Some(reasoning) = &example.reasoning {
901                validate_non_empty_field(
902                    &self.id,
903                    &format!("{field_prefix}.reasoning"),
904                    reasoning,
905                )?;
906            }
907        }
908
909        for (index, attachment) in self.attachments.iter().enumerate() {
910            validate_attachment_declaration(&self.id, index, attachment)?;
911        }
912
913        if let Some(states) = &self.expected_environment_state {
914            let mut seen: HashSet<&str> = HashSet::with_capacity(states.len());
915            for state in states {
916                if !seen.insert(state.name.as_str()) {
917                    return Err(EvalError::invalid_case(format!(
918                        "case `{case_id}`: duplicate expected_environment_state name `{name}`",
919                        case_id = self.id,
920                        name = state.name,
921                    )));
922                }
923            }
924        }
925
926        Ok(())
927    }
928}
929
/// Returns `true` exactly when `*b` is `false`.
///
/// NOTE(review): the by-reference signature (and the lint allowance) suggests
/// this is a serde `skip_serializing_if` predicate — confirm at the use site.
#[allow(clippy::trivially_copy_pass_by_ref)]
const fn is_false(b: &bool) -> bool {
    matches!(*b, false)
}
934
/// A named collection of evaluation cases.
///
/// The type itself does not enforce structural invariants; use
/// [`validate_eval_set`] to reject duplicate case ids and invalid cases.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSet {
    /// Unique identifier for this set.
    pub id: String,
    /// Human-readable name.
    pub name: String,
    /// Optional description.
    ///
    /// Defaults to `None` when missing from the input and is left out of the
    /// serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// The cases in this set.
    pub cases: Vec<EvalCase>,
}
948
949// ─── Results ────────────────────────────────────────────────────────────────
950
/// Per-evaluator result for a single case.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalMetricResult {
    /// Name of the evaluator that produced this result.
    pub evaluator_name: String,
    /// The numeric score.
    pub score: Score,
    /// Optional human-readable details about the scoring.
    ///
    /// Defaults to `None` when missing from the input and is left out of the
    /// serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub details: Option<String>,
}
962
/// Result of evaluating a single case.
///
/// Bundles the recorded [`Invocation`] with the metrics computed from it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalCaseResult {
    /// The case ID that was evaluated.
    pub case_id: String,
    /// The captured invocation trace.
    pub invocation: Invocation,
    /// Per-evaluator metric results.
    pub metric_results: Vec<EvalMetricResult>,
    /// Overall verdict (all metrics must pass).
    pub verdict: Verdict,
}
975
/// Result of evaluating an entire eval set.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSetResult {
    /// The eval set ID.
    pub eval_set_id: String,
    /// Per-case results.
    pub case_results: Vec<EvalCaseResult>,
    /// Aggregated summary statistics.
    pub summary: EvalSummary,
    /// Unix timestamp when this result was produced.
    ///
    /// NOTE(review): the unit (seconds vs. milliseconds) is not established
    /// by this type — confirm against the code that fills it in.
    pub timestamp: u64,
}
988
/// Aggregated statistics for an eval set run.
///
/// This is a plain data carrier; it performs no aggregation itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSummary {
    /// Total number of cases evaluated.
    pub total_cases: usize,
    /// Number of cases that passed all metrics.
    pub passed: usize,
    /// Number of cases that failed at least one metric.
    pub failed: usize,
    /// Aggregated cost across all cases.
    pub total_cost: Cost,
    /// Aggregated token usage across all cases.
    pub total_usage: Usage,
    /// Total wall-clock duration across all cases.
    pub total_duration: Duration,
}
1005
1006// ─── Case-load Validation (FR-015, SC-009) ──────────────────────────────────
1007
1008/// Validate a single [`EvalCase`] against the case-load rules.
1009///
1010/// Currently enforces:
1011///
1012/// * `expected_environment_state` — names MUST be unique. Duplicates are
1013///   rejected with [`EvalError::InvalidCase`] pointing at the offending name
1014///   (FR-015, SC-009).
1015///
1016/// This check is shared by [`validate_eval_set`] and the YAML loader so
1017/// programmatic constructors get the same guarantees as on-disk configs.
1018pub fn validate_eval_case(case: &EvalCase) -> Result<(), EvalError> {
1019    case.validate()
1020}
1021
1022/// Validate an entire [`EvalSet`], short-circuiting on the first invalid case.
1023pub fn validate_eval_set(set: &EvalSet) -> Result<(), EvalError> {
1024    let mut seen_case_ids: HashSet<&str> = HashSet::with_capacity(set.cases.len());
1025    for case in &set.cases {
1026        if !seen_case_ids.insert(case.id.as_str()) {
1027            return Err(EvalError::invalid_case(format!(
1028                "eval set `{set_id}`: duplicate case id `{case_id}`",
1029                set_id = set.id,
1030                case_id = case.id,
1031            )));
1032        }
1033        case.validate()?;
1034    }
1035    Ok(())
1036}
1037
1038fn validate_non_empty_field(case_id: &str, field: &str, value: &str) -> Result<(), EvalError> {
1039    if value.trim().is_empty() {
1040        return Err(EvalError::invalid_case(format!(
1041            "case `{case_id}`: `{field}` must not be blank"
1042        )));
1043    }
1044    Ok(())
1045}
1046
1047fn validate_attachment_declaration(
1048    case_id: &str,
1049    index: usize,
1050    attachment: &Attachment,
1051) -> Result<(), EvalError> {
1052    match attachment {
1053        Attachment::Path(path) => {
1054            if path.as_os_str().is_empty()
1055                || path.is_absolute()
1056                || path
1057                    .components()
1058                    .any(|component| component == Component::ParentDir)
1059            {
1060                return Err(EvalError::invalid_case(format!(
1061                    "case `{case_id}`: attachments[{index}] path must stay relative to the eval-set root"
1062                )));
1063            }
1064        }
1065        Attachment::Base64 { mime, .. } => {
1066            validate_attachment_mime(mime).map_err(|err| {
1067                EvalError::invalid_case(format!(
1068                    "case `{case_id}`: attachments[{index}] invalid MIME: {err}"
1069                ))
1070            })?;
1071        }
1072        Attachment::Url(url) => {
1073            let parsed = Url::parse(url).map_err(|err| {
1074                EvalError::invalid_case(format!(
1075                    "case `{case_id}`: attachments[{index}] invalid URL: {err}"
1076                ))
1077            })?;
1078            if parsed.scheme() != "https" {
1079                return Err(EvalError::invalid_case(format!(
1080                    "case `{case_id}`: attachments[{index}] URL must use https"
1081                )));
1082            }
1083        }
1084    }
1085
1086    Ok(())
1087}
1088
1089#[allow(clippy::ref_option)]
1090fn serialize_optional_uuid<S>(value: &Option<Uuid>, serializer: S) -> Result<S::Ok, S::Error>
1091where
1092    S: Serializer,
1093{
1094    match value {
1095        Some(uuid) => serializer.serialize_some(&uuid.to_string()),
1096        None => serializer.serialize_none(),
1097    }
1098}
1099
1100fn deserialize_optional_uuid<'de, D>(deserializer: D) -> Result<Option<Uuid>, D::Error>
1101where
1102    D: Deserializer<'de>,
1103{
1104    let value = Option::<String>::deserialize(deserializer)?;
1105    value
1106        .map(|value| {
1107            Uuid::parse_str(&value).map_err(|err| serde::de::Error::custom(err.to_string()))
1108        })
1109        .transpose()
1110}
1111
#[cfg(test)]
mod validation_tests {
    use super::*;

    /// Minimal case: a single user message and every optional expectation unset.
    fn base_case(id: &str) -> EvalCase {
        let id = id.to_owned();
        EvalCase {
            name: id.clone(),
            id,
            description: None,
            system_prompt: String::new(),
            user_messages: vec![String::from("hi")],
            expected_trajectory: None,
            expected_response: None,
            expected_assertion: None,
            expected_interactions: None,
            few_shot_examples: Vec::new(),
            budget: None,
            evaluators: Vec::new(),
            metadata: serde_json::Value::Null,
            attachments: Vec::new(),
            session_id: None,
            expected_environment_state: None,
            expected_tool_intent: None,
            semantic_tool_selection: false,
            state_capture: None,
        }
    }

    /// Shorthand for building an [`EnvironmentState`] in the tests below.
    fn env_state(name: &str, state: serde_json::Value) -> EnvironmentState {
        EnvironmentState {
            name: name.into(),
            state,
        }
    }

    #[test]
    fn validate_accepts_unique_environment_state_names() {
        let case = EvalCase {
            expected_environment_state: Some(vec![
                env_state("alpha", serde_json::json!({"v": 1})),
                env_state("beta", serde_json::json!({"v": 2})),
            ]),
            ..base_case("c1")
        };

        assert!(validate_eval_case(&case).is_ok());
    }

    #[test]
    fn validate_rejects_duplicate_environment_state_names() {
        let case = EvalCase {
            expected_environment_state: Some(vec![
                env_state("alpha", serde_json::json!({"v": 1})),
                env_state("alpha", serde_json::json!({"v": 2})),
            ]),
            ..base_case("dup")
        };

        match validate_eval_case(&case).expect_err("duplicate should be rejected") {
            EvalError::InvalidCase { reason } => {
                assert!(reason.contains("alpha"), "reason: {reason}");
                assert!(reason.contains("dup"), "reason mentions case id: {reason}");
            }
            other => panic!("expected InvalidCase, got {other:?}"),
        }
    }

    #[test]
    fn validate_none_environment_state_is_ok() {
        assert!(validate_eval_case(&base_case("none")).is_ok());
    }

    #[test]
    fn validate_eval_set_propagates_case_errors() {
        let clashing_states = vec![
            env_state("x", serde_json::Value::Null),
            env_state("x", serde_json::Value::Null),
        ];
        let invalid_case = EvalCase {
            expected_environment_state: Some(clashing_states),
            ..base_case("bad")
        };
        let set = EvalSet {
            id: "set".into(),
            name: "Set".into(),
            description: None,
            cases: vec![invalid_case],
        };

        assert!(validate_eval_set(&set).is_err());
    }

    #[test]
    fn environment_state_serde_round_trip() {
        let original = env_state("db", serde_json::json!({"rows": 3, "schema": "public"}));

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: EnvironmentState = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.name, original.name);
        assert_eq!(decoded.state, original.state);
    }

    #[test]
    fn eval_case_serde_round_trip_with_v2_fields() {
        let case = EvalCase {
            expected_environment_state: Some(vec![env_state(
                "alpha",
                serde_json::json!({"n": 1}),
            )]),
            expected_tool_intent: Some(ToolIntent {
                intent: "read config".into(),
                tool_name: Some("read_file".into()),
            }),
            expected_assertion: Some(Assertion {
                description: "goal completed".into(),
                kind: AssertionKind::GoalCompleted,
            }),
            expected_interactions: Some(vec![InteractionExpectation {
                from: "planner".into(),
                to: "worker".into(),
                description: "delegates the task".into(),
            }]),
            few_shot_examples: vec![FewShotExample {
                input: "hello".into(),
                expected: "world".into(),
                reasoning: Some("example".into()),
            }],
            session_id: Some(Uuid::nil()),
            semantic_tool_selection: true,
            ..base_case("v2")
        };

        let encoded = serde_json::to_string(&case).unwrap();
        let decoded: EvalCase = serde_json::from_str(&encoded).unwrap();

        let states = decoded.expected_environment_state.as_ref().unwrap();
        assert_eq!(states.len(), 1);
        let intent = decoded.expected_tool_intent.as_ref().unwrap();
        assert_eq!(intent.intent, "read config");
        let assertion = decoded.expected_assertion.as_ref().unwrap();
        assert_eq!(assertion.description, "goal completed");
        let interactions = decoded.expected_interactions.as_ref().unwrap();
        assert_eq!(interactions.len(), 1);
        assert_eq!(decoded.few_shot_examples.len(), 1);
        assert_eq!(decoded.session_id, Some(Uuid::nil()));
        assert!(decoded.semantic_tool_selection);
        assert!(decoded.attachments.is_empty());
        assert!(decoded.state_capture.is_none());
    }

    #[test]
    fn case_namespace_matches_oid_derived_value() {
        let derived = Uuid::new_v5(&Uuid::NAMESPACE_OID, b"swink-agent-eval.case");
        assert_eq!(CASE_NAMESPACE, derived);
    }

    #[test]
    fn default_session_id_is_deterministic_for_same_case() {
        let case = EvalCase {
            metadata: serde_json::json!({
                "beta": [2, {"y": true, "x": false}],
                "alpha": {"nested_b": 2, "nested_a": 1}
            }),
            expected_response: Some(ResponseCriteria::Contains {
                substring: "ok".into(),
            }),
            expected_trajectory: Some(vec![ExpectedToolCall {
                tool_name: "read_file".into(),
                arguments: Some(serde_json::json!({"path": "./project-alpha/config.toml"})),
            }]),
            ..base_case("stable")
        };

        let first = case.default_session_id();
        let second = case.default_session_id();

        assert_eq!(first, second);
    }

    #[test]
    fn default_session_id_is_stable_across_json_key_order() {
        let left = EvalCase {
            metadata: serde_json::json!({
                "alpha": {"x": 1, "y": 2},
                "beta": [3, 4]
            }),
            expected_environment_state: Some(vec![env_state(
                "workspace",
                serde_json::json!({"files": {"b": 2, "a": 1}}),
            )]),
            ..base_case("ordered")
        };

        // Same content as `left`, with object keys written in a different order.
        let right = EvalCase {
            metadata: serde_json::from_str(r#"{"beta":[3,4],"alpha":{"y":2,"x":1}}"#)
                .expect("valid metadata json"),
            expected_environment_state: Some(vec![env_state(
                "workspace",
                serde_json::from_str(r#"{"files":{"a":1,"b":2}}"#).expect("valid state json"),
            )]),
            ..left.clone()
        };

        assert_eq!(left.default_session_id(), right.default_session_id());
    }

    #[test]
    fn default_session_id_changes_when_case_content_changes() {
        let mut case = base_case("mutates");
        let before = case.default_session_id();

        case.user_messages.push("follow-up".into());
        let after = case.default_session_id();

        assert_ne!(before, after);
    }
}
1324
#[cfg(test)]
mod budget_policy_tests {
    use super::*;
    use swink_agent::{Cost, PolicyContext, PolicyVerdict, PreTurnPolicy, SessionState, Usage};

    /// Build a [`PolicyContext`] at `turn_index` with the given accumulated
    /// usage and cost; all other fields are empty/zero.
    fn make_ctx<'a>(turn_index: usize, usage: &'a Usage, cost: &'a Cost) -> PolicyContext<'a> {
        // The session state is leaked so the context can borrow it without the
        // helper's signature having to carry a state argument. Each call leaks
        // one `SessionState`, which is tolerable only in test code.
        let session_state = Box::leak(Box::new(SessionState::new()));
        PolicyContext {
            turn_index,
            accumulated_usage: usage,
            accumulated_cost: cost,
            message_count: 0,
            overflow_signal: false,
            new_messages: &[],
            state: session_state,
        }
    }

    #[test]
    fn budget_constraints_to_policies_none_when_unset() {
        let unset = BudgetConstraints {
            max_cost: None,
            max_input: None,
            max_output: None,
            max_turns: None,
        };

        let (budget_policy, max_turns_policy) = unset.to_policies();

        assert!(budget_policy.is_none());
        assert!(max_turns_policy.is_none());
    }

    #[test]
    fn budget_constraints_to_policies_builds_budget_only_for_cost() {
        let cost_only = BudgetConstraints {
            max_cost: Some(1.0),
            max_input: None,
            max_output: None,
            max_turns: None,
        };
        let (budget_policy, max_turns_policy) = cost_only.to_policies();

        // Accumulated cost sits exactly at the configured ceiling.
        let spent = Cost {
            total: 1.0,
            ..Default::default()
        };
        let no_usage = Usage::default();
        let ctx = make_ctx(0, &no_usage, &spent);

        let policy = budget_policy.unwrap();
        let verdict = PreTurnPolicy::evaluate(&policy, &ctx);
        assert!(matches!(verdict, PolicyVerdict::Stop(_)));
        assert!(max_turns_policy.is_none());
    }

    #[test]
    fn budget_constraints_to_policies_builds_budget_only_for_input_output() {
        let token_limits = BudgetConstraints {
            max_cost: None,
            max_input: Some(10),
            max_output: Some(20),
            max_turns: None,
        };
        let (budget_policy, max_turns_policy) = token_limits.to_policies();

        // Accumulated usage sits exactly at both token ceilings.
        let consumed = Usage {
            input: 10,
            output: 20,
            total: 30,
            ..Default::default()
        };
        let no_cost = Cost::default();
        let ctx = make_ctx(0, &consumed, &no_cost);

        let policy = budget_policy.unwrap();
        let verdict = PreTurnPolicy::evaluate(&policy, &ctx);
        assert!(matches!(verdict, PolicyVerdict::Stop(_)));
        assert!(max_turns_policy.is_none());
    }

    #[test]
    fn budget_constraints_to_policies_builds_both_policies_when_needed() {
        let cost_and_turns = BudgetConstraints {
            max_cost: Some(2.0),
            max_input: None,
            max_output: None,
            max_turns: Some(3),
        };
        let (budget_policy, max_turns_policy) = cost_and_turns.to_policies();

        let no_usage = Usage::default();
        let spent = Cost {
            total: 2.0,
            ..Default::default()
        };
        let budget_ctx = make_ctx(0, &no_usage, &spent);
        let no_cost = Cost::default();
        let turn_ctx = make_ctx(3, &no_usage, &no_cost);

        let budget = budget_policy.unwrap();
        let budget_verdict = PreTurnPolicy::evaluate(&budget, &budget_ctx);
        assert!(matches!(budget_verdict, PolicyVerdict::Stop(_)));

        let max_turns = max_turns_policy.unwrap();
        let turn_verdict = PreTurnPolicy::evaluate(&max_turns, &turn_ctx);
        assert!(matches!(turn_verdict, PolicyVerdict::Stop(_)));
    }
}
1437
#[cfg(all(test, feature = "multimodal"))]
mod attachment_url_tests {
    use super::*;

    /// Hosts that [`AllowListedFilter`] lets through.
    const ALLOWED_HOSTS: [&str; 2] = ["assets.example.com", "cdn.example.com"];

    /// Test filter that admits a URL only when its host is in [`ALLOWED_HOSTS`].
    struct AllowListedFilter;

    impl UrlFilter for AllowListedFilter {
        fn allows(&self, url: &Url) -> bool {
            url.host_str()
                .is_some_and(|host| ALLOWED_HOSTS.contains(&host))
        }
    }

    #[test]
    fn resolve_redirect_target_revalidates_each_hop_against_filter() {
        let origin = Url::parse("https://assets.example.com/image.png").unwrap();
        let target = "https://169.254.169.254/latest/meta-data";

        let outcome = resolve_redirect_target(&origin, target, &AllowListedFilter);

        match outcome.expect_err("redirect target should be revalidated") {
            AttachmentError::UrlBlocked { url, reason } => {
                assert_eq!(url, target);
                assert!(reason.contains("blocked by URL filter"));
            }
            other => panic!("expected UrlBlocked, got {other:?}"),
        }
    }

    #[test]
    fn resolve_redirect_target_rejects_http_downgrades() {
        let origin = Url::parse("https://assets.example.com/image.png").unwrap();
        let target = "http://cdn.example.com/image.png";

        let outcome = resolve_redirect_target(&origin, target, &AllowListedFilter);

        match outcome.expect_err("http redirect should be rejected") {
            AttachmentError::UrlBlocked { url, reason } => {
                assert_eq!(url, target);
                assert!(reason.contains("only https URLs are supported"));
            }
            other => panic!("expected UrlBlocked, got {other:?}"),
        }
    }

    #[test]
    fn resolve_redirect_target_allows_relative_https_redirects_when_filter_passes() {
        let origin = Url::parse("https://assets.example.com/path/start.png").unwrap();

        let resolved =
            resolve_redirect_target(&origin, "../final.webp", &AllowListedFilter).unwrap();

        assert_eq!(resolved.as_str(), "https://assets.example.com/final.webp");
    }
}
swink_agent_eval/types.rs

swink_agent_eval/
types.rs