use std::collections::{BTreeMap, HashSet};
use std::sync::Arc;
use std::time::Duration;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use swink_agent::{AssistantMessage, Cost, ModelSpec, StopReason, ToolResultMessage, Usage};
use swink_agent_policies::{BudgetPolicy, MaxTurnsPolicy};
use uuid::Uuid;
use crate::error::EvalError;
use crate::score::{Score, Verdict};
/// Serializable record of a single tool call: its identifier, the tool name,
/// and the call arguments as JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecordedToolCall {
/// Identifier of the call.
pub id: String,
/// Name of the tool.
pub name: String,
/// Call arguments as an arbitrary JSON value.
pub arguments: serde_json::Value,
}
/// Serializable record of one turn: the assistant message, the tool calls and
/// tool results attached to it, and how long the turn took.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TurnRecord {
/// Position of this turn (presumably zero-based — confirm against the code
/// that populates it).
pub turn_index: usize,
/// Assistant message recorded for this turn.
pub assistant_message: AssistantMessage,
/// Tool calls recorded for this turn.
pub tool_calls: Vec<RecordedToolCall>,
/// Tool result messages recorded for this turn.
pub tool_results: Vec<ToolResultMessage>,
/// Duration recorded for this turn.
pub duration: Duration,
}
/// Serializable record of a whole invocation: the per-turn records plus
/// aggregate usage, cost, duration, the final response, stop reason and model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Invocation {
/// Per-turn records.
pub turns: Vec<TurnRecord>,
/// Aggregate token usage.
pub total_usage: Usage,
/// Aggregate cost.
pub total_cost: Cost,
/// Aggregate duration.
pub total_duration: Duration,
/// Final response text, if any.
pub final_response: Option<String>,
/// Reason the invocation stopped.
pub stop_reason: StopReason,
/// Model specification associated with the invocation.
pub model: ModelSpec,
}
/// One expected tool call, as listed in `EvalCase::expected_trajectory`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExpectedToolCall {
/// Name of the expected tool.
pub tool_name: String,
/// Optional expected arguments. Omitted from serialized output when `None`
/// and defaults to `None` when absent on input. How a `None` is treated when
/// matching is decided by the consuming evaluator, not by this file.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub arguments: Option<serde_json::Value>,
}
/// Criteria describing the expected response of an evaluation case.
///
/// The serde representation is internally tagged: a `mode` field carries the
/// snake_case variant name. `Debug` is not derived because the `Custom`
/// payload is a `dyn Fn`, which has no `Debug` implementation.
#[derive(Clone, Serialize, Deserialize)]
#[serde(tag = "mode", rename_all = "snake_case")]
pub enum ResponseCriteria {
/// Carries the `expected` text. The comparison itself lives in the consuming
/// evaluator (presumably exact equality — confirm there).
Exact { expected: String },
/// Carries a `substring` (presumably checked for containment — confirm in the
/// consuming evaluator).
Contains { substring: String },
/// Carries a regex `pattern` as a string; it is not compiled or validated in
/// this file.
Regex { pattern: String },
/// In-process callback mapping response text to a [`Score`]. The variant is
/// marked `#[serde(skip)]`, so serde never serializes or deserializes it.
#[serde(skip)]
Custom(#[serde(skip)] Arc<dyn Fn(&str) -> Score + Send + Sync>),
}
impl std::fmt::Debug for ResponseCriteria {
    /// Formats the criteria for diagnostics.
    ///
    /// The three data-carrying variants print as a struct with their single
    /// string field; `Custom` prints as `Custom("<fn>")` because the wrapped
    /// closure cannot be formatted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the variant label, field label and payload once, then emit a
        // single `debug_struct` call shared by all string-carrying variants.
        let (variant, field, text) = match self {
            Self::Custom(_) => return f.debug_tuple("Custom").field(&"<fn>").finish(),
            Self::Exact { expected } => ("Exact", "expected", expected),
            Self::Contains { substring } => ("Contains", "substring", substring),
            Self::Regex { pattern } => ("Regex", "pattern", pattern),
        };
        f.debug_struct(variant).field(field, text).finish()
    }
}
/// A named snapshot of environment state expressed as JSON.
///
/// `validate_eval_case` rejects a case whose `expected_environment_state`
/// contains two entries with the same `name`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentState {
/// Name identifying this piece of state.
pub name: String,
/// The state itself as an arbitrary JSON value.
pub state: serde_json::Value,
}
/// Description of an intended tool use, with an optional tool name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolIntent {
/// Free-form intent text.
pub intent: String,
/// Optional tool name. Omitted from serialized output when `None` and
/// defaults to `None` when absent on input.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub tool_name: Option<String>,
}
/// Reference-counted `Send + Sync` callback that takes an [`Invocation`] and
/// returns a list of [`EnvironmentState`] values.
pub type StateCapture = Arc<dyn Fn(&Invocation) -> Vec<EnvironmentState> + Send + Sync>;
/// UUID namespace passed to `Uuid::new_v5` by `EvalCase::default_session_id`.
///
/// The bytes equal `Uuid::new_v5(&Uuid::NAMESPACE_OID, b"swink-agent-eval.case")`;
/// a unit test in this file pins that equality. Changing them changes every
/// derived session id.
pub const CASE_NAMESPACE: Uuid = Uuid::from_bytes([
37, 101, 28, 203, 118, 231, 87, 244, 147, 248, 152, 59, 222, 174, 80, 226,
]);
/// Serialize-only, comparable (`Eq`) projection of an [`EvalCase`].
///
/// Built by `From<&EvalCase>`. Every `EvalCase` field is represented except
/// `state_capture`. JSON-valued fields are converted to
/// [`CanonicalJsonValue`].
///
/// NOTE(review): `EvalCase::default_session_id` hashes the bincode encoding of
/// this struct, so adding, removing or (likely) reordering fields is expected
/// to change derived session ids.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct CaseFingerprint {
/// Copy of `EvalCase::id`.
pub id: String,
/// Copy of `EvalCase::name`.
pub name: String,
/// Copy of `EvalCase::description`.
pub description: Option<String>,
/// Copy of `EvalCase::system_prompt`.
pub system_prompt: String,
/// Copy of `EvalCase::user_messages`.
pub user_messages: Vec<String>,
/// Fingerprints of the expected tool calls, in their original order.
pub expected_trajectory: Option<Vec<ExpectedToolCallFingerprint>>,
/// Fingerprint of the expected response criteria.
pub expected_response: Option<ResponseCriteriaFingerprint>,
/// Fingerprint of the budget constraints.
pub budget: Option<BudgetConstraintsFingerprint>,
/// Copy of `EvalCase::evaluators`.
pub evaluators: Vec<String>,
/// Canonical form of `EvalCase::metadata`.
pub metadata: CanonicalJsonValue,
/// Fingerprints of the expected environment states, in their original order.
pub expected_environment_state: Option<Vec<EnvironmentStateFingerprint>>,
/// Fingerprint of the expected tool intent.
pub expected_tool_intent: Option<ToolIntentFingerprint>,
/// Copy of `EvalCase::semantic_tool_selection`.
pub semantic_tool_selection: bool,
}
/// Fingerprint form of [`ExpectedToolCall`], with arguments canonicalized.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct ExpectedToolCallFingerprint {
/// Copy of the expected tool name.
pub tool_name: String,
/// Canonical form of the expected arguments, if any.
pub arguments: Option<CanonicalJsonValue>,
}
/// Fingerprint form of [`ResponseCriteria`].
///
/// The string-carrying variants mirror their source variants. `Custom` is a
/// unit variant, so all custom closures share the same fingerprint.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum ResponseCriteriaFingerprint {
/// Mirrors `ResponseCriteria::Exact`.
Exact { expected: String },
/// Mirrors `ResponseCriteria::Contains`.
Contains { substring: String },
/// Mirrors `ResponseCriteria::Regex`.
Regex { pattern: String },
/// Stands in for `ResponseCriteria::Custom`; the closure is not captured.
Custom,
}
/// Fingerprint form of [`BudgetConstraints`].
///
/// The floating-point cost limit is stored as its raw bit pattern, which lets
/// this type derive `Eq`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct BudgetConstraintsFingerprint {
/// `f64::to_bits` of `BudgetConstraints::max_cost`.
pub cost_limit_bits: Option<u64>,
/// Copy of `BudgetConstraints::max_input`.
pub input_limit: Option<u64>,
/// Copy of `BudgetConstraints::max_output`.
pub output_limit: Option<u64>,
/// Copy of `BudgetConstraints::max_turns`.
pub turn_limit: Option<usize>,
}
/// Fingerprint form of [`EnvironmentState`], with the JSON state canonicalized.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct EnvironmentStateFingerprint {
/// Copy of the state name.
pub name: String,
/// Canonical form of the state value.
pub state: CanonicalJsonValue,
}
/// Fingerprint form of [`ToolIntent`]; a field-for-field copy.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct ToolIntentFingerprint {
/// Copy of the intent text.
pub intent: String,
/// Copy of the optional tool name.
pub tool_name: Option<String>,
}
/// Comparable (`Eq`), serialize-only mirror of `serde_json::Value`.
///
/// Objects are held in a `BTreeMap`, so their entries are ordered by key
/// regardless of the order in the source JSON. Numbers are held as strings.
/// The serde representation is adjacently tagged: `kind` carries the
/// snake_case variant name and `value` carries the payload.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(tag = "kind", content = "value", rename_all = "snake_case")]
pub enum CanonicalJsonValue {
/// JSON `null`.
Null,
/// JSON boolean.
Bool(bool),
/// JSON number, stored as the `to_string()` rendering of the source number.
Number(String),
/// JSON string.
String(String),
/// JSON array, element order preserved.
Array(Vec<Self>),
/// JSON object, keyed and ordered by member name.
Object(BTreeMap<String, Self>),
}
impl From<&serde_json::Value> for CanonicalJsonValue {
fn from(value: &serde_json::Value) -> Self {
match value {
serde_json::Value::Null => Self::Null,
serde_json::Value::Bool(value) => Self::Bool(*value),
serde_json::Value::Number(value) => Self::Number(value.to_string()),
serde_json::Value::String(value) => Self::String(value.clone()),
serde_json::Value::Array(values) => {
Self::Array(values.iter().map(Self::from).collect())
}
serde_json::Value::Object(values) => Self::Object(
values
.iter()
.map(|(key, value)| (key.clone(), Self::from(value)))
.collect(),
),
}
}
}
impl From<&ExpectedToolCall> for ExpectedToolCallFingerprint {
    /// Copies the tool name and canonicalizes the optional arguments.
    fn from(call: &ExpectedToolCall) -> Self {
        let tool_name = call.tool_name.clone();
        let arguments = match &call.arguments {
            Some(raw) => Some(CanonicalJsonValue::from(raw)),
            None => None,
        };
        Self {
            tool_name,
            arguments,
        }
    }
}
impl From<&ResponseCriteria> for ResponseCriteriaFingerprint {
    /// Mirrors the criteria variant, cloning its string payload.
    ///
    /// `Custom` maps to the payload-free `Custom` fingerprint because the
    /// closure has no content that could be captured.
    fn from(criteria: &ResponseCriteria) -> Self {
        match criteria {
            ResponseCriteria::Custom(_) => Self::Custom,
            ResponseCriteria::Exact { expected } => {
                let expected = expected.to_owned();
                Self::Exact { expected }
            }
            ResponseCriteria::Contains { substring } => {
                let substring = substring.to_owned();
                Self::Contains { substring }
            }
            ResponseCriteria::Regex { pattern } => {
                let pattern = pattern.to_owned();
                Self::Regex { pattern }
            }
        }
    }
}
impl From<&BudgetConstraints> for BudgetConstraintsFingerprint {
fn from(budget: &BudgetConstraints) -> Self {
Self {
cost_limit_bits: budget.max_cost.map(f64::to_bits),
input_limit: budget.max_input,
output_limit: budget.max_output,
turn_limit: budget.max_turns,
}
}
}
impl From<&EnvironmentState> for EnvironmentStateFingerprint {
fn from(state: &EnvironmentState) -> Self {
Self {
name: state.name.clone(),
state: CanonicalJsonValue::from(&state.state),
}
}
}
impl From<&ToolIntent> for ToolIntentFingerprint {
fn from(intent: &ToolIntent) -> Self {
Self {
intent: intent.intent.clone(),
tool_name: intent.tool_name.clone(),
}
}
}
/// Optional limits for an evaluation case.
///
/// Every field is optional, is omitted from serialized output when `None`,
/// and defaults to `None` when absent on input. `to_policies` converts the
/// limits into policy objects.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BudgetConstraints {
/// Cost limit, forwarded to `BudgetPolicy::max_cost` (currency/units are
/// defined by that policy, not here).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_cost: Option<f64>,
/// Input limit, forwarded to `BudgetPolicy::max_input`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_input: Option<u64>,
/// Output limit, forwarded to `BudgetPolicy::max_output`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_output: Option<u64>,
/// Turn limit, forwarded to `MaxTurnsPolicy::new`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_turns: Option<usize>,
}
impl BudgetConstraints {
    /// Converts the constraints into policy objects.
    ///
    /// Returns a pair:
    /// * a `BudgetPolicy` when at least one of `max_cost`, `max_input` or
    ///   `max_output` is set (only the limits that are set get applied),
    ///   otherwise `None`;
    /// * a `MaxTurnsPolicy` when `max_turns` is set, otherwise `None`.
    #[must_use]
    pub fn to_policies(&self) -> (Option<BudgetPolicy>, Option<MaxTurnsPolicy>) {
        let any_budget_limit =
            self.max_cost.is_some() || self.max_input.is_some() || self.max_output.is_some();

        let budget = any_budget_limit.then(|| {
            let mut builder = BudgetPolicy::new();
            if let Some(limit) = self.max_cost {
                builder = builder.max_cost(limit);
            }
            if let Some(limit) = self.max_input {
                builder = builder.max_input(limit);
            }
            if let Some(limit) = self.max_output {
                builder = builder.max_output(limit);
            }
            builder
        });

        let turns = match self.max_turns {
            Some(limit) => Some(MaxTurnsPolicy::new(limit)),
            None => None,
        };

        (budget, turns)
    }
}
/// Definition of a single evaluation case.
///
/// Optional and empty fields are omitted from serialized output and fall back
/// to their defaults when absent on input. `state_capture` is never
/// serialized or deserialized. `Debug` is implemented by hand because
/// `state_capture` holds a closure.
#[derive(Clone, Serialize, Deserialize)]
pub struct EvalCase {
/// Identifier of the case; included in validation error messages.
pub id: String,
/// Name of the case.
pub name: String,
/// Optional description.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// System prompt text.
pub system_prompt: String,
/// User messages, in order.
pub user_messages: Vec<String>,
/// Optional list of expected tool calls.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expected_trajectory: Option<Vec<ExpectedToolCall>>,
/// Optional criteria for the expected response.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expected_response: Option<ResponseCriteria>,
/// Optional budget constraints.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub budget: Option<BudgetConstraints>,
/// Evaluator names; omitted from serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub evaluators: Vec<String>,
/// Arbitrary JSON metadata; omitted from serialized output when null.
#[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
pub metadata: serde_json::Value,
/// Optional expected environment states. `validate_eval_case` requires the
/// names within this list to be unique.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expected_environment_state: Option<Vec<EnvironmentState>>,
/// Optional expected tool intent.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expected_tool_intent: Option<ToolIntent>,
/// Flag omitted from serialized output when `false`; its effect is defined
/// by consumers outside this file.
#[serde(default, skip_serializing_if = "is_false")]
pub semantic_tool_selection: bool,
/// Optional in-process state-capture callback; skipped by serde, so it is
/// `None` after deserialization.
#[serde(skip)]
pub state_capture: Option<StateCapture>,
}
impl std::fmt::Debug for EvalCase {
    /// Formats every field of the case. `state_capture` is rendered as
    /// `Some("<fn>")` or `None` because the closure itself cannot be printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Placeholder standing in for the unprintable closure.
        let capture_marker = self.state_capture.as_ref().map(|_| "<fn>");

        let mut out = f.debug_struct("EvalCase");
        out.field("id", &self.id);
        out.field("name", &self.name);
        out.field("description", &self.description);
        out.field("system_prompt", &self.system_prompt);
        out.field("user_messages", &self.user_messages);
        out.field("expected_trajectory", &self.expected_trajectory);
        out.field("expected_response", &self.expected_response);
        out.field("budget", &self.budget);
        out.field("evaluators", &self.evaluators);
        out.field("metadata", &self.metadata);
        out.field(
            "expected_environment_state",
            &self.expected_environment_state,
        );
        out.field("expected_tool_intent", &self.expected_tool_intent);
        out.field("semantic_tool_selection", &self.semantic_tool_selection);
        out.field("state_capture", &capture_marker);
        out.finish()
    }
}
impl From<&EvalCase> for CaseFingerprint {
    /// Projects a case onto its fingerprint.
    ///
    /// Plain fields are cloned, JSON-valued fields are canonicalized, and
    /// `state_capture` is left out.
    fn from(case: &EvalCase) -> Self {
        let expected_trajectory = case.expected_trajectory.as_deref().map(|calls| {
            calls
                .iter()
                .map(ExpectedToolCallFingerprint::from)
                .collect::<Vec<_>>()
        });
        let expected_environment_state =
            case.expected_environment_state.as_deref().map(|states| {
                states
                    .iter()
                    .map(EnvironmentStateFingerprint::from)
                    .collect::<Vec<_>>()
            });
        let expected_response = case
            .expected_response
            .as_ref()
            .map(ResponseCriteriaFingerprint::from);
        let expected_tool_intent = case
            .expected_tool_intent
            .as_ref()
            .map(ToolIntentFingerprint::from);
        let budget = case.budget.as_ref().map(BudgetConstraintsFingerprint::from);
        let metadata = CanonicalJsonValue::from(&case.metadata);

        Self {
            id: case.id.clone(),
            name: case.name.clone(),
            description: case.description.clone(),
            system_prompt: case.system_prompt.clone(),
            user_messages: case.user_messages.clone(),
            expected_trajectory,
            expected_response,
            budget,
            evaluators: case.evaluators.clone(),
            metadata,
            expected_environment_state,
            expected_tool_intent,
            semantic_tool_selection: case.semantic_tool_selection,
        }
    }
}
impl EvalCase {
    /// Returns the content fingerprint of this case (see [`CaseFingerprint`]).
    #[must_use]
    pub fn content_fingerprint(&self) -> CaseFingerprint {
        self.into()
    }

    /// Derives a deterministic session id from the case content.
    ///
    /// The fingerprint is bincode-encoded, hashed with SHA-256, and the digest
    /// is used as the name for a version-5 UUID under [`CASE_NAMESPACE`].
    ///
    /// # Panics
    ///
    /// Panics if the fingerprint fails to serialize.
    #[must_use]
    pub fn default_session_id(&self) -> Uuid {
        let fingerprint = self.content_fingerprint();
        let encoded = bincode::serialize(&fingerprint).expect("case fingerprint serializes");
        let hash = Sha256::digest(encoded);
        Uuid::new_v5(&CASE_NAMESPACE, hash.as_slice())
    }
}
/// Returns `true` when the flag is `false`.
///
/// Used as a serde `skip_serializing_if` predicate, which is called with a
/// reference to the field — hence the `&bool` parameter and the lint allowance.
#[allow(clippy::trivially_copy_pass_by_ref)]
const fn is_false(b: &bool) -> bool {
    matches!(*b, false)
}
/// A named collection of evaluation cases.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSet {
/// Identifier of the set.
pub id: String,
/// Name of the set.
pub name: String,
/// Optional description; omitted from serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// The cases in this set.
pub cases: Vec<EvalCase>,
}
/// Result reported by one evaluator: its name, a score and optional details.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalMetricResult {
/// Name of the evaluator.
pub evaluator_name: String,
/// Score reported by the evaluator.
pub score: Score,
/// Optional detail text; omitted from serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub details: Option<String>,
}
/// Result for one case: the recorded invocation, the per-evaluator metric
/// results and a verdict.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalCaseResult {
/// Identifier of the case this result belongs to.
pub case_id: String,
/// Recorded invocation.
pub invocation: Invocation,
/// Metric results, one entry per evaluator result.
pub metric_results: Vec<EvalMetricResult>,
/// Verdict for the case.
pub verdict: Verdict,
}
/// Result for a whole evaluation set.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSetResult {
/// Identifier of the evaluated set.
pub eval_set_id: String,
/// Per-case results.
pub case_results: Vec<EvalCaseResult>,
/// Aggregate summary.
pub summary: EvalSummary,
/// Timestamp of the result. NOTE(review): epoch and unit (seconds vs.
/// milliseconds) are not established in this file — confirm with the producer.
pub timestamp: u64,
}
/// Aggregate counts and totals for an evaluation set result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSummary {
/// Number of cases.
pub total_cases: usize,
/// Number of cases counted as passed.
pub passed: usize,
/// Number of cases counted as failed.
pub failed: usize,
/// Total cost.
pub total_cost: Cost,
/// Total usage.
pub total_usage: Usage,
/// Total duration.
pub total_duration: Duration,
}
pub fn validate_eval_case(case: &EvalCase) -> Result<(), EvalError> {
if let Some(states) = &case.expected_environment_state {
let mut seen: HashSet<&str> = HashSet::with_capacity(states.len());
for state in states {
if !seen.insert(state.name.as_str()) {
return Err(EvalError::invalid_case(format!(
"case `{case_id}`: duplicate expected_environment_state name `{name}`",
case_id = case.id,
name = state.name,
)));
}
}
}
Ok(())
}
pub fn validate_eval_set(set: &EvalSet) -> Result<(), EvalError> {
for case in &set.cases {
validate_eval_case(case)?;
}
Ok(())
}
// Unit tests for case validation, serde round-trips and session-id derivation.
#[cfg(test)]
mod validation_tests {
use super::*;
// Builds a minimal case with the given id (also used as the name), one user
// message and every optional field unset.
fn base_case(id: &str) -> EvalCase {
EvalCase {
id: id.to_string(),
name: id.to_string(),
description: None,
system_prompt: String::new(),
user_messages: vec!["hi".to_string()],
expected_trajectory: None,
expected_response: None,
budget: None,
evaluators: vec![],
metadata: serde_json::Value::Null,
expected_environment_state: None,
expected_tool_intent: None,
semantic_tool_selection: false,
state_capture: None,
}
}
// Distinct state names pass validation.
#[test]
fn validate_accepts_unique_environment_state_names() {
let mut case = base_case("c1");
case.expected_environment_state = Some(vec![
EnvironmentState {
name: "alpha".into(),
state: serde_json::json!({"v": 1}),
},
EnvironmentState {
name: "beta".into(),
state: serde_json::json!({"v": 2}),
},
]);
assert!(validate_eval_case(&case).is_ok());
}
// A repeated state name is rejected, and the error mentions both the repeated
// name and the case id.
#[test]
fn validate_rejects_duplicate_environment_state_names() {
let mut case = base_case("dup");
case.expected_environment_state = Some(vec![
EnvironmentState {
name: "alpha".into(),
state: serde_json::json!({"v": 1}),
},
EnvironmentState {
name: "alpha".into(),
state: serde_json::json!({"v": 2}),
},
]);
let err = validate_eval_case(&case).expect_err("duplicate should be rejected");
match err {
EvalError::InvalidCase { reason } => {
assert!(reason.contains("alpha"), "reason: {reason}");
assert!(reason.contains("dup"), "reason mentions case id: {reason}");
}
other => panic!("expected InvalidCase, got {other:?}"),
}
}
// A case without expected environment state is valid.
#[test]
fn validate_none_environment_state_is_ok() {
let case = base_case("none");
assert!(validate_eval_case(&case).is_ok());
}
// Set-level validation surfaces an error from an invalid member case.
#[test]
fn validate_eval_set_propagates_case_errors() {
let mut case = base_case("bad");
case.expected_environment_state = Some(vec![
EnvironmentState {
name: "x".into(),
state: serde_json::Value::Null,
},
EnvironmentState {
name: "x".into(),
state: serde_json::Value::Null,
},
]);
let set = EvalSet {
id: "set".into(),
name: "Set".into(),
description: None,
cases: vec![case],
};
assert!(validate_eval_set(&set).is_err());
}
// `EnvironmentState` survives a JSON round trip unchanged.
#[test]
fn environment_state_serde_round_trip() {
let state = EnvironmentState {
name: "db".into(),
state: serde_json::json!({"rows": 3, "schema": "public"}),
};
let json = serde_json::to_string(&state).unwrap();
let back: EnvironmentState = serde_json::from_str(&json).unwrap();
assert_eq!(back.name, state.name);
assert_eq!(back.state, state.state);
}
// The environment-state, tool-intent and semantic-selection fields survive a
// JSON round trip; `state_capture` comes back as `None`.
#[test]
fn eval_case_serde_round_trip_with_v2_fields() {
let mut case = base_case("v2");
case.expected_environment_state = Some(vec![EnvironmentState {
name: "alpha".into(),
state: serde_json::json!({"n": 1}),
}]);
case.expected_tool_intent = Some(ToolIntent {
intent: "read config".into(),
tool_name: Some("read_file".into()),
});
case.semantic_tool_selection = true;
// Despite the variable name, the payload here is JSON.
let yaml_like = serde_json::to_string(&case).unwrap();
let back: EvalCase = serde_json::from_str(&yaml_like).unwrap();
assert_eq!(back.expected_environment_state.as_ref().unwrap().len(), 1);
assert_eq!(
back.expected_tool_intent.as_ref().unwrap().intent,
"read config"
);
assert!(back.semantic_tool_selection);
assert!(back.state_capture.is_none());
}
// Pins the hard-coded namespace bytes to their v5 derivation.
#[test]
fn case_namespace_matches_oid_derived_value() {
assert_eq!(
CASE_NAMESPACE,
Uuid::new_v5(&Uuid::NAMESPACE_OID, b"swink-agent-eval.case")
);
}
// Calling `default_session_id` twice on the same case yields the same id.
#[test]
fn default_session_id_is_deterministic_for_same_case() {
let mut case = base_case("stable");
case.metadata = serde_json::json!({
"beta": [2, {"y": true, "x": false}],
"alpha": {"nested_b": 2, "nested_a": 1}
});
case.expected_response = Some(ResponseCriteria::Contains {
substring: "ok".into(),
});
case.expected_trajectory = Some(vec![ExpectedToolCall {
tool_name: "read_file".into(),
arguments: Some(serde_json::json!({"path": "./project-alpha/config.toml"})),
}]);
let first = case.default_session_id();
let second = case.default_session_id();
assert_eq!(first, second);
}
// Two cases whose JSON fields differ only in object key order get the same id.
#[test]
fn default_session_id_is_stable_across_json_key_order() {
let mut left = base_case("ordered");
left.metadata = serde_json::json!({
"alpha": {"x": 1, "y": 2},
"beta": [3, 4]
});
left.expected_environment_state = Some(vec![EnvironmentState {
name: "workspace".into(),
state: serde_json::json!({"files": {"b": 2, "a": 1}}),
}]);
let mut right = left.clone();
// Same content as `left`, written with the object keys in a different order.
right.metadata = serde_json::from_str(r#"{"beta":[3,4],"alpha":{"y":2,"x":1}}"#)
.expect("valid metadata json");
right.expected_environment_state = Some(vec![EnvironmentState {
name: "workspace".into(),
state: serde_json::from_str(r#"{"files":{"a":1,"b":2}}"#).expect("valid state json"),
}]);
assert_eq!(left.default_session_id(), right.default_session_id());
}
// Adding a user message changes the derived id.
#[test]
fn default_session_id_changes_when_case_content_changes() {
let mut case = base_case("mutates");
let original = case.default_session_id();
case.user_messages.push("follow-up".into());
assert_ne!(original, case.default_session_id());
}
}
// Unit tests for `BudgetConstraints::to_policies`, checked through the
// `PreTurnPolicy::evaluate` verdicts of the returned policies.
#[cfg(test)]
mod budget_policy_tests {
use super::*;
use swink_agent::{Cost, PolicyContext, PolicyVerdict, PreTurnPolicy, SessionState, Usage};
// Builds a policy context around the given turn index, usage and cost.
fn make_ctx<'a>(turn_index: usize, usage: &'a Usage, cost: &'a Cost) -> PolicyContext<'a> {
// `Box::leak` deliberately leaks a fresh `SessionState` so the context can
// hold a reference to it; the leak is confined to test runs.
let state = Box::leak(Box::new(SessionState::new()));
PolicyContext {
turn_index,
accumulated_usage: usage,
accumulated_cost: cost,
message_count: 0,
overflow_signal: false,
new_messages: &[],
state,
}
}
// With no limits set, neither policy is produced.
#[test]
fn budget_constraints_to_policies_none_when_unset() {
let constraints = BudgetConstraints {
max_cost: None,
max_input: None,
max_output: None,
max_turns: None,
};
let (budget_policy, max_turns_policy) = constraints.to_policies();
assert!(budget_policy.is_none());
assert!(max_turns_policy.is_none());
}
// A cost limit alone yields only a budget policy; the test expects `Stop` once
// the accumulated cost equals the limit.
#[test]
fn budget_constraints_to_policies_builds_budget_only_for_cost() {
let constraints = BudgetConstraints {
max_cost: Some(1.0),
max_input: None,
max_output: None,
max_turns: None,
};
let (budget_policy, max_turns_policy) = constraints.to_policies();
let usage = Usage::default();
let cost = Cost {
total: 1.0,
..Default::default()
};
let ctx = make_ctx(0, &usage, &cost);
assert!(matches!(
PreTurnPolicy::evaluate(&budget_policy.unwrap(), &ctx),
PolicyVerdict::Stop(_)
));
assert!(max_turns_policy.is_none());
}
// Input/output limits alone yield only a budget policy; the test expects
// `Stop` once the accumulated usage equals the limits.
#[test]
fn budget_constraints_to_policies_builds_budget_only_for_input_output() {
let constraints = BudgetConstraints {
max_cost: None,
max_input: Some(10),
max_output: Some(20),
max_turns: None,
};
let (budget_policy, max_turns_policy) = constraints.to_policies();
let usage = Usage {
input: 10,
output: 20,
total: 30,
..Default::default()
};
let cost = Cost::default();
let ctx = make_ctx(0, &usage, &cost);
assert!(matches!(
PreTurnPolicy::evaluate(&budget_policy.unwrap(), &ctx),
PolicyVerdict::Stop(_)
));
assert!(max_turns_policy.is_none());
}
// A cost limit plus a turn limit yields both policies; each is expected to
// return `Stop` when its own limit is reached.
#[test]
fn budget_constraints_to_policies_builds_both_policies_when_needed() {
let constraints = BudgetConstraints {
max_cost: Some(2.0),
max_input: None,
max_output: None,
max_turns: Some(3),
};
let (budget_policy, max_turns_policy) = constraints.to_policies();
let usage = Usage::default();
let cost = Cost {
total: 2.0,
..Default::default()
};
let budget_ctx = make_ctx(0, &usage, &cost);
// Separate zero-cost context so only the turn index can trigger the turn
// policy.
let turn_cost = Cost::default();
let turn_ctx = make_ctx(3, &usage, &turn_cost);
assert!(matches!(
PreTurnPolicy::evaluate(&budget_policy.unwrap(), &budget_ctx),
PolicyVerdict::Stop(_)
));
assert!(matches!(
PreTurnPolicy::evaluate(&max_turns_policy.unwrap(), &turn_ctx),
PolicyVerdict::Stop(_)
));
}
}