mod common;
use std::sync::Arc;
use std::time::{Duration, Instant};
use swink_agent::{AssistantMessage, Cost, ModelSpec, StopReason, Usage};
use swink_agent_eval::{
EvalCase, Evaluator, Invocation, JudgeClient, JudgeError, JudgeVerdict, MockJudge,
RecordedToolCall, SemanticToolParameterEvaluator, SlowMockJudge, ToolIntent, TurnRecord,
Verdict,
};
use common::mock_invocation;
/// Builds a minimal [`EvalCase`] that declares an expected tool intent.
///
/// `id` is used for both the case id and its name. `intent` becomes the
/// intent text, and `tool_name` (when present) is copied into the intent's
/// optional tool-name field. Every remaining field is left empty, `None`,
/// or `false`.
fn case_with_intent(id: &str, intent: &str, tool_name: Option<&str>) -> EvalCase {
    let expected_tool_intent = ToolIntent {
        intent: String::from(intent),
        tool_name: tool_name.map(str::to_owned),
    };

    EvalCase {
        id: id.to_owned(),
        name: id.to_owned(),
        description: None,
        system_prompt: String::from("be helpful"),
        user_messages: vec![String::from("do the thing")],
        expected_trajectory: None,
        expected_response: None,
        expected_assertion: None,
        expected_interactions: None,
        few_shot_examples: Vec::new(),
        budget: None,
        evaluators: Vec::new(),
        metadata: serde_json::Value::Null,
        attachments: Vec::new(),
        session_id: None,
        expected_environment_state: None,
        expected_tool_intent: Some(expected_tool_intent),
        semantic_tool_selection: false,
        state_capture: None,
    }
}
/// Builds a minimal [`EvalCase`] with `expected_tool_intent` set to `None`.
///
/// `id` is used for both the case id and its name; all other fields are
/// left empty, `None`, or `false`, apart from the fixed system prompt and
/// single user message.
fn case_without_intent(id: &str) -> EvalCase {
    let case_id = String::from(id);
    let case_name = String::from(id);

    EvalCase {
        id: case_id,
        name: case_name,
        description: None,
        system_prompt: String::from("be helpful"),
        user_messages: vec![String::from("do the thing")],
        expected_trajectory: None,
        expected_response: None,
        expected_assertion: None,
        expected_interactions: None,
        few_shot_examples: Vec::new(),
        budget: None,
        evaluators: Vec::new(),
        metadata: serde_json::Value::Null,
        attachments: Vec::new(),
        session_id: None,
        expected_environment_state: None,
        expected_tool_intent: None,
        semantic_tool_selection: false,
        state_capture: None,
    }
}
/// Builds a single-turn [`Invocation`] containing the given tool calls.
///
/// Each `(name, arguments)` pair becomes a [`RecordedToolCall`] whose id is
/// `"id"` followed by the pair's position in `calls`. The turn has an empty
/// assistant message body, no tool results, and a 1 ms duration; usage and
/// cost are the types' defaults.
fn invocation_with(calls: &[(&str, serde_json::Value)]) -> Invocation {
    let mut recorded: Vec<RecordedToolCall> = Vec::with_capacity(calls.len());
    for (i, (tool, arguments)) in calls.iter().enumerate() {
        recorded.push(RecordedToolCall {
            id: format!("id{i}"),
            name: String::from(*tool),
            arguments: arguments.clone(),
        });
    }

    // Turn duration and total duration share the same fixed value.
    let one_milli = Duration::from_millis(1);

    let assistant_message = AssistantMessage {
        content: Vec::new(),
        provider: "test".into(),
        model_id: "test-model".into(),
        usage: Usage::default(),
        cost: Cost::default(),
        stop_reason: StopReason::Stop,
        error_message: None,
        error_kind: None,
        timestamp: 0,
        cache_hint: None,
    };

    let only_turn = TurnRecord {
        turn_index: 0,
        assistant_message,
        tool_calls: recorded,
        tool_results: Vec::new(),
        duration: one_milli,
    };

    Invocation {
        turns: vec![only_turn],
        total_usage: Usage::default(),
        total_cost: Cost::default(),
        total_duration: one_milli,
        final_response: None,
        stop_reason: StopReason::Stop,
        model: ModelSpec::new("test", "test-model"),
    }
}
/// A passing judge verdict must surface as a `Pass` metric named
/// `semantic_tool_parameter`, with the judge's reason text in the details.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn intent_satisfied_by_non_literal_arguments() {
    // Scenario: one `read_file` call whose path is not a literal copy of the intent.
    let case = case_with_intent("as-6-1", "read config for project-alpha", None);
    let call_args = serde_json::json!({"path": "./project-alpha/config.toml"});
    let invocation = invocation_with(&[("read_file", call_args)]);

    // The mock judge is scripted with a single passing verdict.
    let scripted = vec![JudgeVerdict {
        score: 1.0,
        pass: true,
        reason: Some("arguments satisfy the intent".into()),
        label: Some("equivalent".into()),
    }];
    let judge: Arc<dyn JudgeClient> = Arc::new(MockJudge::with_verdicts(scripted));
    let evaluator = SemanticToolParameterEvaluator::new(Arc::clone(&judge));

    let outcome = evaluator.evaluate(&case, &invocation);
    let result = outcome.expect("evaluator must return a metric when intent is set");

    assert_eq!(result.evaluator_name, "semantic_tool_parameter");
    assert_eq!(result.score.verdict(), Verdict::Pass);

    let details = result.details.unwrap();
    assert!(
        details.contains("satisfy"),
        "expected judge reason in details, got: {details}"
    );
}
/// When the case carries no expected tool intent, `evaluate` must return `None`.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn no_expected_tool_intent_returns_none() {
    let case = case_without_intent("as-6-2");
    let invocation = mock_invocation(&["read_file"], None, 0.0, 0);

    let judge: Arc<dyn JudgeClient> = Arc::new(MockJudge::always_pass());
    let evaluator = SemanticToolParameterEvaluator::new(judge);

    let outcome = evaluator.evaluate(&case, &invocation);
    assert!(outcome.is_none());
}
/// A judge that always errors with `JudgeError::Timeout` must produce a
/// `Fail` metric whose details mention "Timeout", and the call must return
/// within two seconds.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn inner_judge_timeout_maps_to_fail() {
    let failing_judge = MockJudge::always_err(JudgeError::Timeout);
    let judge: Arc<dyn JudgeClient> = Arc::new(failing_judge);
    let evaluator = SemanticToolParameterEvaluator::new(judge);

    let case = case_with_intent("as-6-3", "read the config", None);
    let invocation = mock_invocation(&["read_file"], None, 0.0, 0);

    // Upper bound on how long the evaluation may take before it counts as a hang.
    let budget = Duration::from_secs(2);

    let start = Instant::now();
    let result = evaluator.evaluate(&case, &invocation).unwrap();
    let elapsed = start.elapsed();

    assert!(
        elapsed < budget,
        "inner timeout should not hang — elapsed: {elapsed:?}"
    );
    assert_eq!(result.score.verdict(), Verdict::Fail);

    let details = result.details.unwrap();
    assert!(details.contains("Timeout"), "details: {details}");
}
/// With a `SlowMockJudge` built from a 10 s duration and an evaluator
/// timeout of 50 ms, evaluation must return within two seconds as a `Fail`
/// metric whose details contain "exceeded".
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn outer_timeout_maps_to_fail_and_does_not_hang() {
    // NOTE(review): presumably the judge delays its answer by this long — confirm.
    let judge_delay = Duration::from_secs(10);
    let evaluator_timeout = Duration::from_millis(50);
    let budget = Duration::from_secs(2);

    let judge: Arc<dyn JudgeClient> = Arc::new(SlowMockJudge::new(judge_delay));
    let evaluator = SemanticToolParameterEvaluator::new(judge).with_timeout(evaluator_timeout);

    let case = case_with_intent("as-6-outer", "read the config", None);
    let invocation = mock_invocation(&["read_file"], None, 0.0, 0);

    let start = Instant::now();
    let result = evaluator.evaluate(&case, &invocation).unwrap();
    let elapsed = start.elapsed();

    assert!(
        elapsed < budget,
        "outer timeout did not fire promptly — elapsed: {elapsed:?}"
    );
    assert_eq!(result.score.verdict(), Verdict::Fail);

    let details = result.details.unwrap();
    assert!(
        details.contains("exceeded"),
        "expected timeout context in details, got: {details}"
    );
}
/// When the intent names a tool (`read_file`) that the invocation never
/// called, `evaluate` must return `None` and the judge's call count must
/// stay at zero.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn filter_with_no_match_returns_none() {
    // Keep a concretely-typed handle so the call count can be read afterwards.
    let judge_inner = Arc::new(MockJudge::always_pass());
    let judge = Arc::clone(&judge_inner) as Arc<dyn JudgeClient>;
    let evaluator = SemanticToolParameterEvaluator::new(judge);

    let case = case_with_intent("as-6-4", "read config for project-alpha", Some("read_file"));
    let unrelated_call = ("list_dir", serde_json::json!({"path": "."}));
    let invocation = invocation_with(&[unrelated_call]);

    let outcome = evaluator.evaluate(&case, &invocation);
    assert!(outcome.is_none());

    assert_eq!(
        judge_inner.call_count(),
        0,
        "judge must not be invoked when filter excludes all calls"
    );
}
/// With three recorded calls and an intent filtered to `read_file`, the
/// judge must be called exactly once, the metric must pass, and the details
/// must mention `read_file` but neither `list_dir` nor `search`.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn filter_with_partial_match_judges_only_target() {
    let case = case_with_intent(
        "as-6-partial",
        "read config for project-alpha",
        Some("read_file"),
    );

    // Only the middle call matches the intent's tool name.
    let before = ("list_dir", serde_json::json!({"path": "."}));
    let target = (
        "read_file",
        serde_json::json!({"path": "./project-alpha/config.toml"}),
    );
    let after = ("search", serde_json::json!({"query": "config"}));
    let invocation = invocation_with(&[before, target, after]);

    let scripted = vec![JudgeVerdict {
        score: 1.0,
        pass: true,
        reason: Some("path matches project-alpha".into()),
        label: Some("equivalent".into()),
    }];
    // Keep a concretely-typed handle so the call count can be read afterwards.
    let judge_inner = Arc::new(MockJudge::with_verdicts(scripted));
    let judge = Arc::clone(&judge_inner) as Arc<dyn JudgeClient>;
    let evaluator = SemanticToolParameterEvaluator::new(judge);

    let outcome = evaluator.evaluate(&case, &invocation);
    let result = outcome.expect("evaluator must return a metric for the single matching call");

    assert_eq!(result.evaluator_name, "semantic_tool_parameter");
    assert_eq!(result.score.verdict(), Verdict::Pass);
    assert_eq!(
        judge_inner.call_count(),
        1,
        "judge must be invoked exactly once — only the target call"
    );

    let details = result.details.unwrap();
    assert!(
        details.contains("read_file"),
        "details should reference the judged tool, got: {details}"
    );
    assert!(
        !(details.contains("list_dir") || details.contains("search")),
        "details should not reference filtered-out tools, got: {details}"
    );
}
/// Runs the evaluator from a plain `#[test]` (no Tokio test attribute) and
/// expects a `Pass` metric rather than a panic.
#[test]
fn evaluates_from_plain_sync_context_without_panic() {
    let case = case_with_intent("sync-ctx", "read config for project-alpha", None);
    let invocation = mock_invocation(&["read_file"], Some("ok"), 0.0, 0);

    let scripted = vec![JudgeVerdict {
        score: 1.0,
        pass: true,
        reason: Some("intent satisfied".into()),
        label: None,
    }];
    let judge: Arc<dyn JudgeClient> = Arc::new(MockJudge::with_verdicts(scripted));
    let evaluator = SemanticToolParameterEvaluator::new(judge);

    let outcome = evaluator.evaluate(&case, &invocation);
    let result = outcome.expect("must produce a metric without an ambient runtime");

    assert_eq!(result.score.verdict(), Verdict::Pass);
}