mod common;
use swink_agent_eval::{
BudgetConstraints, EfficiencyEvaluator, Evaluator, EvaluatorRegistry, Verdict,
};
use common::{case_with_budget, mock_invocation_multi_turn};
/// One turn holding two distinct tool calls, evaluated with every budget limit
/// unset, must produce a passing verdict and a score equal to 1.0 (within
/// `f64::EPSILON`).
#[test]
fn all_unique_passes() {
    // Plain data; building it up front keeps the call sequence below easy to scan.
    let unlimited = BudgetConstraints {
        max_turns: None,
        max_cost: None,
        max_input: None,
        max_output: None,
    };

    let evaluator = EfficiencyEvaluator::new();
    let trace = mock_invocation_multi_turn(&[&[
        ("read", serde_json::json!({"file": "a.rs"})),
        ("write", serde_json::json!({"file": "b.rs"})),
    ]]);
    let scenario = case_with_budget(unlimited);

    let result = evaluator.evaluate(&scenario, &trace).unwrap();

    // Check the verdict first, then the numeric score.
    assert_eq!(result.score.verdict(), Verdict::Pass);
    assert!((result.score.value - 1.0).abs() < f64::EPSILON);
}
/// Issuing the identical `read` call four times (two per turn, over two turns)
/// with no budget limits must push the score below 0.5 and yield a failing
/// verdict.
#[test]
fn duplicates_reduce_score() {
    let unlimited = BudgetConstraints {
        max_turns: None,
        max_cost: None,
        max_input: None,
        max_output: None,
    };
    // Every call in this trajectory is the same tool with the same arguments.
    let read_a = || ("read", serde_json::json!({"file": "a.rs"}));

    let evaluator = EfficiencyEvaluator::new();
    let trace = mock_invocation_multi_turn(&[
        &[read_a(), read_a()],
        &[read_a(), read_a()],
    ]);
    let scenario = case_with_budget(unlimited);

    let result = evaluator.evaluate(&scenario, &trace).unwrap();

    assert!(result.score.value < 0.5);
    assert_eq!(result.score.verdict(), Verdict::Fail);
}
/// The registry returned by `EvaluatorRegistry::with_defaults()` must report a
/// result whose evaluator name is `"efficiency"` when run over a one-call
/// trajectory.
#[test]
fn in_default_registry() {
    let unlimited = BudgetConstraints {
        max_turns: None,
        max_cost: None,
        max_input: None,
        max_output: None,
    };

    let defaults = EvaluatorRegistry::with_defaults();
    let trace = mock_invocation_multi_turn(&[&[("read", serde_json::json!({}))]]);
    let scenario = case_with_budget(unlimited);

    let outcomes = defaults.evaluate(&scenario, &trace);
    // Collect the reported evaluator names so a failure can list all of them.
    let names = outcomes
        .iter()
        .map(|outcome| outcome.evaluator_name.as_str())
        .collect::<Vec<_>>();

    assert!(
        names.contains(&"efficiency"),
        "expected efficiency evaluator in defaults, got: {names:?}"
    );
}
/// Two distinct tool calls completed in one turn under a one-turn budget must
/// score 1.0, and evaluating the same inputs a second time must reproduce the
/// same score, verdict, and details.
#[test]
fn us3_perfect_efficiency_score_1() {
    let one_turn_budget = BudgetConstraints {
        max_turns: Some(1),
        max_cost: None,
        max_input: None,
        max_output: None,
    };

    let evaluator = EfficiencyEvaluator::new();
    let trace = mock_invocation_multi_turn(&[&[
        ("read", serde_json::json!({"file": "a.rs"})),
        ("write", serde_json::json!({"file": "b.rs"})),
    ]]);
    let scenario = case_with_budget(one_turn_budget);

    // Run both evaluations before asserting anything.
    let first = evaluator.evaluate(&scenario, &trace).unwrap();
    let second = evaluator.evaluate(&scenario, &trace).unwrap();

    let first_value = first.score.value;
    assert!(
        (first_value - 1.0).abs() < f64::EPSILON,
        "expected 1.0, got {}",
        first_value
    );

    let drift = (second.score.value - first_value).abs();
    assert!(
        drift < f64::EPSILON,
        "repeated evaluation must be deterministic",
    );
    assert_eq!(second.score.verdict(), first.score.verdict());
    assert_eq!(second.details, first.details);
}
/// The same `read` call issued once in each of two turns, against a one-turn
/// budget, is expected to score approximately 0.5 (tolerance 0.01).
#[test]
fn us3_half_duplicates_double_turns() {
    let one_turn_budget = BudgetConstraints {
        max_turns: Some(1),
        max_cost: None,
        max_input: None,
        max_output: None,
    };

    let evaluator = EfficiencyEvaluator::new();
    let trace = mock_invocation_multi_turn(&[
        &[("read", serde_json::json!({"file": "a.rs"}))],
        &[("read", serde_json::json!({"file": "a.rs"}))],
    ]);
    let scenario = case_with_budget(one_turn_budget);

    let outcome = evaluator.evaluate(&scenario, &trace).unwrap();
    let observed = outcome.score.value;
    let deviation = (observed - 0.5).abs();

    assert!(deviation < 0.01, "expected ~0.5, got {}", observed);
}
/// A trajectory whose only turn contains no tool calls must make `evaluate`
/// return `None` rather than a score.
#[test]
fn us3_empty_trajectory_returns_none() {
    let unlimited = BudgetConstraints {
        max_turns: None,
        max_cost: None,
        max_input: None,
        max_output: None,
    };

    let eval = EfficiencyEvaluator::new();
    // A single turn with an empty list of tool calls.
    let invocation = mock_invocation_multi_turn(&[&[]]);
    let case = case_with_budget(unlimited);

    assert!(eval.evaluate(&case, &invocation).is_none());
}
/// Under the same one-turn budget, a trajectory with two distinct calls in one
/// turn must score strictly higher than one repeating the same call across
/// three turns.
#[test]
fn us3_more_efficient_scores_higher() {
    let one_turn_budget = BudgetConstraints {
        max_turns: Some(1),
        max_cost: None,
        max_input: None,
        max_output: None,
    };
    // The wasteful trajectory repeats this exact call once per turn.
    let bare_read = || ("read", serde_json::json!({}));

    let evaluator = EfficiencyEvaluator::new();
    let scenario = case_with_budget(one_turn_budget);

    let tight_trace = mock_invocation_multi_turn(&[&[
        ("read", serde_json::json!({})),
        ("write", serde_json::json!({})),
    ]]);
    let score_efficient = evaluator
        .evaluate(&scenario, &tight_trace)
        .unwrap()
        .score
        .value;

    let wasteful_trace =
        mock_invocation_multi_turn(&[&[bare_read()], &[bare_read()], &[bare_read()]]);
    let score_inefficient = evaluator
        .evaluate(&scenario, &wasteful_trace)
        .unwrap()
        .score
        .value;

    assert!(
        score_efficient > score_inefficient,
        "efficient ({score_efficient}) should score higher than inefficient ({score_inefficient})"
    );
}
/// Four turns, each with one distinct tool call, evaluated against a two-turn
/// budget, are expected to score approximately 0.8 (tolerance 0.01).
#[test]
fn us3_ideal_turns_from_budget() {
    let two_turn_budget = BudgetConstraints {
        max_turns: Some(2),
        max_cost: None,
        max_input: None,
        max_output: None,
    };

    let evaluator = EfficiencyEvaluator::new();
    // No two calls share both tool name and arguments.
    let trace = mock_invocation_multi_turn(&[
        &[("read", serde_json::json!({}))],
        &[("write", serde_json::json!({}))],
        &[("read", serde_json::json!({"file": "c.rs"}))],
        &[("write", serde_json::json!({"file": "d.rs"}))],
    ]);
    let scenario = case_with_budget(two_turn_budget);

    let outcome = evaluator.evaluate(&scenario, &trace).unwrap();
    let observed = outcome.score.value;
    let deviation = (observed - 0.8).abs();

    assert!(deviation < 0.01, "expected ~0.8, got {}", observed);
}