use std::collections::HashSet;
use crate::evaluator::Evaluator;
use crate::score::Score;
use crate::types::{EvalCase, EvalMetricResult, Invocation};
/// Evaluator that rates an invocation by how few redundant tool calls and
/// turns it used.
///
/// The only configuration is the threshold that is handed to `Score::new`
/// together with the computed composite value.
#[derive(Debug, Clone, PartialEq)]
pub struct EfficiencyEvaluator {
    /// Threshold passed to `Score::new` alongside the composite score.
    threshold: f64,
}
impl EfficiencyEvaluator {
    /// Threshold applied when the caller does not choose one.
    const DEFAULT_THRESHOLD: f64 = 0.5;

    /// Creates an evaluator configured with the default threshold of `0.5`.
    #[must_use]
    pub const fn new() -> Self {
        Self {
            threshold: Self::DEFAULT_THRESHOLD,
        }
    }

    /// Consumes the evaluator and returns it with `threshold` as its new
    /// threshold.
    ///
    /// The value is stored exactly as given; no range check is applied here.
    #[must_use]
    pub const fn with_threshold(self, threshold: f64) -> Self {
        let mut updated = self;
        updated.threshold = threshold;
        updated
    }
}
/// `Default` yields the same evaluator as `EfficiencyEvaluator::new()`.
impl Default for EfficiencyEvaluator {
/// Delegates to `Self::new()`, so the default threshold is whatever `new` sets.
fn default() -> Self {
Self::new()
}
}
impl Evaluator for EfficiencyEvaluator {
    /// Identifier under which this evaluator reports its results.
    fn name(&self) -> &'static str {
        "efficiency"
    }

    /// Scores `invocation` on tool-call uniqueness and turn count.
    ///
    /// Returns `None` when the invocation contains no tool calls at all.
    /// Otherwise the result combines two ratios:
    ///
    /// * the fraction of tool calls that are unique, where two calls are the
    ///   same when both the tool name and the JSON-serialized arguments match
    ///   (weight `0.6`);
    /// * `min(ideal, actual) / actual` turns, where `ideal` is
    ///   `case.budget.max_turns` when set and otherwise the number of unique
    ///   calls (weight `0.4`).
    ///
    /// The weighted sum is passed to `Score::new` with the configured
    /// threshold, and a human-readable breakdown is stored in `details`.
    fn evaluate(&self, case: &EvalCase, invocation: &Invocation) -> Option<EvalMetricResult> {
        // Single pass: count every call and remember each distinct
        // (name, serialized arguments) pair.
        let mut total = 0_usize;
        let mut seen = HashSet::new();
        for call in invocation.turns.iter().flat_map(|turn| &turn.tool_calls) {
            total += 1;
            // A serialization failure degrades to an empty argument string.
            let args = serde_json::to_string(&call.arguments).unwrap_or_default();
            seen.insert((call.name.as_str(), args));
        }
        if total == 0 {
            return None;
        }
        let unique = seen.len();

        // Despite its name this is the share of *unique* calls: 1.0 means no
        // duplicates. The name is kept because the details text refers to it.
        // Call counts are far below 2^53, so the cast is exact in practice.
        #[allow(clippy::cast_precision_loss)]
        let duplicate_ratio = unique as f64 / total as f64;

        // `total > 0` implies at least one turn, so the division below is safe.
        let actual_turns = invocation.turns.len();
        let ideal_turns = case
            .budget
            .as_ref()
            .and_then(|budget| budget.max_turns)
            .unwrap_or_else(|| unique.max(1));
        let efficient_turns = ideal_turns.min(actual_turns);
        // Turn counts are far below 2^53, so the cast is exact in practice.
        #[allow(clippy::cast_precision_loss)]
        let step_ratio = (efficient_turns as f64 / actual_turns as f64).clamp(0.0, 1.0);

        // 0.6 * duplicate_ratio + 0.4 * step_ratio
        let composite = 0.6f64.mul_add(duplicate_ratio, 0.4 * step_ratio);

        let details = format!(
            "duplicate ratio: {duplicate_ratio:.2} ({unique}/{total} unique), \
             step ratio: {step_ratio:.2} ({}/{actual_turns} turns efficient), \
             composite: {composite:.2}",
            efficient_turns,
        );

        Some(EvalMetricResult {
            evaluator_name: self.name().to_string(),
            score: Score::new(composite, self.threshold),
            details: Some(details),
        })
    }
}
// Unit tests for `EfficiencyEvaluator::evaluate`.
//
// Expected scores follow the formula used by `evaluate`:
//   composite = 0.6 * (unique calls / total calls)
//             + 0.4 * (min(ideal turns, actual turns) / actual turns)
// where the ideal turn count is `budget.max_turns` when present and the
// number of unique calls otherwise.
#[cfg(test)]
mod tests {
use super::*;
use crate::types::{BudgetConstraints, RecordedToolCall, TurnRecord};
use std::time::Duration;
use swink_agent::{AssistantMessage, Cost, ModelSpec, StopReason, Usage};
// Builds an `Invocation` with one `TurnRecord` per inner slice. Every
// `(name, args)` pair becomes a `RecordedToolCall` whose id is
// `call_{turn index}_{call index}`; all other fields are fixed placeholders.
fn make_invocation(turns: &[&[(&str, serde_json::Value)]]) -> Invocation {
let turn_records: Vec<TurnRecord> = turns
.iter()
.enumerate()
.map(|(i, calls)| {
let tool_calls = calls
.iter()
.enumerate()
.map(|(j, (name, args))| RecordedToolCall {
id: format!("call_{i}_{j}"),
name: (*name).to_string(),
arguments: args.clone(),
})
.collect();
TurnRecord {
turn_index: i,
assistant_message: AssistantMessage {
content: vec![],
provider: "test".to_string(),
model_id: "test-model".to_string(),
usage: Usage::default(),
cost: Cost::default(),
stop_reason: StopReason::Stop,
error_message: None,
error_kind: None,
timestamp: 0,
cache_hint: None,
},
tool_calls,
tool_results: vec![],
duration: Duration::from_millis(50),
}
})
.collect();
Invocation {
turns: turn_records,
total_usage: Usage::default(),
total_cost: Cost::default(),
total_duration: Duration::from_millis(100),
final_response: None,
stop_reason: StopReason::Stop,
model: ModelSpec::new("test", "test-model"),
}
}
// Returns an `EvalCase` with placeholder text and every optional field
// unset. `budget` is `None`, so `evaluate` derives the ideal turn count from
// the number of unique calls unless a test overrides it.
fn minimal_case() -> EvalCase {
EvalCase {
id: "test".to_string(),
name: "Test".to_string(),
description: None,
system_prompt: "test".to_string(),
user_messages: vec!["test".to_string()],
expected_trajectory: None,
expected_response: None,
expected_assertion: None,
expected_interactions: None,
few_shot_examples: vec![],
budget: None,
evaluators: vec![],
metadata: serde_json::Value::Null,
attachments: vec![],
session_id: None,
expected_environment_state: None,
expected_tool_intent: None,
semantic_tool_selection: false,
state_capture: None,
}
}
// A single turn with zero tool calls yields no metric at all.
#[test]
fn no_tool_calls_returns_none() {
let eval = EfficiencyEvaluator::new();
let invocation = make_invocation(&[&[]]);
assert!(eval.evaluate(&minimal_case(), &invocation).is_none());
}
// 2 unique of 2 calls in 1 turn: 0.6 * 1.0 + 0.4 * 1.0 = 1.0.
#[test]
fn all_unique_perfect_score() {
let eval = EfficiencyEvaluator::new();
let invocation = make_invocation(&[&[
("read", serde_json::json!({"file": "a.rs"})),
("write", serde_json::json!({"file": "b.rs"})),
]]);
let result = eval.evaluate(&minimal_case(), &invocation).unwrap();
assert!((result.score.value - 1.0).abs() < f64::EPSILON);
}
// 2 unique of 4 calls -> 0.5; 1 actual turn vs ideal 2 -> step ratio 1.0;
// 0.6 * 0.5 + 0.4 * 1.0 = 0.7.
#[test]
fn duplicate_calls_penalized() {
let eval = EfficiencyEvaluator::new();
let invocation = make_invocation(&[&[
("read", serde_json::json!({"file": "a.rs"})),
("read", serde_json::json!({"file": "a.rs"})),
("read", serde_json::json!({"file": "a.rs"})),
("write", serde_json::json!({"file": "b.rs"})),
]]);
let result = eval.evaluate(&minimal_case(), &invocation).unwrap();
assert!((result.score.value - 0.7).abs() < 0.01);
}
// 4 unique calls over 4 turns with `max_turns = 2`: uniqueness 1.0, step
// ratio 2 / 4 = 0.5; 0.6 * 1.0 + 0.4 * 0.5 = 0.8.
#[test]
fn step_ratio_uses_budget() {
let eval = EfficiencyEvaluator::new();
let invocation = make_invocation(&[
&[("read", serde_json::json!({}))],
&[("write", serde_json::json!({}))],
&[("read", serde_json::json!({"file": "c.rs"}))],
&[("write", serde_json::json!({"file": "d.rs"}))],
]);
let mut case = minimal_case();
case.budget = Some(BudgetConstraints {
max_cost: None,
max_input: None,
max_output: None,
max_turns: Some(2),
});
let result = eval.evaluate(&case, &invocation).unwrap();
assert!((result.score.value - 0.8).abs() < 0.01);
}
// The same call repeated in 2 turns: 1 unique of 2 -> 0.5; ideal 1 of 2
// turns -> 0.5; 0.6 * 0.5 + 0.4 * 0.5 = 0.5.
#[test]
fn composite_weighted() {
let eval = EfficiencyEvaluator::new();
let invocation = make_invocation(&[
&[("read", serde_json::json!({"file": "a.rs"}))],
&[("read", serde_json::json!({"file": "a.rs"}))],
]);
let result = eval.evaluate(&minimal_case(), &invocation).unwrap();
assert!((result.score.value - 0.5).abs() < 0.01);
}
}