use crate::events::TaskOutcome;
use crate::execution_policy::ModelProfile;
use super::accumulator::{ContractSnapshot, CostSnapshot, ProgressSnapshot, RoutingSnapshot};
/// Tunable settings for harness evaluation scoring.
///
/// The `weight_*` fields are the per-component multipliers that
/// `compute_overall` applies to the matching `HarnessScores` fields. The
/// `cost_tier_*` fields are the per-tier values returned by `tier_multiplier`,
/// which `weighted_tokens_from_raw` uses to scale raw token counts.
#[derive(Debug, Clone)]
pub struct HarnessEvalConfig {
/// NOTE(review): presumably switches harness evaluation on or off; nothing in
/// this file reads it, so confirm against callers.
pub enabled: bool,
/// Multiplier for `HarnessScores::routing_accuracy` in `compute_overall`.
pub weight_routing: f32,
/// Multiplier for `HarnessScores::progress_yield` in `compute_overall`.
pub weight_progress: f32,
/// Multiplier for `HarnessScores::contract_fulfillment` in `compute_overall`.
pub weight_quality: f32,
/// Multiplier for `HarnessScores::cost_efficiency` in `compute_overall`.
pub weight_cost: f32,
/// Value returned by `tier_multiplier` for `ModelProfile::Cheap`.
pub cost_tier_cheap: f32,
/// Value returned by `tier_multiplier` for `ModelProfile::Balanced`.
pub cost_tier_balanced: f32,
/// Value returned by `tier_multiplier` for `ModelProfile::Strong`.
pub cost_tier_strong: f32,
/// Value returned by `tier_multiplier` when no profile is supplied (`None`).
pub cost_tier_unknown: f32,
}
impl Default for HarnessEvalConfig {
    /// Returns the stock configuration: evaluation enabled, component weights
    /// of 0.30 / 0.25 / 0.30 / 0.15 (routing / progress / quality / cost) and
    /// tier multipliers of 1.0 / 2.5 / 5.0 / 3.0 (cheap / balanced / strong /
    /// unknown).
    fn default() -> Self {
        // Per-component weights consumed by the overall score.
        let (weight_routing, weight_progress, weight_quality, weight_cost) =
            (0.30, 0.25, 0.30, 0.15);
        // Token multipliers, one per model tier plus the "no profile" fallback.
        let (cost_tier_cheap, cost_tier_balanced, cost_tier_strong, cost_tier_unknown) =
            (1.0, 2.5, 5.0, 3.0);

        Self {
            enabled: true,
            weight_routing,
            weight_progress,
            weight_quality,
            weight_cost,
            cost_tier_cheap,
            cost_tier_balanced,
            cost_tier_strong,
            cost_tier_unknown,
        }
    }
}
impl HarnessEvalConfig {
    /// Looks up the configured cost multiplier for a model tier.
    ///
    /// Returns `cost_tier_unknown` when `profile` is `None`; otherwise returns
    /// the `cost_tier_*` field that corresponds to the given `ModelProfile`
    /// variant.
    pub fn tier_multiplier(&self, profile: Option<ModelProfile>) -> f32 {
        profile.map_or(self.cost_tier_unknown, |tier| match tier {
            ModelProfile::Cheap => self.cost_tier_cheap,
            ModelProfile::Balanced => self.cost_tier_balanced,
            ModelProfile::Strong => self.cost_tier_strong,
        })
    }
}
/// Per-dimension harness scores together with their weighted aggregate.
///
/// When produced by `build_scores`, each field holds the result of the
/// `compute_*` function named in its documentation.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct HarnessScores {
/// Result of `compute_routing_accuracy`.
pub routing_accuracy: f32,
/// Result of `compute_progress_yield`.
pub progress_yield: f32,
/// Result of `compute_contract_fulfillment`.
pub contract_fulfillment: f32,
/// Result of `compute_cost_efficiency`.
pub cost_efficiency: f32,
/// Result of `compute_overall`: the weighted sum of the four fields above,
/// clamped to `[0.0, 1.0]`.
pub overall: f32,
}
/// Scores how well the routing decision matched what actually happened.
///
/// Starts from a perfect `1.0` and subtracts a fixed penalty for each of the
/// following that applies:
///
/// * `0.5`: the route was `"direct_reply"` but tools were used anyway.
/// * `0.3`: tools were predicted as required, none were used, and the outcome
///   was not `TaskOutcome::Succeeded`.
/// * `0.2`: the route-drift failsafe fired.
/// * `0.1`: the model was escalated.
///
/// The result is clamped to `[0.0, 1.0]`.
pub fn compute_routing_accuracy(routing: &RoutingSnapshot) -> f32 {
    let used_tools_on_direct_reply =
        routing.orchestration_route == "direct_reply" && routing.tools_actually_used;
    let predicted_tools_unused_without_success = routing.tools_required_predicted
        && !routing.tools_actually_used
        && routing.outcome != TaskOutcome::Succeeded;

    // Kept in this order so the subtractions happen in a fixed sequence.
    let penalties = [
        (used_tools_on_direct_reply, 0.5_f32),
        (predicted_tools_unused_without_success, 0.3),
        (routing.route_drift_failsafe, 0.2),
        (routing.model_escalated, 0.1),
    ];

    penalties
        .iter()
        .filter(|&&(applies, _)| applies)
        .fold(1.0_f32, |score, &(_, penalty)| score - penalty)
        .clamp(0.0, 1.0)
}
/// Scores how much useful work was produced per iteration.
///
/// The base value is the number of successful tool calls, observations and
/// verifications divided by the iteration count (treated as at least 1),
/// capped at `1.0`.
///
/// If that ratio is effectively zero but the run was short and clean (no
/// stall-guard fires, at most one deferred-no-tool event, at most three
/// iterations), it is replaced by `1 / iterations`, capped at `0.5`.
///
/// More than two stall-guard fires scale the score by `0.8`. The result is
/// clamped to `[0.0, 1.0]`.
pub fn compute_progress_yield(progress: &ProgressSnapshot) -> f32 {
    let iteration_count = progress.iterations.max(1) as f32;
    let productive_events = progress.tool_calls_succeeded as f32
        + progress.observation_count as f32
        + progress.verification_count as f32;
    let raw_ratio = (productive_events / iteration_count).min(1.0);

    let is_clean_short_run = progress.stall_guard_fires == 0
        && progress.deferred_no_tool_events <= 1
        && progress.iterations <= 3;

    let base = if raw_ratio < f32::EPSILON && is_clean_short_run {
        // Zero measurable yield on a clean short run still earns partial credit.
        (1.0 / iteration_count).min(0.5)
    } else {
        raw_ratio
    };

    let stall_factor = if progress.stall_guard_fires > 2 {
        0.8
    } else {
        1.0
    };

    (base * stall_factor).clamp(0.0, 1.0)
}
/// Reports whether every requirement the contract imposes has been met.
///
/// A requirement that is not demanded counts as met. The demanded ones are
/// satisfied as follows:
///
/// * observation: at least one observation was recorded;
/// * mutation: at least one mutation was recorded;
/// * verification: at least one verification was recorded and there were at
///   most two verification blocks.
///
/// A contract with no requirements is therefore always fulfilled.
pub fn contract_is_fulfilled(contract: &ContractSnapshot) -> bool {
    let observation_ok = !contract.requires_observation || contract.observation_count > 0;
    let mutation_ok = !contract.expects_mutation || contract.mutation_count > 0;
    let verification_ok = !contract.verification_required
        || (contract.verification_count > 0 && contract.verification_blocks <= 2);

    observation_ok && mutation_ok && verification_ok
}
/// Scores how completely the task contract was honoured, adjusted by outcome.
///
/// Three requirements may be demanded by the contract:
///
/// * observation: met when at least one observation was recorded;
/// * mutation: met when at least one mutation was recorded;
/// * verification: met when at least one verification was recorded and there
///   were at most two verification blocks.
///
/// If the contract demands none of them, the score depends on `outcome` alone:
/// `1.0` for `Succeeded`, `0.5` for `Partial`, `0.0` for `Failed`.
///
/// Otherwise the score is the fraction of demanded requirements that were met,
/// multiplied by `1.0` for `Succeeded`, `0.75` for `Partial`, or `0.25` for
/// `Failed`.
pub fn compute_contract_fulfillment(contract: &ContractSnapshot, outcome: TaskOutcome) -> f32 {
    // Each entry is (requirement demanded?, requirement satisfied?).
    let requirements = [
        (
            contract.requires_observation,
            contract.observation_count > 0,
        ),
        (contract.expects_mutation, contract.mutation_count > 0),
        (
            contract.verification_required,
            contract.verification_count > 0 && contract.verification_blocks <= 2,
        ),
    ];

    let checks = requirements
        .iter()
        .filter(|&&(required, _)| required)
        .count();

    // Single guard for the "nothing demanded" case; it also prevents a
    // division by zero below.
    if checks == 0 {
        return match outcome {
            TaskOutcome::Succeeded => 1.0,
            TaskOutcome::Partial => 0.5,
            TaskOutcome::Failed => 0.0,
        };
    }

    let passed = requirements
        .iter()
        .filter(|&&(required, satisfied)| required && satisfied)
        .count();

    let base = passed as f32 / checks as f32;
    match outcome {
        TaskOutcome::Succeeded => base,
        TaskOutcome::Partial => base * 0.75,
        TaskOutcome::Failed => base * 0.25,
    }
}
/// Scores token spend on a logarithmic scale, halved for non-successful runs.
///
/// The weighted token count (treated as at least 1) is divided by 1000 and its
/// base-10 logarithm, floored at zero, is used as the cost magnitude. The base
/// score is `1 / (1 + magnitude)`, so anything up to 1000 weighted tokens
/// scores `1.0` and each further tenfold increase adds one to the divisor.
///
/// Any outcome other than `TaskOutcome::Succeeded` multiplies the score by
/// `0.5`. The result is clamped to `[0.0, 1.0]`.
pub fn compute_cost_efficiency(cost: &CostSnapshot, outcome: TaskOutcome) -> f32 {
    let kilo_tokens = cost.weighted_tokens.max(1) as f64 / 1000.0;
    let magnitude = kilo_tokens.log10().max(0.0);
    let base = (1.0 / (1.0 + magnitude)) as f32;

    let outcome_factor = if outcome == TaskOutcome::Succeeded {
        1.0
    } else {
        0.5
    };

    (base * outcome_factor).clamp(0.0, 1.0)
}
/// Combines the four component scores into one weighted total.
///
/// Each component of `scores` is multiplied by its matching `weight_*` field
/// from `config`, the products are summed, and the sum is clamped to
/// `[0.0, 1.0]`. The `overall` field of `scores` is ignored.
pub fn compute_overall(scores: HarnessScores, config: &HarnessEvalConfig) -> f32 {
    let routing_term = config.weight_routing * scores.routing_accuracy;
    let progress_term = config.weight_progress * scores.progress_yield;
    let quality_term = config.weight_quality * scores.contract_fulfillment;
    let cost_term = config.weight_cost * scores.cost_efficiency;

    let weighted_sum = routing_term + progress_term + quality_term + cost_term;
    weighted_sum.clamp(0.0, 1.0)
}
/// Computes every component score and the weighted overall score in one call.
///
/// Delegates to `compute_routing_accuracy`, `compute_progress_yield`,
/// `compute_contract_fulfillment` and `compute_cost_efficiency`, then derives
/// `overall` from those four values with `compute_overall` and `config`.
pub fn build_scores(
    routing: &RoutingSnapshot,
    progress: &ProgressSnapshot,
    contract: &ContractSnapshot,
    cost: &CostSnapshot,
    outcome: TaskOutcome,
    config: &HarnessEvalConfig,
) -> HarnessScores {
    let mut scores = HarnessScores {
        routing_accuracy: compute_routing_accuracy(routing),
        progress_yield: compute_progress_yield(progress),
        contract_fulfillment: compute_contract_fulfillment(contract, outcome),
        cost_efficiency: compute_cost_efficiency(cost, outcome),
        // Placeholder; `compute_overall` does not read this field.
        overall: 0.0,
    };
    scores.overall = compute_overall(scores, config);
    scores
}
/// Converts raw token counts into a tier-weighted token count.
///
/// Input and output tokens are added with saturation, multiplied by the
/// multiplier that `config.tier_multiplier(profile)` returns, rounded to the
/// nearest whole number and converted back to `u64`.
pub fn weighted_tokens_from_raw(
    input_tokens: u64,
    output_tokens: u64,
    profile: Option<ModelProfile>,
    config: &HarnessEvalConfig,
) -> u64 {
    let total_tokens = input_tokens.saturating_add(output_tokens);
    let multiplier = f64::from(config.tier_multiplier(profile));
    let scaled = total_tokens as f64 * multiplier;
    scaled.round() as u64
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::events::TaskOutcome;

    /// Baseline routing snapshot that triggers none of the routing penalties.
    fn default_routing() -> RoutingSnapshot {
        RoutingSnapshot {
            orchestration_route: String::from("default_continue"),
            tools_required_predicted: false,
            tools_actually_used: false,
            direct_return_attempted: false,
            route_drift_failsafe: false,
            skills_activated: Vec::new(),
            policy_profile: Some(String::from("balanced")),
            model_escalated: false,
            outcome: TaskOutcome::Succeeded,
            response_fallthrough: false,
            intent_gate_fires: 0,
            evidence_gate_blocks: 0,
            critique_replan_fires: 0,
        }
    }

    /// Using tools on a `direct_reply` route must cost at least half the score.
    #[test]
    fn routing_penalizes_direct_reply_with_tools() {
        let routing = RoutingSnapshot {
            orchestration_route: String::from("direct_reply"),
            tools_actually_used: true,
            ..default_routing()
        };
        let score = compute_routing_accuracy(&routing);
        assert!(score <= 0.5);
    }

    /// The same spend must score lower when the task failed.
    #[test]
    fn cost_efficiency_penalizes_failed_outcomes() {
        let cost = CostSnapshot {
            total_input_tokens: 5000,
            total_output_tokens: 500,
            weighted_tokens: 5500,
            llm_calls: 2,
            fell_back_count: 0,
            sub_agent_weighted_tokens: 0,
            sub_agent_spawn_count: 0,
            sub_agent_failures: 0,
            tokens_failed_waste: true,
        };
        let succeeded_score = compute_cost_efficiency(&cost, TaskOutcome::Succeeded);
        let failed_score = compute_cost_efficiency(&cost, TaskOutcome::Failed);
        assert!(succeeded_score > failed_score);
    }

    /// A stronger tier must yield more weighted tokens for identical raw counts.
    #[test]
    fn tier_multiplier_scales_weighted_tokens() {
        let config = HarnessEvalConfig::default();
        let weigh = |profile| weighted_tokens_from_raw(1000, 100, Some(profile), &config);
        let cheap_weighted = weigh(ModelProfile::Cheap);
        let strong_weighted = weigh(ModelProfile::Strong);
        assert!(strong_weighted > cheap_weighted);
    }

    /// An expected mutation is only fulfilled once a mutation is recorded.
    #[test]
    fn contract_is_fulfilled_requires_mutation_when_expected() {
        let mut contract = ContractSnapshot::default();
        contract.expects_mutation = true;

        // No mutation recorded yet: the contract must not count as fulfilled.
        contract.mutation_count = 0;
        let fulfilled_without_mutation = contract_is_fulfilled(&contract);
        assert!(!fulfilled_without_mutation);

        // One mutation recorded: the contract is now satisfied.
        contract.mutation_count = 1;
        let fulfilled_with_mutation = contract_is_fulfilled(&contract);
        assert!(fulfilled_with_mutation);
    }

    /// A single clean iteration with no tool activity still earns half credit.
    #[test]
    fn progress_yield_credits_clean_conversational_turns() {
        let single_turn = ProgressSnapshot {
            iterations: 1,
            ..Default::default()
        };
        let score = compute_progress_yield(&single_turn);
        assert!(score >= 0.5);
    }
}