#![allow(dead_code)]
use std::path::Path;
use std::sync::Arc;
use std::time::{Duration, Instant};
use futures_lite::StreamExt;
use poe2_agent::{AgentEvent, ChatGptClient, PobParser, ToolAgent, Usage};
/// Directory handed to `PobParser::new` in `eval_suite`.
/// NOTE(review): the path is relative, so it presumably resolves against the working directory
/// the test runs in — confirm against `PobParser`.
const POB_PATH: &str = "vendor/PathOfBuilding-PoE2";
/// One eval scenario: a question for the agent plus the constraints `score` checks the
/// resulting trace against.
struct EvalCase {
/// Case identifier; printed in the report and substring-matched against the `EVAL_CASE`
/// filter in `eval_suite`.
name: &'static str,
/// The user question sent to the agent.
question: &'static str,
/// Tool names that must all be called; any missing one fails the case.
expected_tools: &'static [&'static str],
/// Tool names that must not be called; any hit fails the case.
banned_tools: &'static [&'static str],
/// Upper bound on `Trace::tool_rounds`; exceeding it fails the case.
max_tool_rounds: usize,
/// Substrings that must appear in the final answer (compared case-insensitively);
/// any missing one fails the case.
answer_must_contain: &'static [&'static str],
/// Optional cap on total token usage; exceeding it only downgrades the case to a warning.
max_total_tokens: Option<usize>,
}
/// Everything `capture_trace` records while draining one agent response stream.
struct Trace {
/// Name of every `ToolCall` event, in the order received (duplicates kept).
tool_calls: Vec<String>,
/// `(name, size_bytes)` of every `ToolResult` event, in the order received.
tool_result_sizes: Vec<(String, usize)>,
/// Number of tool-call runs: bumped on the first `ToolCall` and on the first `ToolCall`
/// that follows a `Token` event.
tool_rounds: usize,
/// Concatenation of all `Token` event texts.
final_answer: String,
/// Payload of the most recent `Usage` event, or `Usage::default()` if none arrived.
usage: Usage,
/// Elapsed time from the start of capture to the first `Token` event; `None` if no token arrived.
time_to_first_token: Option<Duration>,
/// Elapsed time from the start of capture until the stream ended.
total_time: Duration,
}
/// Scored outcome of a single eval case, as produced by `score` and rendered by `print_report`.
struct CaseResult {
/// Copy of `EvalCase::name`.
name: String,
/// Overall verdict for the case.
status: CaseStatus,
/// Expected tools that were called at least once.
expected_tools_called: Vec<String>,
/// Expected tools that were never called (non-empty => fail).
expected_tools_missed: Vec<String>,
/// Banned tools that were called (non-empty => fail).
banned_tools_called: Vec<String>,
/// Distinct tools called that are neither expected nor banned (non-empty => warn).
extra_tools_called: Vec<String>,
/// Copy of `Trace::tool_rounds`.
tool_rounds: usize,
/// Copy of `EvalCase::max_tool_rounds`.
max_tool_rounds: usize,
/// True when `tool_rounds` is greater than `max_tool_rounds` (=> fail).
rounds_over_budget: bool,
/// Required answer substrings that were present in the final answer.
facts_found: Vec<String>,
/// Required answer substrings that were absent from the final answer (non-empty => fail).
facts_missing: Vec<String>,
/// Total token count taken from the trace's usage.
total_tokens: u32,
/// True when a token cap was set and `total_tokens` exceeded it (=> warn).
tokens_over_budget: bool,
/// Copy of `Trace::total_time`.
total_time: Duration,
/// Copy of `Trace::time_to_first_token`.
time_to_first_token: Option<Duration>,
/// Every tool call name in call order, duplicates included.
all_tools: Vec<String>,
}
/// Overall verdict assigned to an eval case by `score`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CaseStatus {
    /// No failure and no warning condition was hit.
    Pass,
    /// No failure, but extra tools were called or the token cap was exceeded.
    Warn,
    /// An expected tool was missed, a banned tool was called, the round budget was exceeded,
    /// or a required answer fact was missing.
    Fail,
}

impl CaseStatus {
    /// Short marker printed in front of the case name in report lines.
    fn icon(self) -> &'static str {
        match self {
            Self::Pass => "✓",
            Self::Warn => "~",
            Self::Fail => "✗",
        }
    }
}
/// Reads the `ranger-with-gear.xml` fixture, located under the crate's manifest directory,
/// and returns its raw bytes.
///
/// # Panics
/// Panics if the fixture file cannot be read.
fn fixture_xml() -> Vec<u8> {
    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
    let fixture = manifest_dir.join("tests/fixtures/ranger-with-gear.xml");
    std::fs::read(fixture).expect("fixture XML missing — run from repo root")
}
/// Sends `question` (with the build `xml`) to the agent, drains the response stream, and
/// records what happened in a [`Trace`].
///
/// Tool rounds are counted as runs of `ToolCall` events: a new round starts on the first
/// `ToolCall` and on the first `ToolCall` seen after a `Token`.
///
/// # Panics
/// Panics if the stream yields an error.
async fn capture_trace(agent: &ToolAgent, xml: &[u8], question: &str) -> Trace {
    let started = Instant::now();
    let events = agent.respond(xml, question, vec![]);
    tokio::pin!(events);

    // Accumulate straight into the result; `total_time` is filled in once the stream ends.
    let mut trace = Trace {
        tool_calls: Vec::new(),
        tool_result_sizes: Vec::new(),
        tool_rounds: 0,
        final_answer: String::new(),
        usage: Usage::default(),
        time_to_first_token: None,
        total_time: Duration::ZERO,
    };

    // True while we are inside a run of tool calls that has already been counted as a round.
    // NOTE(review): only a `Token` ends a run; `ToolResult` does not. If the agent can issue
    // two tool batches back-to-back with no text in between they are counted as one round —
    // confirm against the agent's event ordering.
    let mut in_tool_run = false;

    while let Some(item) = events.next().await {
        let event = item.expect("agent stream error");
        match event {
            AgentEvent::ToolCall { name } => {
                if !in_tool_run {
                    trace.tool_rounds += 1;
                    in_tool_run = true;
                }
                trace.tool_calls.push(name);
            }
            AgentEvent::ToolResult { name, size_bytes } => {
                trace.tool_result_sizes.push((name, size_bytes));
            }
            AgentEvent::Token(text) => {
                in_tool_run = false;
                // Only the first token stamps the latency; later tokens keep the earlier value.
                trace.time_to_first_token = trace
                    .time_to_first_token
                    .or_else(|| Some(started.elapsed()));
                trace.final_answer.push_str(&text);
            }
            AgentEvent::Usage(latest) => {
                // Last usage report wins.
                trace.usage = latest;
            }
            _ => {}
        }
    }

    trace.total_time = started.elapsed();
    trace
}
/// Scores a captured `trace` against the constraints of `case`.
///
/// Fail conditions: an expected tool was never called, a banned tool was called, the trace used
/// more tool rounds than `case.max_tool_rounds`, or a required answer substring (compared
/// case-insensitively) is absent from the final answer.
/// Warn conditions (only when nothing failed): a tool outside the expected/banned lists was
/// called, or the optional token cap was exceeded.
///
/// `extra_tools_called` lists each extra tool once, in the order it was first called, so the
/// result (and any report built from it) is deterministic across runs.
fn score(case: &EvalCase, trace: &Trace) -> CaseResult {
    use std::collections::HashSet;

    let called: HashSet<&str> = trace.tool_calls.iter().map(String::as_str).collect();
    let expected: HashSet<&str> = case.expected_tools.iter().copied().collect();
    let banned: HashSet<&str> = case.banned_tools.iter().copied().collect();

    // Split the expected list in one pass; `partition` keeps the declaration order.
    let (expected_tools_called, expected_tools_missed): (Vec<String>, Vec<String>) = case
        .expected_tools
        .iter()
        .map(|tool| tool.to_string())
        .partition(|tool| called.contains(tool.as_str()));

    let banned_tools_called: Vec<String> = case
        .banned_tools
        .iter()
        .filter(|tool| called.contains(**tool))
        .map(|tool| tool.to_string())
        .collect();

    // Walk the calls in order rather than iterating a HashSet, whose order is unspecified;
    // `insert` returns false for a tool already seen, which drops repeats.
    let mut seen_extra: HashSet<&str> = HashSet::new();
    let extra_tools_called: Vec<String> = trace
        .tool_calls
        .iter()
        .map(String::as_str)
        .filter(|tool| !expected.contains(tool) && !banned.contains(tool))
        .filter(|tool| seen_extra.insert(*tool))
        .map(|tool| tool.to_string())
        .collect();

    let rounds_over_budget = trace.tool_rounds > case.max_tool_rounds;

    let answer_lower = trace.final_answer.to_lowercase();
    let (facts_found, facts_missing): (Vec<String>, Vec<String>) = case
        .answer_must_contain
        .iter()
        .map(|fact| fact.to_string())
        .partition(|fact| answer_lower.contains(fact.to_lowercase().as_str()));

    // No cap configured means the budget can never be exceeded.
    let tokens_over_budget = case
        .max_total_tokens
        .map_or(false, |max| trace.usage.total_tokens as usize > max);

    let has_fail = !expected_tools_missed.is_empty()
        || !banned_tools_called.is_empty()
        || rounds_over_budget
        || !facts_missing.is_empty();
    let has_warn = !extra_tools_called.is_empty() || tokens_over_budget;

    let status = if has_fail {
        CaseStatus::Fail
    } else if has_warn {
        CaseStatus::Warn
    } else {
        CaseStatus::Pass
    };

    CaseResult {
        name: case.name.to_string(),
        status,
        expected_tools_called,
        expected_tools_missed,
        banned_tools_called,
        extra_tools_called,
        tool_rounds: trace.tool_rounds,
        max_tool_rounds: case.max_tool_rounds,
        rounds_over_budget,
        facts_found,
        facts_missing,
        total_tokens: trace.usage.total_tokens,
        tokens_over_budget,
        total_time: trace.total_time,
        time_to_first_token: trace.time_to_first_token,
        all_tools: trace.tool_calls.clone(),
    }
}
/// Returns the fixed table of eval scenarios, in the order they are run and reported.
///
/// Every entry spells out its name, question, tool lists, and round budget. The two optional
/// constraints (`answer_must_contain`, `max_total_tokens`) are only written where a case
/// actually uses them; otherwise they come from `UNCONSTRAINED` via struct-update syntax.
fn eval_cases() -> Vec<EvalCase> {
    // Supplies "no required facts" and "no token cap"; all other fields are always overridden.
    const UNCONSTRAINED: EvalCase = EvalCase {
        name: "",
        question: "",
        expected_tools: &[],
        banned_tools: &[],
        max_tool_rounds: 0,
        answer_must_contain: &[],
        max_total_tokens: None,
    };

    vec![
        // ---- Stats and skills ----
        EvalCase {
            name: "basic_dps",
            question: "What is my total DPS and what main skill am I using?",
            expected_tools: &["get_build_stats", "get_skill_list"],
            banned_tools: &["get_item", "get_passive_tree", "get_jewel", "query_passive_stats"],
            max_tool_rounds: 2,
            answer_must_contain: &["DPS"],
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "defensive_stats",
            question: "How tanky is this build? What are my defenses?",
            expected_tools: &["get_build_stats"],
            banned_tools: &["get_item", "get_jewel"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "skill_gems",
            question: "What support gems are linked to my main skill?",
            expected_tools: &["get_skill_list"],
            banned_tools: &["get_item", "get_passive_tree", "get_jewel"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        // ---- Gear ----
        EvalCase {
            name: "specific_item",
            question: "What weapon am I using?",
            expected_tools: &["get_item"],
            banned_tools: &[
                "get_passive_tree",
                "get_jewel",
                "query_passive_stats",
                "get_unallocated_ascendancy",
            ],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "missing_gear",
            question: "Am I missing any gear? What slots are empty?",
            expected_tools: &["get_equipped_items"],
            banned_tools: &["get_passive_tree", "get_jewel", "query_passive_stats"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "gear_overview_then_detail",
            question: "What's my worst piece of gear and how could I upgrade it?",
            expected_tools: &["get_equipped_items"],
            banned_tools: &["get_jewel", "query_passive_stats"],
            max_tool_rounds: 2,
            answer_must_contain: &["upgrade"],
            ..UNCONSTRAINED
        },
        // ---- Passive tree, jewels, ascendancy ----
        EvalCase {
            name: "keystones",
            question: "What keystones am I using?",
            expected_tools: &["get_passive_tree"],
            banned_tools: &["get_item", "get_config", "get_jewel"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "jewel_inspection",
            question: "What jewels do I have socketed in my passive tree?",
            expected_tools: &["get_passive_tree", "get_jewel"],
            banned_tools: &["get_item", "get_config", "query_passive_stats"],
            max_tool_rounds: 3,
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "stat_sourcing",
            question: "How much fire damage am I getting from the passive tree, and is there more nearby?",
            expected_tools: &["query_passive_stats"],
            banned_tools: &["get_item", "get_jewel", "get_config"],
            max_tool_rounds: 2,
            answer_must_contain: &["fire damage"],
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "ascendancy_recommendation",
            question: "What ascendancy nodes should I take next?",
            expected_tools: &["get_unallocated_ascendancy"],
            banned_tools: &["get_item", "get_jewel"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "ascendancy_current",
            question: "What ascendancy am I playing and what nodes do I have?",
            expected_tools: &["get_unallocated_ascendancy"],
            banned_tools: &["get_item", "get_config"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        // ---- Configuration ----
        EvalCase {
            name: "build_config",
            question: "What enemy level is my build configured for?",
            expected_tools: &["get_config"],
            banned_tools: &["get_item", "get_passive_tree", "get_jewel"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        // ---- Open-ended reviews (token-capped, nothing banned) ----
        EvalCase {
            name: "full_build_review",
            question: "Give me a quick overview of this build — what's working and what needs improvement?",
            expected_tools: &["get_build_stats"],
            banned_tools: &[],
            max_tool_rounds: 5,
            max_total_tokens: Some(8000),
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "upgrade_priorities",
            question: "What are the top 3 things I should upgrade on this build?",
            expected_tools: &["get_build_stats"],
            banned_tools: &[],
            max_tool_rounds: 5,
            max_total_tokens: Some(10000),
            ..UNCONSTRAINED
        },
        // ---- Edge cases ----
        EvalCase {
            name: "no_tools_needed",
            question: "What is Path of Exile 2?",
            expected_tools: &[],
            banned_tools: &[],
            max_tool_rounds: 1,
            answer_must_contain: &["Path of Exile"],
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "ambiguous_item_slot",
            question: "Show me my ring",
            expected_tools: &["get_item"],
            banned_tools: &["get_passive_tree", "get_jewel"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "multi_stat_passive",
            question: "How much life, armour, and fire resistance do I get from passives?",
            expected_tools: &["query_passive_stats"],
            banned_tools: &["get_item", "get_jewel", "get_config"],
            max_tool_rounds: 1,
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "skill_breakdown",
            question: "Why is my main skill's DPS low? Break down the damage.",
            expected_tools: &["get_skill_breakdown"],
            banned_tools: &["get_item", "get_jewel"],
            max_tool_rounds: 3,
            answer_must_contain: &["DPS"],
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "full_gear_review",
            question: "How's my gear overall? What should I upgrade?",
            expected_tools: &["get_equipped_items"],
            banned_tools: &["get_passive_tree", "get_jewel"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        // ---- Gem search ----
        EvalCase {
            name: "search_support_gems",
            question: "What support gems work with projectile attacks?",
            expected_tools: &["search_gems"],
            banned_tools: &["get_item", "get_jewel", "get_passive_tree"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
        EvalCase {
            name: "search_fire_aoe_gems",
            question: "Are there any AoE fire skills I could use?",
            expected_tools: &["search_gems"],
            banned_tools: &["get_item", "get_jewel", "get_passive_tree"],
            max_tool_rounds: 2,
            ..UNCONSTRAINED
        },
    ]
}
fn print_report(model: &str, results: &[CaseResult]) {
let total = results.len();
let passed = results
.iter()
.filter(|r| r.status == CaseStatus::Pass)
.count();
let warned = results
.iter()
.filter(|r| r.status == CaseStatus::Warn)
.count();
let failed = results
.iter()
.filter(|r| r.status == CaseStatus::Fail)
.count();
eprintln!();
eprintln!("=== Agent Eval Report (model: {model}) ===");
eprintln!(" Fixture: ranger-with-gear.xml");
eprintln!(" Cases: {total}");
eprintln!(
" Passed: {passed} ({:.0}%)",
passed as f64 / total as f64 * 100.0
);
eprintln!(" Warned: {warned}");
eprintln!(" Failed: {failed}");
eprintln!();
eprintln!(" --- Per-case results ---");
eprintln!();
for r in results {
let round_label = if r.tool_rounds == 1 {
"round ".to_string()
} else {
"rounds".to_string()
};
eprintln!(
" {} {:<30} {} {} {:>5} tok {:.1}s",
r.status.icon(),
r.name,
r.tool_rounds,
round_label,
r.total_tokens,
r.total_time.as_secs_f64(),
);
if !r.all_tools.is_empty() {
let tool_display: Vec<String> = r
.all_tools
.iter()
.map(|t| {
if r.banned_tools_called.contains(t) {
format!("{t}(BANNED!)")
} else if r.extra_tools_called.contains(t) {
format!("{t}(!)")
} else {
t.clone()
}
})
.collect();
eprintln!(" tools: {}", tool_display.join(", "));
}
if !r.expected_tools_missed.is_empty() {
eprintln!(
" FAIL: missing expected tools: {}",
r.expected_tools_missed.join(", ")
);
}
if !r.banned_tools_called.is_empty() {
eprintln!(
" FAIL: called banned tools: {}",
r.banned_tools_called.join(", ")
);
}
if r.rounds_over_budget {
eprintln!(
" FAIL: {} rounds > max {}",
r.tool_rounds, r.max_tool_rounds,
);
}
if !r.facts_missing.is_empty() {
eprintln!(
" FAIL: missing answer facts: {}",
r.facts_missing.join(", ")
);
}
if !r.extra_tools_called.is_empty() {
eprintln!(" WARN: extra tools: {}", r.extra_tools_called.join(", "));
}
if r.tokens_over_budget {
eprintln!(" WARN: token budget exceeded ({})", r.total_tokens);
}
eprintln!();
}
let tool_selection_correct = results
.iter()
.filter(|r| r.expected_tools_missed.is_empty() && r.banned_tools_called.is_empty())
.count();
let no_banned = results
.iter()
.filter(|r| r.banned_tools_called.is_empty())
.count();
let within_rounds = results.iter().filter(|r| !r.rounds_over_budget).count();
let facts_correct = results
.iter()
.filter(|r| r.facts_missing.is_empty())
.count();
let avg_tokens: f64 = results.iter().map(|r| r.total_tokens as f64).sum::<f64>() / total as f64;
let avg_latency: f64 = results
.iter()
.map(|r| r.total_time.as_secs_f64())
.sum::<f64>()
/ total as f64;
let unnecessary_rate: f64 = results
.iter()
.map(|r| r.extra_tools_called.len() as f64)
.sum::<f64>()
/ total as f64;
eprintln!(" --- Aggregate ---");
eprintln!();
eprintln!(
" Tool selection accuracy: {:>3.0}% ({}/{} cases all expected tools called, no banned)",
tool_selection_correct as f64 / total as f64 * 100.0,
tool_selection_correct,
total,
);
eprintln!(
" No-banned-tool rate: {:>3.0}% ({}/{} cases called no banned tool)",
no_banned as f64 / total as f64 * 100.0,
no_banned,
total,
);
eprintln!(
" Efficiency rate: {:>3.0}% ({}/{} within round budget)",
within_rounds as f64 / total as f64 * 100.0,
within_rounds,
total,
);
eprintln!(
" Answer correctness: {:>3.0}% ({}/{} all required facts present)",
facts_correct as f64 / total as f64 * 100.0,
facts_correct,
total,
);
eprintln!(" Avg total tokens: {avg_tokens:.0}");
eprintln!(" Avg latency: {avg_latency:.1}s");
eprintln!(" Unnecessary call rate: {unnecessary_rate:.1} tools/question");
}
/// Runs the eval cases against the live model and prints a report to stderr.
///
/// Environment:
/// * `OPENAI_API_KEY` - required; when unset or empty the suite prints a notice and returns.
/// * `OPENAI_MODEL` - optional model override (defaults to `gpt-4.1-nano`).
/// * `EVAL_CASE` - optional filter; only cases whose name contains it are run.
///
/// Case verdicts are reported only; this test does not assert on them.
#[tokio::test]
async fn eval_suite() {
    // Loading a .env file is best-effort; its absence is not an error.
    let _ = dotenvy::dotenv();

    let api_key = match std::env::var("OPENAI_API_KEY")
        .ok()
        .filter(|key| !key.is_empty())
    {
        Some(key) => key,
        None => {
            eprintln!("\n=== Skipping eval suite (no OPENAI_API_KEY) ===");
            return;
        }
    };
    let model = std::env::var("OPENAI_MODEL").unwrap_or_else(|_| String::from("gpt-4.1-nano"));
    let case_filter = std::env::var("EVAL_CASE").ok();

    eprintln!("\n=== Agent Eval Suite (model: {model}) ===");
    if let Some(filter) = &case_filter {
        eprintln!(" Filter: {filter}");
    }

    let parser = PobParser::new(Path::new(POB_PATH))
        .await
        .expect("PobParser::new failed");
    let parser = Arc::new(parser);
    let llm = ChatGptClient::new(&api_key, &model).expect("ChatGptClient::new failed");
    // The agent takes its own client; `llm` is kept to read the model name for the report.
    let agent = ToolAgent::new(llm.clone(), parser, None);
    let xml = fixture_xml();

    let all_cases = eval_cases();
    let selected: Vec<&EvalCase> = all_cases
        .iter()
        .filter(|case| match &case_filter {
            Some(filter) => case.name.contains(filter.as_str()),
            None => true,
        })
        .collect();
    if selected.is_empty() {
        eprintln!(" No cases match filter — exiting.");
        return;
    }

    // Cases run one at a time, in table order.
    let mut results: Vec<CaseResult> = Vec::with_capacity(selected.len());
    for case in selected.iter().copied() {
        eprintln!(" Running: {} ...", case.name);
        let trace = capture_trace(&agent, &xml, case.question).await;
        let outcome = score(case, &trace);
        eprintln!(
            " {} {} ({} rounds, {} tokens, {:.1}s)",
            outcome.status.icon(),
            outcome.name,
            outcome.tool_rounds,
            outcome.total_tokens,
            outcome.total_time.as_secs_f64(),
        );
        results.push(outcome);
    }

    print_report(llm.model(), &results);
}