// agentforge_core/benchmark.rs
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
4
5#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
7#[serde(rename_all = "snake_case")]
8pub enum BenchmarkSuite {
9 Gaia,
10 AgentBench,
11 WebArena,
12}
13
14impl std::fmt::Display for BenchmarkSuite {
15 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
16 match self {
17 BenchmarkSuite::Gaia => write!(f, "gaia"),
18 BenchmarkSuite::AgentBench => write!(f, "agentbench"),
19 BenchmarkSuite::WebArena => write!(f, "webarena"),
20 }
21 }
22}
23
24#[derive(Debug, Clone, Serialize, Deserialize)]
26pub struct BenchmarkTask {
27 pub id: String,
28 pub suite: BenchmarkSuite,
29 pub difficulty_level: Option<u8>,
31 pub question: String,
33 pub expected_answer: Option<String>,
35 pub context_files: Vec<String>,
37}
38
39#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct BenchmarkResult {
42 pub task_id: String,
43 pub suite: BenchmarkSuite,
44 pub agent_answer: Option<String>,
45 pub correct: bool,
46 pub score: f64,
47 pub latency_ms: u64,
48 pub token_cost_usd: f64,
49}
50
51#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct BenchmarkRun {
54 pub id: Uuid,
55 pub agent_id: Uuid,
56 pub suite: BenchmarkSuite,
57 pub total_tasks: u32,
58 pub correct: u32,
59 pub accuracy: f64,
60 pub percentile_rank: Option<f64>,
62 pub results: Vec<BenchmarkResult>,
63 pub started_at: DateTime<Utc>,
64 pub completed_at: Option<DateTime<Utc>>,
65}
66
67#[derive(Debug, Clone)]
69pub struct BenchmarkBaseline {
70 pub suite: BenchmarkSuite,
71 pub model_name: String,
73 pub accuracy: f64,
74}
75
76pub fn published_baselines() -> Vec<BenchmarkBaseline> {
78 vec![
79 BenchmarkBaseline {
80 suite: BenchmarkSuite::Gaia,
81 model_name: "GPT-4o (OpenAI, 2025)".into(),
82 accuracy: 0.53,
83 },
84 BenchmarkBaseline {
85 suite: BenchmarkSuite::Gaia,
86 model_name: "Claude 3.5 Sonnet (Anthropic, 2025)".into(),
87 accuracy: 0.49,
88 },
89 BenchmarkBaseline {
90 suite: BenchmarkSuite::AgentBench,
91 model_name: "GPT-4 (OpenAI, 2024)".into(),
92 accuracy: 0.45,
93 },
94 BenchmarkBaseline {
95 suite: BenchmarkSuite::WebArena,
96 model_name: "GPT-4o (OpenAI, 2025)".into(),
97 accuracy: 0.39,
98 },
99 ]
100}
101
#[cfg(test)]
mod tests {
    use super::*;

    /// Every suite renders to its expected display label.
    #[test]
    fn suite_display() {
        let expected = [
            (BenchmarkSuite::Gaia, "gaia"),
            (BenchmarkSuite::AgentBench, "agentbench"),
            (BenchmarkSuite::WebArena, "webarena"),
        ];
        for (suite, label) in expected.iter() {
            assert_eq!(suite.to_string(), *label);
        }
    }

    /// The baseline table must contain at least one entry.
    #[test]
    fn baselines_non_empty() {
        let baselines = published_baselines();
        assert!(!baselines.is_empty());
    }
}