adk_bench/adapters/mod.rs
//! Task quality adapters for established benchmark suites.
//!
//! Provides thin adapter interfaces for routing established benchmark
//! suite requests through the ADK-Rust agent runtime.
//!
//! # Adapters
//!
//! - **τ²-bench** (`tau2` feature) — Implements the τ²-bench agent interface
//!   protocol, translating between its format and ADK-Rust's Event/Content model.
//! - **BFCL** (`bfcl` feature) — Loads Berkeley Function Calling Leaderboard
//!   dataset entries and scores tool-call accuracy.
//!
//! # Example
//!
//! ```rust,ignore
//! use adk_bench::adapters::{TaskQualityAdapter, TaskQualityResult};
//!
//! async fn run_adapter(adapter: &dyn TaskQualityAdapter, model: &str) {
//!     let result = adapter.run(model).await.unwrap();
//!     println!("Accuracy: {:.1}%", result.accuracy * 100.0);
//! }
//! ```
23
24use async_trait::async_trait;
25use serde::{Deserialize, Serialize};
26
/// Adapter for the τ²-bench suite.
///
/// Only compiled when the `tau2` Cargo feature is enabled.
#[cfg(feature = "tau2")]
pub mod tau2;

/// Adapter for the BFCL suite.
///
/// Only compiled when the `bfcl` Cargo feature is enabled.
#[cfg(feature = "bfcl")]
pub mod bfcl;
34
35/// Trait for task quality benchmark adapters.
36///
37/// Implementations route requests from established benchmark suites
38/// (τ²-bench, BFCL) through the ADK-Rust agent runtime and report
39/// accuracy/quality scores.
40#[async_trait]
41pub trait TaskQualityAdapter: Send + Sync {
42 /// Returns the adapter name (e.g., "tau2", "bfcl").
43 fn name(&self) -> &str;
44
45 /// Runs the task quality suite and returns results.
46 async fn run(&self, model: &str) -> crate::Result<TaskQualityResult>;
47}
48
49/// Aggregated results from running a task quality benchmark suite.
50#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
51#[serde(rename_all = "camelCase")]
52pub struct TaskQualityResult {
53 /// Name of the adapter that produced these results.
54 pub adapter_name: String,
55 /// Model used for the benchmark run.
56 pub model: String,
57 /// Total number of test cases executed.
58 pub total_cases: usize,
59 /// Number of test cases that passed.
60 pub passed_cases: usize,
61 /// Accuracy score (passed_cases / total_cases).
62 pub accuracy: f64,
63 /// Per-case results.
64 pub cases: Vec<CaseResult>,
65}
66
67/// Result from a single test case in a task quality benchmark.
68#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
69#[serde(rename_all = "camelCase")]
70pub struct CaseResult {
71 /// Unique identifier for the test case.
72 pub case_id: String,
73 /// Whether the test case passed.
74 pub passed: bool,
75 /// Score for the test case (0.0 to 1.0).
76 pub score: f64,
77 /// Optional details about the result (e.g., failure reason).
78 #[serde(skip_serializing_if = "Option::is_none")]
79 pub details: Option<String>,
80}