// adk_bench/adapters/mod.rs

//! Task quality adapters for established benchmark suites.
//!
//! Provides thin adapter interfaces for routing established benchmark
//! suite requests through the ADK-Rust agent runtime.
//!
//! # Adapters
//!
//! - **τ²-bench** (`tau2` feature) — Implements the τ²-bench agent interface
//!   protocol, translating between its format and ADK-Rust's Event/Content model.
//! - **BFCL** (`bfcl` feature) — Loads Berkeley Function Calling Leaderboard
//!   dataset entries and scores tool call accuracy.
//!
//! # Example
//!
//! ```rust,ignore
//! use adk_bench::adapters::{TaskQualityAdapter, TaskQualityResult};
//!
//! async fn run_adapter(adapter: &dyn TaskQualityAdapter, model: &str) {
//!     let result = adapter.run(model).await.unwrap();
//!     println!("Accuracy: {:.1}%", result.accuracy * 100.0);
//! }
//! ```

use async_trait::async_trait;
use serde::{Deserialize, Serialize};

/// τ²-bench adapter; compiled only when the `tau2` feature is enabled.
#[cfg(feature = "tau2")]
pub mod tau2;

/// BFCL adapter; compiled only when the `bfcl` feature is enabled.
#[cfg(feature = "bfcl")]
pub mod bfcl;

35/// Trait for task quality benchmark adapters.
36///
37/// Implementations route requests from established benchmark suites
38/// (τ²-bench, BFCL) through the ADK-Rust agent runtime and report
39/// accuracy/quality scores.
40#[async_trait]
41pub trait TaskQualityAdapter: Send + Sync {
42    /// Returns the adapter name (e.g., "tau2", "bfcl").
43    fn name(&self) -> &str;
44
45    /// Runs the task quality suite and returns results.
46    async fn run(&self, model: &str) -> crate::Result<TaskQualityResult>;
47}
48
49/// Aggregated results from running a task quality benchmark suite.
50#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
51#[serde(rename_all = "camelCase")]
52pub struct TaskQualityResult {
53    /// Name of the adapter that produced these results.
54    pub adapter_name: String,
55    /// Model used for the benchmark run.
56    pub model: String,
57    /// Total number of test cases executed.
58    pub total_cases: usize,
59    /// Number of test cases that passed.
60    pub passed_cases: usize,
61    /// Accuracy score (passed_cases / total_cases).
62    pub accuracy: f64,
63    /// Per-case results.
64    pub cases: Vec<CaseResult>,
65}
66
67/// Result from a single test case in a task quality benchmark.
68#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
69#[serde(rename_all = "camelCase")]
70pub struct CaseResult {
71    /// Unique identifier for the test case.
72    pub case_id: String,
73    /// Whether the test case passed.
74    pub passed: bool,
75    /// Score for the test case (0.0 to 1.0).
76    pub score: f64,
77    /// Optional details about the result (e.g., failure reason).
78    #[serde(skip_serializing_if = "Option::is_none")]
79    pub details: Option<String>,
80}