use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use adk_agent::LlmAgentBuilder;
use adk_core::{
Content, Llm,
identity::{SessionId, UserId},
};
use adk_eval::{BaselineStore, CostTracker};
use adk_model::gemini::GeminiModel;
use adk_runner::Runner;
use adk_session::InMemorySessionService;
use adk_session::SessionService;
use adk_tool::FunctionTool;
use futures::StreamExt;
use tokio::task::JoinSet;
use crate::config::BenchConfig;
use crate::error::{BenchError, Result};
use crate::instrumented_llm::InstrumentedLlm;
use crate::metrics::{
BenchmarkResult, ConcurrencyLevel, DurationStats, RunMetadata, ThroughputMetrics, compute_stats,
};
use crate::workload::{
Workload, builtin_workloads, load_workload, multi_agent_delegation_workload,
};
/// Default concurrency levels for a sweep; used when the configured sweep list is empty.
const SWEEP_LEVELS: &[usize] = &[1, 2, 4, 8, 16, 32, 64];
/// Coefficient-of-variation threshold (0.20 = 20%) above which `emit_cv_warning` logs a warning.
const CV_WARNING_THRESHOLD: f64 = 0.20;
/// One metric whose current value degraded beyond the configured tolerance
/// relative to the stored baseline (produced by `BenchRunner::check_regression`).
#[derive(Debug, Clone)]
pub struct RegressionReport {
    /// Metric part of the baseline case id (the text after `::`).
    pub metric_name: String,
    /// Workload part of the baseline case id (the text before `::`).
    pub workload_name: String,
    /// Value recorded in the baseline.
    pub baseline_value: f64,
    /// Value computed from the current results.
    pub current_value: f64,
    /// Relative change: `(current_value - baseline_value) / baseline_value`.
    pub degradation: f64,
}
/// Drives benchmark execution: resolves workloads, gates on estimated cost,
/// runs them (sequentially, concurrently, or as a concurrency sweep) and
/// compares results against a stored baseline.
pub struct BenchRunner {
    /// Run configuration (runs, warm-up, concurrency, cost limits, ...).
    config: BenchConfig,
    /// Store used to save and load baseline metrics.
    baseline_store: BaselineStore,
    /// Used to price estimated token usage per model.
    cost_tracker: CostTracker,
}
impl BenchRunner {
/// Creates a runner for `config`.
///
/// The baseline store is rooted at `config.baseline_path`; the cost tracker
/// is created with its defaults.
pub fn new(config: BenchConfig) -> Self {
    Self {
        // Built before `config` is moved into the struct below.
        baseline_store: BaselineStore::new(&config.baseline_path),
        cost_tracker: CostTracker::new(),
        config,
    }
}
/// Runs every resolved workload and returns one result per workload.
///
/// Before executing anything the estimated cost is checked:
/// * in dry-run mode the estimate is logged and an empty vector is returned;
/// * if `max_cost_usd` is set and the estimate exceeds it, the run is refused;
/// * if the estimate exceeds $1.00 and `confirm_cost` is not set, the run is refused.
///
/// # Errors
/// Returns an error when workload resolution fails, when a cost gate refuses
/// the run, or when any workload execution fails.
pub async fn run(&self) -> Result<Vec<BenchmarkResult>> {
    let workloads = self.resolve_workloads()?;
    let estimated_cost = self.estimate_cost(&workloads);

    if self.config.dry_run {
        tracing::info!(
            estimated_cost_usd = estimated_cost,
            total_workloads = workloads.len(),
            runs = self.config.runs,
            concurrency = self.config.concurrency,
            "dry-run: displaying estimated cost without executing"
        );
        return Ok(Vec::new());
    }

    // Hard budget limit supplied by the user.
    match self.config.max_cost_usd {
        Some(max_cost) if estimated_cost > max_cost => {
            return Err(BenchError::Baseline(format!(
                "estimated cost ${estimated_cost:.4} exceeds --max-cost-usd limit ${max_cost:.4}. \
                 Reduce runs, concurrency, or workloads to stay within budget."
            )));
        }
        _ => {}
    }

    // Soft limit: anything above one dollar needs explicit confirmation.
    let above_soft_limit = estimated_cost > 1.0;
    if above_soft_limit && !self.config.confirm_cost {
        tracing::warn!(
            estimated_cost_usd = estimated_cost,
            "estimated cost exceeds $1.00; pass --confirm-cost to proceed"
        );
        return Err(BenchError::Baseline(format!(
            "estimated cost ${estimated_cost:.4} exceeds $1.00. \
             Pass --confirm-cost to acknowledge, or use --max-cost-usd to set a limit."
        )));
    }

    let mut collected = Vec::new();
    for workload in &workloads {
        // A configured sweep takes priority over a fixed concurrency level.
        let outcome = match &self.config.concurrency_sweep {
            Some(levels) => self.run_workload_with_sweep(workload, levels).await?,
            None if self.config.concurrency > 1 => {
                self.run_workload_concurrent(workload, self.config.concurrency).await?
            }
            None => self.run_workload_sequential(workload).await?,
        };
        collected.push(outcome);
    }
    Ok(collected)
}
/// Persists `results` as the new baseline under the `"adk-bench"` key.
///
/// # Errors
/// Returns `BenchError::Baseline` if the store fails to save.
pub fn save_baseline(&self, results: &[BenchmarkResult]) -> Result<()> {
    let baseline_metrics = self.results_to_baseline_metrics(results);
    match self.baseline_store.save("adk-bench", &baseline_metrics) {
        Ok(_) => Ok(()),
        Err(e) => Err(BenchError::Baseline(format!("failed to save baseline: {e}"))),
    }
}
/// Compares `results` against the stored baseline and reports every metric
/// whose relative increase exceeds `config.tolerance`.
///
/// Only metrics and case ids present in both the baseline and the current
/// results are compared. A missing baseline is not an error: it is logged and
/// an empty report list is returned. Baseline values that are not strictly
/// positive are treated as "no degradation", since a relative change cannot
/// be computed for them.
///
/// # Errors
/// Returns `BenchError::Baseline` if the baseline store fails to load.
pub fn check_regression(&self, results: &[BenchmarkResult]) -> Result<Vec<RegressionReport>> {
    let current_metrics = self.results_to_baseline_metrics(results);
    let loaded = self
        .baseline_store
        .load()
        .map_err(|e| BenchError::Baseline(format!("regression check failed: {e}")))?;
    let Some(baseline) = loaded else {
        tracing::info!("no baseline file found, skipping regression check");
        return Ok(Vec::new());
    };

    let mut reports = Vec::new();
    for (metric_name, baseline_cases) in &baseline.metrics {
        let Some(current_cases) = current_metrics.get(metric_name) else {
            continue;
        };
        for (case_id, &baseline_value) in baseline_cases {
            let Some(&current_value) = current_cases.get(case_id) else {
                continue;
            };
            let degradation = if baseline_value > 0.0 {
                (current_value - baseline_value) / baseline_value
            } else {
                0.0
            };
            // Strictly greater: a change exactly at the tolerance is accepted.
            if degradation <= self.config.tolerance {
                continue;
            }
            // Case ids have the form `<workload>::<metric>`; fall back to the
            // raw names when the separator is missing.
            let (workload_name, parsed_metric_name) = case_id
                .split_once("::")
                .map(|(w, m)| (w.to_string(), m.to_string()))
                .unwrap_or_else(|| (metric_name.clone(), case_id.clone()));
            reports.push(RegressionReport {
                metric_name: parsed_metric_name,
                workload_name,
                baseline_value,
                current_value,
                degradation,
            });
        }
    }
    Ok(reports)
}
/// Determines which workloads to run.
///
/// * No `config.workload`: all built-in workloads (plus the multi-agent
///   delegation workload when `config.experimental` is set).
/// * `config.workload` names an existing path: that file is loaded.
/// * Otherwise the value is matched against the names of the available
///   built-in workloads.
///
/// # Errors
/// Returns `BenchError::WorkloadNotFound` when the value is neither an
/// existing path nor a known workload name, or the loader's error when the
/// file cannot be loaded.
fn resolve_workloads(&self) -> Result<Vec<Workload>> {
    // Built-ins, optionally extended with the experimental workload.
    let available = || {
        let mut list = builtin_workloads();
        if self.config.experimental {
            list.push(multi_agent_delegation_workload());
        }
        list
    };

    let Some(requested) = self.config.workload.as_ref() else {
        return Ok(available());
    };

    // A path on disk wins over a built-in of the same name.
    let file = std::path::Path::new(requested);
    if file.exists() {
        return Ok(vec![load_workload(file)?]);
    }

    available()
        .into_iter()
        .find(|w| w.name == *requested)
        .map(|w| vec![w])
        .ok_or_else(|| BenchError::WorkloadNotFound { path: requested.clone() })
}
fn estimate_cost(&self, workloads: &[Workload]) -> f64 {
let mut total_cost = 0.0;
const ESTIMATED_INPUT_TOKENS_PER_TURN: u64 = 500;
const ESTIMATED_OUTPUT_TOKENS_PER_TURN: u64 = 200;
let concurrency_multiplier = if let Some(ref levels) = self.config.concurrency_sweep {
levels.iter().sum::<usize>()
} else {
self.config.concurrency
};
for workload in workloads {
let turns = workload.expected_turns as u64;
let total_iterations =
(self.config.runs + self.config.warmup) as u64 * concurrency_multiplier as u64;
let prompt_tokens = turns * ESTIMATED_INPUT_TOKENS_PER_TURN * total_iterations;
let completion_tokens = turns * ESTIMATED_OUTPUT_TOKENS_PER_TURN * total_iterations;
if let Some(cost) =
self.cost_tracker.compute_cost(&workload.model, prompt_tokens, completion_tokens)
{
total_cost += cost;
}
}
total_cost
}
/// Runs `workload` one execution at a time.
///
/// Performs `config.warmup` discarded executions followed by `config.runs`
/// measured executions, then aggregates cold-start and per-turn overhead
/// durations into statistics. Throughput is not reported in this mode.
///
/// # Errors
/// Propagates the first execution failure.
async fn run_workload_sequential(&self, workload: &Workload) -> Result<BenchmarkResult> {
    tracing::info!(
        workload = workload.name,
        warmup = self.config.warmup,
        "starting warm-up phase"
    );
    for iteration in 0..self.config.warmup {
        tracing::debug!(workload = workload.name, iteration, "warm-up iteration");
        // Warm-up measurements are intentionally discarded.
        self.execute_single_workload(workload).await?;
    }

    tracing::info!(
        workload = workload.name,
        runs = self.config.runs,
        "starting measurement phase"
    );
    let mut cold_starts = Vec::new();
    let mut turn_overheads = Vec::new();
    for iteration in 0..self.config.runs {
        tracing::debug!(workload = workload.name, iteration, "measurement iteration");
        let (cold_start, overheads) = self.execute_single_workload(workload).await?;
        cold_starts.push(cold_start);
        turn_overheads.extend(overheads);
    }

    let cold_start = compute_stats(&cold_starts);
    let agent_loop_overhead = compute_stats(&turn_overheads);
    self.emit_cv_warning(&agent_loop_overhead, &workload.name);

    Ok(BenchmarkResult {
        schema_version: 1,
        workload_name: workload.name.clone(),
        model: workload.model.clone(),
        metadata: self.build_run_metadata(),
        cold_start,
        agent_loop_overhead,
        tool_invocation: None,
        throughput: None,
        memory: None,
        token_overhead: None,
        reproducibility_rate: None,
        iterations: self.config.runs,
    })
}
/// Runs `workload` in batches of `concurrency` parallel executions.
///
/// After `config.warmup` discarded batches, `config.runs` batches are
/// measured. Cold-start and overhead samples from every task are pooled, and
/// a single throughput level is reported as
/// `concurrency / mean batch completion time (seconds)`.
///
/// # Errors
/// Propagates the first batch failure.
async fn run_workload_concurrent(
    &self,
    workload: &Workload,
    concurrency: usize,
) -> Result<BenchmarkResult> {
    tracing::info!(
        workload = workload.name,
        warmup = self.config.warmup,
        concurrency,
        "starting concurrent warm-up phase"
    );
    for _ in 0..self.config.warmup {
        self.execute_concurrent_batch(workload, concurrency).await?;
    }

    tracing::info!(
        workload = workload.name,
        runs = self.config.runs,
        concurrency,
        "starting concurrent measurement phase"
    );
    let mut cold_starts = Vec::new();
    let mut turn_overheads = Vec::new();
    let mut batch_times = Vec::new();
    for _ in 0..self.config.runs {
        let started = Instant::now();
        let batch = self.execute_concurrent_batch(workload, concurrency).await?;
        let elapsed = started.elapsed();
        for (cold_start, overheads) in batch {
            cold_starts.push(cold_start);
            turn_overheads.extend(overheads);
        }
        batch_times.push(elapsed);
    }

    let cold_start = compute_stats(&cold_starts);
    let agent_loop_overhead = compute_stats(&turn_overheads);
    let completion_time = compute_stats(&batch_times);
    self.emit_cv_warning(&agent_loop_overhead, &workload.name);

    // With no measured batches fall back to one second to avoid dividing by zero.
    let mean_completion_secs = match batch_times.len() {
        0 => 1.0,
        n => batch_times.iter().map(|d| d.as_secs_f64()).sum::<f64>() / n as f64,
    };
    let agents_per_second = concurrency as f64 / mean_completion_secs;

    Ok(BenchmarkResult {
        schema_version: 1,
        workload_name: workload.name.clone(),
        model: workload.model.clone(),
        metadata: self.build_run_metadata(),
        cold_start,
        agent_loop_overhead,
        tool_invocation: None,
        throughput: Some(ThroughputMetrics {
            levels: vec![ConcurrencyLevel { concurrency, agents_per_second, completion_time }],
        }),
        memory: None,
        token_overhead: None,
        reproducibility_rate: None,
        iterations: self.config.runs,
    })
}
/// Runs `workload` at each concurrency level in `sweep_levels` and reports
/// one throughput entry per level.
///
/// An empty `sweep_levels` falls back to `SWEEP_LEVELS`. Warm-up batches run
/// at the smallest positive level in the list, regardless of the order in
/// which the levels were supplied. Cold-start and overhead samples are pooled
/// across all levels; throughput for a level is
/// `level / mean batch completion time (seconds)`.
///
/// # Errors
/// Propagates the first batch failure.
async fn run_workload_with_sweep(
    &self,
    workload: &Workload,
    sweep_levels: &[usize],
) -> Result<BenchmarkResult> {
    let levels_to_test =
        if sweep_levels.is_empty() { SWEEP_LEVELS.to_vec() } else { sweep_levels.to_vec() };
    tracing::info!(
        workload = workload.name,
        levels = ?levels_to_test,
        "starting concurrency sweep"
    );

    // Warm up at the true minimum, not merely the first entry: the list may
    // be unsorted, and a zero level would make the warm-up a no-op.
    let min_level = levels_to_test.iter().copied().filter(|&level| level > 0).min().unwrap_or(1);
    for _ in 0..self.config.warmup {
        self.execute_concurrent_batch(workload, min_level).await?;
    }

    let mut all_cold_starts = Vec::new();
    let mut all_overheads = Vec::new();
    let mut throughput_levels = Vec::with_capacity(levels_to_test.len());
    for &level in &levels_to_test {
        tracing::info!(
            workload = workload.name,
            concurrency = level,
            "sweeping concurrency level"
        );
        let mut level_completion_times = Vec::new();
        for _ in 0..self.config.runs {
            let batch_start = Instant::now();
            let batch_results = self.execute_concurrent_batch(workload, level).await?;
            let batch_elapsed = batch_start.elapsed();
            for (cold_start, overheads) in batch_results {
                all_cold_starts.push(cold_start);
                all_overheads.extend(overheads);
            }
            level_completion_times.push(batch_elapsed);
        }

        let completion_stats = compute_stats(&level_completion_times);
        // With no measured batches fall back to one second to avoid dividing by zero.
        let mean_secs = if level_completion_times.is_empty() {
            1.0
        } else {
            level_completion_times.iter().map(|d| d.as_secs_f64()).sum::<f64>()
                / level_completion_times.len() as f64
        };
        throughput_levels.push(ConcurrencyLevel {
            concurrency: level,
            agents_per_second: level as f64 / mean_secs,
            completion_time: completion_stats,
        });
    }

    let cold_start_stats = compute_stats(&all_cold_starts);
    let overhead_stats = compute_stats(&all_overheads);
    self.emit_cv_warning(&overhead_stats, &workload.name);

    Ok(BenchmarkResult {
        schema_version: 1,
        workload_name: workload.name.clone(),
        model: workload.model.clone(),
        metadata: self.build_run_metadata(),
        cold_start: cold_start_stats,
        agent_loop_overhead: overhead_stats,
        tool_invocation: None,
        throughput: Some(ThroughputMetrics { levels: throughput_levels }),
        memory: None,
        token_overhead: None,
        reproducibility_rate: None,
        iterations: self.config.runs,
    })
}
/// Spawns `concurrency` independent executions of `workload` and waits for
/// all of them.
///
/// Returns one `(cold_start, per_turn_overheads)` pair per task, in
/// completion order.
///
/// # Errors
/// Returns on the first task that fails to join or whose execution fails.
async fn execute_concurrent_batch(
    &self,
    workload: &Workload,
    concurrency: usize,
) -> Result<Vec<(Duration, Vec<Duration>)>> {
    let mut tasks = JoinSet::new();
    for _ in 0..concurrency {
        // Each task owns its own copies so the future is `'static`.
        let task_workload = workload.clone();
        let task_model = self.config.model.clone();
        tasks.spawn(async move { execute_workload_real(&task_workload, &task_model).await });
    }

    let mut measurements = Vec::with_capacity(concurrency);
    while let Some(joined) = tasks.join_next().await {
        match joined {
            Ok(outcome) => measurements.push(outcome?),
            Err(e) => return Err(BenchError::Llm(format!("task join failed: {e}"))),
        }
    }
    Ok(measurements)
}
/// Executes `workload` once against the configured model (`config.model`).
///
/// Delegates to `execute_workload_real` and returns its
/// `(cold_start, per_turn_overheads)` pair unchanged.
async fn execute_single_workload(
    &self,
    workload: &Workload,
) -> Result<(Duration, Vec<Duration>)> {
    execute_workload_real(workload, &self.config.model).await
}
/// Logs a warning when the overhead measurements for `workload_name` look
/// unstable, i.e. there is more than one sample and the coefficient of
/// variation exceeds `CV_WARNING_THRESHOLD`.
fn emit_cv_warning(&self, stats: &DurationStats, workload_name: &str) {
    let unstable =
        stats.count > 1 && stats.coefficient_of_variation > CV_WARNING_THRESHOLD;
    if !unstable {
        return;
    }
    tracing::warn!(
        workload = workload_name,
        cv = format!("{:.1}%", stats.coefficient_of_variation * 100.0),
        threshold = "20%",
        mean_us = stats.mean_us,
        std_dev_us = stats.std_dev_us,
        "Agent_Loop_Overhead CV exceeds 20%, measurements may be unstable. \
         Consider increasing iteration count or reducing system load."
    );
}
/// Captures the environment of the current run: an RFC 3339 UTC timestamp,
/// the crate version, the Rust version string, and the target OS and
/// architecture.
fn build_run_metadata(&self) -> RunMetadata {
    let timestamp = chrono::Utc::now().to_rfc3339();
    let adk_version = env!("CARGO_PKG_VERSION").to_string();
    let rust_version = rustc_version();
    let os = std::env::consts::OS.to_string();
    let arch = std::env::consts::ARCH.to_string();
    RunMetadata { timestamp, adk_version, rust_version, os, arch }
}
/// Flattens `results` into the baseline layout: a single `"timing"` metric
/// whose cases are keyed `<workload>::<stat>` and hold mean and p95 values
/// (in microseconds) for cold start and agent-loop overhead.
///
/// Returns an empty map when `results` is empty.
fn results_to_baseline_metrics(
    &self,
    results: &[BenchmarkResult],
) -> HashMap<String, HashMap<String, f64>> {
    let mut metrics: HashMap<String, HashMap<String, f64>> = HashMap::new();
    for result in results {
        let prefix = &result.workload_name;
        // Looked up per result so that no "timing" entry exists for empty input.
        let timing = metrics.entry("timing".to_string()).or_default();
        timing.extend([
            (format!("{prefix}::cold_start_mean_us"), result.cold_start.mean_us as f64),
            (format!("{prefix}::cold_start_p95_us"), result.cold_start.p95_us as f64),
            (format!("{prefix}::overhead_mean_us"), result.agent_loop_overhead.mean_us as f64),
            (format!("{prefix}::overhead_p95_us"), result.agent_loop_overhead.p95_us as f64),
        ]);
    }
    metrics
}
}
/// Creates a Gemini-backed LLM for `model_name`, reading the API key from the
/// `GOOGLE_API_KEY` environment variable.
///
/// # Errors
/// Returns `BenchError::Llm` when the environment variable is not set or the
/// model cannot be constructed.
fn create_llm(model_name: &str) -> Result<Arc<dyn Llm>> {
    let Ok(api_key) = std::env::var("GOOGLE_API_KEY") else {
        return Err(BenchError::Llm(
            "GOOGLE_API_KEY environment variable not set. \
             Set it to your Gemini API key to run benchmarks."
                .to_string(),
        ));
    };
    let gemini = match GeminiModel::new(api_key, model_name) {
        Ok(model) => model,
        Err(e) => {
            return Err(BenchError::Llm(format!(
                "failed to create Gemini model '{model_name}': {e}"
            )));
        }
    };
    let llm: Arc<dyn Llm> = Arc::new(gemini);
    Ok(llm)
}
/// Builds one mock tool per tool definition in `workload`.
///
/// Each tool optionally sleeps for the definition's simulated latency and
/// then returns the definition's fixed response, or `{"status": "success"}`
/// when none is configured. Tools are marked read-only and concurrency-safe.
fn create_tools_from_workload(workload: &Workload) -> Vec<Arc<dyn adk_core::Tool>> {
    let mut tools: Vec<Arc<dyn adk_core::Tool>> = Vec::new();
    for (name, def) in workload.agent.tools.iter() {
        let canned = def.fixed_response.clone();
        let delay_ms = def.simulated_latency_ms;
        let tool = FunctionTool::new(name.clone(), def.description.clone(), move |_ctx, _args| {
            // The handler may be called repeatedly, so hand each call its own copy.
            let reply = canned.clone();
            async move {
                if delay_ms > 0 {
                    tokio::time::sleep(Duration::from_millis(delay_ms)).await;
                }
                Ok(reply.unwrap_or_else(|| serde_json::json!({"status": "success"})))
            }
        })
        .with_read_only(true)
        .with_concurrency_safe(true);
        tools.push(Arc::new(tool));
    }
    tools
}
/// Executes `workload` once, end to end, against the model `model_name` and
/// measures framework timings.
///
/// Returns `(cold_start, overheads)` where:
/// * `cold_start` is the time from entering this function until the first
///   recorded LLM request was sent (or the total elapsed time if no LLM call
///   was recorded);
/// * `overheads` holds one entry per recorded LLM call (at least one), each
///   equal to the non-LLM time of the run divided evenly across the calls.
///
/// Errors yielded by the event stream are logged and do not fail the run.
///
/// # Errors
/// Returns `BenchError::Llm` when the model, agent, session, runner or the
/// identifiers cannot be created, or when starting the run fails.
async fn execute_workload_real(
    workload: &Workload,
    model_name: &str,
) -> Result<(Duration, Vec<Duration>)> {
    // Cold start is measured from here, so it includes all setup below.
    let run_start = Instant::now();
    let inner_llm = create_llm(model_name)?;
    // Wrap the model so every LLM call is recorded with its timing.
    let instrumented = Arc::new(InstrumentedLlm::new(inner_llm));
    let tools = create_tools_from_workload(workload);
    let mut agent_builder = LlmAgentBuilder::new(&workload.name)
        .model(instrumented.clone() as Arc<dyn Llm>)
        .instruction(&workload.agent.instructions);
    for tool in tools {
        agent_builder = agent_builder.tool(tool);
    }
    let agent = agent_builder
        .build()
        .map_err(|e| BenchError::Llm(format!("failed to build agent: {e}")))?;
    // A fresh in-memory session service is created for every execution.
    let session_service = Arc::new(InMemorySessionService::new());
    let app_name = format!("bench-{}", workload.name);
    let session_id_str = format!("bench-{}", uuid_v4());
    session_service
        .create(adk_session::CreateRequest {
            app_name: app_name.clone(),
            user_id: "bench-user".to_string(),
            session_id: Some(session_id_str.clone()),
            state: HashMap::new(),
        })
        .await
        .map_err(|e| BenchError::Llm(format!("failed to create session: {e}")))?;
    let runner = Runner::builder()
        .app_name(app_name)
        .agent(Arc::new(agent))
        .session_service(session_service)
        .build()
        .map_err(|e| BenchError::Llm(format!("failed to create runner: {e}")))?;
    let user_content = Content::new("user").with_text(&workload.agent.user_message);
    let user_id = UserId::try_from("bench-user")
        .map_err(|e| BenchError::Llm(format!("invalid user id: {e}")))?;
    let session_id = SessionId::try_from(session_id_str.as_str())
        .map_err(|e| BenchError::Llm(format!("invalid session id: {e}")))?;
    // Turn time covers starting the run and draining its whole event stream.
    let turn_start = Instant::now();
    let mut event_stream = runner
        .run(user_id, session_id, user_content)
        .await
        .map_err(|e| BenchError::Llm(format!("agent run failed: {e}")))?;
    // Drain the stream; event contents are not needed, only the elapsed time.
    while let Some(event_result) = event_stream.next().await {
        match event_result {
            Ok(_event) => {
            }
            Err(e) => {
                // Stream errors are logged only; the measurement continues.
                tracing::warn!(error = %e, "event stream error during benchmark");
            }
        }
    }
    let total_turn_time = turn_start.elapsed();
    let records = instrumented.records().await;
    let cold_start = if let Some(first_record) = records.first() {
        first_record.request_sent.duration_since(run_start)
    } else {
        run_start.elapsed()
    };
    // Overhead = wall-clock turn time minus the time spent in LLM round trips.
    let total_llm_time: Duration = records.iter().map(|r| r.round_trip).sum();
    let overhead = total_turn_time.saturating_sub(total_llm_time);
    // At least one "turn" so the division below is always defined.
    let num_turns = records.len().max(1);
    let per_turn_overhead = overhead / num_turns as u32;
    // The same averaged value is reported once per turn.
    let overheads: Vec<Duration> = (0..num_turns).map(|_| per_turn_overhead).collect();
    tracing::debug!(
        workload = workload.name,
        cold_start_us = cold_start.as_micros(),
        total_turn_ms = total_turn_time.as_millis(),
        llm_calls = records.len(),
        total_llm_ms = total_llm_time.as_millis(),
        overhead_us = overhead.as_micros(),
        "workload execution complete"
    );
    Ok((cold_start, overheads))
}
/// Returns a 32-character lowercase hexadecimal identifier that is unique
/// within this process.
///
/// Despite the name this is not an RFC 4122 version-4 UUID. The upper half
/// is derived from the wall-clock time in nanoseconds since the Unix epoch
/// and the lower half from a process-wide counter, so identifiers stay
/// distinct even when concurrent tasks observe the same clock reading.
fn uuid_v4() -> String {
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::SystemTime;

    // Monotonic per-process sequence; guarantees uniqueness on coarse clocks.
    static SEQUENCE: AtomicU64 = AtomicU64::new(0);

    let nanos =
        SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap_or_default().as_nanos();
    let sequence = SEQUENCE.fetch_add(1, Ordering::Relaxed);
    // Timestamp in the high 64 bits, sequence number in the low 64 bits.
    let id = (nanos << 64) | u128::from(sequence);
    format!("{id:032x}")
}
/// Returns the Rust version string recorded at compile time: the
/// `RUSTC_VERSION` environment variable if it was set during the build,
/// otherwise the crate's declared `rust-version` (`CARGO_PKG_RUST_VERSION`).
fn rustc_version() -> String {
    match option_env!("RUSTC_VERSION") {
        Some(version) => version.to_string(),
        None => env!("CARGO_PKG_RUST_VERSION").to_string(),
    }
}
#[cfg(test)]
mod tests {
use super::*;
fn test_config() -> BenchConfig {
BenchConfig { runs: 3, warmup: 1, concurrency: 1, ..Default::default() }
}
// The runner keeps the configuration it was constructed with.
#[tokio::test]
async fn test_bench_runner_new() {
    let runner = BenchRunner::new(test_config());
    let stored = &runner.config;
    assert_eq!(stored.runs, 3);
    assert_eq!(stored.warmup, 1);
}
// Without an explicit workload, all three built-in workloads are selected.
#[tokio::test]
async fn test_resolve_workloads_all_builtin() {
    let runner = BenchRunner::new(test_config());
    let resolved = runner.resolve_workloads().unwrap();
    assert_eq!(resolved.len(), 3);
}
// The experimental flag adds one extra workload to the built-ins.
#[tokio::test]
async fn test_resolve_workloads_with_experimental() {
    let runner = BenchRunner::new(BenchConfig { experimental: true, ..test_config() });
    let resolved = runner.resolve_workloads().unwrap();
    assert_eq!(resolved.len(), 4);
}
// A built-in workload can be selected by name.
#[tokio::test]
async fn test_resolve_workloads_specific_builtin() {
    let requested = "simple_tool_call";
    let runner = BenchRunner::new(BenchConfig {
        workload: Some(requested.to_string()),
        ..test_config()
    });
    let resolved = runner.resolve_workloads().unwrap();
    assert_eq!(resolved.len(), 1);
    assert_eq!(resolved[0].name, "simple_tool_call");
}
// A name that is neither a file nor a built-in workload is rejected.
#[tokio::test]
async fn test_resolve_workloads_not_found() {
    let runner = BenchRunner::new(BenchConfig {
        workload: Some("nonexistent_workload".to_string()),
        ..test_config()
    });
    assert!(runner.resolve_workloads().is_err());
}
// Dry-run mode only estimates cost and produces no results.
#[tokio::test]
async fn test_dry_run_returns_empty() {
    let runner = BenchRunner::new(BenchConfig { dry_run: true, ..test_config() });
    let outcome = runner.run().await.unwrap();
    assert!(outcome.is_empty());
}
// A very low --max-cost-usd limit combined with many runs aborts the run.
#[tokio::test]
async fn test_max_cost_usd_abort() {
    let runner = BenchRunner::new(BenchConfig {
        runs: 100,
        max_cost_usd: Some(0.0001),
        ..test_config()
    });
    let outcome = runner.run().await;
    assert!(outcome.is_err());
}
// Ignored by default: executing a workload needs GOOGLE_API_KEY and calls the real model.
#[tokio::test]
#[ignore]
async fn test_sequential_run() {
    let runner = BenchRunner::new(BenchConfig {
        workload: Some("simple_tool_call".to_string()),
        runs: 2,
        warmup: 1,
        confirm_cost: true,
        ..test_config()
    });
    let results = runner.run().await.unwrap();
    assert_eq!(results.len(), 1);
    let only = &results[0];
    assert_eq!(only.workload_name, "simple_tool_call");
    assert_eq!(only.iterations, 2);
    // Sequential mode never reports throughput.
    assert!(only.throughput.is_none());
}
// Ignored by default: executing a workload needs GOOGLE_API_KEY and calls the real model.
#[tokio::test]
#[ignore]
async fn test_concurrent_run() {
    let runner = BenchRunner::new(BenchConfig {
        workload: Some("simple_tool_call".to_string()),
        runs: 2,
        warmup: 1,
        concurrency: 4,
        confirm_cost: true,
        ..test_config()
    });
    let results = runner.run().await.unwrap();
    assert_eq!(results.len(), 1);
    assert!(results[0].throughput.is_some());
    // Fixed concurrency yields exactly one throughput level.
    let levels = &results[0].throughput.as_ref().unwrap().levels;
    assert_eq!(levels.len(), 1);
    assert_eq!(levels[0].concurrency, 4);
}
// Ignored by default: executing a workload needs GOOGLE_API_KEY and calls the real model.
#[tokio::test]
#[ignore]
async fn test_sweep_mode() {
    let runner = BenchRunner::new(BenchConfig {
        workload: Some("simple_tool_call".to_string()),
        runs: 1,
        warmup: 1,
        concurrency_sweep: Some(vec![1, 2, 4]),
        confirm_cost: true,
        ..test_config()
    });
    let results = runner.run().await.unwrap();
    assert_eq!(results.len(), 1);
    assert!(results[0].throughput.is_some());
    // One throughput entry per sweep level, in the order requested.
    let levels = &results[0].throughput.as_ref().unwrap().levels;
    assert_eq!(levels.len(), 3);
    assert_eq!(levels[0].concurrency, 1);
    assert_eq!(levels[1].concurrency, 2);
    assert_eq!(levels[2].concurrency, 4);
}
// Smoke test: stats with a CV of 4.5% (below the 20% threshold) must not panic.
#[tokio::test]
async fn test_cv_warning_not_emitted_for_low_cv() {
    let runner = BenchRunner::new(test_config());
    let low_variance = DurationStats {
        count: 10,
        min_us: 100,
        max_us: 120,
        mean_us: 110,
        median_us: 110,
        p95_us: 119,
        p99_us: 120,
        std_dev_us: 5,
        coefficient_of_variation: 0.045,
    };
    runner.emit_cv_warning(&low_variance, "test_workload");
}
#[tokio::test]
async fn test_save_and_check_baseline() {
let dir = tempfile::TempDir::new().unwrap();
let baseline_path = dir.path().join("test-baseline.json");
let config = BenchConfig { baseline_path: baseline_path.clone(), ..test_config() };
let runner = BenchRunner::new(config);
let results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-01T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
runner.save_baseline(&results).unwrap();
assert!(baseline_path.exists());
let regressions = runner.check_regression(&results).unwrap();
assert!(regressions.is_empty());
}
#[tokio::test]
async fn test_check_regression_detects_timing_increase() {
let dir = tempfile::TempDir::new().unwrap();
let baseline_path = dir.path().join("test-baseline.json");
let config = BenchConfig {
baseline_path: baseline_path.clone(),
tolerance: 0.10, ..test_config()
};
let runner = BenchRunner::new(config);
let baseline_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-01T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
runner.save_baseline(&baseline_results).unwrap();
let current_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-02T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1200)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
let regressions = runner.check_regression(¤t_results).unwrap();
assert!(!regressions.is_empty(), "expected regression for 20% cold start increase");
let cold_start_regression = regressions
.iter()
.find(|r| r.metric_name.contains("cold_start"))
.expect("should have cold_start regression");
assert_eq!(cold_start_regression.workload_name, "test_workload");
assert!((cold_start_regression.degradation - 0.20).abs() < 0.01);
}
#[tokio::test]
async fn test_check_regression_within_tolerance() {
let dir = tempfile::TempDir::new().unwrap();
let baseline_path = dir.path().join("test-baseline.json");
let config = BenchConfig {
baseline_path: baseline_path.clone(),
tolerance: 0.10, ..test_config()
};
let runner = BenchRunner::new(config);
let baseline_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-01T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
runner.save_baseline(&baseline_results).unwrap();
let current_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-02T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1050)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(105)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
let regressions = runner.check_regression(¤t_results).unwrap();
assert!(
regressions.is_empty(),
"expected no regression for 5% increase within 10% tolerance"
);
}
#[tokio::test]
async fn test_check_regression_improvement_not_flagged() {
let dir = tempfile::TempDir::new().unwrap();
let baseline_path = dir.path().join("test-baseline.json");
let config =
BenchConfig { baseline_path: baseline_path.clone(), tolerance: 0.10, ..test_config() };
let runner = BenchRunner::new(config);
let baseline_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-01T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
runner.save_baseline(&baseline_results).unwrap();
let current_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-02T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(800)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(80)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
let regressions = runner.check_regression(¤t_results).unwrap();
assert!(regressions.is_empty(), "improvement should not be flagged as regression");
}
#[tokio::test]
async fn test_check_regression_no_baseline_file() {
let dir = tempfile::TempDir::new().unwrap();
let baseline_path = dir.path().join("nonexistent-baseline.json");
let config =
BenchConfig { baseline_path: baseline_path.clone(), tolerance: 0.10, ..test_config() };
let runner = BenchRunner::new(config);
let results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-01T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
let regressions = runner.check_regression(&results).unwrap();
assert!(regressions.is_empty());
}
#[tokio::test]
async fn test_check_regression_exact_tolerance_boundary() {
let dir = tempfile::TempDir::new().unwrap();
let baseline_path = dir.path().join("test-baseline.json");
let config = BenchConfig {
baseline_path: baseline_path.clone(),
tolerance: 0.10, ..test_config()
};
let runner = BenchRunner::new(config);
let baseline_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-01T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
runner.save_baseline(&baseline_results).unwrap();
let current_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "test_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-02T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1100)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(110)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
let regressions = runner.check_regression(¤t_results).unwrap();
assert!(
regressions.is_empty(),
"exactly at tolerance boundary should not trigger regression"
);
}
#[tokio::test]
async fn test_check_regression_multiple_workloads() {
let dir = tempfile::TempDir::new().unwrap();
let baseline_path = dir.path().join("test-baseline.json");
let config =
BenchConfig { baseline_path: baseline_path.clone(), tolerance: 0.10, ..test_config() };
let runner = BenchRunner::new(config);
let baseline_results = vec![
BenchmarkResult {
schema_version: 1,
workload_name: "workload_a".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-01T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
},
BenchmarkResult {
schema_version: 1,
workload_name: "workload_b".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-01T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(2000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(200)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
},
];
runner.save_baseline(&baseline_results).unwrap();
let current_results = vec![
BenchmarkResult {
schema_version: 1,
workload_name: "workload_a".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-02T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1300)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
},
BenchmarkResult {
schema_version: 1,
workload_name: "workload_b".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-02T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(2000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(200)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
},
];
let regressions = runner.check_regression(¤t_results).unwrap();
assert!(!regressions.is_empty());
let workload_a_regressions: Vec<_> =
regressions.iter().filter(|r| r.workload_name == "workload_a").collect();
assert!(!workload_a_regressions.is_empty(), "workload_a should have regressions");
let workload_b_regressions: Vec<_> =
regressions.iter().filter(|r| r.workload_name == "workload_b").collect();
assert!(workload_b_regressions.is_empty(), "workload_b should not have regressions");
}
#[tokio::test]
async fn test_regression_report_fields() {
let dir = tempfile::TempDir::new().unwrap();
let baseline_path = dir.path().join("test-baseline.json");
let config =
BenchConfig { baseline_path: baseline_path.clone(), tolerance: 0.10, ..test_config() };
let runner = BenchRunner::new(config);
let baseline_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "my_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-01T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1000)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(200)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
runner.save_baseline(&baseline_results).unwrap();
let current_results = vec![BenchmarkResult {
schema_version: 1,
workload_name: "my_workload".to_string(),
model: "gemini-2.5-flash".to_string(),
metadata: RunMetadata {
timestamp: "2025-01-02T00:00:00Z".to_string(),
adk_version: "0.5.0".to_string(),
rust_version: "1.85.0".to_string(),
os: "linux".to_string(),
arch: "x86_64".to_string(),
},
cold_start: compute_stats(&[Duration::from_micros(1500)]),
agent_loop_overhead: compute_stats(&[Duration::from_micros(200)]),
tool_invocation: None,
throughput: None,
memory: None,
token_overhead: None,
reproducibility_rate: None,
iterations: 5,
}];
let regressions = runner.check_regression(¤t_results).unwrap();
assert!(!regressions.is_empty());
let report = regressions
.iter()
.find(|r| r.metric_name == "cold_start_mean_us")
.expect("should have cold_start_mean_us regression");
assert_eq!(report.workload_name, "my_workload");
assert!((report.baseline_value - 1000.0).abs() < 1.0);
assert!((report.current_value - 1500.0).abs() < 1.0);
assert!((report.degradation - 0.50).abs() < 0.01);
}
/// Smoke test for cost estimation over the workloads resolved from the
/// default test configuration.
///
/// NOTE(review): despite the name, this only asserts the estimate is
/// non-negative, so a zero estimate would still pass. Confirm whether a
/// strictly positive estimate is the intended contract.
#[tokio::test]
async fn test_estimate_cost_non_zero() {
    let runner = BenchRunner::new(test_config());
    let resolved = runner.resolve_workloads().unwrap();
    let estimated = runner.estimate_cost(&resolved);
    assert!(estimated >= 0.0);
}
#[tokio::test]
async fn test_build_run_metadata() {
let config = test_config();
let runner = BenchRunner::new(config);
let metadata = runner.build_run_metadata();
assert!(!metadata.timestamp.is_empty());
assert!(!metadata.adk_version.is_empty());
assert!(!metadata.os.is_empty());
assert!(!metadata.arch.is_empty());
}
}