Skip to main content

adk_bench/
runner.rs

//! Benchmark runner orchestrator.
//!
//! Coordinates workload execution, warm-up, iteration, concurrency,
//! metric aggregation, and regression detection.
//!
//! The [`BenchRunner`] is the top-level orchestrator for benchmark execution.
//! It loads workloads, performs warm-up iterations, runs measurement iterations,
//! handles concurrent agent execution, supports concurrency sweep mode, and
//! integrates with [`BaselineStore`] for regression detection and [`CostTracker`]
//! for cost estimation.
//!
//! # Example
//!
//! ```rust,ignore
//! use adk_bench::{BenchConfig, BenchRunner};
//!
//! let config = BenchConfig::default();
//! let runner = BenchRunner::new(config);
//! let results = runner.run().await?;
//! ```
21
22use std::collections::HashMap;
23use std::sync::Arc;
24use std::time::{Duration, Instant};
25
26use adk_agent::LlmAgentBuilder;
27use adk_core::{
28    Content, Llm,
29    identity::{SessionId, UserId},
30};
31use adk_eval::{BaselineStore, CostTracker};
32use adk_model::gemini::GeminiModel;
33use adk_runner::Runner;
34use adk_session::InMemorySessionService;
35use adk_session::SessionService;
36use adk_tool::FunctionTool;
37use futures::StreamExt;
38use tokio::task::JoinSet;
39
40use crate::config::BenchConfig;
41use crate::error::{BenchError, Result};
42use crate::instrumented_llm::InstrumentedLlm;
43use crate::metrics::{
44    BenchmarkResult, ConcurrencyLevel, DurationStats, RunMetadata, ThroughputMetrics, compute_stats,
45};
46use crate::workload::{
47    Workload, builtin_workloads, load_workload, multi_agent_delegation_workload,
48};
49
/// Concurrency levels exercised by a sweep when the configured level list is empty.
const SWEEP_LEVELS: &[usize] = &[1, 2, 4, 8, 16, 32, 64];

/// Coefficient-of-variation ceiling (0.20, i.e. 20%) for Agent_Loop_Overhead;
/// a measurement set above this value triggers a stability warning.
const CV_WARNING_THRESHOLD: f64 = 0.20;
55
/// A single metric regression found while comparing results against the baseline.
///
/// Identifies the workload and metric that got worse, carries both the stored
/// baseline value and the freshly measured value, and records how large the
/// relative slowdown is.
///
/// `PartialEq` is derived so reports can be compared directly in assertions.
#[derive(Debug, Clone, PartialEq)]
pub struct RegressionReport {
    /// The metric that regressed (e.g., "overhead_mean_us").
    pub metric_name: String,
    /// The workload where the regression was detected.
    pub workload_name: String,
    /// The baseline value for the metric.
    pub baseline_value: f64,
    /// The current measured value for the metric.
    pub current_value: f64,
    /// Relative degradation, `(current - baseline) / baseline`
    /// (e.g., 0.15 means 15% worse than the baseline).
    pub degradation: f64,
}
73
74/// Top-level orchestrator for benchmark execution.
75///
76/// Manages the full lifecycle of a benchmark run:
77/// 1. Loading workloads (built-in or from file)
78/// 2. Cost estimation and budget enforcement
79/// 3. Warm-up phase (iterations discarded)
80/// 4. Measurement phase (iterations recorded)
81/// 5. Concurrent execution and sweep modes
82/// 6. Metric aggregation and CV warnings
83/// 7. Baseline save and regression detection
84pub struct BenchRunner {
85    config: BenchConfig,
86    baseline_store: BaselineStore,
87    cost_tracker: CostTracker,
88}
89
90impl BenchRunner {
91    /// Creates a new `BenchRunner` with the given configuration.
92    ///
93    /// Initializes [`BaselineStore`] at the configured baseline path
94    /// and a default [`CostTracker`] with standard model pricing.
95    pub fn new(config: BenchConfig) -> Self {
96        let baseline_store = BaselineStore::new(&config.baseline_path);
97        let cost_tracker = CostTracker::new();
98        Self { config, baseline_store, cost_tracker }
99    }
100
101    /// Runs the full benchmark suite and returns results.
102    ///
103    /// # Execution Flow
104    ///
105    /// 1. Resolves workloads (specific workload via `--workload` or all built-in)
106    /// 2. Estimates cost and enforces budget (dry-run, max-cost-usd, confirm-cost)
107    /// 3. For each workload:
108    ///    a. Warm-up phase: run `config.warmup` iterations, discard results
109    ///    b. Measurement phase: run `config.runs` iterations, collect metrics
110    ///    c. If sweep mode: iterate through concurrency levels
111    ///    d. If concurrency > 1: spawn concurrent tasks
112    /// 4. Aggregates metrics and emits CV warnings
113    /// 5. Returns collected [`BenchmarkResult`] values
114    ///
115    /// # Errors
116    ///
117    /// - [`BenchError::WorkloadNotFound`] if a specified workload file doesn't exist
118    /// - [`BenchError::Baseline`] if cost exceeds `--max-cost-usd`
119    pub async fn run(&self) -> Result<Vec<BenchmarkResult>> {
120        let workloads = self.resolve_workloads()?;
121
122        // Cost estimation phase
123        let estimated_cost = self.estimate_cost(&workloads);
124        if self.config.dry_run {
125            tracing::info!(
126                estimated_cost_usd = estimated_cost,
127                total_workloads = workloads.len(),
128                runs = self.config.runs,
129                concurrency = self.config.concurrency,
130                "dry-run: displaying estimated cost without executing"
131            );
132            return Ok(Vec::new());
133        }
134
135        // Max cost guard
136        if let Some(max_cost) = self.config.max_cost_usd
137            && estimated_cost > max_cost
138        {
139            return Err(BenchError::Baseline(format!(
140                "estimated cost ${estimated_cost:.4} exceeds --max-cost-usd limit ${max_cost:.4}. \
141                 Reduce runs, concurrency, or workloads to stay within budget."
142            )));
143        }
144
145        // Cost confirmation gate (when cost > $1.00 and --confirm-cost not set)
146        if estimated_cost > 1.0 && !self.config.confirm_cost {
147            tracing::warn!(
148                estimated_cost_usd = estimated_cost,
149                "estimated cost exceeds $1.00; pass --confirm-cost to proceed"
150            );
151            return Err(BenchError::Baseline(format!(
152                "estimated cost ${estimated_cost:.4} exceeds $1.00. \
153                 Pass --confirm-cost to acknowledge, or use --max-cost-usd to set a limit."
154            )));
155        }
156
157        let mut results = Vec::new();
158
159        for workload in &workloads {
160            if let Some(ref sweep_levels) = self.config.concurrency_sweep {
161                // Concurrency sweep mode
162                let result = self.run_workload_with_sweep(workload, sweep_levels).await?;
163                results.push(result);
164            } else if self.config.concurrency > 1 {
165                // Fixed concurrency mode
166                let result =
167                    self.run_workload_concurrent(workload, self.config.concurrency).await?;
168                results.push(result);
169            } else {
170                // Sequential mode
171                let result = self.run_workload_sequential(workload).await?;
172                results.push(result);
173            }
174        }
175
176        Ok(results)
177    }
178
179    /// Saves current results as the regression baseline.
180    ///
181    /// Persists metrics via [`BaselineStore`] for later regression detection.
182    pub fn save_baseline(&self, results: &[BenchmarkResult]) -> Result<()> {
183        let metrics = self.results_to_baseline_metrics(results);
184        self.baseline_store
185            .save("adk-bench", &metrics)
186            .map_err(|e| BenchError::Baseline(format!("failed to save baseline: {e}")))?;
187        Ok(())
188    }
189
190    /// Checks results against saved baseline using configured tolerance.
191    ///
192    /// For benchmark timing metrics, a regression means the current value is
193    /// *higher* than the baseline (worse performance). The formula is:
194    ///
195    /// ```text
196    /// regression detected when (current - baseline) / baseline > tolerance
197    /// ```
198    ///
199    /// Returns a list of [`RegressionReport`] entries for any metrics that
200    /// exceed the tolerance threshold. An empty list means no regressions.
201    ///
202    /// # Exit Code Contract
203    ///
204    /// The CLI layer (Task 8.1) should exit with code 2 when this method
205    /// returns a non-empty list, and exit with code 0 otherwise.
206    pub fn check_regression(&self, results: &[BenchmarkResult]) -> Result<Vec<RegressionReport>> {
207        let current_metrics = self.results_to_baseline_metrics(results);
208
209        // Load baseline directly for timing-aware comparison.
210        // BaselineStore::check_regressions() uses `baseline - current > tolerance`
211        // which is designed for "higher is better" metrics (like accuracy).
212        // For benchmarks, higher timing values are *worse*, so we need the inverse:
213        // detect when `(current - baseline) / baseline > tolerance`.
214        let baseline = self
215            .baseline_store
216            .load()
217            .map_err(|e| BenchError::Baseline(format!("regression check failed: {e}")))?;
218
219        let baseline = match baseline {
220            Some(b) => b,
221            None => {
222                tracing::info!("no baseline file found, skipping regression check");
223                return Ok(Vec::new());
224            }
225        };
226
227        let mut reports = Vec::new();
228
229        for (metric_name, baseline_cases) in &baseline.metrics {
230            if let Some(current_cases) = current_metrics.get(metric_name) {
231                for (case_id, &baseline_value) in baseline_cases {
232                    if let Some(&current_value) = current_cases.get(case_id) {
233                        // For timing metrics: regression = current is worse (higher) than baseline
234                        let degradation = if baseline_value > 0.0 {
235                            (current_value - baseline_value) / baseline_value
236                        } else {
237                            0.0
238                        };
239
240                        if degradation > self.config.tolerance {
241                            // Parse workload and metric name from the case_id
242                            // Format is "workload_name::metric_suffix"
243                            let (workload_name, parsed_metric_name) = case_id
244                                .split_once("::")
245                                .map(|(w, m)| (w.to_string(), m.to_string()))
246                                .unwrap_or((metric_name.clone(), case_id.clone()));
247
248                            reports.push(RegressionReport {
249                                metric_name: parsed_metric_name,
250                                workload_name,
251                                baseline_value,
252                                current_value,
253                                degradation,
254                            });
255                        }
256                    }
257                }
258            }
259        }
260
261        Ok(reports)
262    }
263
264    /// Resolves workloads based on configuration.
265    fn resolve_workloads(&self) -> Result<Vec<Workload>> {
266        if let Some(ref workload_path) = self.config.workload {
267            // Check if it's a file path
268            let path = std::path::Path::new(workload_path);
269            if path.exists() {
270                let workload = load_workload(path)?;
271                return Ok(vec![workload]);
272            }
273
274            // Otherwise, look for it in built-in workloads
275            let mut all = builtin_workloads();
276            if self.config.experimental {
277                all.push(multi_agent_delegation_workload());
278            }
279
280            let found = all.into_iter().find(|w| w.name == *workload_path);
281            match found {
282                Some(w) => Ok(vec![w]),
283                None => Err(BenchError::WorkloadNotFound { path: workload_path.clone() }),
284            }
285        } else {
286            let mut workloads = builtin_workloads();
287            if self.config.experimental {
288                workloads.push(multi_agent_delegation_workload());
289            }
290            Ok(workloads)
291        }
292    }
293
294    /// Estimates the total API cost for the benchmark run.
295    ///
296    /// Uses the CostTracker to compute cost from estimated token counts.
297    /// Estimation is based on workload expected_turns × average tokens per turn.
298    fn estimate_cost(&self, workloads: &[Workload]) -> f64 {
299        let mut total_cost = 0.0;
300
301        // Rough token estimate: ~500 input + ~200 output per turn
302        const ESTIMATED_INPUT_TOKENS_PER_TURN: u64 = 500;
303        const ESTIMATED_OUTPUT_TOKENS_PER_TURN: u64 = 200;
304
305        let concurrency_multiplier = if let Some(ref levels) = self.config.concurrency_sweep {
306            // Sum of all sweep levels
307            levels.iter().sum::<usize>()
308        } else {
309            self.config.concurrency
310        };
311
312        for workload in workloads {
313            let turns = workload.expected_turns as u64;
314            let total_iterations =
315                (self.config.runs + self.config.warmup) as u64 * concurrency_multiplier as u64;
316
317            let prompt_tokens = turns * ESTIMATED_INPUT_TOKENS_PER_TURN * total_iterations;
318            let completion_tokens = turns * ESTIMATED_OUTPUT_TOKENS_PER_TURN * total_iterations;
319
320            if let Some(cost) =
321                self.cost_tracker.compute_cost(&workload.model, prompt_tokens, completion_tokens)
322            {
323                total_cost += cost;
324            }
325        }
326
327        total_cost
328    }
329
330    /// Runs a single workload sequentially (concurrency=1).
331    async fn run_workload_sequential(&self, workload: &Workload) -> Result<BenchmarkResult> {
332        // Warm-up phase: run iterations but discard results
333        tracing::info!(
334            workload = workload.name,
335            warmup = self.config.warmup,
336            "starting warm-up phase"
337        );
338        for i in 0..self.config.warmup {
339            tracing::debug!(workload = workload.name, iteration = i, "warm-up iteration");
340            self.execute_single_workload(workload).await?;
341        }
342
343        // Measurement phase
344        tracing::info!(
345            workload = workload.name,
346            runs = self.config.runs,
347            "starting measurement phase"
348        );
349        let mut cold_start_durations = Vec::new();
350        let mut overhead_durations = Vec::new();
351
352        for i in 0..self.config.runs {
353            tracing::debug!(workload = workload.name, iteration = i, "measurement iteration");
354            let (cold_start, overheads) = self.execute_single_workload(workload).await?;
355            cold_start_durations.push(cold_start);
356            overhead_durations.extend(overheads);
357        }
358
359        let cold_start_stats = compute_stats(&cold_start_durations);
360        let overhead_stats = compute_stats(&overhead_durations);
361
362        // CV warning for Agent_Loop_Overhead
363        self.emit_cv_warning(&overhead_stats, &workload.name);
364
365        Ok(BenchmarkResult {
366            schema_version: 1,
367            workload_name: workload.name.clone(),
368            model: workload.model.clone(),
369            metadata: self.build_run_metadata(),
370            cold_start: cold_start_stats,
371            agent_loop_overhead: overhead_stats,
372            tool_invocation: None,
373            throughput: None,
374            memory: None,
375            token_overhead: None,
376            reproducibility_rate: None,
377            iterations: self.config.runs,
378        })
379    }
380
381    /// Runs a single workload at a fixed concurrency level.
382    async fn run_workload_concurrent(
383        &self,
384        workload: &Workload,
385        concurrency: usize,
386    ) -> Result<BenchmarkResult> {
387        // Warm-up phase
388        tracing::info!(
389            workload = workload.name,
390            warmup = self.config.warmup,
391            concurrency,
392            "starting concurrent warm-up phase"
393        );
394        for _ in 0..self.config.warmup {
395            self.execute_concurrent_batch(workload, concurrency).await?;
396        }
397
398        // Measurement phase
399        tracing::info!(
400            workload = workload.name,
401            runs = self.config.runs,
402            concurrency,
403            "starting concurrent measurement phase"
404        );
405        let mut cold_start_durations = Vec::new();
406        let mut overhead_durations = Vec::new();
407        let mut completion_times = Vec::new();
408
409        for _ in 0..self.config.runs {
410            let batch_start = Instant::now();
411            let batch_results = self.execute_concurrent_batch(workload, concurrency).await?;
412            let batch_elapsed = batch_start.elapsed();
413
414            for (cold_start, overheads) in &batch_results {
415                cold_start_durations.push(*cold_start);
416                overhead_durations.extend(overheads.iter().copied());
417            }
418            // Per-agent completion time is the full batch divided by concurrency
419            completion_times.push(batch_elapsed);
420        }
421
422        let cold_start_stats = compute_stats(&cold_start_durations);
423        let overhead_stats = compute_stats(&overhead_durations);
424        let completion_stats = compute_stats(&completion_times);
425
426        // CV warning for Agent_Loop_Overhead
427        self.emit_cv_warning(&overhead_stats, &workload.name);
428
429        // Compute throughput: agents_per_second = concurrency / mean_completion_time_secs
430        let mean_completion_secs = if !completion_times.is_empty() {
431            completion_times.iter().map(|d| d.as_secs_f64()).sum::<f64>()
432                / completion_times.len() as f64
433        } else {
434            1.0
435        };
436        let agents_per_second = concurrency as f64 / mean_completion_secs;
437
438        let throughput = Some(ThroughputMetrics {
439            levels: vec![ConcurrencyLevel {
440                concurrency,
441                agents_per_second,
442                completion_time: completion_stats,
443            }],
444        });
445
446        Ok(BenchmarkResult {
447            schema_version: 1,
448            workload_name: workload.name.clone(),
449            model: workload.model.clone(),
450            metadata: self.build_run_metadata(),
451            cold_start: cold_start_stats,
452            agent_loop_overhead: overhead_stats,
453            tool_invocation: None,
454            throughput,
455            memory: None,
456            token_overhead: None,
457            reproducibility_rate: None,
458            iterations: self.config.runs,
459        })
460    }
461
462    /// Runs a workload in concurrency sweep mode.
463    ///
464    /// Tests multiple concurrency levels (e.g., 1, 2, 4, 8, 16, 32, 64) and
465    /// records throughput at each level.
466    async fn run_workload_with_sweep(
467        &self,
468        workload: &Workload,
469        sweep_levels: &[usize],
470    ) -> Result<BenchmarkResult> {
471        let levels_to_test =
472            if sweep_levels.is_empty() { SWEEP_LEVELS.to_vec() } else { sweep_levels.to_vec() };
473
474        tracing::info!(
475            workload = workload.name,
476            levels = ?levels_to_test,
477            "starting concurrency sweep"
478        );
479
480        // Warm-up at lowest concurrency level
481        let min_level = *levels_to_test.first().unwrap_or(&1);
482        for _ in 0..self.config.warmup {
483            self.execute_concurrent_batch(workload, min_level).await?;
484        }
485
486        let mut all_cold_starts = Vec::new();
487        let mut all_overheads = Vec::new();
488        let mut throughput_levels = Vec::new();
489
490        for &level in &levels_to_test {
491            tracing::info!(
492                workload = workload.name,
493                concurrency = level,
494                "sweeping concurrency level"
495            );
496
497            let mut level_completion_times = Vec::new();
498
499            for _ in 0..self.config.runs {
500                let batch_start = Instant::now();
501                let batch_results = self.execute_concurrent_batch(workload, level).await?;
502                let batch_elapsed = batch_start.elapsed();
503
504                for (cold_start, overheads) in &batch_results {
505                    all_cold_starts.push(*cold_start);
506                    all_overheads.extend(overheads.iter().copied());
507                }
508                level_completion_times.push(batch_elapsed);
509            }
510
511            let completion_stats = compute_stats(&level_completion_times);
512            let mean_secs = if !level_completion_times.is_empty() {
513                level_completion_times.iter().map(|d| d.as_secs_f64()).sum::<f64>()
514                    / level_completion_times.len() as f64
515            } else {
516                1.0
517            };
518            let agents_per_second = level as f64 / mean_secs;
519
520            throughput_levels.push(ConcurrencyLevel {
521                concurrency: level,
522                agents_per_second,
523                completion_time: completion_stats,
524            });
525        }
526
527        let cold_start_stats = compute_stats(&all_cold_starts);
528        let overhead_stats = compute_stats(&all_overheads);
529
530        // CV warning for Agent_Loop_Overhead
531        self.emit_cv_warning(&overhead_stats, &workload.name);
532
533        Ok(BenchmarkResult {
534            schema_version: 1,
535            workload_name: workload.name.clone(),
536            model: workload.model.clone(),
537            metadata: self.build_run_metadata(),
538            cold_start: cold_start_stats,
539            agent_loop_overhead: overhead_stats,
540            tool_invocation: None,
541            throughput: Some(ThroughputMetrics { levels: throughput_levels }),
542            memory: None,
543            token_overhead: None,
544            reproducibility_rate: None,
545            iterations: self.config.runs,
546        })
547    }
548
549    /// Executes a batch of concurrent workload instances.
550    ///
551    /// Spawns `concurrency` tasks, each executing the workload independently.
552    /// Returns timing results for each task.
553    async fn execute_concurrent_batch(
554        &self,
555        workload: &Workload,
556        concurrency: usize,
557    ) -> Result<Vec<(Duration, Vec<Duration>)>> {
558        let mut join_set = JoinSet::new();
559
560        for _ in 0..concurrency {
561            let workload = workload.clone();
562            let model_name = self.config.model.clone();
563            join_set.spawn(async move { execute_workload_real(&workload, &model_name).await });
564        }
565
566        let mut results = Vec::with_capacity(concurrency);
567        while let Some(join_result) = join_set.join_next().await {
568            let task_result =
569                join_result.map_err(|e| BenchError::Llm(format!("task join failed: {e}")))?;
570            results.push(task_result?);
571        }
572
573        Ok(results)
574    }
575
576    /// Executes a single workload iteration using a real LLM.
577    ///
578    /// Returns (cold_start_duration, vec_of_per_turn_overheads).
579    async fn execute_single_workload(
580        &self,
581        workload: &Workload,
582    ) -> Result<(Duration, Vec<Duration>)> {
583        execute_workload_real(workload, &self.config.model).await
584    }
585
586    /// Emits a warning if the coefficient of variation exceeds the threshold.
587    fn emit_cv_warning(&self, stats: &DurationStats, workload_name: &str) {
588        if stats.count > 1 && stats.coefficient_of_variation > CV_WARNING_THRESHOLD {
589            tracing::warn!(
590                workload = workload_name,
591                cv = format!("{:.1}%", stats.coefficient_of_variation * 100.0),
592                threshold = "20%",
593                mean_us = stats.mean_us,
594                std_dev_us = stats.std_dev_us,
595                "Agent_Loop_Overhead CV exceeds 20%, measurements may be unstable. \
596                 Consider increasing iteration count or reducing system load."
597            );
598        }
599    }
600
601    /// Builds run metadata for result provenance.
602    fn build_run_metadata(&self) -> RunMetadata {
603        RunMetadata {
604            timestamp: chrono::Utc::now().to_rfc3339(),
605            adk_version: env!("CARGO_PKG_VERSION").to_string(),
606            rust_version: rustc_version(),
607            os: std::env::consts::OS.to_string(),
608            arch: std::env::consts::ARCH.to_string(),
609        }
610    }
611
612    /// Converts benchmark results to the format expected by BaselineStore.
613    fn results_to_baseline_metrics(
614        &self,
615        results: &[BenchmarkResult],
616    ) -> HashMap<String, HashMap<String, f64>> {
617        let mut metrics: HashMap<String, HashMap<String, f64>> = HashMap::new();
618
619        for result in results {
620            let prefix = &result.workload_name;
621
622            let mut case_metrics = HashMap::new();
623            case_metrics
624                .insert(format!("{prefix}::cold_start_mean_us"), result.cold_start.mean_us as f64);
625            case_metrics
626                .insert(format!("{prefix}::cold_start_p95_us"), result.cold_start.p95_us as f64);
627            case_metrics.insert(
628                format!("{prefix}::overhead_mean_us"),
629                result.agent_loop_overhead.mean_us as f64,
630            );
631            case_metrics.insert(
632                format!("{prefix}::overhead_p95_us"),
633                result.agent_loop_overhead.p95_us as f64,
634            );
635
636            // For BaselineStore, we use metric_name → { case_id → value }
637            // We invert: store workload metrics under a "timing" key
638            metrics.entry("timing".to_string()).or_default().extend(case_metrics);
639        }
640
641        metrics
642    }
643}
644
645/// Creates an LLM model instance from a model name string.
646///
647/// Currently supports Gemini models (default). The model is selected based on
648/// the GOOGLE_API_KEY environment variable.
649fn create_llm(model_name: &str) -> Result<Arc<dyn Llm>> {
650    let api_key = std::env::var("GOOGLE_API_KEY").map_err(|_| {
651        BenchError::Llm(
652            "GOOGLE_API_KEY environment variable not set. \
653             Set it to your Gemini API key to run benchmarks."
654                .to_string(),
655        )
656    })?;
657
658    let model = GeminiModel::new(api_key, model_name).map_err(|e| {
659        BenchError::Llm(format!("failed to create Gemini model '{model_name}': {e}"))
660    })?;
661
662    Ok(Arc::new(model))
663}
664
665/// Creates simulated tools from workload tool definitions.
666///
667/// Each tool returns its `fixed_response` after sleeping for its
668/// `simulated_latency_ms`. If no fixed response is defined, returns
669/// a generic success object.
670fn create_tools_from_workload(workload: &Workload) -> Vec<Arc<dyn adk_core::Tool>> {
671    workload
672        .agent
673        .tools
674        .iter()
675        .map(|(name, def)| {
676            let tool_name = name.clone();
677            let description = def.description.clone();
678            let fixed_response = def.fixed_response.clone();
679            let latency_ms = def.simulated_latency_ms;
680
681            let tool = FunctionTool::new(tool_name, description, move |_ctx, _args| {
682                let response = fixed_response.clone();
683                let latency = latency_ms;
684                async move {
685                    if latency > 0 {
686                        tokio::time::sleep(Duration::from_millis(latency)).await;
687                    }
688                    Ok(response.unwrap_or(serde_json::json!({"status": "success"})))
689                }
690            })
691            .with_read_only(true)
692            .with_concurrency_safe(true);
693
694            Arc::new(tool) as Arc<dyn adk_core::Tool>
695        })
696        .collect()
697}
698
699/// Executes a single workload against a real LLM using the full agent pipeline.
700///
701/// 1. Creates the model and wraps it in InstrumentedLlm
702/// 2. Builds an LlmAgent with the workload's tools and instructions
703/// 3. Runs through the Runner, collecting events
704/// 4. Computes cold_start from InstrumentedLlm records
705/// 5. Computes per-turn overhead = total_turn_time - llm_round_trip
706async fn execute_workload_real(
707    workload: &Workload,
708    model_name: &str,
709) -> Result<(Duration, Vec<Duration>)> {
710    let run_start = Instant::now();
711
712    // 1. Create model and wrap in InstrumentedLlm
713    let inner_llm = create_llm(model_name)?;
714    let instrumented = Arc::new(InstrumentedLlm::new(inner_llm));
715
716    // 2. Build LlmAgent with workload tools and instructions
717    let tools = create_tools_from_workload(workload);
718    let mut agent_builder = LlmAgentBuilder::new(&workload.name)
719        .model(instrumented.clone() as Arc<dyn Llm>)
720        .instruction(&workload.agent.instructions);
721
722    for tool in tools {
723        agent_builder = agent_builder.tool(tool);
724    }
725
726    let agent = agent_builder
727        .build()
728        .map_err(|e| BenchError::Llm(format!("failed to build agent: {e}")))?;
729
730    // 3. Create Runner with in-memory session
731    let session_service = Arc::new(InMemorySessionService::new());
732
733    // Pre-create the session so the Runner can find it
734    let app_name = format!("bench-{}", workload.name);
735    let session_id_str = format!("bench-{}", uuid_v4());
736    session_service
737        .create(adk_session::CreateRequest {
738            app_name: app_name.clone(),
739            user_id: "bench-user".to_string(),
740            session_id: Some(session_id_str.clone()),
741            state: HashMap::new(),
742        })
743        .await
744        .map_err(|e| BenchError::Llm(format!("failed to create session: {e}")))?;
745
746    let runner = Runner::builder()
747        .app_name(app_name)
748        .agent(Arc::new(agent))
749        .session_service(session_service)
750        .build()
751        .map_err(|e| BenchError::Llm(format!("failed to create runner: {e}")))?;
752
753    // 4. Run the agent with the workload's user message
754    let user_content = Content::new("user").with_text(&workload.agent.user_message);
755
756    let user_id = UserId::try_from("bench-user")
757        .map_err(|e| BenchError::Llm(format!("invalid user id: {e}")))?;
758    let session_id = SessionId::try_from(session_id_str.as_str())
759        .map_err(|e| BenchError::Llm(format!("invalid session id: {e}")))?;
760
761    let turn_start = Instant::now();
762    let mut event_stream = runner
763        .run(user_id, session_id, user_content)
764        .await
765        .map_err(|e| BenchError::Llm(format!("agent run failed: {e}")))?;
766
767    // Consume all events
768    while let Some(event_result) = event_stream.next().await {
769        match event_result {
770            Ok(_event) => {
771                // Events consumed — timing captured by InstrumentedLlm
772            }
773            Err(e) => {
774                tracing::warn!(error = %e, "event stream error during benchmark");
775            }
776        }
777    }
778    let total_turn_time = turn_start.elapsed();
779
780    // 5. Compute metrics from InstrumentedLlm records
781    let records = instrumented.records().await;
782
783    // Cold start = time from run_start to first LLM call
784    let cold_start = if let Some(first_record) = records.first() {
785        first_record.request_sent.duration_since(run_start)
786    } else {
787        run_start.elapsed()
788    };
789
790    // Per-turn overhead = total_turn_time - sum(llm_round_trips)
791    // If multiple LLM calls, compute overhead per call
792    let total_llm_time: Duration = records.iter().map(|r| r.round_trip).sum();
793    let overhead = total_turn_time.saturating_sub(total_llm_time);
794
795    // Distribute overhead evenly across turns for per-turn reporting
796    let num_turns = records.len().max(1);
797    let per_turn_overhead = overhead / num_turns as u32;
798    let overheads: Vec<Duration> = (0..num_turns).map(|_| per_turn_overhead).collect();
799
800    tracing::debug!(
801        workload = workload.name,
802        cold_start_us = cold_start.as_micros(),
803        total_turn_ms = total_turn_time.as_millis(),
804        llm_calls = records.len(),
805        total_llm_ms = total_llm_time.as_millis(),
806        overhead_us = overhead.as_micros(),
807        "workload execution complete"
808    );
809
810    Ok((cold_start, overheads))
811}
812
/// Generates a process-unique, 32-character lowercase hex identifier for session IDs.
///
/// Despite the name, the result is not an RFC 4122 version-4 UUID. It is built from
/// the wall-clock time in nanoseconds since the Unix epoch (upper bits) combined with
/// a process-wide sequence number (lowest 32 bits). The sequence number ensures that
/// two calls landing on the same clock tick (coarse clock resolution or concurrent
/// callers) still yield distinct identifiers.
///
/// If the system clock reports a time before the Unix epoch, the timestamp part
/// falls back to zero and uniqueness rests on the sequence number alone.
fn uuid_v4() -> String {
    use std::sync::atomic::{AtomicU32, Ordering};
    use std::time::SystemTime;

    // Shared by every caller in the process; only distinctness of the returned
    // values matters, so relaxed ordering is sufficient.
    static NEXT_SEQUENCE: AtomicU32 = AtomicU32::new(0);

    let nanos =
        SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap_or_default().as_nanos();
    let sequence = NEXT_SEQUENCE.fetch_add(1, Ordering::Relaxed);

    // Timestamp in the high bits, sequence in the low 32 bits; always 32 hex digits.
    let id = (nanos << 32) | u128::from(sequence);
    format!("{id:032x}")
}
820
821/// Returns the Rust compiler version string.
822fn rustc_version() -> String {
823    // Use a compile-time constant for the Rust version
824    option_env!("RUSTC_VERSION").unwrap_or(env!("CARGO_PKG_RUST_VERSION")).to_string()
825}
826
#[cfg(test)]
mod tests {
    use super::*;

    /// Timestamp stamped on fixtures that play the role of the stored baseline.
    const BASELINE_TIMESTAMP: &str = "2025-01-01T00:00:00Z";

    /// Timestamp stamped on fixtures that play the role of the current run.
    const CURRENT_TIMESTAMP: &str = "2025-01-02T00:00:00Z";

    /// Small, fast configuration shared by all tests.
    fn test_config() -> BenchConfig {
        BenchConfig { runs: 3, warmup: 1, concurrency: 1, ..Default::default() }
    }

    /// Builds a runner that reads/writes its baseline at `baseline_path` and uses a
    /// 10% regression tolerance.
    fn regression_runner(baseline_path: &std::path::Path) -> BenchRunner {
        BenchRunner::new(BenchConfig {
            baseline_path: baseline_path.to_path_buf(),
            tolerance: 0.10,
            ..test_config()
        })
    }

    /// Builds a `BenchmarkResult` fixture whose cold-start and agent-loop-overhead
    /// statistics are each computed from a single sample (given in microseconds).
    /// All optional metrics are left unset.
    fn sample_result(
        workload_name: &str,
        timestamp: &str,
        cold_start_us: u64,
        overhead_us: u64,
    ) -> BenchmarkResult {
        BenchmarkResult {
            schema_version: 1,
            workload_name: workload_name.to_string(),
            model: "gemini-2.5-flash".to_string(),
            metadata: RunMetadata {
                timestamp: timestamp.to_string(),
                adk_version: "0.5.0".to_string(),
                rust_version: "1.85.0".to_string(),
                os: "linux".to_string(),
                arch: "x86_64".to_string(),
            },
            cold_start: compute_stats(&[Duration::from_micros(cold_start_us)]),
            agent_loop_overhead: compute_stats(&[Duration::from_micros(overhead_us)]),
            tool_invocation: None,
            throughput: None,
            memory: None,
            token_overhead: None,
            reproducibility_rate: None,
            iterations: 5,
        }
    }

    #[tokio::test]
    async fn test_bench_runner_new() {
        let runner = BenchRunner::new(test_config());
        assert_eq!(runner.config.runs, 3);
        assert_eq!(runner.config.warmup, 1);
    }

    #[tokio::test]
    async fn test_resolve_workloads_all_builtin() {
        let runner = BenchRunner::new(test_config());
        let workloads = runner.resolve_workloads().unwrap();
        assert_eq!(workloads.len(), 3);
    }

    #[tokio::test]
    async fn test_resolve_workloads_with_experimental() {
        let config = BenchConfig { experimental: true, ..test_config() };
        let runner = BenchRunner::new(config);
        let workloads = runner.resolve_workloads().unwrap();
        assert_eq!(workloads.len(), 4);
    }

    #[tokio::test]
    async fn test_resolve_workloads_specific_builtin() {
        let config =
            BenchConfig { workload: Some("simple_tool_call".to_string()), ..test_config() };
        let runner = BenchRunner::new(config);
        let workloads = runner.resolve_workloads().unwrap();
        assert_eq!(workloads.len(), 1);
        assert_eq!(workloads[0].name, "simple_tool_call");
    }

    #[tokio::test]
    async fn test_resolve_workloads_not_found() {
        let config =
            BenchConfig { workload: Some("nonexistent_workload".to_string()), ..test_config() };
        let runner = BenchRunner::new(config);
        let result = runner.resolve_workloads();
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_dry_run_returns_empty() {
        let config = BenchConfig { dry_run: true, ..test_config() };
        let runner = BenchRunner::new(config);
        let results = runner.run().await.unwrap();
        assert!(results.is_empty());
    }

    #[tokio::test]
    async fn test_max_cost_usd_abort() {
        let config = BenchConfig {
            max_cost_usd: Some(0.0001), // Extremely low limit
            runs: 100,
            ..test_config()
        };
        let runner = BenchRunner::new(config);
        let result = runner.run().await;
        // NOTE(review): only `is_err()` is checked, so any failure of `run()` satisfies
        // this test; consider matching on the specific cost-limit error variant.
        assert!(result.is_err());
    }

    #[tokio::test]
    #[ignore] // Requires GOOGLE_API_KEY
    async fn test_sequential_run() {
        let config = BenchConfig {
            workload: Some("simple_tool_call".to_string()),
            runs: 2,
            warmup: 1,
            confirm_cost: true,
            ..test_config()
        };
        let runner = BenchRunner::new(config);
        let results = runner.run().await.unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].workload_name, "simple_tool_call");
        assert_eq!(results[0].iterations, 2);
        assert!(results[0].throughput.is_none());
    }

    #[tokio::test]
    #[ignore] // Requires GOOGLE_API_KEY
    async fn test_concurrent_run() {
        let config = BenchConfig {
            workload: Some("simple_tool_call".to_string()),
            runs: 2,
            warmup: 1,
            concurrency: 4,
            confirm_cost: true,
            ..test_config()
        };
        let runner = BenchRunner::new(config);
        let results = runner.run().await.unwrap();
        assert_eq!(results.len(), 1);
        assert!(results[0].throughput.is_some());
        let throughput = results[0].throughput.as_ref().unwrap();
        assert_eq!(throughput.levels.len(), 1);
        assert_eq!(throughput.levels[0].concurrency, 4);
    }

    #[tokio::test]
    #[ignore] // Requires GOOGLE_API_KEY
    async fn test_sweep_mode() {
        let config = BenchConfig {
            workload: Some("simple_tool_call".to_string()),
            runs: 1,
            warmup: 1,
            concurrency_sweep: Some(vec![1, 2, 4]),
            confirm_cost: true,
            ..test_config()
        };
        let runner = BenchRunner::new(config);
        let results = runner.run().await.unwrap();
        assert_eq!(results.len(), 1);
        assert!(results[0].throughput.is_some());
        let throughput = results[0].throughput.as_ref().unwrap();
        assert_eq!(throughput.levels.len(), 3);
        assert_eq!(throughput.levels[0].concurrency, 1);
        assert_eq!(throughput.levels[1].concurrency, 2);
        assert_eq!(throughput.levels[2].concurrency, 4);
    }

    #[tokio::test]
    async fn test_cv_warning_not_emitted_for_low_cv() {
        let stats = DurationStats {
            min_us: 100,
            max_us: 120,
            mean_us: 110,
            median_us: 110,
            p95_us: 119,
            p99_us: 120,
            std_dev_us: 5,
            count: 10,
            coefficient_of_variation: 0.045, // 4.5%, below 20%
        };
        let runner = BenchRunner::new(test_config());
        // Smoke test: the call must complete without panicking.
        runner.emit_cv_warning(&stats, "test_workload");
    }

    #[tokio::test]
    async fn test_save_and_check_baseline() {
        let dir = tempfile::TempDir::new().unwrap();
        let baseline_path = dir.path().join("test-baseline.json");

        // Uses the default tolerance rather than the 10% of `regression_runner`.
        let config = BenchConfig { baseline_path: baseline_path.clone(), ..test_config() };
        let runner = BenchRunner::new(config);

        let results = vec![sample_result("test_workload", BASELINE_TIMESTAMP, 1000, 100)];

        // Saving must materialize the baseline file on disk.
        runner.save_baseline(&results).unwrap();
        assert!(baseline_path.exists());

        // Comparing a run against itself must report no regressions.
        let regressions = runner.check_regression(&results).unwrap();
        assert!(regressions.is_empty());
    }

    #[tokio::test]
    async fn test_check_regression_detects_timing_increase() {
        let dir = tempfile::TempDir::new().unwrap();
        let baseline_path = dir.path().join("test-baseline.json");
        let runner = regression_runner(&baseline_path);

        // Baseline: 1000μs cold start.
        let baseline_results =
            vec![sample_result("test_workload", BASELINE_TIMESTAMP, 1000, 100)];
        runner.save_baseline(&baseline_results).unwrap();

        // Current: 20% worse cold start (1200μs vs 1000μs), overhead unchanged.
        let current_results = vec![sample_result("test_workload", CURRENT_TIMESTAMP, 1200, 100)];

        let regressions = runner.check_regression(&current_results).unwrap();
        // (1200 - 1000) / 1000 = 0.20 > 0.10 tolerance
        assert!(!regressions.is_empty(), "expected regression for 20% cold start increase");

        // Verify the report has correct values
        let cold_start_regression = regressions
            .iter()
            .find(|r| r.metric_name.contains("cold_start"))
            .expect("should have cold_start regression");
        assert_eq!(cold_start_regression.workload_name, "test_workload");
        assert!((cold_start_regression.degradation - 0.20).abs() < 0.01);
    }

    #[tokio::test]
    async fn test_check_regression_within_tolerance() {
        let dir = tempfile::TempDir::new().unwrap();
        let baseline_path = dir.path().join("test-baseline.json");
        let runner = regression_runner(&baseline_path);

        // Baseline: 1000μs cold start.
        let baseline_results =
            vec![sample_result("test_workload", BASELINE_TIMESTAMP, 1000, 100)];
        runner.save_baseline(&baseline_results).unwrap();

        // Current: 5% worse on both metrics (1050μs / 105μs).
        let current_results = vec![sample_result("test_workload", CURRENT_TIMESTAMP, 1050, 105)];

        let regressions = runner.check_regression(&current_results).unwrap();
        // 5% increase is within 10% tolerance — no regression
        assert!(
            regressions.is_empty(),
            "expected no regression for 5% increase within 10% tolerance"
        );
    }

    #[tokio::test]
    async fn test_check_regression_improvement_not_flagged() {
        let dir = tempfile::TempDir::new().unwrap();
        let baseline_path = dir.path().join("test-baseline.json");
        let runner = regression_runner(&baseline_path);

        // Baseline: 1000μs cold start.
        let baseline_results =
            vec![sample_result("test_workload", BASELINE_TIMESTAMP, 1000, 100)];
        runner.save_baseline(&baseline_results).unwrap();

        // Current results are *better* (lower timings on both metrics).
        let current_results = vec![sample_result("test_workload", CURRENT_TIMESTAMP, 800, 80)];

        let regressions = runner.check_regression(&current_results).unwrap();
        // Improvements should never be flagged as regression
        assert!(regressions.is_empty(), "improvement should not be flagged as regression");
    }

    #[tokio::test]
    async fn test_check_regression_no_baseline_file() {
        let dir = tempfile::TempDir::new().unwrap();
        let baseline_path = dir.path().join("nonexistent-baseline.json");
        let runner = regression_runner(&baseline_path);

        let results = vec![sample_result("test_workload", BASELINE_TIMESTAMP, 1000, 100)];

        // No baseline file — should return empty, not error
        let regressions = runner.check_regression(&results).unwrap();
        assert!(regressions.is_empty());
    }

    #[tokio::test]
    async fn test_check_regression_exact_tolerance_boundary() {
        let dir = tempfile::TempDir::new().unwrap();
        let baseline_path = dir.path().join("test-baseline.json");
        let runner = regression_runner(&baseline_path);

        // Baseline: 1000μs cold start.
        let baseline_results =
            vec![sample_result("test_workload", BASELINE_TIMESTAMP, 1000, 100)];
        runner.save_baseline(&baseline_results).unwrap();

        // Current: exactly 10% degradation (1100μs vs 1000μs).
        // (1100 - 1000) / 1000 = 0.10, which equals tolerance but does NOT exceed it
        let current_results = vec![sample_result("test_workload", CURRENT_TIMESTAMP, 1100, 110)];

        let regressions = runner.check_regression(&current_results).unwrap();
        // Exactly at tolerance boundary — should NOT be flagged (strictly greater than)
        assert!(
            regressions.is_empty(),
            "exactly at tolerance boundary should not trigger regression"
        );
    }

    #[tokio::test]
    async fn test_check_regression_multiple_workloads() {
        let dir = tempfile::TempDir::new().unwrap();
        let baseline_path = dir.path().join("test-baseline.json");
        let runner = regression_runner(&baseline_path);

        // Baseline with two workloads.
        let baseline_results = vec![
            sample_result("workload_a", BASELINE_TIMESTAMP, 1000, 100),
            sample_result("workload_b", BASELINE_TIMESTAMP, 2000, 200),
        ];
        runner.save_baseline(&baseline_results).unwrap();

        // workload_a regresses (30% cold start), workload_b stays the same.
        let current_results = vec![
            sample_result("workload_a", CURRENT_TIMESTAMP, 1300, 100),
            sample_result("workload_b", CURRENT_TIMESTAMP, 2000, 200),
        ];

        let regressions = runner.check_regression(&current_results).unwrap();
        // Only workload_a should have regression (cold_start 30% increase)
        assert!(!regressions.is_empty());
        assert!(
            regressions.iter().any(|r| r.workload_name == "workload_a"),
            "workload_a should have regressions"
        );
        assert!(
            !regressions.iter().any(|r| r.workload_name == "workload_b"),
            "workload_b should not have regressions"
        );
    }

    #[tokio::test]
    async fn test_regression_report_fields() {
        let dir = tempfile::TempDir::new().unwrap();
        let baseline_path = dir.path().join("test-baseline.json");
        let runner = regression_runner(&baseline_path);

        let baseline_results = vec![sample_result("my_workload", BASELINE_TIMESTAMP, 1000, 200)];
        runner.save_baseline(&baseline_results).unwrap();

        // 50% regression on cold start, overhead unchanged.
        let current_results = vec![sample_result("my_workload", CURRENT_TIMESTAMP, 1500, 200)];

        let regressions = runner.check_regression(&current_results).unwrap();
        assert!(!regressions.is_empty());

        // Find the cold_start_mean_us regression
        let report = regressions
            .iter()
            .find(|r| r.metric_name == "cold_start_mean_us")
            .expect("should have cold_start_mean_us regression");

        assert_eq!(report.workload_name, "my_workload");
        assert!((report.baseline_value - 1000.0).abs() < 1.0);
        assert!((report.current_value - 1500.0).abs() < 1.0);
        assert!((report.degradation - 0.50).abs() < 0.01);
    }

    #[tokio::test]
    async fn test_estimate_cost_non_zero() {
        let runner = BenchRunner::new(test_config());
        let workloads = runner.resolve_workloads().unwrap();
        let cost = runner.estimate_cost(&workloads);
        // NOTE(review): the test name promises a strictly positive estimate, but only
        // non-negativity is asserted; tighten to `cost > 0.0` once it is confirmed that
        // the default model pricing always yields a non-zero estimate.
        assert!(cost >= 0.0);
    }

    #[tokio::test]
    async fn test_build_run_metadata() {
        let runner = BenchRunner::new(test_config());
        let metadata = runner.build_run_metadata();
        assert!(!metadata.timestamp.is_empty());
        assert!(!metadata.adk_version.is_empty());
        assert!(!metadata.os.is_empty());
        assert!(!metadata.arch.is_empty());
    }
}