1use std::collections::HashMap;
23use std::sync::Arc;
24use std::time::{Duration, Instant};
25
26use adk_agent::LlmAgentBuilder;
27use adk_core::{
28 Content, Llm,
29 identity::{SessionId, UserId},
30};
31use adk_eval::{BaselineStore, CostTracker};
32use adk_model::gemini::GeminiModel;
33use adk_runner::Runner;
34use adk_session::InMemorySessionService;
35use adk_session::SessionService;
36use adk_tool::FunctionTool;
37use futures::StreamExt;
38use tokio::task::JoinSet;
39
40use crate::config::BenchConfig;
41use crate::error::{BenchError, Result};
42use crate::instrumented_llm::InstrumentedLlm;
43use crate::metrics::{
44 BenchmarkResult, ConcurrencyLevel, DurationStats, RunMetadata, ThroughputMetrics, compute_stats,
45};
46use crate::workload::{
47 Workload, builtin_workloads, load_workload, multi_agent_delegation_workload,
48};
49
/// Default concurrency levels for a sweep; used by `run_workload_with_sweep` when the
/// caller passes an empty level list.
const SWEEP_LEVELS: &[usize] = &[1, 2, 4, 8, 16, 32, 64];
52
/// Coefficient-of-variation level (as a fraction) above which `emit_cv_warning` logs
/// that the agent-loop overhead measurements may be unstable.
const CV_WARNING_THRESHOLD: f64 = 0.20;
55
/// One metric whose current value degraded beyond the configured tolerance
/// relative to the stored baseline (produced by `BenchRunner::check_regression`).
#[derive(Debug, Clone)]
pub struct RegressionReport {
    /// Metric part of the baseline case id (the text after `::`), or the whole
    /// case id when it contains no `::` separator.
    pub metric_name: String,
    /// Workload part of the baseline case id (the text before `::`), or the
    /// baseline metric group name when the case id contains no `::` separator.
    pub workload_name: String,
    /// Value recorded in the baseline.
    pub baseline_value: f64,
    /// Value computed from the current results.
    pub current_value: f64,
    /// Relative increase, `(current_value - baseline_value) / baseline_value`.
    pub degradation: f64,
}
73
/// Drives benchmark execution: resolves workloads, guards on estimated cost,
/// runs them, and saves or compares baseline metrics.
pub struct BenchRunner {
    /// Run configuration (iteration counts, concurrency, cost limits, paths, ...).
    config: BenchConfig,
    /// Baseline storage constructed from `config.baseline_path`.
    baseline_store: BaselineStore,
    /// Used by `estimate_cost` to price estimated token usage per model.
    cost_tracker: CostTracker,
}
89
90impl BenchRunner {
91 pub fn new(config: BenchConfig) -> Self {
96 let baseline_store = BaselineStore::new(&config.baseline_path);
97 let cost_tracker = CostTracker::new();
98 Self { config, baseline_store, cost_tracker }
99 }
100
101 pub async fn run(&self) -> Result<Vec<BenchmarkResult>> {
120 let workloads = self.resolve_workloads()?;
121
122 let estimated_cost = self.estimate_cost(&workloads);
124 if self.config.dry_run {
125 tracing::info!(
126 estimated_cost_usd = estimated_cost,
127 total_workloads = workloads.len(),
128 runs = self.config.runs,
129 concurrency = self.config.concurrency,
130 "dry-run: displaying estimated cost without executing"
131 );
132 return Ok(Vec::new());
133 }
134
135 if let Some(max_cost) = self.config.max_cost_usd
137 && estimated_cost > max_cost
138 {
139 return Err(BenchError::Baseline(format!(
140 "estimated cost ${estimated_cost:.4} exceeds --max-cost-usd limit ${max_cost:.4}. \
141 Reduce runs, concurrency, or workloads to stay within budget."
142 )));
143 }
144
145 if estimated_cost > 1.0 && !self.config.confirm_cost {
147 tracing::warn!(
148 estimated_cost_usd = estimated_cost,
149 "estimated cost exceeds $1.00; pass --confirm-cost to proceed"
150 );
151 return Err(BenchError::Baseline(format!(
152 "estimated cost ${estimated_cost:.4} exceeds $1.00. \
153 Pass --confirm-cost to acknowledge, or use --max-cost-usd to set a limit."
154 )));
155 }
156
157 let mut results = Vec::new();
158
159 for workload in &workloads {
160 if let Some(ref sweep_levels) = self.config.concurrency_sweep {
161 let result = self.run_workload_with_sweep(workload, sweep_levels).await?;
163 results.push(result);
164 } else if self.config.concurrency > 1 {
165 let result =
167 self.run_workload_concurrent(workload, self.config.concurrency).await?;
168 results.push(result);
169 } else {
170 let result = self.run_workload_sequential(workload).await?;
172 results.push(result);
173 }
174 }
175
176 Ok(results)
177 }
178
179 pub fn save_baseline(&self, results: &[BenchmarkResult]) -> Result<()> {
183 let metrics = self.results_to_baseline_metrics(results);
184 self.baseline_store
185 .save("adk-bench", &metrics)
186 .map_err(|e| BenchError::Baseline(format!("failed to save baseline: {e}")))?;
187 Ok(())
188 }
189
190 pub fn check_regression(&self, results: &[BenchmarkResult]) -> Result<Vec<RegressionReport>> {
207 let current_metrics = self.results_to_baseline_metrics(results);
208
209 let baseline = self
215 .baseline_store
216 .load()
217 .map_err(|e| BenchError::Baseline(format!("regression check failed: {e}")))?;
218
219 let baseline = match baseline {
220 Some(b) => b,
221 None => {
222 tracing::info!("no baseline file found, skipping regression check");
223 return Ok(Vec::new());
224 }
225 };
226
227 let mut reports = Vec::new();
228
229 for (metric_name, baseline_cases) in &baseline.metrics {
230 if let Some(current_cases) = current_metrics.get(metric_name) {
231 for (case_id, &baseline_value) in baseline_cases {
232 if let Some(¤t_value) = current_cases.get(case_id) {
233 let degradation = if baseline_value > 0.0 {
235 (current_value - baseline_value) / baseline_value
236 } else {
237 0.0
238 };
239
240 if degradation > self.config.tolerance {
241 let (workload_name, parsed_metric_name) = case_id
244 .split_once("::")
245 .map(|(w, m)| (w.to_string(), m.to_string()))
246 .unwrap_or((metric_name.clone(), case_id.clone()));
247
248 reports.push(RegressionReport {
249 metric_name: parsed_metric_name,
250 workload_name,
251 baseline_value,
252 current_value,
253 degradation,
254 });
255 }
256 }
257 }
258 }
259 }
260
261 Ok(reports)
262 }
263
264 fn resolve_workloads(&self) -> Result<Vec<Workload>> {
266 if let Some(ref workload_path) = self.config.workload {
267 let path = std::path::Path::new(workload_path);
269 if path.exists() {
270 let workload = load_workload(path)?;
271 return Ok(vec![workload]);
272 }
273
274 let mut all = builtin_workloads();
276 if self.config.experimental {
277 all.push(multi_agent_delegation_workload());
278 }
279
280 let found = all.into_iter().find(|w| w.name == *workload_path);
281 match found {
282 Some(w) => Ok(vec![w]),
283 None => Err(BenchError::WorkloadNotFound { path: workload_path.clone() }),
284 }
285 } else {
286 let mut workloads = builtin_workloads();
287 if self.config.experimental {
288 workloads.push(multi_agent_delegation_workload());
289 }
290 Ok(workloads)
291 }
292 }
293
294 fn estimate_cost(&self, workloads: &[Workload]) -> f64 {
299 let mut total_cost = 0.0;
300
301 const ESTIMATED_INPUT_TOKENS_PER_TURN: u64 = 500;
303 const ESTIMATED_OUTPUT_TOKENS_PER_TURN: u64 = 200;
304
305 let concurrency_multiplier = if let Some(ref levels) = self.config.concurrency_sweep {
306 levels.iter().sum::<usize>()
308 } else {
309 self.config.concurrency
310 };
311
312 for workload in workloads {
313 let turns = workload.expected_turns as u64;
314 let total_iterations =
315 (self.config.runs + self.config.warmup) as u64 * concurrency_multiplier as u64;
316
317 let prompt_tokens = turns * ESTIMATED_INPUT_TOKENS_PER_TURN * total_iterations;
318 let completion_tokens = turns * ESTIMATED_OUTPUT_TOKENS_PER_TURN * total_iterations;
319
320 if let Some(cost) =
321 self.cost_tracker.compute_cost(&workload.model, prompt_tokens, completion_tokens)
322 {
323 total_cost += cost;
324 }
325 }
326
327 total_cost
328 }
329
330 async fn run_workload_sequential(&self, workload: &Workload) -> Result<BenchmarkResult> {
332 tracing::info!(
334 workload = workload.name,
335 warmup = self.config.warmup,
336 "starting warm-up phase"
337 );
338 for i in 0..self.config.warmup {
339 tracing::debug!(workload = workload.name, iteration = i, "warm-up iteration");
340 self.execute_single_workload(workload).await?;
341 }
342
343 tracing::info!(
345 workload = workload.name,
346 runs = self.config.runs,
347 "starting measurement phase"
348 );
349 let mut cold_start_durations = Vec::new();
350 let mut overhead_durations = Vec::new();
351
352 for i in 0..self.config.runs {
353 tracing::debug!(workload = workload.name, iteration = i, "measurement iteration");
354 let (cold_start, overheads) = self.execute_single_workload(workload).await?;
355 cold_start_durations.push(cold_start);
356 overhead_durations.extend(overheads);
357 }
358
359 let cold_start_stats = compute_stats(&cold_start_durations);
360 let overhead_stats = compute_stats(&overhead_durations);
361
362 self.emit_cv_warning(&overhead_stats, &workload.name);
364
365 Ok(BenchmarkResult {
366 schema_version: 1,
367 workload_name: workload.name.clone(),
368 model: workload.model.clone(),
369 metadata: self.build_run_metadata(),
370 cold_start: cold_start_stats,
371 agent_loop_overhead: overhead_stats,
372 tool_invocation: None,
373 throughput: None,
374 memory: None,
375 token_overhead: None,
376 reproducibility_rate: None,
377 iterations: self.config.runs,
378 })
379 }
380
381 async fn run_workload_concurrent(
383 &self,
384 workload: &Workload,
385 concurrency: usize,
386 ) -> Result<BenchmarkResult> {
387 tracing::info!(
389 workload = workload.name,
390 warmup = self.config.warmup,
391 concurrency,
392 "starting concurrent warm-up phase"
393 );
394 for _ in 0..self.config.warmup {
395 self.execute_concurrent_batch(workload, concurrency).await?;
396 }
397
398 tracing::info!(
400 workload = workload.name,
401 runs = self.config.runs,
402 concurrency,
403 "starting concurrent measurement phase"
404 );
405 let mut cold_start_durations = Vec::new();
406 let mut overhead_durations = Vec::new();
407 let mut completion_times = Vec::new();
408
409 for _ in 0..self.config.runs {
410 let batch_start = Instant::now();
411 let batch_results = self.execute_concurrent_batch(workload, concurrency).await?;
412 let batch_elapsed = batch_start.elapsed();
413
414 for (cold_start, overheads) in &batch_results {
415 cold_start_durations.push(*cold_start);
416 overhead_durations.extend(overheads.iter().copied());
417 }
418 completion_times.push(batch_elapsed);
420 }
421
422 let cold_start_stats = compute_stats(&cold_start_durations);
423 let overhead_stats = compute_stats(&overhead_durations);
424 let completion_stats = compute_stats(&completion_times);
425
426 self.emit_cv_warning(&overhead_stats, &workload.name);
428
429 let mean_completion_secs = if !completion_times.is_empty() {
431 completion_times.iter().map(|d| d.as_secs_f64()).sum::<f64>()
432 / completion_times.len() as f64
433 } else {
434 1.0
435 };
436 let agents_per_second = concurrency as f64 / mean_completion_secs;
437
438 let throughput = Some(ThroughputMetrics {
439 levels: vec![ConcurrencyLevel {
440 concurrency,
441 agents_per_second,
442 completion_time: completion_stats,
443 }],
444 });
445
446 Ok(BenchmarkResult {
447 schema_version: 1,
448 workload_name: workload.name.clone(),
449 model: workload.model.clone(),
450 metadata: self.build_run_metadata(),
451 cold_start: cold_start_stats,
452 agent_loop_overhead: overhead_stats,
453 tool_invocation: None,
454 throughput,
455 memory: None,
456 token_overhead: None,
457 reproducibility_rate: None,
458 iterations: self.config.runs,
459 })
460 }
461
462 async fn run_workload_with_sweep(
467 &self,
468 workload: &Workload,
469 sweep_levels: &[usize],
470 ) -> Result<BenchmarkResult> {
471 let levels_to_test =
472 if sweep_levels.is_empty() { SWEEP_LEVELS.to_vec() } else { sweep_levels.to_vec() };
473
474 tracing::info!(
475 workload = workload.name,
476 levels = ?levels_to_test,
477 "starting concurrency sweep"
478 );
479
480 let min_level = *levels_to_test.first().unwrap_or(&1);
482 for _ in 0..self.config.warmup {
483 self.execute_concurrent_batch(workload, min_level).await?;
484 }
485
486 let mut all_cold_starts = Vec::new();
487 let mut all_overheads = Vec::new();
488 let mut throughput_levels = Vec::new();
489
490 for &level in &levels_to_test {
491 tracing::info!(
492 workload = workload.name,
493 concurrency = level,
494 "sweeping concurrency level"
495 );
496
497 let mut level_completion_times = Vec::new();
498
499 for _ in 0..self.config.runs {
500 let batch_start = Instant::now();
501 let batch_results = self.execute_concurrent_batch(workload, level).await?;
502 let batch_elapsed = batch_start.elapsed();
503
504 for (cold_start, overheads) in &batch_results {
505 all_cold_starts.push(*cold_start);
506 all_overheads.extend(overheads.iter().copied());
507 }
508 level_completion_times.push(batch_elapsed);
509 }
510
511 let completion_stats = compute_stats(&level_completion_times);
512 let mean_secs = if !level_completion_times.is_empty() {
513 level_completion_times.iter().map(|d| d.as_secs_f64()).sum::<f64>()
514 / level_completion_times.len() as f64
515 } else {
516 1.0
517 };
518 let agents_per_second = level as f64 / mean_secs;
519
520 throughput_levels.push(ConcurrencyLevel {
521 concurrency: level,
522 agents_per_second,
523 completion_time: completion_stats,
524 });
525 }
526
527 let cold_start_stats = compute_stats(&all_cold_starts);
528 let overhead_stats = compute_stats(&all_overheads);
529
530 self.emit_cv_warning(&overhead_stats, &workload.name);
532
533 Ok(BenchmarkResult {
534 schema_version: 1,
535 workload_name: workload.name.clone(),
536 model: workload.model.clone(),
537 metadata: self.build_run_metadata(),
538 cold_start: cold_start_stats,
539 agent_loop_overhead: overhead_stats,
540 tool_invocation: None,
541 throughput: Some(ThroughputMetrics { levels: throughput_levels }),
542 memory: None,
543 token_overhead: None,
544 reproducibility_rate: None,
545 iterations: self.config.runs,
546 })
547 }
548
549 async fn execute_concurrent_batch(
554 &self,
555 workload: &Workload,
556 concurrency: usize,
557 ) -> Result<Vec<(Duration, Vec<Duration>)>> {
558 let mut join_set = JoinSet::new();
559
560 for _ in 0..concurrency {
561 let workload = workload.clone();
562 let model_name = self.config.model.clone();
563 join_set.spawn(async move { execute_workload_real(&workload, &model_name).await });
564 }
565
566 let mut results = Vec::with_capacity(concurrency);
567 while let Some(join_result) = join_set.join_next().await {
568 let task_result =
569 join_result.map_err(|e| BenchError::Llm(format!("task join failed: {e}")))?;
570 results.push(task_result?);
571 }
572
573 Ok(results)
574 }
575
576 async fn execute_single_workload(
580 &self,
581 workload: &Workload,
582 ) -> Result<(Duration, Vec<Duration>)> {
583 execute_workload_real(workload, &self.config.model).await
584 }
585
586 fn emit_cv_warning(&self, stats: &DurationStats, workload_name: &str) {
588 if stats.count > 1 && stats.coefficient_of_variation > CV_WARNING_THRESHOLD {
589 tracing::warn!(
590 workload = workload_name,
591 cv = format!("{:.1}%", stats.coefficient_of_variation * 100.0),
592 threshold = "20%",
593 mean_us = stats.mean_us,
594 std_dev_us = stats.std_dev_us,
595 "Agent_Loop_Overhead CV exceeds 20%, measurements may be unstable. \
596 Consider increasing iteration count or reducing system load."
597 );
598 }
599 }
600
601 fn build_run_metadata(&self) -> RunMetadata {
603 RunMetadata {
604 timestamp: chrono::Utc::now().to_rfc3339(),
605 adk_version: env!("CARGO_PKG_VERSION").to_string(),
606 rust_version: rustc_version(),
607 os: std::env::consts::OS.to_string(),
608 arch: std::env::consts::ARCH.to_string(),
609 }
610 }
611
612 fn results_to_baseline_metrics(
614 &self,
615 results: &[BenchmarkResult],
616 ) -> HashMap<String, HashMap<String, f64>> {
617 let mut metrics: HashMap<String, HashMap<String, f64>> = HashMap::new();
618
619 for result in results {
620 let prefix = &result.workload_name;
621
622 let mut case_metrics = HashMap::new();
623 case_metrics
624 .insert(format!("{prefix}::cold_start_mean_us"), result.cold_start.mean_us as f64);
625 case_metrics
626 .insert(format!("{prefix}::cold_start_p95_us"), result.cold_start.p95_us as f64);
627 case_metrics.insert(
628 format!("{prefix}::overhead_mean_us"),
629 result.agent_loop_overhead.mean_us as f64,
630 );
631 case_metrics.insert(
632 format!("{prefix}::overhead_p95_us"),
633 result.agent_loop_overhead.p95_us as f64,
634 );
635
636 metrics.entry("timing".to_string()).or_default().extend(case_metrics);
639 }
640
641 metrics
642 }
643}
644
645fn create_llm(model_name: &str) -> Result<Arc<dyn Llm>> {
650 let api_key = std::env::var("GOOGLE_API_KEY").map_err(|_| {
651 BenchError::Llm(
652 "GOOGLE_API_KEY environment variable not set. \
653 Set it to your Gemini API key to run benchmarks."
654 .to_string(),
655 )
656 })?;
657
658 let model = GeminiModel::new(api_key, model_name).map_err(|e| {
659 BenchError::Llm(format!("failed to create Gemini model '{model_name}': {e}"))
660 })?;
661
662 Ok(Arc::new(model))
663}
664
665fn create_tools_from_workload(workload: &Workload) -> Vec<Arc<dyn adk_core::Tool>> {
671 workload
672 .agent
673 .tools
674 .iter()
675 .map(|(name, def)| {
676 let tool_name = name.clone();
677 let description = def.description.clone();
678 let fixed_response = def.fixed_response.clone();
679 let latency_ms = def.simulated_latency_ms;
680
681 let tool = FunctionTool::new(tool_name, description, move |_ctx, _args| {
682 let response = fixed_response.clone();
683 let latency = latency_ms;
684 async move {
685 if latency > 0 {
686 tokio::time::sleep(Duration::from_millis(latency)).await;
687 }
688 Ok(response.unwrap_or(serde_json::json!({"status": "success"})))
689 }
690 })
691 .with_read_only(true)
692 .with_concurrency_safe(true);
693
694 Arc::new(tool) as Arc<dyn adk_core::Tool>
695 })
696 .collect()
697}
698
699async fn execute_workload_real(
707 workload: &Workload,
708 model_name: &str,
709) -> Result<(Duration, Vec<Duration>)> {
710 let run_start = Instant::now();
711
712 let inner_llm = create_llm(model_name)?;
714 let instrumented = Arc::new(InstrumentedLlm::new(inner_llm));
715
716 let tools = create_tools_from_workload(workload);
718 let mut agent_builder = LlmAgentBuilder::new(&workload.name)
719 .model(instrumented.clone() as Arc<dyn Llm>)
720 .instruction(&workload.agent.instructions);
721
722 for tool in tools {
723 agent_builder = agent_builder.tool(tool);
724 }
725
726 let agent = agent_builder
727 .build()
728 .map_err(|e| BenchError::Llm(format!("failed to build agent: {e}")))?;
729
730 let session_service = Arc::new(InMemorySessionService::new());
732
733 let app_name = format!("bench-{}", workload.name);
735 let session_id_str = format!("bench-{}", uuid_v4());
736 session_service
737 .create(adk_session::CreateRequest {
738 app_name: app_name.clone(),
739 user_id: "bench-user".to_string(),
740 session_id: Some(session_id_str.clone()),
741 state: HashMap::new(),
742 })
743 .await
744 .map_err(|e| BenchError::Llm(format!("failed to create session: {e}")))?;
745
746 let runner = Runner::builder()
747 .app_name(app_name)
748 .agent(Arc::new(agent))
749 .session_service(session_service)
750 .build()
751 .map_err(|e| BenchError::Llm(format!("failed to create runner: {e}")))?;
752
753 let user_content = Content::new("user").with_text(&workload.agent.user_message);
755
756 let user_id = UserId::try_from("bench-user")
757 .map_err(|e| BenchError::Llm(format!("invalid user id: {e}")))?;
758 let session_id = SessionId::try_from(session_id_str.as_str())
759 .map_err(|e| BenchError::Llm(format!("invalid session id: {e}")))?;
760
761 let turn_start = Instant::now();
762 let mut event_stream = runner
763 .run(user_id, session_id, user_content)
764 .await
765 .map_err(|e| BenchError::Llm(format!("agent run failed: {e}")))?;
766
767 while let Some(event_result) = event_stream.next().await {
769 match event_result {
770 Ok(_event) => {
771 }
773 Err(e) => {
774 tracing::warn!(error = %e, "event stream error during benchmark");
775 }
776 }
777 }
778 let total_turn_time = turn_start.elapsed();
779
780 let records = instrumented.records().await;
782
783 let cold_start = if let Some(first_record) = records.first() {
785 first_record.request_sent.duration_since(run_start)
786 } else {
787 run_start.elapsed()
788 };
789
790 let total_llm_time: Duration = records.iter().map(|r| r.round_trip).sum();
793 let overhead = total_turn_time.saturating_sub(total_llm_time);
794
795 let num_turns = records.len().max(1);
797 let per_turn_overhead = overhead / num_turns as u32;
798 let overheads: Vec<Duration> = (0..num_turns).map(|_| per_turn_overhead).collect();
799
800 tracing::debug!(
801 workload = workload.name,
802 cold_start_us = cold_start.as_micros(),
803 total_turn_ms = total_turn_time.as_millis(),
804 llm_calls = records.len(),
805 total_llm_ms = total_llm_time.as_millis(),
806 overhead_us = overhead.as_micros(),
807 "workload execution complete"
808 );
809
810 Ok((cold_start, overheads))
811}
812
/// Returns a 32-character lowercase hex identifier that is unique within this
/// process.
///
/// Despite the name, this is not a random RFC 4122 UUID: the upper 64 bits come
/// from the wall-clock time in nanoseconds since the Unix epoch and the lower 64
/// bits from a process-wide counter. The counter guarantees that calls landing
/// in the same clock tick (for example from concurrently spawned tasks) still
/// produce distinct ids.
fn uuid_v4() -> String {
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::SystemTime;

    static NEXT_SEQUENCE: AtomicU64 = AtomicU64::new(0);

    let nanos =
        SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap_or_default().as_nanos();
    // Only uniqueness matters here, so relaxed ordering is sufficient.
    let sequence = NEXT_SEQUENCE.fetch_add(1, Ordering::Relaxed);

    format!("{:032x}", (nanos << 64) | u128::from(sequence))
}
820
821fn rustc_version() -> String {
823 option_env!("RUSTC_VERSION").unwrap_or(env!("CARGO_PKG_RUST_VERSION")).to_string()
825}
826
827#[cfg(test)]
828mod tests {
829 use super::*;
830
831 fn test_config() -> BenchConfig {
832 BenchConfig { runs: 3, warmup: 1, concurrency: 1, ..Default::default() }
833 }
834
835 #[tokio::test]
836 async fn test_bench_runner_new() {
837 let config = test_config();
838 let runner = BenchRunner::new(config.clone());
839 assert_eq!(runner.config.runs, 3);
840 assert_eq!(runner.config.warmup, 1);
841 }
842
843 #[tokio::test]
844 async fn test_resolve_workloads_all_builtin() {
845 let config = test_config();
846 let runner = BenchRunner::new(config);
847 let workloads = runner.resolve_workloads().unwrap();
848 assert_eq!(workloads.len(), 3);
849 }
850
851 #[tokio::test]
852 async fn test_resolve_workloads_with_experimental() {
853 let config = BenchConfig { experimental: true, ..test_config() };
854 let runner = BenchRunner::new(config);
855 let workloads = runner.resolve_workloads().unwrap();
856 assert_eq!(workloads.len(), 4);
857 }
858
859 #[tokio::test]
860 async fn test_resolve_workloads_specific_builtin() {
861 let config =
862 BenchConfig { workload: Some("simple_tool_call".to_string()), ..test_config() };
863 let runner = BenchRunner::new(config);
864 let workloads = runner.resolve_workloads().unwrap();
865 assert_eq!(workloads.len(), 1);
866 assert_eq!(workloads[0].name, "simple_tool_call");
867 }
868
869 #[tokio::test]
870 async fn test_resolve_workloads_not_found() {
871 let config =
872 BenchConfig { workload: Some("nonexistent_workload".to_string()), ..test_config() };
873 let runner = BenchRunner::new(config);
874 let result = runner.resolve_workloads();
875 assert!(result.is_err());
876 }
877
878 #[tokio::test]
879 async fn test_dry_run_returns_empty() {
880 let config = BenchConfig { dry_run: true, ..test_config() };
881 let runner = BenchRunner::new(config);
882 let results = runner.run().await.unwrap();
883 assert!(results.is_empty());
884 }
885
886 #[tokio::test]
887 async fn test_max_cost_usd_abort() {
888 let config = BenchConfig {
889 max_cost_usd: Some(0.0001), runs: 100,
891 ..test_config()
892 };
893 let runner = BenchRunner::new(config);
894 let result = runner.run().await;
895 assert!(result.is_err());
896 }
897
898 #[tokio::test]
899 #[ignore] async fn test_sequential_run() {
901 let config = BenchConfig {
902 workload: Some("simple_tool_call".to_string()),
903 runs: 2,
904 warmup: 1,
905 confirm_cost: true,
906 ..test_config()
907 };
908 let runner = BenchRunner::new(config);
909 let results = runner.run().await.unwrap();
910 assert_eq!(results.len(), 1);
911 assert_eq!(results[0].workload_name, "simple_tool_call");
912 assert_eq!(results[0].iterations, 2);
913 assert!(results[0].throughput.is_none());
914 }
915
916 #[tokio::test]
917 #[ignore] async fn test_concurrent_run() {
919 let config = BenchConfig {
920 workload: Some("simple_tool_call".to_string()),
921 runs: 2,
922 warmup: 1,
923 concurrency: 4,
924 confirm_cost: true,
925 ..test_config()
926 };
927 let runner = BenchRunner::new(config);
928 let results = runner.run().await.unwrap();
929 assert_eq!(results.len(), 1);
930 assert!(results[0].throughput.is_some());
931 let throughput = results[0].throughput.as_ref().unwrap();
932 assert_eq!(throughput.levels.len(), 1);
933 assert_eq!(throughput.levels[0].concurrency, 4);
934 }
935
936 #[tokio::test]
937 #[ignore] async fn test_sweep_mode() {
939 let config = BenchConfig {
940 workload: Some("simple_tool_call".to_string()),
941 runs: 1,
942 warmup: 1,
943 concurrency_sweep: Some(vec![1, 2, 4]),
944 confirm_cost: true,
945 ..test_config()
946 };
947 let runner = BenchRunner::new(config);
948 let results = runner.run().await.unwrap();
949 assert_eq!(results.len(), 1);
950 assert!(results[0].throughput.is_some());
951 let throughput = results[0].throughput.as_ref().unwrap();
952 assert_eq!(throughput.levels.len(), 3);
953 assert_eq!(throughput.levels[0].concurrency, 1);
954 assert_eq!(throughput.levels[1].concurrency, 2);
955 assert_eq!(throughput.levels[2].concurrency, 4);
956 }
957
958 #[tokio::test]
959 async fn test_cv_warning_not_emitted_for_low_cv() {
960 let stats = DurationStats {
961 min_us: 100,
962 max_us: 120,
963 mean_us: 110,
964 median_us: 110,
965 p95_us: 119,
966 p99_us: 120,
967 std_dev_us: 5,
968 count: 10,
969 coefficient_of_variation: 0.045, };
971 let config = test_config();
972 let runner = BenchRunner::new(config);
973 runner.emit_cv_warning(&stats, "test_workload");
975 }
976
977 #[tokio::test]
978 async fn test_save_and_check_baseline() {
979 let dir = tempfile::TempDir::new().unwrap();
980 let baseline_path = dir.path().join("test-baseline.json");
981
982 let config = BenchConfig { baseline_path: baseline_path.clone(), ..test_config() };
983 let runner = BenchRunner::new(config);
984
985 let results = vec![BenchmarkResult {
987 schema_version: 1,
988 workload_name: "test_workload".to_string(),
989 model: "gemini-2.5-flash".to_string(),
990 metadata: RunMetadata {
991 timestamp: "2025-01-01T00:00:00Z".to_string(),
992 adk_version: "0.5.0".to_string(),
993 rust_version: "1.85.0".to_string(),
994 os: "linux".to_string(),
995 arch: "x86_64".to_string(),
996 },
997 cold_start: compute_stats(&[Duration::from_micros(1000)]),
998 agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
999 tool_invocation: None,
1000 throughput: None,
1001 memory: None,
1002 token_overhead: None,
1003 reproducibility_rate: None,
1004 iterations: 5,
1005 }];
1006
1007 runner.save_baseline(&results).unwrap();
1009 assert!(baseline_path.exists());
1010
1011 let regressions = runner.check_regression(&results).unwrap();
1013 assert!(regressions.is_empty());
1014 }
1015
1016 #[tokio::test]
1017 async fn test_check_regression_detects_timing_increase() {
1018 let dir = tempfile::TempDir::new().unwrap();
1019 let baseline_path = dir.path().join("test-baseline.json");
1020
1021 let config = BenchConfig {
1022 baseline_path: baseline_path.clone(),
1023 tolerance: 0.10, ..test_config()
1025 };
1026 let runner = BenchRunner::new(config);
1027
1028 let baseline_results = vec![BenchmarkResult {
1030 schema_version: 1,
1031 workload_name: "test_workload".to_string(),
1032 model: "gemini-2.5-flash".to_string(),
1033 metadata: RunMetadata {
1034 timestamp: "2025-01-01T00:00:00Z".to_string(),
1035 adk_version: "0.5.0".to_string(),
1036 rust_version: "1.85.0".to_string(),
1037 os: "linux".to_string(),
1038 arch: "x86_64".to_string(),
1039 },
1040 cold_start: compute_stats(&[Duration::from_micros(1000)]),
1041 agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
1042 tool_invocation: None,
1043 throughput: None,
1044 memory: None,
1045 token_overhead: None,
1046 reproducibility_rate: None,
1047 iterations: 5,
1048 }];
1049 runner.save_baseline(&baseline_results).unwrap();
1050
1051 let current_results = vec![BenchmarkResult {
1053 schema_version: 1,
1054 workload_name: "test_workload".to_string(),
1055 model: "gemini-2.5-flash".to_string(),
1056 metadata: RunMetadata {
1057 timestamp: "2025-01-02T00:00:00Z".to_string(),
1058 adk_version: "0.5.0".to_string(),
1059 rust_version: "1.85.0".to_string(),
1060 os: "linux".to_string(),
1061 arch: "x86_64".to_string(),
1062 },
1063 cold_start: compute_stats(&[Duration::from_micros(1200)]),
1064 agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
1065 tool_invocation: None,
1066 throughput: None,
1067 memory: None,
1068 token_overhead: None,
1069 reproducibility_rate: None,
1070 iterations: 5,
1071 }];
1072
1073 let regressions = runner.check_regression(¤t_results).unwrap();
1074 assert!(!regressions.is_empty(), "expected regression for 20% cold start increase");
1076
1077 let cold_start_regression = regressions
1079 .iter()
1080 .find(|r| r.metric_name.contains("cold_start"))
1081 .expect("should have cold_start regression");
1082 assert_eq!(cold_start_regression.workload_name, "test_workload");
1083 assert!((cold_start_regression.degradation - 0.20).abs() < 0.01);
1084 }
1085
1086 #[tokio::test]
1087 async fn test_check_regression_within_tolerance() {
1088 let dir = tempfile::TempDir::new().unwrap();
1089 let baseline_path = dir.path().join("test-baseline.json");
1090
1091 let config = BenchConfig {
1092 baseline_path: baseline_path.clone(),
1093 tolerance: 0.10, ..test_config()
1095 };
1096 let runner = BenchRunner::new(config);
1097
1098 let baseline_results = vec![BenchmarkResult {
1100 schema_version: 1,
1101 workload_name: "test_workload".to_string(),
1102 model: "gemini-2.5-flash".to_string(),
1103 metadata: RunMetadata {
1104 timestamp: "2025-01-01T00:00:00Z".to_string(),
1105 adk_version: "0.5.0".to_string(),
1106 rust_version: "1.85.0".to_string(),
1107 os: "linux".to_string(),
1108 arch: "x86_64".to_string(),
1109 },
1110 cold_start: compute_stats(&[Duration::from_micros(1000)]),
1111 agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
1112 tool_invocation: None,
1113 throughput: None,
1114 memory: None,
1115 token_overhead: None,
1116 reproducibility_rate: None,
1117 iterations: 5,
1118 }];
1119 runner.save_baseline(&baseline_results).unwrap();
1120
1121 let current_results = vec![BenchmarkResult {
1123 schema_version: 1,
1124 workload_name: "test_workload".to_string(),
1125 model: "gemini-2.5-flash".to_string(),
1126 metadata: RunMetadata {
1127 timestamp: "2025-01-02T00:00:00Z".to_string(),
1128 adk_version: "0.5.0".to_string(),
1129 rust_version: "1.85.0".to_string(),
1130 os: "linux".to_string(),
1131 arch: "x86_64".to_string(),
1132 },
1133 cold_start: compute_stats(&[Duration::from_micros(1050)]),
1134 agent_loop_overhead: compute_stats(&[Duration::from_micros(105)]),
1135 tool_invocation: None,
1136 throughput: None,
1137 memory: None,
1138 token_overhead: None,
1139 reproducibility_rate: None,
1140 iterations: 5,
1141 }];
1142
1143 let regressions = runner.check_regression(¤t_results).unwrap();
1144 assert!(
1146 regressions.is_empty(),
1147 "expected no regression for 5% increase within 10% tolerance"
1148 );
1149 }
1150
1151 #[tokio::test]
1152 async fn test_check_regression_improvement_not_flagged() {
1153 let dir = tempfile::TempDir::new().unwrap();
1154 let baseline_path = dir.path().join("test-baseline.json");
1155
1156 let config =
1157 BenchConfig { baseline_path: baseline_path.clone(), tolerance: 0.10, ..test_config() };
1158 let runner = BenchRunner::new(config);
1159
1160 let baseline_results = vec![BenchmarkResult {
1162 schema_version: 1,
1163 workload_name: "test_workload".to_string(),
1164 model: "gemini-2.5-flash".to_string(),
1165 metadata: RunMetadata {
1166 timestamp: "2025-01-01T00:00:00Z".to_string(),
1167 adk_version: "0.5.0".to_string(),
1168 rust_version: "1.85.0".to_string(),
1169 os: "linux".to_string(),
1170 arch: "x86_64".to_string(),
1171 },
1172 cold_start: compute_stats(&[Duration::from_micros(1000)]),
1173 agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
1174 tool_invocation: None,
1175 throughput: None,
1176 memory: None,
1177 token_overhead: None,
1178 reproducibility_rate: None,
1179 iterations: 5,
1180 }];
1181 runner.save_baseline(&baseline_results).unwrap();
1182
1183 let current_results = vec![BenchmarkResult {
1185 schema_version: 1,
1186 workload_name: "test_workload".to_string(),
1187 model: "gemini-2.5-flash".to_string(),
1188 metadata: RunMetadata {
1189 timestamp: "2025-01-02T00:00:00Z".to_string(),
1190 adk_version: "0.5.0".to_string(),
1191 rust_version: "1.85.0".to_string(),
1192 os: "linux".to_string(),
1193 arch: "x86_64".to_string(),
1194 },
1195 cold_start: compute_stats(&[Duration::from_micros(800)]),
1196 agent_loop_overhead: compute_stats(&[Duration::from_micros(80)]),
1197 tool_invocation: None,
1198 throughput: None,
1199 memory: None,
1200 token_overhead: None,
1201 reproducibility_rate: None,
1202 iterations: 5,
1203 }];
1204
1205 let regressions = runner.check_regression(¤t_results).unwrap();
1206 assert!(regressions.is_empty(), "improvement should not be flagged as regression");
1208 }
1209
1210 #[tokio::test]
1211 async fn test_check_regression_no_baseline_file() {
1212 let dir = tempfile::TempDir::new().unwrap();
1213 let baseline_path = dir.path().join("nonexistent-baseline.json");
1214
1215 let config =
1216 BenchConfig { baseline_path: baseline_path.clone(), tolerance: 0.10, ..test_config() };
1217 let runner = BenchRunner::new(config);
1218
1219 let results = vec![BenchmarkResult {
1220 schema_version: 1,
1221 workload_name: "test_workload".to_string(),
1222 model: "gemini-2.5-flash".to_string(),
1223 metadata: RunMetadata {
1224 timestamp: "2025-01-01T00:00:00Z".to_string(),
1225 adk_version: "0.5.0".to_string(),
1226 rust_version: "1.85.0".to_string(),
1227 os: "linux".to_string(),
1228 arch: "x86_64".to_string(),
1229 },
1230 cold_start: compute_stats(&[Duration::from_micros(1000)]),
1231 agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
1232 tool_invocation: None,
1233 throughput: None,
1234 memory: None,
1235 token_overhead: None,
1236 reproducibility_rate: None,
1237 iterations: 5,
1238 }];
1239
1240 let regressions = runner.check_regression(&results).unwrap();
1242 assert!(regressions.is_empty());
1243 }
1244
1245 #[tokio::test]
1246 async fn test_check_regression_exact_tolerance_boundary() {
1247 let dir = tempfile::TempDir::new().unwrap();
1248 let baseline_path = dir.path().join("test-baseline.json");
1249
1250 let config = BenchConfig {
1251 baseline_path: baseline_path.clone(),
1252 tolerance: 0.10, ..test_config()
1254 };
1255 let runner = BenchRunner::new(config);
1256
1257 let baseline_results = vec![BenchmarkResult {
1259 schema_version: 1,
1260 workload_name: "test_workload".to_string(),
1261 model: "gemini-2.5-flash".to_string(),
1262 metadata: RunMetadata {
1263 timestamp: "2025-01-01T00:00:00Z".to_string(),
1264 adk_version: "0.5.0".to_string(),
1265 rust_version: "1.85.0".to_string(),
1266 os: "linux".to_string(),
1267 arch: "x86_64".to_string(),
1268 },
1269 cold_start: compute_stats(&[Duration::from_micros(1000)]),
1270 agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
1271 tool_invocation: None,
1272 throughput: None,
1273 memory: None,
1274 token_overhead: None,
1275 reproducibility_rate: None,
1276 iterations: 5,
1277 }];
1278 runner.save_baseline(&baseline_results).unwrap();
1279
1280 let current_results = vec![BenchmarkResult {
1283 schema_version: 1,
1284 workload_name: "test_workload".to_string(),
1285 model: "gemini-2.5-flash".to_string(),
1286 metadata: RunMetadata {
1287 timestamp: "2025-01-02T00:00:00Z".to_string(),
1288 adk_version: "0.5.0".to_string(),
1289 rust_version: "1.85.0".to_string(),
1290 os: "linux".to_string(),
1291 arch: "x86_64".to_string(),
1292 },
1293 cold_start: compute_stats(&[Duration::from_micros(1100)]),
1294 agent_loop_overhead: compute_stats(&[Duration::from_micros(110)]),
1295 tool_invocation: None,
1296 throughput: None,
1297 memory: None,
1298 token_overhead: None,
1299 reproducibility_rate: None,
1300 iterations: 5,
1301 }];
1302
1303 let regressions = runner.check_regression(¤t_results).unwrap();
1304 assert!(
1306 regressions.is_empty(),
1307 "exactly at tolerance boundary should not trigger regression"
1308 );
1309 }
1310
1311 #[tokio::test]
1312 async fn test_check_regression_multiple_workloads() {
1313 let dir = tempfile::TempDir::new().unwrap();
1314 let baseline_path = dir.path().join("test-baseline.json");
1315
1316 let config =
1317 BenchConfig { baseline_path: baseline_path.clone(), tolerance: 0.10, ..test_config() };
1318 let runner = BenchRunner::new(config);
1319
1320 let baseline_results = vec![
1322 BenchmarkResult {
1323 schema_version: 1,
1324 workload_name: "workload_a".to_string(),
1325 model: "gemini-2.5-flash".to_string(),
1326 metadata: RunMetadata {
1327 timestamp: "2025-01-01T00:00:00Z".to_string(),
1328 adk_version: "0.5.0".to_string(),
1329 rust_version: "1.85.0".to_string(),
1330 os: "linux".to_string(),
1331 arch: "x86_64".to_string(),
1332 },
1333 cold_start: compute_stats(&[Duration::from_micros(1000)]),
1334 agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
1335 tool_invocation: None,
1336 throughput: None,
1337 memory: None,
1338 token_overhead: None,
1339 reproducibility_rate: None,
1340 iterations: 5,
1341 },
1342 BenchmarkResult {
1343 schema_version: 1,
1344 workload_name: "workload_b".to_string(),
1345 model: "gemini-2.5-flash".to_string(),
1346 metadata: RunMetadata {
1347 timestamp: "2025-01-01T00:00:00Z".to_string(),
1348 adk_version: "0.5.0".to_string(),
1349 rust_version: "1.85.0".to_string(),
1350 os: "linux".to_string(),
1351 arch: "x86_64".to_string(),
1352 },
1353 cold_start: compute_stats(&[Duration::from_micros(2000)]),
1354 agent_loop_overhead: compute_stats(&[Duration::from_micros(200)]),
1355 tool_invocation: None,
1356 throughput: None,
1357 memory: None,
1358 token_overhead: None,
1359 reproducibility_rate: None,
1360 iterations: 5,
1361 },
1362 ];
1363 runner.save_baseline(&baseline_results).unwrap();
1364
1365 let current_results = vec![
1367 BenchmarkResult {
1368 schema_version: 1,
1369 workload_name: "workload_a".to_string(),
1370 model: "gemini-2.5-flash".to_string(),
1371 metadata: RunMetadata {
1372 timestamp: "2025-01-02T00:00:00Z".to_string(),
1373 adk_version: "0.5.0".to_string(),
1374 rust_version: "1.85.0".to_string(),
1375 os: "linux".to_string(),
1376 arch: "x86_64".to_string(),
1377 },
1378 cold_start: compute_stats(&[Duration::from_micros(1300)]),
1379 agent_loop_overhead: compute_stats(&[Duration::from_micros(100)]),
1380 tool_invocation: None,
1381 throughput: None,
1382 memory: None,
1383 token_overhead: None,
1384 reproducibility_rate: None,
1385 iterations: 5,
1386 },
1387 BenchmarkResult {
1388 schema_version: 1,
1389 workload_name: "workload_b".to_string(),
1390 model: "gemini-2.5-flash".to_string(),
1391 metadata: RunMetadata {
1392 timestamp: "2025-01-02T00:00:00Z".to_string(),
1393 adk_version: "0.5.0".to_string(),
1394 rust_version: "1.85.0".to_string(),
1395 os: "linux".to_string(),
1396 arch: "x86_64".to_string(),
1397 },
1398 cold_start: compute_stats(&[Duration::from_micros(2000)]),
1399 agent_loop_overhead: compute_stats(&[Duration::from_micros(200)]),
1400 tool_invocation: None,
1401 throughput: None,
1402 memory: None,
1403 token_overhead: None,
1404 reproducibility_rate: None,
1405 iterations: 5,
1406 },
1407 ];
1408
1409 let regressions = runner.check_regression(¤t_results).unwrap();
1410 assert!(!regressions.is_empty());
1412 let workload_a_regressions: Vec<_> =
1413 regressions.iter().filter(|r| r.workload_name == "workload_a").collect();
1414 assert!(!workload_a_regressions.is_empty(), "workload_a should have regressions");
1415
1416 let workload_b_regressions: Vec<_> =
1417 regressions.iter().filter(|r| r.workload_name == "workload_b").collect();
1418 assert!(workload_b_regressions.is_empty(), "workload_b should not have regressions");
1419 }
1420
1421 #[tokio::test]
1422 async fn test_regression_report_fields() {
1423 let dir = tempfile::TempDir::new().unwrap();
1424 let baseline_path = dir.path().join("test-baseline.json");
1425
1426 let config =
1427 BenchConfig { baseline_path: baseline_path.clone(), tolerance: 0.10, ..test_config() };
1428 let runner = BenchRunner::new(config);
1429
1430 let baseline_results = vec![BenchmarkResult {
1432 schema_version: 1,
1433 workload_name: "my_workload".to_string(),
1434 model: "gemini-2.5-flash".to_string(),
1435 metadata: RunMetadata {
1436 timestamp: "2025-01-01T00:00:00Z".to_string(),
1437 adk_version: "0.5.0".to_string(),
1438 rust_version: "1.85.0".to_string(),
1439 os: "linux".to_string(),
1440 arch: "x86_64".to_string(),
1441 },
1442 cold_start: compute_stats(&[Duration::from_micros(1000)]),
1443 agent_loop_overhead: compute_stats(&[Duration::from_micros(200)]),
1444 tool_invocation: None,
1445 throughput: None,
1446 memory: None,
1447 token_overhead: None,
1448 reproducibility_rate: None,
1449 iterations: 5,
1450 }];
1451 runner.save_baseline(&baseline_results).unwrap();
1452
1453 let current_results = vec![BenchmarkResult {
1455 schema_version: 1,
1456 workload_name: "my_workload".to_string(),
1457 model: "gemini-2.5-flash".to_string(),
1458 metadata: RunMetadata {
1459 timestamp: "2025-01-02T00:00:00Z".to_string(),
1460 adk_version: "0.5.0".to_string(),
1461 rust_version: "1.85.0".to_string(),
1462 os: "linux".to_string(),
1463 arch: "x86_64".to_string(),
1464 },
1465 cold_start: compute_stats(&[Duration::from_micros(1500)]),
1466 agent_loop_overhead: compute_stats(&[Duration::from_micros(200)]),
1467 tool_invocation: None,
1468 throughput: None,
1469 memory: None,
1470 token_overhead: None,
1471 reproducibility_rate: None,
1472 iterations: 5,
1473 }];
1474
1475 let regressions = runner.check_regression(¤t_results).unwrap();
1476 assert!(!regressions.is_empty());
1477
1478 let report = regressions
1480 .iter()
1481 .find(|r| r.metric_name == "cold_start_mean_us")
1482 .expect("should have cold_start_mean_us regression");
1483
1484 assert_eq!(report.workload_name, "my_workload");
1485 assert!((report.baseline_value - 1000.0).abs() < 1.0);
1486 assert!((report.current_value - 1500.0).abs() < 1.0);
1487 assert!((report.degradation - 0.50).abs() < 0.01);
1488 }
1489
1490 #[tokio::test]
1491 async fn test_estimate_cost_non_zero() {
1492 let config = test_config();
1493 let runner = BenchRunner::new(config);
1494 let workloads = runner.resolve_workloads().unwrap();
1495 let cost = runner.estimate_cost(&workloads);
1496 assert!(cost >= 0.0);
1498 }
1499
1500 #[tokio::test]
1501 async fn test_build_run_metadata() {
1502 let config = test_config();
1503 let runner = BenchRunner::new(config);
1504 let metadata = runner.build_run_metadata();
1505 assert!(!metadata.timestamp.is_empty());
1506 assert!(!metadata.adk_version.is_empty());
1507 assert!(!metadata.os.is_empty());
1508 assert!(!metadata.arch.is_empty());
1509 }
1510}