use serde::{Deserialize, Serialize};
use std::time::{Duration, Instant};

32#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
37#[serde(rename_all = "camelCase")]
38pub struct DurationStats {
39 pub min_us: u64,
41 pub max_us: u64,
43 pub mean_us: u64,
45 pub median_us: u64,
47 pub p95_us: u64,
49 pub p99_us: u64,
51 pub std_dev_us: u64,
53 pub count: usize,
55 pub coefficient_of_variation: f64,
57}
58
59#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
61#[serde(rename_all = "camelCase")]
62pub struct BenchmarkResult {
63 #[serde(default = "default_schema_version")]
66 pub schema_version: u32,
67 pub workload_name: String,
69 pub model: String,
71 pub metadata: RunMetadata,
73 pub cold_start: DurationStats,
75 pub agent_loop_overhead: DurationStats,
77 #[serde(default, skip_serializing_if = "Option::is_none")]
79 pub tool_invocation: Option<ToolInvocationMetrics>,
80 #[serde(default, skip_serializing_if = "Option::is_none")]
82 pub throughput: Option<ThroughputMetrics>,
83 #[serde(default, skip_serializing_if = "Option::is_none")]
85 pub memory: Option<MemoryMetrics>,
86 #[serde(default, skip_serializing_if = "Option::is_none")]
88 pub token_overhead: Option<TokenOverheadMetrics>,
89 #[serde(default, skip_serializing_if = "Option::is_none")]
92 pub reproducibility_rate: Option<f64>,
93 pub iterations: usize,
95}
96
/// Returns the schema version assumed for serialized results that carry no
/// `schemaVersion` key (used as the serde default for that field).
fn default_schema_version() -> u32 {
    1
}

102#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
104#[serde(rename_all = "camelCase")]
105pub struct RunMetadata {
106 pub timestamp: String,
108 pub adk_version: String,
110 pub rust_version: String,
112 pub os: String,
114 pub arch: String,
116}
117
118#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
120#[serde(rename_all = "camelCase")]
121pub struct ToolInvocationMetrics {
122 pub total: DurationStats,
124 pub deserialization: DurationStats,
126 pub schema_validation: DurationStats,
128 pub execution_dispatch: DurationStats,
130}
131
132#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
134#[serde(rename_all = "camelCase")]
135pub struct ThroughputMetrics {
136 pub levels: Vec<ConcurrencyLevel>,
138}
139
140#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
142#[serde(rename_all = "camelCase")]
143pub struct ConcurrencyLevel {
144 pub concurrency: usize,
146 pub agents_per_second: f64,
148 pub completion_time: DurationStats,
150}
151
152#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
154#[serde(rename_all = "camelCase")]
155pub struct MemoryMetrics {
156 pub peak_rss_bytes: u64,
158 #[serde(skip_serializing_if = "Option::is_none")]
160 pub per_agent_bytes: Option<u64>,
161 pub sample_count: usize,
163}
164
165#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
167#[serde(rename_all = "camelCase")]
168pub struct TokenOverheadMetrics {
169 pub total_tokens: u64,
171 pub user_content_tokens: u64,
173 pub overhead_tokens: u64,
175 pub overhead_percentage: f64,
177 pub breakdown: TokenBreakdown,
179}
180
181#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
183#[serde(rename_all = "camelCase")]
184pub struct TokenBreakdown {
185 pub system_prompt_tokens: u64,
187 pub tool_schema_tokens: u64,
189 pub framework_wrapper_tokens: u64,
191}
192
193pub fn compute_stats(durations: &[Duration]) -> DurationStats {
208 if durations.is_empty() {
209 return DurationStats {
210 min_us: 0,
211 max_us: 0,
212 mean_us: 0,
213 median_us: 0,
214 p95_us: 0,
215 p99_us: 0,
216 std_dev_us: 0,
217 count: 0,
218 coefficient_of_variation: 0.0,
219 };
220 }
221
222 let mut micros: Vec<u64> = durations.iter().map(|d| d.as_micros() as u64).collect();
223 micros.sort_unstable();
224
225 let count = micros.len();
226 let min_us = micros[0];
227 let max_us = micros[count - 1];
228
229 let sum: u64 = micros.iter().sum();
231 let mean_us = sum / count as u64;
232
233 let median_us = percentile_nearest_rank(µs, 50.0);
235
236 let p95_us = percentile_nearest_rank(µs, 95.0);
238 let p99_us = percentile_nearest_rank(µs, 99.0);
239
240 let mean_f64 = sum as f64 / count as f64;
242 let variance: f64 = micros
243 .iter()
244 .map(|&v| {
245 let diff = v as f64 - mean_f64;
246 diff * diff
247 })
248 .sum::<f64>()
249 / count as f64;
250 let std_dev_f64 = variance.sqrt();
251 let std_dev_us = std_dev_f64 as u64;
252
253 let coefficient_of_variation = if mean_f64 == 0.0 { 0.0 } else { std_dev_f64 / mean_f64 };
255
256 DurationStats {
257 min_us,
258 max_us,
259 mean_us,
260 median_us,
261 p95_us,
262 p99_us,
263 std_dev_us,
264 count,
265 coefficient_of_variation,
266 }
267}
268
/// Returns the requested percentile of `sorted` using the nearest-rank method.
///
/// `sorted` must be in ascending order. `percentile` is on a 0-100 scale; the
/// computed rank is clamped to `1..=len`, so values at or below 0 select the
/// first element and values at or above 100 select the last.
///
/// An empty slice yields 0 instead of panicking.
fn percentile_nearest_rank(sorted: &[u64], percentile: f64) -> u64 {
    let count = sorted.len();
    if count == 0 {
        return 0;
    }
    // Multiply before dividing: `percentile / 100.0` is usually not exactly
    // representable, and its rounding error can push an exact rank over the
    // next integer (e.g. 0.07 * 100 evaluates to 7.000000000000001).
    let rank = (percentile * count as f64 / 100.0).ceil() as usize;
    sorted[rank.clamp(1, count) - 1]
}

/// Timing of a single tool invocation, overall and per phase.
///
/// The record is a plain container: nothing here checks that the phase
/// durations add up to `total`.
#[derive(Debug, Clone)]
pub struct ToolLatencyRecord {
    /// Total time of the invocation.
    pub total: Duration,
    /// Time spent in deserialization.
    pub deserialization: Duration,
    /// Time spent in schema validation.
    pub schema_validation: Duration,
    /// Time spent in execution dispatch.
    pub execution_dispatch: Duration,
}

298pub struct MetricCollector {
322 run_start: Option<Instant>,
323 first_llm_call: Option<Instant>,
324 turn_overheads: Vec<Duration>,
325 tool_latencies: Vec<ToolLatencyRecord>,
326 memory_samples: Vec<u64>,
327}
328
329impl MetricCollector {
330 pub fn new() -> Self {
332 Self {
333 run_start: None,
334 first_llm_call: None,
335 turn_overheads: Vec::new(),
336 tool_latencies: Vec::new(),
337 memory_samples: Vec::new(),
338 }
339 }
340
341 pub fn mark_run_start(&mut self) {
345 self.run_start = Some(Instant::now());
346 }
347
348 pub fn mark_first_llm_call(&mut self) {
353 if self.first_llm_call.is_none() {
354 self.first_llm_call = Some(Instant::now());
355 }
356 }
357
358 pub fn record_turn_overhead(&mut self, overhead: Duration) {
363 self.turn_overheads.push(overhead);
364 }
365
366 pub fn record_tool_latency(&mut self, record: ToolLatencyRecord) {
368 self.tool_latencies.push(record);
369 }
370
371 pub fn record_memory_sample(&mut self, rss_bytes: u64) {
373 self.memory_samples.push(rss_bytes);
374 }
375
376 pub fn cold_start_duration(&self) -> Option<Duration> {
381 match (self.run_start, self.first_llm_call) {
382 (Some(start), Some(first)) => Some(first.duration_since(start)),
383 _ => None,
384 }
385 }
386
387 pub fn turn_overheads(&self) -> &[Duration] {
389 &self.turn_overheads
390 }
391
392 pub fn tool_latencies(&self) -> &[ToolLatencyRecord] {
394 &self.tool_latencies
395 }
396
397 pub fn memory_samples(&self) -> &[u64] {
399 &self.memory_samples
400 }
401}
402
403impl Default for MetricCollector {
404 fn default() -> Self {
405 Self::new()
406 }
407}
408
/// Unit tests for the statistics helper, the metric collector and the serde
/// round-trips of the result types.
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty input produces an all-zero statistics record.
    #[test]
    fn test_compute_stats_empty() {
        let stats = compute_stats(&[]);
        assert_eq!(stats.count, 0);
        assert_eq!(stats.min_us, 0);
        assert_eq!(stats.max_us, 0);
        assert_eq!(stats.mean_us, 0);
        assert_eq!(stats.median_us, 0);
        assert_eq!(stats.p95_us, 0);
        assert_eq!(stats.p99_us, 0);
        assert_eq!(stats.std_dev_us, 0);
        assert_eq!(stats.coefficient_of_variation, 0.0);
    }

    /// With one sample every positional statistic equals that sample.
    #[test]
    fn test_compute_stats_single_element() {
        let durations = vec![Duration::from_micros(500)];
        let stats = compute_stats(&durations);
        assert_eq!(stats.count, 1);
        assert_eq!(stats.min_us, 500);
        assert_eq!(stats.max_us, 500);
        assert_eq!(stats.mean_us, 500);
        assert_eq!(stats.median_us, 500);
        assert_eq!(stats.p95_us, 500);
        assert_eq!(stats.p99_us, 500);
        assert_eq!(stats.std_dev_us, 0);
        assert_eq!(stats.coefficient_of_variation, 0.0);
    }

    #[test]
    fn test_compute_stats_multiple_elements() {
        let durations = vec![
            Duration::from_micros(100),
            Duration::from_micros(200),
            Duration::from_micros(300),
            Duration::from_micros(400),
            Duration::from_micros(500),
        ];
        let stats = compute_stats(&durations);
        assert_eq!(stats.count, 5);
        assert_eq!(stats.min_us, 100);
        assert_eq!(stats.max_us, 500);
        assert_eq!(stats.mean_us, 300);
        assert_eq!(stats.median_us, 300);
        // Nearest rank: ceil(0.95 * 5) = 5, so p95 is the largest sample.
        assert_eq!(stats.p95_us, 500);
        // Nearest rank: ceil(0.99 * 5) = 5, so p99 is the largest sample.
        assert_eq!(stats.p99_us, 500);
        // Population variance is 20_000, whose square root truncates to 141.
        assert_eq!(stats.std_dev_us, 141);
    }

    /// min <= median <= p95 <= p99 <= max must always hold.
    #[test]
    fn test_compute_stats_ordering_invariant() {
        let durations = vec![
            Duration::from_micros(50),
            Duration::from_micros(100),
            Duration::from_micros(150),
            Duration::from_micros(200),
            Duration::from_micros(250),
            Duration::from_micros(300),
            Duration::from_micros(350),
            Duration::from_micros(400),
            Duration::from_micros(450),
            Duration::from_micros(500),
        ];
        let stats = compute_stats(&durations);
        assert!(stats.min_us <= stats.median_us);
        assert!(stats.median_us <= stats.p95_us);
        assert!(stats.p95_us <= stats.p99_us);
        assert!(stats.p99_us <= stats.max_us);
    }

    /// The input order must not influence the result.
    #[test]
    fn test_compute_stats_unsorted_input() {
        let durations = vec![
            Duration::from_micros(500),
            Duration::from_micros(100),
            Duration::from_micros(300),
            Duration::from_micros(200),
            Duration::from_micros(400),
        ];
        let stats = compute_stats(&durations);
        assert_eq!(stats.min_us, 100);
        assert_eq!(stats.max_us, 500);
        assert_eq!(stats.mean_us, 300);
        assert_eq!(stats.median_us, 300);
    }

    #[test]
    fn test_metric_collector_cold_start() {
        let mut collector = MetricCollector::new();
        assert!(collector.cold_start_duration().is_none());

        collector.mark_run_start();
        assert!(collector.cold_start_duration().is_none());

        // Make sure a measurable amount of time passes before the first call.
        std::thread::sleep(Duration::from_millis(1));
        collector.mark_first_llm_call();

        let cold_start = collector.cold_start_duration().unwrap();
        assert!(cold_start >= Duration::from_millis(1));
    }

    #[test]
    fn test_metric_collector_first_llm_call_only_once() {
        let mut collector = MetricCollector::new();
        collector.mark_run_start();
        std::thread::sleep(Duration::from_millis(1));
        collector.mark_first_llm_call();

        let first_duration = collector.cold_start_duration().unwrap();

        // A second mark must be ignored, so the measured duration stays put.
        std::thread::sleep(Duration::from_millis(10));
        collector.mark_first_llm_call();

        let second_duration = collector.cold_start_duration().unwrap();
        assert_eq!(first_duration, second_duration);
    }

    #[test]
    fn test_metric_collector_turn_overheads() {
        let mut collector = MetricCollector::new();
        collector.record_turn_overhead(Duration::from_micros(100));
        collector.record_turn_overhead(Duration::from_micros(200));
        assert_eq!(collector.turn_overheads().len(), 2);
    }

    #[test]
    fn test_metric_collector_memory_samples() {
        let mut collector = MetricCollector::new();
        collector.record_memory_sample(1024);
        collector.record_memory_sample(2048);
        collector.record_memory_sample(4096);
        assert_eq!(collector.memory_samples(), &[1024, 2048, 4096]);
    }

    #[test]
    fn test_metric_collector_tool_latencies() {
        let mut collector = MetricCollector::new();
        collector.record_tool_latency(ToolLatencyRecord {
            total: Duration::from_micros(500),
            deserialization: Duration::from_micros(100),
            schema_validation: Duration::from_micros(150),
            execution_dispatch: Duration::from_micros(250),
        });
        assert_eq!(collector.tool_latencies().len(), 1);
    }

    #[test]
    fn test_duration_stats_serialization_round_trip() {
        let stats = DurationStats {
            min_us: 100,
            max_us: 500,
            mean_us: 300,
            median_us: 300,
            p95_us: 480,
            p99_us: 499,
            std_dev_us: 141,
            count: 5,
            coefficient_of_variation: 0.47,
        };
        let json = serde_json::to_string(&stats).unwrap();
        let deserialized: DurationStats = serde_json::from_str(&json).unwrap();
        assert_eq!(stats, deserialized);
    }

    /// An all-zero sample set must not divide by zero.
    #[test]
    fn test_coefficient_of_variation_zero_mean() {
        let durations = vec![Duration::from_micros(0), Duration::from_micros(0)];
        let stats = compute_stats(&durations);
        assert_eq!(stats.coefficient_of_variation, 0.0);
    }

    /// Builds a representative result with only some optional sections set.
    fn sample_benchmark_result() -> BenchmarkResult {
        BenchmarkResult {
            schema_version: 1,
            workload_name: "simple_tool_call".to_string(),
            model: "gemini-2.5-flash".to_string(),
            metadata: RunMetadata {
                timestamp: "2025-01-15T10:30:00Z".to_string(),
                adk_version: "0.5.0".to_string(),
                rust_version: "1.85.0".to_string(),
                os: "linux".to_string(),
                arch: "x86_64".to_string(),
            },
            cold_start: DurationStats {
                min_us: 1000,
                max_us: 5000,
                mean_us: 2500,
                median_us: 2400,
                p95_us: 4800,
                p99_us: 4950,
                std_dev_us: 800,
                count: 5,
                coefficient_of_variation: 0.32,
            },
            agent_loop_overhead: DurationStats {
                min_us: 100,
                max_us: 500,
                mean_us: 250,
                median_us: 240,
                p95_us: 480,
                p99_us: 495,
                std_dev_us: 80,
                count: 10,
                coefficient_of_variation: 0.32,
            },
            tool_invocation: None,
            throughput: None,
            memory: None,
            token_overhead: Some(TokenOverheadMetrics {
                total_tokens: 1200,
                user_content_tokens: 950,
                overhead_tokens: 250,
                overhead_percentage: 20.83,
                breakdown: TokenBreakdown {
                    system_prompt_tokens: 100,
                    tool_schema_tokens: 100,
                    framework_wrapper_tokens: 50,
                },
            }),
            reproducibility_rate: Some(0.95),
            iterations: 5,
        }
    }

    #[test]
    fn test_benchmark_result_serialization_round_trip() {
        let result = sample_benchmark_result();
        let json = serde_json::to_string(&result).unwrap();
        let deserialized: BenchmarkResult = serde_json::from_str(&json).unwrap();
        assert_eq!(result, deserialized);
    }

    #[test]
    fn test_benchmark_result_schema_version_always_present() {
        let result = sample_benchmark_result();
        let json = serde_json::to_string(&result).unwrap();
        let value: serde_json::Value = serde_json::from_str(&json).unwrap();
        assert_eq!(value["schemaVersion"], serde_json::json!(1));
    }

    /// Input written before `schemaVersion` existed must still load as version 1.
    #[test]
    fn test_benchmark_result_deserialize_missing_schema_version() {
        let json = r#"{
            "workloadName": "simple_tool_call",
            "model": "gemini-2.5-flash",
            "metadata": {
                "timestamp": "2025-01-15T10:30:00Z",
                "adkVersion": "0.4.0",
                "rustVersion": "1.85.0",
                "os": "linux",
                "arch": "x86_64"
            },
            "coldStart": {
                "minUs": 1000, "maxUs": 5000, "meanUs": 2500,
                "medianUs": 2400, "p95Us": 4800, "p99Us": 4950,
                "stdDevUs": 800, "count": 5, "coefficientOfVariation": 0.32
            },
            "agentLoopOverhead": {
                "minUs": 100, "maxUs": 500, "meanUs": 250,
                "medianUs": 240, "p95Us": 480, "p99Us": 495,
                "stdDevUs": 80, "count": 10, "coefficientOfVariation": 0.32
            },
            "iterations": 5
        }"#;

        let result: BenchmarkResult = serde_json::from_str(json).unwrap();
        assert_eq!(result.schema_version, 1);
    }

    /// Every optional section may be absent from the input.
    #[test]
    fn test_benchmark_result_deserialize_missing_optional_fields() {
        let json = r#"{
            "schemaVersion": 1,
            "workloadName": "simple_tool_call",
            "model": "gemini-2.5-flash",
            "metadata": {
                "timestamp": "2025-01-15T10:30:00Z",
                "adkVersion": "0.4.0",
                "rustVersion": "1.85.0",
                "os": "linux",
                "arch": "x86_64"
            },
            "coldStart": {
                "minUs": 1000, "maxUs": 5000, "meanUs": 2500,
                "medianUs": 2400, "p95Us": 4800, "p99Us": 4950,
                "stdDevUs": 800, "count": 5, "coefficientOfVariation": 0.32
            },
            "agentLoopOverhead": {
                "minUs": 100, "maxUs": 500, "meanUs": 250,
                "medianUs": 240, "p95Us": 480, "p99Us": 495,
                "stdDevUs": 80, "count": 10, "coefficientOfVariation": 0.32
            },
            "iterations": 5
        }"#;

        let result: BenchmarkResult = serde_json::from_str(json).unwrap();
        assert_eq!(result.token_overhead, None);
        assert_eq!(result.reproducibility_rate, None);
        assert_eq!(result.memory, None);
        assert_eq!(result.throughput, None);
        assert_eq!(result.tool_invocation, None);
    }

    #[test]
    fn test_benchmark_result_with_all_optional_fields() {
        let mut result = sample_benchmark_result();
        result.memory = Some(MemoryMetrics {
            peak_rss_bytes: 52_428_800,
            per_agent_bytes: Some(2_097_152),
            sample_count: 50,
        });
        result.throughput = Some(ThroughputMetrics {
            levels: vec![ConcurrencyLevel {
                concurrency: 4,
                agents_per_second: 12.5,
                completion_time: DurationStats {
                    min_us: 800_000,
                    max_us: 1_200_000,
                    mean_us: 1_000_000,
                    median_us: 980_000,
                    p95_us: 1_150_000,
                    p99_us: 1_190_000,
                    std_dev_us: 100_000,
                    count: 4,
                    coefficient_of_variation: 0.1,
                },
            }],
        });

        let json = serde_json::to_string(&result).unwrap();
        let deserialized: BenchmarkResult = serde_json::from_str(&json).unwrap();
        assert_eq!(result, deserialized);
    }
}