Skip to main content

adk_bench/
metrics.rs

1//! Metric collection and statistical computation.
2//!
3//! Provides [`DurationStats`], [`BenchmarkResult`], and the [`MetricCollector`]
4//! for accumulating timing samples during benchmark runs.
5//!
6//! # Statistical Computation
7//!
8//! The [`compute_stats`] function computes a full statistical summary from
9//! a slice of [`Duration`] values, including percentiles using the nearest-rank
10//! method.
11//!
12//! # Example
13//!
14//! ```rust
15//! use std::time::Duration;
16//! use adk_bench::metrics::compute_stats;
17//!
18//! let durations = vec![
19//!     Duration::from_micros(100),
20//!     Duration::from_micros(200),
21//!     Duration::from_micros(300),
22//! ];
23//! let stats = compute_stats(&durations);
24//! assert_eq!(stats.count, 3);
25//! assert_eq!(stats.min_us, 100);
26//! assert_eq!(stats.max_us, 300);
27//! ```
28
29use serde::{Deserialize, Serialize};
30use std::time::{Duration, Instant};
31
32/// Statistical summary for a collection of duration measurements.
33///
34/// All timing values are reported in microseconds (μs).
35/// Percentiles use the nearest-rank method.
36#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
37#[serde(rename_all = "camelCase")]
38pub struct DurationStats {
39    /// Minimum duration in microseconds.
40    pub min_us: u64,
41    /// Maximum duration in microseconds.
42    pub max_us: u64,
43    /// Arithmetic mean in microseconds.
44    pub mean_us: u64,
45    /// Median (50th percentile) in microseconds.
46    pub median_us: u64,
47    /// 95th percentile in microseconds (nearest-rank method).
48    pub p95_us: u64,
49    /// 99th percentile in microseconds (nearest-rank method).
50    pub p99_us: u64,
51    /// Population standard deviation in microseconds.
52    pub std_dev_us: u64,
53    /// Number of samples.
54    pub count: usize,
55    /// Coefficient of variation (std_dev / mean). 0.0 if mean is 0.
56    pub coefficient_of_variation: f64,
57}
58
59/// Metrics for a single benchmark run.
60#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
61#[serde(rename_all = "camelCase")]
62pub struct BenchmarkResult {
63    /// Schema version for forward compatibility.
64    /// Defaults to 1 when deserializing older results that lack this field.
65    #[serde(default = "default_schema_version")]
66    pub schema_version: u32,
67    /// Workload that was executed.
68    pub workload_name: String,
69    /// Model used.
70    pub model: String,
71    /// Run metadata.
72    pub metadata: RunMetadata,
73    /// Cold start time (process start → first LLM call).
74    pub cold_start: DurationStats,
75    /// Per-turn agent loop overhead (total_turn - llm_round_trip).
76    pub agent_loop_overhead: DurationStats,
77    /// Tool invocation latency breakdown.
78    #[serde(default, skip_serializing_if = "Option::is_none")]
79    pub tool_invocation: Option<ToolInvocationMetrics>,
80    /// Concurrent throughput (agents/sec at each concurrency level).
81    #[serde(default, skip_serializing_if = "Option::is_none")]
82    pub throughput: Option<ThroughputMetrics>,
83    /// Memory footprint measurements.
84    #[serde(default, skip_serializing_if = "Option::is_none")]
85    pub memory: Option<MemoryMetrics>,
86    /// Token overhead analysis.
87    #[serde(default, skip_serializing_if = "Option::is_none")]
88    pub token_overhead: Option<TokenOverheadMetrics>,
89    /// Reproducibility rate (percentage of semantically equivalent responses across runs).
90    /// Semantic equivalence = same tool calls + same structured output field values.
91    #[serde(default, skip_serializing_if = "Option::is_none")]
92    pub reproducibility_rate: Option<f64>,
93    /// Number of iterations performed.
94    pub iterations: usize,
95}
96
/// Serde fallback for `BenchmarkResult::schema_version`.
///
/// Results written before the field existed carry no version, so they are
/// treated as schema version 1 for backward compatibility.
fn default_schema_version() -> u32 {
    const LEGACY_SCHEMA_VERSION: u32 = 1;
    LEGACY_SCHEMA_VERSION
}
101
102/// Run metadata for result provenance.
103#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
104#[serde(rename_all = "camelCase")]
105pub struct RunMetadata {
106    /// ISO 8601 timestamp of the run.
107    pub timestamp: String,
108    /// ADK-Rust crate version.
109    pub adk_version: String,
110    /// Rust compiler version.
111    pub rust_version: String,
112    /// Operating system.
113    pub os: String,
114    /// CPU architecture.
115    pub arch: String,
116}
117
118/// Tool invocation latency breakdown.
119#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
120#[serde(rename_all = "camelCase")]
121pub struct ToolInvocationMetrics {
122    /// Total tool invocation latency.
123    pub total: DurationStats,
124    /// Argument deserialization time.
125    pub deserialization: DurationStats,
126    /// Schema validation time.
127    pub schema_validation: DurationStats,
128    /// Execution dispatch time.
129    pub execution_dispatch: DurationStats,
130}
131
132/// Throughput measurements at various concurrency levels.
133#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
134#[serde(rename_all = "camelCase")]
135pub struct ThroughputMetrics {
136    /// Agents completed per second at each concurrency level.
137    pub levels: Vec<ConcurrencyLevel>,
138}
139
140/// Throughput measurement at a specific concurrency level.
141#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
142#[serde(rename_all = "camelCase")]
143pub struct ConcurrencyLevel {
144    /// Number of concurrent agents.
145    pub concurrency: usize,
146    /// Agents completed per second.
147    pub agents_per_second: f64,
148    /// Per-agent completion time statistics.
149    pub completion_time: DurationStats,
150}
151
152/// Memory footprint measurements.
153#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
154#[serde(rename_all = "camelCase")]
155pub struct MemoryMetrics {
156    /// Peak RSS in bytes during the run.
157    pub peak_rss_bytes: u64,
158    /// Estimated per-agent memory in bytes.
159    #[serde(skip_serializing_if = "Option::is_none")]
160    pub per_agent_bytes: Option<u64>,
161    /// Number of memory samples taken.
162    pub sample_count: usize,
163}
164
165/// Token overhead analysis.
166#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
167#[serde(rename_all = "camelCase")]
168pub struct TokenOverheadMetrics {
169    /// Total tokens sent to LLM.
170    pub total_tokens: u64,
171    /// Tokens from user content only.
172    pub user_content_tokens: u64,
173    /// Framework overhead tokens.
174    pub overhead_tokens: u64,
175    /// Overhead as percentage of total.
176    pub overhead_percentage: f64,
177    /// Breakdown by category.
178    pub breakdown: TokenBreakdown,
179}
180
181/// Token overhead breakdown by category.
182#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
183#[serde(rename_all = "camelCase")]
184pub struct TokenBreakdown {
185    /// Tokens from framework-injected system prompts.
186    pub system_prompt_tokens: u64,
187    /// Tokens consumed by serialized tool/function definitions.
188    pub tool_schema_tokens: u64,
189    /// Tokens added as framework wrappers around user messages.
190    pub framework_wrapper_tokens: u64,
191}
192
193/// Computes a statistical summary from a slice of durations.
194///
195/// Returns a [`DurationStats`] with min, max, mean, median, P95, P99,
196/// standard deviation, count, and coefficient of variation.
197///
198/// # Edge Cases
199///
200/// - **Empty slice**: Returns all zeros with `count = 0`.
201/// - **Single element**: Min = max = mean = median = P95 = P99, std_dev = 0.
202///
203/// # Percentile Method
204///
205/// Uses the nearest-rank method: `rank = ceil(percentile / 100 * count)`,
206/// then index into the sorted array at `rank - 1`.
207pub fn compute_stats(durations: &[Duration]) -> DurationStats {
208    if durations.is_empty() {
209        return DurationStats {
210            min_us: 0,
211            max_us: 0,
212            mean_us: 0,
213            median_us: 0,
214            p95_us: 0,
215            p99_us: 0,
216            std_dev_us: 0,
217            count: 0,
218            coefficient_of_variation: 0.0,
219        };
220    }
221
222    let mut micros: Vec<u64> = durations.iter().map(|d| d.as_micros() as u64).collect();
223    micros.sort_unstable();
224
225    let count = micros.len();
226    let min_us = micros[0];
227    let max_us = micros[count - 1];
228
229    // Mean
230    let sum: u64 = micros.iter().sum();
231    let mean_us = sum / count as u64;
232
233    // Median using nearest-rank method (same as P50)
234    let median_us = percentile_nearest_rank(&micros, 50.0);
235
236    // P95 and P99 using nearest-rank method
237    let p95_us = percentile_nearest_rank(&micros, 95.0);
238    let p99_us = percentile_nearest_rank(&micros, 99.0);
239
240    // Population standard deviation
241    let mean_f64 = sum as f64 / count as f64;
242    let variance: f64 = micros
243        .iter()
244        .map(|&v| {
245            let diff = v as f64 - mean_f64;
246            diff * diff
247        })
248        .sum::<f64>()
249        / count as f64;
250    let std_dev_f64 = variance.sqrt();
251    let std_dev_us = std_dev_f64 as u64;
252
253    // Coefficient of variation = std_dev / mean (0.0 if mean is 0)
254    let coefficient_of_variation = if mean_f64 == 0.0 { 0.0 } else { std_dev_f64 / mean_f64 };
255
256    DurationStats {
257        min_us,
258        max_us,
259        mean_us,
260        median_us,
261        p95_us,
262        p99_us,
263        std_dev_us,
264        count,
265        coefficient_of_variation,
266    }
267}
268
/// Computes the percentile value using the nearest-rank method.
///
/// `sorted` must be sorted in ascending order. `percentile` is a value
/// between 0.0 and 100.0; values outside that range are clamped to the first
/// or last element.
///
/// Returns `0` for an empty slice, matching the all-zero summary that
/// [`compute_stats`] reports for empty input.
fn percentile_nearest_rank(sorted: &[u64], percentile: f64) -> u64 {
    let count = sorted.len();
    if count == 0 {
        return 0;
    }
    // Nearest-rank: rank = ceil(percentile / 100 * count).
    // Multiply before dividing: `percentile / 100.0` is usually not exactly
    // representable (0.07 * 100 evaluates to 7.000000000000001), which would
    // push an exact rank up by one after `ceil`.
    let rank = (percentile * count as f64 / 100.0).ceil() as usize;
    // Clamp to valid rank range [1, count], then convert to a 0-based index.
    sorted[rank.clamp(1, count) - 1]
}
284
/// A record of tool invocation latency broken into phases.
///
/// Records compare equal when all four durations are equal, which lets
/// callers and tests assert on recorded latencies directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ToolLatencyRecord {
    /// Total tool invocation duration.
    pub total: Duration,
    /// Time spent deserializing tool arguments.
    pub deserialization: Duration,
    /// Time spent validating arguments against schema.
    pub schema_validation: Duration,
    /// Time spent dispatching the tool execution.
    pub execution_dispatch: Duration,
}
297
298/// Accumulates timing samples during a benchmark run.
299///
300/// `MetricCollector` is a mutable accumulator that records various timing
301/// and memory measurements as a benchmark progresses, then provides the
302/// data needed to produce a [`BenchmarkResult`].
303///
304/// # Example
305///
306/// ```rust
307/// use std::time::Duration;
308/// use adk_bench::metrics::MetricCollector;
309///
310/// let mut collector = MetricCollector::new();
311/// collector.mark_run_start();
312/// // ... perform work ...
313/// collector.mark_first_llm_call();
314/// collector.record_turn_overhead(Duration::from_micros(150));
315/// collector.record_memory_sample(1024 * 1024);
316///
317/// if let Some(cold_start) = collector.cold_start_duration() {
318///     println!("Cold start: {:?}", cold_start);
319/// }
320/// ```
321pub struct MetricCollector {
322    run_start: Option<Instant>,
323    first_llm_call: Option<Instant>,
324    turn_overheads: Vec<Duration>,
325    tool_latencies: Vec<ToolLatencyRecord>,
326    memory_samples: Vec<u64>,
327}
328
329impl MetricCollector {
330    /// Creates a new empty `MetricCollector`.
331    pub fn new() -> Self {
332        Self {
333            run_start: None,
334            first_llm_call: None,
335            turn_overheads: Vec::new(),
336            tool_latencies: Vec::new(),
337            memory_samples: Vec::new(),
338        }
339    }
340
341    /// Marks the start of the benchmark run.
342    ///
343    /// Records a monotonic timestamp for cold start calculation.
344    pub fn mark_run_start(&mut self) {
345        self.run_start = Some(Instant::now());
346    }
347
348    /// Marks the first LLM API call.
349    ///
350    /// Only records the timestamp on the first invocation; subsequent
351    /// calls are no-ops.
352    pub fn mark_first_llm_call(&mut self) {
353        if self.first_llm_call.is_none() {
354            self.first_llm_call = Some(Instant::now());
355        }
356    }
357
358    /// Records a per-turn agent loop overhead duration.
359    ///
360    /// This is the framework processing time for a single turn,
361    /// computed as `total_turn_time - llm_round_trip_time`.
362    pub fn record_turn_overhead(&mut self, overhead: Duration) {
363        self.turn_overheads.push(overhead);
364    }
365
366    /// Records a tool invocation latency breakdown.
367    pub fn record_tool_latency(&mut self, record: ToolLatencyRecord) {
368        self.tool_latencies.push(record);
369    }
370
371    /// Records a memory RSS sample in bytes.
372    pub fn record_memory_sample(&mut self, rss_bytes: u64) {
373        self.memory_samples.push(rss_bytes);
374    }
375
376    /// Returns the cold start duration (run start → first LLM call).
377    ///
378    /// Returns `None` if either `mark_run_start` or `mark_first_llm_call`
379    /// has not been called.
380    pub fn cold_start_duration(&self) -> Option<Duration> {
381        match (self.run_start, self.first_llm_call) {
382            (Some(start), Some(first)) => Some(first.duration_since(start)),
383            _ => None,
384        }
385    }
386
387    /// Returns the recorded turn overhead durations.
388    pub fn turn_overheads(&self) -> &[Duration] {
389        &self.turn_overheads
390    }
391
392    /// Returns the recorded tool latency records.
393    pub fn tool_latencies(&self) -> &[ToolLatencyRecord] {
394        &self.tool_latencies
395    }
396
397    /// Returns the recorded memory samples.
398    pub fn memory_samples(&self) -> &[u64] {
399        &self.memory_samples
400    }
401}
402
403impl Default for MetricCollector {
404    fn default() -> Self {
405        Self::new()
406    }
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns a list of microsecond counts into durations, preserving order.
    fn micros(values: &[u64]) -> Vec<Duration> {
        values.iter().map(|&us| Duration::from_micros(us)).collect()
    }

    /// An empty input yields the all-zero summary.
    #[test]
    fn test_compute_stats_empty() {
        let expected = DurationStats {
            min_us: 0,
            max_us: 0,
            mean_us: 0,
            median_us: 0,
            p95_us: 0,
            p99_us: 0,
            std_dev_us: 0,
            count: 0,
            coefficient_of_variation: 0.0,
        };
        assert_eq!(compute_stats(&[]), expected);
    }

    /// With one sample every positional statistic equals that sample.
    #[test]
    fn test_compute_stats_single_element() {
        let expected = DurationStats {
            min_us: 500,
            max_us: 500,
            mean_us: 500,
            median_us: 500,
            p95_us: 500,
            p99_us: 500,
            std_dev_us: 0,
            count: 1,
            coefficient_of_variation: 0.0,
        };
        assert_eq!(compute_stats(&micros(&[500])), expected);
    }

    /// Five evenly spaced samples produce the expected summary values.
    #[test]
    fn test_compute_stats_multiple_elements() {
        let stats = compute_stats(&micros(&[100, 200, 300, 400, 500]));
        assert_eq!(stats.count, 5);
        assert_eq!(
            (stats.min_us, stats.max_us, stats.mean_us, stats.median_us),
            (100, 500, 300, 300)
        );
        // Nearest rank for both P95 and P99: ceil(0.95 * 5) = ceil(0.99 * 5) = 5,
        // which is index 4, i.e. 500.
        assert_eq!((stats.p95_us, stats.p99_us), (500, 500));
    }

    /// min <= median <= P95 <= P99 <= max must always hold.
    #[test]
    fn test_compute_stats_ordering_invariant() {
        let durations: Vec<Duration> =
            (1..=10u64).map(|step| Duration::from_micros(step * 50)).collect();
        let stats = compute_stats(&durations);
        let ascending =
            [stats.min_us, stats.median_us, stats.p95_us, stats.p99_us, stats.max_us];
        assert!(ascending.windows(2).all(|pair| pair[0] <= pair[1]));
    }

    /// Input order must not influence the result.
    #[test]
    fn test_compute_stats_unsorted_input() {
        let stats = compute_stats(&micros(&[500, 100, 300, 200, 400]));
        assert_eq!((stats.min_us, stats.max_us, stats.mean_us), (100, 500, 300));
    }

    /// Cold start is only available once both marks have been set.
    #[test]
    fn test_metric_collector_cold_start() {
        let mut collector = MetricCollector::new();
        assert!(collector.cold_start_duration().is_none());

        collector.mark_run_start();
        assert!(collector.cold_start_duration().is_none());

        // Sleep briefly so the measured duration cannot be zero.
        let pause = Duration::from_millis(1);
        std::thread::sleep(pause);
        collector.mark_first_llm_call();

        let cold_start = collector.cold_start_duration().unwrap();
        assert!(cold_start >= pause);
    }

    /// Marking the first LLM call a second time must not move the timestamp.
    #[test]
    fn test_metric_collector_first_llm_call_only_once() {
        let mut collector = MetricCollector::new();
        collector.mark_run_start();
        std::thread::sleep(Duration::from_millis(1));
        collector.mark_first_llm_call();
        let before = collector.cold_start_duration().unwrap();

        // A later second call is expected to be a no-op.
        std::thread::sleep(Duration::from_millis(10));
        collector.mark_first_llm_call();
        let after = collector.cold_start_duration().unwrap();

        assert_eq!(before, after);
    }

    /// Every recorded turn overhead is retained.
    #[test]
    fn test_metric_collector_turn_overheads() {
        let mut collector = MetricCollector::new();
        for overhead in micros(&[100, 200]) {
            collector.record_turn_overhead(overhead);
        }
        assert_eq!(collector.turn_overheads().len(), 2);
    }

    /// Memory samples come back in the order they were recorded.
    #[test]
    fn test_metric_collector_memory_samples() {
        let mut collector = MetricCollector::new();
        for &rss_bytes in &[1024u64, 2048, 4096] {
            collector.record_memory_sample(rss_bytes);
        }
        assert_eq!(collector.memory_samples(), &[1024, 2048, 4096]);
    }

    /// A recorded tool latency breakdown is retained.
    #[test]
    fn test_metric_collector_tool_latencies() {
        let record = ToolLatencyRecord {
            total: Duration::from_micros(500),
            deserialization: Duration::from_micros(100),
            schema_validation: Duration::from_micros(150),
            execution_dispatch: Duration::from_micros(250),
        };
        let mut collector = MetricCollector::new();
        collector.record_tool_latency(record);
        assert_eq!(collector.tool_latencies().len(), 1);
    }

    /// `DurationStats` survives a JSON round trip unchanged.
    #[test]
    fn test_duration_stats_serialization_round_trip() {
        let original = DurationStats {
            min_us: 100,
            max_us: 500,
            mean_us: 300,
            median_us: 300,
            p95_us: 480,
            p99_us: 499,
            std_dev_us: 141,
            count: 5,
            coefficient_of_variation: 0.47,
        };
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: DurationStats = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// A zero mean must give a coefficient of variation of exactly 0.0.
    #[test]
    fn test_coefficient_of_variation_zero_mean() {
        let stats = compute_stats(&micros(&[0, 0]));
        assert_eq!(stats.coefficient_of_variation, 0.0);
    }

    /// Builds a representative `BenchmarkResult` fixture for the serde tests.
    fn sample_benchmark_result() -> BenchmarkResult {
        let metadata = RunMetadata {
            timestamp: String::from("2025-01-15T10:30:00Z"),
            adk_version: String::from("0.5.0"),
            rust_version: String::from("1.85.0"),
            os: String::from("linux"),
            arch: String::from("x86_64"),
        };
        let cold_start = DurationStats {
            min_us: 1000,
            max_us: 5000,
            mean_us: 2500,
            median_us: 2400,
            p95_us: 4800,
            p99_us: 4950,
            std_dev_us: 800,
            count: 5,
            coefficient_of_variation: 0.32,
        };
        let agent_loop_overhead = DurationStats {
            min_us: 100,
            max_us: 500,
            mean_us: 250,
            median_us: 240,
            p95_us: 480,
            p99_us: 495,
            std_dev_us: 80,
            count: 10,
            coefficient_of_variation: 0.32,
        };
        let token_overhead = TokenOverheadMetrics {
            total_tokens: 1200,
            user_content_tokens: 950,
            overhead_tokens: 250,
            overhead_percentage: 20.83,
            breakdown: TokenBreakdown {
                system_prompt_tokens: 100,
                tool_schema_tokens: 100,
                framework_wrapper_tokens: 50,
            },
        };

        BenchmarkResult {
            schema_version: 1,
            workload_name: String::from("simple_tool_call"),
            model: String::from("gemini-2.5-flash"),
            metadata,
            cold_start,
            agent_loop_overhead,
            tool_invocation: None,
            throughput: None,
            memory: None,
            token_overhead: Some(token_overhead),
            reproducibility_rate: Some(0.95),
            iterations: 5,
        }
    }

    /// `BenchmarkResult` survives a JSON round trip unchanged.
    #[test]
    fn test_benchmark_result_serialization_round_trip() {
        let original = sample_benchmark_result();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: BenchmarkResult = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// The schema version is always written out under `schemaVersion`.
    #[test]
    fn test_benchmark_result_schema_version_always_present() {
        let encoded = serde_json::to_string(&sample_benchmark_result()).unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&encoded).unwrap();
        assert_eq!(parsed["schemaVersion"], serde_json::json!(1));
    }

    /// Results written without a schema version deserialize as version 1.
    #[test]
    fn test_benchmark_result_deserialize_missing_schema_version() {
        // Payload in the older shape, which has no schemaVersion key.
        let json = r#"{
            "workloadName": "simple_tool_call",
            "model": "gemini-2.5-flash",
            "metadata": {
                "timestamp": "2025-01-15T10:30:00Z",
                "adkVersion": "0.4.0",
                "rustVersion": "1.85.0",
                "os": "linux",
                "arch": "x86_64"
            },
            "coldStart": {
                "minUs": 1000, "maxUs": 5000, "meanUs": 2500,
                "medianUs": 2400, "p95Us": 4800, "p99Us": 4950,
                "stdDevUs": 800, "count": 5, "coefficientOfVariation": 0.32
            },
            "agentLoopOverhead": {
                "minUs": 100, "maxUs": 500, "meanUs": 250,
                "medianUs": 240, "p95Us": 480, "p99Us": 495,
                "stdDevUs": 80, "count": 10, "coefficientOfVariation": 0.32
            },
            "iterations": 5
        }"#;

        let decoded: BenchmarkResult = serde_json::from_str(json).unwrap();
        assert_eq!(decoded.schema_version, 1);
    }

    /// Optional sections that are absent from the payload come back as `None`.
    #[test]
    fn test_benchmark_result_deserialize_missing_optional_fields() {
        // Payload in the older shape: no tokenOverhead, reproducibilityRate, etc.
        let json = r#"{
            "schemaVersion": 1,
            "workloadName": "simple_tool_call",
            "model": "gemini-2.5-flash",
            "metadata": {
                "timestamp": "2025-01-15T10:30:00Z",
                "adkVersion": "0.4.0",
                "rustVersion": "1.85.0",
                "os": "linux",
                "arch": "x86_64"
            },
            "coldStart": {
                "minUs": 1000, "maxUs": 5000, "meanUs": 2500,
                "medianUs": 2400, "p95Us": 4800, "p99Us": 4950,
                "stdDevUs": 800, "count": 5, "coefficientOfVariation": 0.32
            },
            "agentLoopOverhead": {
                "minUs": 100, "maxUs": 500, "meanUs": 250,
                "medianUs": 240, "p95Us": 480, "p99Us": 495,
                "stdDevUs": 80, "count": 10, "coefficientOfVariation": 0.32
            },
            "iterations": 5
        }"#;

        let decoded: BenchmarkResult = serde_json::from_str(json).unwrap();
        assert_eq!(decoded.token_overhead, None);
        assert_eq!(decoded.reproducibility_rate, None);
        assert_eq!(decoded.memory, None);
        assert_eq!(decoded.throughput, None);
        assert_eq!(decoded.tool_invocation, None);
    }

    /// A result with memory and throughput populated also round-trips.
    #[test]
    fn test_benchmark_result_with_all_optional_fields() {
        let memory = MemoryMetrics {
            peak_rss_bytes: 52_428_800,
            per_agent_bytes: Some(2_097_152),
            sample_count: 50,
        };
        let four_agents = ConcurrencyLevel {
            concurrency: 4,
            agents_per_second: 12.5,
            completion_time: DurationStats {
                min_us: 800_000,
                max_us: 1_200_000,
                mean_us: 1_000_000,
                median_us: 980_000,
                p95_us: 1_150_000,
                p99_us: 1_190_000,
                std_dev_us: 100_000,
                count: 4,
                coefficient_of_variation: 0.1,
            },
        };
        let original = BenchmarkResult {
            memory: Some(memory),
            throughput: Some(ThroughputMetrics { levels: vec![four_agents] }),
            ..sample_benchmark_result()
        };

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: BenchmarkResult = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}
753}