Skip to main content

adk_bench/
external.rs

1//! External framework comparison via subprocess execution.
2//!
3//! Implements the External Benchmark Protocol (EBP) for running competitor
4//! frameworks against the same workloads. External scripts receive the workload
5//! JSON path as their last CLI argument and `BENCH_START_EPOCH_NS` in their
6//! environment. They must emit exactly one JSON object on stdout conforming to
7//! [`ExternalMetricsOutput`].
8//!
9//! # Example
10//!
11//! ```rust,ignore
12//! use adk_bench::{ExternalRunner, ExternalFrameworkConfig};
13//!
14//! let runner = ExternalRunner::new(300);
15//! let config = ExternalFrameworkConfig {
16//!     name: "langgraph".to_string(),
17//!     command: "python".to_string(),
18//!     args: vec!["-m".to_string(), "bench_langgraph".to_string()],
19//!     working_dir: None,
20//!     env: vec![],
21//! };
22//! let metrics = runner.run(&config, "/path/to/workload.json").await?;
23//! println!("Framework: {}", metrics.framework);
24//! ```
25
26use std::path::Path;
27use std::time::Duration;
28
29use serde::{Deserialize, Serialize};
30use tokio::process::Command;
31use tracing::{debug, info, warn};
32
33use crate::config::ExternalFrameworkConfig;
34
35/// External Benchmark Protocol (EBP) — the JSON schema that all external
36/// framework benchmark scripts MUST emit on stdout.
37///
38/// This is the contract between adk-bench and any competitor framework harness.
39/// External scripts receive: the workload JSON path as last CLI arg, and
40/// `BENCH_START_EPOCH_NS` in their environment (monotonic nanosecond timestamp
41/// at subprocess spawn time).
42///
43/// They MUST output exactly one JSON object (no other stdout content):
44/// ```json
45/// {
46///   "framework": "langgraph",
47///   "cold_start_us": 45000,
48///   "first_llm_call_epoch_ns": 1705312800000045000,
49///   "loop_overhead": {
50///     "min_us": 120, "max_us": 890, "mean_us": 340,
51///     "median_us": 310, "p95_us": 780, "p99_us": 870, "count": 10
52///   },
53///   "throughput_agents_per_sec": 12.5,
54///   "peak_rss_bytes": 52428800,
55///   "token_overhead": {
56///     "total_tokens": 1200, "user_content_tokens": 950, "overhead_tokens": 250
57///   }
58/// }
59/// ```
60#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
61pub struct ExternalMetricsOutput {
62    /// Framework name (e.g., "adk-python", "langgraph", "crewai").
63    pub framework: String,
64
65    /// Cold start time in microseconds (subprocess spawn → first LLM call).
66    pub cold_start_us: u64,
67
68    /// Monotonic nanosecond timestamp when the first LLM call was made.
69    /// Used with `BENCH_START_EPOCH_NS` to compute cold start from the external clock.
70    pub first_llm_call_epoch_ns: u64,
71
72    /// Per-turn framework overhead statistics (LLM time subtracted).
73    pub loop_overhead: ExternalDurationStats,
74
75    /// Peak RSS in bytes (null if platform doesn't support measurement).
76    #[serde(skip_serializing_if = "Option::is_none")]
77    pub peak_rss_bytes: Option<u64>,
78
79    /// Agents completed per second at the requested concurrency (null if not measured).
80    #[serde(skip_serializing_if = "Option::is_none")]
81    pub throughput_agents_per_sec: Option<f64>,
82
83    /// Token overhead breakdown (null if not measured).
84    #[serde(skip_serializing_if = "Option::is_none")]
85    pub token_overhead: Option<ExternalTokenOverhead>,
86}
87
88/// Duration statistics reported by external frameworks in the EBP protocol.
89#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
90pub struct ExternalDurationStats {
91    /// Minimum overhead in microseconds.
92    pub min_us: u64,
93    /// Maximum overhead in microseconds.
94    pub max_us: u64,
95    /// Mean overhead in microseconds.
96    pub mean_us: u64,
97    /// Median overhead in microseconds.
98    pub median_us: u64,
99    /// 95th percentile overhead in microseconds.
100    pub p95_us: u64,
101    /// 99th percentile overhead in microseconds.
102    pub p99_us: u64,
103    /// Number of measurements.
104    pub count: u64,
105}
106
107/// Token overhead breakdown reported by external frameworks.
108#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
109pub struct ExternalTokenOverhead {
110    /// Total tokens sent to LLM.
111    pub total_tokens: u64,
112    /// Tokens from user content only.
113    pub user_content_tokens: u64,
114    /// Framework overhead tokens (total - user_content).
115    pub overhead_tokens: u64,
116}
117
/// Runs external framework benchmarks as subprocesses.
///
/// Injects `BENCH_START_EPOCH_NS` into the subprocess environment,
/// passes the workload JSON path as the last argument, and parses
/// the EBP JSON output from stdout.
///
/// The runner only stores its timeout, so it is cheap to clone and can be
/// printed with `{:?}` for diagnostics.
///
/// # Example
///
/// ```rust
/// use adk_bench::ExternalRunner;
///
/// let runner = ExternalRunner::new(300);
/// assert_eq!(runner.timeout(), std::time::Duration::from_secs(300));
/// ```
#[derive(Debug, Clone)]
pub struct ExternalRunner {
    /// Maximum time a subprocess may run before the benchmark is abandoned.
    timeout: Duration,
}
135
136impl ExternalRunner {
137    /// Creates a new `ExternalRunner` with the specified timeout in seconds.
138    pub fn new(timeout_secs: u64) -> Self {
139        Self { timeout: Duration::from_secs(timeout_secs) }
140    }
141
142    /// Returns the configured timeout duration.
143    pub fn timeout(&self) -> Duration {
144        self.timeout
145    }
146
147    /// Executes the external framework and parses its EBP JSON output.
148    ///
149    /// The subprocess receives:
150    /// - `BENCH_START_EPOCH_NS` env var (monotonic clock nanosecond timestamp at spawn time)
151    /// - Workload JSON file path as the last CLI argument
152    ///
153    /// Returns error if subprocess times out, exits non-zero, or emits invalid JSON.
154    ///
155    /// # Errors
156    ///
157    /// - [`BenchError::ExternalTimeout`] if the subprocess exceeds the configured timeout
158    /// - [`BenchError::ExternalRunner`] if the subprocess exits with non-zero status or emits invalid JSON
159    pub async fn run(
160        &self,
161        config: &ExternalFrameworkConfig,
162        workload_path: &str,
163    ) -> crate::Result<ExternalMetricsOutput> {
164        info!(
165            framework = %config.name,
166            command = %config.command,
167            workload = %workload_path,
168            "starting external framework benchmark"
169        );
170
171        // Get current monotonic time as epoch nanoseconds for BENCH_START_EPOCH_NS.
172        // We use system time as nanoseconds since UNIX epoch to provide a shared
173        // clock reference between the parent and child process.
174        let start_epoch_ns = std::time::SystemTime::now()
175            .duration_since(std::time::UNIX_EPOCH)
176            .unwrap_or_default()
177            .as_nanos() as u64;
178
179        // Build the command with args from ExternalFrameworkConfig.
180        let mut cmd = Command::new(&config.command);
181
182        // Add configured arguments.
183        cmd.args(&config.args);
184
185        // Append workload path as the last argument.
186        cmd.arg(workload_path);
187
188        // Inject BENCH_START_EPOCH_NS environment variable.
189        cmd.env("BENCH_START_EPOCH_NS", start_epoch_ns.to_string());
190
191        // Add any additional configured environment variables.
192        for (key, value) in &config.env {
193            cmd.env(key, value);
194        }
195
196        // Set working directory if configured.
197        if let Some(working_dir) = &config.working_dir {
198            cmd.current_dir(working_dir);
199        }
200
201        // Capture stdout and stderr.
202        cmd.stdout(std::process::Stdio::piped());
203        cmd.stderr(std::process::Stdio::piped());
204
205        debug!(
206            framework = %config.name,
207            start_epoch_ns = start_epoch_ns,
208            timeout_secs = self.timeout.as_secs(),
209            "spawning external framework subprocess"
210        );
211
212        // Execute with timeout using tokio::time::timeout.
213        let output = match tokio::time::timeout(self.timeout, cmd.output()).await {
214            Ok(Ok(output)) => output,
215            Ok(Err(io_err)) => {
216                return Err(crate::BenchError::ExternalRunner {
217                    framework: config.name.clone(),
218                    reason: format!("failed to spawn subprocess: {io_err}"),
219                });
220            }
221            Err(_elapsed) => {
222                warn!(
223                    framework = %config.name,
224                    timeout_secs = self.timeout.as_secs(),
225                    "external framework timed out"
226                );
227                return Err(crate::BenchError::ExternalTimeout {
228                    framework: config.name.clone(),
229                    timeout_secs: self.timeout.as_secs(),
230                });
231            }
232        };
233
234        // Check for non-zero exit status.
235        if !output.status.success() {
236            let stderr = String::from_utf8_lossy(&output.stderr);
237            let exit_code = output.status.code().unwrap_or(-1);
238            warn!(
239                framework = %config.name,
240                exit_code = exit_code,
241                stderr = %stderr,
242                "external framework exited with non-zero status"
243            );
244            return Err(crate::BenchError::ExternalRunner {
245                framework: config.name.clone(),
246                reason: format!("subprocess exited with code {exit_code}: {}", stderr.trim()),
247            });
248        }
249
250        // Parse JSON stdout into ExternalMetricsOutput.
251        let stdout = String::from_utf8_lossy(&output.stdout);
252        let stdout_trimmed = stdout.trim();
253
254        debug!(
255            framework = %config.name,
256            stdout_len = stdout_trimmed.len(),
257            "parsing external framework EBP output"
258        );
259
260        let mut metrics: ExternalMetricsOutput =
261            serde_json::from_str(stdout_trimmed).map_err(|e| {
262                crate::BenchError::ExternalRunner {
263                    framework: config.name.clone(),
264                    reason: format!("failed to parse EBP JSON output: {e}"),
265                }
266            })?;
267
268        // Compute cold_start from first_llm_call_epoch_ns - BENCH_START_EPOCH_NS
269        // if the reported cold_start_us seems like a placeholder (0).
270        // The authoritative cold start is always: first_llm_call_epoch_ns - start_epoch_ns.
271        let computed_cold_start_ns = metrics.first_llm_call_epoch_ns.saturating_sub(start_epoch_ns);
272        let computed_cold_start_us = computed_cold_start_ns / 1000;
273
274        // Use the computed cold start from the external clock source for consistency.
275        metrics.cold_start_us = computed_cold_start_us;
276
277        info!(
278            framework = %metrics.framework,
279            cold_start_us = computed_cold_start_us,
280            loop_overhead_mean_us = metrics.loop_overhead.mean_us,
281            "external framework benchmark completed"
282        );
283
284        Ok(metrics)
285    }
286}
287
288/// Configuration file format for loading multiple external framework configs.
289///
290/// # Example JSON
291///
292/// ```json
293/// {
294///   "frameworks": [
295///     {
296///       "name": "adk-python",
297///       "command": "python",
298///       "args": ["-m", "adk_bench", "--workload"],
299///       "workingDir": "../adk-python",
300///       "env": [["GOOGLE_API_KEY", "${GOOGLE_API_KEY}"]]
301///     }
302///   ]
303/// }
304/// ```
305#[derive(Debug, Clone, Serialize, Deserialize)]
306#[serde(rename_all = "camelCase")]
307pub struct ExternalConfigFile {
308    /// List of external framework configurations.
309    pub frameworks: Vec<ExternalFrameworkConfig>,
310}
311
312/// Loads external framework configurations from a JSON config file.
313///
314/// # Errors
315///
316/// Returns [`BenchError::Io`] if the file cannot be read, or
317/// [`BenchError::Serialization`] if the JSON is invalid.
318///
319/// # Example
320///
321/// ```rust,ignore
322/// use adk_bench::external::load_external_configs;
323/// use std::path::Path;
324///
325/// let configs = load_external_configs(Path::new("external-bench.json"))?;
326/// for config in &configs {
327///     println!("Framework: {}", config.name);
328/// }
329/// ```
330pub fn load_external_configs(path: &Path) -> crate::Result<Vec<ExternalFrameworkConfig>> {
331    let content = std::fs::read_to_string(path)?;
332    let config_file: ExternalConfigFile = serde_json::from_str(&content).map_err(|e| {
333        crate::BenchError::Serialization(format!(
334            "failed to parse external config file '{}': {e}",
335            path.display()
336        ))
337    })?;
338    Ok(config_file.frameworks)
339}
340
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a config that runs `script` through `sh -c`.
    ///
    /// The runner appends the workload path as one more argument after the
    /// script; `sh -c` binds that argument to `$0`, so the script itself never
    /// executes it.
    #[cfg(unix)]
    fn shell_config(
        name: &str,
        script: &str,
        env: Vec<(String, String)>,
    ) -> ExternalFrameworkConfig {
        ExternalFrameworkConfig {
            name: name.to_string(),
            command: "sh".to_string(),
            args: vec!["-c".to_string(), script.to_string()],
            working_dir: None,
            env,
        }
    }

    #[test]
    fn test_external_metrics_output_deserialize() {
        let json = r#"{
            "framework": "langgraph",
            "cold_start_us": 45000,
            "first_llm_call_epoch_ns": 1705312800000045000,
            "loop_overhead": {
                "min_us": 120,
                "max_us": 890,
                "mean_us": 340,
                "median_us": 310,
                "p95_us": 780,
                "p99_us": 870,
                "count": 10
            },
            "throughput_agents_per_sec": 12.5,
            "peak_rss_bytes": 52428800,
            "token_overhead": {
                "total_tokens": 1200,
                "user_content_tokens": 950,
                "overhead_tokens": 250
            }
        }"#;

        let expected = ExternalMetricsOutput {
            framework: "langgraph".to_string(),
            cold_start_us: 45000,
            first_llm_call_epoch_ns: 1705312800000045000,
            loop_overhead: ExternalDurationStats {
                min_us: 120,
                max_us: 890,
                mean_us: 340,
                median_us: 310,
                p95_us: 780,
                p99_us: 870,
                count: 10,
            },
            peak_rss_bytes: Some(52428800),
            throughput_agents_per_sec: Some(12.5),
            token_overhead: Some(ExternalTokenOverhead {
                total_tokens: 1200,
                user_content_tokens: 950,
                overhead_tokens: 250,
            }),
        };

        let metrics: ExternalMetricsOutput = serde_json::from_str(json).unwrap();
        assert_eq!(metrics, expected);
    }

    #[test]
    fn test_external_metrics_output_deserialize_minimal() {
        // All optional fields are absent and must come back as `None`.
        let json = r#"{
            "framework": "crewai",
            "cold_start_us": 120000,
            "first_llm_call_epoch_ns": 1705312800000120000,
            "loop_overhead": {
                "min_us": 500,
                "max_us": 2000,
                "mean_us": 1000,
                "median_us": 900,
                "p95_us": 1800,
                "p99_us": 1950,
                "count": 5
            }
        }"#;

        let metrics: ExternalMetricsOutput = serde_json::from_str(json).unwrap();
        assert_eq!(metrics.framework, "crewai");
        assert_eq!(metrics.cold_start_us, 120000);
        assert_eq!(metrics.peak_rss_bytes, None);
        assert_eq!(metrics.throughput_agents_per_sec, None);
        assert_eq!(metrics.token_overhead, None);
    }

    #[test]
    fn test_external_metrics_output_serialize_roundtrip() {
        let metrics = ExternalMetricsOutput {
            framework: "test-framework".to_string(),
            cold_start_us: 5000,
            first_llm_call_epoch_ns: 1000000005000000,
            loop_overhead: ExternalDurationStats {
                min_us: 100,
                max_us: 500,
                mean_us: 250,
                median_us: 230,
                p95_us: 450,
                p99_us: 490,
                count: 20,
            },
            peak_rss_bytes: Some(1024 * 1024 * 50),
            throughput_agents_per_sec: Some(8.5),
            token_overhead: Some(ExternalTokenOverhead {
                total_tokens: 1000,
                user_content_tokens: 800,
                overhead_tokens: 200,
            }),
        };

        let json = serde_json::to_string(&metrics).unwrap();
        let deserialized: ExternalMetricsOutput = serde_json::from_str(&json).unwrap();
        assert_eq!(metrics, deserialized);
    }

    #[test]
    fn test_external_runner_new() {
        let runner = ExternalRunner::new(120);
        assert_eq!(runner.timeout(), Duration::from_secs(120));
    }

    #[test]
    fn test_external_runner_default_timeout() {
        let runner = ExternalRunner::new(300);
        assert_eq!(runner.timeout(), Duration::from_secs(300));
    }

    #[test]
    fn test_external_config_file_deserialize() {
        let json = r#"{
            "frameworks": [
                {
                    "name": "adk-python",
                    "command": "python",
                    "args": ["-m", "adk_bench", "--workload"],
                    "workingDir": "../adk-python",
                    "env": [["GOOGLE_API_KEY", "test-key"]]
                },
                {
                    "name": "langgraph",
                    "command": "python",
                    "args": ["bench_runner.py"],
                    "env": []
                }
            ]
        }"#;

        let config_file: ExternalConfigFile = serde_json::from_str(json).unwrap();
        assert_eq!(config_file.frameworks.len(), 2);

        let first = &config_file.frameworks[0];
        assert_eq!(first.name, "adk-python");
        assert_eq!(first.command, "python");
        assert_eq!(first.args, vec!["-m", "adk_bench", "--workload"]);
        assert_eq!(first.working_dir, Some(std::path::PathBuf::from("../adk-python")));

        let second = &config_file.frameworks[1];
        assert_eq!(second.name, "langgraph");
        assert_eq!(second.working_dir, None);
    }

    #[test]
    fn test_load_external_configs_file_not_found() {
        let result = load_external_configs(Path::new("/nonexistent/path/config.json"));
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_external_runner_spawn_failure() {
        let runner = ExternalRunner::new(10);
        let config = ExternalFrameworkConfig {
            name: "nonexistent".to_string(),
            command: "/this/command/does/not/exist/anywhere".to_string(),
            args: vec![],
            working_dir: None,
            env: vec![],
        };

        match runner.run(&config, "/tmp/workload.json").await {
            Err(crate::BenchError::ExternalRunner { framework, reason }) => {
                assert_eq!(framework, "nonexistent");
                assert!(reason.contains("failed to spawn subprocess"));
            }
            other => panic!("expected ExternalRunner error, got: {other:?}"),
        }
    }

    // The remaining tests drive real `sh`/`echo` subprocesses, so they only
    // make sense on Unix-like systems.

    #[cfg(unix)]
    #[tokio::test]
    async fn test_external_runner_non_zero_exit() {
        let runner = ExternalRunner::new(10);
        let config = shell_config("failing-script", "exit 1", vec![]);

        match runner.run(&config, "/tmp/workload.json").await {
            Err(crate::BenchError::ExternalRunner { framework, reason }) => {
                assert_eq!(framework, "failing-script");
                assert!(reason.contains("exited with code 1"), "unexpected reason: {reason}");
            }
            other => panic!("expected ExternalRunner error, got: {other:?}"),
        }
    }

    #[cfg(unix)]
    #[tokio::test]
    async fn test_external_runner_invalid_json() {
        let runner = ExternalRunner::new(10);
        // `echo` prints its argument plus the appended workload path: not JSON.
        let config = ExternalFrameworkConfig {
            name: "bad-json".to_string(),
            command: "echo".to_string(),
            args: vec!["not valid json".to_string()],
            working_dir: None,
            env: vec![],
        };

        match runner.run(&config, "/tmp/workload.json").await {
            Err(crate::BenchError::ExternalRunner { framework, reason }) => {
                assert_eq!(framework, "bad-json");
                assert!(reason.contains("failed to parse EBP JSON output"));
            }
            other => panic!("expected ExternalRunner error, got: {other:?}"),
        }
    }

    #[cfg(unix)]
    #[tokio::test]
    async fn test_external_runner_timeout() {
        let runner = ExternalRunner::new(1); // 1 second timeout
        let config = shell_config("slow-script", "sleep 10; #", vec![]);

        match runner.run(&config, "/tmp/workload.json").await {
            Err(crate::BenchError::ExternalTimeout { framework, timeout_secs }) => {
                assert_eq!(framework, "slow-script");
                assert_eq!(timeout_secs, 1);
            }
            other => panic!("expected ExternalTimeout error, got: {other:?}"),
        }
    }

    #[cfg(unix)]
    #[tokio::test]
    async fn test_external_runner_valid_output() {
        // A shell command that prints a valid, minimal EBP JSON object.
        let ebp_json = r#"{"framework":"test","cold_start_us":1000,"first_llm_call_epoch_ns":99999999999999999,"loop_overhead":{"min_us":10,"max_us":100,"mean_us":50,"median_us":45,"p95_us":90,"p99_us":95,"count":5}}"#;

        let runner = ExternalRunner::new(10);
        let config = shell_config("test-framework", &format!("echo '{ebp_json}'; #"), vec![]);

        let metrics = runner
            .run(&config, "/tmp/workload.json")
            .await
            .expect("run with valid EBP output should succeed");
        assert_eq!(metrics.framework, "test");
        assert_eq!(metrics.loop_overhead.min_us, 10);
        assert_eq!(metrics.loop_overhead.count, 5);
        assert_eq!(metrics.peak_rss_bytes, None);
        assert_eq!(metrics.throughput_agents_per_sec, None);
        assert_eq!(metrics.token_overhead, None);
    }

    #[cfg(unix)]
    #[tokio::test]
    async fn test_external_runner_env_injection() {
        // The script fails (exit 3) unless the custom variable arrived, and it
        // derives first_llm_call_epoch_ns from the injected BENCH_START_EPOCH_NS,
        // so a successful run with a 5000 µs cold start proves both were passed.
        let script = r#"[ "$CUSTOM_VAR" = "hello" ] || exit 3; FIRST_CALL=$(expr $BENCH_START_EPOCH_NS + 5000000); echo "{\"framework\":\"env-test\",\"cold_start_us\":0,\"first_llm_call_epoch_ns\":$FIRST_CALL,\"loop_overhead\":{\"min_us\":1,\"max_us\":2,\"mean_us\":1,\"median_us\":1,\"p95_us\":2,\"p99_us\":2,\"count\":1}}"; #"#;

        let runner = ExternalRunner::new(10);
        let config = shell_config(
            "env-test",
            script,
            vec![("CUSTOM_VAR".to_string(), "hello".to_string())],
        );

        let metrics = runner
            .run(&config, "/tmp/workload.json")
            .await
            .expect("run with injected environment should succeed");
        assert_eq!(metrics.framework, "env-test");
        // cold_start = (first_llm_call_epoch_ns - start_epoch_ns) / 1000
        //            = 5_000_000 ns / 1000 = 5000 µs
        assert_eq!(metrics.cold_start_us, 5000);
    }
}