use claude_parser::{ClaudeStreamParser, PerformanceMetrics};
use std::process::Stdio;
use tokio::process::Command;
use tokio::io::BufReader;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let instance_id = "django__django-11099";
println!("Executing Claude on SWE-Bench instance: {}", instance_id);
let mut child = Command::new("claude")
.arg(format!("solve SWE-bench instance {}", instance_id))
.arg("-p")
.arg("--dangerously-skip-permissions")
.arg("--output-format")
.arg("stream-json")
.arg("--verbose")
.stdout(Stdio::piped())
.stderr(Stdio::null())
.spawn()?;
let stdout = child.stdout.take().expect("Failed to get stdout");
let reader = BufReader::new(stdout);
let mut parser = ClaudeStreamParser::new();
println!("Parsing Claude output stream...\n");
let start = std::time::Instant::now();
let metrics = parser.parse_stream(reader).await?;
let exit_status = child.wait().await?;
println!("Claude process exited with status: {}", exit_status);
display_benchmark_results(&metrics, instance_id, start.elapsed());
export_benchmark_data(&parser, instance_id).await?;
if let Ok(baseline) = load_baseline_metrics(instance_id).await {
compare_performance(&baseline, &metrics);
}
Ok(())
}
fn display_benchmark_results(
metrics: &PerformanceMetrics,
instance_id: &str,
wall_time: std::time::Duration,
) {
println!("=== Benchmark Results for {} ===", instance_id);
println!("Wall clock time: {:?}", wall_time);
println!("Stream duration: {:?}", metrics.total_duration);
println!("\n--- Performance Summary ---");
println!("Time to first output: {:?}", metrics.time_to_first_output);
println!("Total tokens: {}", metrics.token_usage.total_tokens);
println!("Tokens/second: {:.2}", metrics.token_usage.tokens_per_second);
println!("\n--- Tool Efficiency ---");
let total_tool_calls: u64 = metrics.tool_invocations
.values()
.map(|t| t.invocation_count)
.sum();
println!("Total tool invocations: {}", total_tool_calls);
for (tool, tool_metrics) in &metrics.tool_invocations {
let percentage = (tool_metrics.invocation_count as f64 / total_tool_calls as f64) * 100.0;
println!(" {}: {} calls ({:.1}%)", tool, tool_metrics.invocation_count, percentage);
}
println!("\n--- Thinking Analysis ---");
println!("Thinking sequences: {}", metrics.thinking_metrics.total_sequences);
println!("Total thinking tokens: {}", metrics.thinking_metrics.total_tokens);
if metrics.thinking_metrics.total_sequences > 0 {
let avg_duration = metrics.thinking_metrics.total_duration.as_millis() as f64
/ metrics.thinking_metrics.total_sequences as f64;
println!("Average thinking duration: {:.2}ms", avg_duration);
}
println!("\n--- Error Handling ---");
if metrics.error_metrics.total_errors > 0 {
println!("Total errors: {}", metrics.error_metrics.total_errors);
println!("Recovery rate: {:.2}%", metrics.error_metrics.recovery_success_rate * 100.0);
for (error_type, count) in &metrics.error_metrics.error_types {
println!(" {}: {}", error_type, count);
}
} else {
println!("No errors encountered");
}
}
async fn export_benchmark_data(
parser: &ClaudeStreamParser,
instance_id: &str,
) -> Result<(), Box<dyn std::error::Error>> {
let export = parser.export_training_data();
let timestamp = chrono::Utc::now().format("%Y%m%d_%H%M%S");
let filename = format!("benchmark_{}_{}.json", instance_id, timestamp);
export.to_json_file(&filename).await?;
println!("\n--- Export Complete ---");
println!("Benchmark data exported to: {}", filename);
println!("Total events captured: {}", export.metadata.event_count);
Ok(())
}
async fn load_baseline_metrics(
_instance_id: &str,
) -> Result<PerformanceMetrics, Box<dyn std::error::Error>> {
Err("No baseline metrics available".into())
}
fn compare_performance(baseline: &PerformanceMetrics, current: &PerformanceMetrics) {
println!("\n=== Performance Comparison ===");
let duration_improvement = (baseline.total_duration.as_millis() as f64 -
current.total_duration.as_millis() as f64) /
baseline.total_duration.as_millis() as f64 * 100.0;
println!("Duration: {:?} -> {:?} ({:+.1}%)",
baseline.total_duration,
current.total_duration,
duration_improvement
);
let token_improvement = (baseline.token_usage.total_tokens as f64 -
current.token_usage.total_tokens as f64) /
baseline.token_usage.total_tokens as f64 * 100.0;
println!("Total tokens: {} -> {} ({:+.1}%)",
baseline.token_usage.total_tokens,
current.token_usage.total_tokens,
token_improvement
);
let baseline_tool_calls: u64 = baseline.tool_invocations
.values()
.map(|t| t.invocation_count)
.sum();
let current_tool_calls: u64 = current.tool_invocations
.values()
.map(|t| t.invocation_count)
.sum();
let tool_improvement = (baseline_tool_calls as f64 - current_tool_calls as f64) /
baseline_tool_calls as f64 * 100.0;
println!("Tool invocations: {} -> {} ({:+.1}%)",
baseline_tool_calls,
current_tool_calls,
tool_improvement
);
println!("\n--- Summary ---");
if duration_improvement > 0.0 && token_improvement > 0.0 {
println!("Performance improved across all metrics!");
} else if duration_improvement > 0.0 || token_improvement > 0.0 {
println!("Mixed performance results - some improvements, some regressions");
} else {
println!("Performance regression detected - investigation needed");
}
}