Skip to main content

smith_protocol/
benchmark.rs

1//! Benchmark event protocol for Smith agent optimization
2//!
3//! This module defines the complete event schema for collecting agent performance
4//! data to optimize Smith's performance on coding benchmarks like SWE-bench.
5
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::time::Duration;
10use uuid::Uuid;
11
/// Core benchmark event with required tracking fields
///
/// Each event pairs a fixed set of correlation fields with a typed payload.
/// [`BenchmarkEvent::new`] stamps `ts` with the current UTC time and fills
/// `data` with the JSON form of `event_type`, so the payload appears twice in
/// the serialized event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkEvent {
    // Required IDs for event correlation
    /// Identifier of the benchmark run.
    pub run_id: Uuid,
    /// Identifier of the benchmark task (the tests in this file use values
    /// such as `"swe_bench_task_123"`).
    pub task_id: String,
    /// Identifier of the step that emitted the event.
    pub step_id: Uuid,
    /// Event timestamp in UTC.
    pub ts: DateTime<Utc>,
    /// Identifier of the policy in effect.
    /// NOTE(review): the naming scheme is not defined in this file.
    pub policy_id: String,
    /// Configuration hash; the tests populate it from [`RunConfig::hash`].
    pub cfg_hash: String,
    /// Environment hash; the tests populate it from [`RunConfig::env_hash`].
    pub env_hash: String,
    /// Seed recorded with the event — presumably the run's RNG seed; confirm
    /// with the producer.
    pub seed: u64,

    // Event payload
    /// Typed payload; its variant determines [`BenchmarkEvent::event_type_name`].
    pub event_type: BenchmarkEventType,
    /// JSON copy of `event_type`, as produced by [`BenchmarkEvent::new`].
    pub data: serde_json::Value,
}
29
/// All benchmark event types for agent optimization
///
/// Each variant wraps the payload struct for one kind of event. No serde
/// representation attribute is set, so serde's default externally tagged form
/// applies: the variant name is the JSON key that wraps the payload.
/// [`BenchmarkEvent::event_type_name`] maps each variant to a snake_case name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BenchmarkEventType {
    /// Run start, carrying the run's configuration (`"run_start"`).
    RunStart(RunConfig),
    /// Run end, carrying the run's results (`"run_stop"`).
    RunStop(RunResult),
    /// A single reasoning/action step (`"step"`).
    Step(StepData),
    /// Metrics for one tool invocation (`"tool_call"`).
    ToolCall(ToolPerformance),
    /// A context pruning decision (`"context_decision"`).
    ContextDecision(PruningDecision),
    /// A structured failure analysis (`"failure"`).
    Failure(FailureAnalysis),
    /// A configuration suggestion from the optimizer (`"optimizer_update"`).
    OptimizerUpdate(ConfigSuggestion),
}
41
/// Configuration for a benchmark run
///
/// Carried by [`BenchmarkEventType::RunStart`] and embedded in
/// [`ConfigSuggestion`]. [`RunConfig::hash`] digests the JSON serialization
/// of the whole struct, so adding, renaming, or reordering fields changes the
/// resulting configuration hash. The values quoted below are the ones set by
/// the `Default` implementation in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunConfig {
    // Agent behavior parameters
    /// Iteration limit for the agent (default `10`).
    pub max_iterations: usize,
    /// Temperature setting (default `0.7`) — presumably the model sampling
    /// temperature; confirm with the consumer.
    pub temperature: f32,
    /// Context window size (default `8192`).
    /// NOTE(review): unit (tokens vs. bytes) is not stated in this file.
    pub context_window: usize,
    /// Threshold used for context pruning (default `0.8`).
    pub pruning_threshold: f32,
    /// Planner depth (default `3`).
    pub planner_depth: usize,
    /// Number of debate rounds (default `2`).
    pub debate_rounds: usize,

    // Tool configuration
    /// Names of the tools available to the agent (default `bash`,
    /// `str_replace_editor`, `grep`).
    pub available_tools: Vec<String>,
    /// Tool timeout in milliseconds (default `30000`).
    pub tool_timeout_ms: u64,
    /// Retry behaviour; see [`RetryPolicy`].
    pub retry_policy: RetryPolicy,

    // Early stopping criteria
    /// Execution timeout (default 600 seconds).
    pub execution_timeout: Duration,
    /// Patience value for failing tests (default `3`).
    /// NOTE(review): what is counted (iterations, runs) is not defined here.
    pub failing_tests_patience: usize,
    /// Timeout applied when no progress is made (default 300 seconds).
    pub no_progress_timeout: Duration,

    // Environment settings
    /// Docker image reference (default `smith:latest`); part of
    /// [`RunConfig::env_hash`].
    pub docker_image: String,
    /// Tools commit or ref (default `main`); part of [`RunConfig::env_hash`].
    pub tools_commit: String,
    /// Sandbox resource limits; part of [`RunConfig::env_hash`].
    pub sandbox_limits: SandboxLimits,
}
68
/// Results from a completed benchmark run
///
/// Payload of [`BenchmarkEventType::RunStop`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunResult {
    // Core success metrics (for reward function)
    /// Whether the run succeeded.
    pub success: bool,
    /// Wall-clock duration of the run in milliseconds.
    pub wall_time_ms: u64,
    /// Total token count for the run.
    pub total_tokens: u64,
    /// Number of tool errors encountered.
    pub tool_errors: u32,
    /// Stability variance metric.
    /// NOTE(review): how it is computed is not defined in this file.
    pub stability_variance: f64,

    // Detailed results
    /// Final score. NOTE(review): scale is not defined here (the tests use `95.0`).
    pub final_score: f64,
    /// Names of the tests that were still failing.
    pub failing_tests: Vec<String>,
    /// Number of tests that passed.
    pub tests_passed: u32,
    /// Number of tests that failed.
    pub tests_failed: u32,
    /// Number of files modified.
    pub files_modified: u32,
    /// Whether the run was stopped early.
    pub early_stopped: bool,
    /// Reason for the early stop, if any.
    pub early_stop_reason: Option<String>,
}
88
/// Individual reasoning/action step data
///
/// Payload of [`BenchmarkEventType::Step`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StepData {
    /// Kind of step; see [`StepType`].
    pub step_type: StepType,
    /// Text content of the step.
    pub content: String,
    /// Input token count — presumably prompt tokens; confirm with the producer.
    pub tokens_in: u32,
    /// Output token count — presumably completion tokens; confirm with the producer.
    pub tokens_out: u32,
    /// Wall-clock duration of the step in milliseconds.
    pub wall_time_ms: u64,
    /// Whether the step was planned.
    pub planned_step: bool,
    /// Context size; the `_kb` suffix suggests kilobytes — TODO confirm.
    pub context_kb: usize,
}
100
/// Kind of step recorded in [`StepData::step_type`].
///
/// Unit variants serialize as their variant name (serde's default for
/// fieldless enum variants).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum StepType {
    /// Reasoning step.
    Reason,
    /// Action step.
    Act,
    /// Observation step.
    Observe,
    /// Concluding step.
    Conclude,
    /// Planning step.
    Plan,
    /// Debate step.
    Debate,
}
110
/// Tool performance metrics for optimization
///
/// Payload of [`BenchmarkEventType::ToolCall`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolPerformance {
    /// Name of the invoked tool.
    pub tool_name: String,
    /// Hash of the tool arguments.
    /// NOTE(review): the hashing scheme is not defined in this file.
    pub tool_args_hash: String,
    /// Latency of the call in milliseconds.
    pub latency_ms: u64,
    /// Number of retries performed.
    pub retries: u32,
    /// How the call ended; see [`ExitKind`].
    pub exit_kind: ExitKind,
    /// Files and bytes touched by the call; see [`EvidenceFootprint`].
    pub evidence_footprint: EvidenceFootprint,
    /// Optional user feedback score.
    /// NOTE(review): scale is not defined here (the tests use `4.5`).
    pub user_feedback_score: Option<f32>,
    pub intent_accuracy: Option<f32>, // Did tool do what agent wanted?
}
123
/// Outcome classification for a tool call, stored in
/// [`ToolPerformance::exit_kind`].
///
/// The `String` payloads carry free-form detail; this file does not define
/// their format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ExitKind {
    /// The call completed successfully.
    Ok,
    /// A deterministic failure, with detail text.
    Deterministic(String),
    /// A flaky failure, with detail text.
    Flake(String),
    /// Out of memory.
    Oom,
    /// The call timed out.
    Timeout,
    /// An error attributed to the user, with detail text.
    UserError(String),
    /// An error attributed to the system, with detail text.
    SystemError(String),
}
134
/// Evidence of tool's impact on the codebase
///
/// Embedded in [`ToolPerformance::evidence_footprint`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidenceFootprint {
    /// Paths of the files read.
    pub files_read: Vec<String>,
    /// Paths of the files modified.
    pub files_modified: Vec<String>,
    /// Number of lines changed.
    pub lines_changed: usize,
    /// Number of tests added.
    pub tests_added: usize,
    /// Number of tests modified.
    pub tests_modified: usize,
    /// Number of bytes read.
    pub bytes_read: usize,
    /// Number of bytes written.
    pub bytes_written: usize,
}
146
/// Context pruning decisions for optimization
///
/// Payload of [`BenchmarkEventType::ContextDecision`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PruningDecision {
    /// Identifier of the context segment the decision applies to.
    pub segment_id: String,
    /// Whether the segment was kept.
    pub kept: bool,
    /// Segment size; the `_kb` suffix suggests kilobytes — TODO confirm.
    pub segment_kb: usize,
    /// Kind of content in the segment; see [`SegmentType`].
    pub segment_type: SegmentType,
    pub impact_probe_ev: Option<f64>, // Expected value from shadow replay
    /// Name of the pruning algorithm (the tests use `"threshold_based"`).
    pub pruning_algorithm: String,
    /// Confidence score for the decision.
    /// NOTE(review): range is not defined here (the tests use `0.95`).
    pub confidence_score: f64,
}
158
/// Kind of context segment recorded in [`PruningDecision::segment_type`].
///
/// Unit variants serialize as their variant name (serde's default for
/// fieldless enum variants).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SegmentType {
    /// Contents of a file.
    FileContent,
    /// Search results.
    SearchResults,
    /// Conversation history.
    ConversationHistory,
    /// Output of a tool.
    ToolOutput,
    /// Reasoning context.
    ReasoningContext,
    /// Documentation.
    Documentation,
    /// Error logs.
    ErrorLogs,
}
169
/// Structured failure analysis for learning
///
/// Payload of [`BenchmarkEventType::Failure`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailureAnalysis {
    /// Root cause of the first failure; see [`FailureRoot`].
    pub first_failure_root: FailureRoot,
    /// Recovery attempts that were made; may be empty.
    pub recovery_attempts: Vec<RecoveryAttempt>,
    /// Classification of the final error state.
    pub final_error_state: ErrorState,
    /// Free-form contributing factors.
    pub contributing_factors: Vec<String>,
    /// Free-form remediation suggestions.
    pub remediation_suggestions: Vec<String>,
}
179
/// Root-cause category of a failure, stored in
/// [`FailureAnalysis::first_failure_root`].
///
/// Each variant carries the details specific to that category. Field contents
/// are free-form; this file does not define their format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FailureRoot {
    /// Build failure: the error type and the command involved.
    Build {
        error_type: String,
        command: String,
    },
    /// Import failure: the missing module and a suggested fix.
    Import {
        missing_module: String,
        suggested_fix: String,
    },
    /// Path problem: the invalid path and the expected location.
    Path {
        invalid_path: String,
        expected_location: String,
    },
    /// Flaky test: the test name and its failure rate.
    /// NOTE(review): the tests use `0.25`, suggesting a 0–1 fraction — confirm.
    Flake {
        flaky_test: String,
        failure_rate: f64,
    },
    /// Semantic error: the logic error and its context.
    Semantic {
        logic_error: String,
        context: String,
    },
    /// Planning error: the reasoning error and the step number involved.
    Planning {
        reasoning_error: String,
        step_number: usize,
    },
    /// Context problem: the missing information and whether pruning was at fault.
    Context {
        missing_info: String,
        pruning_error: bool,
    },
    /// Tool misuse: the tool name and the usage error.
    Tool {
        tool_name: String,
        usage_error: String,
    },
}
215
/// Recovery attempt during failure handling
///
/// Element type of [`FailureAnalysis::recovery_attempts`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryAttempt {
    /// Name of the recovery strategy (the tests use `"fix_imports"`).
    pub strategy: String,
    /// Whether the attempt succeeded.
    pub success: bool,
    /// Time spent on the attempt in milliseconds.
    pub time_spent_ms: u64,
    /// Free-form descriptions of side effects of the attempt.
    pub side_effects: Vec<String>,
}
224
/// Final error state classification
///
/// Stored in [`FailureAnalysis::final_error_state`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorState {
    /// Whether the error is considered recoverable.
    pub recoverable: bool,
    /// Free-form error category (the tests use `"build"`).
    pub error_category: String,
    /// Confidence in the classification.
    /// NOTE(review): range is not defined here (the tests use `0.85`).
    pub confidence: f64,
    /// Count of similar failures.
    /// NOTE(review): the counting window is not defined in this file.
    pub similar_failures: u32,
}
233
/// Optimizer configuration suggestions
///
/// Payload of [`BenchmarkEventType::OptimizerUpdate`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigSuggestion {
    /// The complete configuration being suggested.
    pub suggested_config: RunConfig,
    /// Confidence in the suggestion.
    /// NOTE(review): range is not defined here (the tests use `0.9`).
    pub confidence: f64,
    /// Expected improvement from adopting the suggestion.
    /// NOTE(review): metric and unit are not defined in this file.
    pub expected_improvement: f64,
    /// Exploration/exploitation balance value.
    /// NOTE(review): which end of the scale means exploration is not defined here.
    pub exploration_vs_exploitation: f64,
    /// Free-form explanation for the suggestion.
    pub reasoning: String,
}
243
/// Task features for contextual optimization
///
/// Not referenced by any event payload in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TaskFeatures {
    /// Repository size in megabytes.
    pub repo_size_mb: f64,
    /// Number of files.
    pub file_count: u32,
    /// Per-language weights keyed by language name — presumably each
    /// language's share of the repository; confirm with the producer.
    pub language_mix: HashMap<String, f64>,
    /// Number of dependencies.
    pub dependency_count: u32,
    /// Number of tests.
    pub test_count: u32,
    /// Complexity score. NOTE(review): scale is not defined in this file.
    pub complexity_score: f64,
    /// Free-form domain label.
    pub domain: String,
    /// Historical failure rate. NOTE(review): presumably a 0–1 fraction — confirm.
    pub historical_failure_rate: f64,
    /// Average completion time in milliseconds.
    pub avg_completion_time_ms: u64,
}
257
/// Retry policy configuration
///
/// Embedded in [`RunConfig::retry_policy`]. The values quoted below are the
/// ones set by the `Default` implementation in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryPolicy {
    /// Maximum number of retries (default `3`).
    pub max_retries: u32,
    /// Backoff delay in milliseconds (default `1000`).
    pub backoff_ms: u64,
    /// Whether backoff is exponential (default `true`).
    pub exponential_backoff: bool,
    /// Whether to retry on flaky failures (default `true`).
    pub retry_on_flake: bool,
    /// Whether to retry on timeouts (default `true`).
    pub retry_on_timeout: bool,
}
267
/// Sandbox resource limits
///
/// Embedded in [`RunConfig::sandbox_limits`] and included in
/// [`RunConfig::env_hash`]. The values quoted below are the ones set by the
/// `Default` implementation in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SandboxLimits {
    /// Memory limit in megabytes (default `4096`).
    pub memory_mb: u32,
    /// CPU core allowance; fractional values are representable (default `2.0`).
    pub cpu_cores: f32,
    /// Disk limit in megabytes (default `10240`).
    pub disk_mb: u32,
    /// Whether network access is enabled (default `true`).
    pub network_enabled: bool,
    /// Execution timeout in milliseconds (default `600000`, i.e. 10 minutes).
    pub execution_timeout_ms: u64,
}
277
278impl Default for RetryPolicy {
279    fn default() -> Self {
280        Self {
281            max_retries: 3,
282            backoff_ms: 1000,
283            exponential_backoff: true,
284            retry_on_flake: true,
285            retry_on_timeout: true,
286        }
287    }
288}
289
290impl Default for SandboxLimits {
291    fn default() -> Self {
292        Self {
293            memory_mb: 4096,
294            cpu_cores: 2.0,
295            disk_mb: 10240,
296            network_enabled: true,
297            execution_timeout_ms: 600000, // 10 minutes
298        }
299    }
300}
301
302impl Default for RunConfig {
303    fn default() -> Self {
304        Self {
305            max_iterations: 10,
306            temperature: 0.7,
307            context_window: 8192,
308            pruning_threshold: 0.8,
309            planner_depth: 3,
310            debate_rounds: 2,
311            available_tools: vec![
312                "bash".to_string(),
313                "str_replace_editor".to_string(),
314                "grep".to_string(),
315            ],
316            tool_timeout_ms: 30000,
317            retry_policy: RetryPolicy::default(),
318            execution_timeout: Duration::from_secs(600),
319            failing_tests_patience: 3,
320            no_progress_timeout: Duration::from_secs(300),
321            docker_image: "smith:latest".to_string(),
322            tools_commit: "main".to_string(),
323            sandbox_limits: SandboxLimits::default(),
324        }
325    }
326}
327
328impl RunConfig {
329    /// Generate deterministic hash for configuration reproducibility
330    pub fn hash(&self) -> String {
331        use std::collections::hash_map::DefaultHasher;
332        use std::hash::{Hash, Hasher};
333
334        let serialized = serde_json::to_string(self).expect("Failed to serialize RunConfig");
335
336        let mut hasher = DefaultHasher::new();
337        serialized.hash(&mut hasher);
338        format!("{:x}", hasher.finish())
339    }
340
341    /// Generate environment hash for reproducibility
342    pub fn env_hash(&self) -> String {
343        let env_string = format!(
344            "{}:{}:{:?}",
345            self.docker_image, self.tools_commit, self.sandbox_limits
346        );
347
348        use std::collections::hash_map::DefaultHasher;
349        use std::hash::{Hash, Hasher};
350
351        let mut hasher = DefaultHasher::new();
352        env_string.hash(&mut hasher);
353        format!("{:x}", hasher.finish())
354    }
355}
356
357impl BenchmarkEvent {
358    /// Create a new benchmark event with automatic timestamp
359    #[allow(clippy::too_many_arguments)]
360    pub fn new(
361        run_id: Uuid,
362        task_id: String,
363        step_id: Uuid,
364        policy_id: String,
365        cfg_hash: String,
366        env_hash: String,
367        seed: u64,
368        event_type: BenchmarkEventType,
369    ) -> Self {
370        Self {
371            run_id,
372            task_id,
373            step_id,
374            ts: Utc::now(),
375            policy_id,
376            cfg_hash,
377            env_hash,
378            seed,
379            data: serde_json::to_value(&event_type).unwrap(),
380            event_type,
381        }
382    }
383
384    /// Get event as JSON for NATS streaming
385    pub fn to_json(&self) -> serde_json::Value {
386        serde_json::to_value(self).unwrap()
387    }
388
389    /// Get string representation of event type for ClickHouse
390    pub fn event_type_name(&self) -> &'static str {
391        match self.event_type {
392            BenchmarkEventType::RunStart(_) => "run_start",
393            BenchmarkEventType::RunStop(_) => "run_stop",
394            BenchmarkEventType::Step(_) => "step",
395            BenchmarkEventType::ToolCall(_) => "tool_call",
396            BenchmarkEventType::ContextDecision(_) => "context_decision",
397            BenchmarkEventType::Failure(_) => "failure",
398            BenchmarkEventType::OptimizerUpdate(_) => "optimizer_update",
399        }
400    }
401}
402
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a configuration that differs from `RunConfig::default()` in its
    /// tools, retry policy, timeouts, tools commit and sandbox limits.
    fn custom_config() -> RunConfig {
        RunConfig {
            available_tools: vec!["ripgrep".to_string(), "file_edit".to_string()],
            retry_policy: RetryPolicy {
                retry_on_timeout: false,
                ..RetryPolicy::default()
            },
            execution_timeout: Duration::from_secs(300),
            failing_tests_patience: 5,
            no_progress_timeout: Duration::from_secs(60),
            tools_commit: "abc123".to_string(),
            sandbox_limits: SandboxLimits {
                memory_mb: 1024,
                cpu_cores: 2.0,
                disk_mb: 2048,
                network_enabled: false,
                execution_timeout_ms: 300000,
            },
            ..RunConfig::default()
        }
    }

    /// Wraps `event_type` in an event whose correlation fields are fixed
    /// placeholders, for tests that only care about the payload.
    fn event_with(event_type: BenchmarkEventType) -> BenchmarkEvent {
        BenchmarkEvent::new(
            Uuid::new_v4(),
            "task".to_string(),
            Uuid::new_v4(),
            "policy".to_string(),
            "cfg".to_string(),
            "env".to_string(),
            0,
            event_type,
        )
    }

    /// Builds an unrecoverable failure analysis rooted at `root`, with no
    /// recovery attempts.
    fn failure_with_root(root: FailureRoot) -> FailureAnalysis {
        FailureAnalysis {
            first_failure_root: root,
            recovery_attempts: vec![],
            final_error_state: ErrorState {
                recoverable: false,
                error_category: "test".to_string(),
                confidence: 0.9,
                similar_failures: 1,
            },
            contributing_factors: vec!["test_factor".to_string()],
            remediation_suggestions: vec!["test_suggestion".to_string()],
        }
    }

    #[test]
    fn test_benchmark_event_serialization() {
        let run_config = custom_config();
        let cfg_hash = run_config.hash();
        let env_hash = run_config.env_hash();

        let event = BenchmarkEvent::new(
            Uuid::new_v4(),
            "swe_bench_task_123".to_string(),
            Uuid::new_v4(),
            "policy_v1".to_string(),
            cfg_hash.clone(),
            env_hash.clone(),
            42,
            BenchmarkEventType::RunStart(run_config),
        );

        let json = event.to_json();
        assert!(json.is_object());
        assert!(json.get("run_id").is_some());
        assert!(json.get("event_type").is_some());

        // The enum is externally tagged: the variant name wraps the payload.
        assert!(json["event_type"].get("RunStart").is_some());

        // `data` is a JSON copy of the typed payload.
        assert_eq!(
            event.data,
            serde_json::to_value(&event.event_type).unwrap()
        );
        assert_eq!(json["data"], json["event_type"]);

        // The serialized event can be read back.
        let decoded: BenchmarkEvent = serde_json::from_value(json).unwrap();
        assert_eq!(decoded.run_id, event.run_id);
        assert_eq!(decoded.step_id, event.step_id);
        assert_eq!(decoded.task_id, "swe_bench_task_123");
        assert_eq!(decoded.policy_id, "policy_v1");
        assert_eq!(decoded.cfg_hash, cfg_hash);
        assert_eq!(decoded.env_hash, env_hash);
        assert_eq!(decoded.seed, 42);
        assert_eq!(decoded.event_type_name(), "run_start");
    }

    #[test]
    fn test_defaults() {
        let retry_policy = RetryPolicy::default();
        assert_eq!(retry_policy.max_retries, 3);
        assert_eq!(retry_policy.backoff_ms, 1000);
        assert!(retry_policy.exponential_backoff);
        assert!(retry_policy.retry_on_flake);
        assert!(retry_policy.retry_on_timeout);

        let sandbox_limits = SandboxLimits::default();
        assert_eq!(sandbox_limits.memory_mb, 4096);
        assert_eq!(sandbox_limits.cpu_cores, 2.0);
        assert_eq!(sandbox_limits.disk_mb, 10240);
        assert!(sandbox_limits.network_enabled);
        assert_eq!(sandbox_limits.execution_timeout_ms, 600000);

        let run_config = RunConfig::default();
        assert_eq!(run_config.max_iterations, 10);
        assert_eq!(run_config.temperature, 0.7);
        assert_eq!(run_config.context_window, 8192);
        assert_eq!(run_config.pruning_threshold, 0.8);
        assert_eq!(run_config.planner_depth, 3);
        assert_eq!(run_config.debate_rounds, 2);
        assert!(!run_config.available_tools.is_empty());
        assert_eq!(run_config.tool_timeout_ms, 30000);
        assert_eq!(run_config.execution_timeout, Duration::from_secs(600));
        assert_eq!(run_config.failing_tests_patience, 3);
        assert_eq!(run_config.no_progress_timeout, Duration::from_secs(300));
        assert_eq!(run_config.docker_image, "smith:latest");
        assert_eq!(run_config.tools_commit, "main");
    }

    #[test]
    fn test_event_type_name() {
        let run_config = RunConfig::default();

        let run_result = RunResult {
            success: true,
            wall_time_ms: 1000,
            total_tokens: 500,
            tool_errors: 0,
            stability_variance: 0.1,
            final_score: 95.0,
            failing_tests: vec![],
            tests_passed: 10,
            tests_failed: 0,
            files_modified: 2,
            early_stopped: false,
            early_stop_reason: None,
        };

        let step_data = StepData {
            step_type: StepType::Reason,
            content: "Analyzing the problem".to_string(),
            tokens_in: 100,
            tokens_out: 50,
            wall_time_ms: 500,
            planned_step: true,
            context_kb: 4,
        };

        let tool_performance = ToolPerformance {
            tool_name: "bash".to_string(),
            tool_args_hash: "abc123".to_string(),
            latency_ms: 200,
            retries: 0,
            exit_kind: ExitKind::Ok,
            evidence_footprint: EvidenceFootprint {
                files_read: vec!["/tmp/test.py".to_string()],
                files_modified: vec![],
                lines_changed: 0,
                tests_added: 0,
                tests_modified: 0,
                bytes_read: 1024,
                bytes_written: 0,
            },
            user_feedback_score: Some(4.5),
            intent_accuracy: Some(0.9),
        };

        let pruning_decision = PruningDecision {
            segment_id: "context_123".to_string(),
            kept: true,
            segment_kb: 10,
            segment_type: SegmentType::FileContent,
            impact_probe_ev: Some(0.8),
            pruning_algorithm: "threshold_based".to_string(),
            confidence_score: 0.95,
        };

        let failure_analysis = FailureAnalysis {
            first_failure_root: FailureRoot::Build {
                error_type: "compilation_error".to_string(),
                command: "make build".to_string(),
            },
            recovery_attempts: vec![RecoveryAttempt {
                strategy: "fix_imports".to_string(),
                success: false,
                time_spent_ms: 5000,
                side_effects: vec!["modified_file".to_string()],
            }],
            final_error_state: ErrorState {
                recoverable: true,
                error_category: "build".to_string(),
                confidence: 0.85,
                similar_failures: 3,
            },
            contributing_factors: vec!["missing_dependency".to_string()],
            remediation_suggestions: vec!["install_package".to_string()],
        };

        let config_suggestion = ConfigSuggestion {
            suggested_config: run_config.clone(),
            confidence: 0.9,
            expected_improvement: 0.15,
            exploration_vs_exploitation: 0.3,
            reasoning: "Based on successful patterns".to_string(),
        };

        // One row per variant: the payload and the name it must map to.
        let cases = vec![
            (BenchmarkEventType::RunStart(run_config), "run_start"),
            (BenchmarkEventType::RunStop(run_result), "run_stop"),
            (BenchmarkEventType::Step(step_data), "step"),
            (BenchmarkEventType::ToolCall(tool_performance), "tool_call"),
            (
                BenchmarkEventType::ContextDecision(pruning_decision),
                "context_decision",
            ),
            (BenchmarkEventType::Failure(failure_analysis), "failure"),
            (
                BenchmarkEventType::OptimizerUpdate(config_suggestion),
                "optimizer_update",
            ),
        ];

        for (event_type, expected_name) in cases {
            assert_eq!(event_with(event_type).event_type_name(), expected_name);
        }
    }

    #[test]
    fn test_env_hash() {
        let config = RunConfig::default();
        let hash1 = config.env_hash();

        let mut config2 = config.clone();
        config2.docker_image = "smith:v2".to_string();
        let hash2 = config2.env_hash();

        assert_ne!(hash1, hash2);

        // Same config should produce same hash
        assert_eq!(hash1, config.env_hash());

        // Fields outside the environment do not affect the environment hash.
        let mut config3 = config.clone();
        config3.temperature = 0.1;
        assert_eq!(hash1, config3.env_hash());
    }

    #[test]
    fn test_complex_failure_roots() {
        let failure_roots = vec![
            FailureRoot::Build {
                error_type: "compilation_error".to_string(),
                command: "make build".to_string(),
            },
            FailureRoot::Import {
                missing_module: "numpy".to_string(),
                suggested_fix: "pip install numpy".to_string(),
            },
            FailureRoot::Path {
                invalid_path: "/invalid/path".to_string(),
                expected_location: "/correct/path".to_string(),
            },
            FailureRoot::Flake {
                flaky_test: "test_network".to_string(),
                failure_rate: 0.25,
            },
            FailureRoot::Semantic {
                logic_error: "Off-by-one error".to_string(),
                context: "Loop iteration".to_string(),
            },
            FailureRoot::Planning {
                reasoning_error: "Incorrect assumption".to_string(),
                step_number: 3,
            },
            FailureRoot::Context {
                missing_info: "API documentation".to_string(),
                pruning_error: true,
            },
            FailureRoot::Tool {
                tool_name: "grep".to_string(),
                usage_error: "Invalid regex".to_string(),
            },
        ];

        for failure_root in failure_roots {
            let failure_analysis = failure_with_root(failure_root);

            // Test serialization
            let json = serde_json::to_value(&failure_analysis).unwrap();
            let deserialized: FailureAnalysis = serde_json::from_value(json.clone()).unwrap();

            // Basic validation that we can round-trip serialize
            assert_eq!(
                deserialized.contributing_factors,
                failure_analysis.contributing_factors
            );
            assert_eq!(
                deserialized.remediation_suggestions,
                failure_analysis.remediation_suggestions
            );

            // Re-serializing must reproduce the same JSON, which also checks
            // the variant-specific fields of the failure root.
            assert_eq!(serde_json::to_value(&deserialized).unwrap(), json);
        }
    }

    #[test]
    fn test_config_hashing() {
        let config1 = custom_config();
        let config2 = config1.clone();

        assert_eq!(config1.hash(), config2.hash());

        // Different configs should have different hashes
        let mut config3 = config1.clone();
        config3.temperature = 0.8;
        assert_ne!(config1.hash(), config3.hash());
    }
}