1use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::time::Duration;
10use uuid::Uuid;
11
/// One benchmark telemetry record: identifying metadata plus a typed payload.
///
/// [`BenchmarkEvent::new`] is the constructor defined in this file; it stamps
/// `ts` with the current UTC time and derives `data` from `event_type`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkEvent {
    /// Identifier of the benchmark run this event is recorded under.
    pub run_id: Uuid,
    /// Identifier of the task (free-form string; format not constrained here).
    pub task_id: String,
    /// Identifier of the step this event is attached to.
    pub step_id: Uuid,
    /// Event timestamp; `new` sets it to `Utc::now()`.
    pub ts: DateTime<Utc>,
    /// Identifier of the policy in use (free-form string).
    pub policy_id: String,
    /// Configuration hash supplied by the caller; the unit tests pass
    /// `RunConfig::hash()` here.
    pub cfg_hash: String,
    /// Environment hash supplied by the caller; the unit tests pass
    /// `RunConfig::env_hash()` here.
    pub env_hash: String,
    /// Seed recorded with the event.
    /// NOTE(review): presumably the RNG seed of the run — confirm with producers.
    pub seed: u64,

    /// Typed payload describing what happened.
    pub event_type: BenchmarkEventType,
    /// JSON rendering of `event_type` as produced by `new`, i.e. the same
    /// payload in untyped form.
    pub data: serde_json::Value,
}
29
/// Typed payload of a [`BenchmarkEvent`]; each variant wraps the struct that
/// carries the details for that kind of event.
///
/// No serde representation attribute is set, so serde's default enum
/// representation applies and the variant names below appear in serialized
/// output. `BenchmarkEvent::event_type_name` maps each variant to a
/// snake_case label.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BenchmarkEventType {
    /// Carries the [`RunConfig`] of a run (label `"run_start"`).
    RunStart(RunConfig),
    /// Carries the [`RunResult`] of a run (label `"run_stop"`).
    RunStop(RunResult),
    /// Carries one [`StepData`] record (label `"step"`).
    Step(StepData),
    /// Carries one [`ToolPerformance`] record (label `"tool_call"`).
    ToolCall(ToolPerformance),
    /// Carries one [`PruningDecision`] (label `"context_decision"`).
    ContextDecision(PruningDecision),
    /// Carries a [`FailureAnalysis`] (label `"failure"`).
    Failure(FailureAnalysis),
    /// Carries a [`ConfigSuggestion`] (label `"optimizer_update"`).
    OptimizerUpdate(ConfigSuggestion),
}
41
/// Configuration of a benchmark run.
///
/// `RunConfig::hash` fingerprints the JSON serialization of the whole struct,
/// so adding, removing, or reordering fields changes the resulting hash.
/// `RunConfig::env_hash` covers only `docker_image`, `tools_commit`, and
/// `sandbox_limits`. Default values are defined in `impl Default for RunConfig`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunConfig {
    /// Maximum number of iterations (default 10).
    pub max_iterations: usize,
    /// Temperature setting (default 0.7).
    /// NOTE(review): presumably the model sampling temperature — confirm.
    pub temperature: f32,
    /// Context window size (default 8192); the unit is not stated in this file.
    pub context_window: usize,
    /// Threshold used for context pruning (default 0.8); the comparison
    /// direction is decided by the consumer, not here.
    pub pruning_threshold: f32,
    /// Planner depth (default 3).
    pub planner_depth: usize,
    /// Number of debate rounds (default 2).
    pub debate_rounds: usize,

    /// Names of the tools made available to the run.
    pub available_tools: Vec<String>,
    /// Per-tool timeout; milliseconds per the `_ms` suffix (default 30000).
    pub tool_timeout_ms: u64,
    /// Retry behaviour, see [`RetryPolicy`].
    pub retry_policy: RetryPolicy,

    /// Overall execution timeout (default 600 s). Note this is a `Duration`,
    /// whereas `sandbox_limits.execution_timeout_ms` is a raw `u64`.
    pub execution_timeout: Duration,
    /// Patience value related to failing tests (default 3).
    /// NOTE(review): exact early-stop semantics live in the consumer — confirm.
    pub failing_tests_patience: usize,
    /// Timeout applied when no progress is observed (default 300 s).
    pub no_progress_timeout: Duration,

    /// Docker image name (default `"smith:latest"`); part of `env_hash`.
    pub docker_image: String,
    /// Tools commit/ref (default `"main"`); part of `env_hash`.
    pub tools_commit: String,
    /// Sandbox resource limits; part of `env_hash` via their `Debug` output.
    pub sandbox_limits: SandboxLimits,
}
68
/// Outcome summary of a run; payload of `BenchmarkEventType::RunStop`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunResult {
    /// Whether the run is considered successful.
    pub success: bool,
    /// Wall-clock duration; milliseconds per the `_ms` suffix.
    pub wall_time_ms: u64,
    /// Total token count reported for the run.
    pub total_tokens: u64,
    /// Number of tool errors reported for the run.
    pub tool_errors: u32,
    /// Stability variance metric.
    /// NOTE(review): how this is computed is not visible here — confirm.
    pub stability_variance: f64,

    /// Final score; the scale is not defined in this file.
    pub final_score: f64,
    /// Names of the tests that were failing.
    pub failing_tests: Vec<String>,
    /// Count of passing tests.
    pub tests_passed: u32,
    /// Count of failing tests.
    pub tests_failed: u32,
    /// Count of modified files.
    pub files_modified: u32,
    /// Whether the run was stopped early.
    pub early_stopped: bool,
    /// Optional free-form reason for an early stop.
    pub early_stop_reason: Option<String>,
}
88
/// Record of a single step; payload of `BenchmarkEventType::Step`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StepData {
    /// Kind of step, see [`StepType`].
    pub step_type: StepType,
    /// Free-form textual content of the step.
    pub content: String,
    /// Input token count for the step.
    pub tokens_in: u32,
    /// Output token count for the step.
    pub tokens_out: u32,
    /// Wall-clock duration; milliseconds per the `_ms` suffix.
    pub wall_time_ms: u64,
    /// Whether the step was a planned one.
    /// NOTE(review): what counts as "planned" is defined by the producer.
    pub planned_step: bool,
    /// Context size; kilobytes per the `_kb` suffix.
    pub context_kb: usize,
}
100
/// Kind of step recorded in `StepData::step_type`.
///
/// The variants carry no data; their names are what serde writes out.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum StepType {
    Reason,
    Act,
    Observe,
    Conclude,
    Plan,
    Debate,
}
110
/// Measurements for one tool invocation; payload of
/// `BenchmarkEventType::ToolCall`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolPerformance {
    /// Name of the invoked tool.
    pub tool_name: String,
    /// Hash of the tool arguments, supplied by the caller as a string; the
    /// hashing scheme is not defined in this file.
    pub tool_args_hash: String,
    /// Latency of the call; milliseconds per the `_ms` suffix.
    pub latency_ms: u64,
    /// Number of retries performed.
    pub retries: u32,
    /// How the call ended, see [`ExitKind`].
    pub exit_kind: ExitKind,
    /// Files and bytes touched by the call, see [`EvidenceFootprint`].
    pub evidence_footprint: EvidenceFootprint,
    /// Optional user feedback score; the scale is not defined here.
    pub user_feedback_score: Option<f32>,
    /// Optional intent-accuracy score; the scale is not defined here.
    pub intent_accuracy: Option<f32>,
}
123
/// How a tool invocation ended (`ToolPerformance::exit_kind`).
///
/// Variants with a `String` carry a free-form detail message whose content is
/// not constrained by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ExitKind {
    /// The call completed without error.
    Ok,
    /// NOTE(review): presumably a reproducible (non-flaky) failure — confirm.
    Deterministic(String),
    /// NOTE(review): presumably an intermittent failure — confirm.
    Flake(String),
    /// NOTE(review): presumably an out-of-memory termination — confirm.
    Oom,
    /// The call timed out.
    Timeout,
    /// Error attributed to the user, with detail text.
    UserError(String),
    /// Error attributed to the system, with detail text.
    SystemError(String),
}
134
/// Files and byte counts touched by a tool call
/// (`ToolPerformance::evidence_footprint`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidenceFootprint {
    /// Paths of files that were read.
    pub files_read: Vec<String>,
    /// Paths of files that were modified.
    pub files_modified: Vec<String>,
    /// Number of changed lines.
    pub lines_changed: usize,
    /// Number of tests added.
    pub tests_added: usize,
    /// Number of tests modified.
    pub tests_modified: usize,
    /// Number of bytes read.
    pub bytes_read: usize,
    /// Number of bytes written.
    pub bytes_written: usize,
}
146
/// A keep-or-drop decision about one context segment; payload of
/// `BenchmarkEventType::ContextDecision`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PruningDecision {
    /// Identifier of the context segment.
    pub segment_id: String,
    /// `true` when the segment was kept.
    pub kept: bool,
    /// Segment size; kilobytes per the `_kb` suffix.
    pub segment_kb: usize,
    /// Category of the segment, see [`SegmentType`].
    pub segment_type: SegmentType,
    /// Optional impact-probe value.
    /// NOTE(review): "ev" presumably means expected value — confirm.
    pub impact_probe_ev: Option<f64>,
    /// Name of the pruning algorithm that made the decision.
    pub pruning_algorithm: String,
    /// Confidence attached to the decision; the range is not enforced here.
    pub confidence_score: f64,
}
158
/// Category of a context segment (`PruningDecision::segment_type`).
///
/// The variants carry no data; their names are what serde writes out.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SegmentType {
    FileContent,
    SearchResults,
    ConversationHistory,
    ToolOutput,
    ReasoningContext,
    Documentation,
    ErrorLogs,
}
169
/// Structured description of a failure; payload of
/// `BenchmarkEventType::Failure`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailureAnalysis {
    /// Root cause of the first failure, see [`FailureRoot`].
    pub first_failure_root: FailureRoot,
    /// Recovery attempts that were made; may be empty.
    pub recovery_attempts: Vec<RecoveryAttempt>,
    /// Error state at the end, see [`ErrorState`].
    pub final_error_state: ErrorState,
    /// Free-form list of contributing factors.
    pub contributing_factors: Vec<String>,
    /// Free-form list of suggested remediations.
    pub remediation_suggestions: Vec<String>,
}
179
/// Root-cause classification used by `FailureAnalysis::first_failure_root`.
///
/// Each variant is a struct-like variant whose fields hold the details for
/// that category; all string fields are free-form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FailureRoot {
    /// Build failure: the error type and the command involved.
    Build {
        error_type: String,
        command: String,
    },
    /// Import failure: the missing module and a suggested fix.
    Import {
        missing_module: String,
        suggested_fix: String,
    },
    /// Path failure: the invalid path and the expected location.
    Path {
        invalid_path: String,
        expected_location: String,
    },
    /// Flaky test: the test name and its failure rate (range not enforced).
    Flake {
        flaky_test: String,
        failure_rate: f64,
    },
    /// Semantic failure: the logic error and its context.
    Semantic {
        logic_error: String,
        context: String,
    },
    /// Planning failure: the reasoning error and the step number involved.
    Planning {
        reasoning_error: String,
        step_number: usize,
    },
    /// Context failure: the missing information and whether pruning is
    /// flagged as the cause.
    Context {
        missing_info: String,
        pruning_error: bool,
    },
    /// Tool failure: the tool name and the usage error.
    Tool {
        tool_name: String,
        usage_error: String,
    },
}
215
/// One recovery attempt listed in `FailureAnalysis::recovery_attempts`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryAttempt {
    /// Name of the strategy that was tried (free-form).
    pub strategy: String,
    /// Whether the attempt succeeded.
    pub success: bool,
    /// Time spent on the attempt; milliseconds per the `_ms` suffix.
    pub time_spent_ms: u64,
    /// Free-form list of side effects of the attempt.
    pub side_effects: Vec<String>,
}
224
/// Final error state recorded in `FailureAnalysis::final_error_state`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorState {
    /// Whether the error is considered recoverable.
    pub recoverable: bool,
    /// Free-form error category label.
    pub error_category: String,
    /// Confidence in this assessment; the range is not enforced here.
    pub confidence: f64,
    /// Count of similar failures.
    /// NOTE(review): the scope of the count is defined by the producer — confirm.
    pub similar_failures: u32,
}
233
/// A proposed configuration with supporting scores; payload of
/// `BenchmarkEventType::OptimizerUpdate`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigSuggestion {
    /// The complete configuration being suggested.
    pub suggested_config: RunConfig,
    /// Confidence in the suggestion; the range is not enforced here.
    pub confidence: f64,
    /// Expected improvement; unit and baseline are not defined in this file.
    pub expected_improvement: f64,
    /// Exploration/exploitation balance value.
    /// NOTE(review): which end of the scale means "explore" is not visible
    /// here — confirm.
    pub exploration_vs_exploitation: f64,
    /// Free-form explanation for the suggestion.
    pub reasoning: String,
}
243
/// Descriptive features of a task.
///
/// Nothing else in the visible part of this file refers to this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TaskFeatures {
    /// Repository size; megabytes per the `_mb` suffix.
    pub repo_size_mb: f64,
    /// Number of files.
    pub file_count: u32,
    /// Map from language name to a numeric weight.
    /// NOTE(review): presumably a fraction of the codebase per language — confirm.
    pub language_mix: HashMap<String, f64>,
    /// Number of dependencies.
    pub dependency_count: u32,
    /// Number of tests.
    pub test_count: u32,
    /// Complexity score; the scale is not defined here.
    pub complexity_score: f64,
    /// Free-form domain label.
    pub domain: String,
    /// Historical failure rate; the range is not enforced here.
    pub historical_failure_rate: f64,
    /// Average completion time; milliseconds per the `_ms` suffix.
    pub avg_completion_time_ms: u64,
}
257
/// Retry settings used by `RunConfig::retry_policy`.
///
/// Defaults are provided by `impl Default for RetryPolicy`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryPolicy {
    /// Maximum number of retries (default 3).
    pub max_retries: u32,
    /// Backoff delay; milliseconds per the `_ms` suffix (default 1000).
    pub backoff_ms: u64,
    /// Whether exponential backoff is enabled (default `true`).
    pub exponential_backoff: bool,
    /// Whether to retry on a flake (default `true`).
    pub retry_on_flake: bool,
    /// Whether to retry on a timeout (default `true`).
    pub retry_on_timeout: bool,
}
267
/// Sandbox resource limits used by `RunConfig::sandbox_limits`.
///
/// `RunConfig::env_hash` embeds this struct's derived `Debug` output, so
/// renaming or reordering fields changes environment hashes. Defaults are
/// provided by `impl Default for SandboxLimits`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SandboxLimits {
    /// Memory limit; megabytes per the `_mb` suffix (default 4096).
    pub memory_mb: u32,
    /// CPU core allowance; fractional values are representable (default 2.0).
    pub cpu_cores: f32,
    /// Disk limit; megabytes per the `_mb` suffix (default 10240).
    pub disk_mb: u32,
    /// Whether network access is enabled (default `true`).
    pub network_enabled: bool,
    /// Execution timeout; milliseconds per the `_ms` suffix (default 600000).
    pub execution_timeout_ms: u64,
}
277
278impl Default for RetryPolicy {
279 fn default() -> Self {
280 Self {
281 max_retries: 3,
282 backoff_ms: 1000,
283 exponential_backoff: true,
284 retry_on_flake: true,
285 retry_on_timeout: true,
286 }
287 }
288}
289
290impl Default for SandboxLimits {
291 fn default() -> Self {
292 Self {
293 memory_mb: 4096,
294 cpu_cores: 2.0,
295 disk_mb: 10240,
296 network_enabled: true,
297 execution_timeout_ms: 600000, }
299 }
300}
301
302impl Default for RunConfig {
303 fn default() -> Self {
304 Self {
305 max_iterations: 10,
306 temperature: 0.7,
307 context_window: 8192,
308 pruning_threshold: 0.8,
309 planner_depth: 3,
310 debate_rounds: 2,
311 available_tools: vec![
312 "bash".to_string(),
313 "str_replace_editor".to_string(),
314 "grep".to_string(),
315 ],
316 tool_timeout_ms: 30000,
317 retry_policy: RetryPolicy::default(),
318 execution_timeout: Duration::from_secs(600),
319 failing_tests_patience: 3,
320 no_progress_timeout: Duration::from_secs(300),
321 docker_image: "smith:latest".to_string(),
322 tools_commit: "main".to_string(),
323 sandbox_limits: SandboxLimits::default(),
324 }
325 }
326}
327
328impl RunConfig {
329 pub fn hash(&self) -> String {
331 use std::collections::hash_map::DefaultHasher;
332 use std::hash::{Hash, Hasher};
333
334 let serialized = serde_json::to_string(self).expect("Failed to serialize RunConfig");
335
336 let mut hasher = DefaultHasher::new();
337 serialized.hash(&mut hasher);
338 format!("{:x}", hasher.finish())
339 }
340
341 pub fn env_hash(&self) -> String {
343 let env_string = format!(
344 "{}:{}:{:?}",
345 self.docker_image, self.tools_commit, self.sandbox_limits
346 );
347
348 use std::collections::hash_map::DefaultHasher;
349 use std::hash::{Hash, Hasher};
350
351 let mut hasher = DefaultHasher::new();
352 env_string.hash(&mut hasher);
353 format!("{:x}", hasher.finish())
354 }
355}
356
357impl BenchmarkEvent {
358 #[allow(clippy::too_many_arguments)]
360 pub fn new(
361 run_id: Uuid,
362 task_id: String,
363 step_id: Uuid,
364 policy_id: String,
365 cfg_hash: String,
366 env_hash: String,
367 seed: u64,
368 event_type: BenchmarkEventType,
369 ) -> Self {
370 Self {
371 run_id,
372 task_id,
373 step_id,
374 ts: Utc::now(),
375 policy_id,
376 cfg_hash,
377 env_hash,
378 seed,
379 data: serde_json::to_value(&event_type).unwrap(),
380 event_type,
381 }
382 }
383
384 pub fn to_json(&self) -> serde_json::Value {
386 serde_json::to_value(self).unwrap()
387 }
388
389 pub fn event_type_name(&self) -> &'static str {
391 match self.event_type {
392 BenchmarkEventType::RunStart(_) => "run_start",
393 BenchmarkEventType::RunStop(_) => "run_stop",
394 BenchmarkEventType::Step(_) => "step",
395 BenchmarkEventType::ToolCall(_) => "tool_call",
396 BenchmarkEventType::ContextDecision(_) => "context_decision",
397 BenchmarkEventType::Failure(_) => "failure",
398 BenchmarkEventType::OptimizerUpdate(_) => "optimizer_update",
399 }
400 }
401}
402
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a non-default configuration shared by the serialization and
    /// hashing tests.
    fn sample_config() -> RunConfig {
        RunConfig {
            max_iterations: 10,
            temperature: 0.7,
            context_window: 8192,
            pruning_threshold: 0.8,
            planner_depth: 3,
            debate_rounds: 2,
            available_tools: vec!["ripgrep".to_string(), "file_edit".to_string()],
            tool_timeout_ms: 30000,
            retry_policy: RetryPolicy {
                max_retries: 3,
                backoff_ms: 1000,
                exponential_backoff: true,
                retry_on_flake: true,
                retry_on_timeout: false,
            },
            execution_timeout: Duration::from_secs(300),
            failing_tests_patience: 5,
            no_progress_timeout: Duration::from_secs(60),
            docker_image: "smith:latest".to_string(),
            tools_commit: "abc123".to_string(),
            sandbox_limits: SandboxLimits {
                memory_mb: 1024,
                cpu_cores: 2.0,
                disk_mb: 2048,
                network_enabled: false,
                execution_timeout_ms: 300000,
            },
        }
    }

    /// Wraps `event_type` in an event with fixed placeholder metadata.
    fn event_with(event_type: BenchmarkEventType) -> BenchmarkEvent {
        BenchmarkEvent::new(
            Uuid::new_v4(),
            "task".to_string(),
            Uuid::new_v4(),
            "policy".to_string(),
            "cfg".to_string(),
            "env".to_string(),
            0,
            event_type,
        )
    }

    #[test]
    fn test_benchmark_event_serialization() {
        let run_config = sample_config();
        let cfg_hash = run_config.hash();
        let env_hash = run_config.env_hash();

        let event = BenchmarkEvent::new(
            Uuid::new_v4(),
            "swe_bench_task_123".to_string(),
            Uuid::new_v4(),
            "policy_v1".to_string(),
            cfg_hash.clone(),
            env_hash.clone(),
            42,
            BenchmarkEventType::RunStart(run_config),
        );

        let json = event.to_json();
        assert!(json.is_object());
        assert!(json.get("run_id").is_some());
        assert!(json.get("event_type").is_some());

        // Metadata must be carried through unchanged.
        assert_eq!(json["task_id"].as_str(), Some("swe_bench_task_123"));
        assert_eq!(json["policy_id"].as_str(), Some("policy_v1"));
        assert_eq!(json["cfg_hash"].as_str(), Some(cfg_hash.as_str()));
        assert_eq!(json["env_hash"].as_str(), Some(env_hash.as_str()));
        assert_eq!(json["seed"].as_u64(), Some(42));

        // `new` fills `data` with the JSON form of `event_type`, and the enum
        // uses serde's default representation keyed by variant name.
        assert_eq!(json["data"], json["event_type"]);
        assert!(json["event_type"].get("RunStart").is_some());
    }

    #[test]
    fn test_defaults() {
        let retry_policy = RetryPolicy::default();
        assert_eq!(retry_policy.max_retries, 3);
        assert_eq!(retry_policy.backoff_ms, 1000);
        assert!(retry_policy.exponential_backoff);
        assert!(retry_policy.retry_on_flake);
        assert!(retry_policy.retry_on_timeout);

        let sandbox_limits = SandboxLimits::default();
        assert_eq!(sandbox_limits.memory_mb, 4096);
        assert_eq!(sandbox_limits.cpu_cores, 2.0);
        assert_eq!(sandbox_limits.disk_mb, 10240);
        assert!(sandbox_limits.network_enabled);
        assert_eq!(sandbox_limits.execution_timeout_ms, 600000);

        let run_config = RunConfig::default();
        assert_eq!(run_config.max_iterations, 10);
        assert_eq!(run_config.temperature, 0.7);
        assert_eq!(run_config.context_window, 8192);
        assert_eq!(run_config.pruning_threshold, 0.8);
        assert_eq!(run_config.planner_depth, 3);
        assert_eq!(run_config.debate_rounds, 2);
        assert!(!run_config.available_tools.is_empty());
        assert_eq!(run_config.tool_timeout_ms, 30000);
        assert_eq!(run_config.execution_timeout, Duration::from_secs(600));
        assert_eq!(run_config.failing_tests_patience, 3);
        assert_eq!(run_config.no_progress_timeout, Duration::from_secs(300));
        assert_eq!(run_config.docker_image, "smith:latest");
        assert_eq!(run_config.tools_commit, "main");
    }

    #[test]
    fn test_event_type_name() {
        let run_config = RunConfig::default();
        let run_result = RunResult {
            success: true,
            wall_time_ms: 1000,
            total_tokens: 500,
            tool_errors: 0,
            stability_variance: 0.1,
            final_score: 95.0,
            failing_tests: vec![],
            tests_passed: 10,
            tests_failed: 0,
            files_modified: 2,
            early_stopped: false,
            early_stop_reason: None,
        };

        let step_data = StepData {
            step_type: StepType::Reason,
            content: "Analyzing the problem".to_string(),
            tokens_in: 100,
            tokens_out: 50,
            wall_time_ms: 500,
            planned_step: true,
            context_kb: 4,
        };

        let tool_performance = ToolPerformance {
            tool_name: "bash".to_string(),
            tool_args_hash: "abc123".to_string(),
            latency_ms: 200,
            retries: 0,
            exit_kind: ExitKind::Ok,
            evidence_footprint: EvidenceFootprint {
                files_read: vec!["/tmp/test.py".to_string()],
                files_modified: vec![],
                lines_changed: 0,
                tests_added: 0,
                tests_modified: 0,
                bytes_read: 1024,
                bytes_written: 0,
            },
            user_feedback_score: Some(4.5),
            intent_accuracy: Some(0.9),
        };

        let pruning_decision = PruningDecision {
            segment_id: "context_123".to_string(),
            kept: true,
            segment_kb: 10,
            segment_type: SegmentType::FileContent,
            impact_probe_ev: Some(0.8),
            pruning_algorithm: "threshold_based".to_string(),
            confidence_score: 0.95,
        };

        let failure_analysis = FailureAnalysis {
            first_failure_root: FailureRoot::Build {
                error_type: "compilation_error".to_string(),
                command: "make build".to_string(),
            },
            recovery_attempts: vec![RecoveryAttempt {
                strategy: "fix_imports".to_string(),
                success: false,
                time_spent_ms: 5000,
                side_effects: vec!["modified_file".to_string()],
            }],
            final_error_state: ErrorState {
                recoverable: true,
                error_category: "build".to_string(),
                confidence: 0.85,
                similar_failures: 3,
            },
            contributing_factors: vec!["missing_dependency".to_string()],
            remediation_suggestions: vec!["install_package".to_string()],
        };

        let config_suggestion = ConfigSuggestion {
            suggested_config: run_config.clone(),
            confidence: 0.9,
            expected_improvement: 0.15,
            exploration_vs_exploitation: 0.3,
            reasoning: "Based on successful patterns".to_string(),
        };

        // One row per variant: the payload and the label it must map to.
        let cases = vec![
            (BenchmarkEventType::RunStart(run_config), "run_start"),
            (BenchmarkEventType::RunStop(run_result), "run_stop"),
            (BenchmarkEventType::Step(step_data), "step"),
            (BenchmarkEventType::ToolCall(tool_performance), "tool_call"),
            (
                BenchmarkEventType::ContextDecision(pruning_decision),
                "context_decision",
            ),
            (BenchmarkEventType::Failure(failure_analysis), "failure"),
            (
                BenchmarkEventType::OptimizerUpdate(config_suggestion),
                "optimizer_update",
            ),
        ];

        for (event_type, expected_name) in cases {
            assert_eq!(event_with(event_type).event_type_name(), expected_name);
        }
    }

    #[test]
    fn test_env_hash() {
        let config = RunConfig::default();
        let hash1 = config.env_hash();

        // Each of the three environment inputs must influence the hash.
        let mut config2 = config.clone();
        config2.docker_image = "smith:v2".to_string();
        let hash2 = config2.env_hash();
        assert_ne!(hash1, hash2);

        let mut commit_changed = config.clone();
        commit_changed.tools_commit = "abc123".to_string();
        assert_ne!(hash1, commit_changed.env_hash());

        let mut sandbox_changed = config.clone();
        sandbox_changed.sandbox_limits.network_enabled = false;
        assert_ne!(hash1, sandbox_changed.env_hash());

        // Fields outside the environment must not influence it.
        let mut temperature_changed = config.clone();
        temperature_changed.temperature = 0.1;
        assert_eq!(hash1, temperature_changed.env_hash());

        // Hashing is repeatable for an unchanged configuration.
        assert_eq!(hash1, config.env_hash());
    }

    #[test]
    fn test_complex_failure_roots() {
        let failure_roots = vec![
            FailureRoot::Build {
                error_type: "compilation_error".to_string(),
                command: "make build".to_string(),
            },
            FailureRoot::Import {
                missing_module: "numpy".to_string(),
                suggested_fix: "pip install numpy".to_string(),
            },
            FailureRoot::Path {
                invalid_path: "/invalid/path".to_string(),
                expected_location: "/correct/path".to_string(),
            },
            FailureRoot::Flake {
                flaky_test: "test_network".to_string(),
                failure_rate: 0.25,
            },
            FailureRoot::Semantic {
                logic_error: "Off-by-one error".to_string(),
                context: "Loop iteration".to_string(),
            },
            FailureRoot::Planning {
                reasoning_error: "Incorrect assumption".to_string(),
                step_number: 3,
            },
            FailureRoot::Context {
                missing_info: "API documentation".to_string(),
                pruning_error: true,
            },
            FailureRoot::Tool {
                tool_name: "grep".to_string(),
                usage_error: "Invalid regex".to_string(),
            },
        ];

        for failure_root in failure_roots {
            let failure_analysis = FailureAnalysis {
                first_failure_root: failure_root,
                recovery_attempts: vec![],
                final_error_state: ErrorState {
                    recoverable: false,
                    error_category: "test".to_string(),
                    confidence: 0.9,
                    similar_failures: 1,
                },
                contributing_factors: vec!["test_factor".to_string()],
                remediation_suggestions: vec!["test_suggestion".to_string()],
            };

            let json = serde_json::to_value(&failure_analysis).unwrap();
            let deserialized: FailureAnalysis = serde_json::from_value(json.clone()).unwrap();

            assert_eq!(
                deserialized.contributing_factors,
                failure_analysis.contributing_factors
            );
            assert_eq!(
                deserialized.remediation_suggestions,
                failure_analysis.remediation_suggestions
            );

            // The types do not derive `PartialEq`, so compare the re-serialized
            // JSON to prove the failure root itself survived the round trip.
            assert_eq!(serde_json::to_value(&deserialized).unwrap(), json);
        }
    }

    #[test]
    fn test_config_hashing() {
        let config1 = sample_config();
        let config2 = config1.clone();

        assert_eq!(config1.hash(), config2.hash());

        let mut config3 = config1.clone();
        config3.temperature = 0.8;
        assert_ne!(config1.hash(), config3.hash());
    }
}