Skip to main content

vtcode_core/core/
error_recovery.rs

1use crate::core::timeout_detector::{OperationType, TIMEOUT_DETECTOR};
2use crate::utils::current_timestamp;
3use anyhow::Result;
4use indexmap::IndexMap;
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7
8/// Represents an error that occurred during execution
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct ExecutionError {
11    pub id: String,
12    pub timestamp: u64,
13    pub error_type: ErrorType,
14    pub message: String,
15    pub context: ErrorContext,
16    pub recovery_attempts: Vec<RecoveryAttempt>,
17    pub resolved: bool,
18}
19
20/// Type of error that can occur
21///
22/// Canonical error type used across both the global error recovery manager
23/// and agent-specific error state tracking. Superset of all error categories.
24#[derive(Debug, Clone, Copy, Serialize, Deserialize, Eq, Hash, PartialEq)]
25pub enum ErrorType {
26    ToolExecution,
27    ApiCall,
28    FileSystem,
29    Network,
30    Validation,
31    CircuitBreaker,
32    Timeout,
33    PermissionDenied,
34    InvalidArguments,
35    ResourceNotFound,
36    Other,
37}
38
39/// Context information about where and why the error occurred
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct ErrorContext {
42    pub conversation_turn: usize,
43    pub user_input: Option<String>,
44    pub tool_name: Option<String>,
45    pub tool_args: Option<Value>,
46    pub api_request_size: Option<usize>,
47    pub context_size: Option<usize>,
48    pub stack_trace: Option<String>,
49}
50
51/// A recovery attempt that was made
52#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct RecoveryAttempt {
54    pub timestamp: u64,
55    pub strategy: RecoveryStrategy,
56    pub success: bool,
57    pub result: String,
58    pub new_context_size: Option<usize>,
59}
60
61/// Recovery strategy used to handle the error
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub enum RecoveryStrategy {
64    RetryWithBackoff {
65        delay_ms: u64,
66        attempt_number: usize,
67    },
68
69    SimplifyRequest {
70        removed_parameters: Vec<String>,
71    },
72    AlternativeTool {
73        original_tool: String,
74        alternative_tool: String,
75    },
76    ContextReset {
77        preserved_data: IndexMap<String, Value>,
78    },
79    ManualIntervention,
80}
81
82/// Error recovery manager
83pub struct ErrorRecoveryManager {
84    errors: Vec<ExecutionError>,
85    recovery_strategies: IndexMap<ErrorType, Vec<RecoveryStrategy>>,
86    operation_type_mapping: IndexMap<ErrorType, OperationType>,
87}
88
89impl Default for ErrorRecoveryManager {
90    fn default() -> Self {
91        Self::new()
92    }
93}
94
95impl ErrorRecoveryManager {
96    pub fn new() -> Self {
97        // Pre-allocate with known capacity
98        let mut recovery_strategies = IndexMap::with_capacity(2);
99        let mut operation_type_mapping = IndexMap::with_capacity(11);
100
101        // Define recovery strategies for different error types
102        recovery_strategies.insert(
103            ErrorType::ToolExecution,
104            vec![
105                RecoveryStrategy::RetryWithBackoff {
106                    delay_ms: 1000,
107                    attempt_number: 1,
108                },
109                RecoveryStrategy::AlternativeTool {
110                    original_tool: String::new(),
111                    alternative_tool: String::new(),
112                },
113            ],
114        );
115
116        recovery_strategies.insert(
117            ErrorType::ApiCall,
118            vec![
119                RecoveryStrategy::RetryWithBackoff {
120                    delay_ms: 2000,
121                    attempt_number: 1,
122                },
123                RecoveryStrategy::ContextReset {
124                    preserved_data: IndexMap::new(),
125                },
126            ],
127        );
128
129        // Map error types to operation types for timeout detector integration
130        operation_type_mapping.insert(ErrorType::ToolExecution, OperationType::ToolExecution);
131        operation_type_mapping.insert(ErrorType::ApiCall, OperationType::ApiCall);
132        operation_type_mapping.insert(ErrorType::Network, OperationType::NetworkRequest);
133        operation_type_mapping.insert(ErrorType::FileSystem, OperationType::FileOperation);
134        operation_type_mapping.insert(ErrorType::Validation, OperationType::Processing);
135        operation_type_mapping.insert(ErrorType::CircuitBreaker, OperationType::ToolExecution);
136        operation_type_mapping.insert(ErrorType::Timeout, OperationType::Processing);
137        operation_type_mapping.insert(ErrorType::PermissionDenied, OperationType::Processing);
138        operation_type_mapping.insert(ErrorType::InvalidArguments, OperationType::Processing);
139        operation_type_mapping.insert(ErrorType::ResourceNotFound, OperationType::FileOperation);
140        operation_type_mapping.insert(ErrorType::Other, OperationType::Processing);
141
142        Self {
143            errors: Vec::with_capacity(16), // Pre-allocate for typical session
144            recovery_strategies,
145            operation_type_mapping,
146        }
147    }
148
149    /// Record a new error
150    pub fn record_error(
151        &mut self,
152        error_type: ErrorType,
153        message: String,
154        context: ErrorContext,
155    ) -> String {
156        // Use a more efficient ID generation with minimal formatting
157        let error_count = self.errors.len();
158        let timestamp_short = current_timestamp() % 10000;
159        let error_id = format!("e{}_{}", error_count, timestamp_short);
160
161        let error = ExecutionError {
162            id: error_id.clone(),
163            timestamp: current_timestamp(),
164            error_type, // ErrorType is Copy now, no need to clone
165            message,
166            context,
167            recovery_attempts: Vec::with_capacity(2), // Most errors have 1-2 recovery attempts
168            resolved: false,
169        };
170
171        self.errors.push(error);
172        error_id
173    }
174
175    /// Record a recovery attempt
176    #[cold]
177    pub fn record_recovery_attempt(
178        &mut self,
179        error_id: &str,
180        strategy: RecoveryStrategy,
181        success: bool,
182        result: String,
183        new_context_size: Option<usize>,
184    ) {
185        let attempt = RecoveryAttempt {
186            timestamp: current_timestamp(),
187            strategy,
188            success,
189            result,
190            new_context_size,
191        };
192
193        if let Some(error) = self.errors.iter_mut().find(|e| e.id == error_id) {
194            error.recovery_attempts.push(attempt);
195            if success {
196                error.resolved = true;
197            }
198        }
199    }
200
201    /// Get recovery strategies for a specific error type
202    #[cold]
203    pub fn get_recovery_strategies(&self, error_type: &ErrorType) -> &[RecoveryStrategy] {
204        self.recovery_strategies
205            .get(error_type)
206            .map(|strategies| strategies.as_slice())
207            .unwrap_or(&[])
208    }
209
210    /// Generate a context preservation plan
211    pub fn generate_context_preservation_plan(
212        &self,
213        context_size: usize,
214        error_count: usize,
215    ) -> ContextPreservationPlan {
216        let critical_errors = error_count > 5;
217
218        let strategies = if critical_errors {
219            vec![
220                PreservationStrategy::SelectiveRetention {
221                    preserve_decisions: true,
222                    preserve_errors: true,
223                },
224                PreservationStrategy::ContextReset {
225                    preserve_session_data: true,
226                },
227            ]
228        } else {
229            vec![PreservationStrategy::NoAction]
230        };
231
232        ContextPreservationPlan {
233            current_context_size: context_size,
234            error_count,
235            recommended_strategies: strategies,
236            urgency: if critical_errors {
237                Urgency::Critical
238            } else {
239                Urgency::Low
240            },
241        }
242    }
243
244    /// Get error statistics
245    pub fn get_error_statistics(&self) -> ErrorStatistics {
246        let total_errors = self.errors.len();
247        if total_errors == 0 {
248            return ErrorStatistics {
249                total_errors: 0,
250                resolved_errors: 0,
251                unresolved_errors: 0,
252                errors_by_type: IndexMap::new(),
253                avg_recovery_attempts: 0.0,
254                recent_errors: Vec::new(),
255            };
256        }
257
258        let resolved_errors = self.errors.iter().filter(|e| e.resolved).count();
259        let unresolved_errors = total_errors - resolved_errors;
260
261        // Use a more efficient approach for counting by type
262        let mut errors_by_type = IndexMap::new();
263        let mut total_attempts = 0usize;
264
265        for error in &self.errors {
266            *errors_by_type.entry(error.error_type).or_insert(0) += 1;
267            total_attempts += error.recovery_attempts.len();
268        }
269
270        let avg_recovery_attempts = total_attempts as f64 / total_errors as f64;
271
272        // Get recent errors more efficiently
273        let recent_count = total_errors.min(5);
274        let recent_errors: Vec<_> = self
275            .errors
276            .iter()
277            .rev()
278            .take(recent_count)
279            .cloned()
280            .collect();
281
282        ErrorStatistics {
283            total_errors,
284            resolved_errors,
285            unresolved_errors,
286            errors_by_type,
287            avg_recovery_attempts,
288            recent_errors,
289        }
290    }
291
292    /// Check if a specific error pattern is recurring
293    pub fn detect_error_pattern(&self, error_type: &ErrorType, time_window_seconds: u64) -> bool {
294        let now = current_timestamp();
295
296        let recent_errors = self
297            .errors
298            .iter()
299            .filter(|e| e.error_type == *error_type && (now - e.timestamp) < time_window_seconds)
300            .count();
301
302        recent_errors >= 3 // Consider it a pattern if 3+ similar errors in time window
303    }
304
305    /// Get the corresponding operation type for an error type
306    pub fn get_operation_type(&self, error_type: &ErrorType) -> OperationType {
307        self.operation_type_mapping
308            .get(error_type)
309            .cloned()
310            .unwrap_or(OperationType::Processing)
311    }
312
313    /// Execute an operation with intelligent timeout detection and recovery
314    pub async fn execute_with_recovery<F, Fut, T>(
315        &mut self,
316        operation_id: String,
317        error_type: ErrorType,
318        _context: ErrorContext,
319        operation: F,
320    ) -> Result<T, anyhow::Error>
321    where
322        F: Fn() -> Fut,
323        Fut: Future<Output = Result<T, anyhow::Error>>,
324    {
325        let operation_type = self.get_operation_type(&error_type);
326
327        TIMEOUT_DETECTOR
328            .execute_with_timeout_retry(operation_id, operation_type, operation)
329            .await
330    }
331
332    /// Check if an operation should be retried based on error analysis
333    /// Cold path: only called after an error has occurred.
334    #[cold]
335    pub async fn should_retry_operation(
336        &self,
337        error_type: &ErrorType,
338        error: &anyhow::Error,
339        attempt: u32,
340    ) -> bool {
341        let operation_type = self.get_operation_type(error_type);
342        TIMEOUT_DETECTOR
343            .should_retry(&operation_type, error, attempt)
344            .await
345    }
346
347    /// Get timeout statistics for monitoring and optimization
348    pub async fn get_timeout_stats(&self) -> crate::core::timeout_detector::TimeoutStats {
349        TIMEOUT_DETECTOR.get_stats().await
350    }
351
352    /// Configure timeout settings for a specific error type
353    /// Cold path: only called during proactive error recovery.
354    #[cold]
355    pub async fn configure_timeout_for_error_type(
356        &self,
357        error_type: ErrorType,
358        config: crate::core::timeout_detector::TimeoutConfig,
359    ) {
360        let operation_type = self.get_operation_type(&error_type);
361        TIMEOUT_DETECTOR.set_config(operation_type, config).await;
362    }
363
364    /// Generate enhanced recovery plan based on timeout detector insights
365    /// Cold path: only called during actual error recovery.
366    #[cold]
367    pub async fn generate_enhanced_recovery_plan(
368        &self,
369        context_size: usize,
370        error_count: usize,
371    ) -> EnhancedContextPreservationPlan {
372        let timeout_stats = self.get_timeout_stats().await;
373        let base_plan = self.generate_context_preservation_plan(context_size, error_count);
374
375        // Enhance the plan based on timeout detector insights
376        let timeout_rate = if timeout_stats.total_operations > 0 {
377            timeout_stats.timed_out_operations as f64 / timeout_stats.total_operations as f64
378        } else {
379            0.0
380        };
381
382        let retry_success_rate = if timeout_stats.total_retry_attempts > 0 {
383            timeout_stats.successful_retries as f64 / timeout_stats.total_retry_attempts as f64
384        } else {
385            1.0
386        };
387
388        // Adjust urgency based on timeout patterns
389        let _adjusted_urgency = if timeout_rate > 0.3 {
390            // High timeout rate indicates systemic issues
391            Urgency::Critical
392        } else if retry_success_rate < 0.5 {
393            // Low retry success rate indicates recovery issues
394            Urgency::High
395        } else {
396            base_plan.urgency
397        };
398
399        EnhancedContextPreservationPlan {
400            base_plan,
401            timeout_rate,
402            retry_success_rate,
403            timeout_stats,
404        }
405    }
406
407    /// Get the number of errors (for context preservation plan)
408    pub fn error_count(&self) -> usize {
409        self.errors.len()
410    }
411}
412
413/// Plan for preserving context during error recovery
414#[derive(Debug, Clone, Serialize, Deserialize)]
415pub struct ContextPreservationPlan {
416    pub current_context_size: usize,
417    pub error_count: usize,
418    pub recommended_strategies: Vec<PreservationStrategy>,
419    pub urgency: Urgency,
420}
421
422/// Strategy for preserving context
423#[derive(Debug, Clone, Serialize, Deserialize)]
424pub enum PreservationStrategy {
425    SelectiveRetention {
426        preserve_decisions: bool,
427        preserve_errors: bool,
428    },
429    ContextReset {
430        preserve_session_data: bool,
431    },
432    NoAction,
433}
434
435/// Urgency level for context preservation
436#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
437pub enum Urgency {
438    Low,
439    High,
440    Critical,
441}
442
443/// Statistics about errors in the session
444#[derive(Debug, Clone, Serialize, Deserialize)]
445pub struct ErrorStatistics {
446    pub total_errors: usize,
447    pub resolved_errors: usize,
448    pub unresolved_errors: usize,
449    pub errors_by_type: IndexMap<ErrorType, usize>,
450    pub avg_recovery_attempts: f64,
451    pub recent_errors: Vec<ExecutionError>,
452}
453
454/// Enhanced context preservation plan with timeout detector insights
455#[derive(Debug, Clone)]
456pub struct EnhancedContextPreservationPlan {
457    pub base_plan: ContextPreservationPlan,
458    pub timeout_rate: f64,
459    pub retry_success_rate: f64,
460    pub timeout_stats: crate::core::timeout_detector::TimeoutStats,
461}