// vtcode_core/core/error_recovery.rs

use std::time::{SystemTime, UNIX_EPOCH};

use anyhow::Result;
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use serde_json::Value;

use crate::core::timeout_detector::{OperationType, TIMEOUT_DETECTOR};
7
/// Represents an error that occurred during execution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExecutionError {
    /// Unique id, formatted as `error_<unix_secs>_<index>` by the manager.
    pub id: String,
    /// Unix timestamp (whole seconds) when the error was recorded.
    pub timestamp: u64,
    /// Broad category of the failure.
    pub error_type: ErrorType,
    /// Human-readable error message.
    pub message: String,
    /// Where and why the error occurred.
    pub context: ErrorContext,
    /// Recovery attempts made so far, in chronological order.
    pub recovery_attempts: Vec<RecoveryAttempt>,
    /// True once any recovery attempt for this error succeeded.
    pub resolved: bool,
}
19
/// Type of error that can occur.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, Hash, PartialEq)]
pub enum ErrorType {
    /// A tool invocation failed.
    ToolExecution,
    /// An LLM/provider API call failed.
    ApiCall,
    /// Context compression itself failed.
    ContextCompression,
    /// File-system operation failed.
    FileSystem,
    /// Network-level failure.
    Network,
    /// Input or state validation failed.
    Validation,
    /// Anything that does not fit the categories above.
    Unknown,
}
31
/// Context information about where and why the error occurred.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorContext {
    /// Conversation turn index at the time of the error.
    pub conversation_turn: usize,
    /// The user input being processed, if any.
    pub user_input: Option<String>,
    /// Name of the tool involved, for tool-related errors.
    pub tool_name: Option<String>,
    /// Arguments passed to the tool, for tool-related errors.
    pub tool_args: Option<Value>,
    /// Size of the API request, if applicable (units set by the caller).
    pub api_request_size: Option<usize>,
    /// Context size at the time of the error (tokens).
    pub context_size: Option<usize>,
    /// Captured stack trace, if one was available.
    pub stack_trace: Option<String>,
}
43
/// A recovery attempt that was made.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryAttempt {
    /// Unix timestamp (whole seconds) when the attempt was made.
    pub timestamp: u64,
    /// Strategy that was applied.
    pub strategy: RecoveryStrategy,
    /// Whether the attempt resolved the error.
    pub success: bool,
    /// Free-form description of the outcome.
    pub result: String,
    /// Context size after the attempt, if it changed the context.
    pub new_context_size: Option<usize>,
}
53
/// Recovery strategy used to handle the error.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryStrategy {
    /// Retry the failed operation after a delay.
    RetryWithBackoff {
        /// Delay before the retry, in milliseconds.
        delay_ms: u64,
        /// 1-based retry counter.
        attempt_number: usize,
    },
    /// Shrink the conversation context to the given fraction of its size.
    ContextCompression {
        /// Target size as a fraction of the current size (0.0..=1.0).
        compression_ratio: f64,
    },
    /// Retry the request with some parameters removed.
    SimplifyRequest {
        /// Names of the parameters that were dropped.
        removed_parameters: Vec<String>,
    },
    /// Retry using a different tool with equivalent capability.
    AlternativeTool {
        /// Tool that originally failed.
        original_tool: String,
        /// Tool to try instead.
        alternative_tool: String,
    },
    /// Discard the context entirely, keeping only `preserved_data`.
    ContextReset {
        /// Key/value data carried over into the fresh context.
        preserved_data: IndexMap<String, Value>,
    },
    /// No automatic recovery possible; a human must intervene.
    ManualIntervention,
}
76
/// Error recovery manager: records errors, tracks recovery attempts, and
/// suggests recovery/preservation strategies per error type.
pub struct ErrorRecoveryManager {
    /// All errors recorded this session, in insertion order.
    errors: Vec<ExecutionError>,
    /// Ordered list of strategies to try for each error type.
    recovery_strategies: IndexMap<ErrorType, Vec<RecoveryStrategy>>,
    /// Context size (tokens) above which compression is recommended.
    context_compression_threshold: usize,
    /// Maps error types to timeout-detector operation types.
    operation_type_mapping: IndexMap<ErrorType, OperationType>,
}
84
85impl ErrorRecoveryManager {
86    pub fn new() -> Self {
87        let mut recovery_strategies = IndexMap::new();
88        let mut operation_type_mapping = IndexMap::new();
89
90        // Define recovery strategies for different error types
91        recovery_strategies.insert(
92            ErrorType::ToolExecution,
93            vec![
94                RecoveryStrategy::RetryWithBackoff {
95                    delay_ms: 1000,
96                    attempt_number: 1,
97                },
98                RecoveryStrategy::AlternativeTool {
99                    original_tool: "".to_string(),
100                    alternative_tool: "".to_string(),
101                },
102                RecoveryStrategy::ContextCompression {
103                    compression_ratio: 0.5,
104                },
105            ],
106        );
107
108        recovery_strategies.insert(
109            ErrorType::ApiCall,
110            vec![
111                RecoveryStrategy::RetryWithBackoff {
112                    delay_ms: 2000,
113                    attempt_number: 1,
114                },
115                RecoveryStrategy::ContextCompression {
116                    compression_ratio: 0.7,
117                },
118                RecoveryStrategy::ContextReset {
119                    preserved_data: IndexMap::new(),
120                },
121            ],
122        );
123
124        recovery_strategies.insert(
125            ErrorType::ContextCompression,
126            vec![RecoveryStrategy::ContextReset {
127                preserved_data: IndexMap::new(),
128            }],
129        );
130
131        // Map error types to operation types for timeout detector integration
132        operation_type_mapping.insert(ErrorType::ToolExecution, OperationType::ToolExecution);
133        operation_type_mapping.insert(ErrorType::ApiCall, OperationType::ApiCall);
134        operation_type_mapping.insert(ErrorType::Network, OperationType::NetworkRequest);
135        operation_type_mapping.insert(ErrorType::FileSystem, OperationType::FileOperation);
136        operation_type_mapping.insert(ErrorType::Validation, OperationType::Processing);
137        operation_type_mapping.insert(ErrorType::Unknown, OperationType::Processing);
138
139        Self {
140            errors: Vec::new(),
141            recovery_strategies,
142            context_compression_threshold: 50000, // tokens
143            operation_type_mapping,
144        }
145    }
146
147    /// Record a new error
148    pub fn record_error(
149        &mut self,
150        error_type: ErrorType,
151        message: String,
152        context: ErrorContext,
153    ) -> String {
154        let error_id = format!(
155            "error_{}_{}",
156            SystemTime::now()
157                .duration_since(UNIX_EPOCH)
158                .unwrap()
159                .as_secs(),
160            self.errors.len()
161        );
162
163        let error = ExecutionError {
164            id: error_id.clone(),
165            timestamp: SystemTime::now()
166                .duration_since(UNIX_EPOCH)
167                .unwrap()
168                .as_secs(),
169            error_type: error_type.clone(),
170            message,
171            context,
172            recovery_attempts: Vec::new(),
173            resolved: false,
174        };
175
176        self.errors.push(error);
177        error_id
178    }
179
180    /// Record a recovery attempt
181    pub fn record_recovery_attempt(
182        &mut self,
183        error_id: &str,
184        strategy: RecoveryStrategy,
185        success: bool,
186        result: String,
187        new_context_size: Option<usize>,
188    ) {
189        let attempt = RecoveryAttempt {
190            timestamp: SystemTime::now()
191                .duration_since(UNIX_EPOCH)
192                .unwrap()
193                .as_secs(),
194            strategy,
195            success,
196            result,
197            new_context_size,
198        };
199
200        if let Some(error) = self.errors.iter_mut().find(|e| e.id == error_id) {
201            error.recovery_attempts.push(attempt);
202            if success {
203                error.resolved = true;
204            }
205        }
206    }
207
208    /// Get recovery strategies for a specific error type
209    pub fn get_recovery_strategies(&self, error_type: &ErrorType) -> &[RecoveryStrategy] {
210        self.recovery_strategies
211            .get(error_type)
212            .map(|strategies| strategies.as_slice())
213            .unwrap_or(&[])
214    }
215
216    /// Determine if context compression is needed based on current context size
217    pub fn should_compress_context(&self, context_size: usize) -> bool {
218        context_size > self.context_compression_threshold
219    }
220
221    /// Generate a context preservation plan
222    pub fn generate_context_preservation_plan(
223        &self,
224        context_size: usize,
225        error_count: usize,
226    ) -> ContextPreservationPlan {
227        let compression_needed = context_size > self.context_compression_threshold;
228        let critical_errors = error_count > 5;
229
230        let strategies = if critical_errors {
231            vec![
232                PreservationStrategy::ImmediateCompression { target_ratio: 0.5 },
233                PreservationStrategy::SelectiveRetention {
234                    preserve_decisions: true,
235                    preserve_errors: true,
236                },
237                PreservationStrategy::ContextReset {
238                    preserve_session_data: true,
239                },
240            ]
241        } else if compression_needed {
242            vec![
243                PreservationStrategy::GradualCompression { target_ratio: 0.7 },
244                PreservationStrategy::SelectiveRetention {
245                    preserve_decisions: true,
246                    preserve_errors: false,
247                },
248            ]
249        } else {
250            vec![PreservationStrategy::NoAction]
251        };
252
253        ContextPreservationPlan {
254            current_context_size: context_size,
255            error_count,
256            recommended_strategies: strategies,
257            urgency: if critical_errors {
258                Urgency::Critical
259            } else if compression_needed {
260                Urgency::High
261            } else {
262                Urgency::Low
263            },
264        }
265    }
266
267    /// Get error statistics
268    pub fn get_error_statistics(&self) -> ErrorStatistics {
269        let total_errors = self.errors.len();
270        let resolved_errors = self.errors.iter().filter(|e| e.resolved).count();
271        let unresolved_errors = total_errors - resolved_errors;
272
273        let errors_by_type = self.errors.iter().fold(IndexMap::new(), |mut acc, error| {
274            *acc.entry(error.error_type.clone()).or_insert(0) += 1;
275            acc
276        });
277
278        let avg_recovery_attempts = if total_errors > 0 {
279            self.errors
280                .iter()
281                .map(|e| e.recovery_attempts.len())
282                .sum::<usize>() as f64
283                / total_errors as f64
284        } else {
285            0.0
286        };
287
288        ErrorStatistics {
289            total_errors,
290            resolved_errors,
291            unresolved_errors,
292            errors_by_type,
293            avg_recovery_attempts,
294            recent_errors: self.errors.iter().rev().take(5).cloned().collect(),
295        }
296    }
297
298    /// Check if a specific error pattern is recurring
299    pub fn detect_error_pattern(&self, error_type: &ErrorType, time_window_seconds: u64) -> bool {
300        let now = SystemTime::now()
301            .duration_since(UNIX_EPOCH)
302            .unwrap()
303            .as_secs();
304
305        let recent_errors = self
306            .errors
307            .iter()
308            .filter(|e| e.error_type == *error_type && (now - e.timestamp) < time_window_seconds)
309            .count();
310
311        recent_errors >= 3 // Consider it a pattern if 3+ similar errors in time window
312    }
313
314    /// Get the corresponding operation type for an error type
315    pub fn get_operation_type(&self, error_type: &ErrorType) -> OperationType {
316        self.operation_type_mapping
317            .get(error_type)
318            .cloned()
319            .unwrap_or(OperationType::Processing)
320    }
321
322    /// Execute an operation with intelligent timeout detection and recovery
323    pub async fn execute_with_recovery<F, Fut, T>(
324        &mut self,
325        operation_id: String,
326        error_type: ErrorType,
327        _context: ErrorContext,
328        operation: F,
329    ) -> Result<T, anyhow::Error>
330    where
331        F: Fn() -> Fut,
332        Fut: std::future::Future<Output = Result<T, anyhow::Error>>,
333    {
334        let operation_type = self.get_operation_type(&error_type);
335
336        TIMEOUT_DETECTOR
337            .execute_with_timeout_retry(operation_id, operation_type, operation)
338            .await
339    }
340
341    /// Check if an operation should be retried based on error analysis
342    pub async fn should_retry_operation(
343        &self,
344        error_type: &ErrorType,
345        error: &anyhow::Error,
346        attempt: u32,
347    ) -> bool {
348        let operation_type = self.get_operation_type(error_type);
349        TIMEOUT_DETECTOR
350            .should_retry(&operation_type, error, attempt)
351            .await
352    }
353
354    /// Get timeout statistics for monitoring and optimization
355    pub async fn get_timeout_stats(&self) -> crate::core::timeout_detector::TimeoutStats {
356        TIMEOUT_DETECTOR.get_stats().await
357    }
358
359    /// Configure timeout settings for a specific error type
360    pub async fn configure_timeout_for_error_type(
361        &self,
362        error_type: ErrorType,
363        config: crate::core::timeout_detector::TimeoutConfig,
364    ) {
365        let operation_type = self.get_operation_type(&error_type);
366        TIMEOUT_DETECTOR.set_config(operation_type, config).await;
367    }
368
369    /// Generate enhanced recovery plan based on timeout detector insights
370    pub async fn generate_enhanced_recovery_plan(
371        &self,
372        context_size: usize,
373        error_count: usize,
374    ) -> EnhancedContextPreservationPlan {
375        let timeout_stats = self.get_timeout_stats().await;
376        let base_plan = self.generate_context_preservation_plan(context_size, error_count);
377
378        // Enhance the plan based on timeout detector insights
379        let timeout_rate = if timeout_stats.total_operations > 0 {
380            timeout_stats.timed_out_operations as f64 / timeout_stats.total_operations as f64
381        } else {
382            0.0
383        };
384
385        let retry_success_rate = if timeout_stats.total_retry_attempts > 0 {
386            timeout_stats.successful_retries as f64 / timeout_stats.total_retry_attempts as f64
387        } else {
388            1.0
389        };
390
391        // Adjust urgency based on timeout patterns
392        let _adjusted_urgency = if timeout_rate > 0.3 {
393            // High timeout rate indicates systemic issues
394            Urgency::Critical
395        } else if retry_success_rate < 0.5 {
396            // Low retry success rate indicates recovery issues
397            Urgency::High
398        } else {
399            base_plan.urgency.clone()
400        };
401
402        EnhancedContextPreservationPlan {
403            base_plan,
404            timeout_rate,
405            retry_success_rate,
406            timeout_stats,
407        }
408    }
409
410    /// Get the number of errors (for context preservation plan)
411    pub fn error_count(&self) -> usize {
412        self.errors.len()
413    }
414}
415
/// Plan for preserving context during error recovery.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContextPreservationPlan {
    /// Context size (tokens) when the plan was generated.
    pub current_context_size: usize,
    /// Session error count when the plan was generated.
    pub error_count: usize,
    /// Strategies to apply, in recommended order.
    pub recommended_strategies: Vec<PreservationStrategy>,
    /// How urgently the plan should be executed.
    pub urgency: Urgency,
}
424
/// Strategy for preserving context.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PreservationStrategy {
    /// Compress the context right away to the target fraction of its size.
    ImmediateCompression {
        target_ratio: f64,
    },
    /// Compress the context incrementally toward the target fraction.
    GradualCompression {
        target_ratio: f64,
    },
    /// Keep only selected categories of content.
    SelectiveRetention {
        /// Keep recorded decisions.
        preserve_decisions: bool,
        /// Keep recorded errors.
        preserve_errors: bool,
    },
    /// Discard the context, optionally keeping session-level data.
    ContextReset {
        preserve_session_data: bool,
    },
    /// Context is healthy; nothing to do.
    NoAction,
}
443
444/// Urgency level for context preservation
445#[derive(Debug, Clone, Serialize, Deserialize)]
446pub enum Urgency {
447    Low,
448    High,
449    Critical,
450}
451
/// Statistics about errors in the session.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorStatistics {
    /// Total errors recorded this session.
    pub total_errors: usize,
    /// Errors with at least one successful recovery attempt.
    pub resolved_errors: usize,
    /// Errors not yet resolved (`total_errors - resolved_errors`).
    pub unresolved_errors: usize,
    /// Per-type error counts.
    pub errors_by_type: IndexMap<ErrorType, usize>,
    /// Mean number of recovery attempts per error (0.0 with no errors).
    pub avg_recovery_attempts: f64,
    /// Up to the last five errors, most recent first.
    pub recent_errors: Vec<ExecutionError>,
}
462
/// Enhanced context preservation plan with timeout detector insights.
#[derive(Debug, Clone)]
pub struct EnhancedContextPreservationPlan {
    /// The plan derived from context size and error count alone.
    pub base_plan: ContextPreservationPlan,
    /// Fraction of operations that timed out (0.0 when none have run).
    pub timeout_rate: f64,
    /// Fraction of retries that succeeded (1.0 when no retries recorded).
    pub retry_success_rate: f64,
    /// Raw statistics snapshot from the timeout detector.
    pub timeout_stats: crate::core::timeout_detector::TimeoutStats,
}
471
impl Default for ErrorRecoveryManager {
    /// Equivalent to [`ErrorRecoveryManager::new`].
    fn default() -> Self {
        Self::new()
    }
}