1use crate::core::timeout_detector::{OperationType, TIMEOUT_DETECTOR};
2use crate::utils::current_timestamp;
3use anyhow::Result;
4use indexmap::IndexMap;
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7
8#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct ExecutionError {
11 pub id: String,
12 pub timestamp: u64,
13 pub error_type: ErrorType,
14 pub message: String,
15 pub context: ErrorContext,
16 pub recovery_attempts: Vec<RecoveryAttempt>,
17 pub resolved: bool,
18}
19
20#[derive(Debug, Clone, Copy, Serialize, Deserialize, Eq, Hash, PartialEq)]
25pub enum ErrorType {
26 ToolExecution,
27 ApiCall,
28 FileSystem,
29 Network,
30 Validation,
31 CircuitBreaker,
32 Timeout,
33 PermissionDenied,
34 InvalidArguments,
35 ResourceNotFound,
36 Other,
37}
38
39#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct ErrorContext {
42 pub conversation_turn: usize,
43 pub user_input: Option<String>,
44 pub tool_name: Option<String>,
45 pub tool_args: Option<Value>,
46 pub api_request_size: Option<usize>,
47 pub context_size: Option<usize>,
48 pub stack_trace: Option<String>,
49}
50
51#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct RecoveryAttempt {
54 pub timestamp: u64,
55 pub strategy: RecoveryStrategy,
56 pub success: bool,
57 pub result: String,
58 pub new_context_size: Option<usize>,
59}
60
61#[derive(Debug, Clone, Serialize, Deserialize)]
63pub enum RecoveryStrategy {
64 RetryWithBackoff {
65 delay_ms: u64,
66 attempt_number: usize,
67 },
68
69 SimplifyRequest {
70 removed_parameters: Vec<String>,
71 },
72 AlternativeTool {
73 original_tool: String,
74 alternative_tool: String,
75 },
76 ContextReset {
77 preserved_data: IndexMap<String, Value>,
78 },
79 ManualIntervention,
80}
81
82pub struct ErrorRecoveryManager {
84 errors: Vec<ExecutionError>,
85 recovery_strategies: IndexMap<ErrorType, Vec<RecoveryStrategy>>,
86 operation_type_mapping: IndexMap<ErrorType, OperationType>,
87}
88
89impl Default for ErrorRecoveryManager {
90 fn default() -> Self {
91 Self::new()
92 }
93}
94
95impl ErrorRecoveryManager {
96 pub fn new() -> Self {
97 let mut recovery_strategies = IndexMap::with_capacity(2);
99 let mut operation_type_mapping = IndexMap::with_capacity(11);
100
101 recovery_strategies.insert(
103 ErrorType::ToolExecution,
104 vec![
105 RecoveryStrategy::RetryWithBackoff {
106 delay_ms: 1000,
107 attempt_number: 1,
108 },
109 RecoveryStrategy::AlternativeTool {
110 original_tool: String::new(),
111 alternative_tool: String::new(),
112 },
113 ],
114 );
115
116 recovery_strategies.insert(
117 ErrorType::ApiCall,
118 vec![
119 RecoveryStrategy::RetryWithBackoff {
120 delay_ms: 2000,
121 attempt_number: 1,
122 },
123 RecoveryStrategy::ContextReset {
124 preserved_data: IndexMap::new(),
125 },
126 ],
127 );
128
129 operation_type_mapping.insert(ErrorType::ToolExecution, OperationType::ToolExecution);
131 operation_type_mapping.insert(ErrorType::ApiCall, OperationType::ApiCall);
132 operation_type_mapping.insert(ErrorType::Network, OperationType::NetworkRequest);
133 operation_type_mapping.insert(ErrorType::FileSystem, OperationType::FileOperation);
134 operation_type_mapping.insert(ErrorType::Validation, OperationType::Processing);
135 operation_type_mapping.insert(ErrorType::CircuitBreaker, OperationType::ToolExecution);
136 operation_type_mapping.insert(ErrorType::Timeout, OperationType::Processing);
137 operation_type_mapping.insert(ErrorType::PermissionDenied, OperationType::Processing);
138 operation_type_mapping.insert(ErrorType::InvalidArguments, OperationType::Processing);
139 operation_type_mapping.insert(ErrorType::ResourceNotFound, OperationType::FileOperation);
140 operation_type_mapping.insert(ErrorType::Other, OperationType::Processing);
141
142 Self {
143 errors: Vec::with_capacity(16), recovery_strategies,
145 operation_type_mapping,
146 }
147 }
148
149 pub fn record_error(
151 &mut self,
152 error_type: ErrorType,
153 message: String,
154 context: ErrorContext,
155 ) -> String {
156 let error_count = self.errors.len();
158 let timestamp_short = current_timestamp() % 10000;
159 let error_id = format!("e{}_{}", error_count, timestamp_short);
160
161 let error = ExecutionError {
162 id: error_id.clone(),
163 timestamp: current_timestamp(),
164 error_type, message,
166 context,
167 recovery_attempts: Vec::with_capacity(2), resolved: false,
169 };
170
171 self.errors.push(error);
172 error_id
173 }
174
175 #[cold]
177 pub fn record_recovery_attempt(
178 &mut self,
179 error_id: &str,
180 strategy: RecoveryStrategy,
181 success: bool,
182 result: String,
183 new_context_size: Option<usize>,
184 ) {
185 let attempt = RecoveryAttempt {
186 timestamp: current_timestamp(),
187 strategy,
188 success,
189 result,
190 new_context_size,
191 };
192
193 if let Some(error) = self.errors.iter_mut().find(|e| e.id == error_id) {
194 error.recovery_attempts.push(attempt);
195 if success {
196 error.resolved = true;
197 }
198 }
199 }
200
201 #[cold]
203 pub fn get_recovery_strategies(&self, error_type: &ErrorType) -> &[RecoveryStrategy] {
204 self.recovery_strategies
205 .get(error_type)
206 .map(|strategies| strategies.as_slice())
207 .unwrap_or(&[])
208 }
209
210 pub fn generate_context_preservation_plan(
212 &self,
213 context_size: usize,
214 error_count: usize,
215 ) -> ContextPreservationPlan {
216 let critical_errors = error_count > 5;
217
218 let strategies = if critical_errors {
219 vec![
220 PreservationStrategy::SelectiveRetention {
221 preserve_decisions: true,
222 preserve_errors: true,
223 },
224 PreservationStrategy::ContextReset {
225 preserve_session_data: true,
226 },
227 ]
228 } else {
229 vec![PreservationStrategy::NoAction]
230 };
231
232 ContextPreservationPlan {
233 current_context_size: context_size,
234 error_count,
235 recommended_strategies: strategies,
236 urgency: if critical_errors {
237 Urgency::Critical
238 } else {
239 Urgency::Low
240 },
241 }
242 }
243
244 pub fn get_error_statistics(&self) -> ErrorStatistics {
246 let total_errors = self.errors.len();
247 if total_errors == 0 {
248 return ErrorStatistics {
249 total_errors: 0,
250 resolved_errors: 0,
251 unresolved_errors: 0,
252 errors_by_type: IndexMap::new(),
253 avg_recovery_attempts: 0.0,
254 recent_errors: Vec::new(),
255 };
256 }
257
258 let resolved_errors = self.errors.iter().filter(|e| e.resolved).count();
259 let unresolved_errors = total_errors - resolved_errors;
260
261 let mut errors_by_type = IndexMap::new();
263 let mut total_attempts = 0usize;
264
265 for error in &self.errors {
266 *errors_by_type.entry(error.error_type).or_insert(0) += 1;
267 total_attempts += error.recovery_attempts.len();
268 }
269
270 let avg_recovery_attempts = total_attempts as f64 / total_errors as f64;
271
272 let recent_count = total_errors.min(5);
274 let recent_errors: Vec<_> = self
275 .errors
276 .iter()
277 .rev()
278 .take(recent_count)
279 .cloned()
280 .collect();
281
282 ErrorStatistics {
283 total_errors,
284 resolved_errors,
285 unresolved_errors,
286 errors_by_type,
287 avg_recovery_attempts,
288 recent_errors,
289 }
290 }
291
292 pub fn detect_error_pattern(&self, error_type: &ErrorType, time_window_seconds: u64) -> bool {
294 let now = current_timestamp();
295
296 let recent_errors = self
297 .errors
298 .iter()
299 .filter(|e| e.error_type == *error_type && (now - e.timestamp) < time_window_seconds)
300 .count();
301
302 recent_errors >= 3 }
304
305 pub fn get_operation_type(&self, error_type: &ErrorType) -> OperationType {
307 self.operation_type_mapping
308 .get(error_type)
309 .cloned()
310 .unwrap_or(OperationType::Processing)
311 }
312
313 pub async fn execute_with_recovery<F, Fut, T>(
315 &mut self,
316 operation_id: String,
317 error_type: ErrorType,
318 _context: ErrorContext,
319 operation: F,
320 ) -> Result<T, anyhow::Error>
321 where
322 F: Fn() -> Fut,
323 Fut: Future<Output = Result<T, anyhow::Error>>,
324 {
325 let operation_type = self.get_operation_type(&error_type);
326
327 TIMEOUT_DETECTOR
328 .execute_with_timeout_retry(operation_id, operation_type, operation)
329 .await
330 }
331
332 #[cold]
335 pub async fn should_retry_operation(
336 &self,
337 error_type: &ErrorType,
338 error: &anyhow::Error,
339 attempt: u32,
340 ) -> bool {
341 let operation_type = self.get_operation_type(error_type);
342 TIMEOUT_DETECTOR
343 .should_retry(&operation_type, error, attempt)
344 .await
345 }
346
347 pub async fn get_timeout_stats(&self) -> crate::core::timeout_detector::TimeoutStats {
349 TIMEOUT_DETECTOR.get_stats().await
350 }
351
352 #[cold]
355 pub async fn configure_timeout_for_error_type(
356 &self,
357 error_type: ErrorType,
358 config: crate::core::timeout_detector::TimeoutConfig,
359 ) {
360 let operation_type = self.get_operation_type(&error_type);
361 TIMEOUT_DETECTOR.set_config(operation_type, config).await;
362 }
363
364 #[cold]
367 pub async fn generate_enhanced_recovery_plan(
368 &self,
369 context_size: usize,
370 error_count: usize,
371 ) -> EnhancedContextPreservationPlan {
372 let timeout_stats = self.get_timeout_stats().await;
373 let base_plan = self.generate_context_preservation_plan(context_size, error_count);
374
375 let timeout_rate = if timeout_stats.total_operations > 0 {
377 timeout_stats.timed_out_operations as f64 / timeout_stats.total_operations as f64
378 } else {
379 0.0
380 };
381
382 let retry_success_rate = if timeout_stats.total_retry_attempts > 0 {
383 timeout_stats.successful_retries as f64 / timeout_stats.total_retry_attempts as f64
384 } else {
385 1.0
386 };
387
388 let _adjusted_urgency = if timeout_rate > 0.3 {
390 Urgency::Critical
392 } else if retry_success_rate < 0.5 {
393 Urgency::High
395 } else {
396 base_plan.urgency
397 };
398
399 EnhancedContextPreservationPlan {
400 base_plan,
401 timeout_rate,
402 retry_success_rate,
403 timeout_stats,
404 }
405 }
406
407 pub fn error_count(&self) -> usize {
409 self.errors.len()
410 }
411}
412
413#[derive(Debug, Clone, Serialize, Deserialize)]
415pub struct ContextPreservationPlan {
416 pub current_context_size: usize,
417 pub error_count: usize,
418 pub recommended_strategies: Vec<PreservationStrategy>,
419 pub urgency: Urgency,
420}
421
422#[derive(Debug, Clone, Serialize, Deserialize)]
424pub enum PreservationStrategy {
425 SelectiveRetention {
426 preserve_decisions: bool,
427 preserve_errors: bool,
428 },
429 ContextReset {
430 preserve_session_data: bool,
431 },
432 NoAction,
433}
434
435#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
437pub enum Urgency {
438 Low,
439 High,
440 Critical,
441}
442
443#[derive(Debug, Clone, Serialize, Deserialize)]
445pub struct ErrorStatistics {
446 pub total_errors: usize,
447 pub resolved_errors: usize,
448 pub unresolved_errors: usize,
449 pub errors_by_type: IndexMap<ErrorType, usize>,
450 pub avg_recovery_attempts: f64,
451 pub recent_errors: Vec<ExecutionError>,
452}
453
454#[derive(Debug, Clone)]
456pub struct EnhancedContextPreservationPlan {
457 pub base_plan: ContextPreservationPlan,
458 pub timeout_rate: f64,
459 pub retry_success_rate: f64,
460 pub timeout_stats: crate::core::timeout_detector::TimeoutStats,
461}