1use crate::core::timeout_detector::{OperationType, TIMEOUT_DETECTOR};
2use anyhow::Result;
3use indexmap::IndexMap;
4use serde::{Deserialize, Serialize};
5use serde_json::Value;
6use std::time::{SystemTime, UNIX_EPOCH};
7
8#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct ExecutionError {
11 pub id: String,
12 pub timestamp: u64,
13 pub error_type: ErrorType,
14 pub message: String,
15 pub context: ErrorContext,
16 pub recovery_attempts: Vec<RecoveryAttempt>,
17 pub resolved: bool,
18}
19
20#[derive(Debug, Clone, Serialize, Deserialize, Eq, Hash, PartialEq)]
22pub enum ErrorType {
23 ToolExecution,
24 ApiCall,
25 ContextCompression,
26 FileSystem,
27 Network,
28 Validation,
29 Unknown,
30}
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct ErrorContext {
35 pub conversation_turn: usize,
36 pub user_input: Option<String>,
37 pub tool_name: Option<String>,
38 pub tool_args: Option<Value>,
39 pub api_request_size: Option<usize>,
40 pub context_size: Option<usize>,
41 pub stack_trace: Option<String>,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct RecoveryAttempt {
47 pub timestamp: u64,
48 pub strategy: RecoveryStrategy,
49 pub success: bool,
50 pub result: String,
51 pub new_context_size: Option<usize>,
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
56pub enum RecoveryStrategy {
57 RetryWithBackoff {
58 delay_ms: u64,
59 attempt_number: usize,
60 },
61 ContextCompression {
62 compression_ratio: f64,
63 },
64 SimplifyRequest {
65 removed_parameters: Vec<String>,
66 },
67 AlternativeTool {
68 original_tool: String,
69 alternative_tool: String,
70 },
71 ContextReset {
72 preserved_data: IndexMap<String, Value>,
73 },
74 ManualIntervention,
75}
76
77pub struct ErrorRecoveryManager {
79 errors: Vec<ExecutionError>,
80 recovery_strategies: IndexMap<ErrorType, Vec<RecoveryStrategy>>,
81 context_compression_threshold: usize,
82 operation_type_mapping: IndexMap<ErrorType, OperationType>,
83}
84
85impl ErrorRecoveryManager {
86 pub fn new() -> Self {
87 let mut recovery_strategies = IndexMap::new();
88 let mut operation_type_mapping = IndexMap::new();
89
90 recovery_strategies.insert(
92 ErrorType::ToolExecution,
93 vec![
94 RecoveryStrategy::RetryWithBackoff {
95 delay_ms: 1000,
96 attempt_number: 1,
97 },
98 RecoveryStrategy::AlternativeTool {
99 original_tool: "".to_string(),
100 alternative_tool: "".to_string(),
101 },
102 RecoveryStrategy::ContextCompression {
103 compression_ratio: 0.5,
104 },
105 ],
106 );
107
108 recovery_strategies.insert(
109 ErrorType::ApiCall,
110 vec![
111 RecoveryStrategy::RetryWithBackoff {
112 delay_ms: 2000,
113 attempt_number: 1,
114 },
115 RecoveryStrategy::ContextCompression {
116 compression_ratio: 0.7,
117 },
118 RecoveryStrategy::ContextReset {
119 preserved_data: IndexMap::new(),
120 },
121 ],
122 );
123
124 recovery_strategies.insert(
125 ErrorType::ContextCompression,
126 vec![RecoveryStrategy::ContextReset {
127 preserved_data: IndexMap::new(),
128 }],
129 );
130
131 operation_type_mapping.insert(ErrorType::ToolExecution, OperationType::ToolExecution);
133 operation_type_mapping.insert(ErrorType::ApiCall, OperationType::ApiCall);
134 operation_type_mapping.insert(ErrorType::Network, OperationType::NetworkRequest);
135 operation_type_mapping.insert(ErrorType::FileSystem, OperationType::FileOperation);
136 operation_type_mapping.insert(ErrorType::Validation, OperationType::Processing);
137 operation_type_mapping.insert(ErrorType::Unknown, OperationType::Processing);
138
139 Self {
140 errors: Vec::new(),
141 recovery_strategies,
142 context_compression_threshold: 50000, operation_type_mapping,
144 }
145 }
146
147 pub fn record_error(
149 &mut self,
150 error_type: ErrorType,
151 message: String,
152 context: ErrorContext,
153 ) -> String {
154 let error_id = format!(
155 "error_{}_{}",
156 SystemTime::now()
157 .duration_since(UNIX_EPOCH)
158 .unwrap()
159 .as_secs(),
160 self.errors.len()
161 );
162
163 let error = ExecutionError {
164 id: error_id.clone(),
165 timestamp: SystemTime::now()
166 .duration_since(UNIX_EPOCH)
167 .unwrap()
168 .as_secs(),
169 error_type: error_type.clone(),
170 message,
171 context,
172 recovery_attempts: Vec::new(),
173 resolved: false,
174 };
175
176 self.errors.push(error);
177 error_id
178 }
179
180 pub fn record_recovery_attempt(
182 &mut self,
183 error_id: &str,
184 strategy: RecoveryStrategy,
185 success: bool,
186 result: String,
187 new_context_size: Option<usize>,
188 ) {
189 let attempt = RecoveryAttempt {
190 timestamp: SystemTime::now()
191 .duration_since(UNIX_EPOCH)
192 .unwrap()
193 .as_secs(),
194 strategy,
195 success,
196 result,
197 new_context_size,
198 };
199
200 if let Some(error) = self.errors.iter_mut().find(|e| e.id == error_id) {
201 error.recovery_attempts.push(attempt);
202 if success {
203 error.resolved = true;
204 }
205 }
206 }
207
208 pub fn get_recovery_strategies(&self, error_type: &ErrorType) -> &[RecoveryStrategy] {
210 self.recovery_strategies
211 .get(error_type)
212 .map(|strategies| strategies.as_slice())
213 .unwrap_or(&[])
214 }
215
216 pub fn should_compress_context(&self, context_size: usize) -> bool {
218 context_size > self.context_compression_threshold
219 }
220
221 pub fn generate_context_preservation_plan(
223 &self,
224 context_size: usize,
225 error_count: usize,
226 ) -> ContextPreservationPlan {
227 let compression_needed = context_size > self.context_compression_threshold;
228 let critical_errors = error_count > 5;
229
230 let strategies = if critical_errors {
231 vec![
232 PreservationStrategy::ImmediateCompression { target_ratio: 0.5 },
233 PreservationStrategy::SelectiveRetention {
234 preserve_decisions: true,
235 preserve_errors: true,
236 },
237 PreservationStrategy::ContextReset {
238 preserve_session_data: true,
239 },
240 ]
241 } else if compression_needed {
242 vec![
243 PreservationStrategy::GradualCompression { target_ratio: 0.7 },
244 PreservationStrategy::SelectiveRetention {
245 preserve_decisions: true,
246 preserve_errors: false,
247 },
248 ]
249 } else {
250 vec![PreservationStrategy::NoAction]
251 };
252
253 ContextPreservationPlan {
254 current_context_size: context_size,
255 error_count,
256 recommended_strategies: strategies,
257 urgency: if critical_errors {
258 Urgency::Critical
259 } else if compression_needed {
260 Urgency::High
261 } else {
262 Urgency::Low
263 },
264 }
265 }
266
267 pub fn get_error_statistics(&self) -> ErrorStatistics {
269 let total_errors = self.errors.len();
270 let resolved_errors = self.errors.iter().filter(|e| e.resolved).count();
271 let unresolved_errors = total_errors - resolved_errors;
272
273 let errors_by_type = self.errors.iter().fold(IndexMap::new(), |mut acc, error| {
274 *acc.entry(error.error_type.clone()).or_insert(0) += 1;
275 acc
276 });
277
278 let avg_recovery_attempts = if total_errors > 0 {
279 self.errors
280 .iter()
281 .map(|e| e.recovery_attempts.len())
282 .sum::<usize>() as f64
283 / total_errors as f64
284 } else {
285 0.0
286 };
287
288 ErrorStatistics {
289 total_errors,
290 resolved_errors,
291 unresolved_errors,
292 errors_by_type,
293 avg_recovery_attempts,
294 recent_errors: self.errors.iter().rev().take(5).cloned().collect(),
295 }
296 }
297
298 pub fn detect_error_pattern(&self, error_type: &ErrorType, time_window_seconds: u64) -> bool {
300 let now = SystemTime::now()
301 .duration_since(UNIX_EPOCH)
302 .unwrap()
303 .as_secs();
304
305 let recent_errors = self
306 .errors
307 .iter()
308 .filter(|e| e.error_type == *error_type && (now - e.timestamp) < time_window_seconds)
309 .count();
310
311 recent_errors >= 3 }
313
314 pub fn get_operation_type(&self, error_type: &ErrorType) -> OperationType {
316 self.operation_type_mapping
317 .get(error_type)
318 .cloned()
319 .unwrap_or(OperationType::Processing)
320 }
321
322 pub async fn execute_with_recovery<F, Fut, T>(
324 &mut self,
325 operation_id: String,
326 error_type: ErrorType,
327 _context: ErrorContext,
328 operation: F,
329 ) -> Result<T, anyhow::Error>
330 where
331 F: Fn() -> Fut,
332 Fut: std::future::Future<Output = Result<T, anyhow::Error>>,
333 {
334 let operation_type = self.get_operation_type(&error_type);
335
336 TIMEOUT_DETECTOR
337 .execute_with_timeout_retry(operation_id, operation_type, operation)
338 .await
339 }
340
341 pub async fn should_retry_operation(
343 &self,
344 error_type: &ErrorType,
345 error: &anyhow::Error,
346 attempt: u32,
347 ) -> bool {
348 let operation_type = self.get_operation_type(error_type);
349 TIMEOUT_DETECTOR
350 .should_retry(&operation_type, error, attempt)
351 .await
352 }
353
354 pub async fn get_timeout_stats(&self) -> crate::core::timeout_detector::TimeoutStats {
356 TIMEOUT_DETECTOR.get_stats().await
357 }
358
359 pub async fn configure_timeout_for_error_type(
361 &self,
362 error_type: ErrorType,
363 config: crate::core::timeout_detector::TimeoutConfig,
364 ) {
365 let operation_type = self.get_operation_type(&error_type);
366 TIMEOUT_DETECTOR.set_config(operation_type, config).await;
367 }
368
369 pub async fn generate_enhanced_recovery_plan(
371 &self,
372 context_size: usize,
373 error_count: usize,
374 ) -> EnhancedContextPreservationPlan {
375 let timeout_stats = self.get_timeout_stats().await;
376 let base_plan = self.generate_context_preservation_plan(context_size, error_count);
377
378 let timeout_rate = if timeout_stats.total_operations > 0 {
380 timeout_stats.timed_out_operations as f64 / timeout_stats.total_operations as f64
381 } else {
382 0.0
383 };
384
385 let retry_success_rate = if timeout_stats.total_retry_attempts > 0 {
386 timeout_stats.successful_retries as f64 / timeout_stats.total_retry_attempts as f64
387 } else {
388 1.0
389 };
390
391 let _adjusted_urgency = if timeout_rate > 0.3 {
393 Urgency::Critical
395 } else if retry_success_rate < 0.5 {
396 Urgency::High
398 } else {
399 base_plan.urgency.clone()
400 };
401
402 EnhancedContextPreservationPlan {
403 base_plan,
404 timeout_rate,
405 retry_success_rate,
406 timeout_stats,
407 }
408 }
409
410 pub fn error_count(&self) -> usize {
412 self.errors.len()
413 }
414}
415
416#[derive(Debug, Clone, Serialize, Deserialize)]
418pub struct ContextPreservationPlan {
419 pub current_context_size: usize,
420 pub error_count: usize,
421 pub recommended_strategies: Vec<PreservationStrategy>,
422 pub urgency: Urgency,
423}
424
425#[derive(Debug, Clone, Serialize, Deserialize)]
427pub enum PreservationStrategy {
428 ImmediateCompression {
429 target_ratio: f64,
430 },
431 GradualCompression {
432 target_ratio: f64,
433 },
434 SelectiveRetention {
435 preserve_decisions: bool,
436 preserve_errors: bool,
437 },
438 ContextReset {
439 preserve_session_data: bool,
440 },
441 NoAction,
442}
443
444#[derive(Debug, Clone, Serialize, Deserialize)]
446pub enum Urgency {
447 Low,
448 High,
449 Critical,
450}
451
452#[derive(Debug, Clone, Serialize, Deserialize)]
454pub struct ErrorStatistics {
455 pub total_errors: usize,
456 pub resolved_errors: usize,
457 pub unresolved_errors: usize,
458 pub errors_by_type: IndexMap<ErrorType, usize>,
459 pub avg_recovery_attempts: f64,
460 pub recent_errors: Vec<ExecutionError>,
461}
462
463#[derive(Debug, Clone)]
465pub struct EnhancedContextPreservationPlan {
466 pub base_plan: ContextPreservationPlan,
467 pub timeout_rate: f64,
468 pub retry_success_rate: f64,
469 pub timeout_stats: crate::core::timeout_detector::TimeoutStats,
470}
471
472impl Default for ErrorRecoveryManager {
473 fn default() -> Self {
474 Self::new()
475 }
476}