1use anyhow::Result;
4use serde::{Deserialize, Serialize};
5use std::collections::{HashMap, VecDeque};
6use std::time::{Duration, Instant};
7use uuid::Uuid;
8
/// Central coordinator for error handling: routes recorded errors through
/// per-type recovery strategies, guarded by a circuit breaker, with rolling
/// health metrics and a failsafe/emergency layer on top.
#[derive(Debug)]
pub struct ErrorRecoverySystem {
    // Tunables: retry counts, delays, history limits, breaker threshold.
    config: ErrorRecoveryConfig,
    // Ordered strategy chains, keyed by the kind of error they recover from.
    recovery_strategies: HashMap<ErrorType, Vec<RecoveryStrategy>>,
    // Recent errors, oldest first; trimmed to `config.error_history_limit`.
    error_history: VecDeque<ErrorEvent>,
    // Recovery attempts, oldest first.
    recovery_history: VecDeque<RecoveryEvent>,
    // Suspends recovery attempts after repeated failures.
    circuit_breaker: CircuitBreaker,
    // Aggregated health status and metrics, refreshed by `check_system_health`.
    health_monitor: SystemHealthMonitor,
    // Safe-mode flag, emergency protocols, and backup bookkeeping.
    failsafe_manager: FailsafeManager,
}
20
/// Tunables for the recovery system; see the `Default` impl for baselines.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorRecoveryConfig {
    // Master switch for automatic recovery.
    pub enabled: bool,
    // Upper bound on recovery attempts per error; also caps how many
    // strategies of a chain are tried (see `attempt_recovery`).
    pub max_retry_attempts: usize,
    // Delay between consecutive recovery attempts, in milliseconds.
    pub retry_delay_ms: u64,
    // Consecutive failures before the circuit breaker opens.
    pub circuit_breaker_threshold: usize,
    // Interval between periodic health checks, in milliseconds.
    // NOTE(review): not referenced elsewhere in this module — confirm a caller uses it.
    pub health_check_interval_ms: u64,
    // Whether failsafe/emergency protocols may fire automatically.
    pub auto_failsafe_enabled: bool,
    // Maximum number of error events retained in `error_history`.
    pub error_history_limit: usize,
    // Overall timeout for a recovery operation, in milliseconds.
    // NOTE(review): not yet enforced anywhere in this module.
    pub recovery_timeout_ms: u64,
}
33
impl Default for ErrorRecoveryConfig {
    /// Conservative defaults: recovery enabled, 3 attempts with a 100 ms
    /// delay, breaker trips after 5 failures, 5 s health-check interval,
    /// 1000-entry history, 30 s recovery timeout.
    fn default() -> Self {
        Self {
            enabled: true,
            max_retry_attempts: 3,
            retry_delay_ms: 100,
            circuit_breaker_threshold: 5,
            health_check_interval_ms: 5000,
            auto_failsafe_enabled: true,
            error_history_limit: 1000,
            recovery_timeout_ms: 30000,
        }
    }
}
48
/// Classification of errors; used as the key when selecting recovery
/// strategies. The first five variants map to debugger subsystems, the rest
/// are generic infrastructure/user failure categories.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ErrorType {
    TensorInspectionError,
    GradientDebuggingError,
    ModelDiagnosticsError,
    VisualizationError,
    MemoryProfilingError,
    IOError,
    NetworkError,
    ResourceExhaustion,
    ConfigurationError,
    DataCorruption,
    SystemFailure,
    UserError,
}
65
/// An action the system can take to recover from an error. Strategies are
/// registered per `ErrorType` and tried in registration order.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryStrategy {
    /// Re-run the failing operation up to `max_attempts` times with
    /// `delay_ms` between tries.
    Retry { max_attempts: usize, delay_ms: u64 },
    /// Switch to a named alternative implementation.
    Fallback { alternative_method: String },
    /// Keep running with a named, reduced feature set.
    GracefulDegradation { reduced_functionality: String },
    /// Release a named category of resources (caches, profilers, ...).
    ResourceCleanup { cleanup_type: String },
    /// Reinitialize a named component.
    SystemReset { component: String },
    /// Last resort: stop the system.
    EmergencyShutdown,
    /// Surface a message to the user.
    UserNotification { message: String },
    /// Apply a named automated repair action.
    AutomaticRepair { repair_action: String },
}
78
/// One recorded failure, with enough context to pick a recovery strategy and
/// to audit it afterwards.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorEvent {
    // Unique id; referenced by `RecoveryEvent::error_id`.
    pub id: Uuid,
    // Category used to look up recovery strategies.
    pub error_type: ErrorType,
    // Human-readable description of the failure.
    pub error_message: String,
    // Name of the component that reported the error.
    pub component: String,
    // Severity; `Fatal` triggers the emergency protocol path.
    pub severity: ErrorSeverity,
    pub timestamp: chrono::DateTime<chrono::Utc>,
    // Operation, parameters, and system snapshot at the time of failure.
    pub context: ErrorContext,
    // Optional captured stack trace, if the reporter provided one.
    pub stack_trace: Option<String>,
}
91
/// Severity ladder, least to most severe. `Fatal` bypasses normal recovery
/// and triggers the emergency protocol (see `should_trigger_emergency_protocol`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ErrorSeverity {
    Low,
    Medium,
    High,
    Critical,
    Fatal,
}
101
/// Where and under what conditions an error occurred.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorContext {
    // Session in which the failing operation ran.
    pub session_id: Uuid,
    // Name of the operation that failed.
    pub operation: String,
    // Stringified operation parameters, for diagnostics.
    pub parameters: HashMap<String, String>,
    // Resource snapshot at failure time; consulted by the emergency trigger.
    pub system_state: SystemState,
}
110
/// Resource snapshot captured alongside an error.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemState {
    // Reported memory usage in MiB; compared against the emergency threshold.
    pub memory_usage_mb: u64,
    pub cpu_usage_percent: f64,
    // Number of tensors currently tracked.
    pub active_tensors: usize,
    pub active_sessions: usize,
    pub uptime_seconds: u64,
}
120
/// Audit record for one recovery attempt against one error.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryEvent {
    pub id: Uuid,
    // The `ErrorEvent::id` this attempt responded to.
    pub error_id: Uuid,
    // Strategy that was executed.
    pub strategy: RecoveryStrategy,
    pub start_time: chrono::DateTime<chrono::Utc>,
    // `None` while the attempt is still in flight.
    pub end_time: Option<chrono::DateTime<chrono::Utc>>,
    // `None` while in flight; `Some(outcome)` once finished.
    pub success: Option<bool>,
    pub result_message: String,
    // 1-based attempt number within the strategy chain.
    pub attempts: usize,
}
133
/// Circuit-breaker state: counts consecutive recovery failures and, once
/// `threshold` is reached, opens to suspend further recovery attempts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CircuitBreaker {
    pub state: CircuitState,
    // Consecutive failures since the last success.
    pub failure_count: usize,
    pub last_failure_time: Option<chrono::DateTime<chrono::Utc>>,
    // Failure count at which the breaker opens.
    pub threshold: usize,
    // Intended cool-down before the breaker allows a trial recovery.
    pub timeout_duration: Duration,
}
143
/// States of the classic circuit-breaker pattern: `Closed` = normal
/// operation, `Open` = recovery suspended, `HalfOpen` = cool-down elapsed,
/// a trial recovery is permitted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum CircuitState {
    Closed,
    Open,
    HalfOpen,
}
151
/// Rolling view of system health, refreshed by `check_system_health`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemHealthMonitor {
    // Derived from `health_metrics.error_rate` thresholds.
    pub overall_health: HealthStatus,
    // Per-component status. NOTE(review): never written in this module — confirm a caller populates it.
    pub component_health: HashMap<String, HealthStatus>,
    pub last_health_check: chrono::DateTime<chrono::Utc>,
    pub health_metrics: HealthMetrics,
}
160
/// Coarse health grade, best to worst.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HealthStatus {
    Healthy,
    Degraded,
    Unhealthy,
    Critical,
}
169
/// Numeric health indicators; rates are in [0, 1], scores start at 1.0.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthMetrics {
    // Recent errors normalized by a fixed denominator (see `check_system_health`).
    pub error_rate: f64,
    // Fraction of recent recovery attempts that succeeded.
    pub recovery_success_rate: f64,
    // NOTE(review): the three fields below are initialized but never updated in this module.
    pub average_response_time_ms: f64,
    pub memory_health_score: f64,
    pub stability_score: f64,
}
179
/// Failsafe state: safe mode, emergency protocols, and backup bookkeeping.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailsafeManager {
    pub enabled: bool,
    // Installed by `initialize_emergency_protocols`.
    pub emergency_protocols: Vec<EmergencyProtocol>,
    // Toggled by `enable_safe_mode` / `disable_safe_mode`.
    pub safe_mode_enabled: bool,
    // Gates `create_emergency_backup` during the emergency path.
    pub data_backup_enabled: bool,
    pub last_backup: Option<chrono::DateTime<chrono::Utc>>,
}
189
/// Declarative emergency procedure: named trigger conditions and the actions
/// to run when they hold. Conditions/actions are stored as strings; the
/// interpreter for them is not in this module.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmergencyProtocol {
    pub name: String,
    // e.g. "memory_usage > 90%".
    pub trigger_conditions: Vec<String>,
    // e.g. "clear_all_caches".
    pub actions: Vec<String>,
    // Lower number = more urgent (0 is highest priority).
    pub priority: u8,
}
198
199impl ErrorRecoverySystem {
200 pub fn new(config: ErrorRecoveryConfig) -> Self {
202 let mut system = Self {
203 config,
204 recovery_strategies: HashMap::new(),
205 error_history: VecDeque::new(),
206 recovery_history: VecDeque::new(),
207 circuit_breaker: CircuitBreaker {
208 state: CircuitState::Closed,
209 failure_count: 0,
210 last_failure_time: None,
211 threshold: 5,
212 timeout_duration: Duration::from_secs(60),
213 },
214 health_monitor: SystemHealthMonitor {
215 overall_health: HealthStatus::Healthy,
216 component_health: HashMap::new(),
217 last_health_check: chrono::Utc::now(),
218 health_metrics: HealthMetrics {
219 error_rate: 0.0,
220 recovery_success_rate: 1.0,
221 average_response_time_ms: 0.0,
222 memory_health_score: 1.0,
223 stability_score: 1.0,
224 },
225 },
226 failsafe_manager: FailsafeManager {
227 enabled: true,
228 emergency_protocols: Vec::new(),
229 safe_mode_enabled: false,
230 data_backup_enabled: true,
231 last_backup: None,
232 },
233 };
234
235 system.initialize_default_strategies();
236 system.initialize_emergency_protocols();
237 system
238 }
239
240 pub async fn handle_error(&mut self, error: ErrorEvent) -> Result<RecoveryResult> {
242 if matches!(self.circuit_breaker.state, CircuitState::Open) {
244 return Ok(RecoveryResult {
245 success: false,
246 strategy_used: None,
247 message: "Circuit breaker is open - recovery attempts suspended".to_string(),
248 recovery_time: Duration::from_millis(0),
249 });
250 }
251
252 self.record_error(error.clone());
254
255 if self.should_trigger_emergency_protocol(&error) {
257 return self.execute_emergency_protocol(&error).await;
258 }
259
260 let recovery_result = self.attempt_recovery(&error).await?;
262
263 self.update_circuit_breaker(&recovery_result);
265 self.update_health_metrics(&error, &recovery_result);
266
267 Ok(recovery_result)
268 }
269
270 pub fn record_error(&mut self, error: ErrorEvent) {
272 self.error_history.push_back(error);
273
274 while self.error_history.len() > self.config.error_history_limit {
276 self.error_history.pop_front();
277 }
278 }
279
280 pub async fn attempt_recovery(&mut self, error: &ErrorEvent) -> Result<RecoveryResult> {
282 let strategies = self.get_recovery_strategies(&error.error_type);
283
284 for (attempt, strategy) in strategies.iter().enumerate() {
285 if attempt >= self.config.max_retry_attempts {
286 break;
287 }
288
289 let recovery_event = RecoveryEvent {
290 id: Uuid::new_v4(),
291 error_id: error.id,
292 strategy: strategy.clone(),
293 start_time: chrono::Utc::now(),
294 end_time: None,
295 success: None,
296 result_message: String::new(),
297 attempts: attempt + 1,
298 };
299
300 let result = self.execute_recovery_strategy(strategy, error).await?;
301
302 let mut updated_event = recovery_event;
303 updated_event.end_time = Some(chrono::Utc::now());
304 updated_event.success = Some(result.success);
305 updated_event.result_message = result.message.clone();
306
307 self.recovery_history.push_back(updated_event);
308
309 if result.success {
310 return Ok(result);
311 }
312
313 if attempt < strategies.len() - 1 {
315 tokio::time::sleep(Duration::from_millis(self.config.retry_delay_ms)).await;
316 }
317 }
318
319 Ok(RecoveryResult {
320 success: false,
321 strategy_used: None,
322 message: "All recovery strategies failed".to_string(),
323 recovery_time: Duration::from_millis(0),
324 })
325 }
326
327 pub async fn execute_recovery_strategy(
329 &self,
330 strategy: &RecoveryStrategy,
331 error: &ErrorEvent,
332 ) -> Result<RecoveryResult> {
333 let start_time = Instant::now();
334
335 let result = match strategy {
336 RecoveryStrategy::Retry {
337 max_attempts,
338 delay_ms,
339 } => self.execute_retry_strategy(*max_attempts, *delay_ms, error).await,
340 RecoveryStrategy::Fallback { alternative_method } => {
341 self.execute_fallback_strategy(alternative_method, error).await
342 },
343 RecoveryStrategy::GracefulDegradation {
344 reduced_functionality,
345 } => self.execute_degradation_strategy(reduced_functionality, error).await,
346 RecoveryStrategy::ResourceCleanup { cleanup_type } => {
347 self.execute_cleanup_strategy(cleanup_type, error).await
348 },
349 RecoveryStrategy::SystemReset { component } => {
350 self.execute_reset_strategy(component, error).await
351 },
352 RecoveryStrategy::EmergencyShutdown => self.execute_shutdown_strategy(error).await,
353 RecoveryStrategy::UserNotification { message } => {
354 self.execute_notification_strategy(message, error).await
355 },
356 RecoveryStrategy::AutomaticRepair { repair_action } => {
357 self.execute_repair_strategy(repair_action, error).await
358 },
359 };
360
361 let recovery_time = start_time.elapsed();
362
363 match result {
364 Ok(mut recovery_result) => {
365 recovery_result.recovery_time = recovery_time;
366 recovery_result.strategy_used = Some(strategy.clone());
367 Ok(recovery_result)
368 },
369 Err(e) => Ok(RecoveryResult {
370 success: false,
371 strategy_used: Some(strategy.clone()),
372 message: format!("Recovery strategy failed: {}", e),
373 recovery_time,
374 }),
375 }
376 }
377
378 pub fn get_recovery_strategies(&self, error_type: &ErrorType) -> Vec<RecoveryStrategy> {
380 self.recovery_strategies.get(error_type).cloned().unwrap_or_default()
381 }
382
383 pub async fn check_system_health(&mut self) -> HealthStatus {
385 self.health_monitor.last_health_check = chrono::Utc::now();
387
388 let recent_errors = self
390 .error_history
391 .iter()
392 .filter(|e| {
393 let age = chrono::Utc::now() - e.timestamp;
394 age < chrono::Duration::minutes(5)
395 })
396 .count();
397
398 self.health_monitor.health_metrics.error_rate = recent_errors as f64 / 100.0; let recent_recoveries = self
402 .recovery_history
403 .iter()
404 .filter(|r| {
405 if let Some(end_time) = r.end_time {
406 let age = chrono::Utc::now() - end_time;
407 age < chrono::Duration::minutes(5)
408 } else {
409 false
410 }
411 })
412 .collect::<Vec<_>>();
413
414 if !recent_recoveries.is_empty() {
415 let successful_recoveries =
416 recent_recoveries.iter().filter(|r| r.success.unwrap_or(false)).count();
417 self.health_monitor.health_metrics.recovery_success_rate =
418 successful_recoveries as f64 / recent_recoveries.len() as f64;
419 }
420
421 self.health_monitor.overall_health = if self.health_monitor.health_metrics.error_rate > 0.5
423 {
424 HealthStatus::Critical
425 } else if self.health_monitor.health_metrics.error_rate > 0.2 {
426 HealthStatus::Unhealthy
427 } else if self.health_monitor.health_metrics.error_rate > 0.1 {
428 HealthStatus::Degraded
429 } else {
430 HealthStatus::Healthy
431 };
432
433 self.health_monitor.overall_health.clone()
434 }
435
    /// Switches the failsafe layer into safe mode (reduced functionality).
    /// Logged at WARN because it signals a degraded operating state.
    pub fn enable_safe_mode(&mut self) {
        self.failsafe_manager.safe_mode_enabled = true;
        tracing::warn!("Safe mode enabled - operating with reduced functionality");
    }
441
    /// Leaves safe mode and restores full functionality.
    pub fn disable_safe_mode(&mut self) {
        self.failsafe_manager.safe_mode_enabled = false;
        tracing::info!("Safe mode disabled - full functionality restored");
    }
447
448 pub fn get_error_statistics(&self) -> ErrorStatistics {
450 let total_errors = self.error_history.len();
451 let error_type_counts = self.error_history.iter().fold(HashMap::new(), |mut acc, error| {
452 *acc.entry(error.error_type.clone()).or_insert(0) += 1;
453 acc
454 });
455
456 let severity_counts = self.error_history.iter().fold(HashMap::new(), |mut acc, error| {
457 *acc.entry(format!("{:?}", error.severity)).or_insert(0) += 1;
458 acc
459 });
460
461 ErrorStatistics {
462 total_errors,
463 error_type_counts,
464 severity_counts,
465 recovery_success_rate: self.health_monitor.health_metrics.recovery_success_rate,
466 circuit_breaker_state: self.circuit_breaker.state.clone(),
467 system_health: self.health_monitor.overall_health.clone(),
468 }
469 }
470
471 fn initialize_default_strategies(&mut self) {
474 self.recovery_strategies.insert(
476 ErrorType::TensorInspectionError,
477 vec![
478 RecoveryStrategy::Retry {
479 max_attempts: 3,
480 delay_ms: 100,
481 },
482 RecoveryStrategy::ResourceCleanup {
483 cleanup_type: "tensor_cache".to_string(),
484 },
485 RecoveryStrategy::Fallback {
486 alternative_method: "simplified_inspection".to_string(),
487 },
488 ],
489 );
490
491 self.recovery_strategies.insert(
492 ErrorType::GradientDebuggingError,
493 vec![
494 RecoveryStrategy::Retry {
495 max_attempts: 2,
496 delay_ms: 200,
497 },
498 RecoveryStrategy::GracefulDegradation {
499 reduced_functionality: "basic_gradient_info".to_string(),
500 },
501 ],
502 );
503
504 self.recovery_strategies.insert(
505 ErrorType::MemoryProfilingError,
506 vec![
507 RecoveryStrategy::ResourceCleanup {
508 cleanup_type: "memory_profiler".to_string(),
509 },
510 RecoveryStrategy::SystemReset {
511 component: "memory_tracker".to_string(),
512 },
513 ],
514 );
515
516 self.recovery_strategies.insert(
517 ErrorType::ResourceExhaustion,
518 vec![
519 RecoveryStrategy::ResourceCleanup {
520 cleanup_type: "all_caches".to_string(),
521 },
522 RecoveryStrategy::GracefulDegradation {
523 reduced_functionality: "essential_only".to_string(),
524 },
525 RecoveryStrategy::EmergencyShutdown,
526 ],
527 );
528
529 }
531
532 fn initialize_emergency_protocols(&mut self) {
533 self.failsafe_manager.emergency_protocols = vec![
534 EmergencyProtocol {
535 name: "Memory Exhaustion Protocol".to_string(),
536 trigger_conditions: vec!["memory_usage > 90%".to_string()],
537 actions: vec![
538 "clear_all_caches".to_string(),
539 "reduce_tracking".to_string(),
540 ],
541 priority: 1,
542 },
543 EmergencyProtocol {
544 name: "Critical Error Protocol".to_string(),
545 trigger_conditions: vec!["error_severity == Fatal".to_string()],
546 actions: vec!["emergency_backup".to_string(), "safe_shutdown".to_string()],
547 priority: 0,
548 },
549 ];
550 }
551
552 fn should_trigger_emergency_protocol(&self, error: &ErrorEvent) -> bool {
553 matches!(error.severity, ErrorSeverity::Fatal)
554 || error.context.system_state.memory_usage_mb > 8192 }
556
557 async fn execute_emergency_protocol(&mut self, error: &ErrorEvent) -> Result<RecoveryResult> {
558 tracing::error!(
559 "Executing emergency protocol for error: {}",
560 error.error_message
561 );
562
563 self.enable_safe_mode();
565
566 if self.failsafe_manager.data_backup_enabled {
568 self.create_emergency_backup().await?;
569 }
570
571 Ok(RecoveryResult {
572 success: true,
573 strategy_used: Some(RecoveryStrategy::EmergencyShutdown),
574 message: "Emergency protocol executed successfully".to_string(),
575 recovery_time: Duration::from_millis(0),
576 })
577 }
578
    /// Placeholder emergency backup: currently only logs and records the
    /// backup timestamp. TODO(review): persist actual session/tensor state.
    ///
    /// # Errors
    /// Currently infallible; always returns `Ok(())`.
    async fn create_emergency_backup(&mut self) -> Result<()> {
        tracing::info!("Creating emergency backup");
        self.failsafe_manager.last_backup = Some(chrono::Utc::now());
        Ok(())
    }
585
586 fn update_circuit_breaker(&mut self, result: &RecoveryResult) {
587 if result.success {
588 self.circuit_breaker.failure_count = 0;
589 self.circuit_breaker.state = CircuitState::Closed;
590 } else {
591 self.circuit_breaker.failure_count += 1;
592 self.circuit_breaker.last_failure_time = Some(chrono::Utc::now());
593
594 if self.circuit_breaker.failure_count >= self.circuit_breaker.threshold {
595 self.circuit_breaker.state = CircuitState::Open;
596 }
597 }
598 }
599
    /// Hook for folding an (error, recovery outcome) pair into the rolling
    /// health metrics. Intentionally a no-op for now: aggregate metrics are
    /// recomputed from history in `check_system_health` instead.
    fn update_health_metrics(&mut self, _error: &ErrorEvent, _result: &RecoveryResult) {
    }
604
605 async fn execute_retry_strategy(
607 &self,
608 _max_attempts: usize,
609 _delay_ms: u64,
610 _error: &ErrorEvent,
611 ) -> Result<RecoveryResult> {
612 Ok(RecoveryResult {
613 success: true,
614 strategy_used: None,
615 message: "Retry successful".to_string(),
616 recovery_time: Duration::from_millis(0),
617 })
618 }
619
620 async fn execute_fallback_strategy(
621 &self,
622 _alternative: &str,
623 _error: &ErrorEvent,
624 ) -> Result<RecoveryResult> {
625 Ok(RecoveryResult {
626 success: true,
627 strategy_used: None,
628 message: "Fallback strategy executed".to_string(),
629 recovery_time: Duration::from_millis(0),
630 })
631 }
632
633 async fn execute_degradation_strategy(
634 &self,
635 _functionality: &str,
636 _error: &ErrorEvent,
637 ) -> Result<RecoveryResult> {
638 Ok(RecoveryResult {
639 success: true,
640 strategy_used: None,
641 message: "Graceful degradation applied".to_string(),
642 recovery_time: Duration::from_millis(0),
643 })
644 }
645
646 async fn execute_cleanup_strategy(
647 &self,
648 _cleanup_type: &str,
649 _error: &ErrorEvent,
650 ) -> Result<RecoveryResult> {
651 Ok(RecoveryResult {
652 success: true,
653 strategy_used: None,
654 message: "Resource cleanup completed".to_string(),
655 recovery_time: Duration::from_millis(0),
656 })
657 }
658
659 async fn execute_reset_strategy(
660 &self,
661 _component: &str,
662 _error: &ErrorEvent,
663 ) -> Result<RecoveryResult> {
664 Ok(RecoveryResult {
665 success: true,
666 strategy_used: None,
667 message: "Component reset completed".to_string(),
668 recovery_time: Duration::from_millis(0),
669 })
670 }
671
672 async fn execute_shutdown_strategy(&self, _error: &ErrorEvent) -> Result<RecoveryResult> {
673 Ok(RecoveryResult {
674 success: true,
675 strategy_used: None,
676 message: "Emergency shutdown initiated".to_string(),
677 recovery_time: Duration::from_millis(0),
678 })
679 }
680
681 async fn execute_notification_strategy(
682 &self,
683 message: &str,
684 _error: &ErrorEvent,
685 ) -> Result<RecoveryResult> {
686 tracing::warn!("User notification: {}", message);
687 Ok(RecoveryResult {
688 success: true,
689 strategy_used: None,
690 message: "User notified".to_string(),
691 recovery_time: Duration::from_millis(0),
692 })
693 }
694
695 async fn execute_repair_strategy(
696 &self,
697 _repair_action: &str,
698 _error: &ErrorEvent,
699 ) -> Result<RecoveryResult> {
700 Ok(RecoveryResult {
701 success: true,
702 strategy_used: None,
703 message: "Automatic repair completed".to_string(),
704 recovery_time: Duration::from_millis(0),
705 })
706 }
707}
708
/// Outcome of one recovery operation, returned to the caller of
/// `handle_error` / `attempt_recovery`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryResult {
    pub success: bool,
    // `None` when no strategy ran (breaker open, or all strategies exhausted).
    pub strategy_used: Option<RecoveryStrategy>,
    pub message: String,
    // Wall time spent executing the strategy.
    pub recovery_time: Duration,
}
717
/// Aggregate snapshot produced by `get_error_statistics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorStatistics {
    pub total_errors: usize,
    pub error_type_counts: HashMap<ErrorType, usize>,
    // Keyed by the Debug rendering of `ErrorSeverity`, e.g. "Fatal".
    pub severity_counts: HashMap<String, usize>,
    pub recovery_success_rate: f64,
    pub circuit_breaker_state: CircuitState,
    pub system_health: HealthStatus,
}