// Source path: trustformers_debug/error_recovery.rs

//! Comprehensive error recovery mechanisms for debugging sessions

use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, VecDeque};
use std::time::{Duration, Instant};
use uuid::Uuid;

9/// Comprehensive error recovery system
10#[derive(Debug)]
11pub struct ErrorRecoverySystem {
12    config: ErrorRecoveryConfig,
13    recovery_strategies: HashMap<ErrorType, Vec<RecoveryStrategy>>,
14    error_history: VecDeque<ErrorEvent>,
15    recovery_history: VecDeque<RecoveryEvent>,
16    circuit_breaker: CircuitBreaker,
17    health_monitor: SystemHealthMonitor,
18    failsafe_manager: FailsafeManager,
19}
20
21/// Configuration for error recovery
22#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct ErrorRecoveryConfig {
24    pub enabled: bool,
25    pub max_retry_attempts: usize,
26    pub retry_delay_ms: u64,
27    pub circuit_breaker_threshold: usize,
28    pub health_check_interval_ms: u64,
29    pub auto_failsafe_enabled: bool,
30    pub error_history_limit: usize,
31    pub recovery_timeout_ms: u64,
32}
33
34impl Default for ErrorRecoveryConfig {
35    fn default() -> Self {
36        Self {
37            enabled: true,
38            max_retry_attempts: 3,
39            retry_delay_ms: 100,
40            circuit_breaker_threshold: 5,
41            health_check_interval_ms: 5000,
42            auto_failsafe_enabled: true,
43            error_history_limit: 1000,
44            recovery_timeout_ms: 30000,
45        }
46    }
47}
48
49/// Types of errors that can occur during debugging
50#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
51pub enum ErrorType {
52    TensorInspectionError,
53    GradientDebuggingError,
54    ModelDiagnosticsError,
55    VisualizationError,
56    MemoryProfilingError,
57    IOError,
58    NetworkError,
59    ResourceExhaustion,
60    ConfigurationError,
61    DataCorruption,
62    SystemFailure,
63    UserError,
64}
65
66/// Recovery strategies for different error types
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub enum RecoveryStrategy {
69    Retry { max_attempts: usize, delay_ms: u64 },
70    Fallback { alternative_method: String },
71    GracefulDegradation { reduced_functionality: String },
72    ResourceCleanup { cleanup_type: String },
73    SystemReset { component: String },
74    EmergencyShutdown,
75    UserNotification { message: String },
76    AutomaticRepair { repair_action: String },
77}
78
79/// Error event record
80#[derive(Debug, Clone, Serialize, Deserialize)]
81pub struct ErrorEvent {
82    pub id: Uuid,
83    pub error_type: ErrorType,
84    pub error_message: String,
85    pub component: String,
86    pub severity: ErrorSeverity,
87    pub timestamp: chrono::DateTime<chrono::Utc>,
88    pub context: ErrorContext,
89    pub stack_trace: Option<String>,
90}
91
92/// Error severity levels
93#[derive(Debug, Clone, Serialize, Deserialize)]
94pub enum ErrorSeverity {
95    Low,
96    Medium,
97    High,
98    Critical,
99    Fatal,
100}
101
102/// Context information for errors
103#[derive(Debug, Clone, Serialize, Deserialize)]
104pub struct ErrorContext {
105    pub session_id: Uuid,
106    pub operation: String,
107    pub parameters: HashMap<String, String>,
108    pub system_state: SystemState,
109}
110
111/// System state at time of error
112#[derive(Debug, Clone, Serialize, Deserialize)]
113pub struct SystemState {
114    pub memory_usage_mb: u64,
115    pub cpu_usage_percent: f64,
116    pub active_tensors: usize,
117    pub active_sessions: usize,
118    pub uptime_seconds: u64,
119}
120
121/// Recovery event record
122#[derive(Debug, Clone, Serialize, Deserialize)]
123pub struct RecoveryEvent {
124    pub id: Uuid,
125    pub error_id: Uuid,
126    pub strategy: RecoveryStrategy,
127    pub start_time: chrono::DateTime<chrono::Utc>,
128    pub end_time: Option<chrono::DateTime<chrono::Utc>>,
129    pub success: Option<bool>,
130    pub result_message: String,
131    pub attempts: usize,
132}
133
134/// Circuit breaker for preventing cascading failures
135#[derive(Debug, Clone, Serialize, Deserialize)]
136pub struct CircuitBreaker {
137    pub state: CircuitState,
138    pub failure_count: usize,
139    pub last_failure_time: Option<chrono::DateTime<chrono::Utc>>,
140    pub threshold: usize,
141    pub timeout_duration: Duration,
142}
143
144/// Circuit breaker states
145#[derive(Debug, Clone, Serialize, Deserialize)]
146pub enum CircuitState {
147    Closed,
148    Open,
149    HalfOpen,
150}
151
152/// System health monitoring
153#[derive(Debug, Clone, Serialize, Deserialize)]
154pub struct SystemHealthMonitor {
155    pub overall_health: HealthStatus,
156    pub component_health: HashMap<String, HealthStatus>,
157    pub last_health_check: chrono::DateTime<chrono::Utc>,
158    pub health_metrics: HealthMetrics,
159}
160
161/// Health status
162#[derive(Debug, Clone, Serialize, Deserialize)]
163pub enum HealthStatus {
164    Healthy,
165    Degraded,
166    Unhealthy,
167    Critical,
168}
169
170/// Health metrics
171#[derive(Debug, Clone, Serialize, Deserialize)]
172pub struct HealthMetrics {
173    pub error_rate: f64,
174    pub recovery_success_rate: f64,
175    pub average_response_time_ms: f64,
176    pub memory_health_score: f64,
177    pub stability_score: f64,
178}
179
180/// Failsafe manager for critical situations
181#[derive(Debug, Clone, Serialize, Deserialize)]
182pub struct FailsafeManager {
183    pub enabled: bool,
184    pub emergency_protocols: Vec<EmergencyProtocol>,
185    pub safe_mode_enabled: bool,
186    pub data_backup_enabled: bool,
187    pub last_backup: Option<chrono::DateTime<chrono::Utc>>,
188}
189
190/// Emergency protocols
191#[derive(Debug, Clone, Serialize, Deserialize)]
192pub struct EmergencyProtocol {
193    pub name: String,
194    pub trigger_conditions: Vec<String>,
195    pub actions: Vec<String>,
196    pub priority: u8,
197}
198
199impl ErrorRecoverySystem {
200    /// Create a new error recovery system
201    pub fn new(config: ErrorRecoveryConfig) -> Self {
202        let mut system = Self {
203            config,
204            recovery_strategies: HashMap::new(),
205            error_history: VecDeque::new(),
206            recovery_history: VecDeque::new(),
207            circuit_breaker: CircuitBreaker {
208                state: CircuitState::Closed,
209                failure_count: 0,
210                last_failure_time: None,
211                threshold: 5,
212                timeout_duration: Duration::from_secs(60),
213            },
214            health_monitor: SystemHealthMonitor {
215                overall_health: HealthStatus::Healthy,
216                component_health: HashMap::new(),
217                last_health_check: chrono::Utc::now(),
218                health_metrics: HealthMetrics {
219                    error_rate: 0.0,
220                    recovery_success_rate: 1.0,
221                    average_response_time_ms: 0.0,
222                    memory_health_score: 1.0,
223                    stability_score: 1.0,
224                },
225            },
226            failsafe_manager: FailsafeManager {
227                enabled: true,
228                emergency_protocols: Vec::new(),
229                safe_mode_enabled: false,
230                data_backup_enabled: true,
231                last_backup: None,
232            },
233        };
234
235        system.initialize_default_strategies();
236        system.initialize_emergency_protocols();
237        system
238    }
239
240    /// Handle an error event and attempt recovery
241    pub async fn handle_error(&mut self, error: ErrorEvent) -> Result<RecoveryResult> {
242        // Check circuit breaker
243        if matches!(self.circuit_breaker.state, CircuitState::Open) {
244            return Ok(RecoveryResult {
245                success: false,
246                strategy_used: None,
247                message: "Circuit breaker is open - recovery attempts suspended".to_string(),
248                recovery_time: Duration::from_millis(0),
249            });
250        }
251
252        // Record error
253        self.record_error(error.clone());
254
255        // Check if emergency protocols should be triggered
256        if self.should_trigger_emergency_protocol(&error) {
257            return self.execute_emergency_protocol(&error).await;
258        }
259
260        // Attempt recovery
261        let recovery_result = self.attempt_recovery(&error).await?;
262
263        // Update circuit breaker and health monitor
264        self.update_circuit_breaker(&recovery_result);
265        self.update_health_metrics(&error, &recovery_result);
266
267        Ok(recovery_result)
268    }
269
270    /// Record an error event
271    pub fn record_error(&mut self, error: ErrorEvent) {
272        self.error_history.push_back(error);
273
274        // Maintain history limit
275        while self.error_history.len() > self.config.error_history_limit {
276            self.error_history.pop_front();
277        }
278    }
279
280    /// Attempt recovery using appropriate strategies
281    pub async fn attempt_recovery(&mut self, error: &ErrorEvent) -> Result<RecoveryResult> {
282        let strategies = self.get_recovery_strategies(&error.error_type);
283
284        for (attempt, strategy) in strategies.iter().enumerate() {
285            if attempt >= self.config.max_retry_attempts {
286                break;
287            }
288
289            let recovery_event = RecoveryEvent {
290                id: Uuid::new_v4(),
291                error_id: error.id,
292                strategy: strategy.clone(),
293                start_time: chrono::Utc::now(),
294                end_time: None,
295                success: None,
296                result_message: String::new(),
297                attempts: attempt + 1,
298            };
299
300            let result = self.execute_recovery_strategy(strategy, error).await?;
301
302            let mut updated_event = recovery_event;
303            updated_event.end_time = Some(chrono::Utc::now());
304            updated_event.success = Some(result.success);
305            updated_event.result_message = result.message.clone();
306
307            self.recovery_history.push_back(updated_event);
308
309            if result.success {
310                return Ok(result);
311            }
312
313            // Wait before next attempt
314            if attempt < strategies.len() - 1 {
315                tokio::time::sleep(Duration::from_millis(self.config.retry_delay_ms)).await;
316            }
317        }
318
319        Ok(RecoveryResult {
320            success: false,
321            strategy_used: None,
322            message: "All recovery strategies failed".to_string(),
323            recovery_time: Duration::from_millis(0),
324        })
325    }
326
327    /// Execute a specific recovery strategy
328    pub async fn execute_recovery_strategy(
329        &self,
330        strategy: &RecoveryStrategy,
331        error: &ErrorEvent,
332    ) -> Result<RecoveryResult> {
333        let start_time = Instant::now();
334
335        let result = match strategy {
336            RecoveryStrategy::Retry {
337                max_attempts,
338                delay_ms,
339            } => self.execute_retry_strategy(*max_attempts, *delay_ms, error).await,
340            RecoveryStrategy::Fallback { alternative_method } => {
341                self.execute_fallback_strategy(alternative_method, error).await
342            },
343            RecoveryStrategy::GracefulDegradation {
344                reduced_functionality,
345            } => self.execute_degradation_strategy(reduced_functionality, error).await,
346            RecoveryStrategy::ResourceCleanup { cleanup_type } => {
347                self.execute_cleanup_strategy(cleanup_type, error).await
348            },
349            RecoveryStrategy::SystemReset { component } => {
350                self.execute_reset_strategy(component, error).await
351            },
352            RecoveryStrategy::EmergencyShutdown => self.execute_shutdown_strategy(error).await,
353            RecoveryStrategy::UserNotification { message } => {
354                self.execute_notification_strategy(message, error).await
355            },
356            RecoveryStrategy::AutomaticRepair { repair_action } => {
357                self.execute_repair_strategy(repair_action, error).await
358            },
359        };
360
361        let recovery_time = start_time.elapsed();
362
363        match result {
364            Ok(mut recovery_result) => {
365                recovery_result.recovery_time = recovery_time;
366                recovery_result.strategy_used = Some(strategy.clone());
367                Ok(recovery_result)
368            },
369            Err(e) => Ok(RecoveryResult {
370                success: false,
371                strategy_used: Some(strategy.clone()),
372                message: format!("Recovery strategy failed: {}", e),
373                recovery_time,
374            }),
375        }
376    }
377
378    /// Get recovery strategies for an error type
379    pub fn get_recovery_strategies(&self, error_type: &ErrorType) -> Vec<RecoveryStrategy> {
380        self.recovery_strategies.get(error_type).cloned().unwrap_or_default()
381    }
382
383    /// Check system health
384    pub async fn check_system_health(&mut self) -> HealthStatus {
385        // Update health metrics
386        self.health_monitor.last_health_check = chrono::Utc::now();
387
388        // Calculate error rate from recent history
389        let recent_errors = self
390            .error_history
391            .iter()
392            .filter(|e| {
393                let age = chrono::Utc::now() - e.timestamp;
394                age < chrono::Duration::minutes(5)
395            })
396            .count();
397
398        self.health_monitor.health_metrics.error_rate = recent_errors as f64 / 100.0; // Normalized
399
400        // Calculate recovery success rate
401        let recent_recoveries = self
402            .recovery_history
403            .iter()
404            .filter(|r| {
405                if let Some(end_time) = r.end_time {
406                    let age = chrono::Utc::now() - end_time;
407                    age < chrono::Duration::minutes(5)
408                } else {
409                    false
410                }
411            })
412            .collect::<Vec<_>>();
413
414        if !recent_recoveries.is_empty() {
415            let successful_recoveries =
416                recent_recoveries.iter().filter(|r| r.success.unwrap_or(false)).count();
417            self.health_monitor.health_metrics.recovery_success_rate =
418                successful_recoveries as f64 / recent_recoveries.len() as f64;
419        }
420
421        // Determine overall health
422        self.health_monitor.overall_health = if self.health_monitor.health_metrics.error_rate > 0.5
423        {
424            HealthStatus::Critical
425        } else if self.health_monitor.health_metrics.error_rate > 0.2 {
426            HealthStatus::Unhealthy
427        } else if self.health_monitor.health_metrics.error_rate > 0.1 {
428            HealthStatus::Degraded
429        } else {
430            HealthStatus::Healthy
431        };
432
433        self.health_monitor.overall_health.clone()
434    }
435
436    /// Enable safe mode
437    pub fn enable_safe_mode(&mut self) {
438        self.failsafe_manager.safe_mode_enabled = true;
439        tracing::warn!("Safe mode enabled - operating with reduced functionality");
440    }
441
442    /// Disable safe mode
443    pub fn disable_safe_mode(&mut self) {
444        self.failsafe_manager.safe_mode_enabled = false;
445        tracing::info!("Safe mode disabled - full functionality restored");
446    }
447
448    /// Get error statistics
449    pub fn get_error_statistics(&self) -> ErrorStatistics {
450        let total_errors = self.error_history.len();
451        let error_type_counts = self.error_history.iter().fold(HashMap::new(), |mut acc, error| {
452            *acc.entry(error.error_type.clone()).or_insert(0) += 1;
453            acc
454        });
455
456        let severity_counts = self.error_history.iter().fold(HashMap::new(), |mut acc, error| {
457            *acc.entry(format!("{:?}", error.severity)).or_insert(0) += 1;
458            acc
459        });
460
461        ErrorStatistics {
462            total_errors,
463            error_type_counts,
464            severity_counts,
465            recovery_success_rate: self.health_monitor.health_metrics.recovery_success_rate,
466            circuit_breaker_state: self.circuit_breaker.state.clone(),
467            system_health: self.health_monitor.overall_health.clone(),
468        }
469    }
470
471    // Private helper methods
472
473    fn initialize_default_strategies(&mut self) {
474        // Initialize default recovery strategies for each error type
475        self.recovery_strategies.insert(
476            ErrorType::TensorInspectionError,
477            vec![
478                RecoveryStrategy::Retry {
479                    max_attempts: 3,
480                    delay_ms: 100,
481                },
482                RecoveryStrategy::ResourceCleanup {
483                    cleanup_type: "tensor_cache".to_string(),
484                },
485                RecoveryStrategy::Fallback {
486                    alternative_method: "simplified_inspection".to_string(),
487                },
488            ],
489        );
490
491        self.recovery_strategies.insert(
492            ErrorType::GradientDebuggingError,
493            vec![
494                RecoveryStrategy::Retry {
495                    max_attempts: 2,
496                    delay_ms: 200,
497                },
498                RecoveryStrategy::GracefulDegradation {
499                    reduced_functionality: "basic_gradient_info".to_string(),
500                },
501            ],
502        );
503
504        self.recovery_strategies.insert(
505            ErrorType::MemoryProfilingError,
506            vec![
507                RecoveryStrategy::ResourceCleanup {
508                    cleanup_type: "memory_profiler".to_string(),
509                },
510                RecoveryStrategy::SystemReset {
511                    component: "memory_tracker".to_string(),
512                },
513            ],
514        );
515
516        self.recovery_strategies.insert(
517            ErrorType::ResourceExhaustion,
518            vec![
519                RecoveryStrategy::ResourceCleanup {
520                    cleanup_type: "all_caches".to_string(),
521                },
522                RecoveryStrategy::GracefulDegradation {
523                    reduced_functionality: "essential_only".to_string(),
524                },
525                RecoveryStrategy::EmergencyShutdown,
526            ],
527        );
528
529        // Add more strategies for other error types...
530    }
531
532    fn initialize_emergency_protocols(&mut self) {
533        self.failsafe_manager.emergency_protocols = vec![
534            EmergencyProtocol {
535                name: "Memory Exhaustion Protocol".to_string(),
536                trigger_conditions: vec!["memory_usage > 90%".to_string()],
537                actions: vec![
538                    "clear_all_caches".to_string(),
539                    "reduce_tracking".to_string(),
540                ],
541                priority: 1,
542            },
543            EmergencyProtocol {
544                name: "Critical Error Protocol".to_string(),
545                trigger_conditions: vec!["error_severity == Fatal".to_string()],
546                actions: vec!["emergency_backup".to_string(), "safe_shutdown".to_string()],
547                priority: 0,
548            },
549        ];
550    }
551
552    fn should_trigger_emergency_protocol(&self, error: &ErrorEvent) -> bool {
553        matches!(error.severity, ErrorSeverity::Fatal)
554            || error.context.system_state.memory_usage_mb > 8192 // > 8GB
555    }
556
557    async fn execute_emergency_protocol(&mut self, error: &ErrorEvent) -> Result<RecoveryResult> {
558        tracing::error!(
559            "Executing emergency protocol for error: {}",
560            error.error_message
561        );
562
563        // Enable safe mode
564        self.enable_safe_mode();
565
566        // Execute emergency backup if enabled
567        if self.failsafe_manager.data_backup_enabled {
568            self.create_emergency_backup().await?;
569        }
570
571        Ok(RecoveryResult {
572            success: true,
573            strategy_used: Some(RecoveryStrategy::EmergencyShutdown),
574            message: "Emergency protocol executed successfully".to_string(),
575            recovery_time: Duration::from_millis(0),
576        })
577    }
578
579    async fn create_emergency_backup(&mut self) -> Result<()> {
580        tracing::info!("Creating emergency backup");
581        self.failsafe_manager.last_backup = Some(chrono::Utc::now());
582        // Implement backup logic here
583        Ok(())
584    }
585
586    fn update_circuit_breaker(&mut self, result: &RecoveryResult) {
587        if result.success {
588            self.circuit_breaker.failure_count = 0;
589            self.circuit_breaker.state = CircuitState::Closed;
590        } else {
591            self.circuit_breaker.failure_count += 1;
592            self.circuit_breaker.last_failure_time = Some(chrono::Utc::now());
593
594            if self.circuit_breaker.failure_count >= self.circuit_breaker.threshold {
595                self.circuit_breaker.state = CircuitState::Open;
596            }
597        }
598    }
599
600    fn update_health_metrics(&mut self, _error: &ErrorEvent, _result: &RecoveryResult) {
601        // Update health metrics based on error and recovery result
602        // This would include more sophisticated health scoring logic
603    }
604
605    // Recovery strategy implementations (simplified)
606    async fn execute_retry_strategy(
607        &self,
608        _max_attempts: usize,
609        _delay_ms: u64,
610        _error: &ErrorEvent,
611    ) -> Result<RecoveryResult> {
612        Ok(RecoveryResult {
613            success: true,
614            strategy_used: None,
615            message: "Retry successful".to_string(),
616            recovery_time: Duration::from_millis(0),
617        })
618    }
619
620    async fn execute_fallback_strategy(
621        &self,
622        _alternative: &str,
623        _error: &ErrorEvent,
624    ) -> Result<RecoveryResult> {
625        Ok(RecoveryResult {
626            success: true,
627            strategy_used: None,
628            message: "Fallback strategy executed".to_string(),
629            recovery_time: Duration::from_millis(0),
630        })
631    }
632
633    async fn execute_degradation_strategy(
634        &self,
635        _functionality: &str,
636        _error: &ErrorEvent,
637    ) -> Result<RecoveryResult> {
638        Ok(RecoveryResult {
639            success: true,
640            strategy_used: None,
641            message: "Graceful degradation applied".to_string(),
642            recovery_time: Duration::from_millis(0),
643        })
644    }
645
646    async fn execute_cleanup_strategy(
647        &self,
648        _cleanup_type: &str,
649        _error: &ErrorEvent,
650    ) -> Result<RecoveryResult> {
651        Ok(RecoveryResult {
652            success: true,
653            strategy_used: None,
654            message: "Resource cleanup completed".to_string(),
655            recovery_time: Duration::from_millis(0),
656        })
657    }
658
659    async fn execute_reset_strategy(
660        &self,
661        _component: &str,
662        _error: &ErrorEvent,
663    ) -> Result<RecoveryResult> {
664        Ok(RecoveryResult {
665            success: true,
666            strategy_used: None,
667            message: "Component reset completed".to_string(),
668            recovery_time: Duration::from_millis(0),
669        })
670    }
671
672    async fn execute_shutdown_strategy(&self, _error: &ErrorEvent) -> Result<RecoveryResult> {
673        Ok(RecoveryResult {
674            success: true,
675            strategy_used: None,
676            message: "Emergency shutdown initiated".to_string(),
677            recovery_time: Duration::from_millis(0),
678        })
679    }
680
681    async fn execute_notification_strategy(
682        &self,
683        message: &str,
684        _error: &ErrorEvent,
685    ) -> Result<RecoveryResult> {
686        tracing::warn!("User notification: {}", message);
687        Ok(RecoveryResult {
688            success: true,
689            strategy_used: None,
690            message: "User notified".to_string(),
691            recovery_time: Duration::from_millis(0),
692        })
693    }
694
695    async fn execute_repair_strategy(
696        &self,
697        _repair_action: &str,
698        _error: &ErrorEvent,
699    ) -> Result<RecoveryResult> {
700        Ok(RecoveryResult {
701            success: true,
702            strategy_used: None,
703            message: "Automatic repair completed".to_string(),
704            recovery_time: Duration::from_millis(0),
705        })
706    }
707}
708
709/// Result of a recovery attempt
710#[derive(Debug, Clone, Serialize, Deserialize)]
711pub struct RecoveryResult {
712    pub success: bool,
713    pub strategy_used: Option<RecoveryStrategy>,
714    pub message: String,
715    pub recovery_time: Duration,
716}
717
718/// Error statistics
719#[derive(Debug, Clone, Serialize, Deserialize)]
720pub struct ErrorStatistics {
721    pub total_errors: usize,
722    pub error_type_counts: HashMap<ErrorType, usize>,
723    pub severity_counts: HashMap<String, usize>,
724    pub recovery_success_rate: f64,
725    pub circuit_breaker_state: CircuitState,
726    pub system_health: HealthStatus,
727}
728
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a medium-severity event of the given type with modest resource
    /// usage, so it never qualifies for an emergency protocol.
    fn make_error_event(error_type: ErrorType) -> ErrorEvent {
        ErrorEvent {
            id: Uuid::new_v4(),
            error_type,
            error_message: "test error".to_string(),
            component: "test_component".to_string(),
            severity: ErrorSeverity::Medium,
            timestamp: chrono::Utc::now(),
            context: ErrorContext {
                session_id: Uuid::new_v4(),
                operation: "test_op".to_string(),
                parameters: HashMap::new(),
                system_state: SystemState {
                    memory_usage_mb: 1024,
                    cpu_usage_percent: 50.0,
                    active_tensors: 4,
                    active_sessions: 1,
                    uptime_seconds: 100,
                },
            },
            stack_trace: None,
        }
    }

    // ── ErrorRecoveryConfig ──────────────────────────────────────────────

    #[test]
    fn test_config_default_fields() {
        let cfg = ErrorRecoveryConfig::default();
        assert!(cfg.enabled);
        assert!(cfg.max_retry_attempts > 0);
        assert!(cfg.circuit_breaker_threshold > 0);
        assert!(cfg.error_history_limit > 0);
    }

    // ── ErrorRecoverySystem creation ──────────────────────────────────────

    #[test]
    fn test_system_new_initializes_strategies() {
        let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
        // Should have strategies initialized for at least TensorInspectionError
        let strategies = system.get_recovery_strategies(&ErrorType::TensorInspectionError);
        assert!(!strategies.is_empty());
    }

    #[test]
    fn test_system_new_circuit_breaker_closed() {
        let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
        assert!(matches!(system.circuit_breaker.state, CircuitState::Closed));
    }

    // ── record_error ─────────────────────────────────────────────────────

    #[test]
    fn test_record_error_adds_to_history() {
        let mut system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
        let event = make_error_event(ErrorType::IOError);
        system.record_error(event);
        assert_eq!(system.error_history.len(), 1);
    }

    #[test]
    fn test_record_error_respects_history_limit() {
        let cfg = ErrorRecoveryConfig {
            error_history_limit: 3,
            ..ErrorRecoveryConfig::default()
        };
        let mut system = ErrorRecoverySystem::new(cfg);
        for _ in 0..5 {
            system.record_error(make_error_event(ErrorType::NetworkError));
        }
        assert_eq!(system.error_history.len(), 3);
    }

    // ── safe_mode ────────────────────────────────────────────────────────

    #[test]
    fn test_enable_disable_safe_mode() {
        let mut system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
        assert!(!system.failsafe_manager.safe_mode_enabled);
        system.enable_safe_mode();
        assert!(system.failsafe_manager.safe_mode_enabled);
        system.disable_safe_mode();
        assert!(!system.failsafe_manager.safe_mode_enabled);
    }

    // ── get_error_statistics ──────────────────────────────────────────────

    #[test]
    fn test_error_statistics_empty() {
        let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
        let stats = system.get_error_statistics();
        assert_eq!(stats.total_errors, 0);
        assert!(matches!(stats.circuit_breaker_state, CircuitState::Closed));
        assert!(matches!(stats.system_health, HealthStatus::Healthy));
    }

    #[test]
    fn test_error_statistics_with_errors() {
        let mut system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
        system.record_error(make_error_event(ErrorType::IOError));
        system.record_error(make_error_event(ErrorType::NetworkError));
        let stats = system.get_error_statistics();
        assert_eq!(stats.total_errors, 2);
        assert_eq!(
            stats.error_type_counts.get(&ErrorType::IOError).copied().unwrap_or(0),
            1
        );
    }

    // ── ErrorType variants ────────────────────────────────────────────────

    #[test]
    fn test_error_type_variants() {
        let types = [
            ErrorType::TensorInspectionError,
            ErrorType::GradientDebuggingError,
            ErrorType::ModelDiagnosticsError,
            ErrorType::VisualizationError,
            ErrorType::MemoryProfilingError,
            ErrorType::IOError,
            ErrorType::NetworkError,
            ErrorType::ResourceExhaustion,
            ErrorType::ConfigurationError,
            ErrorType::DataCorruption,
            ErrorType::SystemFailure,
            ErrorType::UserError,
        ];
        for t in &types {
            assert!(!format!("{:?}", t).is_empty());
        }
    }

    // ── ErrorSeverity variants ────────────────────────────────────────────

    #[test]
    fn test_error_severity_variants() {
        let severities = [
            ErrorSeverity::Low,
            ErrorSeverity::Medium,
            ErrorSeverity::High,
            ErrorSeverity::Critical,
            ErrorSeverity::Fatal,
        ];
        for s in &severities {
            assert!(!format!("{:?}", s).is_empty());
        }
    }

    // ── RecoveryStrategy variants ─────────────────────────────────────────

    #[test]
    fn test_recovery_strategy_variants() {
        let strats = [
            RecoveryStrategy::Retry {
                max_attempts: 3,
                delay_ms: 100,
            },
            RecoveryStrategy::Fallback {
                alternative_method: "alt".to_string(),
            },
            RecoveryStrategy::GracefulDegradation {
                reduced_functionality: "basic".to_string(),
            },
            RecoveryStrategy::ResourceCleanup {
                cleanup_type: "cache".to_string(),
            },
            RecoveryStrategy::SystemReset {
                component: "comp".to_string(),
            },
            RecoveryStrategy::EmergencyShutdown,
            RecoveryStrategy::UserNotification {
                message: "msg".to_string(),
            },
            RecoveryStrategy::AutomaticRepair {
                repair_action: "repair".to_string(),
            },
        ];
        for s in &strats {
            assert!(!format!("{:?}", s).is_empty());
        }
    }

    // ── CircuitState variants ─────────────────────────────────────────────

    #[test]
    fn test_circuit_state_variants() {
        let states = [
            CircuitState::Closed,
            CircuitState::Open,
            CircuitState::HalfOpen,
        ];
        for s in &states {
            assert!(!format!("{:?}", s).is_empty());
        }
    }

    // ── HealthStatus variants ─────────────────────────────────────────────

    #[test]
    fn test_health_status_variants() {
        let statuses = [
            HealthStatus::Healthy,
            HealthStatus::Degraded,
            HealthStatus::Unhealthy,
            HealthStatus::Critical,
        ];
        for s in &statuses {
            assert!(!format!("{:?}", s).is_empty());
        }
    }

    // ── CircuitBreaker ────────────────────────────────────────────────────

    #[test]
    fn test_circuit_breaker_initial_state() {
        let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
        assert_eq!(system.circuit_breaker.failure_count, 0);
        assert!(system.circuit_breaker.last_failure_time.is_none());
        assert_eq!(system.circuit_breaker.threshold, 5);
    }

    // ── SystemState struct ────────────────────────────────────────────────

    #[test]
    fn test_system_state_construction() {
        let state = SystemState {
            memory_usage_mb: 2048,
            cpu_usage_percent: 75.5,
            active_tensors: 10,
            active_sessions: 2,
            uptime_seconds: 3600,
        };
        assert_eq!(state.memory_usage_mb, 2048);
        assert!((state.cpu_usage_percent - 75.5).abs() < 1e-6);
    }

    // ── HealthMetrics ──────────────────────────────────────────────────────

    #[test]
    fn test_health_metrics_initial_values() {
        let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
        let m = &system.health_monitor.health_metrics;
        assert_eq!(m.error_rate, 0.0);
        assert_eq!(m.recovery_success_rate, 1.0);
    }

    // ── async handle_error with open circuit breaker ──────────────────────

    #[tokio::test]
    async fn test_handle_error_with_open_circuit_breaker() {
        let mut system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
        system.circuit_breaker.state = CircuitState::Open;
        let event = make_error_event(ErrorType::IOError);
        let result = system.handle_error(event).await.expect("should succeed");
        assert!(!result.success);
        assert!(result.message.contains("Circuit breaker"));
    }
}
993}