use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, VecDeque};
use std::time::{Duration, Instant};
use uuid::Uuid;
#[derive(Debug)]
pub struct ErrorRecoverySystem {
config: ErrorRecoveryConfig,
recovery_strategies: HashMap<ErrorType, Vec<RecoveryStrategy>>,
error_history: VecDeque<ErrorEvent>,
recovery_history: VecDeque<RecoveryEvent>,
circuit_breaker: CircuitBreaker,
health_monitor: SystemHealthMonitor,
failsafe_manager: FailsafeManager,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorRecoveryConfig {
pub enabled: bool,
pub max_retry_attempts: usize,
pub retry_delay_ms: u64,
pub circuit_breaker_threshold: usize,
pub health_check_interval_ms: u64,
pub auto_failsafe_enabled: bool,
pub error_history_limit: usize,
pub recovery_timeout_ms: u64,
}
impl Default for ErrorRecoveryConfig {
fn default() -> Self {
Self {
enabled: true,
max_retry_attempts: 3,
retry_delay_ms: 100,
circuit_breaker_threshold: 5,
health_check_interval_ms: 5000,
auto_failsafe_enabled: true,
error_history_limit: 1000,
recovery_timeout_ms: 30000,
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ErrorType {
TensorInspectionError,
GradientDebuggingError,
ModelDiagnosticsError,
VisualizationError,
MemoryProfilingError,
IOError,
NetworkError,
ResourceExhaustion,
ConfigurationError,
DataCorruption,
SystemFailure,
UserError,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryStrategy {
Retry { max_attempts: usize, delay_ms: u64 },
Fallback { alternative_method: String },
GracefulDegradation { reduced_functionality: String },
ResourceCleanup { cleanup_type: String },
SystemReset { component: String },
EmergencyShutdown,
UserNotification { message: String },
AutomaticRepair { repair_action: String },
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorEvent {
pub id: Uuid,
pub error_type: ErrorType,
pub error_message: String,
pub component: String,
pub severity: ErrorSeverity,
pub timestamp: chrono::DateTime<chrono::Utc>,
pub context: ErrorContext,
pub stack_trace: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ErrorSeverity {
Low,
Medium,
High,
Critical,
Fatal,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorContext {
pub session_id: Uuid,
pub operation: String,
pub parameters: HashMap<String, String>,
pub system_state: SystemState,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemState {
pub memory_usage_mb: u64,
pub cpu_usage_percent: f64,
pub active_tensors: usize,
pub active_sessions: usize,
pub uptime_seconds: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryEvent {
pub id: Uuid,
pub error_id: Uuid,
pub strategy: RecoveryStrategy,
pub start_time: chrono::DateTime<chrono::Utc>,
pub end_time: Option<chrono::DateTime<chrono::Utc>>,
pub success: Option<bool>,
pub result_message: String,
pub attempts: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CircuitBreaker {
pub state: CircuitState,
pub failure_count: usize,
pub last_failure_time: Option<chrono::DateTime<chrono::Utc>>,
pub threshold: usize,
pub timeout_duration: Duration,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum CircuitState {
Closed,
Open,
HalfOpen,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemHealthMonitor {
pub overall_health: HealthStatus,
pub component_health: HashMap<String, HealthStatus>,
pub last_health_check: chrono::DateTime<chrono::Utc>,
pub health_metrics: HealthMetrics,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum HealthStatus {
Healthy,
Degraded,
Unhealthy,
Critical,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthMetrics {
pub error_rate: f64,
pub recovery_success_rate: f64,
pub average_response_time_ms: f64,
pub memory_health_score: f64,
pub stability_score: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailsafeManager {
pub enabled: bool,
pub emergency_protocols: Vec<EmergencyProtocol>,
pub safe_mode_enabled: bool,
pub data_backup_enabled: bool,
pub last_backup: Option<chrono::DateTime<chrono::Utc>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmergencyProtocol {
pub name: String,
pub trigger_conditions: Vec<String>,
pub actions: Vec<String>,
pub priority: u8,
}
impl ErrorRecoverySystem {
pub fn new(config: ErrorRecoveryConfig) -> Self {
let mut system = Self {
config,
recovery_strategies: HashMap::new(),
error_history: VecDeque::new(),
recovery_history: VecDeque::new(),
circuit_breaker: CircuitBreaker {
state: CircuitState::Closed,
failure_count: 0,
last_failure_time: None,
threshold: 5,
timeout_duration: Duration::from_secs(60),
},
health_monitor: SystemHealthMonitor {
overall_health: HealthStatus::Healthy,
component_health: HashMap::new(),
last_health_check: chrono::Utc::now(),
health_metrics: HealthMetrics {
error_rate: 0.0,
recovery_success_rate: 1.0,
average_response_time_ms: 0.0,
memory_health_score: 1.0,
stability_score: 1.0,
},
},
failsafe_manager: FailsafeManager {
enabled: true,
emergency_protocols: Vec::new(),
safe_mode_enabled: false,
data_backup_enabled: true,
last_backup: None,
},
};
system.initialize_default_strategies();
system.initialize_emergency_protocols();
system
}
pub async fn handle_error(&mut self, error: ErrorEvent) -> Result<RecoveryResult> {
if matches!(self.circuit_breaker.state, CircuitState::Open) {
return Ok(RecoveryResult {
success: false,
strategy_used: None,
message: "Circuit breaker is open - recovery attempts suspended".to_string(),
recovery_time: Duration::from_millis(0),
});
}
self.record_error(error.clone());
if self.should_trigger_emergency_protocol(&error) {
return self.execute_emergency_protocol(&error).await;
}
let recovery_result = self.attempt_recovery(&error).await?;
self.update_circuit_breaker(&recovery_result);
self.update_health_metrics(&error, &recovery_result);
Ok(recovery_result)
}
pub fn record_error(&mut self, error: ErrorEvent) {
self.error_history.push_back(error);
while self.error_history.len() > self.config.error_history_limit {
self.error_history.pop_front();
}
}
pub async fn attempt_recovery(&mut self, error: &ErrorEvent) -> Result<RecoveryResult> {
let strategies = self.get_recovery_strategies(&error.error_type);
for (attempt, strategy) in strategies.iter().enumerate() {
if attempt >= self.config.max_retry_attempts {
break;
}
let recovery_event = RecoveryEvent {
id: Uuid::new_v4(),
error_id: error.id,
strategy: strategy.clone(),
start_time: chrono::Utc::now(),
end_time: None,
success: None,
result_message: String::new(),
attempts: attempt + 1,
};
let result = self.execute_recovery_strategy(strategy, error).await?;
let mut updated_event = recovery_event;
updated_event.end_time = Some(chrono::Utc::now());
updated_event.success = Some(result.success);
updated_event.result_message = result.message.clone();
self.recovery_history.push_back(updated_event);
if result.success {
return Ok(result);
}
if attempt < strategies.len() - 1 {
tokio::time::sleep(Duration::from_millis(self.config.retry_delay_ms)).await;
}
}
Ok(RecoveryResult {
success: false,
strategy_used: None,
message: "All recovery strategies failed".to_string(),
recovery_time: Duration::from_millis(0),
})
}
pub async fn execute_recovery_strategy(
&self,
strategy: &RecoveryStrategy,
error: &ErrorEvent,
) -> Result<RecoveryResult> {
let start_time = Instant::now();
let result = match strategy {
RecoveryStrategy::Retry {
max_attempts,
delay_ms,
} => self.execute_retry_strategy(*max_attempts, *delay_ms, error).await,
RecoveryStrategy::Fallback { alternative_method } => {
self.execute_fallback_strategy(alternative_method, error).await
},
RecoveryStrategy::GracefulDegradation {
reduced_functionality,
} => self.execute_degradation_strategy(reduced_functionality, error).await,
RecoveryStrategy::ResourceCleanup { cleanup_type } => {
self.execute_cleanup_strategy(cleanup_type, error).await
},
RecoveryStrategy::SystemReset { component } => {
self.execute_reset_strategy(component, error).await
},
RecoveryStrategy::EmergencyShutdown => self.execute_shutdown_strategy(error).await,
RecoveryStrategy::UserNotification { message } => {
self.execute_notification_strategy(message, error).await
},
RecoveryStrategy::AutomaticRepair { repair_action } => {
self.execute_repair_strategy(repair_action, error).await
},
};
let recovery_time = start_time.elapsed();
match result {
Ok(mut recovery_result) => {
recovery_result.recovery_time = recovery_time;
recovery_result.strategy_used = Some(strategy.clone());
Ok(recovery_result)
},
Err(e) => Ok(RecoveryResult {
success: false,
strategy_used: Some(strategy.clone()),
message: format!("Recovery strategy failed: {}", e),
recovery_time,
}),
}
}
pub fn get_recovery_strategies(&self, error_type: &ErrorType) -> Vec<RecoveryStrategy> {
self.recovery_strategies.get(error_type).cloned().unwrap_or_default()
}
pub async fn check_system_health(&mut self) -> HealthStatus {
self.health_monitor.last_health_check = chrono::Utc::now();
let recent_errors = self
.error_history
.iter()
.filter(|e| {
let age = chrono::Utc::now() - e.timestamp;
age < chrono::Duration::minutes(5)
})
.count();
self.health_monitor.health_metrics.error_rate = recent_errors as f64 / 100.0;
let recent_recoveries = self
.recovery_history
.iter()
.filter(|r| {
if let Some(end_time) = r.end_time {
let age = chrono::Utc::now() - end_time;
age < chrono::Duration::minutes(5)
} else {
false
}
})
.collect::<Vec<_>>();
if !recent_recoveries.is_empty() {
let successful_recoveries =
recent_recoveries.iter().filter(|r| r.success.unwrap_or(false)).count();
self.health_monitor.health_metrics.recovery_success_rate =
successful_recoveries as f64 / recent_recoveries.len() as f64;
}
self.health_monitor.overall_health = if self.health_monitor.health_metrics.error_rate > 0.5
{
HealthStatus::Critical
} else if self.health_monitor.health_metrics.error_rate > 0.2 {
HealthStatus::Unhealthy
} else if self.health_monitor.health_metrics.error_rate > 0.1 {
HealthStatus::Degraded
} else {
HealthStatus::Healthy
};
self.health_monitor.overall_health.clone()
}
pub fn enable_safe_mode(&mut self) {
self.failsafe_manager.safe_mode_enabled = true;
tracing::warn!("Safe mode enabled - operating with reduced functionality");
}
pub fn disable_safe_mode(&mut self) {
self.failsafe_manager.safe_mode_enabled = false;
tracing::info!("Safe mode disabled - full functionality restored");
}
pub fn get_error_statistics(&self) -> ErrorStatistics {
let total_errors = self.error_history.len();
let error_type_counts = self.error_history.iter().fold(HashMap::new(), |mut acc, error| {
*acc.entry(error.error_type.clone()).or_insert(0) += 1;
acc
});
let severity_counts = self.error_history.iter().fold(HashMap::new(), |mut acc, error| {
*acc.entry(format!("{:?}", error.severity)).or_insert(0) += 1;
acc
});
ErrorStatistics {
total_errors,
error_type_counts,
severity_counts,
recovery_success_rate: self.health_monitor.health_metrics.recovery_success_rate,
circuit_breaker_state: self.circuit_breaker.state.clone(),
system_health: self.health_monitor.overall_health.clone(),
}
}
fn initialize_default_strategies(&mut self) {
self.recovery_strategies.insert(
ErrorType::TensorInspectionError,
vec![
RecoveryStrategy::Retry {
max_attempts: 3,
delay_ms: 100,
},
RecoveryStrategy::ResourceCleanup {
cleanup_type: "tensor_cache".to_string(),
},
RecoveryStrategy::Fallback {
alternative_method: "simplified_inspection".to_string(),
},
],
);
self.recovery_strategies.insert(
ErrorType::GradientDebuggingError,
vec![
RecoveryStrategy::Retry {
max_attempts: 2,
delay_ms: 200,
},
RecoveryStrategy::GracefulDegradation {
reduced_functionality: "basic_gradient_info".to_string(),
},
],
);
self.recovery_strategies.insert(
ErrorType::MemoryProfilingError,
vec![
RecoveryStrategy::ResourceCleanup {
cleanup_type: "memory_profiler".to_string(),
},
RecoveryStrategy::SystemReset {
component: "memory_tracker".to_string(),
},
],
);
self.recovery_strategies.insert(
ErrorType::ResourceExhaustion,
vec![
RecoveryStrategy::ResourceCleanup {
cleanup_type: "all_caches".to_string(),
},
RecoveryStrategy::GracefulDegradation {
reduced_functionality: "essential_only".to_string(),
},
RecoveryStrategy::EmergencyShutdown,
],
);
}
fn initialize_emergency_protocols(&mut self) {
self.failsafe_manager.emergency_protocols = vec![
EmergencyProtocol {
name: "Memory Exhaustion Protocol".to_string(),
trigger_conditions: vec!["memory_usage > 90%".to_string()],
actions: vec![
"clear_all_caches".to_string(),
"reduce_tracking".to_string(),
],
priority: 1,
},
EmergencyProtocol {
name: "Critical Error Protocol".to_string(),
trigger_conditions: vec!["error_severity == Fatal".to_string()],
actions: vec!["emergency_backup".to_string(), "safe_shutdown".to_string()],
priority: 0,
},
];
}
fn should_trigger_emergency_protocol(&self, error: &ErrorEvent) -> bool {
matches!(error.severity, ErrorSeverity::Fatal)
|| error.context.system_state.memory_usage_mb > 8192 }
async fn execute_emergency_protocol(&mut self, error: &ErrorEvent) -> Result<RecoveryResult> {
tracing::error!(
"Executing emergency protocol for error: {}",
error.error_message
);
self.enable_safe_mode();
if self.failsafe_manager.data_backup_enabled {
self.create_emergency_backup().await?;
}
Ok(RecoveryResult {
success: true,
strategy_used: Some(RecoveryStrategy::EmergencyShutdown),
message: "Emergency protocol executed successfully".to_string(),
recovery_time: Duration::from_millis(0),
})
}
async fn create_emergency_backup(&mut self) -> Result<()> {
tracing::info!("Creating emergency backup");
self.failsafe_manager.last_backup = Some(chrono::Utc::now());
Ok(())
}
fn update_circuit_breaker(&mut self, result: &RecoveryResult) {
if result.success {
self.circuit_breaker.failure_count = 0;
self.circuit_breaker.state = CircuitState::Closed;
} else {
self.circuit_breaker.failure_count += 1;
self.circuit_breaker.last_failure_time = Some(chrono::Utc::now());
if self.circuit_breaker.failure_count >= self.circuit_breaker.threshold {
self.circuit_breaker.state = CircuitState::Open;
}
}
}
fn update_health_metrics(&mut self, _error: &ErrorEvent, _result: &RecoveryResult) {
}
async fn execute_retry_strategy(
&self,
_max_attempts: usize,
_delay_ms: u64,
_error: &ErrorEvent,
) -> Result<RecoveryResult> {
Ok(RecoveryResult {
success: true,
strategy_used: None,
message: "Retry successful".to_string(),
recovery_time: Duration::from_millis(0),
})
}
async fn execute_fallback_strategy(
&self,
_alternative: &str,
_error: &ErrorEvent,
) -> Result<RecoveryResult> {
Ok(RecoveryResult {
success: true,
strategy_used: None,
message: "Fallback strategy executed".to_string(),
recovery_time: Duration::from_millis(0),
})
}
async fn execute_degradation_strategy(
&self,
_functionality: &str,
_error: &ErrorEvent,
) -> Result<RecoveryResult> {
Ok(RecoveryResult {
success: true,
strategy_used: None,
message: "Graceful degradation applied".to_string(),
recovery_time: Duration::from_millis(0),
})
}
async fn execute_cleanup_strategy(
&self,
_cleanup_type: &str,
_error: &ErrorEvent,
) -> Result<RecoveryResult> {
Ok(RecoveryResult {
success: true,
strategy_used: None,
message: "Resource cleanup completed".to_string(),
recovery_time: Duration::from_millis(0),
})
}
async fn execute_reset_strategy(
&self,
_component: &str,
_error: &ErrorEvent,
) -> Result<RecoveryResult> {
Ok(RecoveryResult {
success: true,
strategy_used: None,
message: "Component reset completed".to_string(),
recovery_time: Duration::from_millis(0),
})
}
async fn execute_shutdown_strategy(&self, _error: &ErrorEvent) -> Result<RecoveryResult> {
Ok(RecoveryResult {
success: true,
strategy_used: None,
message: "Emergency shutdown initiated".to_string(),
recovery_time: Duration::from_millis(0),
})
}
async fn execute_notification_strategy(
&self,
message: &str,
_error: &ErrorEvent,
) -> Result<RecoveryResult> {
tracing::warn!("User notification: {}", message);
Ok(RecoveryResult {
success: true,
strategy_used: None,
message: "User notified".to_string(),
recovery_time: Duration::from_millis(0),
})
}
async fn execute_repair_strategy(
&self,
_repair_action: &str,
_error: &ErrorEvent,
) -> Result<RecoveryResult> {
Ok(RecoveryResult {
success: true,
strategy_used: None,
message: "Automatic repair completed".to_string(),
recovery_time: Duration::from_millis(0),
})
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryResult {
pub success: bool,
pub strategy_used: Option<RecoveryStrategy>,
pub message: String,
pub recovery_time: Duration,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorStatistics {
pub total_errors: usize,
pub error_type_counts: HashMap<ErrorType, usize>,
pub severity_counts: HashMap<String, usize>,
pub recovery_success_rate: f64,
pub circuit_breaker_state: CircuitState,
pub system_health: HealthStatus,
}
#[cfg(test)]
mod tests {
use super::*;
fn make_error_event(error_type: ErrorType) -> ErrorEvent {
ErrorEvent {
id: Uuid::new_v4(),
error_type,
error_message: "test error".to_string(),
component: "test_component".to_string(),
severity: ErrorSeverity::Medium,
timestamp: chrono::Utc::now(),
context: ErrorContext {
session_id: Uuid::new_v4(),
operation: "test_op".to_string(),
parameters: HashMap::new(),
system_state: SystemState {
memory_usage_mb: 1024,
cpu_usage_percent: 50.0,
active_tensors: 4,
active_sessions: 1,
uptime_seconds: 100,
},
},
stack_trace: None,
}
}
#[test]
fn test_config_default_fields() {
let cfg = ErrorRecoveryConfig::default();
assert!(cfg.enabled);
assert!(cfg.max_retry_attempts > 0);
assert!(cfg.circuit_breaker_threshold > 0);
assert!(cfg.error_history_limit > 0);
}
#[test]
fn test_system_new_initializes_strategies() {
let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
let strategies = system.get_recovery_strategies(&ErrorType::TensorInspectionError);
assert!(!strategies.is_empty());
}
#[test]
fn test_system_new_circuit_breaker_closed() {
let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
assert!(matches!(system.circuit_breaker.state, CircuitState::Closed));
}
#[test]
fn test_record_error_adds_to_history() {
let mut system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
let event = make_error_event(ErrorType::IOError);
system.record_error(event);
assert_eq!(system.error_history.len(), 1);
}
#[test]
fn test_record_error_respects_history_limit() {
let mut cfg = ErrorRecoveryConfig::default();
cfg.error_history_limit = 3;
let mut system = ErrorRecoverySystem::new(cfg);
for _ in 0..5 {
system.record_error(make_error_event(ErrorType::NetworkError));
}
assert_eq!(system.error_history.len(), 3);
}
#[test]
fn test_enable_disable_safe_mode() {
let mut system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
assert!(!system.failsafe_manager.safe_mode_enabled);
system.enable_safe_mode();
assert!(system.failsafe_manager.safe_mode_enabled);
system.disable_safe_mode();
assert!(!system.failsafe_manager.safe_mode_enabled);
}
#[test]
fn test_error_statistics_empty() {
let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
let stats = system.get_error_statistics();
assert_eq!(stats.total_errors, 0);
assert!(matches!(stats.circuit_breaker_state, CircuitState::Closed));
assert!(matches!(stats.system_health, HealthStatus::Healthy));
}
#[test]
fn test_error_statistics_with_errors() {
let mut system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
system.record_error(make_error_event(ErrorType::IOError));
system.record_error(make_error_event(ErrorType::NetworkError));
let stats = system.get_error_statistics();
assert_eq!(stats.total_errors, 2);
assert_eq!(
stats.error_type_counts.get(&ErrorType::IOError).copied().unwrap_or(0),
1
);
}
#[test]
fn test_error_type_variants() {
let types = [
ErrorType::TensorInspectionError,
ErrorType::GradientDebuggingError,
ErrorType::ModelDiagnosticsError,
ErrorType::VisualizationError,
ErrorType::MemoryProfilingError,
ErrorType::IOError,
ErrorType::NetworkError,
ErrorType::ResourceExhaustion,
ErrorType::ConfigurationError,
ErrorType::DataCorruption,
ErrorType::SystemFailure,
ErrorType::UserError,
];
for t in &types {
assert!(!format!("{:?}", t).is_empty());
}
}
#[test]
fn test_error_severity_variants() {
let severities = [
ErrorSeverity::Low,
ErrorSeverity::Medium,
ErrorSeverity::High,
ErrorSeverity::Critical,
ErrorSeverity::Fatal,
];
for s in &severities {
assert!(!format!("{:?}", s).is_empty());
}
}
#[test]
fn test_recovery_strategy_variants() {
let strats = [
RecoveryStrategy::Retry {
max_attempts: 3,
delay_ms: 100,
},
RecoveryStrategy::Fallback {
alternative_method: "alt".to_string(),
},
RecoveryStrategy::GracefulDegradation {
reduced_functionality: "basic".to_string(),
},
RecoveryStrategy::ResourceCleanup {
cleanup_type: "cache".to_string(),
},
RecoveryStrategy::SystemReset {
component: "comp".to_string(),
},
RecoveryStrategy::EmergencyShutdown,
RecoveryStrategy::UserNotification {
message: "msg".to_string(),
},
RecoveryStrategy::AutomaticRepair {
repair_action: "repair".to_string(),
},
];
for s in &strats {
assert!(!format!("{:?}", s).is_empty());
}
}
#[test]
fn test_circuit_state_variants() {
let states = [
CircuitState::Closed,
CircuitState::Open,
CircuitState::HalfOpen,
];
for s in &states {
assert!(!format!("{:?}", s).is_empty());
}
}
#[test]
fn test_health_status_variants() {
let statuses = [
HealthStatus::Healthy,
HealthStatus::Degraded,
HealthStatus::Unhealthy,
HealthStatus::Critical,
];
for s in &statuses {
assert!(!format!("{:?}", s).is_empty());
}
}
#[test]
fn test_circuit_breaker_initial_state() {
let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
assert_eq!(system.circuit_breaker.failure_count, 0);
assert!(system.circuit_breaker.last_failure_time.is_none());
assert_eq!(system.circuit_breaker.threshold, 5);
}
#[test]
fn test_system_state_construction() {
let state = SystemState {
memory_usage_mb: 2048,
cpu_usage_percent: 75.5,
active_tensors: 10,
active_sessions: 2,
uptime_seconds: 3600,
};
assert_eq!(state.memory_usage_mb, 2048);
assert!((state.cpu_usage_percent - 75.5).abs() < 1e-6);
}
#[test]
fn test_health_metrics_initial_values() {
let system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
let m = &system.health_monitor.health_metrics;
assert_eq!(m.error_rate, 0.0);
assert_eq!(m.recovery_success_rate, 1.0);
}
#[tokio::test]
async fn test_handle_error_with_open_circuit_breaker() {
let mut system = ErrorRecoverySystem::new(ErrorRecoveryConfig::default());
system.circuit_breaker.state = CircuitState::Open;
let event = make_error_event(ErrorType::IOError);
let result = system.handle_error(event).await.expect("should succeed");
assert!(!result.success);
assert!(result.message.contains("Circuit breaker"));
}
}