1use serde::{Deserialize, Serialize};
8use sklears_core::error::Result as SklResult;
9use std::collections::HashMap;
10use std::time::{Duration, SystemTime};
11
/// Interface for managing fault-tolerance sessions.
///
/// Every method addresses a session by its `session_id` string and returns an
/// [`SklResult`]. Implementors must be `Send + Sync`. Only signatures are
/// declared here; the per-method notes are read from names and types and
/// should be confirmed against the implementing type.
pub trait FaultToleranceManager: Send + Sync {
    /// Sets up fault tolerance for `session_id` with `config` and returns the
    /// resulting [`FaultToleranceSession`].
    fn initialize_fault_tolerance(
        &mut self,
        session_id: String,
        config: FaultToleranceConfig,
    ) -> SklResult<FaultToleranceSession>;

    /// Registers `component` under `session_id` and returns a
    /// [`ComponentHandle`] for it.
    fn register_component(
        &mut self,
        session_id: String,
        component: FaultToleranceComponent,
    ) -> SklResult<ComponentHandle>;

    /// Submits `fault` for `session_id`; the returned [`FaultResponse`] tells
    /// the caller how the report was handled.
    fn report_fault(&mut self, session_id: String, fault: FaultReport) -> SklResult<FaultResponse>;

    /// Returns the [`FaultToleranceSessionStatus`] of `session_id`.
    /// Takes `&self`, unlike the other methods.
    fn get_session_status(&self, session_id: String) -> SklResult<FaultToleranceSessionStatus>;

    /// Shuts down fault tolerance for `session_id` and returns a
    /// [`FaultToleranceReport`].
    fn shutdown_fault_tolerance(&mut self, session_id: String) -> SklResult<FaultToleranceReport>;
}
73
/// State of one fault-tolerance session: its configuration, attached
/// components and circuit breakers, recovery history, and metadata.
#[derive(Debug, Clone)]
pub struct FaultToleranceSession {
    /// Identifier of the session.
    pub session_id: String,
    /// Timestamp recorded as the session start.
    pub start_time: SystemTime,
    /// Configuration carried by the session.
    pub config: FaultToleranceConfig,
    /// Handles of the components attached to the session.
    pub components: Vec<ComponentHandle>,
    /// Current lifecycle status.
    pub status: FaultToleranceSessionStatus,
    /// Handles of the circuit breakers attached to the session.
    pub circuit_breakers: Vec<CircuitBreakerHandle>,
    /// Recorded recovery entries.
    pub recovery_history: Vec<RecoveryHistoryEntry>,
    /// Tags, custom key/value pairs, metrics, and overrides.
    pub metadata: FaultToleranceMetadata,
}
97
/// Lifecycle status of a fault-tolerance session.
///
/// The transitions between states are not defined in this file.
#[derive(Debug, Clone, PartialEq)]
pub enum FaultToleranceSessionStatus {
    /// Session is being set up.
    Initializing,
    /// Session is active.
    Active,
    /// Session is degraded; carries the number of failed components.
    Degraded { failed_components: usize },
    /// Session is in recovery.
    Recovery,
    /// Session is suspended.
    Suspended,
    /// Session is shutting down.
    ShuttingDown,
    /// Session has terminated.
    Terminated,
}
116
/// Top-level fault-tolerance configuration.
///
/// Holds a few global settings plus one nested configuration per area
/// (recovery, circuit breaker, retry, bulkhead, health check, performance,
/// advanced). A `Default` implementation exists elsewhere in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FaultToleranceConfig {
    /// Master on/off flag.
    pub enabled: bool,
    /// Sensitivity setting. The valid range is not established here; the
    /// `Default` impl uses `0.7`.
    pub sensitivity: f64,
    /// Upper bound on concurrent recoveries.
    pub max_concurrent_recoveries: usize,
    /// Global timeout.
    pub global_timeout: Duration,
    /// Recovery settings.
    pub recovery_config: RecoveryConfig,
    /// Circuit breaker settings.
    pub circuit_breaker_config: CircuitBreakerConfig,
    /// Retry settings.
    pub retry_config: RetryConfig,
    /// Bulkhead (isolation) settings.
    pub bulkhead_config: BulkheadConfig,
    /// Health check settings.
    pub health_check_config: HealthCheckConfig,
    /// Performance monitoring settings.
    pub performance_config: PerformanceConfig,
    /// Optional feature toggles.
    pub advanced_config: AdvancedConfig,
}
143
/// Session-wide recovery settings, including nested validation and
/// escalation configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryConfig {
    /// Whether recovery is automatic.
    pub automatic_recovery: bool,
    /// Timeout for a recovery.
    pub recovery_timeout: Duration,
    /// Maximum number of recovery attempts.
    pub max_recovery_attempts: usize,
    /// Strategy kinds as an ordered list (presumably highest priority first
    /// — confirm against the consumer).
    pub strategy_priority: Vec<RecoveryStrategyType>,
    /// Settings for validating a recovery.
    pub validation: RecoveryValidationConfig,
    /// Settings for escalating a recovery.
    pub escalation: RecoveryEscalationConfig,
}
160
/// Kind of recovery strategy, without parameters.
///
/// The first six variants share names with the parameterised variants of
/// `RecoveryStrategy`; `Rollback` exists only here.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum RecoveryStrategyType {
    Restart,
    Failover,
    Scale,
    Reset,
    Rollback,
    Manual,
    /// Strategy identified by a caller-supplied string.
    Custom(String),
}
179
/// Settings for validating a recovery.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryValidationConfig {
    /// Whether validation is enabled.
    pub enabled: bool,
    /// Timeout for validation.
    pub timeout: Duration,
    /// Criteria to evaluate.
    pub criteria: Vec<ValidationCriterion>,
    /// How deep validation goes.
    pub depth: ValidationDepth,
}
192
/// A single named validation criterion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationCriterion {
    /// Name of the criterion.
    pub name: String,
    /// Category of the criterion.
    pub criterion_type: CriterionType,
    /// Expected value, stored as a string. How it is parsed is not defined
    /// in this file.
    pub expected_value: String,
    /// Tolerance applied to the comparison (scale/units not defined here).
    pub tolerance: f64,
    /// Weight of this criterion relative to others.
    pub weight: f64,
}
207
/// Category of a [`ValidationCriterion`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum CriterionType {
    HealthCheck,
    PerformanceMetric,
    ResourceUtilization,
    BusinessMetric,
    /// Category identified by a caller-supplied string.
    Custom(String),
}
222
/// Depth levels for recovery validation, declared from `Shallow` to
/// `Comprehensive`. What each level checks is not defined in this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ValidationDepth {
    Shallow,
    Medium,
    Deep,
    Comprehensive,
}
235
/// Settings for escalating a recovery through a list of levels.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryEscalationConfig {
    /// Whether escalation is enabled.
    pub enabled: bool,
    /// Escalation levels.
    pub levels: Vec<EscalationLevel>,
    /// Escalation timeout.
    pub timeout: Duration,
    /// Notification settings used with escalation.
    pub notifications: NotificationConfig,
}
248
/// One level in an escalation chain.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EscalationLevel {
    /// Numeric level.
    pub level: u32,
    /// Name of the level.
    pub name: String,
    /// Strategy kinds associated with this level.
    pub strategies: Vec<RecoveryStrategyType>,
    /// Timeout for this level.
    pub timeout: Duration,
    /// Whether this level requires approval.
    pub requires_approval: bool,
    /// Notification channels, as strings (presumably matching
    /// `NotificationChannel::name` — confirm).
    pub notification_channels: Vec<String>,
}
265
/// Notification settings: channels, message templates, and rate limiting.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NotificationConfig {
    /// Whether notifications are enabled.
    pub enabled: bool,
    /// Configured channels.
    pub channels: Vec<NotificationChannel>,
    /// Templates keyed by string.
    pub templates: HashMap<String, NotificationTemplate>,
    /// Rate limit settings.
    pub rate_limit: RateLimitConfig,
}
278
/// A named notification channel.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NotificationChannel {
    /// Name of the channel.
    pub name: String,
    /// Kind of channel.
    pub channel_type: ChannelType,
    /// Channel settings as string key/value pairs.
    pub config: HashMap<String, String>,
    /// Priority threshold for this channel (the comparison direction is not
    /// defined in this file).
    pub priority_threshold: Priority,
}
291
/// Kind of notification channel.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ChannelType {
    Email,
    Slack,
    PagerDuty,
    Webhook,
    SMS,
    /// Channel kind identified by a caller-supplied string.
    Custom(String),
}
308
309#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)]
311pub enum Priority {
312 Low = 1,
313 Medium = 2,
314 High = 3,
315 Critical = 4,
316 Emergency = 5,
317}
318
/// A notification message template.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NotificationTemplate {
    /// Name of the template.
    pub name: String,
    /// Subject text.
    pub subject: String,
    /// Body text.
    pub body: String,
    /// Variable names associated with the template (the substitution syntax
    /// is not defined in this file).
    pub variables: Vec<String>,
}
331
/// Rate limit expressed as a count per time window plus a burst allowance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RateLimitConfig {
    /// Maximum count per window.
    pub max_per_window: usize,
    /// Length of the window.
    pub window_duration: Duration,
    /// Burst allowance.
    pub burst_allowance: usize,
}
342
/// Circuit breaker settings.
///
/// A `Default` implementation exists in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CircuitBreakerConfig {
    /// Whether circuit breaking is enabled.
    pub enabled: bool,
    /// Failure threshold (default `5`).
    pub failure_threshold: usize,
    /// Success threshold (default `3`).
    pub success_threshold: usize,
    /// Timeout (default 60 seconds).
    pub timeout: Duration,
    /// Maximum calls while half-open (default `3`).
    pub half_open_max_calls: usize,
    /// Named policies; each carries its own component patterns.
    pub policies: Vec<CircuitBreakerPolicy>,
    /// Failure detection settings.
    pub failure_detection: FailureDetectionConfig,
    /// Analytics settings, defined in `crate::circuit_breaker::analytics_engine`.
    pub analytics: crate::circuit_breaker::analytics_engine::AnalyticsConfig,
}
363
364impl Default for CircuitBreakerConfig {
365 fn default() -> Self {
366 Self {
367 enabled: true,
368 failure_threshold: 5,
369 success_threshold: 3,
370 timeout: Duration::from_secs(60),
371 half_open_max_calls: 3,
372 policies: Vec::new(),
373 failure_detection: FailureDetectionConfig::default(),
374 analytics: crate::circuit_breaker::analytics_engine::AnalyticsConfig::default(),
375 }
376 }
377}
378
/// A named circuit breaker policy tied to a set of component patterns.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CircuitBreakerPolicy {
    /// Name of the policy.
    pub name: String,
    /// Component patterns, as strings (the pattern syntax is not defined in
    /// this file).
    pub component_patterns: Vec<String>,
    /// Failure detection settings for this policy.
    pub failure_detection: FailureDetectionConfig,
    /// Recovery settings for this policy.
    pub recovery_configuration: CircuitRecoveryConfig,
}
391
/// Failure detection settings.
///
/// A `Default` implementation exists elsewhere in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailureDetectionConfig {
    /// Failure patterns to consider.
    pub patterns: Vec<FailurePattern>,
    /// Window size (default `100`; the unit is not stated in this file).
    pub window_size: usize,
    /// Minimum request count (default `10`).
    pub min_requests: usize,
    /// Statistical settings.
    pub statistics: StatisticalConfig,
}
404
/// A named, weighted failure pattern.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailurePattern {
    /// Name of the pattern.
    pub name: String,
    /// Kind of pattern.
    pub pattern_type: PatternType,
    /// Pattern expression, as a string (the syntax is not defined in this
    /// file).
    pub expression: String,
    /// Weight of the pattern.
    pub weight: f64,
}
417
/// Kind of [`FailurePattern`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum PatternType {
    ErrorRate,
    ResponseTime,
    ResourceUtilization,
    /// Pattern kind identified by a caller-supplied string.
    Custom(String),
}
430
/// Statistical settings used by failure detection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalConfig {
    /// Statistical method.
    pub method: StatisticalMethod,
    /// Confidence level (`FailureDetectionConfig::default` uses `0.95`).
    pub confidence_level: f64,
    /// Whether outlier detection is on.
    pub outlier_detection: bool,
    /// Whether trend analysis is on.
    pub trend_analysis: bool,
}
443
/// Statistical method selector.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum StatisticalMethod {
    Simple,
    ExponentialMovingAverage,
    WeightedAverage,
    /// Percentile with an `f64` parameter (whether the scale is 0–1 or 0–100
    /// is not defined in this file).
    Percentile(f64),
    StandardDeviation,
    /// Method identified by a caller-supplied string.
    Custom(String),
}
460
/// Recovery settings for a circuit breaker policy.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CircuitRecoveryConfig {
    /// Selected recovery strategy.
    pub strategy: CircuitRecoveryStrategy,
    /// Settings for progressive recovery.
    pub progressive: ProgressiveRecoveryConfig,
    /// Settings for health-check-based recovery.
    pub health_check: CircuitHealthCheckConfig,
}
471
/// Circuit recovery strategy selector.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum CircuitRecoveryStrategy {
    Immediate,
    Progressive,
    HealthCheckBased,
    TimeBased,
    /// Strategy identified by a caller-supplied string.
    Custom(String),
}
486
/// Settings for progressive recovery, expressed as percentages and a check
/// interval. Whether percentages are 0–1 or 0–100 is not defined in this
/// file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProgressiveRecoveryConfig {
    /// Starting percentage.
    pub initial_percentage: f64,
    /// Increment per step.
    pub increment_percentage: f64,
    /// Maximum percentage.
    pub max_percentage: f64,
    /// Interval between checks.
    pub check_interval: Duration,
}
499
/// Health check settings used for circuit recovery.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CircuitHealthCheckConfig {
    /// Endpoint to check, as a string.
    pub endpoint: String,
    /// Interval between checks.
    pub interval: Duration,
    /// Timeout for a check.
    pub timeout: Duration,
    /// Expected response, as a string.
    pub expected_response: String,
}
512
/// Retry settings.
///
/// A `Default` implementation exists elsewhere in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryConfig {
    /// Whether retries are enabled.
    pub enabled: bool,
    /// Maximum number of attempts (default `3`).
    pub max_attempts: usize,
    /// Base delay (default 100 ms).
    pub base_delay: Duration,
    /// Maximum delay (default 30 s).
    pub max_delay: Duration,
    /// Backoff strategy.
    pub backoff_strategy: BackoffStrategy,
    /// Conditions associated with retrying.
    pub retry_conditions: Vec<RetryCondition>,
    /// Jitter settings.
    pub jitter: JitterConfig,
}
531
/// Backoff strategy selector for retries.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum BackoffStrategy {
    Fixed,
    Linear,
    /// Exponential backoff with a multiplier (the defaults in this file use
    /// `2.0`).
    Exponential { multiplier: f64 },
    Fibonacci,
    /// Strategy identified by a caller-supplied string.
    Custom(String),
}
546
/// A named retry condition made of error patterns, status codes, and an
/// optional custom condition. How these are matched is not defined in this
/// file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryCondition {
    /// Name of the condition.
    pub name: String,
    /// Error patterns, as strings.
    pub error_patterns: Vec<String>,
    /// Status codes (`RetryConfig::default` uses `503`, which suggests HTTP
    /// codes — confirm).
    pub status_codes: Vec<u16>,
    /// Optional custom condition, as a string.
    pub custom_condition: Option<String>,
}
559
/// Jitter settings for retries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JitterConfig {
    /// Whether jitter is enabled.
    pub enabled: bool,
    /// Kind of jitter.
    pub jitter_type: JitterType,
    /// Jitter amount (the defaults in this file use `0.1`; the scale is not
    /// defined here).
    pub amount: f64,
}
570
/// Kind of jitter. The computation behind each variant is not defined in
/// this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum JitterType {
    None,
    Full,
    Equal,
    Decorrelated,
}
582
/// Bulkhead settings: default isolation, per-component overrides, and
/// resource pools.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BulkheadConfig {
    /// Whether bulkheads are enabled.
    pub enabled: bool,
    /// Isolation settings used by default.
    pub default_isolation: IsolationSettings,
    /// Isolation settings keyed by string (presumably a component id or
    /// name — confirm).
    pub component_isolation: HashMap<String, IsolationSettings>,
    /// Resource pool definitions.
    pub resource_pools: Vec<ResourcePoolConfig>,
}
595
/// Isolation limits: concurrency, queue size and timeout, and mechanism.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IsolationSettings {
    /// Maximum concurrent calls.
    pub max_concurrent_calls: usize,
    /// Queue size.
    pub queue_size: usize,
    /// Queue timeout.
    pub queue_timeout: Duration,
    /// Isolation mechanism.
    pub isolation_type: IsolationType,
}
608
/// Isolation mechanism selector.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum IsolationType {
    ThreadPool,
    Semaphore,
    Actor,
    /// Mechanism identified by a caller-supplied string.
    Custom(String),
}
621
/// Definition of a named resource pool.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResourcePoolConfig {
    /// Name of the pool.
    pub name: String,
    /// Kind of resource pooled.
    pub resource_type: ResourceType,
    /// Pool size.
    pub size: usize,
    /// Optional maximum size.
    pub max_size: Option<usize>,
    /// Timeout associated with the pool.
    pub timeout: Duration,
    /// Pool management strategy.
    pub management_strategy: PoolManagementStrategy,
}
638
/// Kind of pooled resource.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ResourceType {
    Thread,
    Connection,
    Memory,
    CPU,
    /// Resource kind identified by a caller-supplied string.
    Custom(String),
}
653
/// Pool management strategy selector.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum PoolManagementStrategy {
    FIFO,
    LIFO,
    LeastRecentlyUsed,
    MostRecentlyUsed,
    /// Strategy identified by a caller-supplied string.
    Custom(String),
}
668
/// Health check settings: a default interval and timeout, per-component-type
/// overrides, and score thresholds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckConfig {
    /// Whether health checks are enabled.
    pub enabled: bool,
    /// Interval used by default.
    pub default_interval: Duration,
    /// Timeout for a check.
    pub timeout: Duration,
    /// Health settings keyed by component type.
    ///
    /// NOTE(review): `ComponentType` has struct-like variants
    /// (`ExternalService`, `Custom`). Formats that require string map keys,
    /// such as JSON via `serde_json`, would presumably reject those keys at
    /// serialization time — confirm against the formats actually used.
    pub component_configs: HashMap<ComponentType, ComponentHealthConfig>,
    /// Health score thresholds.
    pub thresholds: HealthThresholds,
}
683
/// Health check settings for a single component or component type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentHealthConfig {
    /// Interval between checks.
    pub interval: Duration,
    /// Kind of check.
    pub check_type: HealthCheckType,
    /// Endpoint to check, as a string.
    pub endpoint: String,
    /// Optional expected response.
    pub expected_response: Option<String>,
    /// Failure threshold.
    pub failure_threshold: usize,
    /// Success threshold.
    pub success_threshold: usize,
}
700
/// Kind of health check, with the parameters each kind carries.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum HealthCheckType {
    /// HTTP check with a method string and header map.
    Http {
        method: String,
        headers: HashMap<String, String>,
    },
    /// TCP check against a host and port.
    Tcp { host: String, port: u16 },
    /// Check identified by a function name.
    Function { function_name: String },
    /// Check identified by a resource type string.
    Resource { resource_type: String },
    /// Check identified by a caller-supplied name.
    Custom { check_name: String },
}
718
/// Health score thresholds.
///
/// `FaultToleranceConfig::default` uses 0.3 / 0.6 / 0.8 / 0.95 for
/// critical / warning / good / excellent, which suggests a 0–1 score —
/// confirm against the consumer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthThresholds {
    pub critical: f64,
    pub warning: f64,
    pub good: f64,
    pub excellent: f64,
}
731
/// Performance monitoring settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceConfig {
    /// Whether performance monitoring is enabled.
    pub enabled: bool,
    /// Interval between collections.
    pub collection_interval: Duration,
    /// Per-metric thresholds.
    pub thresholds: PerformanceThresholds,
    /// Which resources are monitored.
    pub resource_monitoring: ResourceMonitoringConfig,
    /// Optimization settings.
    pub optimization: PerformanceOptimizationConfig,
}
746
/// One [`ThresholdConfig`] per monitored performance metric.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceThresholds {
    pub response_time: ThresholdConfig,
    pub throughput: ThresholdConfig,
    pub error_rate: ThresholdConfig,
    pub cpu_utilization: ThresholdConfig,
    pub memory_utilization: ThresholdConfig,
}
761
/// Warning and critical levels for one metric, plus a unit label.
///
/// The comparison direction is not encoded here. In
/// `FaultToleranceConfig::default`, `warning < critical` for response time,
/// but `warning > critical` for throughput.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThresholdConfig {
    /// Warning level.
    pub warning: f64,
    /// Critical level.
    pub critical: f64,
    /// Unit label (the defaults in this file use `"ms"`, `"rps"`, `"ratio"`).
    pub unit: String,
}
772
/// Toggles for built-in resource monitors plus a list of custom monitors.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResourceMonitoringConfig {
    pub monitor_cpu: bool,
    pub monitor_memory: bool,
    pub monitor_disk: bool,
    pub monitor_network: bool,
    /// Additional monitors defined by name, type, and settings.
    pub custom_monitors: Vec<CustomResourceMonitor>,
}
787
/// A custom resource monitor described entirely by strings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CustomResourceMonitor {
    /// Name of the monitor.
    pub name: String,
    /// Monitor type, as a string.
    pub monitor_type: String,
    /// Settings as string key/value pairs.
    pub config: HashMap<String, String>,
}
798
/// Performance optimization settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceOptimizationConfig {
    /// Whether optimization is automatic.
    pub auto_optimization: bool,
    /// Optimization strategies.
    pub strategies: Vec<OptimizationStrategy>,
    /// Interval between optimizations.
    pub optimization_interval: Duration,
    /// Learning settings.
    pub learning: LearningConfig,
}
811
/// Optimization strategy selector.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum OptimizationStrategy {
    LoadBalancing,
    Caching,
    ResourceScaling,
    RequestBatching,
    /// Strategy identified by a caller-supplied string.
    Custom(String),
}
826
/// Learning settings used by performance optimization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LearningConfig {
    /// Whether learning is enabled.
    pub enabled: bool,
    /// Selected algorithm.
    pub algorithm: LearningAlgorithm,
    /// How long data is retained.
    pub data_retention: Duration,
    /// Interval between updates.
    pub update_interval: Duration,
}
839
/// Learning algorithm selector.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum LearningAlgorithm {
    LinearRegression,
    DecisionTree,
    NeuralNetwork,
    ReinforcementLearning,
    /// Algorithm identified by a caller-supplied string.
    Custom(String),
}
854
/// Boolean toggles for optional features, plus a map of custom toggles.
/// `FaultToleranceConfig::default` turns every named toggle off.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AdvancedConfig {
    pub predictive_analytics: bool,
    pub chaos_engineering: bool,
    pub security_monitoring: bool,
    pub compliance_reporting: bool,
    /// Additional feature toggles keyed by string.
    pub custom_features: HashMap<String, bool>,
}
869
/// Description of a component, as passed to
/// `FaultToleranceManager::register_component`.
#[derive(Debug, Clone)]
pub struct FaultToleranceComponent {
    /// Kind of component.
    pub component_type: ComponentType,
    /// Identifier of the component.
    pub component_id: String,
    /// Name of the component.
    pub name: String,
    /// Description text.
    pub description: String,
    /// Health check settings for the component.
    pub health_config: ComponentHealthConfig,
    /// Recovery settings for the component.
    pub recovery_config: ComponentRecoveryConfig,
    /// Fault tolerance policies attached to the component.
    pub policies: Vec<FaultTolerancePolicy>,
    /// Version, ownership, and other metadata.
    pub metadata: ComponentMetadata,
    /// Dependencies, as strings (presumably component ids — confirm).
    pub dependencies: Vec<String>,
    /// Free-form tags.
    pub tags: Vec<String>,
}
894
/// Kind of component.
///
/// Derives `Hash` and `Eq`, and is used as the key of
/// `HealthCheckConfig::component_configs`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Hash, Eq)]
pub enum ComponentType {
    ExecutionEngine,
    TaskScheduler,
    ResourceManager,
    MonitoringSystem,
    StorageSystem,
    NetworkInterface,
    Database,
    MessageQueue,
    Cache,
    /// External service identified by name.
    ExternalService { service_name: String },
    /// Component kind identified by a caller-supplied type name.
    Custom { type_name: String },
}
921
/// Recovery settings for a single component.
///
/// Unlike `RecoveryConfig`, this carries parameterised [`RecoveryStrategy`]
/// values and a cooldown period, and does not derive serde traits.
#[derive(Debug, Clone)]
pub struct ComponentRecoveryConfig {
    /// Whether recovery is automatic.
    pub automatic_recovery: bool,
    /// Recovery strategies with their parameters.
    pub strategies: Vec<RecoveryStrategy>,
    /// Timeout for a recovery.
    pub recovery_timeout: Duration,
    /// Maximum number of recovery attempts.
    pub max_recovery_attempts: usize,
    /// Cooldown period.
    pub cooldown_period: Duration,
    /// Settings for validating a recovery.
    pub validation: RecoveryValidationConfig,
}
938
/// Recovery strategy together with the parameters it carries.
///
/// `RecoveryStrategyType` is the parameterless counterpart.
#[derive(Debug, Clone)]
pub enum RecoveryStrategy {
    /// Restart with a delay and an optional cleanup flag.
    Restart {
        restart_delay: Duration,
        cleanup_before_restart: bool,
    },
    /// Failover to a named backup component after a delay.
    Failover {
        backup_component: String,
        failover_delay: Duration,
    },
    /// Scale by a factor, with a timeout.
    Scale {
        scale_factor: f64,
        scale_timeout: Duration,
    },
    /// Reset to a named checkpoint, with a timeout.
    Reset {
        checkpoint: String,
        reset_timeout: Duration,
    },
    /// Manual recovery with notification channels and instructions.
    Manual {
        notification_channels: Vec<String>,
        instructions: String,
    },
    /// Strategy identified by name, with string parameters.
    Custom {
        strategy_name: String,
        parameters: HashMap<String, String>,
    },
}
973
/// A fault tolerance policy attached to a component, with the parameters
/// each policy kind carries.
#[derive(Debug, Clone)]
pub enum FaultTolerancePolicy {
    /// Retry settings.
    RetryPolicy {
        max_attempts: usize,
        backoff_strategy: BackoffStrategy,
        retry_conditions: Vec<RetryCondition>,
    },
    /// Circuit breaker settings.
    CircuitBreakerPolicy {
        failure_threshold: usize,
        recovery_timeout: Duration,
        half_open_max_calls: usize,
    },
    /// Bulkhead settings.
    BulkheadPolicy {
        max_concurrent_calls: usize,
        queue_size: usize,
        timeout: Duration,
    },
    /// Fallback action plus the conditions associated with it.
    FallbackPolicy {
        fallback_action: FallbackAction,
        fallback_conditions: Vec<FallbackCondition>,
    },
    /// Timeout plus the action associated with it.
    TimeoutPolicy {
        timeout: Duration,
        timeout_action: TimeoutAction,
    },
    /// Request limit per time window plus the action associated with it.
    RateLimitPolicy {
        max_requests: usize,
        time_window: Duration,
        rate_limit_action: RateLimitAction,
    },
    /// Policy identified by name, with string parameters.
    CustomPolicy {
        policy_name: String,
        parameters: HashMap<String, String>,
    },
}
1017
/// Fallback action, used by `FaultTolerancePolicy::FallbackPolicy` and
/// `TimeoutAction::Fallback`.
#[derive(Debug, Clone)]
pub enum FallbackAction {
    /// A default value, as a string.
    DefaultValue { value: String },
    /// An alternative service, by name.
    AlternativeService { service_name: String },
    /// A cached response, by cache key.
    CachedResponse { cache_key: String },
    /// A queue, by name.
    QueueRequest { queue_name: String },
    /// A manual action.
    Manual { action: ManualAction },
    /// Action identified by name, with string parameters.
    Custom {
        action_name: String,
        parameters: HashMap<String, String>,
    },
}
1037
/// A named fallback condition with string patterns and a priority.
#[derive(Debug, Clone)]
pub struct FallbackCondition {
    /// Name of the condition.
    pub name: String,
    /// Patterns, as strings (the matching rules are not defined in this
    /// file).
    pub patterns: Vec<String>,
    /// Priority (ordering direction not defined in this file).
    pub priority: i32,
}
1048
/// Action associated with a `FaultTolerancePolicy::TimeoutPolicy`.
#[derive(Debug, Clone)]
pub enum TimeoutAction {
    Cancel,
    PartialResult,
    /// Extend the timeout by `extension`.
    ExtendTimeout { extension: Duration },
    /// Use a fallback action.
    Fallback { action: FallbackAction },
    /// Action identified by a caller-supplied name.
    Custom { action_name: String },
}
1063
/// Action associated with a `FaultTolerancePolicy::RateLimitPolicy`.
#[derive(Debug, Clone)]
pub enum RateLimitAction {
    Reject,
    /// Queue, with a maximum queue size.
    Queue { max_queue_size: usize },
    /// Delay by `delay`.
    Delay { delay: Duration },
    /// Throttle by `factor` (scale not defined in this file).
    Throttle { factor: f64 },
    /// Action identified by a caller-supplied name.
    Custom { action_name: String },
}
1078
/// Description of an action to be carried out manually.
#[derive(Debug, Clone)]
pub struct ManualAction {
    /// Identifier of the action.
    pub action_id: String,
    /// Description text.
    pub description: String,
    /// Required skills, as strings.
    pub required_skills: Vec<String>,
    /// Estimated duration.
    pub estimated_duration: Duration,
    /// Priority (ordering direction not defined in this file).
    pub priority: i32,
    /// Instruction text.
    pub instructions: String,
    /// Dependencies, as strings.
    pub dependencies: Vec<String>,
}
1097
/// Descriptive metadata about a component.
#[derive(Debug, Clone)]
pub struct ComponentMetadata {
    /// Version string.
    pub version: String,
    /// Owner string.
    pub owner: String,
    /// Creation timestamp.
    pub created_at: SystemTime,
    /// Last-update timestamp.
    pub updated_at: SystemTime,
    /// Environment string.
    pub environment: String,
    /// Additional string key/value pairs.
    pub custom: HashMap<String, String>,
}
1114
/// Handle for a registered component, as returned by
/// `FaultToleranceManager::register_component`.
///
/// It carries the same type, health, recovery, policy, and metadata fields
/// as `FaultToleranceComponent`, plus registration and health-check
/// timestamps.
#[derive(Debug, Clone)]
pub struct ComponentHandle {
    /// Identifier of the handle.
    pub id: String,
    /// Kind of component.
    pub component_type: ComponentType,
    /// Health check settings.
    pub health_config: ComponentHealthConfig,
    /// Recovery settings.
    pub recovery_config: ComponentRecoveryConfig,
    /// Attached fault tolerance policies.
    pub policies: Vec<FaultTolerancePolicy>,
    /// Component metadata.
    pub metadata: ComponentMetadata,
    /// Registration timestamp.
    pub registered_at: SystemTime,
    /// Timestamp of the last health check.
    pub last_health_check: SystemTime,
}
1135
/// Health state of a component, with details per state.
#[derive(Debug, Clone)]
pub enum ComponentHealth {
    /// Healthy, with uptime and a performance score (scale not defined in
    /// this file).
    Healthy {
        uptime: Duration,
        performance_score: f64,
    },
    /// Degraded, with a reason and an impact level.
    Degraded { reason: String, impact_level: f64 },
    /// Unhealthy, with an error count and the last error text.
    Unhealthy {
        error_count: usize,
        last_error: String,
    },
    /// Failed, with a reason and failure timestamp.
    Failed {
        failure_reason: String,
        failure_time: SystemTime,
    },
    /// Unknown, with the timestamp of the last check.
    Unknown { last_check: SystemTime },
}
1159
/// Handle for a circuit breaker: identity, state, configuration, and
/// statistics.
#[derive(Debug, Clone)]
pub struct CircuitBreakerHandle {
    /// Identifier of the circuit breaker.
    pub id: String,
    /// Name of the circuit breaker.
    pub name: String,
    /// Current state.
    pub state: CircuitBreakerState,
    /// Configuration of the circuit breaker.
    pub config: CircuitBreakerConfig,
    /// Counters and timestamps.
    pub stats: CircuitBreakerStats,
    /// Creation timestamp.
    pub created_at: SystemTime,
    /// Timestamp of the last state change.
    pub last_state_change: SystemTime,
}
1178
/// The three circuit breaker states. The type is `Copy` and hashable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CircuitBreakerState {
    Closed,
    Open,
    HalfOpen,
}
1189
/// Counters and timestamps for a circuit breaker.
#[derive(Debug, Clone)]
pub struct CircuitBreakerStats {
    /// Total request count.
    pub total_requests: u64,
    /// Successful request count.
    pub successful_requests: u64,
    /// Failed request count.
    pub failed_requests: u64,
    /// Consecutive failure count.
    pub consecutive_failures: u64,
    /// State change count.
    pub state_changes: u64,
    /// Timestamp of the last failure, if any.
    pub last_failure_time: Option<SystemTime>,
    /// Timestamp of the last success, if any.
    pub last_success_time: Option<SystemTime>,
    /// Request count in the half-open state.
    pub half_open_requests: u64,
    /// Success count in the half-open state.
    pub half_open_successes: u64,
}
1212
/// One entry in a session's recovery history.
#[derive(Debug, Clone)]
pub struct RecoveryHistoryEntry {
    /// Identifier of the entry.
    pub id: String,
    /// Identifier of the component concerned.
    pub component_id: String,
    /// Kind of strategy used.
    pub strategy: RecoveryStrategyType,
    /// Start timestamp.
    pub start_time: SystemTime,
    /// End timestamp, if set.
    pub end_time: Option<SystemTime>,
    /// Result of the recovery.
    pub result: RecoveryResult,
    /// Detail text.
    pub details: String,
    /// Additional string key/value pairs.
    pub metadata: HashMap<String, String>,
}
1233
/// Result of a recovery.
#[derive(Debug, Clone)]
pub enum RecoveryResult {
    Success,
    /// Failure with a reason.
    Failure { reason: String },
    /// Partial result with details.
    Partial { details: String },
    InProgress,
    Cancelled,
}
1248
/// Free-form session metadata. Derives `Default`, so every collection starts
/// empty.
#[derive(Debug, Clone, Default)]
pub struct FaultToleranceMetadata {
    /// Free-form tags.
    pub tags: Vec<String>,
    /// Additional string key/value pairs.
    pub custom: HashMap<String, String>,
    /// Numeric metrics keyed by string.
    pub performance_metrics: HashMap<String, f64>,
    /// Configuration overrides as string key/value pairs.
    pub config_overrides: HashMap<String, String>,
}
1261
/// A fault report, as passed to `FaultToleranceManager::report_fault`.
#[derive(Debug, Clone)]
pub struct FaultReport {
    /// Identifier of the fault.
    pub fault_id: String,
    /// Identifier of the component concerned.
    pub component_id: String,
    /// Kind of fault.
    pub fault_type: FaultType,
    /// Severity of the fault.
    pub severity: FaultSeverity,
    /// Timestamp of the fault.
    pub timestamp: SystemTime,
    /// Description text.
    pub description: String,
    /// Error code, message, and related details.
    pub error_details: ErrorDetails,
    /// Request, user, session, and environment context.
    pub context: FaultContext,
}
1282
/// Kind of fault.
#[derive(Debug, Clone)]
pub enum FaultType {
    Timeout,
    ConnectionFailure,
    ServiceUnavailable,
    ResourceExhaustion,
    ConfigurationError,
    SecurityViolation,
    DataCorruption,
    PerformanceDegradation,
    /// Fault kind identified by a caller-supplied string.
    Custom(String),
}
1305
/// Fault severity with explicit discriminants `1` (`Low`) to `5`
/// (`Emergency`).
///
/// Comparison follows the discriminants, so `Low < Medium < ... < Emergency`.
/// The type is `Copy` and totally ordered (`Eq + Ord + Hash`), so severities
/// can be sorted, passed to `max`/`min`, and used as map or set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum FaultSeverity {
    Low = 1,
    Medium = 2,
    High = 3,
    Critical = 4,
    Emergency = 5,
}
1315
/// Error information attached to a [`FaultReport`].
#[derive(Debug, Clone)]
pub struct ErrorDetails {
    /// Error code, as a string.
    pub error_code: String,
    /// Error message.
    pub error_message: String,
    /// Stack trace, if available.
    pub stack_trace: Option<String>,
    /// Additional string key/value pairs.
    pub additional_data: HashMap<String, String>,
}
1328
/// Context attached to a [`FaultReport`]. All identifiers are optional.
#[derive(Debug, Clone)]
pub struct FaultContext {
    /// Request identifier, if any.
    pub request_id: Option<String>,
    /// User identifier, if any.
    pub user_id: Option<String>,
    /// Session identifier, if any.
    pub session_id: Option<String>,
    /// Environment as string key/value pairs.
    pub environment: HashMap<String, String>,
    /// System state as string key/value pairs.
    pub system_state: HashMap<String, String>,
}
1343
/// Response to a reported fault, as returned by
/// `FaultToleranceManager::report_fault`.
#[derive(Debug, Clone)]
pub enum FaultResponse {
    Acknowledged,
    /// Recovery was initiated with the given strategy kind.
    RecoveryInitiated { strategy: RecoveryStrategyType },
    /// Escalation is required at the given level.
    EscalationRequired { level: u32 },
    ManualInterventionRequired,
    /// The fault was ignored, with a reason.
    Ignored { reason: String },
}
1358
/// Report for a session, as returned by
/// `FaultToleranceManager::shutdown_fault_tolerance`.
///
/// Derives `Default`, which relies on the `Default` impls of its nested
/// types.
#[derive(Debug, Clone, Default)]
pub struct FaultToleranceReport {
    /// Identifier of the session.
    pub session_id: String,
    /// Kind of report.
    pub report_type: ReportType,
    /// Resilience score (scale not defined in this file).
    pub resilience_score: f64,
    /// Availability (scale not defined in this file).
    pub availability: f64,
    /// Performance metrics.
    pub performance_metrics: PerformanceMetrics,
    /// Health summary.
    pub health_summary: HealthSummary,
    /// Recovery summary.
    pub recovery_summary: RecoverySummary,
    /// Recommendations.
    pub recommendations: Vec<Recommendation>,
}
1379
/// Kind of report. `Summary` is the default.
#[derive(Debug, Clone, Default)]
pub enum ReportType {
    #[default]
    Summary,
    Detailed,
    Comprehensive,
    /// Report kind identified by a caller-supplied string.
    Custom(String),
}
1392
/// Aggregate performance figures included in a report.
///
/// Derives `Default`, so every figure starts at zero.
#[derive(Debug, Clone, Default)]
pub struct PerformanceMetrics {
    /// Average response time.
    pub avg_response_time: Duration,
    /// Peak response time.
    pub peak_response_time: Duration,
    /// Throughput (unit not defined in this file).
    pub throughput: f64,
    /// Error rate (scale not defined in this file).
    pub error_rate: f64,
    /// Resource utilization (scale not defined in this file).
    pub resource_utilization: f64,
}

/// Health summary included in a report.
#[derive(Debug, Clone)]
pub struct HealthSummary {
    /// Overall health score; defaults to `1.0`.
    pub overall_health: f64,
    /// Number of critical issues.
    pub critical_issues: usize,
    /// Number of components.
    pub component_count: usize,
    /// Number of healthy components.
    pub healthy_components: usize,
}

impl Default for HealthSummary {
    /// Returns a summary with `overall_health` of `1.0` and all counts at
    /// zero. This cannot be derived because the health score does not start
    /// at `0.0`.
    fn default() -> Self {
        Self {
            overall_health: 1.0,
            critical_issues: 0,
            component_count: 0,
            healthy_components: 0,
        }
    }
}

/// Recovery summary included in a report.
///
/// `Default` is derived: all counts are zero and the average recovery time
/// is a zero duration, the same values the previous hand-written impl
/// produced.
#[derive(Debug, Clone, Default)]
pub struct RecoverySummary {
    /// Total number of recovery attempts.
    pub total_attempts: usize,
    /// Number of successful recoveries.
    pub successful_recoveries: usize,
    /// Number of failed recoveries.
    pub failed_recoveries: usize,
    /// Average recovery time.
    pub avg_recovery_time: Duration,
}
1455
/// A recommendation included in a report.
#[derive(Debug, Clone)]
pub struct Recommendation {
    /// Category of the recommendation.
    pub recommendation_type: RecommendationType,
    /// Priority of the recommendation.
    pub priority: Priority,
    /// Description text.
    pub description: String,
    /// Implementation steps, as an ordered list of strings.
    pub implementation_steps: Vec<String>,
    /// Expected impact, as text.
    pub expected_impact: String,
}
1470
/// Category of a [`Recommendation`].
#[derive(Debug, Clone)]
pub enum RecommendationType {
    Configuration,
    Infrastructure,
    Architecture,
    Monitoring,
    Process,
    /// Category identified by a caller-supplied string.
    Custom(String),
}
1487
1488impl Default for FaultToleranceConfig {
1490 fn default() -> Self {
1491 Self {
1492 enabled: true,
1493 sensitivity: 0.7,
1494 max_concurrent_recoveries: 5,
1495 global_timeout: Duration::from_secs(30),
1496 recovery_config: RecoveryConfig {
1497 automatic_recovery: true,
1498 recovery_timeout: Duration::from_secs(60),
1499 max_recovery_attempts: 3,
1500 strategy_priority: vec![
1501 RecoveryStrategyType::Restart,
1502 RecoveryStrategyType::Failover,
1503 RecoveryStrategyType::Scale,
1504 ],
1505 validation: RecoveryValidationConfig {
1506 enabled: true,
1507 timeout: Duration::from_secs(10),
1508 criteria: Vec::new(),
1509 depth: ValidationDepth::Medium,
1510 },
1511 escalation: RecoveryEscalationConfig {
1512 enabled: false,
1513 levels: Vec::new(),
1514 timeout: Duration::from_secs(300),
1515 notifications: NotificationConfig {
1516 enabled: false,
1517 channels: Vec::new(),
1518 templates: HashMap::new(),
1519 rate_limit: RateLimitConfig {
1520 max_per_window: 10,
1521 window_duration: Duration::from_secs(60),
1522 burst_allowance: 2,
1523 },
1524 },
1525 },
1526 },
1527 circuit_breaker_config: CircuitBreakerConfig {
1528 enabled: true,
1529 failure_threshold: 5,
1530 success_threshold: 3,
1531 timeout: Duration::from_secs(60),
1532 half_open_max_calls: 3,
1533 policies: Vec::new(),
1534 failure_detection: FailureDetectionConfig::default(),
1535 analytics: crate::circuit_breaker::analytics_engine::AnalyticsConfig::default(),
1536 },
1537 retry_config: RetryConfig {
1538 enabled: true,
1539 max_attempts: 3,
1540 base_delay: Duration::from_millis(100),
1541 max_delay: Duration::from_secs(30),
1542 backoff_strategy: BackoffStrategy::Exponential { multiplier: 2.0 },
1543 retry_conditions: Vec::new(),
1544 jitter: JitterConfig {
1545 enabled: true,
1546 jitter_type: JitterType::Equal,
1547 amount: 0.1,
1548 },
1549 },
1550 bulkhead_config: BulkheadConfig {
1551 enabled: true,
1552 default_isolation: IsolationSettings {
1553 max_concurrent_calls: 10,
1554 queue_size: 20,
1555 queue_timeout: Duration::from_secs(5),
1556 isolation_type: IsolationType::Semaphore,
1557 },
1558 component_isolation: HashMap::new(),
1559 resource_pools: Vec::new(),
1560 },
1561 health_check_config: HealthCheckConfig {
1562 enabled: true,
1563 default_interval: Duration::from_secs(30),
1564 timeout: Duration::from_secs(5),
1565 component_configs: HashMap::new(),
1566 thresholds: HealthThresholds {
1567 critical: 0.3,
1568 warning: 0.6,
1569 good: 0.8,
1570 excellent: 0.95,
1571 },
1572 },
1573 performance_config: PerformanceConfig {
1574 enabled: true,
1575 collection_interval: Duration::from_secs(10),
1576 thresholds: PerformanceThresholds {
1577 response_time: ThresholdConfig {
1578 warning: 1000.0,
1579 critical: 5000.0,
1580 unit: "ms".to_string(),
1581 },
1582 throughput: ThresholdConfig {
1583 warning: 10.0,
1584 critical: 1.0,
1585 unit: "rps".to_string(),
1586 },
1587 error_rate: ThresholdConfig {
1588 warning: 0.05,
1589 critical: 0.1,
1590 unit: "ratio".to_string(),
1591 },
1592 cpu_utilization: ThresholdConfig {
1593 warning: 0.8,
1594 critical: 0.95,
1595 unit: "ratio".to_string(),
1596 },
1597 memory_utilization: ThresholdConfig {
1598 warning: 0.85,
1599 critical: 0.95,
1600 unit: "ratio".to_string(),
1601 },
1602 },
1603 resource_monitoring: ResourceMonitoringConfig {
1604 monitor_cpu: true,
1605 monitor_memory: true,
1606 monitor_disk: false,
1607 monitor_network: false,
1608 custom_monitors: Vec::new(),
1609 },
1610 optimization: PerformanceOptimizationConfig {
1611 auto_optimization: false,
1612 strategies: vec![OptimizationStrategy::LoadBalancing],
1613 optimization_interval: Duration::from_secs(300),
1614 learning: LearningConfig {
1615 enabled: false,
1616 algorithm: LearningAlgorithm::LinearRegression,
1617 data_retention: Duration::from_secs(86400),
1618 update_interval: Duration::from_secs(3600),
1619 },
1620 },
1621 },
1622 advanced_config: AdvancedConfig {
1623 predictive_analytics: false,
1624 chaos_engineering: false,
1625 security_monitoring: false,
1626 compliance_reporting: false,
1627 custom_features: HashMap::new(),
1628 },
1629 }
1630 }
1631}
1632
1633impl Default for RetryConfig {
1634 fn default() -> Self {
1635 Self {
1636 enabled: true,
1637 max_attempts: 3,
1638 base_delay: Duration::from_millis(100),
1639 max_delay: Duration::from_secs(30),
1640 backoff_strategy: BackoffStrategy::Exponential { multiplier: 2.0 },
1641 retry_conditions: vec![
1642 RetryCondition {
1644 name: "NetworkError".to_string(),
1645 error_patterns: vec!["connection".to_string(), "timeout".to_string()],
1646 status_codes: vec![],
1647 custom_condition: None,
1648 },
1649 RetryCondition {
1651 name: "ServiceUnavailable".to_string(),
1652 error_patterns: vec!["unavailable".to_string()],
1653 status_codes: vec![503],
1654 custom_condition: None,
1655 },
1656 ],
1657 jitter: JitterConfig {
1658 enabled: true,
1659 jitter_type: JitterType::Equal,
1660 amount: 0.1,
1661 },
1662 }
1663 }
1664}
1665
1666impl Default for FailureDetectionConfig {
1667 fn default() -> Self {
1668 Self {
1669 patterns: vec![],
1670 window_size: 100,
1671 min_requests: 10,
1672 statistics: StatisticalConfig {
1673 method: StatisticalMethod::Simple,
1674 confidence_level: 0.95,
1675 outlier_detection: true,
1676 trend_analysis: false,
1677 },
1678 }
1679 }
1680}
1681
1682