memscope_rs/export/error_recovery.rs

//! Error recovery mechanism.
//!
//! This module provides comprehensive error recovery strategies,
//! including automatic retries, graceful degradation,
//! partial result saving, and error state recovery,
//! ensuring an optimal user experience in various failure scenarios.
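//!
//! # Example (sketch)
//!
//! A minimal usage sketch; the import path is an assumption and may need
//! adjusting to how this module is re-exported:
//!
//! ```ignore
//! use memscope_rs::export::error_recovery::{ErrorRecoveryManager, RecoveryConfig};
//!
//! let mut manager = ErrorRecoveryManager::new(RecoveryConfig::default());
//! // feed export errors into the manager as they occur and act on the
//! // returned `RecoveryResult` (retry, degrade, save partial results, ...)
//! let report = manager.generate_recovery_report();
//! report.print_detailed_report();
//! ```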
use crate::core::types::{TrackingError, TrackingResult};
use crate::export::error_handling::{ConflictType, ExportError, ExportStage, ResourceType};
use crate::export::fast_export_coordinator::{CompleteExportStats, FastExportConfig};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::{Duration, Instant};
// Removed unused atomic imports

/// error recovery manager
#[derive(Debug)]
pub struct ErrorRecoveryManager {
    /// recovery config
    config: RecoveryConfig,
    /// recovery stats
    stats: RecoveryStats,
    /// retry history
    retry_history: HashMap<String, RetryHistory>,
    /// degradation state
    degradation_state: DegradationState,
}

/// recovery config
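///
/// # Example (sketch)
///
/// Defaults can be overridden with struct update syntax; the directory shown
/// here is only an illustrative path:
///
/// ```ignore
/// let config = RecoveryConfig {
///     max_retry_attempts: 5,
///     partial_save_directory: std::path::PathBuf::from("/tmp/partial_exports"),
///     ..Default::default()
/// };
/// ```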
#[derive(Debug, Clone)]
pub struct RecoveryConfig {
    /// whether to enable auto retry
    pub enable_auto_retry: bool,
    /// max retry attempts
    pub max_retry_attempts: usize,
    /// retry interval (milliseconds)
    pub retry_interval_ms: u64,
    /// retry interval backoff factor
    pub retry_backoff_factor: f64,
    /// max retry interval (milliseconds)
    pub max_retry_interval_ms: u64,

    /// whether to enable graceful degradation
    pub enable_graceful_degradation: bool,
    /// degradation threshold (error rate percentage)
    pub degradation_threshold: f64,
    /// recovery threshold (error rate percentage)
    pub recovery_threshold: f64,

    /// whether to enable partial result saving
    pub enable_partial_save: bool,
    /// partial result save directory
    pub partial_save_directory: PathBuf,
    /// partial result save interval (operations count)
    pub partial_save_interval: usize,

    /// whether to enable verbose logging
    pub verbose_logging: bool,
}

impl Default for RecoveryConfig {
    fn default() -> Self {
        Self {
            enable_auto_retry: true,
            max_retry_attempts: 3,
            retry_interval_ms: 1000,
            retry_backoff_factor: 2.0,
            max_retry_interval_ms: 10000,

            enable_graceful_degradation: true,
            degradation_threshold: 10.0, // 10% error rate triggers degradation
            recovery_threshold: 2.0,     // 2% error rate to recover to normal

            enable_partial_save: true,
            partial_save_directory: PathBuf::from("./partial_exports"),
            partial_save_interval: 1000,

            verbose_logging: false,
        }
    }
}

/// recovery stats
#[derive(Debug, Clone, Default)]
pub struct RecoveryStats {
    /// total errors
    pub total_errors: usize,
    /// successful recoveries
    pub successful_recoveries: usize,
    /// failed recoveries
    pub failed_recoveries: usize,
    /// total retries
    pub total_retries: usize,
    /// degradation count
    pub degradation_count: usize,
    /// partial saves count
    pub partial_saves: usize,
    /// total recovery time (milliseconds)
    pub total_recovery_time_ms: u64,
}

/// retry history
#[derive(Debug, Clone)]
pub struct RetryHistory {
    /// operation name
    pub operation: String,
    /// retry count
    pub attempt_count: usize,
    /// last retry time
    pub last_attempt: Instant,
    /// next retry interval (milliseconds)
    pub next_interval_ms: u64,
    /// error history
    pub error_history: Vec<String>,
}

/// degradation state
#[derive(Debug, Clone)]
pub struct DegradationState {
    /// whether in degradation state
    pub is_degraded: bool,
    /// degradation start time
    pub degradation_start: Option<Instant>,
    /// current error rate
    pub current_error_rate: f64,
    /// degradation level
    pub degradation_level: DegradationLevel,
    /// degradation reason
    pub degradation_reason: Option<String>,
}

/// degradation level
#[derive(Debug, Clone, PartialEq)]
pub enum DegradationLevel {
    /// normal operation
    Normal,
    /// light degradation (reduce parallelism)
    Light,
    /// moderate degradation (disable complex features)
    Moderate,
    /// severe degradation (only basic features)
    Severe,
    /// emergency mode (minimum features)
    Emergency,
}

/// recovery strategy
#[derive(Debug, Clone)]
pub enum RecoveryStrategy {
    /// auto retry
    AutoRetry {
        /// max attempts
        max_attempts: usize,
        /// interval milliseconds
        interval_ms: u64,
        /// backoff factor
        backoff_factor: f64,
    },
    /// graceful degradation
    GracefulDegradation {
        /// target degradation level
        target_level: DegradationLevel,
        /// reason for degradation
        reason: String,
    },
    /// partial result saving
    PartialSave {
        /// save path
        save_path: PathBuf,
        /// progress percentage
        progress_percentage: f64,
    },
    /// config adjustment
    ConfigAdjustment {
        /// new config
        new_config: Box<FastExportConfig>,
        /// reason for adjustment
        reason: String,
    },
    /// resource release
    ResourceRelease {
        /// resource type
        resource_type: ResourceType,
        /// amount of resource released
        amount: usize,
    },
    /// skip operation
    SkipOperation {
        /// operation name
        operation: String,
        /// reason for skipping
        reason: String,
    },
}

/// recovery result
#[derive(Debug, Clone)]
pub struct RecoveryResult {
    /// whether recovery is successful
    pub success: bool,
    /// recovery strategy
    pub strategy: RecoveryStrategy,
    /// recovery message
    pub message: String,
    /// recovery time (milliseconds)
    pub recovery_time_ms: u64,
    /// partial result path (if any)
    pub partial_result_path: Option<PathBuf>,
    /// suggested actions
    pub suggested_actions: Vec<String>,
}

impl ErrorRecoveryManager {
    /// create new error recovery manager
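    ///
    /// When partial saving is enabled, the partial-save directory is created
    /// eagerly; a failure to create it is logged as a warning and does not
    /// fail construction.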
    pub fn new(config: RecoveryConfig) -> Self {
        // ensure partial save directory exists
        if config.enable_partial_save {
            if let Err(e) = std::fs::create_dir_all(&config.partial_save_directory) {
                tracing::warn!("⚠️ Unable to create partial save directory: {e}");
            }
        }

        Self {
            config,
            stats: RecoveryStats::default(),
            retry_history: HashMap::new(),
            degradation_state: DegradationState {
                is_degraded: false,
                degradation_start: None,
                current_error_rate: 0.0,
                degradation_level: DegradationLevel::Normal,
                degradation_reason: None,
            },
        }
    }

    /// handle export error and attempt recovery
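    ///
    /// # Example (sketch)
    ///
    /// A minimal sketch; the error variant and context fields mirror the ones
    /// used by this module's tests:
    ///
    /// ```ignore
    /// let error = ExportError::ParallelProcessingError {
    ///     shard_index: 5,
    ///     thread_id: "thread-1".to_string(),
    ///     error_message: "simulated failure".to_string(),
    ///     partial_results: None,
    /// };
    /// let context = ErrorContext {
    ///     current_config: FastExportConfig::default(),
    ///     progress_percentage: 50.0,
    ///     processed_data_size: 1_000,
    ///     operation_start_time: Instant::now(),
    ///     current_stats: None,
    /// };
    /// let result = manager.handle_export_error(&error, "shard_export", &context)?;
    /// if result.success {
    ///     // act on result.strategy (retry, degrade, partial save, ...)
    /// }
    /// ```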
    pub fn handle_export_error(
        &mut self,
        error: &ExportError,
        operation: &str,
        context: &ErrorContext,
    ) -> TrackingResult<RecoveryResult> {
        let recovery_start = Instant::now();
        self.stats.total_errors += 1;

        if self.config.verbose_logging {
            tracing::debug!("🔧 Error recovery: {} - {}", operation, error);
        }

        // select recovery strategy
        let strategy = self.select_recovery_strategy(error, operation, context)?;

        // execute recovery strategy
        let result = self.execute_recovery_strategy(strategy, error, operation, context)?;

        // update statistics
        let recovery_time = recovery_start.elapsed().as_millis() as u64;
        self.stats.total_recovery_time_ms += recovery_time;

        if result.success {
            self.stats.successful_recoveries += 1;
        } else {
            self.stats.failed_recoveries += 1;
        }

        // update degradation state
        self.update_degradation_state(error, &result);

        if self.config.verbose_logging {
            tracing::debug!(
                "🔧 Recovery completed: {} ({}ms)",
                result.message,
                recovery_time
            );
        }

        Ok(result)
    }

    /// select recovery strategy
    fn select_recovery_strategy(
        &self,
        error: &ExportError,
        operation: &str,
        context: &ErrorContext,
    ) -> TrackingResult<RecoveryStrategy> {
        match error {
            ExportError::ParallelProcessingError { shard_index, .. } => {
                // parallel processing error: retry if possible, otherwise degrade parallelism
                if self.should_retry(operation) {
                    Ok(RecoveryStrategy::AutoRetry {
                        max_attempts: self.config.max_retry_attempts,
                        interval_ms: self.config.retry_interval_ms,
                        backoff_factor: self.config.retry_backoff_factor,
                    })
                } else {
                    Ok(RecoveryStrategy::GracefulDegradation {
                        target_level: DegradationLevel::Light,
                        reason: format!(
                            "shard {shard_index} processing failed, degrading parallelism"
                        ),
                    })
                }
            }

            ExportError::ResourceLimitExceeded {
                resource_type,
                suggested_action,
                ..
            } => {
                // resource limit exceeded: release resources or adjust configuration
                match resource_type {
                    ResourceType::Memory => Ok(RecoveryStrategy::ConfigAdjustment {
                        new_config: Box::new(self.create_memory_optimized_config(context)),
                        reason: "memory limit exceeded, adjusting to a memory-optimized configuration"
                            .to_string(),
                    }),
                    ResourceType::CPU => Ok(RecoveryStrategy::GracefulDegradation {
                        target_level: DegradationLevel::Moderate,
                        reason: "CPU usage exceeded, reducing processing intensity".to_string(),
                    }),
                    _ => Ok(RecoveryStrategy::ConfigAdjustment {
                        new_config: Box::new(context.current_config.clone()),
                        reason: suggested_action.clone(),
                    }),
                }
            }

            ExportError::DataQualityError {
                affected_records, ..
            } => {
                // data quality error: save partial results
                if self.config.enable_partial_save && context.progress_percentage > 10.0 {
                    Ok(RecoveryStrategy::PartialSave {
                        save_path: self.generate_partial_save_path(operation),
                        progress_percentage: context.progress_percentage,
                    })
                } else {
                    Ok(RecoveryStrategy::SkipOperation {
                        operation: operation.to_string(),
                        reason: format!("data quality issue affects {affected_records} records, skipping processing"),
                    })
                }
            }

            ExportError::PerformanceThresholdExceeded { stage, .. } => {
                // performance threshold exceeded: adjust configuration or degrade
                match stage {
                    ExportStage::ParallelProcessing => Ok(RecoveryStrategy::ConfigAdjustment {
                        new_config: Box::new(self.create_performance_optimized_config(context)),
                        reason: "performance threshold exceeded, adjusting configuration".to_string(),
                    }),
                    _ => Ok(RecoveryStrategy::GracefulDegradation {
                        target_level: DegradationLevel::Light,
                        reason: format!(
                            "performance threshold exceeded in stage {stage:?}, degrading processing"
                        ),
                    }),
                }
            }

            ExportError::ConcurrencyConflict {
                conflict_type,
                retry_count,
                ..
            } => {
                // concurrency conflict: retry or adjust concurrency strategy
                if *retry_count < self.config.max_retry_attempts {
                    let interval = match conflict_type {
                        ConflictType::LockContention => self.config.retry_interval_ms * 2,
                        ConflictType::ThreadPoolExhaustion => self.config.retry_interval_ms * 3,
                        _ => self.config.retry_interval_ms,
                    };

                    Ok(RecoveryStrategy::AutoRetry {
                        max_attempts: self.config.max_retry_attempts - retry_count,
                        interval_ms: interval,
                        backoff_factor: self.config.retry_backoff_factor,
                    })
                } else {
                    Ok(RecoveryStrategy::GracefulDegradation {
                        target_level: DegradationLevel::Moderate,
                        reason: "repeated concurrency conflicts, degrading processing".to_string(),
                    })
                }
            }

            ExportError::InsufficientResources { .. } => {
                // insufficient resources: emergency degradation
                Ok(RecoveryStrategy::GracefulDegradation {
                    target_level: DegradationLevel::Emergency,
                    reason: "system resources severely insufficient, enabling emergency mode"
                        .to_string(),
                })
            }

            ExportError::ExportInterrupted {
                progress_percentage,
                ..
            } => {
                // export interrupted: save partial results
                Ok(RecoveryStrategy::PartialSave {
                    save_path: self.generate_partial_save_path(operation),
                    progress_percentage: *progress_percentage,
                })
            }

            ExportError::DataCorruption {
                recovery_possible, ..
            } => {
                // data corruption: determine strategy based on whether recovery is possible
                if *recovery_possible {
                    Ok(RecoveryStrategy::AutoRetry {
                        max_attempts: 1, // only retry once
                        interval_ms: self.config.retry_interval_ms,
                        backoff_factor: 1.0,
                    })
                } else {
                    Ok(RecoveryStrategy::SkipOperation {
                        operation: operation.to_string(),
                        reason: "data corrupted and cannot be recovered, skipping operation"
                            .to_string(),
                    })
                }
            }
        }
    }

    /// execute recovery strategy
    fn execute_recovery_strategy(
        &mut self,
        strategy: RecoveryStrategy,
        _error: &ExportError,
        operation: &str,
        context: &ErrorContext,
    ) -> TrackingResult<RecoveryResult> {
        let _execution_start = Instant::now();

        match strategy {
            RecoveryStrategy::AutoRetry {
                max_attempts,
                interval_ms,
                backoff_factor,
            } => self.execute_auto_retry(operation, max_attempts, interval_ms, backoff_factor),

            RecoveryStrategy::GracefulDegradation {
                target_level,
                reason,
            } => self.execute_graceful_degradation(target_level, reason),

            RecoveryStrategy::PartialSave {
                save_path,
                progress_percentage,
            } => self.execute_partial_save(save_path, progress_percentage, context),

            RecoveryStrategy::ConfigAdjustment { new_config, reason } => {
                self.execute_config_adjustment(*new_config, reason)
            }

            RecoveryStrategy::ResourceRelease {
                resource_type,
                amount,
            } => self.execute_resource_release(resource_type, amount),

            RecoveryStrategy::SkipOperation { operation, reason } => {
                self.execute_skip_operation(operation, reason)
            }
        }
    }

    /// execute auto retry
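    ///
    /// No wait is applied before the first attempt; each subsequent attempt
    /// waits `next_interval_ms`, which is multiplied by `backoff_factor` after
    /// every attempt and capped at `max_retry_interval_ms`. With the defaults
    /// (1000 ms interval, factor 2.0, 10000 ms cap) the waits are 2000, 4000,
    /// then 8000 ms.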
    fn execute_auto_retry(
        &mut self,
        operation: &str,
        max_attempts: usize,
        interval_ms: u64,
        backoff_factor: f64,
    ) -> TrackingResult<RecoveryResult> {
        let history = self
            .retry_history
            .entry(operation.to_string())
            .or_insert_with(|| RetryHistory {
                operation: operation.to_string(),
                attempt_count: 0,
                last_attempt: Instant::now(),
                next_interval_ms: interval_ms,
                error_history: Vec::new(),
            });

        if history.attempt_count >= max_attempts {
            return Ok(RecoveryResult {
                success: false,
                strategy: RecoveryStrategy::AutoRetry {
                    max_attempts,
                    interval_ms,
                    backoff_factor,
                },
                message: format!("reached maximum retry limit ({max_attempts})"),
                recovery_time_ms: 0,
                partial_result_path: None,
                suggested_actions: vec![
                    "consider manual intervention or configuration adjustment".to_string()
                ],
            });
        }

        // wait for retry interval
        if history.attempt_count > 0 {
            std::thread::sleep(Duration::from_millis(history.next_interval_ms));
        }

        history.attempt_count += 1;
        history.last_attempt = Instant::now();
        history.next_interval_ms = (history.next_interval_ms as f64 * backoff_factor) as u64;
        history.next_interval_ms = history
            .next_interval_ms
            .min(self.config.max_retry_interval_ms);

        self.stats.total_retries += 1;

        Ok(RecoveryResult {
            success: true,
            strategy: RecoveryStrategy::AutoRetry {
                max_attempts,
                interval_ms,
                backoff_factor,
            },
            message: format!(
                "preparing retry {} (max {})",
                history.attempt_count, max_attempts
            ),
            recovery_time_ms: history.next_interval_ms,
            partial_result_path: None,
            suggested_actions: vec![
                "monitor retry results".to_string(),
                "if failures persist, consider adjusting the strategy".to_string(),
            ],
        })
    }

    /// execute graceful degradation
    fn execute_graceful_degradation(
        &mut self,
        target_level: DegradationLevel,
        reason: String,
    ) -> TrackingResult<RecoveryResult> {
        self.degradation_state.is_degraded = true;
        self.degradation_state.degradation_start = Some(Instant::now());
        self.degradation_state.degradation_level = target_level.clone();
        self.degradation_state.degradation_reason = Some(reason.clone());
        self.stats.degradation_count += 1;

        let message = match target_level {
            DegradationLevel::Light => "Enabled light degradation mode: reduce parallelism",
            DegradationLevel::Moderate => "Enabled moderate degradation mode: disable complex features",
            DegradationLevel::Severe => "Enabled severe degradation mode: basic features only",
            DegradationLevel::Emergency => "Enabled emergency mode: minimum features",
            DegradationLevel::Normal => "Returned to normal mode",
        };

        Ok(RecoveryResult {
            success: true,
            strategy: RecoveryStrategy::GracefulDegradation {
                target_level,
                reason,
            },
            message: message.to_string(),
            recovery_time_ms: 0,
            partial_result_path: None,
            suggested_actions: vec![
                "monitor system status".to_string(),
                "once conditions improve, consider returning to normal mode".to_string(),
            ],
        })
    }

    /// execute partial save
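    ///
    /// The saved file is a small JSON progress marker rather than the partial
    /// export data itself, for example (the timestamp is epoch seconds):
    ///
    /// ```text
    /// {"partial_export":true,"progress":75,"timestamp":"1700000000","context":"unknown"}
    /// ```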
    fn execute_partial_save(
        &mut self,
        save_path: PathBuf,
        progress_percentage: f64,
        _context: &ErrorContext,
    ) -> TrackingResult<RecoveryResult> {
        // create partial save directory
        if let Some(parent) = save_path.parent() {
            std::fs::create_dir_all(parent).map_err(|e| {
                TrackingError::IoError(format!("failed to create partial save directory: {e}"))
            })?;
        }

        // save partial results (simplified implementation)
        let partial_data = format!(
            "{{\"partial_export\":true,\"progress\":{progress_percentage},\"timestamp\":\"{}\",\"context\":\"unknown\"}}",
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap_or_default()
                .as_secs()
        );

        std::fs::write(&save_path, partial_data)
            .map_err(|e| TrackingError::IoError(format!("failed to save partial results: {e}")))?;

        self.stats.partial_saves += 1;

        Ok(RecoveryResult {
            success: true,
            strategy: RecoveryStrategy::PartialSave {
                save_path: save_path.clone(),
                progress_percentage,
            },
            message: format!("partial results saved ({progress_percentage:.1}% completed)"),
            recovery_time_ms: 0,
            partial_result_path: Some(save_path),
            suggested_actions: vec![
                "check the partial result file".to_string(),
                "resume from the partial result after fixing the issue".to_string(),
            ],
        })
    }

    /// execute config adjustment
    fn execute_config_adjustment(
        &self,
        new_config: FastExportConfig,
        reason: String,
    ) -> TrackingResult<RecoveryResult> {
        Ok(RecoveryResult {
            success: true,
            strategy: RecoveryStrategy::ConfigAdjustment {
                new_config: Box::new(new_config),
                reason: reason.clone(),
            },
            message: format!("config adjusted: {reason}"),
            recovery_time_ms: 0,
            partial_result_path: None,
            suggested_actions: vec![
                "retry the export with the new config".to_string(),
                "monitor the effect of the new config".to_string(),
            ],
        })
    }

    /// execute resource release
    fn execute_resource_release(
        &self,
        resource_type: ResourceType,
        amount: usize,
    ) -> TrackingResult<RecoveryResult> {
        // simplified implementation; a real implementation would call system APIs to release resources
        let message = match resource_type {
            ResourceType::Memory => format!("attempting to release {amount} bytes of memory"),
            ResourceType::CPU => format!("reduce CPU usage by {amount}%"),
            ResourceType::Disk => format!("clean up {amount} bytes of disk space"),
            ResourceType::FileHandles => format!("close {amount} file handles"),
            ResourceType::ThreadPool => format!("reduce {amount} threads"),
        };

        Ok(RecoveryResult {
            success: true,
            strategy: RecoveryStrategy::ResourceRelease {
                resource_type,
                amount,
            },
            message,
            recovery_time_ms: 0,
            partial_result_path: None,
            suggested_actions: vec![
                "monitor resource usage".to_string(),
                "retry the failed operation".to_string(),
            ],
        })
    }

    /// execute skip operation
    fn execute_skip_operation(
        &self,
        operation: String,
        reason: String,
    ) -> TrackingResult<RecoveryResult> {
        Ok(RecoveryResult {
            success: true,
            strategy: RecoveryStrategy::SkipOperation {
                operation: operation.clone(),
                reason: reason.clone(),
            },
            message: format!("skipped operation '{operation}': {reason}"),
            recovery_time_ms: 0,
            partial_result_path: None,
            suggested_actions: vec![
                "check the impact of skipping the operation".to_string(),
                "consider manually handling the skipped part".to_string(),
            ],
        })
    }

    /// check if the operation should be retried
    fn should_retry(&self, operation: &str) -> bool {
        if !self.config.enable_auto_retry {
            return false;
        }

        if let Some(history) = self.retry_history.get(operation) {
            history.attempt_count < self.config.max_retry_attempts
        } else {
            true
        }
    }

    /// update degradation state
    fn update_degradation_state(&mut self, error: &ExportError, _result: &RecoveryResult) {
        // simplified error rate calculation: weight errors by severity
        let error_weight = match error {
            ExportError::ParallelProcessingError { .. } => 1.0,
            ExportError::ResourceLimitExceeded { .. } => 2.0,
            ExportError::DataQualityError { .. } => 1.5,
            ExportError::PerformanceThresholdExceeded { .. } => 1.0,
            ExportError::ConcurrencyConflict { .. } => 1.0,
            ExportError::DataCorruption { .. } => 3.0,
            ExportError::InsufficientResources { .. } => 2.5,
            ExportError::ExportInterrupted { .. } => 1.5,
        };

        // update error rate (exponential moving average)
        self.degradation_state.current_error_rate =
            (self.degradation_state.current_error_rate * 0.9) + (error_weight * 0.1);
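        // e.g. starting from 0.0, repeated weight-3.0 errors move the rate
        // 0.0 -> 0.3 -> 0.57 -> 0.813 -> ..., converging toward that weight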

        // check whether to degrade or recover
        if !self.degradation_state.is_degraded
            && self.degradation_state.current_error_rate > self.config.degradation_threshold
        {
            // trigger degradation
            self.degradation_state.is_degraded = true;
            self.degradation_state.degradation_start = Some(Instant::now());
            self.degradation_state.degradation_level = DegradationLevel::Light;
        } else if self.degradation_state.is_degraded
            && self.degradation_state.current_error_rate < self.config.recovery_threshold
        {
            // trigger recovery
            self.degradation_state.is_degraded = false;
            self.degradation_state.degradation_start = None;
            self.degradation_state.degradation_level = DegradationLevel::Normal;
            self.degradation_state.degradation_reason = None;
        }
    }

    /// generate partial save path
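    ///
    /// e.g. `partial_export_<operation>_<timestamp>.json` under the configured
    /// partial save directory, where the timestamp is epoch seconds.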
    fn generate_partial_save_path(&self, operation: &str) -> PathBuf {
        let timestamp = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();
        let filename = format!("partial_export_{operation}_{timestamp}.json");
        self.config.partial_save_directory.join(filename)
    }

    /// create memory optimized config
    fn create_memory_optimized_config(&self, context: &ErrorContext) -> FastExportConfig {
        let mut config = context.current_config.clone();

        // reduce parallelism
        config.shard_config.max_threads = Some(2);
        config.shard_config.shard_size /= 2;

        // reduce buffer size
        config.writer_config.buffer_size /= 2;

        // disable data localization to lower memory usage
        config.enable_data_localization = false;

        config
    }

    /// create performance optimized config
    fn create_performance_optimized_config(&self, context: &ErrorContext) -> FastExportConfig {
        let mut config = context.current_config.clone();

        // adjust shard size
        config.shard_config.shard_size = 500; // smaller shard size
        config.shard_config.parallel_threshold = 1000; // lower parallel threshold

        // disable verbose logging and monitoring overhead
        config.verbose_logging = false;
        config.enable_performance_monitoring = false;

        config
    }

    /// get recovery stats
    pub fn get_stats(&self) -> &RecoveryStats {
        &self.stats
    }

    /// get degradation state
    pub fn get_degradation_state(&self) -> &DegradationState {
        &self.degradation_state
    }

    /// generate recovery report
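    ///
    /// The success rate is a percentage of total errors and the average
    /// recovery time is per successful recovery; e.g. 8 successes out of 10
    /// errors with 1000 ms of total recovery time yields 80.0% and 125 ms.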
    pub fn generate_recovery_report(&self) -> RecoveryReport {
        let success_rate = if self.stats.total_errors > 0 {
            (self.stats.successful_recoveries as f64 / self.stats.total_errors as f64) * 100.0
        } else {
            0.0
        };

        let avg_recovery_time = if self.stats.successful_recoveries > 0 {
            self.stats.total_recovery_time_ms as f64 / self.stats.successful_recoveries as f64
        } else {
            0.0
        };

        RecoveryReport {
            total_errors: self.stats.total_errors,
            successful_recoveries: self.stats.successful_recoveries,
            failed_recoveries: self.stats.failed_recoveries,
            success_rate,
            total_retries: self.stats.total_retries,
            degradation_count: self.stats.degradation_count,
            partial_saves: self.stats.partial_saves,
            avg_recovery_time_ms: avg_recovery_time,
            current_degradation_level: self.degradation_state.degradation_level.clone(),
            is_currently_degraded: self.degradation_state.is_degraded,
        }
    }
}

/// error context
#[derive(Debug, Clone)]
pub struct ErrorContext {
    /// current config
    pub current_config: FastExportConfig,
    /// progress percentage
    pub progress_percentage: f64,
    /// processed data size
    pub processed_data_size: usize,
    /// operation start time
    pub operation_start_time: Instant,
    /// current export stats
    pub current_stats: Option<CompleteExportStats>,
}

/// Recovery report containing statistics and current status
#[derive(Debug, Clone)]
pub struct RecoveryReport {
    /// Total number of errors encountered
    pub total_errors: usize,
    /// Number of successful recovery attempts
    pub successful_recoveries: usize,
    /// Number of failed recovery attempts
    pub failed_recoveries: usize,
    /// Success rate as a percentage (0.0-100.0)
    pub success_rate: f64,
    /// Total number of retry attempts
    pub total_retries: usize,
    /// Number of times the system entered degraded mode
    pub degradation_count: usize,
    /// Number of partial saves performed
    pub partial_saves: usize,
    /// Average recovery time in milliseconds
    pub avg_recovery_time_ms: f64,
    /// Current degradation level
    pub current_degradation_level: DegradationLevel,
    /// Whether the system is currently in degraded mode
    pub is_currently_degraded: bool,
}

impl RecoveryReport {
    /// print detailed recovery report
    pub fn print_detailed_report(&self) {
        tracing::info!("\n🔧 recovery report");
        tracing::info!("================");

        tracing::info!("📊 total statistics:");
        tracing::info!("   total errors: {}", self.total_errors);
        tracing::info!(
            "   successful recoveries: {} ({:.1}%)",
            self.successful_recoveries,
            self.success_rate
        );
        tracing::info!("   failed recoveries: {}", self.failed_recoveries);
        tracing::info!("   total retries: {}", self.total_retries);
        tracing::info!("   degradation count: {}", self.degradation_count);
        tracing::info!("   partial saves: {}", self.partial_saves);
        tracing::info!(
            "   average recovery time: {:.2}ms",
            self.avg_recovery_time_ms
        );

        tracing::info!("\n🎚️ current state:");
        tracing::info!("   degradation level: {:?}", self.current_degradation_level);
        tracing::info!(
            "   is degraded: {}",
            if self.is_currently_degraded {
                "yes"
            } else {
                "no"
            }
        );
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::export::fast_export_coordinator::FastExportConfig;

    fn create_test_context() -> ErrorContext {
        ErrorContext {
            current_config: FastExportConfig::default(),
            progress_percentage: 50.0,
            processed_data_size: 1000,
            operation_start_time: Instant::now(),
            current_stats: None,
        }
    }

    #[test]
    fn test_error_recovery_manager_creation() {
        let config = RecoveryConfig::default();
        let manager = ErrorRecoveryManager::new(config);
        assert_eq!(manager.stats.total_errors, 0);
        assert!(!manager.degradation_state.is_degraded);
    }

    #[test]
    fn test_handle_parallel_processing_error() {
        let mut manager = ErrorRecoveryManager::new(RecoveryConfig::default());
        let error = ExportError::ParallelProcessingError {
            shard_index: 5,
            thread_id: "thread-1".to_string(),
            error_message: "Test error".to_string(),
            partial_results: None,
        };
        let context = create_test_context();

        let result = manager.handle_export_error(&error, "test_operation", &context);
        assert!(result.is_ok());

        let recovery_result = result.expect("Failed to handle export error");
        assert!(recovery_result.success);
        assert_eq!(manager.stats.total_errors, 1);
    }

    #[test]
    fn test_graceful_degradation() {
        let mut manager = ErrorRecoveryManager::new(RecoveryConfig::default());

        let result = manager
            .execute_graceful_degradation(DegradationLevel::Light, "test degradation".to_string());

        assert!(result.is_ok());
        let recovery_result = result.expect("Failed to get test value");
        assert!(recovery_result.success);
        assert!(manager.degradation_state.is_degraded);
        assert_eq!(
            manager.degradation_state.degradation_level,
            DegradationLevel::Light
        );
    }

    #[test]
    fn test_partial_save() {
        let config = RecoveryConfig {
            partial_save_directory: std::env::temp_dir().join("test_partial_saves"),
            ..Default::default()
        };
        let mut manager = ErrorRecoveryManager::new(config);
        let context = create_test_context();

        let save_path = manager.generate_partial_save_path("test_op");
        let result = manager.execute_partial_save(save_path.clone(), 75.0, &context);

        assert!(result.is_ok());
        let recovery_result = result.expect("Failed to get test value");
        assert!(recovery_result.success);
        assert_eq!(recovery_result.partial_result_path, Some(save_path.clone()));

        // clean up test file
        let _ = std::fs::remove_file(save_path);
    }

    #[test]
    fn test_recovery_report() {
        let mut manager = ErrorRecoveryManager::new(RecoveryConfig::default());

        // simulate some errors and recoveries
        manager.stats.total_errors = 10;
        manager.stats.successful_recoveries = 8;
        manager.stats.failed_recoveries = 2;
        manager.stats.total_retries = 5;
        manager.stats.degradation_count = 1;
        manager.stats.partial_saves = 2;
        manager.stats.total_recovery_time_ms = 1000;

        let report = manager.generate_recovery_report();
        assert_eq!(report.total_errors, 10);
        assert_eq!(report.successful_recoveries, 8);
        assert_eq!(report.success_rate, 80.0);
        assert_eq!(report.avg_recovery_time_ms, 125.0); // 1000 / 8
    }
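
    // Added sketch test: exercises the retry bookkeeping in `execute_auto_retry`
    // directly; a 1 ms interval keeps the backoff sleeps negligible.
    #[test]
    fn test_auto_retry_limit() {
        let mut manager = ErrorRecoveryManager::new(RecoveryConfig::default());

        // the first two attempts are prepared successfully
        for _ in 0..2 {
            let result = manager
                .execute_auto_retry("retry_op", 2, 1, 1.0)
                .expect("retry should be prepared");
            assert!(result.success);
        }

        // the third attempt exceeds max_attempts and is reported as unsuccessful
        let exhausted = manager
            .execute_auto_retry("retry_op", 2, 1, 1.0)
            .expect("call should still return a result");
        assert!(!exhausted.success);
        assert_eq!(manager.stats.total_retries, 2);
    }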
}