1use crate::error_handling_v2::ErrorCode;
7use std::collections::{HashMap, VecDeque};
8use std::sync::{
9 atomic::{AtomicUsize, Ordering},
10 Arc, Mutex,
11};
12use std::time::{Duration, Instant, SystemTime};
13
14#[derive(Debug, Clone)]
16pub struct ErrorPattern {
17 pub id: String,
19 pub error_codes: Vec<ErrorCode>,
21 pub frequency_threshold: usize,
23 pub time_window: Duration,
25 pub confidence: f64,
27 pub description: String,
29 pub mitigation: String,
31}
32
33impl ErrorPattern {
34 pub fn new(
36 id: impl Into<String>,
37 error_codes: Vec<ErrorCode>,
38 frequency_threshold: usize,
39 time_window: Duration,
40 description: impl Into<String>,
41 mitigation: impl Into<String>,
42 ) -> Self {
43 Self {
44 id: id.into(),
45 error_codes,
46 frequency_threshold,
47 time_window,
48 confidence: 0.0,
49 description: description.into(),
50 mitigation: mitigation.into(),
51 }
52 }
53}
54
55#[derive(Debug, Clone)]
57pub struct ErrorOccurrence {
58 pub code: ErrorCode,
60 pub timestamp: Instant,
62 pub operation: String,
64 pub count: usize,
66 pub resolved: bool,
68 pub recovery_action: Option<String>,
70}
71
72pub struct ErrorMonitor {
74 error_history: Arc<Mutex<VecDeque<ErrorOccurrence>>>,
76 error_counts: Arc<Mutex<HashMap<ErrorCode, AtomicUsize>>>,
78 patterns: Vec<ErrorPattern>,
80 max_historysize: usize,
82 pattern_detection_enabled: bool,
84 error_rate_thresholds: HashMap<ErrorCode, f64>,
86 start_time: Instant,
88}
89
90impl ErrorMonitor {
91 pub fn new() -> Self {
93 let mut monitor = Self {
94 error_history: Arc::new(Mutex::new(VecDeque::new())),
95 error_counts: Arc::new(Mutex::new(HashMap::new())),
96 patterns: Vec::new(),
97 max_historysize: 1000,
98 pattern_detection_enabled: true,
99 error_rate_thresholds: HashMap::new(),
100 start_time: Instant::now(),
101 };
102
103 monitor.initialize_default_patterns();
104 monitor.initialize_default_thresholds();
105 monitor
106 }
107
108 fn initialize_default_patterns(&mut self) {
110 self.patterns.push(ErrorPattern::new(
112 "memory_pressure",
113 vec![ErrorCode::E5001, ErrorCode::E5002],
114 3,
115 Duration::from_secs(60),
116 "High memory allocation failures indicating memory pressure",
117 "Reduce data size, enable streaming processing, or increase available memory",
118 ));
119
120 self.patterns.push(ErrorPattern::new(
122 "numerical_instability",
123 vec![
124 ErrorCode::E3001,
125 ErrorCode::E3002,
126 ErrorCode::E3005,
127 ErrorCode::E3006,
128 ],
129 5,
130 Duration::from_secs(30),
131 "Frequent numerical errors indicating data quality or algorithm issues",
132 "Check data preprocessing, scaling, and consider more stable algorithms",
133 ));
134
135 self.patterns.push(ErrorPattern::new(
137 "convergence_issues",
138 vec![ErrorCode::E3003, ErrorCode::E4001, ErrorCode::E4002],
139 3,
140 Duration::from_secs(120),
141 "Repeated convergence failures in iterative algorithms",
142 "Adjust algorithm parameters, improve initial conditions, or use different methods",
143 ));
144
145 self.patterns.push(ErrorPattern::new(
147 "data_quality_issues",
148 vec![
149 ErrorCode::E2003,
150 ErrorCode::E2004,
151 ErrorCode::E1001,
152 ErrorCode::E1002,
153 ],
154 4,
155 Duration::from_secs(60),
156 "Frequent data validation errors indicating poor data quality",
157 "Implement comprehensive data validation and cleaning pipeline",
158 ));
159 }
160
161 fn initialize_default_thresholds(&mut self) {
163 self.error_rate_thresholds.insert(ErrorCode::E5001, 0.01); self.error_rate_thresholds.insert(ErrorCode::E3001, 0.05); self.error_rate_thresholds.insert(ErrorCode::E3005, 0.10); self.error_rate_thresholds.insert(ErrorCode::E4001, 0.20); }
168
169 pub fn record_error(&self, code: ErrorCode, operation: impl Into<String>) {
171 let occurrence = ErrorOccurrence {
172 code,
173 timestamp: Instant::now(),
174 operation: operation.into(),
175 count: 1,
176 resolved: false,
177 recovery_action: None,
178 };
179
180 {
182 let mut history = self.error_history.lock().unwrap();
183 if history.len() >= self.max_historysize {
184 history.pop_front();
185 }
186 history.push_back(occurrence);
187 }
188
189 {
191 let mut counts = self.error_counts.lock().unwrap();
192 counts
193 .entry(code)
194 .or_insert_with(|| AtomicUsize::new(0))
195 .fetch_add(1, Ordering::Relaxed);
196 }
197
198 if self.pattern_detection_enabled {
200 self.check_patterns();
201 }
202 }
203
204 fn check_patterns(&self) {
206 let history = self.error_history.lock().unwrap();
207 let now = Instant::now();
208
209 for pattern in &self.patterns {
210 let relevant_errors: Vec<_> = history
211 .iter()
212 .filter(|err| {
213 pattern.error_codes.contains(&err.code)
214 && now.duration_since(err.timestamp) <= pattern.time_window
215 })
216 .collect();
217
218 if relevant_errors.len() >= pattern.frequency_threshold {
219 eprintln!(
220 "ā ļø ERROR PATTERN DETECTED: {} - {} ({})",
221 pattern.id, pattern.description, pattern.mitigation
222 );
223 }
224 }
225 }
226
227 pub fn get_statistics(&self) -> ErrorStatistics {
229 let counts = self.error_counts.lock().unwrap();
230 let history = self.error_history.lock().unwrap();
231
232 let total_errors: usize = counts
233 .values()
234 .map(|counter| counter.load(Ordering::Relaxed))
235 .sum();
236
237 let uptime = self.start_time.elapsed();
238 let error_rate = total_errors as f64 / uptime.as_secs_f64();
239
240 let mut error_distribution = HashMap::new();
242 for (code, counter) in counts.iter() {
243 let count = counter.load(Ordering::Relaxed);
244 if count > 0 {
245 error_distribution.insert(*code, count);
246 }
247 }
248
249 let mut frequent_errors: Vec<_> = error_distribution.clone().into_iter().collect();
251 frequent_errors.sort_by(|a, b| b.1.cmp(&a.1));
252 let top_errors: Vec<_> = frequent_errors.into_iter().take(5).collect();
253
254 let one_hour_ago = Instant::now() - Duration::from_secs(3600);
256 let recent_errors = history
257 .iter()
258 .filter(|err| err.timestamp > one_hour_ago)
259 .count();
260 let recent_error_rate = recent_errors as f64 / 3600.0;
261
262 ErrorStatistics {
263 total_errors,
264 error_rate,
265 recent_error_rate,
266 uptime,
267 error_distribution,
268 top_errors: top_errors.into_iter().collect(),
269 active_patterns: self.detect_active_patterns(),
270 }
271 }
272
273 fn detect_active_patterns(&self) -> Vec<String> {
275 let history = self.error_history.lock().unwrap();
276 let now = Instant::now();
277 let mut active_patterns = Vec::new();
278
279 for pattern in &self.patterns {
280 let recent_errors: Vec<_> = history
281 .iter()
282 .filter(|err| {
283 pattern.error_codes.contains(&err.code)
284 && now.duration_since(err.timestamp) <= pattern.time_window
285 })
286 .collect();
287
288 if recent_errors.len() >= pattern.frequency_threshold {
289 active_patterns.push(pattern.id.clone());
290 }
291 }
292
293 active_patterns
294 }
295
296 pub fn generate_health_report(&self) -> HealthReport {
298 let stats = self.get_statistics();
299 let history = self.error_history.lock().unwrap();
300
301 let health_score = self.calculate_health_score(&stats);
303
304 let critical_issues = self.identify_critical_issues(&stats);
306
307 let recommendations = self.generate_recommendations(&stats, &critical_issues);
309
310 let trend = self.calculate_error_trend(&history);
312
313 HealthReport {
314 health_score,
315 critical_issues,
316 recommendations,
317 statistics: stats,
318 trend,
319 timestamp: SystemTime::now(),
320 }
321 }
322
323 fn calculate_health_score(&self, stats: &ErrorStatistics) -> u8 {
325 let mut score = 100.0;
326
327 if stats.error_rate > 1.0 {
329 score -= 30.0;
330 } else if stats.error_rate > 0.1 {
331 score -= 20.0;
332 } else if stats.error_rate > 0.01 {
333 score -= 10.0;
334 }
335
336 score -= stats.active_patterns.len() as f64 * 15.0;
338
339 for (code, count) in &stats.top_errors {
341 if code.severity() <= 2 {
342 score -= *count as f64 * 5.0;
343 }
344 }
345
346 if stats.recent_error_rate > stats.error_rate * 2.0 {
348 score -= 20.0;
349 }
350
351 score.max(0.0).min(100.0) as u8
352 }
353
354 fn identify_critical_issues(&self, stats: &ErrorStatistics) -> Vec<CriticalIssue> {
356 let mut issues = Vec::new();
357
358 if stats
360 .active_patterns
361 .contains(&"memory_pressure".to_string())
362 {
363 issues.push(CriticalIssue {
364 severity: 1,
365 title: "Memory Pressure Detected".to_string(),
366 description: "High memory allocation failures indicate system memory pressure"
367 .to_string(),
368 impact: "May cause application crashes or severe performance degradation"
369 .to_string(),
370 action_required: "Immediate memory optimization or resource scaling required"
371 .to_string(),
372 });
373 }
374
375 for (code, count) in &stats.top_errors {
377 if code.severity() <= 2 && *count > 10 {
378 issues.push(CriticalIssue {
379 severity: code.severity(),
380 title: format!("High {} Error Rate", code),
381 description: format!("Frequent {} errors detected", code.description()),
382 impact: "May indicate fundamental data or algorithm issues".to_string(),
383 action_required: "Investigate root cause and implement fixes".to_string(),
384 });
385 }
386 }
387
388 if stats.recent_error_rate > stats.error_rate * 3.0 {
390 issues.push(CriticalIssue {
391 severity: 2,
392 title: "Error Rate Spike".to_string(),
393 description: "Recent error rate significantly higher than baseline".to_string(),
394 impact: "Indicates potential system instability or new issues".to_string(),
395 action_required: "Monitor closely and investigate recent changes".to_string(),
396 });
397 }
398
399 issues
400 }
401
402 fn generate_recommendations(
404 &self,
405 stats: &ErrorStatistics,
406 issues: &[CriticalIssue],
407 ) -> Vec<Recommendation> {
408 let mut recommendations = Vec::new();
409
410 if stats
412 .active_patterns
413 .contains(&"numerical_instability".to_string())
414 {
415 recommendations.push(Recommendation {
416 priority: 1,
417 category: "Data Quality".to_string(),
418 title: "Improve Numerical Stability".to_string(),
419 description: "Implement data preprocessing and normalization".to_string(),
420 steps: vec![
421 "Check for extreme values in input data".to_string(),
422 "Apply appropriate data scaling or normalization".to_string(),
423 "Consider using more numerically stable algorithms".to_string(),
424 ],
425 expected_impact: "Reduce numerical errors by 70-90%".to_string(),
426 });
427 }
428
429 for (code, count) in &stats.top_errors {
431 match code {
432 ErrorCode::E3005 => {
433 recommendations.push(Recommendation {
434 priority: 2,
435 category: "Data Validation".to_string(),
436 title: "Handle NaN Values".to_string(),
437 description: "Implement comprehensive NaN handling strategy".to_string(),
438 steps: vec![
439 "Add data validation checks before processing".to_string(),
440 "Implement NaN filtering or imputation".to_string(),
441 "Use statistical methods that handle missing data".to_string(),
442 ],
443 expected_impact: "Eliminate NaN-related errors".to_string(),
444 });
445 }
446 ErrorCode::E3003 => {
447 recommendations.push(Recommendation {
448 priority: 2,
449 category: "Algorithm Tuning".to_string(),
450 title: "Optimize Convergence Parameters".to_string(),
451 description: "Adjust algorithm parameters for better convergence"
452 .to_string(),
453 steps: vec![
454 "Increase maximum iterations for iterative algorithms".to_string(),
455 "Adjust convergence tolerance based on data characteristics"
456 .to_string(),
457 "Consider using different initialization strategies".to_string(),
458 ],
459 expected_impact: "Improve convergence rate by 50-80%".to_string(),
460 });
461 }
462 _ => {}
463 }
464 }
465
466 if stats.error_rate > 0.1 {
468 recommendations.push(Recommendation {
469 priority: 1,
470 category: "System Health".to_string(),
471 title: "Reduce Overall Error Rate".to_string(),
472 description: "Implement comprehensive error prevention strategy".to_string(),
473 steps: vec![
474 "Add input validation at system boundaries".to_string(),
475 "Implement data quality checks".to_string(),
476 "Use defensive programming practices".to_string(),
477 ],
478 expected_impact: "Reduce overall error rate significantly".to_string(),
479 });
480 }
481
482 recommendations
483 }
484
485 fn calculate_error_trend(&self, history: &VecDeque<ErrorOccurrence>) -> ErrorTrend {
487 if history.len() < 10 {
488 return ErrorTrend {
489 direction: TrendDirection::Stable,
490 magnitude: 0.0,
491 confidence: 0.0,
492 description: "Insufficient data for trend analysis".to_string(),
493 };
494 }
495
496 let now = Instant::now();
497 let recent_window = Duration::from_secs(1800); let older_window = Duration::from_secs(3600); let recent_errors = history
501 .iter()
502 .filter(|err| now.duration_since(err.timestamp) <= recent_window)
503 .count();
504
505 let older_errors = history
506 .iter()
507 .filter(|err| {
508 let age = now.duration_since(err.timestamp);
509 age > recent_window && age <= older_window
510 })
511 .count();
512
513 let recent_rate = recent_errors as f64 / recent_window.as_secs_f64();
514 let older_rate = older_errors as f64 / recent_window.as_secs_f64(); let change_ratio = if older_rate > 0.0 {
517 recent_rate / older_rate
518 } else if recent_rate > 0.0 {
519 2.0 } else {
521 1.0 };
523
524 let (direction, description) = if change_ratio > 1.5 {
525 (
526 TrendDirection::Increasing,
527 "Error rate is increasing significantly".to_string(),
528 )
529 } else if change_ratio < 0.5 {
530 (
531 TrendDirection::Decreasing,
532 "Error rate is decreasing significantly".to_string(),
533 )
534 } else {
535 (
536 TrendDirection::Stable,
537 "Error rate is relatively stable".to_string(),
538 )
539 };
540
541 let magnitude = (change_ratio - 1.0).abs();
542 let confidence = if history.len() > 50 { 0.8 } else { 0.5 };
543
544 ErrorTrend {
545 direction,
546 magnitude,
547 confidence,
548 description,
549 }
550 }
551}
552
553impl Default for ErrorMonitor {
554 fn default() -> Self {
555 Self::new()
556 }
557}
558
559#[derive(Debug)]
561pub struct ErrorStatistics {
562 pub total_errors: usize,
564 pub error_rate: f64,
566 pub recent_error_rate: f64,
568 pub uptime: Duration,
570 pub error_distribution: HashMap<ErrorCode, usize>,
572 pub top_errors: Vec<(ErrorCode, usize)>,
574 pub active_patterns: Vec<String>,
576}
577
/// A problem found during health analysis that needs attention.
///
/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so that
/// issues can be copied out of a report and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CriticalIssue {
    /// Urgency of the issue; lower numbers are more urgent
    /// (`HealthReport::requires_immediate_action` reacts to values `<= 2`).
    pub severity: u8,
    /// Short headline of the issue.
    pub title: String,
    /// What was observed.
    pub description: String,
    /// What the issue may lead to if left unaddressed.
    pub impact: String,
    /// What should be done about it.
    pub action_required: String,
}
592
/// A piece of remediation advice attached to a health report.
///
/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so that
/// recommendations can be copied out of a report and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Recommendation {
    /// Priority of the recommendation; the built-in rules use `1` and `2`.
    pub priority: u8,
    /// Area the advice belongs to (for example "Data Quality").
    pub category: String,
    /// Short headline of the recommendation.
    pub title: String,
    /// One-sentence summary of what to do.
    pub description: String,
    /// Concrete steps, in the order they should be presented.
    pub steps: Vec<String>,
    /// Benefit expected from following the recommendation.
    pub expected_impact: String,
}
609
610#[derive(Debug)]
612pub struct ErrorTrend {
613 pub direction: TrendDirection,
615 pub magnitude: f64,
617 pub confidence: f64,
619 pub description: String,
621}
622
/// Direction in which the error rate is moving.
///
/// Derives `Clone`, `Copy`, `PartialEq` and `Eq` in addition to `Debug` so
/// that callers can compare a reported direction against a variant and pass
/// it around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TrendDirection {
    /// The recent error rate is significantly higher than before.
    Increasing,
    /// The recent error rate is significantly lower than before.
    Decreasing,
    /// No significant change, or not enough data to tell.
    Stable,
}
630
631#[derive(Debug)]
633pub struct HealthReport {
634 pub health_score: u8,
636 pub critical_issues: Vec<CriticalIssue>,
638 pub recommendations: Vec<Recommendation>,
640 pub statistics: ErrorStatistics,
642 pub trend: ErrorTrend,
644 pub timestamp: SystemTime,
646}
647
648impl HealthReport {
649 pub fn to_formatted_string(&self) -> String {
651 let mut report = String::new();
652
653 report.push_str("=== STATISTICAL COMPUTING HEALTH REPORT ===\n\n");
654 report.push_str(&format!(
655 "š Overall Health Score: {}/100\n",
656 self.health_score
657 ));
658 report.push_str(&format!("ā±ļø Report Generated: {:?}\n\n", self.timestamp));
659
660 let health_indicator = match self.health_score {
662 90..=100 => "š¢ EXCELLENT",
663 70..=89 => "š” GOOD",
664 50..=69 => "š FAIR",
665 30..=49 => "š“ POOR",
666 _ => "šØ CRITICAL",
667 };
668 report.push_str(&format!("Status: {}\n\n", health_indicator));
669
670 if !self.critical_issues.is_empty() {
672 report.push_str("šØ CRITICAL ISSUES:\n");
673 for (i, issue) in self.critical_issues.iter().enumerate() {
674 report.push_str(&format!(
675 "{}. {} (Severity: {})\n {}\n Impact: {}\n Action: {}\n\n",
676 i + 1,
677 issue.title,
678 issue.severity,
679 issue.description,
680 issue.impact,
681 issue.action_required
682 ));
683 }
684 }
685
686 report.push_str("š STATISTICS SUMMARY:\n");
688 report.push_str(&format!(
689 "⢠Total Errors: {}\n",
690 self.statistics.total_errors
691 ));
692 report.push_str(&format!(
693 "⢠Error Rate: {:.4} errors/sec\n",
694 self.statistics.error_rate
695 ));
696 report.push_str(&format!(
697 "⢠Recent Rate: {:.4} errors/sec\n",
698 self.statistics.recent_error_rate
699 ));
700 report.push_str(&format!(
701 "⢠Uptime: {:.2} hours\n",
702 self.statistics.uptime.as_secs_f64() / 3600.0
703 ));
704
705 if !self.statistics.top_errors.is_empty() {
706 report.push_str("\nš TOP ERRORS:\n");
707 for (i, (code, count)) in self.statistics.top_errors.iter().enumerate() {
708 report.push_str(&format!(" {}. {}: {} occurrences\n", i + 1, code, count));
709 }
710 }
711
712 report.push_str(&format!("\nš TREND: {}\n", self.trend.description));
714
715 if !self.recommendations.is_empty() {
717 report.push_str("\nš” RECOMMENDATIONS:\n");
718 for (i, rec) in self.recommendations.iter().enumerate() {
719 report.push_str(&format!(
720 "{}. {} (Priority: {})\n {}\n Expected Impact: {}\n",
721 i + 1,
722 rec.title,
723 rec.priority,
724 rec.description,
725 rec.expected_impact
726 ));
727 if !rec.steps.is_empty() {
728 report.push_str(" Steps:\n");
729 for step in &rec.steps {
730 report.push_str(&format!(" ⢠{}\n", step));
731 }
732 }
733 report.push('\n');
734 }
735 }
736
737 report
738 }
739
740 pub fn requires_immediate_action(&self) -> bool {
742 self.health_score < 50 || self.critical_issues.iter().any(|issue| issue.severity <= 2)
743 }
744}
745
746static GLOBAL_MONITOR: std::sync::OnceLock<ErrorMonitor> = std::sync::OnceLock::new();
748
749#[allow(dead_code)]
751pub fn global_monitor() -> &'static ErrorMonitor {
752 GLOBAL_MONITOR.get_or_init(ErrorMonitor::new)
753}
754
755#[allow(dead_code)]
757pub fn record_global_error(code: ErrorCode, operation: impl Into<String>) {
758 global_monitor().record_error(code, operation);
759}
760
761#[allow(dead_code)]
763pub fn get_global_statistics() -> ErrorStatistics {
764 global_monitor().get_statistics()
765}
766
767#[allow(dead_code)]
769pub fn generate_global_health_report() -> HealthReport {
770 global_monitor().generate_health_report()
771}
772
#[cfg(test)]
mod tests {
    use super::*;

    // Every test here is ignored with the reason "timeout". Run them
    // explicitly with `cargo test -- --ignored`, and drop the attribute once
    // they are confirmed to run to completion.

    /// A single recorded error shows up in the totals and the distribution.
    #[test]
    #[ignore = "timeout"]
    fn test_error_monitor_basic() {
        let monitor = ErrorMonitor::new();
        monitor.record_error(ErrorCode::E3005, "test_operation");

        let stats = monitor.get_statistics();
        assert_eq!(stats.total_errors, 1);
        assert!(stats.error_distribution.contains_key(&ErrorCode::E3005));
    }

    /// Five memory-allocation errors in a row exceed the `memory_pressure`
    /// threshold of three within its 60-second window.
    #[test]
    #[ignore = "timeout"]
    fn test_pattern_detection() {
        let monitor = ErrorMonitor::new();

        for _ in 0..5 {
            monitor.record_error(ErrorCode::E5001, "memory_test");
        }

        let stats = monitor.get_statistics();
        assert_eq!(stats.total_errors, 5);
        assert!(stats
            .active_patterns
            .iter()
            .any(|id| id == "memory_pressure"));
    }

    /// A fresh monitor scores 100; recording errors lowers the score.
    #[test]
    #[ignore = "timeout"]
    fn test_health_score_calculation() {
        let monitor = ErrorMonitor::new();

        let health_report = monitor.generate_health_report();
        assert_eq!(health_report.health_score, 100);

        monitor.record_error(ErrorCode::E3001, "overflow_test");
        monitor.record_error(ErrorCode::E5001, "memory_test");

        let health_report = monitor.generate_health_report();
        assert!(health_report.health_score < 100);
    }
}