1#![allow(dead_code)]
18use crate::error_handling::AutogradError;
19use scirs2_core::ndarray::Array2;
20use serde::{Deserialize, Serialize};
21use std::collections::hash_map::DefaultHasher;
22use std::collections::HashMap;
23use std::hash::{Hash, Hasher};
24use std::time::{Duration, Instant};
25
/// Central diagnostics engine: records autograd error events and runs
/// pattern, correlation, and performance analyses over the history.
#[derive(Debug)]
pub struct ErrorDiagnosticsSystem {
    /// Detects and counts recurring error patterns.
    patterns: ErrorPatternDatabase,
    /// Tracks correlations between pairs of error types.
    correlations: ErrorCorrelationTracker,
    /// Estimates the performance cost of recorded errors.
    performance_analyzer: PerformanceImpactAnalyzer,
    /// Tunable limits and analysis windows.
    config: DiagnosticsConfig,
    /// Chronological record of every error seen (pruned per config).
    error_history: Vec<ErrorEvent>,
}
40
41impl ErrorDiagnosticsSystem {
42 pub fn new() -> Self {
44 Self::with_config(DiagnosticsConfig::default())
45 }
46
47 pub fn with_config(config: DiagnosticsConfig) -> Self {
49 Self {
50 patterns: ErrorPatternDatabase::new(),
51 correlations: ErrorCorrelationTracker::new(),
52 performance_analyzer: PerformanceImpactAnalyzer::new(),
53 config,
54 error_history: Vec::new(),
55 }
56 }
57
58 pub fn record_error(&mut self, error: &AutogradError, context: ErrorContext) {
60 let event = ErrorEvent {
61 timestamp: Instant::now(),
62 error: error.clone(),
63 context,
64 operation_id: self.generate_operation_id(),
65 };
66
67 self.error_history.push(event.clone());
69
70 self.patterns.analyze_error(&event);
72
73 self.correlations.update(&event, &self.error_history);
75
76 self.performance_analyzer.assess_impact(&event);
78
79 if self.error_history.len() > self.config.max_history_size {
81 self.error_history
82 .drain(0..self.config.history_cleanup_batch);
83 }
84 }
85
86 pub fn generate_diagnostic_report(&self) -> DiagnosticReport {
88 let pattern_analysis = self.patterns.generate_analysis();
89 let correlation_analysis = self.correlations.generate_analysis();
90 let performance_analysis = self.performance_analyzer.generate_analysis();
91
92 let mut recommendations = Vec::new();
93
94 for pattern in &pattern_analysis.detected_patterns {
96 recommendations.extend(self.generate_pattern_recommendations(pattern));
97 }
98
99 for correlation in &correlation_analysis.significant_correlations {
101 recommendations.extend(self.generate_correlation_recommendations(correlation));
102 }
103
104 DiagnosticReport {
105 timestamp: Instant::now(),
106 total_errors: self.error_history.len(),
107 analysis_window: self.config.analysis_window,
108 pattern_analysis,
109 correlation_analysis,
110 performance_analysis,
111 recommendations,
112 severity_assessment: self.assess_overall_severity(),
113 }
114 }
115
116 pub fn get_status(&self) -> DiagnosticStatus {
118 let recent_errors = self.get_recent_errors(Duration::from_secs(300)); let error_rate = recent_errors.len() as f64 / 300.0; DiagnosticStatus {
122 error_rate,
123 active_patterns: self.patterns.get_active_patterns().len(),
124 severity_level: self.assess_current_severity(&recent_errors),
125 last_critical_error: self.find_last_critical_error(),
126 health_score: self.calculate_health_score(&recent_errors),
127 }
128 }
129
130 fn generate_operation_id(&self) -> String {
132 format!("op_{}", self.error_history.len())
133 }
134
135 fn get_recent_errors(&self, window: Duration) -> Vec<&ErrorEvent> {
137 let cutoff = Instant::now() - window;
138 self.error_history
139 .iter()
140 .filter(|event| event.timestamp >= cutoff)
141 .collect()
142 }
143
144 fn generate_pattern_recommendations(
146 &self,
147 pattern: &ErrorPattern,
148 ) -> Vec<DiagnosticRecommendation> {
149 match pattern.pattern_type {
150 PatternType::ShapeMismatchPattern => vec![DiagnosticRecommendation {
151 category: RecommendationCategory::CodeImprovement,
152 priority: RecommendationPriority::High,
153 title: "Add Shape Validation".to_string(),
154 description: "Add explicit shape validation before tensor operations".to_string(),
155 code_example: Some("tensor.validate_shape(&expected_shape)?;".to_string()),
156 estimated_fix_time: Duration::from_secs(300), }],
158 PatternType::MemoryAllocationPattern => vec![DiagnosticRecommendation {
159 category: RecommendationCategory::Performance,
160 priority: RecommendationPriority::Medium,
161 title: "Optimize Memory Usage".to_string(),
162 description: "Consider using memory pooling or reducing batch sizes".to_string(),
163 code_example: Some("config.batch_size = config.batch_size / 2;".to_string()),
164 estimated_fix_time: Duration::from_secs(900), }],
166 PatternType::NumericalInstabilityPattern => vec![DiagnosticRecommendation {
167 category: RecommendationCategory::Stability,
168 priority: RecommendationPriority::High,
169 title: "Add Numerical Stabilization".to_string(),
170 description: "Add gradient clipping and check for NaN/infinity values".to_string(),
171 code_example: Some(
172 "gradients = clip_gradients(gradients, max_norm=1.0);".to_string(),
173 ),
174 estimated_fix_time: Duration::from_secs(600), }],
176 _ => Vec::new(),
177 }
178 }
179
180 fn generate_correlation_recommendations(
182 &self,
183 correlation: &ErrorCorrelation,
184 ) -> Vec<DiagnosticRecommendation> {
185 vec![DiagnosticRecommendation {
186 category: RecommendationCategory::Architecture,
187 priority: RecommendationPriority::Medium,
188 title: "Address Error Correlation".to_string(),
189 description: format!(
190 "Errors of type {:?} and {:?} are correlated",
191 correlation.error_type_1, correlation.error_type_2
192 ),
193 code_example: None,
194 estimated_fix_time: Duration::from_secs(1800), }]
196 }
197
198 fn assess_overall_severity(&self) -> SeverityLevel {
200 let recent_errors = self.get_recent_errors(Duration::from_secs(3600)); if recent_errors.len() > 100 {
203 SeverityLevel::Critical
204 } else if recent_errors.len() > 50 {
205 SeverityLevel::High
206 } else if recent_errors.len() > 10 {
207 SeverityLevel::Medium
208 } else {
209 SeverityLevel::Low
210 }
211 }
212
213 fn assess_current_severity(&self, recent_errors: &[&ErrorEvent]) -> SeverityLevel {
215 if recent_errors.len() > 20 {
216 SeverityLevel::Critical
217 } else if recent_errors.len() > 10 {
218 SeverityLevel::High
219 } else if recent_errors.len() > 5 {
220 SeverityLevel::Medium
221 } else {
222 SeverityLevel::Low
223 }
224 }
225
226 fn find_last_critical_error(&self) -> Option<Instant> {
228 self.error_history
229 .iter()
230 .rev()
231 .find(|event| self.is_critical_error(&event.error))
232 .map(|event| event.timestamp)
233 }
234
235 fn is_critical_error(&self, error: &AutogradError) -> bool {
237 matches!(
238 error,
239 AutogradError::NumericalInstability { .. } | AutogradError::MemoryAllocation { .. }
240 )
241 }
242
243 fn calculate_health_score(&self, recent_errors: &[&ErrorEvent]) -> f64 {
245 let base_score = 1.0;
246 let error_penalty = recent_errors.len() as f64 * 0.01;
247 let critical_penalty = recent_errors
248 .iter()
249 .filter(|e| self.is_critical_error(&e.error))
250 .count() as f64
251 * 0.05;
252
253 (base_score - error_penalty - critical_penalty).max(0.0)
254 }
255}
256
/// Stores detected error patterns and per-pattern occurrence counters.
#[derive(Debug)]
struct ErrorPatternDatabase {
    // One entry per distinct pattern type observed so far.
    patterns: Vec<ErrorPattern>,
    // Total occurrences keyed by pattern type.
    pattern_counts: HashMap<PatternType, usize>,
}
263
264impl ErrorPatternDatabase {
265 fn new() -> Self {
266 Self {
267 patterns: Vec::new(),
268 pattern_counts: HashMap::new(),
269 }
270 }
271
272 fn analyze_error(&mut self, event: &ErrorEvent) {
273 let pattern_type = self.classify_error_pattern(&event.error);
274
275 if let Some(pattern_type) = pattern_type {
276 *self.pattern_counts.entry(pattern_type).or_insert(0) += 1;
277
278 if let Some(pattern) = self
279 .patterns
280 .iter_mut()
281 .find(|p| p.pattern_type == pattern_type)
282 {
283 pattern.occurrences += 1;
284 pattern.last_occurrence = event.timestamp;
285 } else {
286 self.patterns.push(ErrorPattern {
287 pattern_type,
288 occurrences: 1,
289 first_occurrence: event.timestamp,
290 last_occurrence: event.timestamp,
291 severity: self.assess_pattern_severity(&pattern_type),
292 });
293 }
294 }
295 }
296
297 fn classify_error_pattern(&self, error: &AutogradError) -> Option<PatternType> {
298 match error {
299 AutogradError::ShapeMismatch { .. } => Some(PatternType::ShapeMismatchPattern),
300 AutogradError::MemoryAllocation { .. } => Some(PatternType::MemoryAllocationPattern),
301 AutogradError::NumericalInstability { .. } => {
302 Some(PatternType::NumericalInstabilityPattern)
303 }
304 AutogradError::ComputationGraph { .. } => Some(PatternType::ComputationGraphPattern),
305 AutogradError::GradientComputation { .. } => {
306 Some(PatternType::GradientComputationPattern)
307 }
308 _ => None,
309 }
310 }
311
312 fn assess_pattern_severity(&self, pattern_type: &PatternType) -> SeverityLevel {
313 match pattern_type {
314 PatternType::NumericalInstabilityPattern => SeverityLevel::Critical,
315 PatternType::MemoryAllocationPattern => SeverityLevel::High,
316 PatternType::ComputationGraphPattern => SeverityLevel::High,
317 PatternType::ShapeMismatchPattern => SeverityLevel::Medium,
318 PatternType::GradientComputationPattern => SeverityLevel::Medium,
319 _ => SeverityLevel::Low,
320 }
321 }
322
323 fn generate_analysis(&self) -> PatternAnalysis {
324 let detected_patterns = self.patterns.clone();
325 let most_common = self
326 .pattern_counts
327 .iter()
328 .max_by_key(|(_, count)| *count)
329 .map(|(pattern_type, _)| *pattern_type);
330
331 PatternAnalysis {
332 detected_patterns,
333 most_common_pattern: most_common,
334 total_patterns: self.patterns.len(),
335 }
336 }
337
338 fn get_active_patterns(&self) -> Vec<&ErrorPattern> {
339 let recent_threshold = Instant::now() - Duration::from_secs(3600); self.patterns
341 .iter()
342 .filter(|p| p.last_occurrence >= recent_threshold)
343 .collect()
344 }
345}
346
/// ML-flavored pattern recognition over error events: a linear classifier,
/// a statistical anomaly detector, and a temporal analyzer.
#[derive(Debug)]
pub struct MLPatternRecognitionSystem {
    /// Preallocated training-feature storage.
    /// NOTE(review): not read or written by any method visible here —
    /// confirm whether it is still needed.
    feature_matrix: Array2<f64>,
    /// Linear multi-class classifier over extracted features.
    classifier: ErrorPatternClassifier,
    /// Running-statistics (z-score) anomaly detector.
    anomaly_detector: ErrorAnomalyDetector,
    /// Tracks error timing to derive frequency/trend signals.
    temporal_analyzer: TemporalPatternAnalyzer,
    /// Labeled events retained for incremental training (capped).
    training_data: Vec<LabeledErrorEvent>,
    /// Dimensions, capacities, and learning parameters.
    ml_config: MLSystemConfig,
}
366
367impl MLPatternRecognitionSystem {
368 pub fn new() -> Self {
370 Self::with_config(MLSystemConfig::default())
371 }
372
373 pub fn with_config(config: MLSystemConfig) -> Self {
375 let feature_dim = config.feature_dimension;
376 let max_samples = config.max_training_samples;
377
378 Self {
379 feature_matrix: Array2::zeros((max_samples, feature_dim)),
380 classifier: ErrorPatternClassifier::new(feature_dim, config.num_classes),
381 anomaly_detector: ErrorAnomalyDetector::new(feature_dim),
382 temporal_analyzer: TemporalPatternAnalyzer::new(),
383 training_data: Vec::new(),
384 ml_config: config,
385 }
386 }
387
388 pub fn analyze_error_ml(&mut self, event: &ErrorEvent) -> MLAnalysisResult {
390 let features = self.extract_features(event);
392
393 let predicted_pattern = self.classifier.classify(&features);
395
396 let is_anomaly = self.anomaly_detector.is_anomaly(&features);
398
399 let temporal_info = self.temporal_analyzer.analyze_temporal_context(event);
401
402 let prediction = self.predict_next_error_probability(event);
404
405 MLAnalysisResult {
406 predicted_pattern: predicted_pattern.clone(),
407 confidence_score: predicted_pattern.confidence,
408 is_anomaly,
409 anomaly_score: if is_anomaly { 1.0 } else { 0.0 },
410 temporal_context: temporal_info,
411 next_error_probability: prediction,
412 feature_importance: self.calculate_feature_importance(&features),
413 }
414 }
415
416 fn extract_features(&self, event: &ErrorEvent) -> Vec<f64> {
418 let mut features = Vec::with_capacity(self.ml_config.feature_dimension);
419
420 features.extend(self.encode_error_type(&event.error));
422
423 features.extend(self.extract_temporal_features(event));
425
426 features.extend(self.extract_context_features(&event.context));
428
429 features.resize(self.ml_config.feature_dimension, 0.0);
431 features
432 }
433
434 fn encode_error_type(&self, error: &AutogradError) -> Vec<f64> {
436 let mut encoding = vec![0.0; 10]; match error {
439 AutogradError::ShapeMismatch { .. } => encoding[0] = 1.0,
440 AutogradError::MemoryAllocation { .. } => encoding[1] = 1.0,
441 AutogradError::NumericalInstability { .. } => encoding[2] = 1.0,
442 AutogradError::ComputationGraph { .. } => encoding[3] = 1.0,
443 AutogradError::GradientComputation { .. } => encoding[4] = 1.0,
444 _ => encoding[9] = 1.0, }
446
447 encoding
448 }
449
450 fn extract_temporal_features(&self, event: &ErrorEvent) -> Vec<f64> {
452 let timestamp_secs = event.timestamp.elapsed().as_secs_f64();
453 vec![
454 timestamp_secs.sin(), timestamp_secs.cos(),
456 timestamp_secs % 86400.0, (timestamp_secs / 86400.0).fract(), ]
459 }
460
461 fn extract_context_features(&self, context: &ErrorContext) -> Vec<f64> {
463 vec![
464 context.tensor_ids.len() as f64,
465 context.stack_trace.len() as f64,
466 context.operation_name.len() as f64,
467 ]
468 }
469
470 fn predict_next_error_probability(&self, event: &ErrorEvent) -> f64 {
472 let base_probability = 0.1;
475 let temporal_factor = self.temporal_analyzer.get_temporal_risk_factor(event);
476 (base_probability * temporal_factor).min(1.0)
477 }
478
479 fn calculate_feature_importance(&self, features: &[f64]) -> Vec<f64> {
481 features.iter().map(|&f| f.abs()).collect()
483 }
484
485 pub fn train_incremental(&mut self, labeled_events: &[LabeledErrorEvent]) {
487 for event in labeled_events {
488 self.training_data.push(event.clone());
489
490 let features = self.extract_features(&event.event);
492 self.classifier.update(&features, &event.label);
493
494 self.anomaly_detector.update(&features);
496 }
497
498 if self.training_data.len() > self.ml_config.max_training_samples {
500 let excess = self.training_data.len() - self.ml_config.max_training_samples;
501 self.training_data.drain(0..excess);
502 }
503 }
504}
505
/// Linear multi-class classifier trained with perceptron-style updates.
#[derive(Debug)]
struct ErrorPatternClassifier {
    // (num_classes x feature_dim) weight matrix.
    weights: Array2<f64>,
    // Per-class bias terms.
    bias: Vec<f64>,
    // Step size for perceptron updates.
    learning_rate: f64,
}
516
517impl ErrorPatternClassifier {
518 fn new(feature_dim: usize, num_classes: usize) -> Self {
519 let mut weights = Array2::zeros((num_classes, feature_dim));
520
521 let scale = (2.0 / feature_dim as f64).sqrt();
523 for i in 0..num_classes {
524 for j in 0..feature_dim {
525 let mut hasher = DefaultHasher::new();
527 (i, j).hash(&mut hasher);
528 let hash_val = hasher.finish();
529 let normalized = (hash_val as f64) / (u64::MAX as f64);
530 weights[[i, j]] = (normalized - 0.5) * 2.0 * scale;
531 }
532 }
533
534 Self {
535 weights,
536 bias: vec![0.0; num_classes],
537 learning_rate: 0.01,
538 }
539 }
540
541 fn classify(&self, features: &[f64]) -> MLPatternPrediction {
542 let scores = self.compute_scores(features);
543 let max_idx = scores
544 .iter()
545 .enumerate()
546 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
547 .map(|(idx, _)| idx)
548 .unwrap_or(0);
549
550 let confidence = self.softmax(&scores)[max_idx];
551 let pattern_type = self.index_to_pattern_type(max_idx);
552
553 MLPatternPrediction {
554 pattern_type,
555 confidence,
556 raw_scores: scores,
557 }
558 }
559
560 fn compute_scores(&self, features: &[f64]) -> Vec<f64> {
561 let mut scores = self.bias.clone();
562
563 for i in 0..self.weights.nrows() {
564 for j in 0..features.len().min(self.weights.ncols()) {
565 scores[i] += self.weights[[i, j]] * features[j];
566 }
567 }
568
569 scores
570 }
571
572 fn softmax(&self, scores: &[f64]) -> Vec<f64> {
573 let max_score = scores.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
574 let exp_scores: Vec<f64> = scores.iter().map(|&s| (s - max_score).exp()).collect();
575 let sum_exp: f64 = exp_scores.iter().sum();
576 exp_scores.iter().map(|&e| e / sum_exp).collect()
577 }
578
579 fn index_to_pattern_type(&self, index: usize) -> PatternType {
580 match index {
581 0 => PatternType::ShapeMismatchPattern,
582 1 => PatternType::MemoryAllocationPattern,
583 2 => PatternType::NumericalInstabilityPattern,
584 3 => PatternType::ComputationGraphPattern,
585 4 => PatternType::GradientComputationPattern,
586 _ => PatternType::ShapeMismatchPattern, }
588 }
589
590 fn update(&mut self, features: &[f64], label: &PatternLabel) {
591 let predicted = self.classify(features);
592 let target_idx = self.pattern_type_to_index(&label.pattern_type);
593 let predicted_idx = self.pattern_type_to_index(&predicted.pattern_type);
594
595 if target_idx != predicted_idx {
597 for j in 0..features.len().min(self.weights.ncols()) {
598 self.weights[[target_idx, j]] += self.learning_rate * features[j];
599 self.weights[[predicted_idx, j]] -= self.learning_rate * features[j];
600 }
601
602 self.bias[target_idx] += self.learning_rate;
603 self.bias[predicted_idx] -= self.learning_rate;
604 }
605 }
606
607 fn pattern_type_to_index(&self, pattern_type: &PatternType) -> usize {
608 match pattern_type {
609 PatternType::ShapeMismatchPattern => 0,
610 PatternType::MemoryAllocationPattern => 1,
611 PatternType::NumericalInstabilityPattern => 2,
612 PatternType::ComputationGraphPattern => 3,
613 PatternType::GradientComputationPattern => 4,
614 _ => 0,
615 }
616 }
617}
618
/// Online anomaly detector using per-feature running mean/variance
/// (Welford-style updates) and an absolute z-score threshold.
#[derive(Debug)]
struct ErrorAnomalyDetector {
    // Running mean of each feature.
    mean: Vec<f64>,
    // Running (population-style) variance of each feature.
    variance: Vec<f64>,
    // Number of feature vectors observed so far.
    sample_count: usize,
    // Absolute z-score above which a feature counts as anomalous.
    threshold: f64,
}

impl ErrorAnomalyDetector {
    fn new(feature_dim: usize) -> Self {
        Self {
            mean: vec![0.0; feature_dim],
            variance: vec![1.0; feature_dim],
            sample_count: 0,
            // Classic three-sigma rule.
            threshold: 3.0,
        }
    }

    /// Returns true when any feature's z-score exceeds the threshold.
    /// Requires at least 10 observed samples before flagging anything.
    fn is_anomaly(&self, features: &[f64]) -> bool {
        if self.sample_count < 10 {
            return false;
        }

        features
            .iter()
            .zip(&self.mean)
            .zip(&self.variance)
            .any(|((&f, &m), &v)| {
                // Guard the zero-variance case: dividing by 0 yields
                // NaN/inf z-scores, and NaN comparisons silently read
                // as "not anomalous". Clamping the std-dev keeps the
                // math finite while preserving the original verdicts.
                let std_dev = v.sqrt().max(f64::EPSILON);
                ((f - m) / std_dev).abs() > self.threshold
            })
    }

    /// Folds one feature vector into the running statistics.
    fn update(&mut self, features: &[f64]) {
        self.sample_count += 1;
        let n = self.sample_count as f64;

        // Extra feature dimensions beyond the configured width are ignored.
        for i in 0..features.len().min(self.mean.len()) {
            let delta = features[i] - self.mean[i];
            self.mean[i] += delta / n;
            let delta2 = features[i] - self.mean[i];
            self.variance[i] = ((n - 1.0) * self.variance[i] + delta * delta2) / n;
        }
    }
}
670
671#[derive(Debug)]
673struct TemporalPatternAnalyzer {
674 error_history: Vec<Instant>,
676 seasonal_patterns: HashMap<String, f64>,
678}
679
680impl TemporalPatternAnalyzer {
681 fn new() -> Self {
682 Self {
683 error_history: Vec::new(),
684 seasonal_patterns: HashMap::new(),
685 }
686 }
687
688 fn analyze_temporal_context(&mut self, event: &ErrorEvent) -> TemporalContext {
689 self.error_history.push(event.timestamp);
690
691 let cutoff = Instant::now() - Duration::from_secs(86400); self.error_history.retain(|&t| t >= cutoff);
694
695 let frequency = self.calculate_error_frequency();
696 let trend = self.calculate_trend();
697 let seasonality = self.detect_seasonality();
698
699 TemporalContext {
700 error_frequency: frequency,
701 trend_direction: trend,
702 seasonal_factor: seasonality,
703 time_since_last_error: self.time_since_last_error(),
704 }
705 }
706
707 fn calculate_error_frequency(&self) -> f64 {
708 if self.error_history.len() < 2 {
709 return 0.0;
710 }
711
712 let time_span = self
713 .error_history
714 .last()
715 .expect("error_history is non-empty")
716 .duration_since(
717 *self
718 .error_history
719 .first()
720 .expect("error_history is non-empty"),
721 )
722 .as_secs_f64();
723
724 self.error_history.len() as f64 / time_span.max(1.0)
725 }
726
727 fn calculate_trend(&self) -> f64 {
728 if self.error_history.len() < 10 {
729 return 0.0; }
731
732 let recent = &self.error_history[self.error_history.len() - 5..];
734 let older =
735 &self.error_history[self.error_history.len() - 10..self.error_history.len() - 5];
736
737 let recent_rate = recent.len() as f64 / 300.0; let older_rate = older.len() as f64 / 300.0;
739
740 recent_rate - older_rate
741 }
742
743 fn detect_seasonality(&self) -> f64 {
744 1.0 }
747
748 fn time_since_last_error(&self) -> f64 {
749 self.error_history
750 .last()
751 .map(|&t| t.elapsed().as_secs_f64())
752 .unwrap_or(0.0)
753 }
754
755 fn get_temporal_risk_factor(&self, _event: &ErrorEvent) -> f64 {
756 let base_risk = 1.0;
758 let frequency_factor = (self.calculate_error_frequency() * 10.0).min(2.0);
759 let trend_factor = (1.0 + self.calculate_trend()).max(0.1);
760
761 base_risk * frequency_factor * trend_factor
762 }
763}
764
/// Tuning knobs for the ML pattern-recognition subsystem.
#[derive(Debug, Clone)]
pub struct MLSystemConfig {
    /// Width of every extracted feature vector.
    pub feature_dimension: usize,
    /// Number of pattern classes the classifier distinguishes.
    pub num_classes: usize,
    /// Cap on retained labeled training events.
    pub max_training_samples: usize,
    /// Z-score threshold for the anomaly detector.
    pub anomaly_threshold: f64,
    /// Step size for classifier updates.
    pub learning_rate: f64,
}

impl Default for MLSystemConfig {
    /// Defaults: 20-dim features, 5 classes, 10k training samples,
    /// three-sigma anomaly threshold, 0.01 learning rate.
    fn default() -> Self {
        Self {
            num_classes: 5,
            feature_dimension: 20,
            learning_rate: 0.01,
            anomaly_threshold: 3.0,
            max_training_samples: 10_000,
        }
    }
}
786
/// Full output of one `analyze_error_ml` run.
#[derive(Debug, Clone)]
pub struct MLAnalysisResult {
    /// Classifier's top prediction with raw scores.
    pub predicted_pattern: MLPatternPrediction,
    /// Softmax confidence of the top prediction.
    pub confidence_score: f64,
    /// Whether the event's features were flagged as anomalous.
    pub is_anomaly: bool,
    /// Binary anomaly score (1.0 if anomalous, else 0.0).
    pub anomaly_score: f64,
    /// Frequency/trend/seasonality summary at the time of the event.
    pub temporal_context: TemporalContext,
    /// Estimated probability (0..=1) of another error soon.
    pub next_error_probability: f64,
    /// Per-feature importance proxy (absolute feature values).
    pub feature_importance: Vec<f64>,
}
798
/// A single classifier prediction.
#[derive(Debug, Clone)]
pub struct MLPatternPrediction {
    /// Predicted pattern class.
    pub pattern_type: PatternType,
    /// Softmax probability of the predicted class.
    pub confidence: f64,
    /// Pre-softmax score per class.
    pub raw_scores: Vec<f64>,
}
806
/// Timing summary of recent errors.
#[derive(Debug, Clone)]
pub struct TemporalContext {
    /// Errors per second over the retained history.
    pub error_frequency: f64,
    /// Positive when errors are accelerating, negative when slowing.
    pub trend_direction: f64,
    /// Seasonal multiplier (currently always the neutral 1.0).
    pub seasonal_factor: f64,
    /// Seconds since the most recent error.
    pub time_since_last_error: f64,
}
815
/// An error event paired with its ground-truth label for training.
#[derive(Debug, Clone)]
pub struct LabeledErrorEvent {
    /// The observed error event.
    pub event: ErrorEvent,
    /// Human/automated label describing the event's pattern.
    pub label: PatternLabel,
}
822
/// Ground-truth annotation for a training event.
#[derive(Debug, Clone)]
pub struct PatternLabel {
    /// Labeled pattern class.
    pub pattern_type: PatternType,
    /// Labeled severity.
    pub severity: SeverityLevel,
    /// Labeler's confidence in the annotation.
    pub confidence: f64,
}
830
/// Tracks pairwise correlations between error types.
/// NOTE(review): `update` is currently a no-op, so `correlations` is
/// never populated — analysis output will always be empty.
#[derive(Debug)]
struct ErrorCorrelationTracker {
    correlations: Vec<ErrorCorrelation>,
}
836
837impl ErrorCorrelationTracker {
838 fn new() -> Self {
839 Self {
840 correlations: Vec::new(),
841 }
842 }
843
844 fn update(&mut self, _event: &ErrorEvent, _history: &[ErrorEvent]) {
845 }
848
849 fn generate_analysis(&self) -> CorrelationAnalysis {
850 CorrelationAnalysis {
851 significant_correlations: self.correlations.clone(),
852 correlation_strength: self.calculate_overall_correlation_strength(),
853 }
854 }
855
856 fn calculate_overall_correlation_strength(&self) -> f64 {
857 if self.correlations.is_empty() {
858 0.0
859 } else {
860 self.correlations.iter().map(|c| c.strength).sum::<f64>()
861 / self.correlations.len() as f64
862 }
863 }
864}
865
/// Accumulates per-error performance-cost estimates.
#[derive(Debug)]
struct PerformanceImpactAnalyzer {
    // One entry per assessed error event, in recording order.
    impact_history: Vec<PerformanceImpact>,
}
871
872impl PerformanceImpactAnalyzer {
873 fn new() -> Self {
874 Self {
875 impact_history: Vec::new(),
876 }
877 }
878
879 fn assess_impact(&mut self, event: &ErrorEvent) {
880 let impact = PerformanceImpact {
881 error_type: format!("{:?}", event.error),
882 timestamp: event.timestamp,
883 estimated_delay: self.estimate_error_delay(&event.error),
884 recovery_time: self.estimate_recovery_time(&event.error),
885 };
886
887 self.impact_history.push(impact);
888 }
889
890 fn estimate_error_delay(&self, error: &AutogradError) -> Duration {
891 match error {
892 AutogradError::MemoryAllocation { .. } => Duration::from_millis(100),
893 AutogradError::NumericalInstability { .. } => Duration::from_millis(50),
894 AutogradError::ComputationGraph { .. } => Duration::from_millis(200),
895 _ => Duration::from_millis(10),
896 }
897 }
898
899 fn estimate_recovery_time(&self, error: &AutogradError) -> Duration {
900 match error {
901 AutogradError::MemoryAllocation { .. } => Duration::from_secs(5),
902 AutogradError::NumericalInstability { .. } => Duration::from_secs(1),
903 AutogradError::ComputationGraph { .. } => Duration::from_secs(10),
904 _ => Duration::from_millis(100),
905 }
906 }
907
908 fn generate_analysis(&self) -> PerformanceAnalysis {
909 let total_delay: Duration = self.impact_history.iter().map(|i| i.estimated_delay).sum();
910 let total_recovery_time: Duration =
911 self.impact_history.iter().map(|i| i.recovery_time).sum();
912
913 PerformanceAnalysis {
914 total_performance_impact: total_delay + total_recovery_time,
915 average_error_delay: if self.impact_history.is_empty() {
916 Duration::from_millis(0)
917 } else {
918 total_delay / self.impact_history.len() as u32
919 },
920 impact_events: self.impact_history.len(),
921 }
922 }
923}
924
/// A single recorded error occurrence with its surrounding context.
#[derive(Debug, Clone)]
pub struct ErrorEvent {
    /// When the error was recorded (monotonic clock).
    pub timestamp: Instant,
    /// The autograd error that occurred.
    pub error: AutogradError,
    /// Contextual information captured at the error site.
    pub context: ErrorContext,
    /// Best-effort identifier of the failing operation.
    pub operation_id: String,
}
934
/// Contextual details captured alongside an error.
#[derive(Debug, Clone)]
pub struct ErrorContext {
    /// Name of the operation that failed.
    pub operation_name: String,
    /// Ids of the tensors involved.
    pub tensor_ids: Vec<usize>,
    /// Captured stack frames (as preformatted strings).
    pub stack_trace: Vec<String>,
}
941
/// Tunable limits and windows for the diagnostics system.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiagnosticsConfig {
    /// Maximum events retained before pruning kicks in.
    pub max_history_size: usize,
    /// Number of oldest events removed per pruning pass.
    pub history_cleanup_batch: usize,
    /// Time window used for overall severity/report analysis.
    pub analysis_window: Duration,
    /// Enables the correlation analyzer.
    /// NOTE(review): not consulted by any code visible here — confirm.
    pub enable_correlation_analysis: bool,
    /// Enables the performance analyzer.
    /// NOTE(review): not consulted by any code visible here — confirm.
    pub enable_performance_analysis: bool,
}
950
951impl Default for DiagnosticsConfig {
952 fn default() -> Self {
953 Self {
954 max_history_size: 10000,
955 history_cleanup_batch: 1000,
956 analysis_window: Duration::from_secs(3600), enable_correlation_analysis: true,
958 enable_performance_analysis: true,
959 }
960 }
961}
962
/// Complete diagnostics snapshot produced on demand.
#[derive(Debug, Clone)]
pub struct DiagnosticReport {
    /// When the report was generated.
    pub timestamp: Instant,
    /// Total events currently retained in the history.
    pub total_errors: usize,
    /// Analysis window the report's severity is based on.
    pub analysis_window: Duration,
    /// Detected error patterns.
    pub pattern_analysis: PatternAnalysis,
    /// Tracked error-type correlations.
    pub correlation_analysis: CorrelationAnalysis,
    /// Aggregated performance-cost estimates.
    pub performance_analysis: PerformanceAnalysis,
    /// Remediation advice derived from patterns and correlations.
    pub recommendations: Vec<DiagnosticRecommendation>,
    /// Overall severity over the analysis window.
    pub severity_assessment: SeverityLevel,
}
974
/// Lightweight health snapshot (5-minute window).
#[derive(Debug)]
pub struct DiagnosticStatus {
    /// Errors per second over the status window.
    pub error_rate: f64,
    /// Count of patterns active within the last hour.
    pub active_patterns: usize,
    /// Severity over the status window.
    pub severity_level: SeverityLevel,
    /// Timestamp of the most recent critical error, if any.
    pub last_critical_error: Option<Instant>,
    /// Health in [0, 1]; 1.0 means no recent errors.
    pub health_score: f64,
}
983
/// A recurring error pattern and its occurrence statistics.
#[derive(Debug, Clone)]
pub struct ErrorPattern {
    /// Category of the pattern.
    pub pattern_type: PatternType,
    /// Total times this pattern has been seen.
    pub occurrences: usize,
    /// When the pattern was first observed.
    pub first_occurrence: Instant,
    /// When the pattern was most recently observed.
    pub last_occurrence: Instant,
    /// Static severity assigned to this pattern kind.
    pub severity: SeverityLevel,
}
992
/// Categories of recurring error patterns the diagnostics can detect.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PatternType {
    /// Repeated tensor shape mismatches.
    ShapeMismatchPattern,
    /// Repeated memory allocation failures.
    MemoryAllocationPattern,
    /// Repeated numerical-instability errors.
    NumericalInstabilityPattern,
    /// Repeated computation-graph errors.
    ComputationGraphPattern,
    /// Repeated gradient-computation errors.
    GradientComputationPattern,
    /// Generic recurring failure not covered by the specific kinds.
    RecurrentFailurePattern,
}
1002
/// A detected correlation between two error types.
#[derive(Debug, Clone)]
pub struct ErrorCorrelation {
    /// Debug-formatted name of the first error type.
    pub error_type_1: String,
    /// Debug-formatted name of the second error type.
    pub error_type_2: String,
    /// Correlation strength (higher = stronger).
    pub strength: f64,
    /// Times the pair has been observed together.
    pub occurrences: usize,
}
1010
/// Summary of all detected error patterns.
#[derive(Debug, Clone)]
pub struct PatternAnalysis {
    /// Every pattern observed so far.
    pub detected_patterns: Vec<ErrorPattern>,
    /// Pattern with the highest occurrence count, if any.
    pub most_common_pattern: Option<PatternType>,
    /// Number of distinct patterns detected.
    pub total_patterns: usize,
}
1017
/// Summary of tracked error-type correlations.
#[derive(Debug, Clone)]
pub struct CorrelationAnalysis {
    /// Correlations considered significant.
    pub significant_correlations: Vec<ErrorCorrelation>,
    /// Mean strength across all tracked correlations.
    pub correlation_strength: f64,
}
1023
/// Aggregated performance-cost estimates across recorded errors.
#[derive(Debug, Clone)]
pub struct PerformanceAnalysis {
    /// Sum of estimated delays plus recovery times.
    pub total_performance_impact: Duration,
    /// Mean estimated delay per error (zero when none recorded).
    pub average_error_delay: Duration,
    /// Number of assessed error events.
    pub impact_events: usize,
}
1030
/// Estimated performance cost of a single error event.
#[derive(Debug, Clone)]
pub struct PerformanceImpact {
    /// Debug-formatted error description.
    pub error_type: String,
    /// When the error occurred.
    pub timestamp: Instant,
    /// Estimated processing delay caused by the error.
    pub estimated_delay: Duration,
    /// Estimated time to recover from the error.
    pub recovery_time: Duration,
}
1038
/// Actionable remediation advice included in diagnostic reports.
#[derive(Debug, Clone)]
pub struct DiagnosticRecommendation {
    /// What kind of change is suggested.
    pub category: RecommendationCategory,
    /// How urgent the change is.
    pub priority: RecommendationPriority,
    /// Short headline.
    pub title: String,
    /// Explanation of the suggested change.
    pub description: String,
    /// Optional illustrative code snippet.
    pub code_example: Option<String>,
    /// Rough estimate of the time needed to apply the fix.
    pub estimated_fix_time: Duration,
}
1048
/// Broad classification of a recommendation.
#[derive(Debug, Clone)]
pub enum RecommendationCategory {
    /// Code-level change (validation, refactor).
    CodeImprovement,
    /// Throughput/memory optimization.
    Performance,
    /// Numerical or runtime stability fix.
    Stability,
    /// Structural/design change.
    Architecture,
    /// Settings or tuning change.
    Configuration,
}
1057
/// Urgency ranking for recommendations, highest first.
#[derive(Debug, Clone)]
pub enum RecommendationPriority {
    Critical,
    High,
    Medium,
    Low,
}
1065
/// Severity ranking used for patterns and overall assessments,
/// highest first.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SeverityLevel {
    Critical,
    High,
    Medium,
    Low,
}
1073
#[cfg(test)]
mod tests {
    use super::*;

    // A fresh system starts with an empty history and default config.
    #[test]
    fn test_diagnostics_system_creation() {
        let system = ErrorDiagnosticsSystem::new();
        assert_eq!(system.error_history.len(), 0);
        assert_eq!(system.config.max_history_size, 10000);
    }

    // Recording one error appends exactly one event to the history.
    #[test]
    fn test_error_recording() {
        let mut system = ErrorDiagnosticsSystem::new();
        let error = AutogradError::GradientComputation {
            operation: "test_op".to_string(),
            tensor_id: Some(1),
            context: "test context".to_string(),
            source: None,
        };
        let context = ErrorContext {
            operation_name: "test".to_string(),
            tensor_ids: vec![1],
            stack_trace: vec!["test".to_string()],
        };

        system.record_error(&error, context);
        assert_eq!(system.error_history.len(), 1);
    }

    // An empty system yields an empty report with no recommendations.
    #[test]
    fn test_diagnostic_report_generation() {
        let system = ErrorDiagnosticsSystem::new();
        let report = system.generate_diagnostic_report();

        assert_eq!(report.total_errors, 0);
        assert!(report.recommendations.is_empty());
    }

    // With no recorded errors the overall severity is Low.
    #[test]
    fn test_severity_assessment() {
        let system = ErrorDiagnosticsSystem::new();
        let severity = system.assess_overall_severity();
        assert_eq!(severity, SeverityLevel::Low);
    }

    // No recent errors gives a perfect health score of 1.0.
    #[test]
    fn test_health_score_calculation() {
        let system = ErrorDiagnosticsSystem::new();
        let recent_errors = Vec::new();
        let health_score = system.calculate_health_score(&recent_errors);
        assert_eq!(health_score, 1.0);
    }

    // A shape-mismatch error is classified into the matching pattern.
    #[test]
    fn test_pattern_recognition() {
        let mut db = ErrorPatternDatabase::new();
        let error = AutogradError::ShapeMismatch {
            expected: vec![2, 3],
            actual: vec![3, 4],
            operation: "matmul".to_string(),
            tensor_names: vec!["A".to_string(), "B".to_string()],
        };
        let event = ErrorEvent {
            timestamp: Instant::now(),
            error,
            context: ErrorContext {
                operation_name: "test".to_string(),
                tensor_ids: vec![1, 2],
                stack_trace: vec!["test".to_string()],
            },
            operation_id: "test_op".to_string(),
        };

        db.analyze_error(&event);
        assert_eq!(db.patterns.len(), 1);
        assert_eq!(
            db.patterns[0].pattern_type,
            PatternType::ShapeMismatchPattern
        );
    }
}