Skip to main content

trustformers_training/
error_codes.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::sync::OnceLock;
4
/// Metadata describing one entry of the TrustformeRS Training error code system.
///
/// Error codes follow the pattern: COMPONENT_CATEGORY_SPECIFIC
/// where:
/// - COMPONENT: The module/component where the error occurred
/// - CATEGORY: The type of error (CONFIG, RUNTIME, RESOURCE, etc.)
/// - SPECIFIC: Specific error within the category
///
/// NOTE(review): not every code registered in this file has all three
/// segments (e.g. `RESOURCE_OOM`), so treat the pattern as a guideline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorCodeInfo {
    /// Unique identifier of the error; the registry stores the entry under this key.
    pub code: &'static str,
    /// Short human-readable title.
    pub name: &'static str,
    /// One-sentence explanation of what went wrong.
    pub description: &'static str,
    /// Severity label; the entries in this file use `"CRITICAL"`, `"HIGH"` and `"MEDIUM"`.
    pub severity: &'static str,
    /// Typical reasons the error occurs.
    pub causes: Vec<&'static str>,
    /// Suggested remediation steps; these are what the recovery-action helpers return.
    pub solutions: Vec<&'static str>,
    /// Link to further documentation, if any.
    pub documentation_url: Option<&'static str>,
    /// Codes of other errors that are related to this one.
    pub related_codes: Vec<&'static str>,
}
24
/// Comprehensive error code registry
///
/// Maps each error code string to its [`ErrorCodeInfo`]. The map is private;
/// entries are added through `register_code` and read through the lookup
/// methods on this type.
pub struct ErrorCodeRegistry {
    // Keyed by `ErrorCodeInfo::code`.
    codes: HashMap<&'static str, ErrorCodeInfo>,
}
29
30impl Default for ErrorCodeRegistry {
31    fn default() -> Self {
32        Self::new()
33    }
34}
35
36impl ErrorCodeRegistry {
37    pub fn new() -> Self {
38        let mut registry = Self {
39            codes: HashMap::new(),
40        };
41        registry.register_all_codes();
42        registry
43    }
44
45    pub fn get_code_info(&self, code: &str) -> Option<&ErrorCodeInfo> {
46        self.codes.get(code)
47    }
48
49    pub fn list_codes_by_component(&self, component: &str) -> Vec<&ErrorCodeInfo> {
50        self.codes.values().filter(|info| info.code.starts_with(component)).collect()
51    }
52
    /// Registers every built-in error code, grouped by component prefix
    /// (CONFIG, DATA, TRAIN, RESOURCE, NETWORK, HARDWARE, MODEL, QUANT, DIST).
    ///
    /// Each entry is passed to `register_code`, which stores it under its `code`.
    ///
    /// NOTE(review): several `related_codes` name codes that this function never
    /// registers: `DATA_FORMAT_UNSUPPORTED`, `TRAIN_OVERFITTING`,
    /// `TRAIN_UNDERFITTING`, `RESOURCE_CPU_LIMIT`, `RESOURCE_CUDA_ERROR`,
    /// `NETWORK_CONNECTION_REFUSED`, `HARDWARE_POWER_LIMIT`,
    /// `MODEL_CHECKPOINT_CORRUPT`, `QUANT_UNSUPPORTED_OP`, `DIST_NODE_FAILURE`
    /// and `DIST_SYNC_TIMEOUT`. Looking those up returns `None`; either register
    /// them or drop the references.
    fn register_all_codes(&mut self) {
        // Configuration Errors (CONFIG_*)
        self.register_code(ErrorCodeInfo {
            code: "CONFIG_INVALID_PARAM",
            name: "Invalid Parameter",
            description: "A configuration parameter has an invalid value",
            severity: "HIGH",
            causes: vec![
                "Parameter value outside valid range",
                "Incorrect parameter type",
                "Missing required parameter",
            ],
            solutions: vec![
                "Check parameter documentation",
                "Validate parameter ranges",
                "Use configuration schema validation",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/config"),
            related_codes: vec!["CONFIG_MISSING_PARAM", "CONFIG_TYPE_MISMATCH"],
        });

        self.register_code(ErrorCodeInfo {
            code: "CONFIG_MISSING_PARAM",
            name: "Missing Required Parameter",
            description: "A required configuration parameter is missing",
            severity: "CRITICAL",
            causes: vec![
                "Incomplete configuration file",
                "Parameter not provided in CLI",
                "Environment variable not set",
            ],
            solutions: vec![
                "Add missing parameter to config",
                "Check required parameters list",
                "Use configuration template",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/config"),
            related_codes: vec!["CONFIG_INVALID_PARAM"],
        });

        self.register_code(ErrorCodeInfo {
            code: "CONFIG_TYPE_MISMATCH",
            name: "Parameter Type Mismatch",
            description: "Configuration parameter has wrong type",
            severity: "HIGH",
            causes: vec![
                "String provided where number expected",
                "Invalid enum value",
                "Incorrect data structure",
            ],
            solutions: vec![
                "Check parameter type requirements",
                "Use correct data type",
                "Validate against schema",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/config"),
            related_codes: vec!["CONFIG_INVALID_PARAM"],
        });

        // Data Loading Errors (DATA_*)
        self.register_code(ErrorCodeInfo {
            code: "DATA_FILE_NOT_FOUND",
            name: "Dataset File Not Found",
            description: "Specified dataset file does not exist",
            severity: "CRITICAL",
            causes: vec![
                "Incorrect file path",
                "File moved or deleted",
                "Permission issues",
            ],
            solutions: vec![
                "Check file path spelling",
                "Verify file exists",
                "Check file permissions",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/data"),
            related_codes: vec!["DATA_PERMISSION_DENIED", "DATA_CORRUPT"],
        });

        self.register_code(ErrorCodeInfo {
            code: "DATA_CORRUPT",
            name: "Corrupted Dataset",
            description: "Dataset file is corrupted or malformed",
            severity: "HIGH",
            causes: vec![
                "Incomplete download",
                "File corruption",
                "Unsupported format",
            ],
            solutions: vec![
                "Re-download dataset",
                "Verify file integrity",
                "Check format compatibility",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/data"),
            related_codes: vec!["DATA_FORMAT_UNSUPPORTED"],
        });

        self.register_code(ErrorCodeInfo {
            code: "DATA_PERMISSION_DENIED",
            name: "Data Access Permission Denied",
            description: "Insufficient permissions to access dataset",
            severity: "CRITICAL",
            causes: vec![
                "File permission restrictions",
                "Directory access denied",
                "Network access blocked",
            ],
            solutions: vec![
                "Check file permissions",
                "Run with appropriate user",
                "Adjust filesystem permissions",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/data"),
            related_codes: vec!["DATA_FILE_NOT_FOUND"],
        });

        // Training Errors (TRAIN_*)
        self.register_code(ErrorCodeInfo {
            code: "TRAIN_NAN_LOSS",
            name: "NaN Loss Detected",
            description: "Training loss has become NaN (Not a Number)",
            severity: "CRITICAL",
            causes: vec![
                "Learning rate too high",
                "Gradient explosion",
                "Numerical instability",
                "Division by zero",
            ],
            solutions: vec![
                "Reduce learning rate",
                "Enable gradient clipping",
                "Check input data normalization",
                "Use mixed precision training",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/training"),
            related_codes: vec!["TRAIN_INF_LOSS", "TRAIN_GRADIENT_EXPLOSION"],
        });

        self.register_code(ErrorCodeInfo {
            code: "TRAIN_INF_LOSS",
            name: "Infinite Loss Detected",
            description: "Training loss has become infinite",
            severity: "CRITICAL",
            causes: vec![
                "Learning rate too high",
                "Unstable model architecture",
                "Overflow in calculations",
            ],
            solutions: vec![
                "Reduce learning rate",
                "Use gradient clipping",
                "Check model stability",
                "Enable loss scaling",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/training"),
            related_codes: vec!["TRAIN_NAN_LOSS", "TRAIN_GRADIENT_EXPLOSION"],
        });

        self.register_code(ErrorCodeInfo {
            code: "TRAIN_GRADIENT_EXPLOSION",
            name: "Gradient Explosion",
            description: "Gradients have grown too large",
            severity: "HIGH",
            causes: vec![
                "Learning rate too high",
                "Poor weight initialization",
                "Unstable optimization",
            ],
            solutions: vec![
                "Enable gradient clipping",
                "Reduce learning rate",
                "Improve weight initialization",
                "Use batch normalization",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/training"),
            related_codes: vec!["TRAIN_NAN_LOSS", "TRAIN_CONVERGENCE_FAILURE"],
        });

        self.register_code(ErrorCodeInfo {
            code: "TRAIN_CONVERGENCE_FAILURE",
            name: "Training Not Converging",
            description: "Model is not learning or converging",
            severity: "MEDIUM",
            causes: vec![
                "Learning rate too low",
                "Poor data quality",
                "Model capacity mismatch",
                "Optimization algorithm unsuitable",
            ],
            solutions: vec![
                "Increase learning rate",
                "Check data quality",
                "Adjust model size",
                "Try different optimizer",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/training"),
            related_codes: vec!["TRAIN_OVERFITTING", "TRAIN_UNDERFITTING"],
        });

        // Resource Errors (RESOURCE_*)
        self.register_code(ErrorCodeInfo {
            code: "RESOURCE_OOM",
            name: "Out of Memory",
            description: "System has run out of available memory",
            severity: "CRITICAL",
            causes: vec![
                "Batch size too large",
                "Model too large for available memory",
                "Memory leak",
                "Insufficient system memory",
            ],
            solutions: vec![
                "Reduce batch size",
                "Enable gradient checkpointing",
                "Use model parallelism",
                "Upgrade system memory",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/resource"),
            related_codes: vec!["RESOURCE_GPU_OOM", "RESOURCE_CPU_LIMIT"],
        });

        self.register_code(ErrorCodeInfo {
            code: "RESOURCE_GPU_OOM",
            name: "GPU Out of Memory",
            description: "GPU has run out of available memory",
            severity: "CRITICAL",
            causes: vec![
                "Batch size too large for GPU",
                "Model parameters exceed GPU memory",
                "Multiple processes using GPU",
            ],
            solutions: vec![
                "Reduce batch size",
                "Use gradient accumulation",
                "Enable GPU memory optimization",
                "Use model sharding",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/resource"),
            related_codes: vec!["RESOURCE_OOM", "RESOURCE_GPU_UNAVAILABLE"],
        });

        self.register_code(ErrorCodeInfo {
            code: "RESOURCE_GPU_UNAVAILABLE",
            name: "GPU Not Available",
            description: "Required GPU resources are not available",
            severity: "HIGH",
            causes: vec![
                "GPU not detected",
                "Driver issues",
                "GPU already in use",
                "CUDA/ROCm not installed",
            ],
            solutions: vec![
                "Check GPU installation",
                "Update GPU drivers",
                "Install CUDA/ROCm",
                "Use CPU fallback",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/resource"),
            related_codes: vec!["RESOURCE_CUDA_ERROR"],
        });

        // Network Errors (NETWORK_*)
        self.register_code(ErrorCodeInfo {
            code: "NETWORK_CONNECTION_TIMEOUT",
            name: "Network Connection Timeout",
            description: "Network connection timed out",
            severity: "MEDIUM",
            causes: vec![
                "Slow network connection",
                "Server overloaded",
                "Network configuration issues",
            ],
            solutions: vec![
                "Increase timeout value",
                "Check network connectivity",
                "Try different server",
                "Use offline mode",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/network"),
            related_codes: vec!["NETWORK_CONNECTION_REFUSED"],
        });

        // Hardware Errors (HARDWARE_*)
        self.register_code(ErrorCodeInfo {
            code: "HARDWARE_THERMAL_THROTTLING",
            name: "Thermal Throttling Detected",
            description: "Hardware is throttling due to temperature",
            severity: "MEDIUM",
            causes: vec![
                "High ambient temperature",
                "Inadequate cooling",
                "Intensive computational load",
            ],
            solutions: vec![
                "Improve cooling",
                "Reduce computational load",
                "Lower power limits",
                "Schedule training during cooler periods",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/hardware"),
            related_codes: vec!["HARDWARE_POWER_LIMIT"],
        });

        // Model Errors (MODEL_*)
        self.register_code(ErrorCodeInfo {
            code: "MODEL_INCOMPATIBLE_WEIGHTS",
            name: "Incompatible Model Weights",
            description: "Model weights are not compatible with architecture",
            severity: "HIGH",
            causes: vec![
                "Architecture mismatch",
                "Version incompatibility",
                "Corrupted checkpoint",
            ],
            solutions: vec![
                "Check model architecture",
                "Verify checkpoint version",
                "Re-download checkpoint",
                "Use compatible model version",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/model"),
            related_codes: vec!["MODEL_CHECKPOINT_CORRUPT"],
        });

        // Quantization Errors (QUANT_*)
        self.register_code(ErrorCodeInfo {
            code: "QUANT_CALIBRATION_FAILED",
            name: "Quantization Calibration Failed",
            description: "Failed to calibrate quantization parameters",
            severity: "HIGH",
            causes: vec![
                "Insufficient calibration data",
                "Extreme activation ranges",
                "Unsupported layer type",
            ],
            solutions: vec![
                "Increase calibration dataset size",
                "Check activation ranges",
                "Use different quantization method",
                "Pre-process calibration data",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/quantization"),
            related_codes: vec!["QUANT_UNSUPPORTED_OP"],
        });

        // Distributed Training Errors (DIST_*)
        self.register_code(ErrorCodeInfo {
            code: "DIST_COMMUNICATION_FAILURE",
            name: "Distributed Communication Failure",
            description: "Communication between nodes failed",
            severity: "HIGH",
            causes: vec![
                "Network partition",
                "Node failure",
                "Communication timeout",
                "Process crash",
            ],
            solutions: vec![
                "Check network connectivity",
                "Restart failed nodes",
                "Increase timeout values",
                "Use fault-tolerant training",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/distributed"),
            related_codes: vec!["DIST_NODE_FAILURE", "DIST_SYNC_TIMEOUT"],
        });

        self.register_code(ErrorCodeInfo {
            code: "DIST_RANK_MISMATCH",
            name: "Process Rank Mismatch",
            description: "Process rank configuration mismatch",
            severity: "CRITICAL",
            causes: vec![
                "Incorrect world size",
                "Duplicate rank assignment",
                "Process group misconfiguration",
            ],
            solutions: vec![
                "Check process configuration",
                "Verify world size setting",
                "Ensure unique rank assignment",
                "Restart distributed training",
            ],
            documentation_url: Some("https://docs.trustformers.rs/errors/distributed"),
            related_codes: vec!["DIST_COMMUNICATION_FAILURE"],
        });
    }
442
443    fn register_code(&mut self, info: ErrorCodeInfo) {
444        self.codes.insert(info.code, info);
445    }
446
447    /// Generate markdown documentation for all error codes
448    pub fn generate_documentation(&self) -> String {
449        let mut doc = String::from("# TrustformeRS Training Error Code Reference\n\n");
450
451        doc.push_str("This document provides comprehensive information about all error codes used in TrustformeRS Training.\n\n");
452
453        // Group by component
454        let mut components: HashMap<String, Vec<&ErrorCodeInfo>> = HashMap::new();
455        for info in self.codes.values() {
456            let component = info.code.split('_').next().unwrap_or("UNKNOWN").to_string();
457            components.entry(component).or_default().push(info);
458        }
459
460        for (component, codes) in components {
461            doc.push_str(&format!("## {} Errors\n\n", component));
462
463            for code_info in codes {
464                doc.push_str(&format!("### {} - {}\n\n", code_info.code, code_info.name));
465                doc.push_str(&format!("**Severity**: {}\n\n", code_info.severity));
466                doc.push_str(&format!("**Description**: {}\n\n", code_info.description));
467
468                doc.push_str("**Common Causes**:\n");
469                for cause in &code_info.causes {
470                    doc.push_str(&format!("- {}\n", cause));
471                }
472                doc.push('\n');
473
474                doc.push_str("**Solutions**:\n");
475                for solution in &code_info.solutions {
476                    doc.push_str(&format!("- {}\n", solution));
477                }
478                doc.push('\n');
479
480                if !code_info.related_codes.is_empty() {
481                    doc.push_str("**Related Error Codes**:\n");
482                    for related in &code_info.related_codes {
483                        doc.push_str(&format!("- {}\n", related));
484                    }
485                    doc.push('\n');
486                }
487
488                if let Some(url) = code_info.documentation_url {
489                    doc.push_str(&format!("**Documentation**: [{}]({})\n\n", url, url));
490                }
491
492                doc.push_str("---\n\n");
493            }
494        }
495
496        doc
497    }
498
499    /// Get recovery actions for a specific error code
500    pub fn get_recovery_actions(&self, code: &str) -> Vec<String> {
501        if let Some(info) = self.get_code_info(code) {
502            info.solutions.iter().map(|s| s.to_string()).collect()
503        } else {
504            vec!["Unknown error code - check documentation".to_string()]
505        }
506    }
507
508    /// Check if an error code is critical
509    pub fn is_critical(&self, code: &str) -> bool {
510        if let Some(info) = self.get_code_info(code) {
511            info.severity == "CRITICAL"
512        } else {
513            false
514        }
515    }
516
517    /// Get all error codes for a severity level
518    pub fn get_codes_by_severity(&self, severity: &str) -> Vec<&str> {
519        self.codes
520            .values()
521            .filter(|info| info.severity == severity)
522            .map(|info| info.code)
523            .collect()
524    }
525}
526
/// Global error code registry instance
///
/// Starts empty; `get_registry` fills it on first use via `OnceLock::get_or_init`.
static ERROR_CODE_REGISTRY: OnceLock<ErrorCodeRegistry> = OnceLock::new();
529
530/// Get the global error code registry instance
531fn get_registry() -> &'static ErrorCodeRegistry {
532    ERROR_CODE_REGISTRY.get_or_init(ErrorCodeRegistry::new)
533}
534
535/// Helper function to get error code information
536pub fn get_error_info(code: &str) -> Option<&'static ErrorCodeInfo> {
537    get_registry().get_code_info(code)
538}
539
540/// Helper function to check if error is critical
541pub fn is_critical_error(code: &str) -> bool {
542    get_registry().is_critical(code)
543}
544
545/// Helper function to get recovery actions
546pub fn get_recovery_actions(code: &str) -> Vec<String> {
547    get_registry().get_recovery_actions(code)
548}
549
#[cfg(test)]
mod tests {
    use super::*;

    /// A registered code can be looked up and carries populated metadata.
    #[test]
    fn test_error_code_registry() {
        let registry = ErrorCodeRegistry::new();

        let lookup = registry.get_code_info("TRAIN_NAN_LOSS");
        assert!(lookup.is_some());

        let entry = lookup.expect("operation failed in test");
        assert_eq!(entry.code, "TRAIN_NAN_LOSS");
        assert_eq!(entry.severity, "CRITICAL");
        assert!(!entry.causes.is_empty());
        assert!(!entry.solutions.is_empty());
    }

    /// Only entries labelled CRITICAL are reported as critical.
    #[test]
    fn test_severity_classification() {
        let registry = ErrorCodeRegistry::new();

        for critical in ["TRAIN_NAN_LOSS", "CONFIG_MISSING_PARAM"] {
            assert!(registry.is_critical(critical));
        }
        assert!(!registry.is_critical("TRAIN_CONVERGENCE_FAILURE"));
    }

    /// Recovery actions come from the entry's solutions.
    #[test]
    fn test_recovery_actions() {
        let registry = ErrorCodeRegistry::new();

        let actions = registry.get_recovery_actions("RESOURCE_OOM");
        assert!(!actions.is_empty());
        assert!(actions.iter().any(|action| action.contains("batch size")));
    }

    /// Listing by component only returns codes of that component.
    #[test]
    fn test_component_grouping() {
        let registry = ErrorCodeRegistry::new();

        let config_codes = registry.list_codes_by_component("CONFIG");
        assert!(!config_codes.is_empty());
        assert!(config_codes
            .iter()
            .all(|entry| entry.code.starts_with("CONFIG")));
    }

    /// Generated markdown contains the title and a known code.
    #[test]
    fn test_documentation_generation() {
        let markdown = ErrorCodeRegistry::new().generate_documentation();

        assert!(!markdown.is_empty());
        assert!(markdown.contains("TRAIN_NAN_LOSS"));
        assert!(markdown.contains("# TrustformeRS Training Error Code Reference"));
    }

    /// The free helper functions work against the global registry.
    #[test]
    fn test_global_registry_access() {
        assert!(get_error_info("TRAIN_NAN_LOSS").is_some());

        assert!(is_critical_error("CONFIG_MISSING_PARAM"));
        assert!(!is_critical_error("TRAIN_CONVERGENCE_FAILURE"));

        let actions = get_recovery_actions("RESOURCE_GPU_OOM");
        assert!(!actions.is_empty());
    }
}