1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::sync::OnceLock;
4
/// Static metadata describing a single error code.
///
/// Every string field is a `&'static str`; the entries built in this file
/// are all constructed from string literals.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorCodeInfo {
    /// Identifier of the error, e.g. `"TRAIN_NAN_LOSS"`. The registry uses this
    /// value as its lookup key, and the text before the first `_` is treated as
    /// the component name when documentation is generated.
    pub code: &'static str,
    /// Short human-readable title of the error.
    pub name: &'static str,
    /// One-sentence explanation of what the error means.
    pub description: &'static str,
    /// Severity label. The entries registered in this file use `"CRITICAL"`,
    /// `"HIGH"` and `"MEDIUM"`; the registry compares this field by exact string
    /// equality.
    pub severity: &'static str,
    /// Typical reasons the error occurs.
    pub causes: Vec<&'static str>,
    /// Suggested remediation steps; these are what the registry returns as
    /// recovery actions.
    pub solutions: Vec<&'static str>,
    /// Link to further documentation, if any.
    pub documentation_url: Option<&'static str>,
    /// Codes of related errors. These are plain strings and are not validated:
    /// some entries in this file name codes that are not themselves registered.
    pub related_codes: Vec<&'static str>,
}
24
/// Lookup table of known error codes and their metadata.
///
/// A registry created through `new()` (or `Default`) is pre-populated with the
/// built-in codes defined in this file.
pub struct ErrorCodeRegistry {
    /// Registered entries, keyed by their `ErrorCodeInfo::code` string.
    codes: HashMap<&'static str, ErrorCodeInfo>,
}
29
30impl Default for ErrorCodeRegistry {
31 fn default() -> Self {
32 Self::new()
33 }
34}
35
36impl ErrorCodeRegistry {
37 pub fn new() -> Self {
38 let mut registry = Self {
39 codes: HashMap::new(),
40 };
41 registry.register_all_codes();
42 registry
43 }
44
45 pub fn get_code_info(&self, code: &str) -> Option<&ErrorCodeInfo> {
46 self.codes.get(code)
47 }
48
49 pub fn list_codes_by_component(&self, component: &str) -> Vec<&ErrorCodeInfo> {
50 self.codes.values().filter(|info| info.code.starts_with(component)).collect()
51 }
52
53 fn register_all_codes(&mut self) {
54 self.register_code(ErrorCodeInfo {
56 code: "CONFIG_INVALID_PARAM",
57 name: "Invalid Parameter",
58 description: "A configuration parameter has an invalid value",
59 severity: "HIGH",
60 causes: vec![
61 "Parameter value outside valid range",
62 "Incorrect parameter type",
63 "Missing required parameter",
64 ],
65 solutions: vec![
66 "Check parameter documentation",
67 "Validate parameter ranges",
68 "Use configuration schema validation",
69 ],
70 documentation_url: Some("https://docs.trustformers.rs/errors/config"),
71 related_codes: vec!["CONFIG_MISSING_PARAM", "CONFIG_TYPE_MISMATCH"],
72 });
73
74 self.register_code(ErrorCodeInfo {
75 code: "CONFIG_MISSING_PARAM",
76 name: "Missing Required Parameter",
77 description: "A required configuration parameter is missing",
78 severity: "CRITICAL",
79 causes: vec![
80 "Incomplete configuration file",
81 "Parameter not provided in CLI",
82 "Environment variable not set",
83 ],
84 solutions: vec![
85 "Add missing parameter to config",
86 "Check required parameters list",
87 "Use configuration template",
88 ],
89 documentation_url: Some("https://docs.trustformers.rs/errors/config"),
90 related_codes: vec!["CONFIG_INVALID_PARAM"],
91 });
92
93 self.register_code(ErrorCodeInfo {
94 code: "CONFIG_TYPE_MISMATCH",
95 name: "Parameter Type Mismatch",
96 description: "Configuration parameter has wrong type",
97 severity: "HIGH",
98 causes: vec![
99 "String provided where number expected",
100 "Invalid enum value",
101 "Incorrect data structure",
102 ],
103 solutions: vec![
104 "Check parameter type requirements",
105 "Use correct data type",
106 "Validate against schema",
107 ],
108 documentation_url: Some("https://docs.trustformers.rs/errors/config"),
109 related_codes: vec!["CONFIG_INVALID_PARAM"],
110 });
111
112 self.register_code(ErrorCodeInfo {
114 code: "DATA_FILE_NOT_FOUND",
115 name: "Dataset File Not Found",
116 description: "Specified dataset file does not exist",
117 severity: "CRITICAL",
118 causes: vec![
119 "Incorrect file path",
120 "File moved or deleted",
121 "Permission issues",
122 ],
123 solutions: vec![
124 "Check file path spelling",
125 "Verify file exists",
126 "Check file permissions",
127 ],
128 documentation_url: Some("https://docs.trustformers.rs/errors/data"),
129 related_codes: vec!["DATA_PERMISSION_DENIED", "DATA_CORRUPT"],
130 });
131
132 self.register_code(ErrorCodeInfo {
133 code: "DATA_CORRUPT",
134 name: "Corrupted Dataset",
135 description: "Dataset file is corrupted or malformed",
136 severity: "HIGH",
137 causes: vec![
138 "Incomplete download",
139 "File corruption",
140 "Unsupported format",
141 ],
142 solutions: vec![
143 "Re-download dataset",
144 "Verify file integrity",
145 "Check format compatibility",
146 ],
147 documentation_url: Some("https://docs.trustformers.rs/errors/data"),
148 related_codes: vec!["DATA_FORMAT_UNSUPPORTED"],
149 });
150
151 self.register_code(ErrorCodeInfo {
152 code: "DATA_PERMISSION_DENIED",
153 name: "Data Access Permission Denied",
154 description: "Insufficient permissions to access dataset",
155 severity: "CRITICAL",
156 causes: vec![
157 "File permission restrictions",
158 "Directory access denied",
159 "Network access blocked",
160 ],
161 solutions: vec![
162 "Check file permissions",
163 "Run with appropriate user",
164 "Adjust filesystem permissions",
165 ],
166 documentation_url: Some("https://docs.trustformers.rs/errors/data"),
167 related_codes: vec!["DATA_FILE_NOT_FOUND"],
168 });
169
170 self.register_code(ErrorCodeInfo {
172 code: "TRAIN_NAN_LOSS",
173 name: "NaN Loss Detected",
174 description: "Training loss has become NaN (Not a Number)",
175 severity: "CRITICAL",
176 causes: vec![
177 "Learning rate too high",
178 "Gradient explosion",
179 "Numerical instability",
180 "Division by zero",
181 ],
182 solutions: vec![
183 "Reduce learning rate",
184 "Enable gradient clipping",
185 "Check input data normalization",
186 "Use mixed precision training",
187 ],
188 documentation_url: Some("https://docs.trustformers.rs/errors/training"),
189 related_codes: vec!["TRAIN_INF_LOSS", "TRAIN_GRADIENT_EXPLOSION"],
190 });
191
192 self.register_code(ErrorCodeInfo {
193 code: "TRAIN_INF_LOSS",
194 name: "Infinite Loss Detected",
195 description: "Training loss has become infinite",
196 severity: "CRITICAL",
197 causes: vec![
198 "Learning rate too high",
199 "Unstable model architecture",
200 "Overflow in calculations",
201 ],
202 solutions: vec![
203 "Reduce learning rate",
204 "Use gradient clipping",
205 "Check model stability",
206 "Enable loss scaling",
207 ],
208 documentation_url: Some("https://docs.trustformers.rs/errors/training"),
209 related_codes: vec!["TRAIN_NAN_LOSS", "TRAIN_GRADIENT_EXPLOSION"],
210 });
211
212 self.register_code(ErrorCodeInfo {
213 code: "TRAIN_GRADIENT_EXPLOSION",
214 name: "Gradient Explosion",
215 description: "Gradients have grown too large",
216 severity: "HIGH",
217 causes: vec![
218 "Learning rate too high",
219 "Poor weight initialization",
220 "Unstable optimization",
221 ],
222 solutions: vec![
223 "Enable gradient clipping",
224 "Reduce learning rate",
225 "Improve weight initialization",
226 "Use batch normalization",
227 ],
228 documentation_url: Some("https://docs.trustformers.rs/errors/training"),
229 related_codes: vec!["TRAIN_NAN_LOSS", "TRAIN_CONVERGENCE_FAILURE"],
230 });
231
232 self.register_code(ErrorCodeInfo {
233 code: "TRAIN_CONVERGENCE_FAILURE",
234 name: "Training Not Converging",
235 description: "Model is not learning or converging",
236 severity: "MEDIUM",
237 causes: vec![
238 "Learning rate too low",
239 "Poor data quality",
240 "Model capacity mismatch",
241 "Optimization algorithm unsuitable",
242 ],
243 solutions: vec![
244 "Increase learning rate",
245 "Check data quality",
246 "Adjust model size",
247 "Try different optimizer",
248 ],
249 documentation_url: Some("https://docs.trustformers.rs/errors/training"),
250 related_codes: vec!["TRAIN_OVERFITTING", "TRAIN_UNDERFITTING"],
251 });
252
253 self.register_code(ErrorCodeInfo {
255 code: "RESOURCE_OOM",
256 name: "Out of Memory",
257 description: "System has run out of available memory",
258 severity: "CRITICAL",
259 causes: vec![
260 "Batch size too large",
261 "Model too large for available memory",
262 "Memory leak",
263 "Insufficient system memory",
264 ],
265 solutions: vec![
266 "Reduce batch size",
267 "Enable gradient checkpointing",
268 "Use model parallelism",
269 "Upgrade system memory",
270 ],
271 documentation_url: Some("https://docs.trustformers.rs/errors/resource"),
272 related_codes: vec!["RESOURCE_GPU_OOM", "RESOURCE_CPU_LIMIT"],
273 });
274
275 self.register_code(ErrorCodeInfo {
276 code: "RESOURCE_GPU_OOM",
277 name: "GPU Out of Memory",
278 description: "GPU has run out of available memory",
279 severity: "CRITICAL",
280 causes: vec![
281 "Batch size too large for GPU",
282 "Model parameters exceed GPU memory",
283 "Multiple processes using GPU",
284 ],
285 solutions: vec![
286 "Reduce batch size",
287 "Use gradient accumulation",
288 "Enable GPU memory optimization",
289 "Use model sharding",
290 ],
291 documentation_url: Some("https://docs.trustformers.rs/errors/resource"),
292 related_codes: vec!["RESOURCE_OOM", "RESOURCE_GPU_UNAVAILABLE"],
293 });
294
295 self.register_code(ErrorCodeInfo {
296 code: "RESOURCE_GPU_UNAVAILABLE",
297 name: "GPU Not Available",
298 description: "Required GPU resources are not available",
299 severity: "HIGH",
300 causes: vec![
301 "GPU not detected",
302 "Driver issues",
303 "GPU already in use",
304 "CUDA/ROCm not installed",
305 ],
306 solutions: vec![
307 "Check GPU installation",
308 "Update GPU drivers",
309 "Install CUDA/ROCm",
310 "Use CPU fallback",
311 ],
312 documentation_url: Some("https://docs.trustformers.rs/errors/resource"),
313 related_codes: vec!["RESOURCE_CUDA_ERROR"],
314 });
315
316 self.register_code(ErrorCodeInfo {
318 code: "NETWORK_CONNECTION_TIMEOUT",
319 name: "Network Connection Timeout",
320 description: "Network connection timed out",
321 severity: "MEDIUM",
322 causes: vec![
323 "Slow network connection",
324 "Server overloaded",
325 "Network configuration issues",
326 ],
327 solutions: vec![
328 "Increase timeout value",
329 "Check network connectivity",
330 "Try different server",
331 "Use offline mode",
332 ],
333 documentation_url: Some("https://docs.trustformers.rs/errors/network"),
334 related_codes: vec!["NETWORK_CONNECTION_REFUSED"],
335 });
336
337 self.register_code(ErrorCodeInfo {
339 code: "HARDWARE_THERMAL_THROTTLING",
340 name: "Thermal Throttling Detected",
341 description: "Hardware is throttling due to temperature",
342 severity: "MEDIUM",
343 causes: vec![
344 "High ambient temperature",
345 "Inadequate cooling",
346 "Intensive computational load",
347 ],
348 solutions: vec![
349 "Improve cooling",
350 "Reduce computational load",
351 "Lower power limits",
352 "Schedule training during cooler periods",
353 ],
354 documentation_url: Some("https://docs.trustformers.rs/errors/hardware"),
355 related_codes: vec!["HARDWARE_POWER_LIMIT"],
356 });
357
358 self.register_code(ErrorCodeInfo {
360 code: "MODEL_INCOMPATIBLE_WEIGHTS",
361 name: "Incompatible Model Weights",
362 description: "Model weights are not compatible with architecture",
363 severity: "HIGH",
364 causes: vec![
365 "Architecture mismatch",
366 "Version incompatibility",
367 "Corrupted checkpoint",
368 ],
369 solutions: vec![
370 "Check model architecture",
371 "Verify checkpoint version",
372 "Re-download checkpoint",
373 "Use compatible model version",
374 ],
375 documentation_url: Some("https://docs.trustformers.rs/errors/model"),
376 related_codes: vec!["MODEL_CHECKPOINT_CORRUPT"],
377 });
378
379 self.register_code(ErrorCodeInfo {
381 code: "QUANT_CALIBRATION_FAILED",
382 name: "Quantization Calibration Failed",
383 description: "Failed to calibrate quantization parameters",
384 severity: "HIGH",
385 causes: vec![
386 "Insufficient calibration data",
387 "Extreme activation ranges",
388 "Unsupported layer type",
389 ],
390 solutions: vec![
391 "Increase calibration dataset size",
392 "Check activation ranges",
393 "Use different quantization method",
394 "Pre-process calibration data",
395 ],
396 documentation_url: Some("https://docs.trustformers.rs/errors/quantization"),
397 related_codes: vec!["QUANT_UNSUPPORTED_OP"],
398 });
399
400 self.register_code(ErrorCodeInfo {
402 code: "DIST_COMMUNICATION_FAILURE",
403 name: "Distributed Communication Failure",
404 description: "Communication between nodes failed",
405 severity: "HIGH",
406 causes: vec![
407 "Network partition",
408 "Node failure",
409 "Communication timeout",
410 "Process crash",
411 ],
412 solutions: vec![
413 "Check network connectivity",
414 "Restart failed nodes",
415 "Increase timeout values",
416 "Use fault-tolerant training",
417 ],
418 documentation_url: Some("https://docs.trustformers.rs/errors/distributed"),
419 related_codes: vec!["DIST_NODE_FAILURE", "DIST_SYNC_TIMEOUT"],
420 });
421
422 self.register_code(ErrorCodeInfo {
423 code: "DIST_RANK_MISMATCH",
424 name: "Process Rank Mismatch",
425 description: "Process rank configuration mismatch",
426 severity: "CRITICAL",
427 causes: vec![
428 "Incorrect world size",
429 "Duplicate rank assignment",
430 "Process group misconfiguration",
431 ],
432 solutions: vec![
433 "Check process configuration",
434 "Verify world size setting",
435 "Ensure unique rank assignment",
436 "Restart distributed training",
437 ],
438 documentation_url: Some("https://docs.trustformers.rs/errors/distributed"),
439 related_codes: vec!["DIST_COMMUNICATION_FAILURE"],
440 });
441 }
442
443 fn register_code(&mut self, info: ErrorCodeInfo) {
444 self.codes.insert(info.code, info);
445 }
446
447 pub fn generate_documentation(&self) -> String {
449 let mut doc = String::from("# TrustformeRS Training Error Code Reference\n\n");
450
451 doc.push_str("This document provides comprehensive information about all error codes used in TrustformeRS Training.\n\n");
452
453 let mut components: HashMap<String, Vec<&ErrorCodeInfo>> = HashMap::new();
455 for info in self.codes.values() {
456 let component = info.code.split('_').next().unwrap_or("UNKNOWN").to_string();
457 components.entry(component).or_default().push(info);
458 }
459
460 for (component, codes) in components {
461 doc.push_str(&format!("## {} Errors\n\n", component));
462
463 for code_info in codes {
464 doc.push_str(&format!("### {} - {}\n\n", code_info.code, code_info.name));
465 doc.push_str(&format!("**Severity**: {}\n\n", code_info.severity));
466 doc.push_str(&format!("**Description**: {}\n\n", code_info.description));
467
468 doc.push_str("**Common Causes**:\n");
469 for cause in &code_info.causes {
470 doc.push_str(&format!("- {}\n", cause));
471 }
472 doc.push('\n');
473
474 doc.push_str("**Solutions**:\n");
475 for solution in &code_info.solutions {
476 doc.push_str(&format!("- {}\n", solution));
477 }
478 doc.push('\n');
479
480 if !code_info.related_codes.is_empty() {
481 doc.push_str("**Related Error Codes**:\n");
482 for related in &code_info.related_codes {
483 doc.push_str(&format!("- {}\n", related));
484 }
485 doc.push('\n');
486 }
487
488 if let Some(url) = code_info.documentation_url {
489 doc.push_str(&format!("**Documentation**: [{}]({})\n\n", url, url));
490 }
491
492 doc.push_str("---\n\n");
493 }
494 }
495
496 doc
497 }
498
499 pub fn get_recovery_actions(&self, code: &str) -> Vec<String> {
501 if let Some(info) = self.get_code_info(code) {
502 info.solutions.iter().map(|s| s.to_string()).collect()
503 } else {
504 vec!["Unknown error code - check documentation".to_string()]
505 }
506 }
507
508 pub fn is_critical(&self, code: &str) -> bool {
510 if let Some(info) = self.get_code_info(code) {
511 info.severity == "CRITICAL"
512 } else {
513 false
514 }
515 }
516
517 pub fn get_codes_by_severity(&self, severity: &str) -> Vec<&str> {
519 self.codes
520 .values()
521 .filter(|info| info.severity == severity)
522 .map(|info| info.code)
523 .collect()
524 }
525}
526
/// Process-wide registry cell. It starts empty and is filled on first use by
/// `get_registry`, which initialises it with `ErrorCodeRegistry::new`.
static ERROR_CODE_REGISTRY: OnceLock<ErrorCodeRegistry> = OnceLock::new();
529
530fn get_registry() -> &'static ErrorCodeRegistry {
532 ERROR_CODE_REGISTRY.get_or_init(ErrorCodeRegistry::new)
533}
534
535pub fn get_error_info(code: &str) -> Option<&'static ErrorCodeInfo> {
537 get_registry().get_code_info(code)
538}
539
540pub fn is_critical_error(code: &str) -> bool {
542 get_registry().is_critical(code)
543}
544
545pub fn get_recovery_actions(code: &str) -> Vec<String> {
547 get_registry().get_recovery_actions(code)
548}
549
#[cfg(test)]
mod tests {
    use super::*;

    /// A known code resolves to a fully populated entry.
    #[test]
    fn test_error_code_registry() {
        let registry = ErrorCodeRegistry::new();

        let lookup = registry.get_code_info("TRAIN_NAN_LOSS");
        assert!(lookup.is_some());

        let entry = lookup.expect("operation failed in test");
        assert_eq!(entry.code, "TRAIN_NAN_LOSS");
        assert_eq!(entry.severity, "CRITICAL");
        assert!(!entry.causes.is_empty());
        assert!(!entry.solutions.is_empty());
    }

    /// `is_critical` is true only for entries whose severity is "CRITICAL".
    #[test]
    fn test_severity_classification() {
        let registry = ErrorCodeRegistry::new();

        for critical_code in ["TRAIN_NAN_LOSS", "CONFIG_MISSING_PARAM"] {
            assert!(registry.is_critical(critical_code));
        }
        assert!(!registry.is_critical("TRAIN_CONVERGENCE_FAILURE"));
    }

    /// Recovery actions for an out-of-memory error mention the batch size.
    #[test]
    fn test_recovery_actions() {
        let registry = ErrorCodeRegistry::new();

        let steps = registry.get_recovery_actions("RESOURCE_OOM");
        assert!(!steps.is_empty());
        assert!(steps.iter().any(|step| step.contains("batch size")));
    }

    /// Listing by component returns only codes carrying that prefix.
    #[test]
    fn test_component_grouping() {
        let registry = ErrorCodeRegistry::new();

        let config_entries = registry.list_codes_by_component("CONFIG");
        assert!(!config_entries.is_empty());

        for entry in config_entries {
            assert!(entry.code.starts_with("CONFIG"));
        }
    }

    /// The generated Markdown contains the title and a known code.
    #[test]
    fn test_documentation_generation() {
        let registry = ErrorCodeRegistry::new();

        let markdown = registry.generate_documentation();
        assert!(!markdown.is_empty());
        assert!(markdown.contains("TRAIN_NAN_LOSS"));
        assert!(markdown.contains("# TrustformeRS Training Error Code Reference"));
    }

    /// The free functions backed by the shared registry behave like the methods.
    #[test]
    fn test_global_registry_access() {
        let lookup = get_error_info("TRAIN_NAN_LOSS");
        assert!(lookup.is_some());

        assert!(is_critical_error("CONFIG_MISSING_PARAM"));
        assert!(!is_critical_error("TRAIN_CONVERGENCE_FAILURE"));

        let steps = get_recovery_actions("RESOURCE_GPU_OOM");
        assert!(!steps.is_empty());
    }
}