1pub use torsh_core::error::{Result as BackendResult, TorshError as BackendError};
8
/// Severity level attached to an [`ErrorContext`].
///
/// The variants are declared from least to most severe, so the derived
/// `PartialOrd`/`Ord` implementations give
/// `Info < Warning < Error < Critical < Fatal`. Code in this file relies on
/// that ordering (for example `severity <= ErrorSeverity::Error`), so new
/// variants must be inserted at the position matching their severity.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ErrorSeverity {
    /// Lowest level; rendered as `INFO`.
    Info,
    /// Rendered as `WARN`.
    Warning,
    /// Default level used by `ErrorContext::new`; rendered as `ERROR`.
    Error,
    /// Rendered as `CRITICAL`; never reported as recoverable.
    Critical,
    /// Highest level; rendered as `FATAL`; never reported as recoverable.
    Fatal,
}
23
/// Broad classification of an error, used for grouping statistics and for
/// looking up recovery strategies.
///
/// The type is hashable so it can serve as a `HashMap` key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ErrorCategory {
    /// Memory-related failures; rendered as `MEMORY`.
    Memory,
    /// Failures during computation; rendered as `COMPUTE`.
    Computation,
    /// Device or driver problems; rendered as `HARDWARE`.
    Hardware,
    /// Invalid or inconsistent configuration; rendered as `CONFIG`.
    Configuration,
    /// Communication failures; rendered as `COMM`.
    Communication,
    /// Input/output failures; rendered as `IO`.
    Io,
    /// Security-related failures; rendered as `SECURITY`.
    Security,
    /// A resource limit was reached; rendered as `RESOURCE`.
    ResourceExhaustion,
    /// An operation exceeded its time budget; rendered as `TIMEOUT`.
    Timeout,
    /// Parameter or input validation failed; rendered as `VALIDATION`.
    Validation,
    /// Default category used by `ErrorContext::new`; rendered as `INTERNAL`.
    Internal,
    /// Rendered as `EXTERNAL`.
    External,
}
52
53#[derive(Debug, Clone)]
55pub struct ErrorContext {
56 pub operation: String,
58 pub device: Option<String>,
60 pub backend: Option<String>,
62 pub details: Option<String>,
64 pub severity: ErrorSeverity,
66 pub category: ErrorCategory,
68 pub timestamp: std::time::SystemTime,
70 pub source_location: Option<String>,
72 pub thread_id: Option<String>,
74 pub cause: Option<Box<ErrorContext>>,
76 pub recovery_suggestions: Vec<String>,
78 pub error_code: Option<String>,
80}
81
82impl ErrorContext {
83 pub fn new(operation: impl Into<String>) -> Self {
85 Self {
86 operation: operation.into(),
87 device: None,
88 backend: None,
89 details: None,
90 severity: ErrorSeverity::Error,
91 category: ErrorCategory::Internal,
92 timestamp: std::time::SystemTime::now(),
93 source_location: None,
94 thread_id: Some(format!("{:?}", std::thread::current().id())),
95 cause: None,
96 recovery_suggestions: Vec::new(),
97 error_code: None,
98 }
99 }
100
101 pub fn new_with_category(
103 operation: impl Into<String>,
104 category: ErrorCategory,
105 severity: ErrorSeverity,
106 ) -> Self {
107 Self {
108 operation: operation.into(),
109 device: None,
110 backend: None,
111 details: None,
112 severity,
113 category,
114 timestamp: std::time::SystemTime::now(),
115 source_location: None,
116 thread_id: Some(format!("{:?}", std::thread::current().id())),
117 cause: None,
118 recovery_suggestions: Vec::new(),
119 error_code: None,
120 }
121 }
122
123 pub fn with_device(mut self, device: impl Into<String>) -> Self {
125 self.device = Some(device.into());
126 self
127 }
128
129 pub fn with_backend(mut self, backend: impl Into<String>) -> Self {
131 self.backend = Some(backend.into());
132 self
133 }
134
135 pub fn with_details(mut self, details: impl Into<String>) -> Self {
137 self.details = Some(details.into());
138 self
139 }
140
141 pub fn with_severity(mut self, severity: ErrorSeverity) -> Self {
143 self.severity = severity;
144 self
145 }
146
147 pub fn with_category(mut self, category: ErrorCategory) -> Self {
149 self.category = category;
150 self
151 }
152
153 pub fn with_source_location(mut self, location: impl Into<String>) -> Self {
155 self.source_location = Some(location.into());
156 self
157 }
158
159 pub fn with_error_code(mut self, code: impl Into<String>) -> Self {
161 self.error_code = Some(code.into());
162 self
163 }
164
165 pub fn add_recovery_suggestion(mut self, suggestion: impl Into<String>) -> Self {
167 self.recovery_suggestions.push(suggestion.into());
168 self
169 }
170
171 pub fn with_recovery_suggestions(mut self, suggestions: Vec<String>) -> Self {
173 self.recovery_suggestions = suggestions;
174 self
175 }
176
177 pub fn with_cause(mut self, cause: ErrorContext) -> Self {
179 self.cause = Some(Box::new(cause));
180 self
181 }
182
183 pub fn format(&self) -> String {
185 let mut parts = vec![
186 format!("[{}]", self.severity_string()),
187 format!("[{}]", self.category_string()),
188 self.operation.clone(),
189 ];
190
191 if let Some(ref backend) = self.backend {
192 parts.push(format!("backend: {}", backend));
193 }
194
195 if let Some(ref device) = self.device {
196 parts.push(format!("device: {}", device));
197 }
198
199 if let Some(ref error_code) = self.error_code {
200 parts.push(format!("code: {}", error_code));
201 }
202
203 if let Some(ref details) = self.details {
204 parts.push(format!("details: {}", details));
205 }
206
207 if let Some(ref location) = self.source_location {
208 parts.push(format!("at: {}", location));
209 }
210
211 let mut result = parts.join(", ");
212
213 if !self.recovery_suggestions.is_empty() {
215 result.push_str(&format!(
216 "\nSuggested recovery actions: {}",
217 self.recovery_suggestions.join("; ")
218 ));
219 }
220
221 if let Some(ref cause) = self.cause {
223 result.push_str(&format!("\nCaused by: {}", cause.format()));
224 }
225
226 result
227 }
228
229 pub fn severity_string(&self) -> &'static str {
231 match self.severity {
232 ErrorSeverity::Info => "INFO",
233 ErrorSeverity::Warning => "WARN",
234 ErrorSeverity::Error => "ERROR",
235 ErrorSeverity::Critical => "CRITICAL",
236 ErrorSeverity::Fatal => "FATAL",
237 }
238 }
239
240 pub fn category_string(&self) -> &'static str {
242 match self.category {
243 ErrorCategory::Memory => "MEMORY",
244 ErrorCategory::Computation => "COMPUTE",
245 ErrorCategory::Hardware => "HARDWARE",
246 ErrorCategory::Configuration => "CONFIG",
247 ErrorCategory::Communication => "COMM",
248 ErrorCategory::Io => "IO",
249 ErrorCategory::Security => "SECURITY",
250 ErrorCategory::ResourceExhaustion => "RESOURCE",
251 ErrorCategory::Timeout => "TIMEOUT",
252 ErrorCategory::Validation => "VALIDATION",
253 ErrorCategory::Internal => "INTERNAL",
254 ErrorCategory::External => "EXTERNAL",
255 }
256 }
257
258 pub fn is_recoverable(&self) -> bool {
260 matches!(
261 self.severity,
262 ErrorSeverity::Info | ErrorSeverity::Warning | ErrorSeverity::Error
263 ) && !self.recovery_suggestions.is_empty()
264 }
265
266 pub fn chain_depth(&self) -> usize {
268 match &self.cause {
269 Some(cause) => 1 + cause.chain_depth(),
270 None => 0,
271 }
272 }
273}
274
275pub trait ErrorContextExt<T> {
277 fn with_context(self, context: ErrorContext) -> BackendResult<T>;
279
280 fn with_operation(self, operation: &str) -> BackendResult<T>;
282}
283
284impl<T, E> ErrorContextExt<T> for Result<T, E>
285where
286 E: Into<BackendError>,
287{
288 fn with_context(self, context: ErrorContext) -> BackendResult<T> {
289 self.map_err(|e| {
290 let base_error = e.into();
291 match base_error {
292 BackendError::BackendError(msg) => {
293 BackendError::BackendError(format!("{}: {}", context.format(), msg))
294 }
295 BackendError::ComputeError(msg) => {
296 BackendError::ComputeError(format!("{}: {}", context.format(), msg))
297 }
298 BackendError::AllocationError(msg) => {
299 BackendError::AllocationError(format!("{}: {}", context.format(), msg))
300 }
301 other => other, }
303 })
304 }
305
306 fn with_operation(self, operation: &str) -> BackendResult<T> {
307 self.with_context(ErrorContext::new(operation))
308 }
309}
310
/// Builds a `BackendError::BackendError` whose message is the rendered
/// [`ErrorContext`](crate::error::ErrorContext) for `$operation`, followed by
/// `": "` and `$message`.
///
/// Two forms are accepted:
///
/// * `backend_error!(operation, category, severity, message)`
/// * `backend_error!(operation, message)` — uses `ErrorCategory::Internal`
///   and `ErrorSeverity::Error`.
///
/// `$message` must have a `to_string()` method. The expansion refers to
/// items through `$crate::error`, so this module has to be reachable at that
/// path.
#[macro_export]
macro_rules! backend_error {
    ($operation:expr, $category:expr, $severity:expr, $message:expr) => {
        $crate::error::BackendError::BackendError(
            $crate::error::ErrorContext::new_with_category($operation, $category, $severity)
                .format()
                + ": "
                + &$message.to_string(),
        )
    };
    ($operation:expr, $message:expr) => {
        // The recursive call is `$crate`-qualified so that it resolves even
        // when the caller invoked the macro by path from another crate
        // without importing it.
        $crate::backend_error!(
            $operation,
            $crate::error::ErrorCategory::Internal,
            $crate::error::ErrorSeverity::Error,
            $message
        )
    };
}
331
/// Builds an error for a failed computation on a given backend and device.
///
/// Usage: `compute_error!(operation, backend, device, message)`.
///
/// The context uses `ErrorCategory::Computation` and `ErrorSeverity::Error`;
/// the resulting message is the rendered context, `": "`, then `$message`
/// (which must have a `to_string()` method). Note that the value produced is
/// the `BackendError::BackendError` variant.
#[macro_export]
macro_rules! compute_error {
    ($operation:expr, $backend:expr, $device:expr, $message:expr) => {{
        let context = $crate::error::ErrorContext::new_with_category(
            $operation,
            $crate::error::ErrorCategory::Computation,
            $crate::error::ErrorSeverity::Error,
        )
        .with_backend($backend)
        .with_device($device);

        let mut text = context.format();
        text.push_str(": ");
        text.push_str(&$message.to_string());
        $crate::error::BackendError::BackendError(text)
    }};
}
349
/// Builds a memory-allocation error by forwarding to
/// `conversion::memory_error_with_context`.
///
/// Two forms are accepted:
///
/// * `memory_error!(size, backend, message)` — no device information.
/// * `memory_error!(size, backend, device, message)` — `device` is passed on
///   wrapped in `Some`.
#[macro_export]
macro_rules! memory_error {
    ($size:expr, $backend:expr, $message:expr) => {
        $crate::error::conversion::memory_error_with_context(
            $message,
            $size,
            $backend,
            None,
        )
    };
    ($size:expr, $backend:expr, $device:expr, $message:expr) => {
        $crate::error::conversion::memory_error_with_context(
            $message,
            $size,
            $backend,
            Some($device),
        )
    };
}
364
365pub struct ErrorReporter {
367 backend_name: String,
368 enable_logging: bool,
369 error_callback: Option<Box<dyn Fn(&ErrorContext) + Send + Sync>>,
370}
371
372impl ErrorReporter {
373 pub fn new(backend_name: String) -> Self {
374 Self {
375 backend_name,
376 enable_logging: true,
377 error_callback: None,
378 }
379 }
380
381 pub fn with_logging(mut self, enable: bool) -> Self {
382 self.enable_logging = enable;
383 self
384 }
385
386 pub fn with_callback<F>(mut self, callback: F) -> Self
387 where
388 F: Fn(&ErrorContext) + Send + Sync + 'static,
389 {
390 self.error_callback = Some(Box::new(callback));
391 self
392 }
393
394 pub fn report_error(&self, error: &BackendError) {
395 if self.enable_logging {
396 match error {
397 BackendError::BackendError(msg) => {
398 eprintln!("[{}] Backend Error: {}", self.backend_name, msg)
399 }
400 BackendError::ComputeError(msg) => {
401 eprintln!("[{}] Compute Error: {}", self.backend_name, msg)
402 }
403 BackendError::AllocationError(msg) => {
404 eprintln!("[{}] Memory Error: {}", self.backend_name, msg)
405 }
406 _ => eprintln!("[{}] Error: {:?}", self.backend_name, error),
407 }
408 }
409 }
410
411 pub fn report_context(&self, context: &ErrorContext) {
412 if self.enable_logging {
413 eprintln!("[{}] {}", self.backend_name, context.format());
414 }
415
416 if let Some(ref callback) = self.error_callback {
417 callback(context);
418 }
419 }
420}
421
422pub struct ErrorRecoverySystem {
424 recovery_strategies: std::collections::HashMap<ErrorCategory, Vec<RecoveryStrategy>>,
425 max_retry_attempts: u32,
426}
427
/// Description of one way to recover from a category of errors.
///
/// This is plain data; the code in this file only stores and looks up
/// strategies.
#[derive(Debug, Clone)]
pub struct RecoveryStrategy {
    /// Short identifier, e.g. `"exponential_backoff"`.
    pub name: String,
    /// Human-readable explanation of the strategy.
    pub description: String,
    /// Whether the strategy may be retried automatically.
    pub auto_retry: bool,
    /// Maximum number of attempts for this strategy.
    pub max_attempts: u32,
    /// Back-off in milliseconds.
    // NOTE(review): whether this is a fixed delay or a base for growth is not
    // established by the visible code.
    pub backoff_ms: u64,
}
436
437impl ErrorRecoverySystem {
438 pub fn new() -> Self {
439 let mut system = Self {
440 recovery_strategies: std::collections::HashMap::new(),
441 max_retry_attempts: 3,
442 };
443
444 system.add_default_strategies();
446 system
447 }
448
449 fn add_default_strategies(&mut self) {
450 self.add_strategy(
452 ErrorCategory::Memory,
453 RecoveryStrategy {
454 name: "garbage_collection".to_string(),
455 description: "Force garbage collection and retry".to_string(),
456 auto_retry: true,
457 max_attempts: 2,
458 backoff_ms: 100,
459 },
460 );
461
462 self.add_strategy(
464 ErrorCategory::Timeout,
465 RecoveryStrategy {
466 name: "exponential_backoff".to_string(),
467 description: "Retry with exponential backoff".to_string(),
468 auto_retry: true,
469 max_attempts: 3,
470 backoff_ms: 1000,
471 },
472 );
473
474 self.add_strategy(
476 ErrorCategory::Hardware,
477 RecoveryStrategy {
478 name: "device_reset".to_string(),
479 description: "Reset device context and retry".to_string(),
480 auto_retry: false,
481 max_attempts: 1,
482 backoff_ms: 5000,
483 },
484 );
485 }
486
487 pub fn add_strategy(&mut self, category: ErrorCategory, strategy: RecoveryStrategy) {
488 self.recovery_strategies
489 .entry(category)
490 .or_insert_with(Vec::new)
491 .push(strategy);
492 }
493
494 pub fn get_recovery_strategies(
495 &self,
496 category: &ErrorCategory,
497 ) -> Option<&Vec<RecoveryStrategy>> {
498 self.recovery_strategies.get(category)
499 }
500
501 pub fn should_auto_retry(&self, context: &ErrorContext) -> bool {
502 if let Some(strategies) = self.get_recovery_strategies(&context.category) {
503 strategies
504 .iter()
505 .any(|s| s.auto_retry && context.severity <= ErrorSeverity::Error)
506 } else {
507 false
508 }
509 }
510}
511
512impl Default for ErrorRecoverySystem {
513 fn default() -> Self {
514 Self::new()
515 }
516}
517
518#[derive(Debug, Clone, Default)]
520pub struct ErrorStatistics {
521 pub total_errors: u64,
522 pub errors_by_category: std::collections::HashMap<ErrorCategory, u64>,
523 pub errors_by_severity: std::collections::HashMap<ErrorSeverity, u64>,
524 pub errors_by_backend: std::collections::HashMap<String, u64>,
525 pub recovery_attempts: u64,
526 pub successful_recoveries: u64,
527}
528
529impl ErrorStatistics {
530 pub fn record_error(&mut self, context: &ErrorContext) {
531 self.total_errors += 1;
532 *self
533 .errors_by_category
534 .entry(context.category.clone())
535 .or_insert(0) += 1;
536 *self.errors_by_severity.entry(context.severity).or_insert(0) += 1;
537
538 if let Some(ref backend) = context.backend {
539 *self.errors_by_backend.entry(backend.clone()).or_insert(0) += 1;
540 }
541 }
542
543 pub fn record_recovery_attempt(&mut self) {
544 self.recovery_attempts += 1;
545 }
546
547 pub fn record_successful_recovery(&mut self) {
548 self.successful_recoveries += 1;
549 }
550
551 pub fn get_recovery_success_rate(&self) -> f64 {
552 if self.recovery_attempts > 0 {
553 self.successful_recoveries as f64 / self.recovery_attempts as f64
554 } else {
555 0.0
556 }
557 }
558
559 pub fn get_most_frequent_error_category(&self) -> Option<(&ErrorCategory, &u64)> {
560 self.errors_by_category
561 .iter()
562 .max_by_key(|(_, count)| *count)
563 }
564
565 pub fn reset(&mut self) {
566 *self = Self::default();
567 }
568}
569
/// Converts a `cust` CUDA error into a [`BackendError`] whose message is
/// `"CUDA error: "` followed by the error's `Display` output.
///
/// Only compiled when the `cuda_available` cfg flag is set.
#[cfg(cuda_available)]
pub fn cust_error_to_backend(error: cust::error::CudaError) -> BackendError {
    // Use the `BackendError` variant, which is the variant every other
    // constructor in this file builds; the previous `Backend` name is not
    // used anywhere else here.
    BackendError::BackendError(format!("CUDA error: {}", error))
}
577
578pub mod conversion {
580 use super::*;
581
582 pub fn cuda_error_with_context(
584 error: impl std::fmt::Display,
585 operation: &str,
586 device_id: Option<usize>,
587 ) -> BackendError {
588 let mut context = ErrorContext::new_with_category(
589 operation,
590 ErrorCategory::Hardware,
591 ErrorSeverity::Error,
592 )
593 .with_backend("CUDA")
594 .with_error_code("CUDA_ERROR")
595 .add_recovery_suggestion("Check CUDA installation and driver version")
596 .add_recovery_suggestion("Verify GPU memory availability")
597 .add_recovery_suggestion("Try reducing batch size or model size");
598
599 if let Some(device_id) = device_id {
600 context = context.with_device(format!("cuda:{}", device_id));
601 }
602
603 BackendError::BackendError(format!("{}: {}", context.format(), error))
604 }
605
606 pub fn cpu_error_with_context(error: impl std::fmt::Display, operation: &str) -> BackendError {
608 let context = ErrorContext::new_with_category(
609 operation,
610 ErrorCategory::Computation,
611 ErrorSeverity::Error,
612 )
613 .with_backend("CPU")
614 .with_error_code("CPU_ERROR")
615 .add_recovery_suggestion("Check system memory availability")
616 .add_recovery_suggestion("Reduce number of parallel threads")
617 .add_recovery_suggestion("Try smaller input sizes");
618
619 BackendError::BackendError(format!("{}: {}", context.format(), error))
620 }
621
622 pub fn metal_error_with_context(
624 error: impl std::fmt::Display,
625 operation: &str,
626 device_id: Option<usize>,
627 ) -> BackendError {
628 let mut context = ErrorContext::new_with_category(
629 operation,
630 ErrorCategory::Hardware,
631 ErrorSeverity::Error,
632 )
633 .with_backend("Metal")
634 .with_error_code("METAL_ERROR")
635 .add_recovery_suggestion("Check macOS version compatibility")
636 .add_recovery_suggestion("Verify Metal Performance Shaders framework")
637 .add_recovery_suggestion("Try reducing GPU memory usage");
638
639 if let Some(device_id) = device_id {
640 context = context.with_device(format!("metal:{}", device_id));
641 }
642
643 BackendError::BackendError(format!("{}: {}", context.format(), error))
644 }
645
646 pub fn webgpu_error_with_context(
648 error: impl std::fmt::Display,
649 operation: &str,
650 adapter_name: Option<&str>,
651 ) -> BackendError {
652 let mut context = ErrorContext::new_with_category(
653 operation,
654 ErrorCategory::Hardware,
655 ErrorSeverity::Error,
656 )
657 .with_backend("WebGPU")
658 .with_error_code("WEBGPU_ERROR")
659 .add_recovery_suggestion("Check WebGPU browser support")
660 .add_recovery_suggestion("Verify GPU driver compatibility")
661 .add_recovery_suggestion("Try different adapter if available");
662
663 if let Some(adapter_name) = adapter_name {
664 context = context.with_device(adapter_name.to_string());
665 }
666
667 BackendError::BackendError(format!("{}: {}", context.format(), error))
668 }
669
670 pub fn memory_error_with_context(
672 error: impl std::fmt::Display,
673 size: usize,
674 backend: &str,
675 device: Option<&str>,
676 ) -> BackendError {
677 let size_mb = size as f64 / (1024.0 * 1024.0);
678 let mut context = ErrorContext::new_with_category(
679 "memory_allocation",
680 ErrorCategory::Memory,
681 if size > 1024 * 1024 * 1024 {
682 ErrorSeverity::Critical
683 } else {
684 ErrorSeverity::Error
685 },
686 )
687 .with_backend(backend)
688 .with_details(format!("size: {} bytes ({:.2} MB)", size, size_mb))
689 .with_error_code("MEMORY_ALLOCATION_FAILED")
690 .add_recovery_suggestion("Free unused memory")
691 .add_recovery_suggestion("Reduce batch size or model parameters")
692 .add_recovery_suggestion("Enable memory optimization features");
693
694 if size > 512 * 1024 * 1024 {
695 context = context
697 .add_recovery_suggestion("Consider using memory-mapped files for large datasets");
698 }
699
700 if let Some(device) = device {
701 context = context.with_device(device.to_string());
702 }
703
704 BackendError::AllocationError(format!("{}: {}", context.format(), error))
705 }
706
707 pub fn kernel_error_with_context(
709 error: impl std::fmt::Display,
710 kernel_name: &str,
711 backend: &str,
712 device: Option<&str>,
713 ) -> BackendError {
714 let mut context = ErrorContext::new_with_category(
715 format!("kernel_execution:{}", kernel_name),
716 ErrorCategory::Computation,
717 ErrorSeverity::Error,
718 )
719 .with_backend(backend)
720 .with_error_code("KERNEL_EXECUTION_FAILED")
721 .add_recovery_suggestion("Check kernel parameters and input dimensions")
722 .add_recovery_suggestion("Verify workgroup/block sizes are valid")
723 .add_recovery_suggestion("Enable debug mode for detailed error information");
724
725 if let Some(device) = device {
726 context = context.with_device(device.to_string());
727 }
728
729 BackendError::BackendError(format!("{}: {}", context.format(), error))
730 }
731
732 pub fn timeout_error_with_context(
734 operation: &str,
735 timeout_seconds: f64,
736 backend: &str,
737 device: Option<&str>,
738 ) -> BackendError {
739 let mut context = ErrorContext::new_with_category(
740 operation,
741 ErrorCategory::Timeout,
742 ErrorSeverity::Warning,
743 )
744 .with_backend(backend)
745 .with_details(format!("timeout: {:.2}s", timeout_seconds))
746 .with_error_code("OPERATION_TIMEOUT")
747 .add_recovery_suggestion("Increase timeout value")
748 .add_recovery_suggestion("Optimize operation parameters")
749 .add_recovery_suggestion("Split operation into smaller chunks");
750
751 if let Some(device) = device {
752 context = context.with_device(device.to_string());
753 }
754
755 BackendError::BackendError(format!("{}: Operation timed out", context.format()))
756 }
757
758 pub fn validation_error_with_context(
760 parameter_name: &str,
761 expected: &str,
762 actual: &str,
763 operation: &str,
764 ) -> BackendError {
765 let context = ErrorContext::new_with_category(
766 operation,
767 ErrorCategory::Validation,
768 ErrorSeverity::Error,
769 )
770 .with_details(format!(
771 "parameter: {}, expected: {}, actual: {}",
772 parameter_name, expected, actual
773 ))
774 .with_error_code("PARAMETER_VALIDATION_FAILED")
775 .add_recovery_suggestion("Check parameter documentation")
776 .add_recovery_suggestion("Verify input data types and shapes")
777 .add_recovery_suggestion("Use parameter validation utilities");
778
779 BackendError::BackendError(format!(
780 "{}: Invalid parameter {}",
781 context.format(),
782 parameter_name
783 ))
784 }
785
786 pub fn resource_exhaustion_error_with_context(
788 resource_type: &str,
789 current_usage: f64,
790 max_limit: f64,
791 backend: &str,
792 ) -> BackendError {
793 let usage_percent = (current_usage / max_limit) * 100.0;
794 let context = ErrorContext::new_with_category(
795 "resource_check",
796 ErrorCategory::ResourceExhaustion,
797 ErrorSeverity::Critical,
798 )
799 .with_backend(backend)
800 .with_details(format!(
801 "resource: {}, usage: {:.1}% ({:.2}/{:.2})",
802 resource_type, usage_percent, current_usage, max_limit
803 ))
804 .with_error_code("RESOURCE_EXHAUSTED")
805 .add_recovery_suggestion("Free unused resources")
806 .add_recovery_suggestion("Reduce concurrent operations")
807 .add_recovery_suggestion("Increase resource limits if possible");
808
809 BackendError::BackendError(format!("{}: Resource exhausted", context.format()))
810 }
811
812 pub fn config_error_with_context(
814 parameter: &str,
815 value: &str,
816 operation: &str,
817 backend: &str,
818 ) -> BackendError {
819 let context = ErrorContext::new_with_category(
820 operation,
821 ErrorCategory::Configuration,
822 ErrorSeverity::Error,
823 )
824 .with_backend(backend)
825 .with_details(format!("parameter: {}, value: {}", parameter, value))
826 .with_error_code("CONFIG_ERROR")
827 .add_recovery_suggestion("Check configuration documentation")
828 .add_recovery_suggestion("Verify parameter values and types")
829 .add_recovery_suggestion("Use default configuration if available");
830
831 BackendError::BackendError(format!("{}: Invalid configuration", context.format()))
832 }
833
834 pub fn io_error_with_context(
836 error: impl std::fmt::Display,
837 file_path: Option<&str>,
838 operation: &str,
839 ) -> BackendError {
840 let mut context =
841 ErrorContext::new_with_category(operation, ErrorCategory::Io, ErrorSeverity::Error)
842 .with_error_code("IO_ERROR")
843 .add_recovery_suggestion("Check file permissions")
844 .add_recovery_suggestion("Verify disk space availability")
845 .add_recovery_suggestion("Ensure file path is accessible");
846
847 if let Some(path) = file_path {
848 context = context.with_details(format!("file: {}", path));
849 }
850
851 BackendError::BackendError(format!("{}: {}", context.format(), error))
852 }
853
854 pub fn security_error_with_context(
856 error: impl std::fmt::Display,
857 operation: &str,
858 backend: &str,
859 ) -> BackendError {
860 let context = ErrorContext::new_with_category(
861 operation,
862 ErrorCategory::Security,
863 ErrorSeverity::Critical,
864 )
865 .with_backend(backend)
866 .with_error_code("SECURITY_ERROR")
867 .add_recovery_suggestion("Check security policies and permissions")
868 .add_recovery_suggestion("Verify authentication credentials")
869 .add_recovery_suggestion("Review access control settings");
870
871 BackendError::BackendError(format!("{}: {}", context.format(), error))
872 }
873}