// ringkernel_core/error.rs

//! Error types for RingKernel operations.
2
use thiserror::Error;
4
5/// Result type alias for RingKernel operations.
6pub type Result<T> = std::result::Result<T, RingKernelError>;
7
8/// Comprehensive error type for RingKernel operations.
9#[derive(Error, Debug)]
10pub enum RingKernelError {
11    // ===== Kernel Lifecycle Errors =====
12    /// Kernel not found with the given ID.
13    #[error("kernel not found: {0}")]
14    KernelNotFound(String),
15
16    /// Kernel is already active.
17    #[error("kernel already active: {0}")]
18    KernelAlreadyActive(String),
19
20    /// Kernel is not active.
21    #[error("kernel not active: {0}")]
22    KernelNotActive(String),
23
24    /// Kernel has already terminated.
25    #[error("kernel already terminated: {0}")]
26    KernelTerminated(String),
27
28    /// Invalid kernel state transition.
29    #[error("invalid state transition from {from:?} to {to:?}")]
30    InvalidStateTransition {
31        /// Current state
32        from: String,
33        /// Attempted target state
34        to: String,
35    },
36
37    /// Invalid kernel state.
38    #[error("invalid state: expected {expected}, got {actual}")]
39    InvalidState {
40        /// Expected state
41        expected: String,
42        /// Actual state
43        actual: String,
44    },
45
46    /// Kernel launch failed.
47    #[error("kernel launch failed: {0}")]
48    LaunchFailed(String),
49
50    /// Kernel compilation failed (NVRTC, shader compilation, etc.).
51    #[error("kernel compilation failed: {0}")]
52    CompilationError(String),
53
54    // ===== Message Errors =====
55    /// Queue is full, message cannot be enqueued.
56    #[error("queue full: capacity {capacity}, attempted to enqueue message")]
57    QueueFull {
58        /// Queue capacity
59        capacity: usize,
60    },
61
62    /// Queue is empty, no message to dequeue.
63    #[error("queue empty")]
64    QueueEmpty,
65
66    /// Message serialization failed.
67    #[error("serialization error: {0}")]
68    SerializationError(String),
69
70    /// Message deserialization failed.
71    #[error("deserialization error: {0}")]
72    DeserializationError(String),
73
74    /// Message validation failed.
75    #[error("message validation failed: {0}")]
76    ValidationError(String),
77
78    /// Message too large.
79    #[error("message too large: {size} bytes (max: {max} bytes)")]
80    MessageTooLarge {
81        /// Actual message size
82        size: usize,
83        /// Maximum allowed size
84        max: usize,
85    },
86
87    /// Message timeout.
88    #[error("message timeout after {0:?}")]
89    Timeout(std::time::Duration),
90
91    // ===== Memory Errors =====
92    /// GPU memory allocation failed.
93    #[error("GPU memory allocation failed: {size} bytes - {reason}")]
94    AllocationFailed {
95        /// Requested size
96        size: usize,
97        /// Failure reason
98        reason: String,
99    },
100
101    /// Host memory allocation failed.
102    #[error("host memory allocation failed: {size} bytes")]
103    HostAllocationFailed {
104        /// Requested size
105        size: usize,
106    },
107
108    /// Memory transfer failed.
109    #[error("memory transfer failed: {0}")]
110    TransferFailed(String),
111
112    /// Invalid memory alignment.
113    #[error("invalid alignment: expected {expected}, got {actual}")]
114    InvalidAlignment {
115        /// Expected alignment
116        expected: usize,
117        /// Actual alignment
118        actual: usize,
119    },
120
121    /// Out of GPU memory.
122    #[error("out of GPU memory: requested {requested} bytes, available {available} bytes")]
123    OutOfMemory {
124        /// Requested size
125        requested: usize,
126        /// Available memory
127        available: usize,
128    },
129
130    /// Memory pool exhausted.
131    #[error("memory pool exhausted")]
132    PoolExhausted,
133
134    /// Invalid index (out of bounds).
135    #[error("invalid index: {0}")]
136    InvalidIndex(usize),
137
138    /// Generic memory error.
139    #[error("memory error: {0}")]
140    MemoryError(String),
141
142    // ===== Backend Errors =====
143    /// Backend not available.
144    #[error("backend not available: {0}")]
145    BackendUnavailable(String),
146
147    /// Backend initialization failed.
148    #[error("backend initialization failed: {0}")]
149    BackendInitFailed(String),
150
151    /// No suitable GPU device found.
152    #[error("no GPU device found")]
153    NoDeviceFound,
154
155    /// Device selection failed.
156    #[error("device selection failed: {0}")]
157    DeviceSelectionFailed(String),
158
159    /// Backend operation failed.
160    #[error("backend error: {0}")]
161    BackendError(String),
162
163    // ===== Synchronization Errors =====
164    /// Deadlock detected.
165    #[error("deadlock detected")]
166    DeadlockDetected,
167
168    /// Lock poisoned.
169    #[error("lock poisoned")]
170    LockPoisoned,
171
172    /// Channel closed.
173    #[error("channel closed")]
174    ChannelClosed,
175
176    // ===== HLC Errors =====
177    /// Clock skew too large.
178    #[error("clock skew too large: {skew_ms}ms (max: {max_ms}ms)")]
179    ClockSkew {
180        /// Detected skew in milliseconds
181        skew_ms: u64,
182        /// Maximum allowed skew
183        max_ms: u64,
184    },
185
186    /// Invalid timestamp.
187    #[error("invalid timestamp")]
188    InvalidTimestamp,
189
190    // ===== K2K Messaging Errors =====
191    /// K2K messaging error.
192    #[error("K2K error: {0}")]
193    K2KError(String),
194
195    /// K2K destination not found.
196    #[error("K2K destination not found: {0}")]
197    K2KDestinationNotFound(String),
198
199    /// K2K delivery failed.
200    #[error("K2K delivery failed: {0}")]
201    K2KDeliveryFailed(String),
202
203    /// Cross-tenant K2K send rejected.
204    ///
205    /// Raised by the K2K broker when a kernel attempts to send a message to
206    /// another kernel registered under a different tenant. Tenant isolation
207    /// is the primary security boundary in multi-tenant deployments.
208    #[error("cross-tenant K2K send rejected: from tenant {from} to tenant {to}")]
209    TenantMismatch {
210        /// Tenant ID of the sending kernel.
211        from: u64,
212        /// Tenant ID of the destination kernel.
213        to: u64,
214    },
215
216    // ===== Pub/Sub Errors =====
217    /// Pub/sub error.
218    #[error("pub/sub error: {0}")]
219    PubSubError(String),
220
221    /// Topic not found.
222    #[error("topic not found: {0}")]
223    TopicNotFound(String),
224
225    /// Subscription error.
226    #[error("subscription error: {0}")]
227    SubscriptionError(String),
228
229    // ===== Multi-GPU Errors =====
230    /// Multi-GPU coordination error.
231    #[error("multi-GPU error: {0}")]
232    MultiGpuError(String),
233
234    /// Device not available.
235    #[error("device not available: {0}")]
236    DeviceNotAvailable(String),
237
238    /// Cross-device transfer failed.
239    #[error("cross-device transfer failed: {0}")]
240    CrossDeviceTransferFailed(String),
241
242    // ===== Telemetry Errors =====
243    /// Telemetry error.
244    #[error("telemetry error: {0}")]
245    TelemetryError(String),
246
247    /// Metrics collection failed.
248    #[error("metrics collection failed: {0}")]
249    MetricsCollectionFailed(String),
250
251    // ===== Configuration Errors =====
252    /// Invalid configuration.
253    #[error("invalid configuration: {0}")]
254    InvalidConfig(String),
255
256    /// Missing required configuration.
257    #[error("missing configuration: {0}")]
258    MissingConfig(String),
259
260    // ===== I/O Errors =====
261    /// I/O error wrapper.
262    #[error("I/O error: {0}")]
263    StdIoError(#[from] std::io::Error),
264
265    /// I/O error with string message.
266    #[error("I/O error: {0}")]
267    IoError(String),
268
269    // ===== Checkpoint Errors =====
270    /// Invalid checkpoint format or data.
271    #[error("invalid checkpoint: {0}")]
272    InvalidCheckpoint(String),
273
274    /// Checkpoint save failed.
275    #[error("checkpoint save failed: {0}")]
276    CheckpointSaveFailed(String),
277
278    /// Checkpoint restore failed.
279    #[error("checkpoint restore failed: {0}")]
280    CheckpointRestoreFailed(String),
281
282    /// Checkpoint not found.
283    #[error("checkpoint not found: {0}")]
284    CheckpointNotFound(String),
285
286    // ===== Health & Resilience Errors =====
287    /// Health check failed.
288    #[error("health check failed: {name} - {reason}")]
289    HealthCheckFailed {
290        /// Health check name
291        name: String,
292        /// Failure reason
293        reason: String,
294    },
295
296    /// Circuit breaker is open.
297    #[error("circuit breaker open: {name}")]
298    CircuitBreakerOpen {
299        /// Circuit breaker name
300        name: String,
301    },
302
303    /// Retry attempts exhausted.
304    #[error("retry exhausted after {attempts} attempts: {reason}")]
305    RetryExhausted {
306        /// Number of attempts made
307        attempts: u32,
308        /// Last failure reason
309        reason: String,
310    },
311
312    /// Kernel watchdog timeout.
313    #[error("kernel watchdog timeout: {kernel_id}")]
314    WatchdogTimeout {
315        /// Kernel ID that timed out
316        kernel_id: String,
317    },
318
319    /// Load shedding rejected request.
320    #[error("load shedding: request rejected at level {level}")]
321    LoadSheddingRejected {
322        /// Current degradation level
323        level: String,
324    },
325
326    // ===== Migration Errors =====
327    /// Kernel migration failed.
328    #[error("kernel migration failed: {0}")]
329    MigrationFailed(String),
330
331    /// Migration source not ready.
332    #[error("migration source not ready: {kernel_id}")]
333    MigrationSourceNotReady {
334        /// Source kernel ID
335        kernel_id: String,
336    },
337
338    /// Migration destination unavailable.
339    #[error("migration destination unavailable: device {device_id}")]
340    MigrationDestinationUnavailable {
341        /// Destination device ID
342        device_id: usize,
343    },
344
345    // ===== Observability Errors =====
346    /// Tracing error.
347    #[error("tracing error: {0}")]
348    TracingError(String),
349
350    /// Span not found.
351    #[error("span not found: {0}")]
352    SpanNotFound(String),
353
354    /// Metrics export failed.
355    #[error("metrics export failed: {0}")]
356    MetricsExportFailed(String),
357
358    // ===== Generic Errors =====
359    /// Internal error.
360    #[error("internal error: {0}")]
361    Internal(String),
362
363    /// Feature not supported.
364    #[error("feature not supported: {0}")]
365    NotSupported(String),
366
367    /// Operation cancelled.
368    #[error("operation cancelled")]
369    Cancelled,
370}
371
372impl RingKernelError {
373    /// Returns true if this error is recoverable.
374    pub fn is_recoverable(&self) -> bool {
375        matches!(
376            self,
377            RingKernelError::QueueFull { .. }
378                | RingKernelError::QueueEmpty
379                | RingKernelError::Timeout(_)
380                | RingKernelError::PoolExhausted
381                | RingKernelError::CircuitBreakerOpen { .. }
382                | RingKernelError::LoadSheddingRejected { .. }
383        )
384    }
385
386    /// Returns true if this error indicates a resource issue.
387    pub fn is_resource_error(&self) -> bool {
388        matches!(
389            self,
390            RingKernelError::AllocationFailed { .. }
391                | RingKernelError::HostAllocationFailed { .. }
392                | RingKernelError::OutOfMemory { .. }
393                | RingKernelError::PoolExhausted
394                | RingKernelError::MigrationDestinationUnavailable { .. }
395        )
396    }
397
398    /// Returns true if this is a fatal error requiring restart.
399    pub fn is_fatal(&self) -> bool {
400        matches!(
401            self,
402            RingKernelError::BackendInitFailed(_)
403                | RingKernelError::NoDeviceFound
404                | RingKernelError::LockPoisoned
405                | RingKernelError::Internal(_)
406        )
407    }
408
409    /// Returns true if this is a health/resilience related error.
410    pub fn is_health_error(&self) -> bool {
411        matches!(
412            self,
413            RingKernelError::HealthCheckFailed { .. }
414                | RingKernelError::CircuitBreakerOpen { .. }
415                | RingKernelError::RetryExhausted { .. }
416                | RingKernelError::WatchdogTimeout { .. }
417                | RingKernelError::LoadSheddingRejected { .. }
418        )
419    }
420
421    /// Returns true if this is a migration-related error.
422    pub fn is_migration_error(&self) -> bool {
423        matches!(
424            self,
425            RingKernelError::MigrationFailed(_)
426                | RingKernelError::MigrationSourceNotReady { .. }
427                | RingKernelError::MigrationDestinationUnavailable { .. }
428        )
429    }
430
431    /// Returns true if this is an observability-related error.
432    pub fn is_observability_error(&self) -> bool {
433        matches!(
434            self,
435            RingKernelError::TracingError(_)
436                | RingKernelError::SpanNotFound(_)
437                | RingKernelError::MetricsExportFailed(_)
438                | RingKernelError::TelemetryError(_)
439                | RingKernelError::MetricsCollectionFailed(_)
440        )
441    }
442}
443
#[cfg(test)]
mod tests {
    use super::*;

    // Display output is checked through `to_string()`, which goes through the
    // same `Display` impl that `format!("{}", err)` would use.

    #[test]
    fn test_error_display() {
        let err = RingKernelError::KernelNotFound("test_kernel".into());
        assert_eq!(err.to_string(), "kernel not found: test_kernel");

        let err = RingKernelError::QueueFull { capacity: 1024 };
        assert!(err.to_string().contains("1024"));
    }

    #[test]
    fn test_state_transition_display_is_debug_quoted() {
        // `{from:?}` / `{to:?}` use Debug formatting, so the states are quoted.
        let err = RingKernelError::InvalidStateTransition {
            from: "Active".into(),
            to: "Launched".into(),
        };
        assert_eq!(
            err.to_string(),
            "invalid state transition from \"Active\" to \"Launched\""
        );
    }

    #[test]
    fn test_io_error_conversion() {
        // `#[from]` on `StdIoError` provides `From<std::io::Error>`.
        let io = std::io::Error::new(std::io::ErrorKind::Other, "disk");
        let err: RingKernelError = io.into();
        assert!(matches!(err, RingKernelError::StdIoError(_)));
        assert_eq!(err.to_string(), "I/O error: disk");
    }

    #[test]
    fn test_error_classification() {
        assert!(RingKernelError::QueueFull { capacity: 1024 }.is_recoverable());
        assert!(RingKernelError::OutOfMemory {
            requested: 1000,
            available: 100
        }
        .is_resource_error());
        assert!(RingKernelError::LockPoisoned.is_fatal());
    }

    #[test]
    fn test_classification_negative_cases() {
        // A variant that belongs to none of the classification sets.
        let err = RingKernelError::Cancelled;
        assert!(!err.is_recoverable());
        assert!(!err.is_resource_error());
        assert!(!err.is_fatal());
        assert!(!err.is_health_error());
        assert!(!err.is_migration_error());
        assert!(!err.is_observability_error());

        assert!(!RingKernelError::LockPoisoned.is_recoverable());
        assert!(!RingKernelError::QueueEmpty.is_fatal());
    }

    #[test]
    fn test_health_error_display() {
        let err = RingKernelError::HealthCheckFailed {
            name: "liveness".into(),
            reason: "timeout".into(),
        };
        assert_eq!(err.to_string(), "health check failed: liveness - timeout");

        let err = RingKernelError::CircuitBreakerOpen {
            name: "gpu_ops".into(),
        };
        assert_eq!(err.to_string(), "circuit breaker open: gpu_ops");

        let err = RingKernelError::RetryExhausted {
            attempts: 5,
            reason: "connection refused".into(),
        };
        assert!(err.to_string().contains("5 attempts"));

        let err = RingKernelError::WatchdogTimeout {
            kernel_id: "kernel_42".into(),
        };
        assert!(err.to_string().contains("kernel_42"));
    }

    #[test]
    fn test_health_error_classification() {
        let breaker = RingKernelError::CircuitBreakerOpen {
            name: "test".into(),
        };
        assert!(breaker.is_recoverable());

        let shed = RingKernelError::LoadSheddingRejected {
            level: "critical".into(),
        };
        assert!(shed.is_recoverable());

        let health = RingKernelError::HealthCheckFailed {
            name: "test".into(),
            reason: "failed".into(),
        };
        assert!(health.is_health_error());

        let watchdog = RingKernelError::WatchdogTimeout {
            kernel_id: "k1".into(),
        };
        assert!(watchdog.is_health_error());
    }

    #[test]
    fn test_migration_error_display() {
        let err = RingKernelError::MigrationFailed("checkpoint transfer error".into());
        assert!(err.to_string().contains("checkpoint transfer error"));

        let err = RingKernelError::MigrationSourceNotReady {
            kernel_id: "kernel_1".into(),
        };
        assert!(err.to_string().contains("kernel_1"));

        let err = RingKernelError::MigrationDestinationUnavailable { device_id: 2 };
        assert!(err.to_string().contains("device 2"));
    }

    #[test]
    fn test_migration_error_classification() {
        assert!(RingKernelError::MigrationFailed("test".into()).is_migration_error());

        let source = RingKernelError::MigrationSourceNotReady {
            kernel_id: "k1".into(),
        };
        assert!(source.is_migration_error());

        // An unavailable destination counts as both a migration and a
        // resource error.
        let destination = RingKernelError::MigrationDestinationUnavailable { device_id: 0 };
        assert!(destination.is_migration_error());
        assert!(destination.is_resource_error());
    }

    #[test]
    fn test_observability_error_display() {
        let err = RingKernelError::TracingError("span creation failed".into());
        assert!(err.to_string().contains("span creation failed"));

        let err = RingKernelError::SpanNotFound("span_abc123".into());
        assert!(err.to_string().contains("span_abc123"));

        let err = RingKernelError::MetricsExportFailed("prometheus timeout".into());
        assert!(err.to_string().contains("prometheus timeout"));
    }

    #[test]
    fn test_observability_error_classification() {
        let errors = [
            RingKernelError::TracingError("test".into()),
            RingKernelError::SpanNotFound("test".into()),
            RingKernelError::MetricsExportFailed("test".into()),
            RingKernelError::TelemetryError("test".into()),
            RingKernelError::MetricsCollectionFailed("test".into()),
        ];
        for err in &errors {
            assert!(err.is_observability_error(), "not classified: {err:?}");
        }
    }
}