Skip to main content

ringkernel_core/
error.rs

1//! Error types for RingKernel operations.
2
3use thiserror::Error;
4
5/// Result type alias for RingKernel operations.
6pub type Result<T> = std::result::Result<T, RingKernelError>;
7
8/// Comprehensive error type for RingKernel operations.
9#[derive(Error, Debug)]
10pub enum RingKernelError {
11    // ===== Kernel Lifecycle Errors =====
12    /// Kernel not found with the given ID.
13    #[error("kernel not found: {0}")]
14    KernelNotFound(String),
15
16    /// Kernel is already active.
17    #[error("kernel already active: {0}")]
18    KernelAlreadyActive(String),
19
20    /// Kernel is not active.
21    #[error("kernel not active: {0}")]
22    KernelNotActive(String),
23
24    /// Kernel has already terminated.
25    #[error("kernel already terminated: {0}")]
26    KernelTerminated(String),
27
28    /// Invalid kernel state transition.
29    #[error("invalid state transition from {from:?} to {to:?}")]
30    InvalidStateTransition {
31        /// Current state
32        from: String,
33        /// Attempted target state
34        to: String,
35    },
36
37    /// Invalid kernel state.
38    #[error("invalid state: expected {expected}, got {actual}")]
39    InvalidState {
40        /// Expected state
41        expected: String,
42        /// Actual state
43        actual: String,
44    },
45
46    /// Kernel launch failed.
47    #[error("kernel launch failed: {0}")]
48    LaunchFailed(String),
49
50    /// Kernel compilation failed (NVRTC, shader compilation, etc.).
51    #[error("kernel compilation failed: {0}")]
52    CompilationError(String),
53
54    // ===== Message Errors =====
55    /// Queue is full, message cannot be enqueued.
56    #[error("queue full: capacity {capacity}, attempted to enqueue message")]
57    QueueFull {
58        /// Queue capacity
59        capacity: usize,
60    },
61
62    /// Queue is empty, no message to dequeue.
63    #[error("queue empty")]
64    QueueEmpty,
65
66    /// Message serialization failed.
67    #[error("serialization error: {0}")]
68    SerializationError(String),
69
70    /// Message deserialization failed.
71    #[error("deserialization error: {0}")]
72    DeserializationError(String),
73
74    /// Message validation failed.
75    #[error("message validation failed: {0}")]
76    ValidationError(String),
77
78    /// Message too large.
79    #[error("message too large: {size} bytes (max: {max} bytes)")]
80    MessageTooLarge {
81        /// Actual message size
82        size: usize,
83        /// Maximum allowed size
84        max: usize,
85    },
86
87    /// Message timeout.
88    #[error("message timeout after {0:?}")]
89    Timeout(std::time::Duration),
90
91    // ===== Memory Errors =====
92    /// GPU memory allocation failed.
93    #[error("GPU memory allocation failed: {size} bytes - {reason}")]
94    AllocationFailed {
95        /// Requested size
96        size: usize,
97        /// Failure reason
98        reason: String,
99    },
100
101    /// Host memory allocation failed.
102    #[error("host memory allocation failed: {size} bytes")]
103    HostAllocationFailed {
104        /// Requested size
105        size: usize,
106    },
107
108    /// Memory transfer failed.
109    #[error("memory transfer failed: {0}")]
110    TransferFailed(String),
111
112    /// Invalid memory alignment.
113    #[error("invalid alignment: expected {expected}, got {actual}")]
114    InvalidAlignment {
115        /// Expected alignment
116        expected: usize,
117        /// Actual alignment
118        actual: usize,
119    },
120
121    /// Out of GPU memory.
122    #[error("out of GPU memory: requested {requested} bytes, available {available} bytes")]
123    OutOfMemory {
124        /// Requested size
125        requested: usize,
126        /// Available memory
127        available: usize,
128    },
129
130    /// Memory pool exhausted.
131    #[error("memory pool exhausted")]
132    PoolExhausted,
133
134    /// Invalid index (out of bounds).
135    #[error("invalid index: {0}")]
136    InvalidIndex(usize),
137
138    /// Generic memory error.
139    #[error("memory error: {0}")]
140    MemoryError(String),
141
142    // ===== Backend Errors =====
143    /// Backend not available.
144    #[error("backend not available: {0}")]
145    BackendUnavailable(String),
146
147    /// Backend initialization failed.
148    #[error("backend initialization failed: {0}")]
149    BackendInitFailed(String),
150
151    /// No suitable GPU device found.
152    #[error("no GPU device found")]
153    NoDeviceFound,
154
155    /// Device selection failed.
156    #[error("device selection failed: {0}")]
157    DeviceSelectionFailed(String),
158
159    /// Backend operation failed.
160    #[error("backend error: {0}")]
161    BackendError(String),
162
163    // ===== Synchronization Errors =====
164    /// Deadlock detected.
165    #[error("deadlock detected")]
166    DeadlockDetected,
167
168    /// Lock poisoned.
169    #[error("lock poisoned")]
170    LockPoisoned,
171
172    /// Channel closed.
173    #[error("channel closed")]
174    ChannelClosed,
175
176    // ===== HLC Errors =====
177    /// Clock skew too large.
178    #[error("clock skew too large: {skew_ms}ms (max: {max_ms}ms)")]
179    ClockSkew {
180        /// Detected skew in milliseconds
181        skew_ms: u64,
182        /// Maximum allowed skew
183        max_ms: u64,
184    },
185
186    /// Invalid timestamp.
187    #[error("invalid timestamp")]
188    InvalidTimestamp,
189
190    // ===== K2K Messaging Errors =====
191    /// K2K messaging error.
192    #[error("K2K error: {0}")]
193    K2KError(String),
194
195    /// K2K destination not found.
196    #[error("K2K destination not found: {0}")]
197    K2KDestinationNotFound(String),
198
199    /// K2K delivery failed.
200    #[error("K2K delivery failed: {0}")]
201    K2KDeliveryFailed(String),
202
203    // ===== Pub/Sub Errors =====
204    /// Pub/sub error.
205    #[error("pub/sub error: {0}")]
206    PubSubError(String),
207
208    /// Topic not found.
209    #[error("topic not found: {0}")]
210    TopicNotFound(String),
211
212    /// Subscription error.
213    #[error("subscription error: {0}")]
214    SubscriptionError(String),
215
216    // ===== Multi-GPU Errors =====
217    /// Multi-GPU coordination error.
218    #[error("multi-GPU error: {0}")]
219    MultiGpuError(String),
220
221    /// Device not available.
222    #[error("device not available: {0}")]
223    DeviceNotAvailable(String),
224
225    /// Cross-device transfer failed.
226    #[error("cross-device transfer failed: {0}")]
227    CrossDeviceTransferFailed(String),
228
229    // ===== Telemetry Errors =====
230    /// Telemetry error.
231    #[error("telemetry error: {0}")]
232    TelemetryError(String),
233
234    /// Metrics collection failed.
235    #[error("metrics collection failed: {0}")]
236    MetricsCollectionFailed(String),
237
238    // ===== Configuration Errors =====
239    /// Invalid configuration.
240    #[error("invalid configuration: {0}")]
241    InvalidConfig(String),
242
243    /// Missing required configuration.
244    #[error("missing configuration: {0}")]
245    MissingConfig(String),
246
247    // ===== I/O Errors =====
248    /// I/O error wrapper.
249    #[error("I/O error: {0}")]
250    StdIoError(#[from] std::io::Error),
251
252    /// I/O error with string message.
253    #[error("I/O error: {0}")]
254    IoError(String),
255
256    // ===== Checkpoint Errors =====
257    /// Invalid checkpoint format or data.
258    #[error("invalid checkpoint: {0}")]
259    InvalidCheckpoint(String),
260
261    /// Checkpoint save failed.
262    #[error("checkpoint save failed: {0}")]
263    CheckpointSaveFailed(String),
264
265    /// Checkpoint restore failed.
266    #[error("checkpoint restore failed: {0}")]
267    CheckpointRestoreFailed(String),
268
269    /// Checkpoint not found.
270    #[error("checkpoint not found: {0}")]
271    CheckpointNotFound(String),
272
273    // ===== Health & Resilience Errors =====
274    /// Health check failed.
275    #[error("health check failed: {name} - {reason}")]
276    HealthCheckFailed {
277        /// Health check name
278        name: String,
279        /// Failure reason
280        reason: String,
281    },
282
283    /// Circuit breaker is open.
284    #[error("circuit breaker open: {name}")]
285    CircuitBreakerOpen {
286        /// Circuit breaker name
287        name: String,
288    },
289
290    /// Retry attempts exhausted.
291    #[error("retry exhausted after {attempts} attempts: {reason}")]
292    RetryExhausted {
293        /// Number of attempts made
294        attempts: u32,
295        /// Last failure reason
296        reason: String,
297    },
298
299    /// Kernel watchdog timeout.
300    #[error("kernel watchdog timeout: {kernel_id}")]
301    WatchdogTimeout {
302        /// Kernel ID that timed out
303        kernel_id: String,
304    },
305
306    /// Load shedding rejected request.
307    #[error("load shedding: request rejected at level {level}")]
308    LoadSheddingRejected {
309        /// Current degradation level
310        level: String,
311    },
312
313    // ===== Migration Errors =====
314    /// Kernel migration failed.
315    #[error("kernel migration failed: {0}")]
316    MigrationFailed(String),
317
318    /// Migration source not ready.
319    #[error("migration source not ready: {kernel_id}")]
320    MigrationSourceNotReady {
321        /// Source kernel ID
322        kernel_id: String,
323    },
324
325    /// Migration destination unavailable.
326    #[error("migration destination unavailable: device {device_id}")]
327    MigrationDestinationUnavailable {
328        /// Destination device ID
329        device_id: usize,
330    },
331
332    // ===== Observability Errors =====
333    /// Tracing error.
334    #[error("tracing error: {0}")]
335    TracingError(String),
336
337    /// Span not found.
338    #[error("span not found: {0}")]
339    SpanNotFound(String),
340
341    /// Metrics export failed.
342    #[error("metrics export failed: {0}")]
343    MetricsExportFailed(String),
344
345    // ===== Generic Errors =====
346    /// Internal error.
347    #[error("internal error: {0}")]
348    Internal(String),
349
350    /// Feature not supported.
351    #[error("feature not supported: {0}")]
352    NotSupported(String),
353
354    /// Operation cancelled.
355    #[error("operation cancelled")]
356    Cancelled,
357}
358
359impl RingKernelError {
360    /// Returns true if this error is recoverable.
361    pub fn is_recoverable(&self) -> bool {
362        matches!(
363            self,
364            RingKernelError::QueueFull { .. }
365                | RingKernelError::QueueEmpty
366                | RingKernelError::Timeout(_)
367                | RingKernelError::PoolExhausted
368                | RingKernelError::CircuitBreakerOpen { .. }
369                | RingKernelError::LoadSheddingRejected { .. }
370        )
371    }
372
373    /// Returns true if this error indicates a resource issue.
374    pub fn is_resource_error(&self) -> bool {
375        matches!(
376            self,
377            RingKernelError::AllocationFailed { .. }
378                | RingKernelError::HostAllocationFailed { .. }
379                | RingKernelError::OutOfMemory { .. }
380                | RingKernelError::PoolExhausted
381                | RingKernelError::MigrationDestinationUnavailable { .. }
382        )
383    }
384
385    /// Returns true if this is a fatal error requiring restart.
386    pub fn is_fatal(&self) -> bool {
387        matches!(
388            self,
389            RingKernelError::BackendInitFailed(_)
390                | RingKernelError::NoDeviceFound
391                | RingKernelError::LockPoisoned
392                | RingKernelError::Internal(_)
393        )
394    }
395
396    /// Returns true if this is a health/resilience related error.
397    pub fn is_health_error(&self) -> bool {
398        matches!(
399            self,
400            RingKernelError::HealthCheckFailed { .. }
401                | RingKernelError::CircuitBreakerOpen { .. }
402                | RingKernelError::RetryExhausted { .. }
403                | RingKernelError::WatchdogTimeout { .. }
404                | RingKernelError::LoadSheddingRejected { .. }
405        )
406    }
407
408    /// Returns true if this is a migration-related error.
409    pub fn is_migration_error(&self) -> bool {
410        matches!(
411            self,
412            RingKernelError::MigrationFailed(_)
413                | RingKernelError::MigrationSourceNotReady { .. }
414                | RingKernelError::MigrationDestinationUnavailable { .. }
415        )
416    }
417
418    /// Returns true if this is an observability-related error.
419    pub fn is_observability_error(&self) -> bool {
420        matches!(
421            self,
422            RingKernelError::TracingError(_)
423                | RingKernelError::SpanNotFound(_)
424                | RingKernelError::MetricsExportFailed(_)
425                | RingKernelError::TelemetryError(_)
426                | RingKernelError::MetricsCollectionFailed(_)
427        )
428    }
429}
430
#[cfg(test)]
mod tests {
    use super::*;

    /// Tuple and struct variants interpolate their payload into the message.
    #[test]
    fn test_error_display() {
        let not_found = RingKernelError::KernelNotFound("test_kernel".to_string());
        assert_eq!(not_found.to_string(), "kernel not found: test_kernel");

        let queue_full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(queue_full.to_string().contains("1024"));
    }

    /// One representative variant per basic classification predicate.
    #[test]
    fn test_error_classification() {
        let queue_full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(queue_full.is_recoverable());

        let oom = RingKernelError::OutOfMemory {
            requested: 1000,
            available: 100,
        };
        assert!(oom.is_resource_error());

        assert!(RingKernelError::LockPoisoned.is_fatal());
    }

    /// Health/resilience variants render their named fields.
    #[test]
    fn test_health_error_display() {
        let health = RingKernelError::HealthCheckFailed {
            name: "liveness".to_string(),
            reason: "timeout".to_string(),
        };
        assert_eq!(
            health.to_string(),
            "health check failed: liveness - timeout"
        );

        let breaker = RingKernelError::CircuitBreakerOpen {
            name: "gpu_ops".to_string(),
        };
        assert_eq!(breaker.to_string(), "circuit breaker open: gpu_ops");

        let retries = RingKernelError::RetryExhausted {
            attempts: 5,
            reason: "connection refused".to_string(),
        };
        assert!(retries.to_string().contains("5 attempts"));

        let watchdog = RingKernelError::WatchdogTimeout {
            kernel_id: "kernel_42".to_string(),
        };
        assert!(watchdog.to_string().contains("kernel_42"));
    }

    /// Health variants are classified as recoverable and/or health errors.
    #[test]
    fn test_health_error_classification() {
        let breaker = RingKernelError::CircuitBreakerOpen {
            name: "test".to_string(),
        };
        assert!(breaker.is_recoverable());

        let shed = RingKernelError::LoadSheddingRejected {
            level: "critical".to_string(),
        };
        assert!(shed.is_recoverable());

        let health = RingKernelError::HealthCheckFailed {
            name: "test".to_string(),
            reason: "failed".to_string(),
        };
        assert!(health.is_health_error());

        let watchdog = RingKernelError::WatchdogTimeout {
            kernel_id: "k1".to_string(),
        };
        assert!(watchdog.is_health_error());
    }

    /// Migration variants include their payload in the message.
    #[test]
    fn test_migration_error_display() {
        let failed = RingKernelError::MigrationFailed("checkpoint transfer error".to_string());
        assert!(failed.to_string().contains("checkpoint transfer error"));

        let not_ready = RingKernelError::MigrationSourceNotReady {
            kernel_id: "kernel_1".to_string(),
        };
        assert!(not_ready.to_string().contains("kernel_1"));

        let no_dest = RingKernelError::MigrationDestinationUnavailable { device_id: 2 };
        assert!(no_dest.to_string().contains("device 2"));
    }

    /// Migration variants are migration errors; an unavailable destination
    /// additionally counts as a resource error.
    #[test]
    fn test_migration_error_classification() {
        let failed = RingKernelError::MigrationFailed("test".to_string());
        assert!(failed.is_migration_error());

        let not_ready = RingKernelError::MigrationSourceNotReady {
            kernel_id: "k1".to_string(),
        };
        assert!(not_ready.is_migration_error());

        let no_dest = RingKernelError::MigrationDestinationUnavailable { device_id: 0 };
        assert!(no_dest.is_migration_error());

        let no_dest = RingKernelError::MigrationDestinationUnavailable { device_id: 0 };
        assert!(no_dest.is_resource_error());
    }

    /// Observability variants include their payload in the message.
    #[test]
    fn test_observability_error_display() {
        let tracing = RingKernelError::TracingError("span creation failed".to_string());
        assert!(tracing.to_string().contains("span creation failed"));

        let span = RingKernelError::SpanNotFound("span_abc123".to_string());
        assert!(span.to_string().contains("span_abc123"));

        let export = RingKernelError::MetricsExportFailed("prometheus timeout".to_string());
        assert!(export.to_string().contains("prometheus timeout"));
    }

    /// Tracing, span, metrics and telemetry variants are all observability errors.
    #[test]
    fn test_observability_error_classification() {
        let payload = || "test".to_string();

        assert!(RingKernelError::TracingError(payload()).is_observability_error());
        assert!(RingKernelError::SpanNotFound(payload()).is_observability_error());
        assert!(RingKernelError::MetricsExportFailed(payload()).is_observability_error());
        assert!(RingKernelError::TelemetryError(payload()).is_observability_error());
        assert!(RingKernelError::MetricsCollectionFailed(payload()).is_observability_error());
    }
}