ringkernel_core/
error.rs

1//! Error types for RingKernel operations.
2
3use thiserror::Error;
4
5/// Result type alias for RingKernel operations.
6pub type Result<T> = std::result::Result<T, RingKernelError>;
7
8/// Comprehensive error type for RingKernel operations.
9#[derive(Error, Debug)]
10pub enum RingKernelError {
11    // ===== Kernel Lifecycle Errors =====
12    /// Kernel not found with the given ID.
13    #[error("kernel not found: {0}")]
14    KernelNotFound(String),
15
16    /// Kernel is already active.
17    #[error("kernel already active: {0}")]
18    KernelAlreadyActive(String),
19
20    /// Kernel is not active.
21    #[error("kernel not active: {0}")]
22    KernelNotActive(String),
23
24    /// Kernel has already terminated.
25    #[error("kernel already terminated: {0}")]
26    KernelTerminated(String),
27
28    /// Invalid kernel state transition.
29    #[error("invalid state transition from {from:?} to {to:?}")]
30    InvalidStateTransition {
31        /// Current state
32        from: String,
33        /// Attempted target state
34        to: String,
35    },
36
37    /// Invalid kernel state.
38    #[error("invalid state: expected {expected}, got {actual}")]
39    InvalidState {
40        /// Expected state
41        expected: String,
42        /// Actual state
43        actual: String,
44    },
45
46    /// Kernel launch failed.
47    #[error("kernel launch failed: {0}")]
48    LaunchFailed(String),
49
50    /// Kernel compilation failed (NVRTC, shader compilation, etc.).
51    #[error("kernel compilation failed: {0}")]
52    CompilationError(String),
53
54    // ===== Message Errors =====
55    /// Queue is full, message cannot be enqueued.
56    #[error("queue full: capacity {capacity}, attempted to enqueue message")]
57    QueueFull {
58        /// Queue capacity
59        capacity: usize,
60    },
61
62    /// Queue is empty, no message to dequeue.
63    #[error("queue empty")]
64    QueueEmpty,
65
66    /// Message serialization failed.
67    #[error("serialization error: {0}")]
68    SerializationError(String),
69
70    /// Message deserialization failed.
71    #[error("deserialization error: {0}")]
72    DeserializationError(String),
73
74    /// Message validation failed.
75    #[error("message validation failed: {0}")]
76    ValidationError(String),
77
78    /// Message too large.
79    #[error("message too large: {size} bytes (max: {max} bytes)")]
80    MessageTooLarge {
81        /// Actual message size
82        size: usize,
83        /// Maximum allowed size
84        max: usize,
85    },
86
87    /// Message timeout.
88    #[error("message timeout after {0:?}")]
89    Timeout(std::time::Duration),
90
91    // ===== Memory Errors =====
92    /// GPU memory allocation failed.
93    #[error("GPU memory allocation failed: {size} bytes - {reason}")]
94    AllocationFailed {
95        /// Requested size
96        size: usize,
97        /// Failure reason
98        reason: String,
99    },
100
101    /// Host memory allocation failed.
102    #[error("host memory allocation failed: {size} bytes")]
103    HostAllocationFailed {
104        /// Requested size
105        size: usize,
106    },
107
108    /// Memory transfer failed.
109    #[error("memory transfer failed: {0}")]
110    TransferFailed(String),
111
112    /// Invalid memory alignment.
113    #[error("invalid alignment: expected {expected}, got {actual}")]
114    InvalidAlignment {
115        /// Expected alignment
116        expected: usize,
117        /// Actual alignment
118        actual: usize,
119    },
120
121    /// Out of GPU memory.
122    #[error("out of GPU memory: requested {requested} bytes, available {available} bytes")]
123    OutOfMemory {
124        /// Requested size
125        requested: usize,
126        /// Available memory
127        available: usize,
128    },
129
130    /// Memory pool exhausted.
131    #[error("memory pool exhausted")]
132    PoolExhausted,
133
134    /// Generic memory error.
135    #[error("memory error: {0}")]
136    MemoryError(String),
137
138    // ===== Backend Errors =====
139    /// Backend not available.
140    #[error("backend not available: {0}")]
141    BackendUnavailable(String),
142
143    /// Backend initialization failed.
144    #[error("backend initialization failed: {0}")]
145    BackendInitFailed(String),
146
147    /// No suitable GPU device found.
148    #[error("no GPU device found")]
149    NoDeviceFound,
150
151    /// Device selection failed.
152    #[error("device selection failed: {0}")]
153    DeviceSelectionFailed(String),
154
155    /// Backend operation failed.
156    #[error("backend error: {0}")]
157    BackendError(String),
158
159    // ===== Synchronization Errors =====
160    /// Deadlock detected.
161    #[error("deadlock detected")]
162    DeadlockDetected,
163
164    /// Lock poisoned.
165    #[error("lock poisoned")]
166    LockPoisoned,
167
168    /// Channel closed.
169    #[error("channel closed")]
170    ChannelClosed,
171
172    // ===== HLC Errors =====
173    /// Clock skew too large.
174    #[error("clock skew too large: {skew_ms}ms (max: {max_ms}ms)")]
175    ClockSkew {
176        /// Detected skew in milliseconds
177        skew_ms: u64,
178        /// Maximum allowed skew
179        max_ms: u64,
180    },
181
182    /// Invalid timestamp.
183    #[error("invalid timestamp")]
184    InvalidTimestamp,
185
186    // ===== K2K Messaging Errors =====
187    /// K2K messaging error.
188    #[error("K2K error: {0}")]
189    K2KError(String),
190
191    /// K2K destination not found.
192    #[error("K2K destination not found: {0}")]
193    K2KDestinationNotFound(String),
194
195    /// K2K delivery failed.
196    #[error("K2K delivery failed: {0}")]
197    K2KDeliveryFailed(String),
198
199    // ===== Pub/Sub Errors =====
200    /// Pub/sub error.
201    #[error("pub/sub error: {0}")]
202    PubSubError(String),
203
204    /// Topic not found.
205    #[error("topic not found: {0}")]
206    TopicNotFound(String),
207
208    /// Subscription error.
209    #[error("subscription error: {0}")]
210    SubscriptionError(String),
211
212    // ===== Multi-GPU Errors =====
213    /// Multi-GPU coordination error.
214    #[error("multi-GPU error: {0}")]
215    MultiGpuError(String),
216
217    /// Device not available.
218    #[error("device not available: {0}")]
219    DeviceNotAvailable(String),
220
221    /// Cross-device transfer failed.
222    #[error("cross-device transfer failed: {0}")]
223    CrossDeviceTransferFailed(String),
224
225    // ===== Telemetry Errors =====
226    /// Telemetry error.
227    #[error("telemetry error: {0}")]
228    TelemetryError(String),
229
230    /// Metrics collection failed.
231    #[error("metrics collection failed: {0}")]
232    MetricsCollectionFailed(String),
233
234    // ===== Configuration Errors =====
235    /// Invalid configuration.
236    #[error("invalid configuration: {0}")]
237    InvalidConfig(String),
238
239    /// Missing required configuration.
240    #[error("missing configuration: {0}")]
241    MissingConfig(String),
242
243    // ===== I/O Errors =====
244    /// I/O error wrapper.
245    #[error("I/O error: {0}")]
246    IoError(#[from] std::io::Error),
247
248    // ===== Generic Errors =====
249    /// Internal error.
250    #[error("internal error: {0}")]
251    Internal(String),
252
253    /// Feature not supported.
254    #[error("feature not supported: {0}")]
255    NotSupported(String),
256
257    /// Operation cancelled.
258    #[error("operation cancelled")]
259    Cancelled,
260}
261
262impl RingKernelError {
263    /// Returns true if this error is recoverable.
264    pub fn is_recoverable(&self) -> bool {
265        matches!(
266            self,
267            RingKernelError::QueueFull { .. }
268                | RingKernelError::QueueEmpty
269                | RingKernelError::Timeout(_)
270                | RingKernelError::PoolExhausted
271        )
272    }
273
274    /// Returns true if this error indicates a resource issue.
275    pub fn is_resource_error(&self) -> bool {
276        matches!(
277            self,
278            RingKernelError::AllocationFailed { .. }
279                | RingKernelError::HostAllocationFailed { .. }
280                | RingKernelError::OutOfMemory { .. }
281                | RingKernelError::PoolExhausted
282        )
283    }
284
285    /// Returns true if this is a fatal error requiring restart.
286    pub fn is_fatal(&self) -> bool {
287        matches!(
288            self,
289            RingKernelError::BackendInitFailed(_)
290                | RingKernelError::NoDeviceFound
291                | RingKernelError::LockPoisoned
292                | RingKernelError::Internal(_)
293        )
294    }
295}
296
#[cfg(test)]
mod tests {
    use super::*;

    /// The rendered message follows the variant's `#[error]` template.
    #[test]
    fn test_error_display() {
        let missing = RingKernelError::KernelNotFound(String::from("test_kernel"));
        assert_eq!(missing.to_string(), "kernel not found: test_kernel");

        let full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(full.to_string().contains("1024"));
    }

    /// One representative variant per classification predicate.
    #[test]
    fn test_error_classification() {
        let full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(full.is_recoverable());

        let oom = RingKernelError::OutOfMemory {
            requested: 1000,
            available: 100,
        };
        assert!(oom.is_resource_error());

        let poisoned = RingKernelError::LockPoisoned;
        assert!(poisoned.is_fatal());
    }
}