1use thiserror::Error;
4
/// Shorthand for `std::result::Result` with the error type fixed to [`RingKernelError`].
pub type Result<T> = std::result::Result<T, RingKernelError>;
7
/// Error type used by the `Result` alias declared in this file.
///
/// `Display` and `std::error::Error` are derived through `thiserror`; the text of each
/// `#[error(...)]` attribute is exactly what `Display` prints for that variant.
///
/// `String` payloads are interpolated verbatim into the message. Whether a given payload
/// is an identifier or free-form detail is decided by the call sites, which are not
/// visible from this definition.
#[derive(Error, Debug)]
pub enum RingKernelError {
    /// Displays as `kernel not found: <payload>`.
    #[error("kernel not found: {0}")]
    KernelNotFound(String),

    /// Displays as `kernel already active: <payload>`.
    #[error("kernel already active: {0}")]
    KernelAlreadyActive(String),

    /// Displays as `kernel not active: <payload>`.
    #[error("kernel not active: {0}")]
    KernelNotActive(String),

    /// Displays as `kernel already terminated: <payload>`.
    #[error("kernel already terminated: {0}")]
    KernelTerminated(String),

    /// A state transition was rejected.
    ///
    /// Both fields are rendered with `{:?}`, so they appear quoted in the message.
    #[error("invalid state transition from {from:?} to {to:?}")]
    InvalidStateTransition {
        /// State named as the origin of the transition.
        from: String,
        /// State named as the target of the transition.
        to: String,
    },

    /// The observed state differs from the one that was expected.
    #[error("invalid state: expected {expected}, got {actual}")]
    InvalidState {
        /// State that was expected.
        expected: String,
        /// State that was actually found.
        actual: String,
    },

    /// Displays as `kernel launch failed: <payload>`.
    #[error("kernel launch failed: {0}")]
    LaunchFailed(String),

    /// Displays as `kernel compilation failed: <payload>`.
    #[error("kernel compilation failed: {0}")]
    CompilationError(String),

    /// An enqueue was attempted on a full queue.
    #[error("queue full: capacity {capacity}, attempted to enqueue message")]
    QueueFull {
        /// Queue capacity reported in the message.
        capacity: usize,
    },

    /// Displays as `queue empty`.
    #[error("queue empty")]
    QueueEmpty,

    /// Displays as `serialization error: <payload>`.
    #[error("serialization error: {0}")]
    SerializationError(String),

    /// Displays as `deserialization error: <payload>`.
    #[error("deserialization error: {0}")]
    DeserializationError(String),

    /// Displays as `message validation failed: <payload>`.
    #[error("message validation failed: {0}")]
    ValidationError(String),

    /// A message exceeded the maximum size.
    #[error("message too large: {size} bytes (max: {max} bytes)")]
    MessageTooLarge {
        /// Size of the message, printed as bytes.
        size: usize,
        /// Maximum allowed size, printed as bytes.
        max: usize,
    },

    /// A message timed out; the duration is rendered with `{:?}`.
    #[error("message timeout after {0:?}")]
    Timeout(std::time::Duration),

    /// GPU memory allocation failed.
    #[error("GPU memory allocation failed: {size} bytes - {reason}")]
    AllocationFailed {
        /// Requested size, printed as bytes.
        size: usize,
        /// Reason text appended to the message.
        reason: String,
    },

    /// Host memory allocation failed.
    #[error("host memory allocation failed: {size} bytes")]
    HostAllocationFailed {
        /// Requested size, printed as bytes.
        size: usize,
    },

    /// Displays as `memory transfer failed: <payload>`.
    #[error("memory transfer failed: {0}")]
    TransferFailed(String),

    /// An alignment value did not match the expected one.
    #[error("invalid alignment: expected {expected}, got {actual}")]
    InvalidAlignment {
        /// Alignment that was expected.
        expected: usize,
        /// Alignment that was actually supplied.
        actual: usize,
    },

    /// More GPU memory was requested than is available.
    #[error("out of GPU memory: requested {requested} bytes, available {available} bytes")]
    OutOfMemory {
        /// Amount requested, printed as bytes.
        requested: usize,
        /// Amount available, printed as bytes.
        available: usize,
    },

    /// Displays as `memory pool exhausted`.
    #[error("memory pool exhausted")]
    PoolExhausted,

    /// Displays as `invalid index: <index>`.
    #[error("invalid index: {0}")]
    InvalidIndex(usize),

    /// Displays as `memory error: <payload>`.
    #[error("memory error: {0}")]
    MemoryError(String),

    /// Displays as `backend not available: <payload>`.
    #[error("backend not available: {0}")]
    BackendUnavailable(String),

    /// Displays as `backend initialization failed: <payload>`.
    #[error("backend initialization failed: {0}")]
    BackendInitFailed(String),

    /// Displays as `no GPU device found`.
    #[error("no GPU device found")]
    NoDeviceFound,

    /// Displays as `device selection failed: <payload>`.
    #[error("device selection failed: {0}")]
    DeviceSelectionFailed(String),

    /// Displays as `backend error: <payload>`.
    #[error("backend error: {0}")]
    BackendError(String),

    /// Displays as `deadlock detected`.
    #[error("deadlock detected")]
    DeadlockDetected,

    /// Displays as `lock poisoned`.
    #[error("lock poisoned")]
    LockPoisoned,

    /// Displays as `channel closed`.
    #[error("channel closed")]
    ChannelClosed,

    /// Clock skew exceeded the allowed maximum.
    #[error("clock skew too large: {skew_ms}ms (max: {max_ms}ms)")]
    ClockSkew {
        /// Observed skew, printed with an `ms` suffix.
        skew_ms: u64,
        /// Maximum allowed skew, printed with an `ms` suffix.
        max_ms: u64,
    },

    /// Displays as `invalid timestamp`.
    #[error("invalid timestamp")]
    InvalidTimestamp,

    /// Displays as `K2K error: <payload>`.
    #[error("K2K error: {0}")]
    K2KError(String),

    /// Displays as `K2K destination not found: <payload>`.
    #[error("K2K destination not found: {0}")]
    K2KDestinationNotFound(String),

    /// Displays as `K2K delivery failed: <payload>`.
    #[error("K2K delivery failed: {0}")]
    K2KDeliveryFailed(String),

    /// Displays as `pub/sub error: <payload>`.
    #[error("pub/sub error: {0}")]
    PubSubError(String),

    /// Displays as `topic not found: <payload>`.
    #[error("topic not found: {0}")]
    TopicNotFound(String),

    /// Displays as `subscription error: <payload>`.
    #[error("subscription error: {0}")]
    SubscriptionError(String),

    /// Displays as `multi-GPU error: <payload>`.
    #[error("multi-GPU error: {0}")]
    MultiGpuError(String),

    /// Displays as `device not available: <payload>`.
    #[error("device not available: {0}")]
    DeviceNotAvailable(String),

    /// Displays as `cross-device transfer failed: <payload>`.
    #[error("cross-device transfer failed: {0}")]
    CrossDeviceTransferFailed(String),

    /// Displays as `telemetry error: <payload>`.
    #[error("telemetry error: {0}")]
    TelemetryError(String),

    /// Displays as `metrics collection failed: <payload>`.
    #[error("metrics collection failed: {0}")]
    MetricsCollectionFailed(String),

    /// Displays as `invalid configuration: <payload>`.
    #[error("invalid configuration: {0}")]
    InvalidConfig(String),

    /// Displays as `missing configuration: <payload>`.
    #[error("missing configuration: {0}")]
    MissingConfig(String),

    /// Wraps a `std::io::Error`.
    ///
    /// `#[from]` makes `thiserror` generate `From<std::io::Error>`, so `?` converts
    /// I/O errors into this variant. Shares its message prefix with `IoError`.
    #[error("I/O error: {0}")]
    StdIoError(#[from] std::io::Error),

    /// I/O failure described only by text; prints the same prefix as `StdIoError`.
    #[error("I/O error: {0}")]
    IoError(String),

    /// Displays as `invalid checkpoint: <payload>`.
    #[error("invalid checkpoint: {0}")]
    InvalidCheckpoint(String),

    /// Displays as `checkpoint save failed: <payload>`.
    #[error("checkpoint save failed: {0}")]
    CheckpointSaveFailed(String),

    /// Displays as `checkpoint restore failed: <payload>`.
    #[error("checkpoint restore failed: {0}")]
    CheckpointRestoreFailed(String),

    /// Displays as `checkpoint not found: <payload>`.
    #[error("checkpoint not found: {0}")]
    CheckpointNotFound(String),

    /// A named health check failed.
    #[error("health check failed: {name} - {reason}")]
    HealthCheckFailed {
        /// Name of the health check.
        name: String,
        /// Reason text appended to the message.
        reason: String,
    },

    /// A named circuit breaker is open.
    #[error("circuit breaker open: {name}")]
    CircuitBreakerOpen {
        /// Name of the circuit breaker.
        name: String,
    },

    /// Retries were exhausted.
    #[error("retry exhausted after {attempts} attempts: {reason}")]
    RetryExhausted {
        /// Number of attempts reported in the message.
        attempts: u32,
        /// Reason text appended to the message.
        reason: String,
    },

    /// A kernel watchdog timed out.
    #[error("kernel watchdog timeout: {kernel_id}")]
    WatchdogTimeout {
        /// Kernel identifier reported in the message.
        kernel_id: String,
    },

    /// A request was rejected by load shedding.
    #[error("load shedding: request rejected at level {level}")]
    LoadSheddingRejected {
        /// Load-shedding level reported in the message.
        level: String,
    },

    /// Displays as `kernel migration failed: <payload>`.
    #[error("kernel migration failed: {0}")]
    MigrationFailed(String),

    /// The migration source was not ready.
    #[error("migration source not ready: {kernel_id}")]
    MigrationSourceNotReady {
        /// Kernel identifier reported in the message.
        kernel_id: String,
    },

    /// The migration destination device was unavailable.
    #[error("migration destination unavailable: device {device_id}")]
    MigrationDestinationUnavailable {
        /// Device identifier reported in the message.
        device_id: usize,
    },

    /// Displays as `tracing error: <payload>`.
    #[error("tracing error: {0}")]
    TracingError(String),

    /// Displays as `span not found: <payload>`.
    #[error("span not found: {0}")]
    SpanNotFound(String),

    /// Displays as `metrics export failed: <payload>`.
    #[error("metrics export failed: {0}")]
    MetricsExportFailed(String),

    /// Displays as `internal error: <payload>`.
    #[error("internal error: {0}")]
    Internal(String),

    /// Displays as `feature not supported: <payload>`.
    #[error("feature not supported: {0}")]
    NotSupported(String),

    /// Displays as `operation cancelled`.
    #[error("operation cancelled")]
    Cancelled,
}
358
359impl RingKernelError {
360 pub fn is_recoverable(&self) -> bool {
362 matches!(
363 self,
364 RingKernelError::QueueFull { .. }
365 | RingKernelError::QueueEmpty
366 | RingKernelError::Timeout(_)
367 | RingKernelError::PoolExhausted
368 | RingKernelError::CircuitBreakerOpen { .. }
369 | RingKernelError::LoadSheddingRejected { .. }
370 )
371 }
372
373 pub fn is_resource_error(&self) -> bool {
375 matches!(
376 self,
377 RingKernelError::AllocationFailed { .. }
378 | RingKernelError::HostAllocationFailed { .. }
379 | RingKernelError::OutOfMemory { .. }
380 | RingKernelError::PoolExhausted
381 | RingKernelError::MigrationDestinationUnavailable { .. }
382 )
383 }
384
385 pub fn is_fatal(&self) -> bool {
387 matches!(
388 self,
389 RingKernelError::BackendInitFailed(_)
390 | RingKernelError::NoDeviceFound
391 | RingKernelError::LockPoisoned
392 | RingKernelError::Internal(_)
393 )
394 }
395
396 pub fn is_health_error(&self) -> bool {
398 matches!(
399 self,
400 RingKernelError::HealthCheckFailed { .. }
401 | RingKernelError::CircuitBreakerOpen { .. }
402 | RingKernelError::RetryExhausted { .. }
403 | RingKernelError::WatchdogTimeout { .. }
404 | RingKernelError::LoadSheddingRejected { .. }
405 )
406 }
407
408 pub fn is_migration_error(&self) -> bool {
410 matches!(
411 self,
412 RingKernelError::MigrationFailed(_)
413 | RingKernelError::MigrationSourceNotReady { .. }
414 | RingKernelError::MigrationDestinationUnavailable { .. }
415 )
416 }
417
418 pub fn is_observability_error(&self) -> bool {
420 matches!(
421 self,
422 RingKernelError::TracingError(_)
423 | RingKernelError::SpanNotFound(_)
424 | RingKernelError::MetricsExportFailed(_)
425 | RingKernelError::TelemetryError(_)
426 | RingKernelError::MetricsCollectionFailed(_)
427 )
428 }
429}
430
#[cfg(test)]
mod tests {
    use super::*;

    /// `Display` output for one tuple variant and one struct variant.
    #[test]
    fn test_error_display() {
        let not_found = RingKernelError::KernelNotFound("test_kernel".into());
        assert_eq!(not_found.to_string(), "kernel not found: test_kernel");

        let queue_full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(queue_full.to_string().contains("1024"));
    }

    /// One representative variant per basic classification predicate.
    #[test]
    fn test_error_classification() {
        let queue_full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(queue_full.is_recoverable());

        let oom = RingKernelError::OutOfMemory {
            requested: 1000,
            available: 100,
        };
        assert!(oom.is_resource_error());

        assert!(RingKernelError::LockPoisoned.is_fatal());
    }

    /// `Display` output for the health-related variants.
    #[test]
    fn test_health_error_display() {
        let health = RingKernelError::HealthCheckFailed {
            name: "liveness".into(),
            reason: "timeout".into(),
        };
        assert_eq!(health.to_string(), "health check failed: liveness - timeout");

        let breaker = RingKernelError::CircuitBreakerOpen {
            name: "gpu_ops".into(),
        };
        assert_eq!(breaker.to_string(), "circuit breaker open: gpu_ops");

        let retry = RingKernelError::RetryExhausted {
            attempts: 5,
            reason: "connection refused".into(),
        };
        assert!(retry.to_string().contains("5 attempts"));

        let watchdog = RingKernelError::WatchdogTimeout {
            kernel_id: "kernel_42".into(),
        };
        assert!(watchdog.to_string().contains("kernel_42"));
    }

    /// Health-related variants against `is_recoverable` and `is_health_error`.
    #[test]
    fn test_health_error_classification() {
        let breaker = RingKernelError::CircuitBreakerOpen {
            name: "test".into(),
        };
        assert!(breaker.is_recoverable());

        let shed = RingKernelError::LoadSheddingRejected {
            level: "critical".into(),
        };
        assert!(shed.is_recoverable());

        let health = RingKernelError::HealthCheckFailed {
            name: "test".into(),
            reason: "failed".into(),
        };
        assert!(health.is_health_error());

        let watchdog = RingKernelError::WatchdogTimeout {
            kernel_id: "k1".into(),
        };
        assert!(watchdog.is_health_error());
    }

    /// `Display` output for the migration variants.
    #[test]
    fn test_migration_error_display() {
        let failed = RingKernelError::MigrationFailed("checkpoint transfer error".into());
        assert!(failed.to_string().contains("checkpoint transfer error"));

        let source = RingKernelError::MigrationSourceNotReady {
            kernel_id: "kernel_1".into(),
        };
        assert!(source.to_string().contains("kernel_1"));

        let destination = RingKernelError::MigrationDestinationUnavailable { device_id: 2 };
        assert!(destination.to_string().contains("device 2"));
    }

    /// Migration variants against `is_migration_error`; the destination variant is
    /// also checked against `is_resource_error`.
    #[test]
    fn test_migration_error_classification() {
        let failed = RingKernelError::MigrationFailed("test".into());
        assert!(failed.is_migration_error());

        let source = RingKernelError::MigrationSourceNotReady {
            kernel_id: "k1".into(),
        };
        assert!(source.is_migration_error());

        let destination = RingKernelError::MigrationDestinationUnavailable { device_id: 0 };
        assert!(destination.is_migration_error());
        assert!(destination.is_resource_error());
    }

    /// `Display` output for the tracing and metrics-export variants.
    #[test]
    fn test_observability_error_display() {
        let tracing = RingKernelError::TracingError("span creation failed".into());
        assert!(tracing.to_string().contains("span creation failed"));

        let span = RingKernelError::SpanNotFound("span_abc123".into());
        assert!(span.to_string().contains("span_abc123"));

        let export = RingKernelError::MetricsExportFailed("prometheus timeout".into());
        assert!(export.to_string().contains("prometheus timeout"));
    }

    /// Every variant matched by `is_observability_error`.
    #[test]
    fn test_observability_error_classification() {
        let cases = [
            RingKernelError::TracingError("test".into()),
            RingKernelError::SpanNotFound("test".into()),
            RingKernelError::MetricsExportFailed("test".into()),
            RingKernelError::TelemetryError("test".into()),
            RingKernelError::MetricsCollectionFailed("test".into()),
        ];
        for err in &cases {
            assert!(err.is_observability_error());
        }
    }
}