1use thiserror::Error;
4
/// Convenience alias for [`std::result::Result`] with the error type fixed to
/// [`RingKernelError`].
pub type Result<T> = std::result::Result<T, RingKernelError>;
7
8#[derive(Error, Debug)]
10pub enum RingKernelError {
11 #[error("kernel not found: {0}")]
14 KernelNotFound(String),
15
16 #[error("kernel already active: {0}")]
18 KernelAlreadyActive(String),
19
20 #[error("kernel not active: {0}")]
22 KernelNotActive(String),
23
24 #[error("kernel already terminated: {0}")]
26 KernelTerminated(String),
27
28 #[error("invalid state transition from {from:?} to {to:?}")]
30 InvalidStateTransition {
31 from: String,
33 to: String,
35 },
36
37 #[error("invalid state: expected {expected}, got {actual}")]
39 InvalidState {
40 expected: String,
42 actual: String,
44 },
45
46 #[error("kernel launch failed: {0}")]
48 LaunchFailed(String),
49
50 #[error("kernel compilation failed: {0}")]
52 CompilationError(String),
53
54 #[error("queue full: capacity {capacity}, attempted to enqueue message")]
57 QueueFull {
58 capacity: usize,
60 },
61
62 #[error("queue empty")]
64 QueueEmpty,
65
66 #[error("serialization error: {0}")]
68 SerializationError(String),
69
70 #[error("deserialization error: {0}")]
72 DeserializationError(String),
73
74 #[error("message validation failed: {0}")]
76 ValidationError(String),
77
78 #[error("message too large: {size} bytes (max: {max} bytes)")]
80 MessageTooLarge {
81 size: usize,
83 max: usize,
85 },
86
87 #[error("message timeout after {0:?}")]
89 Timeout(std::time::Duration),
90
91 #[error("GPU memory allocation failed: {size} bytes - {reason}")]
94 AllocationFailed {
95 size: usize,
97 reason: String,
99 },
100
101 #[error("host memory allocation failed: {size} bytes")]
103 HostAllocationFailed {
104 size: usize,
106 },
107
108 #[error("memory transfer failed: {0}")]
110 TransferFailed(String),
111
112 #[error("invalid alignment: expected {expected}, got {actual}")]
114 InvalidAlignment {
115 expected: usize,
117 actual: usize,
119 },
120
121 #[error("out of GPU memory: requested {requested} bytes, available {available} bytes")]
123 OutOfMemory {
124 requested: usize,
126 available: usize,
128 },
129
130 #[error("memory pool exhausted")]
132 PoolExhausted,
133
134 #[error("invalid index: {0}")]
136 InvalidIndex(usize),
137
138 #[error("memory error: {0}")]
140 MemoryError(String),
141
142 #[error("backend not available: {0}")]
145 BackendUnavailable(String),
146
147 #[error("backend initialization failed: {0}")]
149 BackendInitFailed(String),
150
151 #[error("no GPU device found")]
153 NoDeviceFound,
154
155 #[error("device selection failed: {0}")]
157 DeviceSelectionFailed(String),
158
159 #[error("backend error: {0}")]
161 BackendError(String),
162
163 #[error("deadlock detected")]
166 DeadlockDetected,
167
168 #[error("lock poisoned")]
170 LockPoisoned,
171
172 #[error("channel closed")]
174 ChannelClosed,
175
176 #[error("clock skew too large: {skew_ms}ms (max: {max_ms}ms)")]
179 ClockSkew {
180 skew_ms: u64,
182 max_ms: u64,
184 },
185
186 #[error("invalid timestamp")]
188 InvalidTimestamp,
189
190 #[error("K2K error: {0}")]
193 K2KError(String),
194
195 #[error("K2K destination not found: {0}")]
197 K2KDestinationNotFound(String),
198
199 #[error("K2K delivery failed: {0}")]
201 K2KDeliveryFailed(String),
202
203 #[error("cross-tenant K2K send rejected: from tenant {from} to tenant {to}")]
209 TenantMismatch {
210 from: u64,
212 to: u64,
214 },
215
216 #[error("pub/sub error: {0}")]
219 PubSubError(String),
220
221 #[error("topic not found: {0}")]
223 TopicNotFound(String),
224
225 #[error("subscription error: {0}")]
227 SubscriptionError(String),
228
229 #[error("multi-GPU error: {0}")]
232 MultiGpuError(String),
233
234 #[error("device not available: {0}")]
236 DeviceNotAvailable(String),
237
238 #[error("cross-device transfer failed: {0}")]
240 CrossDeviceTransferFailed(String),
241
242 #[error("telemetry error: {0}")]
245 TelemetryError(String),
246
247 #[error("metrics collection failed: {0}")]
249 MetricsCollectionFailed(String),
250
251 #[error("invalid configuration: {0}")]
254 InvalidConfig(String),
255
256 #[error("missing configuration: {0}")]
258 MissingConfig(String),
259
260 #[error("I/O error: {0}")]
263 StdIoError(#[from] std::io::Error),
264
265 #[error("I/O error: {0}")]
267 IoError(String),
268
269 #[error("invalid checkpoint: {0}")]
272 InvalidCheckpoint(String),
273
274 #[error("checkpoint save failed: {0}")]
276 CheckpointSaveFailed(String),
277
278 #[error("checkpoint restore failed: {0}")]
280 CheckpointRestoreFailed(String),
281
282 #[error("checkpoint not found: {0}")]
284 CheckpointNotFound(String),
285
286 #[error("health check failed: {name} - {reason}")]
289 HealthCheckFailed {
290 name: String,
292 reason: String,
294 },
295
296 #[error("circuit breaker open: {name}")]
298 CircuitBreakerOpen {
299 name: String,
301 },
302
303 #[error("retry exhausted after {attempts} attempts: {reason}")]
305 RetryExhausted {
306 attempts: u32,
308 reason: String,
310 },
311
312 #[error("kernel watchdog timeout: {kernel_id}")]
314 WatchdogTimeout {
315 kernel_id: String,
317 },
318
319 #[error("load shedding: request rejected at level {level}")]
321 LoadSheddingRejected {
322 level: String,
324 },
325
326 #[error("kernel migration failed: {0}")]
329 MigrationFailed(String),
330
331 #[error("migration source not ready: {kernel_id}")]
333 MigrationSourceNotReady {
334 kernel_id: String,
336 },
337
338 #[error("migration destination unavailable: device {device_id}")]
340 MigrationDestinationUnavailable {
341 device_id: usize,
343 },
344
345 #[error("tracing error: {0}")]
348 TracingError(String),
349
350 #[error("span not found: {0}")]
352 SpanNotFound(String),
353
354 #[error("metrics export failed: {0}")]
356 MetricsExportFailed(String),
357
358 #[error("internal error: {0}")]
361 Internal(String),
362
363 #[error("feature not supported: {0}")]
365 NotSupported(String),
366
367 #[error("operation cancelled")]
369 Cancelled,
370}
371
372impl RingKernelError {
373 pub fn is_recoverable(&self) -> bool {
375 matches!(
376 self,
377 RingKernelError::QueueFull { .. }
378 | RingKernelError::QueueEmpty
379 | RingKernelError::Timeout(_)
380 | RingKernelError::PoolExhausted
381 | RingKernelError::CircuitBreakerOpen { .. }
382 | RingKernelError::LoadSheddingRejected { .. }
383 )
384 }
385
386 pub fn is_resource_error(&self) -> bool {
388 matches!(
389 self,
390 RingKernelError::AllocationFailed { .. }
391 | RingKernelError::HostAllocationFailed { .. }
392 | RingKernelError::OutOfMemory { .. }
393 | RingKernelError::PoolExhausted
394 | RingKernelError::MigrationDestinationUnavailable { .. }
395 )
396 }
397
398 pub fn is_fatal(&self) -> bool {
400 matches!(
401 self,
402 RingKernelError::BackendInitFailed(_)
403 | RingKernelError::NoDeviceFound
404 | RingKernelError::LockPoisoned
405 | RingKernelError::Internal(_)
406 )
407 }
408
409 pub fn is_health_error(&self) -> bool {
411 matches!(
412 self,
413 RingKernelError::HealthCheckFailed { .. }
414 | RingKernelError::CircuitBreakerOpen { .. }
415 | RingKernelError::RetryExhausted { .. }
416 | RingKernelError::WatchdogTimeout { .. }
417 | RingKernelError::LoadSheddingRejected { .. }
418 )
419 }
420
421 pub fn is_migration_error(&self) -> bool {
423 matches!(
424 self,
425 RingKernelError::MigrationFailed(_)
426 | RingKernelError::MigrationSourceNotReady { .. }
427 | RingKernelError::MigrationDestinationUnavailable { .. }
428 )
429 }
430
431 pub fn is_observability_error(&self) -> bool {
433 matches!(
434 self,
435 RingKernelError::TracingError(_)
436 | RingKernelError::SpanNotFound(_)
437 | RingKernelError::MetricsExportFailed(_)
438 | RingKernelError::TelemetryError(_)
439 | RingKernelError::MetricsCollectionFailed(_)
440 )
441 }
442}
443
#[cfg(test)]
mod tests {
    use super::*;

    /// Display output for a tuple variant and a struct variant.
    #[test]
    fn test_error_display() {
        let not_found = RingKernelError::KernelNotFound(String::from("test_kernel"));
        assert_eq!(not_found.to_string(), "kernel not found: test_kernel");

        let full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(full.to_string().contains("1024"));
    }

    /// One representative variant per basic classification predicate.
    #[test]
    fn test_error_classification() {
        let full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(full.is_recoverable());

        let oom = RingKernelError::OutOfMemory {
            requested: 1000,
            available: 100,
        };
        assert!(oom.is_resource_error());

        let poisoned = RingKernelError::LockPoisoned;
        assert!(poisoned.is_fatal());
    }

    /// Display output for the health-related variants.
    #[test]
    fn test_health_error_display() {
        let health = RingKernelError::HealthCheckFailed {
            name: "liveness".into(),
            reason: "timeout".into(),
        };
        assert_eq!(
            health.to_string(),
            "health check failed: liveness - timeout"
        );

        let breaker = RingKernelError::CircuitBreakerOpen {
            name: "gpu_ops".into(),
        };
        assert_eq!(breaker.to_string(), "circuit breaker open: gpu_ops");

        let retry = RingKernelError::RetryExhausted {
            attempts: 5,
            reason: "connection refused".into(),
        };
        assert!(retry.to_string().contains("5 attempts"));

        let watchdog = RingKernelError::WatchdogTimeout {
            kernel_id: "kernel_42".into(),
        };
        assert!(watchdog.to_string().contains("kernel_42"));
    }

    /// Health-related variants and the predicates that must accept them.
    #[test]
    fn test_health_error_classification() {
        let breaker = RingKernelError::CircuitBreakerOpen {
            name: "test".into(),
        };
        assert!(breaker.is_recoverable());

        let shed = RingKernelError::LoadSheddingRejected {
            level: "critical".into(),
        };
        assert!(shed.is_recoverable());

        let health = RingKernelError::HealthCheckFailed {
            name: "test".into(),
            reason: "failed".into(),
        };
        assert!(health.is_health_error());

        let watchdog = RingKernelError::WatchdogTimeout {
            kernel_id: "k1".into(),
        };
        assert!(watchdog.is_health_error());
    }

    /// Display output for the migration variants.
    #[test]
    fn test_migration_error_display() {
        let failed = RingKernelError::MigrationFailed(String::from("checkpoint transfer error"));
        assert!(failed.to_string().contains("checkpoint transfer error"));

        let source = RingKernelError::MigrationSourceNotReady {
            kernel_id: "kernel_1".into(),
        };
        assert!(source.to_string().contains("kernel_1"));

        let destination = RingKernelError::MigrationDestinationUnavailable { device_id: 2 };
        assert!(destination.to_string().contains("device 2"));
    }

    /// Migration variants are migration errors; an unavailable destination is
    /// additionally a resource error.
    #[test]
    fn test_migration_error_classification() {
        let failed = RingKernelError::MigrationFailed(String::from("test"));
        assert!(failed.is_migration_error());

        let source = RingKernelError::MigrationSourceNotReady {
            kernel_id: "k1".into(),
        };
        assert!(source.is_migration_error());

        let destination = RingKernelError::MigrationDestinationUnavailable { device_id: 0 };
        assert!(destination.is_migration_error());

        let destination = RingKernelError::MigrationDestinationUnavailable { device_id: 0 };
        assert!(destination.is_resource_error());
    }

    /// Display output for the tracing, span and metrics-export variants.
    #[test]
    fn test_observability_error_display() {
        let tracing = RingKernelError::TracingError(String::from("span creation failed"));
        assert!(tracing.to_string().contains("span creation failed"));

        let span = RingKernelError::SpanNotFound(String::from("span_abc123"));
        assert!(span.to_string().contains("span_abc123"));

        let export = RingKernelError::MetricsExportFailed(String::from("prometheus timeout"));
        assert!(export.to_string().contains("prometheus timeout"));
    }

    /// Every variant covered by `is_observability_error` is accepted.
    #[test]
    fn test_observability_error_classification() {
        let payload = || String::from("test");

        assert!(RingKernelError::TracingError(payload()).is_observability_error());
        assert!(RingKernelError::SpanNotFound(payload()).is_observability_error());
        assert!(RingKernelError::MetricsExportFailed(payload()).is_observability_error());
        assert!(RingKernelError::TelemetryError(payload()).is_observability_error());
        assert!(RingKernelError::MetricsCollectionFailed(payload()).is_observability_error());
    }
}