use thiserror::Error;
/// Convenience alias for `std::result::Result` with [`RingKernelError`] fixed as the error type.
pub type Result<T> = std::result::Result<T, RingKernelError>;
/// Error enum used as the error half of the [`Result`] alias.
///
/// The `Display` text of every variant is taken from its `#[error(...)]`
/// attribute (generated by the `thiserror` derive). Payload strings are
/// inserted into the message verbatim unless noted otherwise.
#[derive(Error, Debug)]
pub enum RingKernelError {
// --- Kernel state ---
/// A kernel could not be found; the payload string is shown in the message.
#[error("kernel not found: {0}")]
KernelNotFound(String),
/// The kernel named by the payload is already active.
#[error("kernel already active: {0}")]
KernelAlreadyActive(String),
/// The kernel named by the payload is not active.
#[error("kernel not active: {0}")]
KernelNotActive(String),
/// The kernel named by the payload has already terminated.
#[error("kernel already terminated: {0}")]
KernelTerminated(String),
/// A state transition was rejected.
///
/// `from` and `to` are rendered with `{:?}`, so both strings appear quoted
/// in the message.
#[error("invalid state transition from {from:?} to {to:?}")]
InvalidStateTransition {
from: String,
to: String,
},
/// The observed state (`actual`) differs from the required one (`expected`).
#[error("invalid state: expected {expected}, got {actual}")]
InvalidState {
expected: String,
actual: String,
},
/// Launching a kernel failed; the payload carries the detail text.
#[error("kernel launch failed: {0}")]
LaunchFailed(String),
/// Compiling a kernel failed; the payload carries the detail text.
#[error("kernel compilation failed: {0}")]
CompilationError(String),
// --- Queues and messages ---
/// An enqueue was attempted on a full queue of the given `capacity`.
#[error("queue full: capacity {capacity}, attempted to enqueue message")]
QueueFull {
capacity: usize,
},
/// The queue held no message.
#[error("queue empty")]
QueueEmpty,
/// Serialization failed; the payload carries the detail text.
#[error("serialization error: {0}")]
SerializationError(String),
/// Deserialization failed; the payload carries the detail text.
#[error("deserialization error: {0}")]
DeserializationError(String),
/// A message failed validation; the payload carries the detail text.
#[error("message validation failed: {0}")]
ValidationError(String),
/// A message of `size` bytes exceeds the `max` byte limit.
#[error("message too large: {size} bytes (max: {max} bytes)")]
MessageTooLarge {
size: usize,
max: usize,
},
/// A message operation timed out; the duration is rendered with `{:?}`.
#[error("message timeout after {0:?}")]
Timeout(std::time::Duration),
// --- Memory ---
/// A GPU memory allocation of `size` bytes failed for the stated `reason`.
#[error("GPU memory allocation failed: {size} bytes - {reason}")]
AllocationFailed {
size: usize,
reason: String,
},
/// A host memory allocation of `size` bytes failed.
#[error("host memory allocation failed: {size} bytes")]
HostAllocationFailed {
size: usize,
},
/// A memory transfer failed; the payload carries the detail text.
#[error("memory transfer failed: {0}")]
TransferFailed(String),
/// An alignment of `actual` was found where `expected` was required.
#[error("invalid alignment: expected {expected}, got {actual}")]
InvalidAlignment {
expected: usize,
actual: usize,
},
/// `requested` bytes of GPU memory were asked for but only `available` remain.
#[error("out of GPU memory: requested {requested} bytes, available {available} bytes")]
OutOfMemory {
requested: usize,
available: usize,
},
/// The memory pool has nothing left to hand out.
#[error("memory pool exhausted")]
PoolExhausted,
/// The given index is invalid.
#[error("invalid index: {0}")]
InvalidIndex(usize),
/// Other memory-related failure described by the payload.
#[error("memory error: {0}")]
MemoryError(String),
// --- Backend and devices ---
/// The backend named by the payload is not available.
#[error("backend not available: {0}")]
BackendUnavailable(String),
/// Backend initialization failed; the payload carries the detail text.
#[error("backend initialization failed: {0}")]
BackendInitFailed(String),
/// No GPU device was found.
#[error("no GPU device found")]
NoDeviceFound,
/// Selecting a device failed; the payload carries the detail text.
#[error("device selection failed: {0}")]
DeviceSelectionFailed(String),
/// Other backend failure described by the payload.
#[error("backend error: {0}")]
BackendError(String),
// --- Synchronization ---
/// A deadlock was detected.
#[error("deadlock detected")]
DeadlockDetected,
/// A lock was found poisoned.
#[error("lock poisoned")]
LockPoisoned,
/// A channel was closed.
#[error("channel closed")]
ChannelClosed,
// --- Time ---
/// Clock skew of `skew_ms` milliseconds exceeds the `max_ms` limit.
#[error("clock skew too large: {skew_ms}ms (max: {max_ms}ms)")]
ClockSkew {
skew_ms: u64,
max_ms: u64,
},
/// A timestamp was invalid.
#[error("invalid timestamp")]
InvalidTimestamp,
// --- K2K (presumably kernel-to-kernel messaging — confirm) ---
/// General K2K failure described by the payload.
#[error("K2K error: {0}")]
K2KError(String),
/// The K2K destination named by the payload was not found.
#[error("K2K destination not found: {0}")]
K2KDestinationNotFound(String),
/// K2K delivery failed; the payload carries the detail text.
#[error("K2K delivery failed: {0}")]
K2KDeliveryFailed(String),
// --- Pub/sub ---
/// General pub/sub failure described by the payload.
#[error("pub/sub error: {0}")]
PubSubError(String),
/// The topic named by the payload was not found.
#[error("topic not found: {0}")]
TopicNotFound(String),
/// Subscription failure described by the payload.
#[error("subscription error: {0}")]
SubscriptionError(String),
// --- Multi-GPU ---
/// General multi-GPU failure described by the payload.
#[error("multi-GPU error: {0}")]
MultiGpuError(String),
/// The device named by the payload is not available.
#[error("device not available: {0}")]
DeviceNotAvailable(String),
/// A cross-device transfer failed; the payload carries the detail text.
#[error("cross-device transfer failed: {0}")]
CrossDeviceTransferFailed(String),
// --- Telemetry ---
/// Telemetry failure described by the payload.
#[error("telemetry error: {0}")]
TelemetryError(String),
/// Collecting metrics failed; the payload carries the detail text.
#[error("metrics collection failed: {0}")]
MetricsCollectionFailed(String),
// --- Configuration ---
/// The configuration is invalid; the payload carries the detail text.
#[error("invalid configuration: {0}")]
InvalidConfig(String),
/// A configuration item is missing; the payload names or describes it.
#[error("missing configuration: {0}")]
MissingConfig(String),
// --- I/O ---
/// Wraps a `std::io::Error`.
///
/// `#[from]` makes `thiserror` generate `From<std::io::Error>`, so `?` can
/// convert I/O errors into this variant.
#[error("I/O error: {0}")]
StdIoError(#[from] std::io::Error),
/// I/O failure described by a string; renders with the same `I/O error:`
/// prefix as `StdIoError`.
#[error("I/O error: {0}")]
IoError(String),
// --- Checkpoints ---
/// A checkpoint is invalid; the payload carries the detail text.
#[error("invalid checkpoint: {0}")]
InvalidCheckpoint(String),
/// Saving a checkpoint failed; the payload carries the detail text.
#[error("checkpoint save failed: {0}")]
CheckpointSaveFailed(String),
/// Restoring a checkpoint failed; the payload carries the detail text.
#[error("checkpoint restore failed: {0}")]
CheckpointRestoreFailed(String),
/// The checkpoint named by the payload was not found.
#[error("checkpoint not found: {0}")]
CheckpointNotFound(String),
// --- Health and resilience ---
/// The health check `name` failed for the stated `reason`.
#[error("health check failed: {name} - {reason}")]
HealthCheckFailed {
name: String,
reason: String,
},
/// The circuit breaker `name` is open.
#[error("circuit breaker open: {name}")]
CircuitBreakerOpen {
name: String,
},
/// Retrying gave up after `attempts` tries; `reason` carries the detail text.
#[error("retry exhausted after {attempts} attempts: {reason}")]
RetryExhausted {
attempts: u32,
reason: String,
},
/// The watchdog timed out for the kernel `kernel_id`.
#[error("kernel watchdog timeout: {kernel_id}")]
WatchdogTimeout {
kernel_id: String,
},
/// A request was rejected by load shedding at the given `level`.
#[error("load shedding: request rejected at level {level}")]
LoadSheddingRejected {
level: String,
},
// --- Migration ---
/// Kernel migration failed; the payload carries the detail text.
#[error("kernel migration failed: {0}")]
MigrationFailed(String),
/// The migration source kernel `kernel_id` is not ready.
#[error("migration source not ready: {kernel_id}")]
MigrationSourceNotReady {
kernel_id: String,
},
/// The migration destination device `device_id` is unavailable.
#[error("migration destination unavailable: device {device_id}")]
MigrationDestinationUnavailable {
device_id: usize,
},
// --- Tracing and metrics export ---
/// Tracing failure described by the payload.
#[error("tracing error: {0}")]
TracingError(String),
/// The span named by the payload was not found.
#[error("span not found: {0}")]
SpanNotFound(String),
/// Exporting metrics failed; the payload carries the detail text.
#[error("metrics export failed: {0}")]
MetricsExportFailed(String),
// --- General ---
/// Internal failure described by the payload.
#[error("internal error: {0}")]
Internal(String),
/// The feature named by the payload is not supported.
#[error("feature not supported: {0}")]
NotSupported(String),
/// The operation was cancelled.
#[error("operation cancelled")]
Cancelled,
}
impl RingKernelError {
pub fn is_recoverable(&self) -> bool {
matches!(
self,
RingKernelError::QueueFull { .. }
| RingKernelError::QueueEmpty
| RingKernelError::Timeout(_)
| RingKernelError::PoolExhausted
| RingKernelError::CircuitBreakerOpen { .. }
| RingKernelError::LoadSheddingRejected { .. }
)
}
pub fn is_resource_error(&self) -> bool {
matches!(
self,
RingKernelError::AllocationFailed { .. }
| RingKernelError::HostAllocationFailed { .. }
| RingKernelError::OutOfMemory { .. }
| RingKernelError::PoolExhausted
| RingKernelError::MigrationDestinationUnavailable { .. }
)
}
pub fn is_fatal(&self) -> bool {
matches!(
self,
RingKernelError::BackendInitFailed(_)
| RingKernelError::NoDeviceFound
| RingKernelError::LockPoisoned
| RingKernelError::Internal(_)
)
}
pub fn is_health_error(&self) -> bool {
matches!(
self,
RingKernelError::HealthCheckFailed { .. }
| RingKernelError::CircuitBreakerOpen { .. }
| RingKernelError::RetryExhausted { .. }
| RingKernelError::WatchdogTimeout { .. }
| RingKernelError::LoadSheddingRejected { .. }
)
}
pub fn is_migration_error(&self) -> bool {
matches!(
self,
RingKernelError::MigrationFailed(_)
| RingKernelError::MigrationSourceNotReady { .. }
| RingKernelError::MigrationDestinationUnavailable { .. }
)
}
pub fn is_observability_error(&self) -> bool {
matches!(
self,
RingKernelError::TracingError(_)
| RingKernelError::SpanNotFound(_)
| RingKernelError::MetricsExportFailed(_)
| RingKernelError::TelemetryError(_)
| RingKernelError::MetricsCollectionFailed(_)
)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Display text for a tuple variant and a struct variant.
    #[test]
    fn test_error_display() {
        let not_found = RingKernelError::KernelNotFound(String::from("test_kernel"));
        assert_eq!(not_found.to_string(), "kernel not found: test_kernel");

        let queue_full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(queue_full.to_string().contains("1024"));
    }

    /// One representative variant per basic classification predicate.
    #[test]
    fn test_error_classification() {
        let queue_full = RingKernelError::QueueFull { capacity: 1024 };
        assert!(queue_full.is_recoverable());

        let oom = RingKernelError::OutOfMemory {
            requested: 1000,
            available: 100,
        };
        assert!(oom.is_resource_error());

        assert!(RingKernelError::LockPoisoned.is_fatal());
    }

    /// Display text for the health-related variants.
    #[test]
    fn test_health_error_display() {
        let health = RingKernelError::HealthCheckFailed {
            name: "liveness".into(),
            reason: "timeout".into(),
        };
        assert_eq!(
            health.to_string(),
            "health check failed: liveness - timeout"
        );

        let breaker = RingKernelError::CircuitBreakerOpen {
            name: "gpu_ops".into(),
        };
        assert_eq!(breaker.to_string(), "circuit breaker open: gpu_ops");

        let retry = RingKernelError::RetryExhausted {
            attempts: 5,
            reason: "connection refused".into(),
        };
        assert!(retry.to_string().contains("5 attempts"));

        let watchdog = RingKernelError::WatchdogTimeout {
            kernel_id: "kernel_42".into(),
        };
        assert!(watchdog.to_string().contains("kernel_42"));
    }

    /// Health-related variants land in the expected predicate sets.
    #[test]
    fn test_health_error_classification() {
        let breaker = RingKernelError::CircuitBreakerOpen {
            name: "test".into(),
        };
        assert!(breaker.is_recoverable());

        let shed = RingKernelError::LoadSheddingRejected {
            level: "critical".into(),
        };
        assert!(shed.is_recoverable());

        let health = RingKernelError::HealthCheckFailed {
            name: "test".into(),
            reason: "failed".into(),
        };
        assert!(health.is_health_error());

        let watchdog = RingKernelError::WatchdogTimeout {
            kernel_id: "k1".into(),
        };
        assert!(watchdog.is_health_error());
    }

    /// Display text for the migration variants.
    #[test]
    fn test_migration_error_display() {
        let failed = RingKernelError::MigrationFailed(String::from("checkpoint transfer error"));
        assert!(failed.to_string().contains("checkpoint transfer error"));

        let source = RingKernelError::MigrationSourceNotReady {
            kernel_id: "kernel_1".into(),
        };
        assert!(source.to_string().contains("kernel_1"));

        let destination = RingKernelError::MigrationDestinationUnavailable { device_id: 2 };
        assert!(destination.to_string().contains("device 2"));
    }

    /// Migration variants are migration errors; the destination variant is
    /// also a resource error.
    #[test]
    fn test_migration_error_classification() {
        let failed = RingKernelError::MigrationFailed(String::from("test"));
        assert!(failed.is_migration_error());

        let source = RingKernelError::MigrationSourceNotReady {
            kernel_id: "k1".into(),
        };
        assert!(source.is_migration_error());

        let destination = RingKernelError::MigrationDestinationUnavailable { device_id: 0 };
        assert!(destination.is_migration_error());
        assert!(destination.is_resource_error());
    }

    /// Display text for the tracing / metrics-export variants.
    #[test]
    fn test_observability_error_display() {
        let tracing = RingKernelError::TracingError(String::from("span creation failed"));
        assert!(tracing.to_string().contains("span creation failed"));

        let span = RingKernelError::SpanNotFound(String::from("span_abc123"));
        assert!(span.to_string().contains("span_abc123"));

        let export = RingKernelError::MetricsExportFailed(String::from("prometheus timeout"));
        assert!(export.to_string().contains("prometheus timeout"));
    }

    /// Every variant in the observability set is recognised by the predicate.
    #[test]
    fn test_observability_error_classification() {
        let payload = || String::from("test");

        assert!(RingKernelError::TracingError(payload()).is_observability_error());
        assert!(RingKernelError::SpanNotFound(payload()).is_observability_error());
        assert!(RingKernelError::MetricsExportFailed(payload()).is_observability_error());
        assert!(RingKernelError::TelemetryError(payload()).is_observability_error());
        assert!(RingKernelError::MetricsCollectionFailed(payload()).is_observability_error());
    }
}