use std::path::PathBuf;
use thiserror::Error;
/// Top-level error type that aggregates the subsystem error enums.
///
/// Every variant carrying `#[from]` gets a `From` conversion generated by
/// `thiserror`, so the wrapped error can be propagated with `?`. The
/// `Display` output is a fixed prefix followed by the wrapped value's own
/// `Display` output.
#[derive(Debug, Error)]
pub enum HalldyllError {
/// Wraps a [`ConfigError`].
#[error("Configuration error: {0}")]
Config(#[from] ConfigError),
/// Wraps a [`StateError`].
#[error("State error: {0}")]
State(#[from] StateError),
/// Wraps a [`RunPodError`].
#[error("RunPod API error: {0}")]
RunPod(#[from] RunPodError),
/// Wraps a [`PlanError`].
#[error("Planning error: {0}")]
Plan(#[from] PlanError),
/// Wraps a [`ReconcileError`].
#[error("Reconciliation error: {0}")]
Reconcile(#[from] ReconcileError),
/// Wraps a [`std::io::Error`].
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// Free-form error that carries only a message; it has no `#[from]`
/// conversion and must be constructed explicitly.
#[error("Internal error: {0}")]
Internal(String),
}
/// Errors describing a problem with the configuration.
#[derive(Debug, Error)]
pub enum ConfigError {
/// The configuration file was not found.
#[error("Configuration file not found: {path}")]
FileNotFound {
/// Path rendered in the error message.
path: PathBuf,
},
/// The configuration could not be parsed.
#[error("Failed to parse configuration: {message}")]
ParseError {
/// Description of the parse failure; shown in the `Display` message.
message: String,
/// Optional location detail. It is NOT interpolated into the `Display`
/// message, so it is only visible through `Debug` or direct access.
location: Option<String>,
},
/// The configuration failed validation.
#[error("Configuration validation failed: {message}")]
ValidationError {
/// Description of the validation failure; shown in the `Display` message.
message: String,
/// Optional field name. It is NOT interpolated into the `Display`
/// message, so it is only visible through `Debug` or direct access.
field: Option<String>,
},
/// An environment variable is missing.
#[error("Missing environment variable: {name}")]
MissingEnvVar {
/// Name of the missing variable.
name: String,
},
/// Two resources of the same type share a name.
#[error("Duplicate {resource_type} name: {name}")]
DuplicateName {
/// Resource type label placed before the word "name" in the message.
resource_type: String,
/// The duplicated name.
name: String,
},
/// A GPU type value was rejected as invalid.
#[error("Invalid GPU type: {gpu_type}")]
InvalidGpuType {
/// The rejected GPU type string.
gpu_type: String,
},
/// A port specification was rejected as invalid.
#[error("Invalid port specification: {spec}")]
InvalidPort {
/// The rejected port specification string.
spec: String,
},
/// A circular dependency was detected.
#[error("Circular dependency detected: {cycle}")]
CircularDependency {
/// Pre-rendered description of the cycle, inserted verbatim into the message.
cycle: String,
},
}
/// Errors describing a problem with stored state or its lock.
#[derive(Debug, Error)]
pub enum StateError {
/// The state file was not found.
#[error("State file not found: {path}")]
NotFound {
/// Path rendered in the error message.
path: PathBuf,
},
/// The state is corrupted.
#[error("State is corrupted: {message}")]
Corrupted {
/// Description of the corruption.
message: String,
},
/// The state lock could not be acquired. `HalldyllError::is_retryable`
/// treats this variant as retryable.
#[error("Failed to acquire state lock: {message}")]
LockFailed {
/// Description of why the lock attempt failed.
message: String,
},
/// The state is locked by another process.
#[error("State is locked by another process (lock holder: {holder}, since: {since})")]
LockedByOther {
/// Identifier of the lock holder, inserted verbatim into the message.
holder: String,
/// When the lock was taken, as a pre-formatted string.
since: String,
},
/// The S3 state backend reported an error.
#[error("S3 state backend error: {message}")]
S3Error {
/// Description of the backend failure.
message: String,
},
/// State serialization failed.
#[error("State serialization error: {message}")]
SerializationError {
/// Description of the serialization failure.
message: String,
},
/// The state version differs from the expected one.
#[error("State version mismatch: expected {expected}, found {found}")]
VersionMismatch {
/// The version that was expected.
expected: String,
/// The version that was actually found.
found: String,
},
}
/// Errors describing a failed interaction with the RunPod API.
#[derive(Debug, Error)]
pub enum RunPodError {
/// Authentication with RunPod failed.
#[error("RunPod authentication failed: {message}")]
AuthenticationFailed {
/// Description of the authentication failure.
message: String,
},
/// An API request failed with a status code.
#[error("RunPod API request failed: {status} - {message}")]
ApiRequestFailed {
/// Status code of the failed request (presumably the HTTP status — confirm against callers).
status: u16,
/// Description of the failure.
message: String,
},
/// The API rate-limited the caller. `HalldyllError::is_retryable` treats
/// this variant as retryable.
#[error("RunPod API rate limited, retry after {retry_after_secs} seconds")]
RateLimited {
/// Seconds to wait before retrying; returned unchanged by
/// `HalldyllError::retry_delay_secs`.
retry_after_secs: u64,
},
/// The referenced pod was not found.
#[error("Pod not found: {pod_id}")]
PodNotFound {
/// Identifier of the missing pod.
pod_id: String,
},
/// The requested GPU type is not available in the given region.
#[error("GPU type not available: {gpu_type} in region {region}")]
GpuNotAvailable {
/// The requested GPU type.
gpu_type: String,
/// The region in which it was unavailable.
region: String,
},
/// Quota is insufficient.
#[error("Insufficient quota: {message}")]
InsufficientQuota {
/// Description of the quota shortfall.
message: String,
},
/// A network error occurred while communicating with RunPod.
/// `HalldyllError::is_retryable` treats this variant as retryable.
#[error("Network error communicating with RunPod: {message}")]
NetworkError {
/// Description of the network failure.
message: String,
},
/// The API returned a response that was considered invalid.
#[error("Invalid response from RunPod API: {message}")]
InvalidResponse {
/// Description of what was wrong with the response.
message: String,
},
/// Waiting for a pod to reach a state timed out.
#[error("Timeout waiting for pod {pod_id} to reach state {expected_state}")]
Timeout {
/// Identifier of the pod being waited on.
pod_id: String,
/// The state the pod was expected to reach, as a string.
expected_state: String,
},
}
/// Errors describing a problem with a plan.
#[derive(Debug, Error)]
pub enum PlanError {
/// The plan is empty, i.e. no changes are required.
#[error("Plan is empty: no changes required")]
EmptyPlan,
/// The plan's estimated cost exceeds the budget limit. Both values are
/// rendered with two decimal places as dollars per hour.
#[error("Plan would exceed budget: estimated ${estimated:.2}/hr, limit ${limit:.2}/hr")]
BudgetExceeded {
/// Estimated hourly cost of the plan.
estimated: f64,
/// Hourly budget limit.
limit: f64,
},
/// The plan needs more GPUs than are available.
#[error("Plan would exceed GPU quota: needs {needed}, available {available}")]
GpuQuotaExceeded {
/// Number of GPUs the plan needs.
needed: u32,
/// Number of GPUs available.
available: u32,
},
/// The plan contains conflicting operations.
#[error("Conflicting operations in plan: {message}")]
ConflictingOperations {
/// Description of the conflict.
message: String,
},
/// Dependencies could not be resolved.
#[error("Failed to resolve dependencies: {message}")]
DependencyResolutionFailed {
/// Description of the resolution failure.
message: String,
},
}
/// Errors describing a reconciliation failure.
#[derive(Debug, Error)]
pub enum ReconcileError {
/// Reconciling a single named resource failed.
#[error("Failed to reconcile {resource_type} '{name}': {reason}")]
ResourceReconcileFailed {
/// Resource type label, rendered before the quoted name.
resource_type: String,
/// Resource name, rendered inside single quotes.
name: String,
/// Description of why reconciliation failed.
reason: String,
},
/// The maximum number of retry attempts was exceeded for a resource.
#[error("Maximum retry attempts ({attempts}) exceeded for {resource}")]
MaxRetriesExceeded {
/// Attempt count shown in parentheses in the message.
attempts: u32,
/// Resource the retries were made for.
resource: String,
},
/// Drift was detected for a resource.
#[error("Drift detected for {resource}: {drift_description}")]
DriftDetected {
/// Resource on which drift was detected.
resource: String,
/// Description of the detected drift.
drift_description: String,
},
/// Reconciliation was aborted.
#[error("Reconciliation aborted: {reason}")]
Aborted {
/// Description of why reconciliation was aborted.
reason: String,
},
}
/// Convenience alias for [`std::result::Result`] with [`HalldyllError`] as the error type.
pub type Result<T> = std::result::Result<T, HalldyllError>;
impl HalldyllError {
    /// Builds an [`HalldyllError::Internal`] from anything convertible into a `String`.
    #[must_use]
    pub fn internal(message: impl Into<String>) -> Self {
        let text: String = message.into();
        Self::Internal(text)
    }

    /// Reports whether the failed operation may be attempted again.
    ///
    /// An error is retryable exactly when [`Self::retry_delay_secs`] yields a
    /// delay: RunPod rate limiting, RunPod network errors, and a failed state
    /// lock acquisition.
    #[must_use]
    pub const fn is_retryable(&self) -> bool {
        self.retry_delay_secs().is_some()
    }

    /// Returns the number of seconds to wait before retrying, or `None` when
    /// the error is not retryable.
    ///
    /// * `RunPodError::RateLimited` — the variant's own `retry_after_secs`.
    /// * `RunPodError::NetworkError` — a fixed 5 seconds.
    /// * `StateError::LockFailed` — a fixed 2 seconds.
    #[must_use]
    pub const fn retry_delay_secs(&self) -> Option<u64> {
        // Fixed back-off values for errors that carry no delay of their own.
        const NETWORK_RETRY_SECS: u64 = 5;
        const LOCK_RETRY_SECS: u64 = 2;

        match self {
            Self::RunPod(api_err) => match api_err {
                RunPodError::RateLimited { retry_after_secs } => Some(*retry_after_secs),
                RunPodError::NetworkError { .. } => Some(NETWORK_RETRY_SECS),
                _ => None,
            },
            Self::State(StateError::LockFailed { .. }) => Some(LOCK_RETRY_SECS),
            _ => None,
        }
    }
}
impl ConfigError {
    /// Builds a [`ConfigError::ValidationError`] tied to a specific field.
    ///
    /// `message` becomes the error message and `field` is stored as `Some(..)`.
    #[must_use]
    pub fn validation(message: impl Into<String>, field: impl Into<String>) -> Self {
        let message = message.into();
        let field = Some(field.into());
        Self::ValidationError { message, field }
    }

    /// Builds a [`ConfigError::ValidationError`] that is not tied to any
    /// field (`field` is `None`).
    #[must_use]
    pub fn validation_general(message: impl Into<String>) -> Self {
        let message = message.into();
        Self::ValidationError {
            message,
            field: None,
        }
    }
}
impl StateError {
    /// Builds a [`StateError::S3Error`] from anything convertible into a `String`.
    #[must_use]
    pub fn s3(message: impl Into<String>) -> Self {
        let message = message.into();
        Self::S3Error { message }
    }

    /// Builds a [`StateError::SerializationError`] from anything convertible
    /// into a `String`.
    #[must_use]
    pub fn serialization(message: impl Into<String>) -> Self {
        let message = message.into();
        Self::SerializationError { message }
    }
}
impl RunPodError {
    /// Builds a [`RunPodError::ApiRequestFailed`] from a status code and a message.
    #[must_use]
    pub fn api_error(status: u16, message: impl Into<String>) -> Self {
        let message = message.into();
        Self::ApiRequestFailed { status, message }
    }

    /// Builds a [`RunPodError::NetworkError`] from anything convertible into a `String`.
    #[must_use]
    pub fn network(message: impl Into<String>) -> Self {
        let message = message.into();
        Self::NetworkError { message }
    }
}