// oxigdal_cluster/error.rs

//! Error types for the oxigdal-cluster crate.
//!
//! This module defines all error types that can occur during distributed
//! cluster operations including task scheduling, worker management, data
//! replication, and fault tolerance.

7use std::io;
8
9/// Result type alias for cluster operations.
10pub type Result<T> = std::result::Result<T, ClusterError>;
11
/// Main error type for cluster operations.
///
/// Most variants wrap a free-form `String` that is interpolated into the
/// `Display` message after a fixed prefix. The exceptions are
/// [`ClusterError::QuorumNotReached`] (two named counters),
/// [`ClusterError::NoLeader`] (no payload), and [`ClusterError::Io`] /
/// [`ClusterError::Json`], which wrap the underlying error value and get a
/// `From` conversion through `#[from]`, so `?` converts them automatically.
#[derive(Debug, thiserror::Error)]
pub enum ClusterError {
    /// A task scheduling operation failed.
    #[error("Task scheduling error: {0}")]
    SchedulerError(String),

    /// The referenced task could not be found.
    #[error("Task not found: {0}")]
    TaskNotFound(String),

    /// A cycle was detected in the task dependency graph.
    #[error("Task dependency cycle detected: {0}")]
    DependencyCycle(String),

    /// A worker pool operation failed.
    #[error("Worker pool error: {0}")]
    WorkerPoolError(String),

    /// The referenced worker could not be found.
    #[error("Worker not found: {0}")]
    WorkerNotFound(String),

    /// The worker is unhealthy.
    #[error("Worker unhealthy: {0}")]
    WorkerUnhealthy(String),

    /// Worker capacity was exceeded.
    #[error("Worker capacity exceeded: {0}")]
    CapacityExceeded(String),

    /// A data locality operation failed.
    #[error("Data locality error: {0}")]
    DataLocalityError(String),

    /// The data is not available on any worker.
    #[error("Data not available: {0}")]
    DataNotAvailable(String),

    /// A fault tolerance operation failed.
    #[error("Fault tolerance error: {0}")]
    FaultToleranceError(String),

    /// The maximum number of retries for a task was exceeded.
    #[error("Maximum retries exceeded for task: {0}")]
    MaxRetriesExceeded(String),

    /// A checkpoint operation failed.
    #[error("Checkpoint error: {0}")]
    CheckpointError(String),

    /// A distributed cache operation failed.
    #[error("Cache error: {0}")]
    CacheError(String),

    /// Cache coherency was violated.
    #[error("Cache coherency violation: {0}")]
    CoherencyViolation(String),

    /// A replication operation failed.
    #[error("Replication error: {0}")]
    ReplicationError(String),

    /// Fewer responses were received than the quorum requires.
    #[error("Quorum not reached: required {required}, got {actual}")]
    QuorumNotReached {
        /// The required quorum size
        required: usize,
        /// The actual number of responses received
        actual: usize,
    },

    /// Replica placement failed.
    #[error("Replica placement error: {0}")]
    ReplicaPlacementError(String),

    /// A coordinator operation failed.
    #[error("Coordinator error: {0}")]
    CoordinatorError(String),

    /// Leader election failed.
    #[error("Leader election failed: {0}")]
    LeaderElectionFailed(String),

    /// No leader is currently available.
    #[error("No leader available")]
    NoLeader,

    /// A consensus operation failed.
    #[error("Consensus error: {0}")]
    ConsensusError(String),

    /// Metrics collection failed.
    #[error("Metrics error: {0}")]
    MetricsError(String),

    /// Serialization or deserialization failed.
    #[error("Serialization error: {0}")]
    SerializationError(String),

    /// Network communication failed.
    #[error("Network error: {0}")]
    NetworkError(String),

    /// The operation timed out.
    #[error("Operation timed out: {0}")]
    Timeout(String),

    /// A configuration error occurred.
    // NOTE(review): overlaps in meaning with `InvalidConfiguration` below;
    // confirm whether both variants are intentionally distinct.
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// A resource was exhausted.
    #[error("Resource exhausted: {0}")]
    ResourceExhausted(String),

    /// The operation encountered an invalid state.
    #[error("Invalid state: {0}")]
    InvalidState(String),

    /// Task execution failed.
    #[error("Task execution error: {0}")]
    ExecutionError(String),

    /// The task was cancelled.
    #[error("Task cancelled: {0}")]
    TaskCancelled(String),

    /// An I/O error, convertible from [`io::Error`] via `From`.
    #[error("IO error: {0}")]
    Io(#[from] io::Error),

    /// A JSON error, convertible from [`serde_json::Error`] via `From`.
    #[error("JSON error: {0}")]
    Json(#[from] serde_json::Error),

    /// A Raft operation failed.
    #[error("Raft error: {0}")]
    RaftError(String),

    /// A quota was exceeded.
    #[error("Quota exceeded: {0}")]
    QuotaExceeded(String),

    /// The referenced reservation could not be found.
    #[error("Reservation not found: {0}")]
    ReservationNotFound(String),

    /// The requested resource is not available.
    #[error("Resource not available: {0}")]
    ResourceNotAvailable(String),

    /// The requested operation is invalid.
    #[error("Invalid operation: {0}")]
    InvalidOperation(String),

    /// The supplied configuration is invalid.
    #[error("Invalid configuration: {0}")]
    InvalidConfiguration(String),

    /// The referenced workflow could not be found.
    #[error("Workflow not found: {0}")]
    WorkflowNotFound(String),

    /// The referenced metric could not be found.
    #[error("Metric not found: {0}")]
    MetricNotFound(String),

    /// The referenced alert could not be found.
    #[error("Alert not found: {0}")]
    AlertNotFound(String),

    /// Authentication failed.
    #[error("Authentication failed: {0}")]
    AuthenticationFailed(String),

    /// Permission was denied.
    #[error("Permission denied: {0}")]
    PermissionDenied(String),

    /// The referenced secret could not be found.
    #[error("Secret not found: {0}")]
    SecretNotFound(String),

    /// A compression operation failed.
    #[error("Compression error: {0}")]
    CompressionError(String),

    /// Any error that does not fit another variant.
    #[error("Other error: {0}")]
    Other(String),
}

205impl ClusterError {
206    /// Check if the error is retryable.
207    pub fn is_retryable(&self) -> bool {
208        matches!(
209            self,
210            ClusterError::NetworkError(_)
211                | ClusterError::Timeout(_)
212                | ClusterError::WorkerUnhealthy(_)
213                | ClusterError::QuorumNotReached { .. }
214                | ClusterError::NoLeader
215                | ClusterError::ResourceExhausted(_)
216        )
217    }
218
219    /// Check if the error indicates a permanent failure.
220    pub fn is_permanent(&self) -> bool {
221        matches!(
222            self,
223            ClusterError::DependencyCycle(_)
224                | ClusterError::TaskNotFound(_)
225                | ClusterError::ConfigError(_)
226                | ClusterError::InvalidState(_)
227                | ClusterError::MaxRetriesExceeded(_)
228        )
229    }
230
231    /// Check if the error requires failover.
232    pub fn requires_failover(&self) -> bool {
233        matches!(
234            self,
235            ClusterError::WorkerNotFound(_)
236                | ClusterError::WorkerUnhealthy(_)
237                | ClusterError::NoLeader
238                | ClusterError::LeaderElectionFailed(_)
239        )
240    }
241}
242
// Unit tests for the classification helpers and message formatting of
// `ClusterError`.
#[cfg(test)]
// Relaxes the unwrap/expect lints inside tests (presumably denied elsewhere
// in the crate; confirm against the crate-level lint configuration).
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    /// A network failure is retryable; a dependency cycle is permanent.
    #[test]
    fn test_error_retryable() {
        let err = ClusterError::NetworkError("connection failed".to_string());
        assert!(err.is_retryable());
        assert!(!err.is_permanent());

        let err = ClusterError::DependencyCycle("cycle detected".to_string());
        assert!(!err.is_retryable());
        assert!(err.is_permanent());
    }

    /// Missing workers and a missing leader both call for failover.
    #[test]
    fn test_error_requires_failover() {
        let err = ClusterError::WorkerNotFound("worker123".to_string());
        assert!(err.requires_failover());

        let err = ClusterError::NoLeader;
        assert!(err.requires_failover());
    }

    /// A missed quorum is retryable and not permanent.
    #[test]
    fn test_quorum_error() {
        let err = ClusterError::QuorumNotReached {
            required: 3,
            actual: 2,
        };
        assert!(err.is_retryable());
        assert!(!err.is_permanent());
    }

    /// The quorum message interpolates both named fields.
    #[test]
    fn test_quorum_error_display() {
        let err = ClusterError::QuorumNotReached {
            required: 3,
            actual: 2,
        };
        assert_eq!(err.to_string(), "Quorum not reached: required 3, got 2");
    }

    /// Variants outside every classification list satisfy none of the predicates.
    #[test]
    fn test_unclassified_error() {
        let err = ClusterError::Other("unexpected".to_string());
        assert!(!err.is_retryable());
        assert!(!err.is_permanent());
        assert!(!err.requires_failover());
    }

    /// An unhealthy worker is both retryable and a failover trigger.
    #[test]
    fn test_worker_unhealthy_overlap() {
        let err = ClusterError::WorkerUnhealthy("worker7".to_string());
        assert!(err.is_retryable());
        assert!(err.requires_failover());
        assert!(!err.is_permanent());
    }
}