d_engine/
errors.rs

//! Raft Consensus Protocol Error Hierarchy
//!
//! Defines comprehensive error types for a Raft-based distributed system,
//! categorized by protocol layer and operational concerns.
5
6use std::sync::Arc;
7use std::time::Duration;
8
9use config::ConfigError;
10use tokio::task::JoinError;
11
12#[doc(hidden)]
13pub type Result<T> = std::result::Result<T, Error>;
14
15#[derive(Debug, thiserror::Error)]
16pub enum Error {
17    /// Infrastructure-level failures (network, storage, serialization)
18    #[error(transparent)]
19    System(#[from] SystemError),
20
21    /// Cluster configuration validation failures
22    #[error(transparent)]
23    Config(#[from] ConfigError),
24
25    /// Raft consensus protocol violations and failures
26    #[error(transparent)]
27    Consensus(#[from] ConsensusError),
28
29    /// Unrecoverable failures requiring process termination
30    #[error("Fatal error: {0}")]
31    Fatal(String),
32}
33
34#[derive(Debug, thiserror::Error)]
35pub enum ConsensusError {
36    /// Illegal Raft node state transitions
37    #[error(transparent)]
38    StateTransition(#[from] StateTransitionError),
39
40    /// Leader election failures (Section 5.2 Raft paper)
41    #[error(transparent)]
42    Election(#[from] ElectionError),
43
44    /// Log replication failures (Section 5.3 Raft paper)
45    #[error(transparent)]
46    Replication(#[from] ReplicationError),
47
48    /// Cluster membership change failures (Section 6 Raft paper)
49    #[error(transparent)]
50    Membership(#[from] MembershipError),
51}
52
53#[derive(Debug, thiserror::Error)]
54#[doc(hidden)]
55pub enum StateTransitionError {
56    #[error("Not enough votes to transition to leader.")]
57    NotEnoughVotes,
58
59    #[error("Invalid state transition.")]
60    InvalidTransition,
61
62    #[error("Lock error.")]
63    LockError,
64}
65
66#[derive(Debug, thiserror::Error)]
67pub enum NetworkError {
68    /// Endpoint unavailable (HTTP 503 equivalent)
69    #[error("Service unavailable: {0}")]
70    ServiceUnavailable(String),
71
72    /// Peer communication timeout
73    #[error("Connection timeout to {node_id} after {duration:?}")]
74    Timeout { node_id: u64, duration: Duration },
75
76    /// Unreachable node with source context
77    #[error("Network unreachable: {source}")]
78    Unreachable {
79        source: Arc<dyn std::error::Error + Send + Sync>,
80    },
81
82    /// Persistent connection failures
83    #[error("Socket connect failed error")]
84    ConnectError,
85
86    /// Retry policy exhaustion
87    #[error("Retry timeout after {0:?}")]
88    RetryTimeoutError(Duration),
89
90    /// TLS negotiation failures
91    #[error("TLS handshake failed")]
92    TlsHandshakeFailure,
93
94    /// Missing peer list for RPC
95    #[error("Request list for {request_type} contains no peers")]
96    EmptyPeerList { request_type: &'static str },
97
98    /// Malformed node addresses
99    #[error("Invalid URI format: {0}")]
100    InvalidURI(String),
101
102    /// RPC transmission failures with context
103    #[error("Failed to send {request_type} request")]
104    RequestSendFailure {
105        request_type: &'static str,
106        #[source]
107        source: tonic::transport::Error,
108    },
109
110    /// Low-level TCP configuration errors
111    #[error("TCP keepalive configuration error")]
112    TcpKeepaliveError,
113
114    /// HTTP/2 protocol configuration errors
115    #[error("HTTP/2 keepalive configuration error")]
116    Http2KeepaliveError,
117
118    /// gRPC transport layer errors
119    #[error(transparent)]
120    TonicError(#[from] tonic::transport::Error),
121
122    /// gRPC status code errors
123    #[error(transparent)]
124    TonicStatusError(#[from] tonic::Status),
125
126    #[error("Failed to send read request: {0}")]
127    ReadSend(#[from] ReadSendError),
128
129    #[error("Failed to send write request: {0}")]
130    WriteSend(#[from] WriteSendError),
131
132    #[error("Background task failed: {0}")]
133    TaskFailed(#[from] JoinError),
134
135    #[error("{0}")]
136    TaskBackoffFailed(String),
137
138    #[error("{0}")]
139    SingalSendFailed(String),
140
141    #[error("{0}")]
142    SingalReceiveFailed(String),
143}
144
145#[derive(Debug, thiserror::Error)]
146pub enum StorageError {
147    /// Disk I/O failures during log/snapshot operations
148    #[error(transparent)]
149    IoError(#[from] std::io::Error),
150
151    /// Serialization failures for persisted data
152    #[error(transparent)]
153    BincodeError(#[from] bincode::Error),
154
155    /// State machine application errors
156    #[error("State Machine error: {0}")]
157    StateMachineError(String),
158
159    /// Log storage subsystem failures
160    #[error("Log storage failure: {0}")]
161    LogStorage(String),
162
163    /// Snapshot creation/restoration failures
164    #[error("Snapshot operation failed: {0}")]
165    Snapshot(String),
166
167    /// Checksum validation failures
168    #[error("Data corruption detected at {location}")]
169    DataCorruption { location: String },
170
171    /// Configuration storage failures
172    #[error("Configuration storage error: {0}")]
173    ConfigStorage(String),
174
175    /// Embedded database errors
176    #[error(transparent)]
177    SledError(#[from] sled::Error),
178}
179
180#[derive(Debug, thiserror::Error)]
181pub enum ReadSendError {
182    #[error("Network timeout")]
183    Timeout(#[from] tokio::time::error::Elapsed),
184
185    #[error("Connection failed")]
186    Connection(#[from] tonic::transport::Error),
187}
188
189#[derive(Debug, thiserror::Error)]
190pub enum WriteSendError {
191    #[error("Not cluster leader")]
192    NotLeader,
193
194    #[error("Network unreachable")]
195    Unreachable,
196
197    #[error("Payload too large")]
198    PayloadExceeded,
199}
200
201#[derive(Debug, thiserror::Error)]
202pub enum SystemError {
203    // Network layer
204    #[error("Network error: {0}")]
205    Network(#[from] NetworkError),
206
207    // Storage layer
208    #[error("Storage operation failed")]
209    Storage(#[from] StorageError),
210
211    //Serialization
212    #[error("Serialization error")]
213    Serialization(#[from] SerializationError),
214
215    // Basic node operations
216    #[error("Node failed to start: {0}")]
217    NodeStartFailed(String),
218
219    #[error("General server error: {0}")]
220    GeneralServer(String),
221
222    #[error("Internal server error")]
223    ServerUnavailable,
224}
225
226// Serialization is classified separately (across protocol layers and system layers)
227#[derive(Debug, thiserror::Error)]
228pub enum SerializationError {
229    #[error("Bincode serialization failed: {0}")]
230    Bincode(#[from] bincode::Error),
231}
232
233#[derive(Debug, thiserror::Error)]
234pub enum ElectionError {
235    /// General election process failure
236    #[error("Election failed: {0}")]
237    Failed(String),
238
239    /// Stale term detection (Section 5.1 Raft paper)
240    #[error("Found higher term(={0}) during election process")]
241    HigherTerm(u64),
242
243    /// Term number inconsistency
244    #[error("Term conflict (current: {current}, received: {received})")]
245    TermConflict { current: u64, received: u64 },
246
247    /// Log inconsistency during vote requests (Section 5.4.1 Raft paper)
248    #[error("Log conflict at index {index} (expected: {expected_term}, actual: {actual_term})")]
249    LogConflict {
250        index: u64,
251        expected_term: u64,
252        actual_term: u64,
253    },
254
255    /// Quorum not achieved (Section 5.2 Raft paper)
256    #[error("Quorum not reached (required: {required}, succeed: {succeed})")]
257    QuorumFailure { required: usize, succeed: usize },
258
259    /// Leadership handoff failures
260    #[error("Leadership consensus error: {0}")]
261    LeadershipConsensus(String),
262
263    /// Isolated node scenario
264    #[error("No voting member found for candidate {candidate_id}")]
265    NoVotingMemberFound { candidate_id: u32 },
266}
267
268#[derive(Debug, thiserror::Error)]
269pub enum ReplicationError {
270    /// Stale leader detected during AppendEntries RPC
271    #[error("Found higher term(={0}) during replication process")]
272    HigherTerm(u64),
273
274    /// Failed to achieve majority acknowledgment
275    #[error("Quorum not reached for log replication")]
276    QuorumNotReached,
277
278    /// Target follower node unreachable
279    #[error("Node {node_id} unreachable for replication")]
280    NodeUnreachable { node_id: u32 },
281
282    /// Network timeout during replication RPC
283    #[error("RPC timeout after {duration}ms")]
284    RpcTimeout { duration: u64 },
285
286    /// Missing peer configuration in leader state
287    #[error("No peer mapping for leader {leader_id}")]
288    NoPeerFound { leader_id: u32 },
289
290    /// Log inconsistency detected during replication (ยง5.3)
291    #[error("Log conflict at index {index} (expected term {expected_term}, actual {actual_term})")]
292    LogConflict {
293        index: u64,
294        expected_term: u64,
295        actual_term: u64,
296    },
297
298    /// Node not in leader state for replication requests
299    #[error("Replication requires leader role (known leader: {leader_id:?})")]
300    NotLeader { leader_id: Option<u32> },
301}
302
303#[derive(Debug, thiserror::Error)]
304pub enum MembershipError {
305    /// Failed to reach consensus on configuration change
306    #[error("Membership update consensus failure: {0}")]
307    UpdateFailed(String),
308
309    /// Non-leader node attempted membership change
310    #[error("Membership changes require leader role")]
311    NotLeader,
312
313    /// Cluster not in operational state
314    #[error("Cluster bootstrap not completed")]
315    ClusterIsNotReady,
316
317    /// Connection establishment failure during join
318    #[error("Cluster connection setup failed: {0}")]
319    SetupClusterConnectionFailed(String),
320
321    /// Missing node metadata in configuration
322    #[error("Metadata missing for node {node_id} in cluster config")]
323    NoMetadataFoundForNode { node_id: u32 },
324}
325
326// ============== Conversion Implementations ============== //
327impl From<NetworkError> for Error {
328    fn from(e: NetworkError) -> Self {
329        Error::System(SystemError::Network(e))
330    }
331}
332
333impl From<StorageError> for Error {
334    fn from(e: StorageError) -> Self {
335        Error::System(SystemError::Storage(e))
336    }
337}
338
339impl From<SerializationError> for Error {
340    fn from(e: SerializationError) -> Self {
341        Error::System(SystemError::Serialization(e))
342    }
343}
344
345// ===== Consensus Error conversions =====
346
347impl From<StateTransitionError> for Error {
348    fn from(e: StateTransitionError) -> Self {
349        Error::Consensus(ConsensusError::StateTransition(e))
350    }
351}
352
353impl From<ElectionError> for Error {
354    fn from(e: ElectionError) -> Self {
355        Error::Consensus(ConsensusError::Election(e))
356    }
357}
358
359impl From<ReplicationError> for Error {
360    fn from(e: ReplicationError) -> Self {
361        Error::Consensus(ConsensusError::Replication(e))
362    }
363}
364
365impl From<MembershipError> for Error {
366    fn from(e: MembershipError) -> Self {
367        Error::Consensus(ConsensusError::Membership(e))
368    }
369}
370
371// ===== Network sub-error conversions =====
372impl From<ReadSendError> for Error {
373    fn from(e: ReadSendError) -> Self {
374        Error::System(SystemError::Network(NetworkError::ReadSend(e)))
375    }
376}
377
378impl From<WriteSendError> for Error {
379    fn from(e: WriteSendError) -> Self {
380        Error::System(SystemError::Network(NetworkError::WriteSend(e)))
381    }
382}
383
384impl From<tonic::transport::Error> for Error {
385    fn from(err: tonic::transport::Error) -> Self {
386        NetworkError::TonicError(err).into()
387    }
388}
389
390impl From<sled::Error> for Error {
391    fn from(err: sled::Error) -> Self {
392        StorageError::SledError(err).into()
393    }
394}
395
396impl From<std::io::Error> for Error {
397    fn from(err: std::io::Error) -> Self {
398        StorageError::IoError(err).into()
399    }
400}
401
402impl From<JoinError> for Error {
403    fn from(err: JoinError) -> Self {
404        NetworkError::TaskFailed(err).into()
405    }
406}