d_engine/
errors.rs

1//! Raft Consensus Protocol Error Hierarchy
2//!
3//! Defines comprehensive error types for a Raft-based distributed system,
4//! categorized by protocol layer and operational concerns.
5
6use std::path::PathBuf;
7use std::time::Duration;
8
9use config::ConfigError;
10use tokio::task::JoinError;
11
12#[doc(hidden)]
13pub type Result<T> = std::result::Result<T, Error>;
14
15#[derive(Debug, thiserror::Error)]
16pub enum Error {
17    /// Infrastructure-level failures (network, storage, serialization)
18    #[error(transparent)]
19    System(#[from] SystemError),
20
21    /// Cluster configuration validation failures
22    #[error(transparent)]
23    Config(#[from] ConfigError),
24
25    /// Raft consensus protocol violations and failures
26    #[error(transparent)]
27    Consensus(#[from] ConsensusError),
28
29    /// Unrecoverable failures requiring process termination
30    #[error("Fatal error: {0}")]
31    Fatal(String),
32}
33
34#[derive(Debug, thiserror::Error)]
35pub enum ConsensusError {
36    /// Illegal Raft node state transitions
37    #[error(transparent)]
38    StateTransition(#[from] StateTransitionError),
39
40    /// Leader election failures (Section 5.2 Raft paper)
41    #[error(transparent)]
42    Election(#[from] ElectionError),
43
44    /// Log replication failures (Section 5.3 Raft paper)
45    #[error(transparent)]
46    Replication(#[from] ReplicationError),
47
48    /// Cluster membership change failures (Section 6 Raft paper)
49    #[error(transparent)]
50    Membership(#[from] MembershipError),
51
52    /// Snapshot-related errors during installation or restoration
53    #[error(transparent)]
54    Snapshot(#[from] SnapshotError),
55
56    /// Role permission conflict error
57    #[error("Operation requires {required_role} role but current role is {current_role}")]
58    RoleViolation {
59        current_role: &'static str,
60        required_role: &'static str,
61        context: String,
62    },
63}
64
65#[derive(Debug, thiserror::Error)]
66#[doc(hidden)]
67pub enum StateTransitionError {
68    #[error("Not enough votes to transition to leader.")]
69    NotEnoughVotes,
70
71    #[error("Invalid state transition.")]
72    InvalidTransition,
73
74    #[error("Lock error.")]
75    LockError,
76}
77
78#[derive(Debug, thiserror::Error)]
79pub enum NetworkError {
80    /// Endpoint unavailable (HTTP 503 equivalent)
81    #[error("Service unavailable: {0}")]
82    ServiceUnavailable(String),
83
84    /// Peer communication timeout
85    #[error("Connection timeout to {node_id} after {duration:?}")]
86    Timeout { node_id: u32, duration: Duration },
87
88    /// Unreachable node with source context
89    #[error("Network unreachable: {source}")]
90    Unreachable {
91        source: Box<dyn std::error::Error + Send + Sync>,
92    },
93
94    /// Persistent connection failures
95    #[error("Socket connect failed error: {0}")]
96    ConnectError(String),
97
98    /// Retry policy exhaustion
99    #[error("Retry timeout after {0:?}")]
100    RetryTimeoutError(Duration),
101
102    /// TLS negotiation failures
103    #[error("TLS handshake failed")]
104    TlsHandshakeFailure,
105
106    /// Missing peer list for RPC
107    #[error("Request list for {request_type} contains no peers")]
108    EmptyPeerList { request_type: &'static str },
109
110    /// Peer connection not found
111    #[error("Peer({0}) connection not found")]
112    PeerConnectionNotFound(u32),
113
114    /// Peer closed the channel
115    #[error("Peer closed the channel")]
116    ResponseChannelClosed,
117
118    /// Peer address not found
119    #[error("Peer({0}) address not found")]
120    PeerAddressNotFound(u32),
121
122    /// Malformed node addresses
123    #[error("Invalid URI format: {0}")]
124    InvalidURI(String),
125
126    /// RPC transmission failures with context
127    #[error("Failed to send {request_type} request")]
128    RequestSendFailure {
129        request_type: &'static str,
130        #[source]
131        source: Box<tonic::transport::Error>,
132    },
133
134    /// Low-level TCP configuration errors
135    #[error("TCP keepalive configuration error")]
136    TcpKeepaliveError,
137
138    /// HTTP/2 protocol configuration errors
139    #[error("HTTP/2 keepalive configuration error")]
140    Http2KeepaliveError,
141
142    /// gRPC transport layer errors
143    #[error(transparent)]
144    TonicError(#[from] Box<tonic::transport::Error>),
145
146    /// gRPC status code errors
147    #[error(transparent)]
148    TonicStatusError(#[from] Box<tonic::Status>),
149
150    #[error("Failed to send read request: {0}")]
151    ReadSend(#[from] ReadSendError),
152
153    #[error("Failed to send write request: {0}")]
154    WriteSend(#[from] WriteSendError),
155
156    #[error("Background task failed: {0}")]
157    TaskFailed(#[from] JoinError),
158
159    #[error("{0}")]
160    TaskBackoffFailed(String),
161
162    #[error("{0}")]
163    SingalSendFailed(String),
164
165    #[error("{0}")]
166    SingalReceiveFailed(String),
167
168    // #[error("Install snapshot RPC request been rejected, last_chunk={last_chunk}")]
169    // SnapshotRejected { last_chunk: u32 },
170
171    // #[error("Install snapshot RPC request failed")]
172    // SnapshotTransferFailed,
173    #[error("New node join cluster failed: {0}")]
174    JoinFailed(String),
175
176    #[error("Network timeout: {0}")]
177    GlobalTimeout(String),
178}
179
180#[derive(Debug, thiserror::Error)]
181pub enum StorageError {
182    /// Disk I/O failures during log/snapshot operations
183    #[error(transparent)]
184    IoError(#[from] std::io::Error),
185
186    /// Custom error with a path as a string slice (`&str`)
187    #[error("Error occurred at path: {path}")]
188    PathError {
189        path: PathBuf, // Use &str for lightweight references
190        source: std::io::Error,
191    },
192
193    /// Serialization failures for persisted data
194    #[error(transparent)]
195    BincodeError(#[from] bincode::Error),
196
197    /// State machine application errors
198    #[error("State Machine error: {0}")]
199    StateMachineError(String),
200
201    /// Log storage subsystem failures
202    #[error("Log storage failure: {0}")]
203    LogStorage(String),
204
205    // /// Snapshot creation/restoration failures
206    // #[error("Snapshot operation failed: {0}")]
207    // Snapshot(String),
208    /// Checksum validation failures
209    #[error("Data corruption detected at {location}")]
210    DataCorruption { location: String },
211
212    /// Configuration storage failures
213    #[error("Configuration storage error: {0}")]
214    ConfigStorage(String),
215
216    /// Embedded database errors
217    #[error("Embedded database error: {0}")]
218    DbError(String),
219
220    /// Error type for value conversion operations
221    #[error("Value convert failed")]
222    Convert(#[from] ConvertError),
223
224    /// File errors
225    #[error("File errors")]
226    File(#[from] FileError),
227
228    /// Serialization error
229    #[error("Serialization error: {0}")]
230    SerializationError(String),
231
232    /// ID allocation errors
233    #[error(transparent)]
234    IdAllocation(#[from] IdAllocationError),
235}
236
237#[derive(Debug, thiserror::Error)]
238pub enum IdAllocationError {
239    /// ID allocation overflow
240    #[error("ID allocation overflow: {start} > {end}")]
241    Overflow { start: u64, end: u64 },
242
243    /// Invalid ID range
244    #[error("Invalid ID range: {start}..={end}")]
245    InvalidRange { start: u64, end: u64 },
246
247    /// No available IDs
248    #[error("No available IDs")]
249    NoIdsAvailable,
250}
251
252#[derive(Debug, thiserror::Error)]
253pub enum FileError {
254    #[error("Path does not exist: {0}")]
255    NotFound(String),
256    #[error("Path is a directory: {0}")]
257    IsDirectory(String),
258    #[error("File is busy: {0}")]
259    Busy(String),
260    #[error("Insufficient permissions: {0}")]
261    PermissionDenied(String),
262    #[error("File is occupied: {0}")]
263    FileBusy(String),
264    #[error("Invalid path: {0}")]
265    InvalidPath(String),
266    #[error("Too small: {0}")]
267    TooSmall(u64),
268    #[error("Invalid extension: {0}")]
269    InvalidExt(String),
270    #[error("Invalid GZIP header: {0}")]
271    InvalidGzipHeader(String),
272    #[error("Unknown IO error: {0}")]
273    UnknownIo(String),
274}
275
276/// Error type for value conversion operations
277#[derive(Debug, thiserror::Error)]
278pub enum ConvertError {
279    /// Invalid input length error
280    ///
281    /// This occurs when the input byte slice length doesn't match the required 8 bytes.
282    #[error("invalid byte length: expected 8 bytes, received {0} bytes")]
283    InvalidLength(usize),
284
285    /// Generic conversion failure with detailed message
286    ///
287    /// Wraps underlying parsing/conversion errors with context information
288    #[error("conversion failure: {0}")]
289    ConversionFailure(String),
290}
291
292#[derive(Debug, thiserror::Error)]
293pub enum ReadSendError {
294    #[error("Network timeout")]
295    Timeout(#[from] tokio::time::error::Elapsed),
296
297    #[error("Connection failed")]
298    Connection(#[from] tonic::transport::Error),
299}
300
301#[derive(Debug, thiserror::Error)]
302pub enum WriteSendError {
303    #[error("Not cluster leader")]
304    NotLeader,
305
306    #[error("Network unreachable")]
307    Unreachable,
308
309    #[error("Payload too large")]
310    PayloadExceeded,
311}
312
313#[derive(Debug, thiserror::Error)]
314pub enum SystemError {
315    // Network layer
316    #[error("Network error: {0}")]
317    Network(#[from] NetworkError),
318
319    // Storage layer
320    #[error("Storage operation failed")]
321    Storage(#[from] StorageError),
322
323    //Serialization
324    #[error("Serialization error")]
325    Serialization(#[from] SerializationError),
326
327    /// Protocol buffer encoding/decoding specific errors
328    #[error("Protobuf operation failed: {0}")]
329    Prost(#[from] ProstError),
330
331    // Basic node operations
332    #[error("Node failed to start: {0}")]
333    NodeStartFailed(String),
334
335    #[error("General server error: {0}")]
336    GeneralServer(String),
337
338    #[error("Internal server error")]
339    ServerUnavailable,
340}
341
342// Serialization is classified separately (across protocol layers and system layers)
343#[derive(Debug, thiserror::Error)]
344pub enum SerializationError {
345    #[error("Bincode serialization failed: {0}")]
346    Bincode(#[from] bincode::Error),
347}
348
349/// Wrapper for prost encoding/decoding errors
350#[derive(Debug, thiserror::Error)]
351pub enum ProstError {
352    #[error("Encoding failed: {0}")]
353    Encode(#[from] prost::EncodeError),
354
355    #[error("Decoding failed: {0}")]
356    Decode(#[from] prost::DecodeError),
357}
358
359#[derive(Debug, thiserror::Error)]
360pub enum ElectionError {
361    /// General election process failure
362    #[error("Election failed: {0}")]
363    Failed(String),
364
365    /// Stale term detection (Section 5.1 Raft paper)
366    #[error("Found higher term(={0}) during election process")]
367    HigherTerm(u64),
368
369    /// Term number inconsistency
370    #[error("Term conflict (current: {current}, received: {received})")]
371    TermConflict { current: u64, received: u64 },
372
373    /// Log inconsistency during vote requests (Section 5.4.1 Raft paper)
374    #[error("Log conflict at index {index} (expected: {expected_term}, actual: {actual_term})")]
375    LogConflict {
376        index: u64,
377        expected_term: u64,
378        actual_term: u64,
379    },
380
381    /// Quorum not achieved (Section 5.2 Raft paper)
382    #[error("Quorum not reached (required: {required}, succeed: {succeed})")]
383    QuorumFailure { required: usize, succeed: usize },
384
385    /// Leadership handoff failures
386    #[error("Leadership consensus error: {0}")]
387    LeadershipConsensus(String),
388
389    /// Isolated node scenario
390    #[error("No voting member found for candidate {candidate_id}")]
391    NoVotingMemberFound { candidate_id: u32 },
392}
393
394#[derive(Debug, thiserror::Error)]
395pub enum ReplicationError {
396    /// Stale leader detected during AppendEntries RPC
397    #[error("Found higher term(={0}) during replication process")]
398    HigherTerm(u64),
399
400    /// Failed to achieve majority acknowledgment
401    #[error("Quorum not reached for log replication")]
402    QuorumNotReached,
403
404    /// Timeout to receive majority response
405    #[error("Timeout to receive majority response")]
406    QuorumTimeout,
407
408    /// Target follower node unreachable
409    #[error("Node {node_id} unreachable for replication")]
410    NodeUnreachable { node_id: u32 },
411
412    /// Network timeout during replication RPC
413    #[error("RPC timeout after {duration}ms")]
414    RpcTimeout { duration: u64 },
415
416    /// Missing peer configuration in leader state
417    #[error("No peer mapping for leader {leader_id}")]
418    NoPeerFound { leader_id: u32 },
419
420    /// Log inconsistency detected during replication (ยง5.3)
421    #[error("Log conflict at index {index} (expected term {expected_term}, actual {actual_term})")]
422    LogConflict {
423        index: u64,
424        expected_term: u64,
425        actual_term: u64,
426    },
427
428    /// Node not in leader state for replication requests
429    #[error("Replication requires leader role (known leader: {leader_id:?})")]
430    NotLeader { leader_id: Option<u32> },
431}
432
433#[derive(Debug, thiserror::Error)]
434pub enum MembershipError {
435    /// Failed to reach consensus on configuration change
436    #[error("Membership update consensus failure: {0}")]
437    ConfigChangeUpdateFailed(String),
438
439    /// Non-leader node attempted membership change
440    #[error("Membership changes require leader role")]
441    NotLeader,
442
443    /// No leader information available
444    #[error("No leader information available")]
445    NoLeaderFound,
446
447    /// Non-learner node attempted join cluster
448    #[error("Only Learner can join cluster")]
449    NotLearner,
450
451    /// Cluster not in operational state
452    #[error("Cluster bootstrap not completed")]
453    ClusterIsNotReady,
454
455    /// Connection establishment failure during join
456    #[error("Cluster connection setup failed: {0}")]
457    SetupClusterConnectionFailed(String),
458
459    /// Missing node metadata in configuration
460    #[error("Metadata missing for node {node_id} in cluster config")]
461    NoMetadataFoundForNode { node_id: u32 },
462
463    /// No available peers found for request
464    #[error("No reachable peers found in cluster membership")]
465    NoPeersAvailable,
466
467    /// Node already been added into cluster config
468    #[error("Node({0}) already been added into cluster config.")]
469    NodeAlreadyExists(u32),
470
471    /// To be removed node is leader.
472    #[error("To be removed node({0}) is leader.")]
473    RemoveNodeIsLeader(u32),
474
475    #[error("Cannot promote node {node_id}: current role is {role} (expected LEARNER)")]
476    InvalidPromotion { node_id: u32, role: i32 },
477
478    #[error("Invalid membership change request")]
479    InvalidChangeRequest,
480
481    #[error("Commit Timeout")]
482    CommitTimeout,
483
484    #[error("Learner({0}) join cluster failed.")]
485    JoinClusterFailed(u32),
486
487    #[error("Join cluster error: {0}")]
488    JoinClusterError(String),
489
490    #[error("Not leader")]
491    NoLeader,
492
493    #[error("Mark leader id failed: {0}")]
494    MarkLeaderIdFailed(String),
495}
496
497#[derive(Debug, thiserror::Error)]
498pub enum SnapshotError {
499    #[error("Snapshot receiver lagging, dropping chunk")]
500    Backpressure,
501
502    /// Snapshot chunk rejected during installation
503    #[error("Install snapshot RPC request been rejected, last_chunk={last_chunk}")]
504    Rejected { last_chunk: u32 },
505
506    #[error("Install snapshot RPC request been rejected")]
507    RemoteRejection,
508
509    /// Snapshot transfer failed due to stream/network issues
510    #[error("Install snapshot RPC request failed")]
511    TransferFailed,
512
513    /// Snapshot transfer timeout due to network issues
514    #[error("Install snapshot RPC request timeout")]
515    TransferTimeout,
516
517    /// Snapshot operation failed with context
518    #[error("Snapshot operation failed: {0}")]
519    OperationFailed(String),
520
521    /// Snapshot is outdated and cannot be used
522    #[error("Snapshot is outdated")]
523    Outdated,
524
525    /// Snapshot file checksum mismatch
526    #[error("Snapshot file checksum mismatch")]
527    ChecksumMismatch,
528
529    /// Invalid snapshot
530    #[error("Invalid snapshot")]
531    InvalidSnapshot,
532
533    /// Invalid chunk sequence
534    #[error("Invalid chunk sequence")]
535    InvalidChunkSequence,
536
537    /// Stream receiver disconnected
538    #[error("Stream receiver disconnected")]
539    ReceiverDisconnected,
540
541    #[error("Invalid first snapshot stream chunk")]
542    InvalidFirstChunk,
543
544    #[error("Empty snapshot stream chunk")]
545    EmptySnapshot,
546
547    #[error("Incomplete snapshot error")]
548    IncompleteSnapshot,
549
550    #[error("Requested chunk {0} out of range (max: {1})")]
551    ChunkOutOfRange(u32, u32),
552
553    #[error("Chunk in stream is out of order")]
554    OutOfOrderChunk,
555
556    #[error("No metadata in chunk")]
557    MissingMetadata,
558
559    #[error("Chunk not cached: {0}")]
560    ChunkNotCached(u32),
561
562    #[error("Background stream push task died")]
563    BackgroundTaskDied,
564}
565
566// ============== Conversion Implementations ============== //
567impl From<NetworkError> for Error {
568    fn from(e: NetworkError) -> Self {
569        Error::System(SystemError::Network(e))
570    }
571}
572
573impl From<StorageError> for Error {
574    fn from(e: StorageError) -> Self {
575        Error::System(SystemError::Storage(e))
576    }
577}
578
579impl From<ConvertError> for Error {
580    fn from(e: ConvertError) -> Self {
581        Error::System(SystemError::Storage(StorageError::Convert(e)))
582    }
583}
584
585impl From<FileError> for Error {
586    fn from(e: FileError) -> Self {
587        Error::System(SystemError::Storage(StorageError::File(e)))
588    }
589}
590
591impl From<SerializationError> for Error {
592    fn from(e: SerializationError) -> Self {
593        Error::System(SystemError::Serialization(e))
594    }
595}
596
597// // These allow direct conversion from prost errors to SystemError
598// impl From<prost::EncodeError> for SystemError {
599//     fn from(error: prost::EncodeError) -> Self {
600//         SystemError::Prost(ProstError::Encode(error))
601//     }
602// }
603
604// impl From<prost::DecodeError> for SystemError {
605//     fn from(error: prost::DecodeError) -> Self {
606//         SystemError::Prost(ProstError::Decode(error))
607//     }
608// }
609
610impl From<ProstError> for Error {
611    fn from(error: ProstError) -> Self {
612        Error::System(SystemError::Prost(error))
613    }
614}
615
616// ===== Consensus Error conversions =====
617
618impl From<StateTransitionError> for Error {
619    fn from(e: StateTransitionError) -> Self {
620        Error::Consensus(ConsensusError::StateTransition(e))
621    }
622}
623
624impl From<ElectionError> for Error {
625    fn from(e: ElectionError) -> Self {
626        Error::Consensus(ConsensusError::Election(e))
627    }
628}
629
630impl From<ReplicationError> for Error {
631    fn from(e: ReplicationError) -> Self {
632        Error::Consensus(ConsensusError::Replication(e))
633    }
634}
635
636impl From<MembershipError> for Error {
637    fn from(e: MembershipError) -> Self {
638        Error::Consensus(ConsensusError::Membership(e))
639    }
640}
641
642// ===== Network sub-error conversions =====
643impl From<ReadSendError> for Error {
644    fn from(e: ReadSendError) -> Self {
645        Error::System(SystemError::Network(NetworkError::ReadSend(e)))
646    }
647}
648
649impl From<WriteSendError> for Error {
650    fn from(e: WriteSendError) -> Self {
651        Error::System(SystemError::Network(NetworkError::WriteSend(e)))
652    }
653}
654
655impl From<tonic::transport::Error> for Error {
656    fn from(err: tonic::transport::Error) -> Self {
657        NetworkError::TonicError(Box::new(err)).into()
658    }
659}
660
661impl From<JoinError> for Error {
662    fn from(err: JoinError) -> Self {
663        NetworkError::TaskFailed(err).into()
664    }
665}
666
667impl From<SnapshotError> for Error {
668    fn from(e: SnapshotError) -> Self {
669        Error::Consensus(ConsensusError::Snapshot(e))
670    }
671}
672
673impl From<IdAllocationError> for Error {
674    fn from(e: IdAllocationError) -> Self {
675        StorageError::IdAllocation(e).into()
676    }
677}
678
679impl From<std::io::Error> for Error {
680    fn from(e: std::io::Error) -> Self {
681        StorageError::IoError(e).into()
682    }
683}