d_engine_core/
errors.rs

1//! Raft Consensus Protocol Error Hierarchy
2//!
3//! Defines comprehensive error types for a Raft-based distributed system,
4//! categorized by protocol layer and operational concerns.
5
6use std::path::PathBuf;
7use std::time::Duration;
8
9use config::ConfigError;
10use tokio::task::JoinError;
11
/// Crate-wide `Result` alias whose error type is fixed to [`Error`].
#[doc(hidden)]
pub type Result<T> = std::result::Result<T, Error>;
14
15#[derive(Debug, thiserror::Error)]
16pub enum Error {
17    /// Infrastructure-level failures (network, storage, serialization)
18    #[error(transparent)]
19    System(#[from] SystemError),
20
21    /// Cluster configuration validation failures
22    #[error(transparent)]
23    Config(#[from] ConfigError),
24
25    /// Raft consensus protocol violations and failures
26    #[error(transparent)]
27    Consensus(#[from] ConsensusError),
28
29    /// Unrecoverable failures requiring process termination
30    #[error("Fatal error: {0}")]
31    Fatal(String),
32}
33
34#[derive(Debug, thiserror::Error)]
35pub enum ConsensusError {
36    /// Illegal Raft node state transitions
37    #[error(transparent)]
38    StateTransition(#[from] StateTransitionError),
39
40    /// Leader election failures (Section 5.2 Raft paper)
41    #[error(transparent)]
42    Election(#[from] ElectionError),
43
44    /// Log replication failures (Section 5.3 Raft paper)
45    #[error(transparent)]
46    Replication(#[from] ReplicationError),
47
48    /// Cluster membership change failures (Section 6 Raft paper)
49    #[error(transparent)]
50    Membership(#[from] MembershipError),
51
52    /// Snapshot-related errors during installation or restoration
53    #[error(transparent)]
54    Snapshot(#[from] SnapshotError),
55
56    /// Role permission conflict error
57    #[error("Operation requires {required_role} role but current role is {current_role}")]
58    RoleViolation {
59        current_role: &'static str,
60        required_role: &'static str,
61        context: String,
62    },
63}
64
65#[derive(Debug, thiserror::Error)]
66#[doc(hidden)]
67pub enum StateTransitionError {
68    #[error("Not enough votes to transition to leader.")]
69    NotEnoughVotes,
70
71    #[error("Invalid state transition.")]
72    InvalidTransition,
73
74    #[error("Lock error.")]
75    LockError,
76}
77
78#[derive(Debug, thiserror::Error)]
79pub enum NetworkError {
80    /// Endpoint unavailable (HTTP 503 equivalent)
81    #[error("Service unavailable: {0}")]
82    ServiceUnavailable(String),
83
84    /// Peer communication timeout
85    #[error("Connection timeout to {node_id} after {duration:?}")]
86    Timeout { node_id: u32, duration: Duration },
87
88    /// Unreachable node with source context
89    #[error("Network unreachable: {source}")]
90    Unreachable {
91        source: Box<dyn std::error::Error + Send + Sync>,
92    },
93
94    /// Persistent connection failures
95    #[error("Socket connect failed error: {0}")]
96    ConnectError(String),
97
98    /// Retry policy exhaustion
99    #[error("Retry timeout after {0:?}")]
100    RetryTimeoutError(Duration),
101
102    /// TLS negotiation failures
103    #[error("TLS handshake failed")]
104    TlsHandshakeFailure,
105
106    /// Missing peer list for RPC
107    #[error("Request list for {request_type} contains no peers")]
108    EmptyPeerList { request_type: &'static str },
109
110    /// Peer connection not found
111    #[error("Peer({0}) connection not found")]
112    PeerConnectionNotFound(u32),
113
114    /// Peer closed the channel
115    #[error("Peer closed the channel")]
116    ResponseChannelClosed,
117
118    /// Peer address not found
119    #[error("Peer({0}) address not found")]
120    PeerAddressNotFound(u32),
121
122    /// Malformed node addresses
123    #[error("Invalid URI format: {0}")]
124    InvalidURI(String),
125
126    /// RPC transmission failures with context
127    #[error("Failed to send {request_type} request")]
128    RequestSendFailure {
129        request_type: &'static str,
130        #[source]
131        source: Box<tonic::transport::Error>,
132    },
133
134    /// Low-level TCP configuration errors
135    #[error("TCP keepalive configuration error")]
136    TcpKeepaliveError,
137
138    /// HTTP/2 protocol configuration errors
139    #[error("HTTP/2 keepalive configuration error")]
140    Http2KeepaliveError,
141
142    /// gRPC transport layer errors
143    #[error(transparent)]
144    TonicError(#[from] Box<tonic::transport::Error>),
145
146    /// gRPC status code errors
147    #[error(transparent)]
148    TonicStatusError(#[from] Box<tonic::Status>),
149
150    #[error("Failed to send read request: {0}")]
151    ReadSend(#[from] ReadSendError),
152
153    #[error("Failed to send write request: {0}")]
154    WriteSend(#[from] WriteSendError),
155
156    #[error("Background task failed: {0}")]
157    TaskFailed(#[from] JoinError),
158
159    #[error("{0}")]
160    TaskBackoffFailed(String),
161
162    #[error("{0}")]
163    SingalSendFailed(String),
164
165    #[error("{0}")]
166    SingalReceiveFailed(String),
167
168    // #[error("Install snapshot RPC request been rejected, last_chunk={last_chunk}")]
169    // SnapshotRejected { last_chunk: u32 },
170
171    // #[error("Install snapshot RPC request failed")]
172    // SnapshotTransferFailed,
173    #[error("New node join cluster failed: {0}")]
174    JoinFailed(String),
175
176    #[error("Network timeout: {0}")]
177    GlobalTimeout(String),
178}
179
180#[derive(Debug, thiserror::Error)]
181pub enum StorageError {
182    /// Disk I/O failures during log/snapshot operations
183    #[error(transparent)]
184    IoError(#[from] std::io::Error),
185
186    /// Custom error with a path as a string slice (`&str`)
187    #[error("Error occurred at path: {path}")]
188    PathError {
189        path: PathBuf, // Use &str for lightweight references
190        source: std::io::Error,
191    },
192
193    /// Serialization failures for persisted data
194    #[error(transparent)]
195    BincodeError(#[from] bincode::Error),
196
197    /// State machine application errors
198    #[error("State Machine error: {0}")]
199    StateMachineError(String),
200
201    /// Log storage subsystem failures
202    #[error("Log storage failure: {0}")]
203    LogStorage(String),
204
205    // /// Snapshot creation/restoration failures
206    // #[error("Snapshot operation failed: {0}")]
207    // Snapshot(String),
208    /// Checksum validation failures
209    #[error("Data corruption detected at {location}")]
210    DataCorruption { location: String },
211
212    /// Configuration storage failures
213    #[error("Configuration storage error: {0}")]
214    ConfigStorage(String),
215
216    /// Embedded database errors
217    #[error("Embedded database error: {0}")]
218    DbError(String),
219
220    /// Error type for value conversion operations
221    #[error("Value convert failed")]
222    Convert(#[from] ConvertError),
223
224    /// File errors
225    #[error("File errors")]
226    File(#[from] FileError),
227
228    /// Serialization error
229    #[error("Serialization error: {0}")]
230    SerializationError(String),
231
232    /// ID allocation errors
233    #[error(transparent)]
234    IdAllocation(#[from] IdAllocationError),
235
236    /// Feature not enabled in configuration
237    ///
238    /// Returned when client requests a feature (e.g., TTL) that is not
239    /// enabled in the server configuration. This prevents silent failures
240    /// and ensures explicit feature activation.
241    #[error("Feature not enabled: {0}")]
242    FeatureNotEnabled(String),
243}
244
245#[derive(Debug, thiserror::Error)]
246pub enum IdAllocationError {
247    /// ID allocation overflow
248    #[error("ID allocation overflow: {start} > {end}")]
249    Overflow { start: u64, end: u64 },
250
251    /// Invalid ID range
252    #[error("Invalid ID range: {start}..={end}")]
253    InvalidRange { start: u64, end: u64 },
254
255    /// No available IDs
256    #[error("No available IDs")]
257    NoIdsAvailable,
258}
259
260#[derive(Debug, thiserror::Error)]
261pub enum FileError {
262    #[error("Path does not exist: {0}")]
263    NotFound(String),
264    #[error("Path is a directory: {0}")]
265    IsDirectory(String),
266    #[error("File is busy: {0}")]
267    Busy(String),
268    #[error("Insufficient permissions: {0}")]
269    PermissionDenied(String),
270    #[error("File is occupied: {0}")]
271    FileBusy(String),
272    #[error("Invalid path: {0}")]
273    InvalidPath(String),
274    #[error("Too small: {0}")]
275    TooSmall(u64),
276    #[error("Invalid extension: {0}")]
277    InvalidExt(String),
278    #[error("Invalid GZIP header: {0}")]
279    InvalidGzipHeader(String),
280    #[error("Unknown IO error: {0}")]
281    UnknownIo(String),
282}
283
284/// Error type for value conversion operations
285#[derive(Debug, thiserror::Error)]
286pub enum ConvertError {
287    /// Invalid input length error
288    ///
289    /// This occurs when the input byte slice length doesn't match the required 8 bytes.
290    #[error("invalid byte length: expected 8 bytes, received {0} bytes")]
291    InvalidLength(usize),
292
293    /// Generic conversion failure with detailed message
294    ///
295    /// Wraps underlying parsing/conversion errors with context information
296    #[error("conversion failure: {0}")]
297    ConversionFailure(String),
298}
299
300#[derive(Debug, thiserror::Error)]
301pub enum ReadSendError {
302    #[error("Network timeout")]
303    Timeout(#[from] tokio::time::error::Elapsed),
304
305    #[error("Connection failed")]
306    Connection(#[from] tonic::transport::Error),
307}
308
309#[derive(Debug, thiserror::Error)]
310pub enum WriteSendError {
311    #[error("Not cluster leader")]
312    NotLeader,
313
314    #[error("Network unreachable")]
315    Unreachable,
316
317    #[error("Payload too large")]
318    PayloadExceeded,
319}
320
321#[derive(Debug, thiserror::Error)]
322pub enum SystemError {
323    // Network layer
324    #[error("Network error: {0}")]
325    Network(#[from] NetworkError),
326
327    // Storage layer
328    #[error("Storage operation failed")]
329    Storage(#[from] StorageError),
330
331    //Serialization
332    #[error("Serialization error")]
333    Serialization(#[from] SerializationError),
334
335    /// Protocol buffer encoding/decoding specific errors
336    #[error("Protobuf operation failed: {0}")]
337    Prost(#[from] ProstError),
338
339    // Basic node operations
340    #[error("Node failed to start: {0}")]
341    NodeStartFailed(String),
342
343    #[error("General server error: {0}")]
344    GeneralServer(String),
345
346    #[error("Internal server error")]
347    ServerUnavailable,
348
349    /// State machine does not support lease-based expiration
350    #[error("State machine does not support lease management")]
351    LeaseNotSupported,
352}
353
354// Serialization is classified separately (across protocol layers and system layers)
355#[derive(Debug, thiserror::Error)]
356pub enum SerializationError {
357    #[error("Bincode serialization failed: {0}")]
358    Bincode(#[from] bincode::Error),
359}
360
361/// Wrapper for prost encoding/decoding errors
362#[derive(Debug, thiserror::Error)]
363pub enum ProstError {
364    #[error("Encoding failed: {0}")]
365    Encode(#[from] prost::EncodeError),
366
367    #[error("Decoding failed: {0}")]
368    Decode(#[from] prost::DecodeError),
369}
370
371#[derive(Debug, thiserror::Error)]
372pub enum ElectionError {
373    /// General election process failure
374    #[error("Election failed: {0}")]
375    Failed(String),
376
377    /// Stale term detection (Section 5.1 Raft paper)
378    #[error("Found higher term(={0}) during election process")]
379    HigherTerm(u64),
380
381    /// Term number inconsistency
382    #[error("Term conflict (current: {current}, received: {received})")]
383    TermConflict { current: u64, received: u64 },
384
385    /// Log inconsistency during vote requests (Section 5.4.1 Raft paper)
386    #[error("Log conflict at index {index} (expected: {expected_term}, actual: {actual_term})")]
387    LogConflict {
388        index: u64,
389        expected_term: u64,
390        actual_term: u64,
391    },
392
393    /// Quorum not achieved (Section 5.2 Raft paper)
394    #[error("Quorum not reached (required: {required}, succeed: {succeed})")]
395    QuorumFailure { required: usize, succeed: usize },
396
397    /// Leadership handoff failures
398    #[error("Leadership consensus error: {0}")]
399    LeadershipConsensus(String),
400
401    /// Isolated node scenario
402    #[error("No voting member found for candidate {candidate_id}")]
403    NoVotingMemberFound { candidate_id: u32 },
404}
405
406#[derive(Debug, thiserror::Error)]
407pub enum ReplicationError {
408    /// Stale leader detected during AppendEntries RPC
409    #[error("Found higher term(={0}) during replication process")]
410    HigherTerm(u64),
411
412    /// Failed to achieve majority acknowledgment
413    #[error("Quorum not reached for log replication")]
414    QuorumNotReached,
415
416    /// Timeout to receive majority response
417    #[error("Timeout to receive majority response")]
418    QuorumTimeout,
419
420    /// Target follower node unreachable
421    #[error("Node {node_id} unreachable for replication")]
422    NodeUnreachable { node_id: u32 },
423
424    /// Network timeout during replication RPC
425    #[error("RPC timeout after {duration}ms")]
426    RpcTimeout { duration: u64 },
427
428    /// Missing peer configuration in leader state
429    #[error("No peer mapping for leader {leader_id}")]
430    NoPeerFound { leader_id: u32 },
431
432    /// Log inconsistency detected during replication (ยง5.3)
433    #[error("Log conflict at index {index} (expected term {expected_term}, actual {actual_term})")]
434    LogConflict {
435        index: u64,
436        expected_term: u64,
437        actual_term: u64,
438    },
439
440    /// Node not in leader state for replication requests
441    #[error("Replication requires leader role (known leader: {leader_id:?})")]
442    NotLeader { leader_id: Option<u32> },
443}
444
445#[derive(Debug, thiserror::Error)]
446pub enum MembershipError {
447    /// Failed to reach consensus on configuration change
448    #[error("Membership update consensus failure: {0}")]
449    ConfigChangeUpdateFailed(String),
450
451    /// Non-leader node attempted membership change
452    #[error("Membership changes require leader role")]
453    NotLeader,
454
455    /// No leader information available
456    #[error("No leader information available")]
457    NoLeaderFound,
458
459    /// Non-learner node attempted join cluster
460    #[error("Only Learner can join cluster")]
461    NotLearner,
462
463    /// Cluster not in operational state
464    #[error("Cluster bootstrap not completed")]
465    ClusterIsNotReady,
466
467    /// Connection establishment failure during join
468    #[error("Cluster connection setup failed: {0}")]
469    SetupClusterConnectionFailed(String),
470
471    /// Missing node metadata in configuration
472    #[error("Metadata missing for node {node_id} in cluster config")]
473    NoMetadataFoundForNode { node_id: u32 },
474
475    /// No available peers found for request
476    #[error("No reachable peers found in cluster membership")]
477    NoPeersAvailable,
478
479    /// Node already been added into cluster config
480    #[error("Node({0}) already been added into cluster config.")]
481    NodeAlreadyExists(u32),
482
483    /// To be removed node is leader.
484    #[error("To be removed node({0}) is leader.")]
485    RemoveNodeIsLeader(u32),
486
487    #[error("Cannot promote node {node_id}: current role is {role} (expected LEARNER)")]
488    InvalidPromotion { node_id: u32, role: i32 },
489
490    #[error("Invalid membership change request")]
491    InvalidChangeRequest,
492
493    #[error("Commit Timeout")]
494    CommitTimeout,
495
496    #[error("Learner({0}) join cluster failed.")]
497    JoinClusterFailed(u32),
498
499    #[error("Join cluster error: {0}")]
500    JoinClusterError(String),
501
502    #[error("Not leader")]
503    NoLeader,
504
505    #[error("Mark leader id failed: {0}")]
506    MarkLeaderIdFailed(String),
507}
508
509#[derive(Debug, thiserror::Error)]
510pub enum SnapshotError {
511    #[error("Snapshot receiver lagging, dropping chunk")]
512    Backpressure,
513
514    /// Snapshot chunk rejected during installation
515    #[error("Install snapshot RPC request been rejected, last_chunk={last_chunk}")]
516    Rejected { last_chunk: u32 },
517
518    #[error("Install snapshot RPC request been rejected")]
519    RemoteRejection,
520
521    /// Snapshot transfer failed due to stream/network issues
522    #[error("Install snapshot RPC request failed")]
523    TransferFailed,
524
525    /// Snapshot transfer timeout due to network issues
526    #[error("Install snapshot RPC request timeout")]
527    TransferTimeout,
528
529    /// Snapshot operation failed with context
530    #[error("Snapshot operation failed: {0}")]
531    OperationFailed(String),
532
533    /// Snapshot is outdated and cannot be used
534    #[error("Snapshot is outdated")]
535    Outdated,
536
537    /// Snapshot file checksum mismatch
538    #[error("Snapshot file checksum mismatch")]
539    ChecksumMismatch,
540
541    /// Invalid snapshot
542    #[error("Invalid snapshot")]
543    InvalidSnapshot,
544
545    /// Invalid chunk sequence
546    #[error("Invalid chunk sequence")]
547    InvalidChunkSequence,
548
549    /// Stream receiver disconnected
550    #[error("Stream receiver disconnected")]
551    ReceiverDisconnected,
552
553    #[error("Invalid first snapshot stream chunk")]
554    InvalidFirstChunk,
555
556    #[error("Empty snapshot stream chunk")]
557    EmptySnapshot,
558
559    #[error("Incomplete snapshot error")]
560    IncompleteSnapshot,
561
562    #[error("Requested chunk {0} out of range (max: {1})")]
563    ChunkOutOfRange(u32, u32),
564
565    #[error("Chunk in stream is out of order")]
566    OutOfOrderChunk,
567
568    #[error("No metadata in chunk")]
569    MissingMetadata,
570
571    #[error("Chunk not cached: {0}")]
572    ChunkNotCached(u32),
573
574    #[error("Background stream push task died")]
575    BackgroundTaskDied,
576}
577
578// ============== Conversion Implementations ============== //
579impl From<NetworkError> for Error {
580    fn from(e: NetworkError) -> Self {
581        Error::System(SystemError::Network(e))
582    }
583}
584
585impl From<StorageError> for Error {
586    fn from(e: StorageError) -> Self {
587        Error::System(SystemError::Storage(e))
588    }
589}
590
591impl From<ConvertError> for Error {
592    fn from(e: ConvertError) -> Self {
593        Error::System(SystemError::Storage(StorageError::Convert(e)))
594    }
595}
596
597impl From<FileError> for Error {
598    fn from(e: FileError) -> Self {
599        Error::System(SystemError::Storage(StorageError::File(e)))
600    }
601}
602
603impl From<SerializationError> for Error {
604    fn from(e: SerializationError) -> Self {
605        Error::System(SystemError::Serialization(e))
606    }
607}
608
609// // These allow direct conversion from prost errors to SystemError
610// impl From<prost::EncodeError> for SystemError {
611//     fn from(error: prost::EncodeError) -> Self {
612//         SystemError::Prost(ProstError::Encode(error))
613//     }
614// }
615
616// impl From<prost::DecodeError> for SystemError {
617//     fn from(error: prost::DecodeError) -> Self {
618//         SystemError::Prost(ProstError::Decode(error))
619//     }
620// }
621
622impl From<ProstError> for Error {
623    fn from(error: ProstError) -> Self {
624        Error::System(SystemError::Prost(error))
625    }
626}
627
628// ===== Consensus Error conversions =====
629
630impl From<StateTransitionError> for Error {
631    fn from(e: StateTransitionError) -> Self {
632        Error::Consensus(ConsensusError::StateTransition(e))
633    }
634}
635
636impl From<ElectionError> for Error {
637    fn from(e: ElectionError) -> Self {
638        Error::Consensus(ConsensusError::Election(e))
639    }
640}
641
642impl From<ReplicationError> for Error {
643    fn from(e: ReplicationError) -> Self {
644        Error::Consensus(ConsensusError::Replication(e))
645    }
646}
647
648impl From<MembershipError> for Error {
649    fn from(e: MembershipError) -> Self {
650        Error::Consensus(ConsensusError::Membership(e))
651    }
652}
653
654// ===== Network sub-error conversions =====
655impl From<ReadSendError> for Error {
656    fn from(e: ReadSendError) -> Self {
657        Error::System(SystemError::Network(NetworkError::ReadSend(e)))
658    }
659}
660
661impl From<WriteSendError> for Error {
662    fn from(e: WriteSendError) -> Self {
663        Error::System(SystemError::Network(NetworkError::WriteSend(e)))
664    }
665}
666
667impl From<tonic::transport::Error> for Error {
668    fn from(err: tonic::transport::Error) -> Self {
669        NetworkError::TonicError(Box::new(err)).into()
670    }
671}
672
673impl From<JoinError> for Error {
674    fn from(err: JoinError) -> Self {
675        NetworkError::TaskFailed(err).into()
676    }
677}
678
679impl From<SnapshotError> for Error {
680    fn from(e: SnapshotError) -> Self {
681        Error::Consensus(ConsensusError::Snapshot(e))
682    }
683}
684
685impl From<IdAllocationError> for Error {
686    fn from(e: IdAllocationError) -> Self {
687        StorageError::IdAllocation(e).into()
688    }
689}
690
691impl From<std::io::Error> for Error {
692    fn from(e: std::io::Error) -> Self {
693        StorageError::IoError(e).into()
694    }
695}