Skip to main content

nodedb_cluster/
error.rs

1// SPDX-License-Identifier: BUSL-1.1
2
3use thiserror::Error;
4
/// Convenience alias for [`std::result::Result`] with the error type fixed to
/// [`ClusterError`].
pub type Result<T> = std::result::Result<T, ClusterError>;
6
/// Errors specific to the Calvin sequencer and transaction-class layer.
///
/// Unlike the other error enums in this file, this one also derives
/// `PartialEq`/`Eq`, so values can be compared directly.
#[derive(Debug, Error, PartialEq, Eq)]
pub enum CalvinError {
    /// The transaction's write set contains no keys.
    #[error("write set is empty; a Calvin transaction must write at least one key")]
    EmptyWriteSet,

    /// The whole write set resolves to one vshard (`vshard`); the message
    /// directs the caller to the single-shard fast path instead.
    #[error(
        "write set resolves to a single vshard ({vshard}); \
         use the single-shard fast path instead"
    )]
    SingleVshardTxn { vshard: u32 },

    /// A sequencer-layer error. See [`crate::calvin::sequencer::error::SequencerError`]
    /// for the full variant set.
    ///
    /// `#[from]` derives the `From<SequencerError>` conversion for this variant.
    #[error("sequencer error: {0}")]
    Sequencer(#[from] crate::calvin::sequencer::error::SequencerError),
}
24
/// Error emitted when applying or validating a `MigrationCheckpoint` entry.
#[derive(Debug, Error)]
pub enum MigrationCheckpointError {
    /// The CRC32C values for the checkpoint of `migration_id` disagree.
    ///
    /// `expected` and `actual` are rendered with `{:#010x}`, i.e. as `0x`
    /// followed by eight zero-padded hex digits.
    #[error(
        "crc32c mismatch on migration checkpoint for {migration_id}: expected {expected:#010x} got {actual:#010x}"
    )]
    Crc32cMismatch {
        migration_id: uuid::Uuid,
        expected: u32,
        actual: u32,
    },
    /// Codec failure while persisting a migration checkpoint; `detail` holds
    /// the free-form description included in the message.
    #[error("codec error persisting migration checkpoint: {detail}")]
    Codec { detail: String },
    /// Storage failure while persisting a migration checkpoint; `detail` holds
    /// the free-form description included in the message.
    #[error("storage error persisting migration checkpoint: {detail}")]
    Storage { detail: String },
}
41
/// Error emitted during in-flight migration recovery at startup.
#[derive(Debug, Error)]
pub enum MigrationRecoveryError {
    /// Compensation failed for step `step` of migration `migration_id`;
    /// `detail` holds the free-form description included in the message.
    #[error("compensation failed for migration {migration_id} step {step}: {detail}")]
    CompensationFailed {
        migration_id: uuid::Uuid,
        step: usize,
        detail: String,
    },
    /// Storage failure during migration recovery; `detail` describes it.
    #[error("storage error during migration recovery: {detail}")]
    Storage { detail: String },
    /// Codec failure during migration recovery; `detail` describes it.
    #[error("codec error during migration recovery: {detail}")]
    Codec { detail: String },
}
56
/// Top-level error type of this module and the error half of [`Result`].
///
/// Besides its own variants it wraps, via `#[from]` conversions,
/// `nodedb_raft::RaftError`, [`MigrationCheckpointError`],
/// [`MigrationRecoveryError`], [`CalvinError`] and `crate::mirror::MirrorError`.
#[derive(Debug, Error)]
pub enum ClusterError {
    /// An error from the `nodedb_raft` crate, converted via `#[from]`.
    #[error("raft error: {0}")]
    Raft(#[from] nodedb_raft::RaftError),

    /// `vshard_id` is not mapped to any raft group.
    #[error("vshard {vshard_id} not mapped to any raft group")]
    VShardNotMapped { vshard_id: u32 },

    /// Raft group `group_id` was not found on this node.
    #[error("raft group {group_id} not found on this node")]
    GroupNotFound { group_id: u64 },

    /// A migration is in progress for `vshard_id`.
    #[error("migration in progress for vshard {vshard_id}")]
    MigrationInProgress { vshard_id: u32 },

    /// A migration was refused because the estimated pause (`estimated_us`)
    /// exceeds the budget (`budget_us`); both are reported in microseconds.
    #[error("migration refused: estimated pause {estimated_us}µs exceeds budget {budget_us}µs")]
    MigrationPauseBudgetExceeded { estimated_us: u64, budget_us: u64 },

    /// Node `node_id` is not reachable.
    #[error("node {node_id} not reachable")]
    NodeUnreachable { node_id: u64 },

    /// No ghost stub was found for `node_id` on shard `shard_id`.
    ///
    /// Note that `node_id` is a `String` here, unlike the `u64` node ids used
    /// by the other variants of this enum.
    #[error("ghost stub not found: node={node_id} on shard={shard_id}")]
    GhostNotFound { node_id: String, shard_id: u32 },

    /// Transport-layer failure; `detail` holds the free-form description.
    #[error("transport error: {detail}")]
    Transport { detail: String },

    /// Storage failure; `detail` holds the free-form description.
    #[error("storage error: {detail}")]
    Storage { detail: String },

    /// Codec failure; `detail` holds the free-form description.
    #[error("codec error: {detail}")]
    Codec { detail: String },

    /// Wire version `got` is outside the inclusive range
    /// `supported_min..=supported_max` that this node accepts.
    #[error(
        "unsupported wire version: got {got}, this node accepts [{supported_min}..={supported_max}]"
    )]
    UnsupportedWireVersion {
        got: u8,
        supported_min: u8,
        supported_max: u8,
    },

    /// The circuit for peer `node_id` is open; `failures` is the number of
    /// consecutive failures reported in the message.
    #[error("circuit open for node {node_id}: peer has {failures} consecutive failures")]
    CircuitOpen { node_id: u64, failures: u32 },

    /// Raft group `group_id` disappeared while waiting for a conf change to
    /// commit.
    #[error("raft group {group_id} disappeared while waiting for conf change commit")]
    JoinGroupDisappeared { group_id: u64 },

    /// Timed out waiting for a conf change to commit on group `group_id`;
    /// `log_index` is the index that was being waited for.
    #[error("conf change commit timeout on group {group_id} (waited for index {log_index})")]
    JoinCommitTimeout { group_id: u64, log_index: u64 },

    /// The cluster configuration is invalid; `detail` explains why.
    #[error("invalid cluster configuration: {detail}")]
    Config { detail: String },

    /// A [`MigrationCheckpointError`], converted via `#[from]`.
    #[error("migration checkpoint error: {0}")]
    MigrationCheckpoint(#[from] MigrationCheckpointError),

    /// A [`MigrationRecoveryError`], converted via `#[from]`.
    #[error("migration recovery error: {0}")]
    MigrationRecovery(#[from] MigrationRecoveryError),

    /// A shard RPC was routed to a node that no longer owns the target vShard.
    ///
    /// This surfaces when vShard ownership has transferred (rebalance or split
    /// cut-over) after the coordinator computed its routing plan. The coordinator
    /// must refresh its routing table and retry against the new owner.
    ///
    /// `expected_owner_node` is `Some` when the receiving shard knows who the
    /// current owner is, and `None` when it does not (e.g. during a brief
    /// transition window). Either way, the coordinator should re-derive the owner
    /// from its local routing table — `expected_owner_node` is advisory only.
    // The trailing `{}` is filled by the expression argument below: the
    // "; current owner may be node N" suffix is appended only when
    // `expected_owner_node` is `Some`, otherwise an empty string is used.
    #[error(
        "vshard {vshard_id} misrouted: this node is no longer the owner\
         {}", if let Some(n) = expected_owner_node { format!("; current owner may be node {n}") } else { String::new() }
    )]
    WrongOwner {
        vshard_id: u32,
        expected_owner_node: Option<u64>,
    },

    /// A [`CalvinError`], converted via `#[from]`.
    #[error("calvin error: {0}")]
    Calvin(#[from] CalvinError),

    /// The snapshot CRC for group `group_id` does not match: `stored` versus
    /// `computed`, both rendered as `0x` plus eight zero-padded hex digits.
    #[error(
        "snapshot CRC mismatch for group {group_id}: stored {stored:#010x}, computed {computed:#010x}"
    )]
    SnapshotCrcMismatch {
        group_id: u64,
        stored: u32,
        computed: u32,
    },

    /// A snapshot offset for group `group_id` regressed: `expected` was
    /// required but `actual` was received.
    #[error("snapshot offset regression for group {group_id}: expected {expected}, got {actual}")]
    SnapshotOffsetRegression {
        group_id: u64,
        expected: u64,
        actual: u64,
    },

    /// The partial snapshot file for group `group_id` is corrupt; `detail`
    /// explains how.
    #[error("partial snapshot file corrupt for group {group_id}: {detail}")]
    PartialSnapshotCorrupt { group_id: u64, detail: String },

    /// Cleaning up the partial snapshot for group `group_id` failed; `detail`
    /// explains why.
    #[error("partial snapshot cleanup failed for group {group_id}: {detail}")]
    PartialSnapshotCleanupFailed { group_id: u64, detail: String },

    /// An error from `crate::mirror`, converted via `#[from]`.
    #[error("mirror error: {0}")]
    Mirror(#[from] crate::mirror::MirrorError),
}