Skip to main content

ibverbs_rs/ibverbs/work/
error.rs

1use ibverbs_sys::ibv_wc_status;
2use num_enum::FromPrimitive;
3use std::fmt;
4use thiserror::Error;
5
6/// Represents a failed Work Request.
7///
8/// This error is returned when polling the Completion Queue results in a status other than `IBV_WC_SUCCESS`.
9/// It encapsulates the standard ibverbs status code as well as vendor-specific diagnostic information.
10#[derive(Copy, Clone, Debug, Error)]
11pub struct WorkError {
12    raw_status: u32,
13    vendor_code: u32,
14}
15
16impl WorkError {
17    /// Creates a new WorkError.
18    /// This function is not intended to be called with `IBV_WC_SUCCESS`.
19    pub(super) fn new(raw_status: ibv_wc_status::Type, vendor_code: u32) -> Self {
20        Self {
21            raw_status,
22            vendor_code,
23        }
24    }
25
26    /// Returns the raw `ibv_wc.status` value returned by the hardware.
27    pub fn raw_status(&self) -> u32 {
28        self.raw_status
29    }
30
31    /// Returns the vendor-specific error syndrome.
32    ///
33    /// This value is hardware-dependent (e.g., Mellanox/NVIDIA ConnectX syndrome).
34    /// It can be used to look up deep hardware diagnostics in the vendor's programmer manual.
35    pub fn vendor_code(&self) -> u32 {
36        self.vendor_code
37    }
38
39    /// Returns the canonical error code enum.
40    pub fn code(&self) -> WorkErrorCode {
41        WorkErrorCode::from(self.raw_status)
42    }
43}
44
45impl fmt::Display for WorkError {
46    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47        let code = self.code();
48
49        write!(
50            f,
51            "{} [{:?}] \
52             (raw_status={}, vendor_code={}, hint={})",
53            code,
54            code.class(),
55            self.raw_status,
56            self.vendor_code,
57            code.hint(),
58        )
59    }
60}
61
/// Broad classification of failure domains.
///
/// Values can be compared with `==` and used as keys in hashed collections, so callers
/// can branch on or aggregate by failure domain without writing a `match`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum WorkErrorClass {
    /// Bug or invalid usage in local application code.
    LocalProgrammingError,

    /// Local resource exhaustion or QP teardown.
    LocalResourceError,

    /// Error reported by the remote peer.
    RemoteError,

    /// Transport-level retry or link failure.
    TransportError,

    /// Timeout waiting for a response.
    Timeout,

    /// Fatal device or QP error.
    Fatal,

    /// Uncategorized or unknown failure.
    Unknown,
}
86
87/// Canonical ibverbs Work Completion status codes.
88///
89/// Numeric values match `enum ibv_wc_status`.
90#[derive(Debug, Copy, Clone, Error, FromPrimitive)]
91#[repr(u32)]
92pub enum WorkErrorCode {
93    #[error("local length error")]
94    LocalLengthError = 1,
95
96    #[error("local queue pair operation error")]
97    LocalQueuePairOperationError = 2,
98
99    #[error("local EEC operation error")]
100    LocalEecOperationError = 3,
101
102    #[error("local protection error")]
103    LocalProtectionError = 4,
104
105    #[error("work request flush error")]
106    WorkRequestFlushError = 5,
107
108    #[error("memory window bind error")]
109    MemoryWindowBindError = 6,
110
111    #[error("bad response error")]
112    BadResponseError = 7,
113
114    #[error("local access error")]
115    LocalAccessError = 8,
116
117    #[error("remote invalid request error")]
118    RemoteInvalidRequestError = 9,
119
120    #[error("remote access error")]
121    RemoteAccessError = 10,
122
123    #[error("remote operation error")]
124    RemoteOperationError = 11,
125
126    #[error("retry exceeded error")]
127    RetryExceededError = 12,
128
129    #[error("RNR retry exceeded error")]
130    RnrRetryExceededError = 13,
131
132    #[error("local RDD violation error")]
133    LocalRddViolationError = 14,
134
135    #[error("remote invalid RD request error")]
136    RemoteInvalidReadRequestError = 15,
137
138    #[error("remote abort error")]
139    RemoteAbortError = 16,
140
141    #[error("invalid EECN error")]
142    InvalidEecnError = 17,
143
144    #[error("invalid EEC state error")]
145    InvalidEecStateError = 18,
146
147    #[error("fatal error")]
148    FatalError = 19,
149
150    #[error("response timeout error")]
151    ResponseTimeoutError = 20,
152
153    #[error("general error")]
154    GeneralError = 21,
155
156    #[error("tag matching error")]
157    TagMatchingError = 22,
158
159    #[error("tag matching rendezvous incomplete")]
160    TagMatchingRendezvousIncomplete = 23,
161
162    #[error("unknown error")]
163    #[num_enum(default)]
164    UnknownError,
165}
166
167impl WorkErrorCode {
168    /// Classify the failure domain.
169    pub fn class(self) -> WorkErrorClass {
170        use WorkErrorClass::*;
171        use WorkErrorCode::*;
172
173        match self {
174            LocalLengthError
175            | LocalProtectionError
176            | LocalAccessError
177            | LocalQueuePairOperationError
178            | LocalEecOperationError
179            | InvalidEecnError
180            | InvalidEecStateError
181            | LocalRddViolationError => LocalProgrammingError,
182
183            WorkRequestFlushError | MemoryWindowBindError => LocalResourceError,
184
185            RemoteInvalidRequestError
186            | RemoteInvalidReadRequestError
187            | RemoteAccessError
188            | RemoteOperationError
189            | RemoteAbortError => RemoteError,
190
191            RetryExceededError | RnrRetryExceededError => TransportError,
192
193            ResponseTimeoutError => Timeout,
194
195            FatalError => Fatal,
196
197            GeneralError
198            | TagMatchingError
199            | TagMatchingRendezvousIncomplete
200            | BadResponseError
201            | UnknownError => Unknown,
202        }
203    }
204
205    /// Practical debugging hint.
206    pub fn hint(self) -> &'static str {
207        use WorkErrorCode::*;
208
209        match self {
210            LocalLengthError => "SGE length exceeds MR bounds or WR length is invalid",
211
212            LocalProtectionError => {
213                "Memory region permissions do not allow this operation \
214                 (check LOCAL_WRITE / REMOTE_READ / REMOTE_WRITE flags)"
215            }
216
217            LocalAccessError => "DMA failed due to invalid or unmapped memory",
218
219            LocalQueuePairOperationError => {
220                "Work request posted in invalid QP state or illegal opcode"
221            }
222
223            WorkRequestFlushError => "QP entered error state; outstanding WRs were flushed",
224
225            MemoryWindowBindError => "Memory window bind failed (invalid MR or access flags)",
226
227            RemoteInvalidRequestError => {
228                "Remote QP rejected the request (bad rkey, addr, or opcode)"
229            }
230
231            RemoteInvalidReadRequestError => {
232                "Remote rejected RDMA read (address or length invalid)"
233            }
234
235            RemoteAccessError => "Remote memory protection violation (check rkey and permissions)",
236
237            RemoteOperationError => "Remote QP failed processing the request",
238
239            RetryExceededError => {
240                "Packet retry limit exceeded (link issue or remote QP unresponsive)"
241            }
242
243            RnrRetryExceededError => {
244                "Receiver Not Ready retry limit exceeded (remote CQ/WQ stalled)"
245            }
246
247            ResponseTimeoutError => "No response before timeout (QP stalled or fabric issue)",
248
249            FatalError => "Fatal QP or device error; QP is no longer usable",
250
251            _ => "No additional diagnostic information available",
252        }
253    }
254}