Skip to main content

rlmesh_grpc/
error.rs

1//! Error types for concrete RLMesh gRPC clients and servers.
2
3use std::time::Duration;
4use thiserror::Error;
5
6/// Top-level error type for rlmesh-grpc operations.
7#[derive(Debug, Error)]
8#[non_exhaustive]
9pub enum Error {
10    /// Transport-level error (connection, I/O, etc.)
11    #[error("transport error: {0}")]
12    Transport(#[from] TransportError),
13
14    /// Protocol-level error (encoding, framing, etc.)
15    #[error("protocol error: {0}")]
16    Protocol(#[from] ProtocolError),
17
18    /// Environment error (from the environment itself)
19    #[error("environment error: {0}")]
20    Environment(#[from] EnvError),
21
22    /// Model error (from a served model handler)
23    #[error("model error: {0}")]
24    Model(#[from] ModelError),
25
26    /// Operation timed out
27    #[error("timeout after {0:?}")]
28    Timeout(Duration),
29
30    /// Operation was cancelled
31    #[error("cancelled: {0}")]
32    Cancelled(String),
33
34    /// Client error
35    #[error("client error: {0}")]
36    Client(#[from] ClientError),
37}
38
39impl Error {
40    /// Check if this error is recoverable (can retry).
41    pub fn is_recoverable(&self) -> bool {
42        match self {
43            Self::Timeout(_) => true,
44            Self::Environment(error) => error.is_recoverable,
45            Self::Model(error) => error.is_recoverable,
46            Self::Transport(TransportError::Unavailable(_)) => true,
47            Self::Transport(TransportError::Status {
48                code: tonic::Code::DeadlineExceeded,
49                ..
50            }) => true,
51            Self::Transport(TransportError::Io(_)) => false,
52            Self::Transport(TransportError::ConnectionClosed) => false,
53            _ => false,
54        }
55    }
56
57    /// A permanent handshake incompatibility (protocol/edition/SHA-pin mismatch)
58    /// that retrying cannot fix, vs a transient connect error while a server binds.
59    pub fn is_fatal_handshake(&self) -> bool {
60        matches!(self, Self::Protocol(ProtocolError::HandshakeFailed(_)))
61    }
62}
63
64/// Map a `tonic::Status` from an established connection into a structured
65/// [`enum@Error`], preserving the gRPC status code so callers can distinguish a
66/// retryable condition (e.g. `Unavailable`) from a permanent one (e.g.
67/// `Unimplemented`) instead of seeing every failure as `failed to connect`.
68pub fn status_to_grpc_error(status: tonic::Status) -> Error {
69    use tonic::Code;
70
71    let code = status.code();
72    let message = status.message().to_string();
73    match code {
74        Code::Unavailable | Code::ResourceExhausted | Code::Aborted => {
75            Error::Transport(TransportError::Unavailable(message))
76        }
77        Code::Cancelled => Error::Cancelled(message),
78        // Everything else keeps the structured code. DeadlineExceeded stays
79        // recoverable without fabricating a Timeout(0ns) duration.
80        _ => Error::Transport(TransportError::Status { code, message }),
81    }
82}
83
84/// Transport-level errors.
85#[derive(Debug, Error)]
86#[non_exhaustive]
87pub enum TransportError {
88    /// I/O error
89    #[error("io error: {0}")]
90    Io(#[from] std::io::Error),
91
92    /// Connection was closed
93    #[error("connection closed")]
94    ConnectionClosed,
95
96    /// Failed to bind to address
97    #[error("failed to bind: {0}")]
98    BindFailed(String),
99
100    /// Failed to connect
101    #[error("failed to connect: {0}")]
102    ConnectFailed(String),
103
104    /// Invalid address format
105    #[error("invalid address: {0}")]
106    InvalidAddress(String),
107
108    /// Message too large
109    #[error("message too large: {size} > {max}")]
110    MessageTooLarge { size: usize, max: usize },
111
112    /// Server is temporarily unavailable (retryable).
113    #[error("server unavailable: {0}")]
114    Unavailable(String),
115
116    /// A gRPC status returned on an established connection, preserving its code.
117    #[error("grpc status {code:?}: {message}")]
118    Status {
119        /// The gRPC status code.
120        code: tonic::Code,
121        /// The status message.
122        message: String,
123    },
124}
125
126/// Protocol-level errors.
127#[derive(Debug, Error)]
128#[non_exhaustive]
129pub enum ProtocolError {
130    /// Failed to encode message
131    #[error("encode error: {0}")]
132    EncodeError(String),
133
134    /// Failed to decode message
135    #[error("decode error: {0}")]
136    DecodeError(String),
137
138    /// Handshake failed
139    #[error("handshake failed: {0}")]
140    HandshakeFailed(String),
141
142    /// Unexpected message
143    #[error("unexpected message: expected {expected}, got {actual}")]
144    UnexpectedMessage { expected: String, actual: String },
145}
146
147/// Environment errors (from the environment itself).
148#[derive(Debug, Error)]
149pub struct EnvError {
150    /// Error code
151    pub code: EnvErrorCode,
152    /// Human-readable message
153    pub message: String,
154    /// Whether this error is recoverable
155    pub is_recoverable: bool,
156    /// Debug information
157    pub debug_info: Option<String>,
158}
159
160impl std::fmt::Display for EnvError {
161    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
162        write!(f, "[{:?}] {}", self.code, self.message)
163    }
164}
165
/// Machine-readable category of an environment error.
///
/// The enum is a plain, field-less `Copy` type. It derives `Hash` in addition
/// to `Eq` so codes can be used directly as `HashMap`/`HashSet` keys (for
/// example to count failures per code).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum EnvErrorCode {
    /// Unspecified error
    Unspecified,
    /// Operation timed out
    Timeout,
    /// Invalid action
    InvalidAction,
    /// Environment not ready (needs reset)
    NotReady,
    /// Environment busy with another operation
    Busy,
    /// Internal error
    Internal,
    /// Environment crashed
    Crashed,
    /// Operation was cancelled
    Cancelled,
    /// Environment was closed
    Closed,
}
189
190impl EnvError {
191    /// Create a new environment error.
192    pub fn new(code: EnvErrorCode, message: impl Into<String>) -> Self {
193        let is_recoverable = matches!(
194            code,
195            EnvErrorCode::Timeout
196                | EnvErrorCode::InvalidAction
197                | EnvErrorCode::NotReady
198                | EnvErrorCode::Busy
199        );
200        Self {
201            code,
202            message: message.into(),
203            is_recoverable,
204            debug_info: None,
205        }
206    }
207}
208
209impl From<rlmesh_proto::env::v1::EnvErrorCode> for EnvErrorCode {
210    fn from(code: rlmesh_proto::env::v1::EnvErrorCode) -> Self {
211        use rlmesh_proto::env::v1::EnvErrorCode as ProtoCode;
212        match code {
213            ProtoCode::Unspecified => EnvErrorCode::Unspecified,
214            ProtoCode::Timeout => EnvErrorCode::Timeout,
215            ProtoCode::InvalidAction => EnvErrorCode::InvalidAction,
216            ProtoCode::NotReady => EnvErrorCode::NotReady,
217            ProtoCode::Busy => EnvErrorCode::Busy,
218            ProtoCode::Internal => EnvErrorCode::Internal,
219            ProtoCode::Crashed => EnvErrorCode::Crashed,
220            ProtoCode::Cancelled => EnvErrorCode::Cancelled,
221            ProtoCode::Closed => EnvErrorCode::Closed,
222        }
223    }
224}
225
226impl From<EnvErrorCode> for rlmesh_proto::env::v1::EnvErrorCode {
227    fn from(code: EnvErrorCode) -> Self {
228        use rlmesh_proto::env::v1::EnvErrorCode as ProtoCode;
229        match code {
230            EnvErrorCode::Unspecified => ProtoCode::Unspecified,
231            EnvErrorCode::Timeout => ProtoCode::Timeout,
232            EnvErrorCode::InvalidAction => ProtoCode::InvalidAction,
233            EnvErrorCode::NotReady => ProtoCode::NotReady,
234            EnvErrorCode::Busy => ProtoCode::Busy,
235            EnvErrorCode::Internal => ProtoCode::Internal,
236            EnvErrorCode::Crashed => ProtoCode::Crashed,
237            EnvErrorCode::Cancelled => ProtoCode::Cancelled,
238            EnvErrorCode::Closed => ProtoCode::Closed,
239        }
240    }
241}
242
243/// Model errors (from a served model handler).
244#[derive(Debug, Error)]
245pub struct ModelError {
246    /// Error code
247    pub code: ModelErrorCode,
248    /// Human-readable message
249    pub message: String,
250    /// Whether this error is recoverable
251    pub is_recoverable: bool,
252    /// Debug information
253    pub debug_info: Option<String>,
254}
255
256impl std::fmt::Display for ModelError {
257    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
258        write!(f, "[{:?}] {}", self.code, self.message)
259    }
260}
261
/// Machine-readable category of a model error.
///
/// The enum is a plain, field-less `Copy` type. It derives `Hash` in addition
/// to `Eq` so codes can be used directly as `HashMap`/`HashSet` keys (for
/// example to count failures per code).
// NOTE(review): unlike `EnvErrorCode`, this enum is not `#[non_exhaustive]`, so
// adding a variant is a breaking change for downstream exhaustive matches.
// Confirm whether that asymmetry is intended before extending it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ModelErrorCode {
    /// Unspecified error
    Unspecified,
    /// The request was invalid
    InvalidRequest,
    /// The route was not configured
    NotConfigured,
    /// The model is busy with another operation
    Busy,
    /// Internal error
    Internal,
    /// Operation was cancelled
    Cancelled,
    /// The model/route was closed
    Closed,
}
280
281impl From<rlmesh_proto::model::v1::ModelErrorCode> for ModelErrorCode {
282    fn from(code: rlmesh_proto::model::v1::ModelErrorCode) -> Self {
283        use rlmesh_proto::model::v1::ModelErrorCode as ProtoCode;
284        match code {
285            ProtoCode::Unspecified => ModelErrorCode::Unspecified,
286            ProtoCode::InvalidRequest => ModelErrorCode::InvalidRequest,
287            ProtoCode::NotConfigured => ModelErrorCode::NotConfigured,
288            ProtoCode::Busy => ModelErrorCode::Busy,
289            ProtoCode::Internal => ModelErrorCode::Internal,
290            ProtoCode::Cancelled => ModelErrorCode::Cancelled,
291            ProtoCode::Closed => ModelErrorCode::Closed,
292        }
293    }
294}
295
296/// Client-specific errors.
297#[derive(Debug, Error)]
298#[non_exhaustive]
299pub enum ClientError {
300    /// Not connected
301    #[error("not connected")]
302    NotConnected,
303
304    /// Handshake not completed
305    #[error("handshake not completed")]
306    NotHandshaked,
307}
308
309/// Result type alias for rlmesh-grpc operations.
310pub type Result<T> = std::result::Result<T, Error>;
311
#[cfg(test)]
mod status_mapping_tests {
    use super::*;
    use tonic::{Code, Status};

    /// `Unavailable` must map to the retryable transport variant and keep the
    /// server's message.
    #[test]
    fn unavailable_status_maps_to_recoverable_transport_error() {
        let mapped = status_to_grpc_error(Status::new(Code::Unavailable, "try later"));
        assert!(mapped.is_recoverable(), "Unavailable must be retryable");
        match mapped {
            Error::Transport(TransportError::Unavailable(message)) => {
                assert_eq!(message, "try later")
            }
            other => panic!("expected Unavailable transport error, got {other:?}"),
        }
    }

    /// `Unimplemented` must stay structured, be permanent, and never read like
    /// a connect failure.
    #[test]
    fn unimplemented_status_preserves_code_and_is_not_recoverable() {
        let mapped = status_to_grpc_error(Status::new(Code::Unimplemented, "no such method"));
        assert!(!mapped.is_recoverable(), "Unimplemented must be permanent");
        match mapped {
            Error::Transport(TransportError::Status { code, message }) => {
                assert_eq!(code, Code::Unimplemented);
                assert_eq!(message, "no such method");
            }
            other => panic!("expected structured Status error, got {other:?}"),
        }
        // It must not be misreported as a connect failure.
        let rendered = status_to_grpc_error(Status::new(Code::Unimplemented, "x")).to_string();
        assert!(!rendered.contains("failed to connect"));
    }

    /// `DeadlineExceeded` keeps its code, stays retryable, and does not render
    /// a fabricated zero duration.
    #[test]
    fn deadline_exceeded_status_keeps_code_and_recoverability() {
        let mapped = status_to_grpc_error(Status::new(Code::DeadlineExceeded, "slow"));
        assert!(matches!(
            mapped,
            Error::Transport(TransportError::Status {
                code: Code::DeadlineExceeded,
                ..
            })
        ));
        assert!(mapped.is_recoverable());
        let rendered = mapped.to_string();
        // The message must not fabricate a zero duration.
        assert!(!rendered.contains("0ns"));
        assert!(rendered.contains("slow"));
    }

    /// Only a failed handshake is fatal; a failed connect is not.
    #[test]
    fn handshake_failure_is_fatal_but_connect_failure_is_not() {
        let handshake =
            Error::Protocol(ProtocolError::HandshakeFailed(String::from("pin mismatch")));
        assert!(handshake.is_fatal_handshake());

        let connect = Error::Transport(TransportError::ConnectFailed(String::from("binding")));
        assert!(!connect.is_fatal_handshake());
    }
}