Skip to main content

zeph_mcp/
error.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use zeph_common::ToolName;
5
6/// Typed error code for MCP tool call retry and recovery classification.
7///
8/// Used by [`McpError::code`] and callers such as the agent retry loop to decide
9/// whether an operation should be retried, backed off, or abandoned.
10#[non_exhaustive]
11#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
12#[serde(rename_all = "snake_case")]
13pub enum McpErrorCode {
14    /// Transient error: retry is likely to succeed.
15    Transient,
16    /// Rate limited: back off and retry.
17    RateLimited,
18    /// Invalid input: do not retry without changing parameters.
19    InvalidInput,
20    /// Auth failure: re-authenticate or escalate.
21    AuthFailure,
22    /// Server error: may be transient, retry with backoff.
23    ServerError,
24    /// Not found: resource or tool does not exist.
25    NotFound,
26    /// Blocked by policy rules.
27    PolicyBlocked,
28}
29
30impl McpErrorCode {
31    /// Whether this error code suggests the operation can be retried.
32    #[must_use]
33    pub fn is_retryable(self) -> bool {
34        matches!(
35            self,
36            Self::Transient | Self::RateLimited | Self::ServerError
37        )
38    }
39}
40
/// Error type shared by every MCP operation in this crate.
///
/// The variants span connection and tool-call failures, policy rejections, OAuth
/// flows, and infrastructure errors (Qdrant, JSON serialization). Call
/// [`McpError::code`] to obtain the matching [`McpErrorCode`], if there is one, when
/// deciding whether to retry or recover.
///
/// # Examples
///
/// ```
/// use zeph_mcp::error::{McpError, McpErrorCode};
///
/// let err = McpError::Timeout {
///     server_id: "github".to_owned(),
///     tool_name: "create_issue".into(),
///     timeout_secs: 30,
/// };
/// assert_eq!(err.code(), Some(McpErrorCode::Transient));
/// assert!(err.code().unwrap().is_retryable());
/// ```
#[non_exhaustive]
#[derive(Debug, thiserror::Error)]
pub enum McpError {
    /// Connecting to the server `server_id` failed; `message` holds the detail text.
    #[error("connection failed for server '{server_id}': {message}")]
    Connection { server_id: String, message: String },

    /// A call to `tool_name` on server `server_id` failed.
    #[error("tool call failed: {server_id}/{tool_name}: {message}")]
    ToolCall {
        server_id: String,
        tool_name: ToolName,
        message: String,
        /// Classification of the failure; returned as-is by [`McpError::code`].
        code: McpErrorCode,
    },

    /// No server is known under `server_id`.
    #[error("server '{server_id}' not found")]
    ServerNotFound { server_id: String },

    /// The server `server_id` is already connected.
    #[error("server '{server_id}' is already connected")]
    ServerAlreadyConnected { server_id: String },

    /// The server `server_id` does not offer a tool named `tool_name`.
    #[error("tool '{tool_name}' not found on server '{server_id}'")]
    ToolNotFound {
        server_id: String,
        tool_name: ToolName,
    },

    /// A tool call did not complete within `timeout_secs` seconds.
    #[error("tool call timed out after {timeout_secs}s: {server_id}/{tool_name}")]
    Timeout {
        server_id: String,
        tool_name: ToolName,
        timeout_secs: u64,
    },

    /// Boxed error from the Qdrant client; convertible via `From<Box<QdrantError>>`.
    #[error("Qdrant error: {0}")]
    Qdrant(#[from] Box<qdrant_client::QdrantError>),

    /// JSON (de)serialization error from `serde_json`.
    #[error("JSON error: {0}")]
    Json(#[from] serde_json::Error),

    /// A checked integer conversion failed.
    #[error("integer conversion: {0}")]
    IntConversion(#[from] std::num::TryFromIntError),

    /// `url` was rejected because it resolves to the private or reserved address `addr`.
    #[error("SSRF blocked: URL '{url}' resolves to private/reserved IP {addr}")]
    SsrfBlocked { url: String, addr: String },

    /// `url` is not usable; `message` explains why.
    #[error("invalid URL '{url}': {message}")]
    InvalidUrl { url: String, message: String },

    /// An embedding-related failure, carried as plain text.
    #[error("embedding error: {0}")]
    Embedding(String),

    /// The MCP command `command` is not allowed.
    #[error("MCP command '{command}' not allowed")]
    CommandNotAllowed { command: String },

    /// The environment variable `var_name` may not be passed to MCP server processes.
    #[error("env var '{var_name}' is blocked for MCP server processes")]
    EnvVarBlocked { var_name: String },

    /// A policy rule was violated; the payload describes the violation.
    #[error("policy violation: {0}")]
    PolicyViolation(String),

    /// The OAuth flow for server `server_id` failed.
    #[error("OAuth error for server '{server_id}': {message}")]
    OAuthError { server_id: String, message: String },

    /// No OAuth callback arrived for server `server_id` within `timeout_secs` seconds.
    ///
    /// [`McpError::code`] reports this as [`McpErrorCode::AuthFailure`], so it is not
    /// treated as retryable even though it is a timeout.
    #[error("OAuth callback timed out for server '{server_id}' after {timeout_secs}s")]
    OAuthCallbackTimeout {
        server_id: String,
        timeout_secs: u64,
    },

    /// A tool list refresh for `server_id` was refused because the list is locked
    /// after the initial connect.
    #[error("tool list refresh rejected for '{server_id}': list is locked after initial connect")]
    ToolListLocked { server_id: String },

    /// The MCP manager is shutting down, so the connection attempt was aborted.
    ///
    /// This signals the end of the manager's lifecycle rather than a transient
    /// failure; callers must not retry.
    #[error("MCP manager is shutting down (server '{server_id}')")]
    ManagerShuttingDown { server_id: String },

    /// An HTTP 4xx response treated as an authentication or authorization failure.
    ///
    /// Non-retryable: credentials or permissions have to be fixed before another
    /// connection attempt. Maps to [`McpErrorCode::AuthFailure`].
    ///
    /// NOTE(review): the example statuses below include 404, 410 and 422, which are
    /// not auth failures in the HTTP sense, yet the message and the code mapping
    /// describe them as such — confirm this is intended.
    #[error("HTTP {status} from MCP server '{server_id}': authentication or authorization failed")]
    HttpAuth {
        server_id: String,
        /// HTTP status code (e.g. 401, 403, 404, 410, 422).
        status: u16,
    },
}
150
151impl McpError {
152    /// Return the typed error code for this error variant.
153    #[must_use]
154    pub fn code(&self) -> Option<McpErrorCode> {
155        match self {
156            Self::ToolCall { code, .. } => Some(*code),
157            Self::Timeout { .. } | Self::Connection { .. } => Some(McpErrorCode::Transient),
158            Self::ServerNotFound { .. } | Self::ToolNotFound { .. } => Some(McpErrorCode::NotFound),
159            Self::PolicyViolation(_)
160            | Self::SsrfBlocked { .. }
161            | Self::CommandNotAllowed { .. }
162            | Self::EnvVarBlocked { .. } => Some(McpErrorCode::PolicyBlocked),
163            Self::OAuthError { .. } | Self::OAuthCallbackTimeout { .. } | Self::HttpAuth { .. } => {
164                Some(McpErrorCode::AuthFailure)
165            }
166            Self::InvalidUrl { .. } | Self::ToolListLocked { .. } => {
167                Some(McpErrorCode::InvalidInput)
168            }
169            Self::Embedding(_) => Some(McpErrorCode::ServerError),
170            // Lifecycle, infrastructure, and structural errors have no taxonomy code.
171            Self::ManagerShuttingDown { .. }
172            | Self::ServerAlreadyConnected { .. }
173            | Self::Qdrant(_)
174            | Self::Json(_)
175            | Self::IntConversion(_) => None,
176        }
177    }
178}
179
#[cfg(test)]
mod tests {
    //! Unit tests pinning the `Display` output of [`McpError`] variants and the
    //! [`McpError::code`] / [`McpErrorCode::is_retryable`] classification.

    use super::*;

    #[test]
    fn connection_error_display() {
        let err = McpError::Connection {
            server_id: "github".into(),
            message: "refused".into(),
        };
        assert_eq!(
            err.to_string(),
            "connection failed for server 'github': refused"
        );
    }

    #[test]
    fn tool_call_error_display() {
        let err = McpError::ToolCall {
            server_id: "fs".into(),
            tool_name: "read_file".into(),
            message: "not found".into(),
            code: McpErrorCode::ServerError,
        };
        assert_eq!(err.to_string(), "tool call failed: fs/read_file: not found");
    }

    #[test]
    fn error_code_is_retryable() {
        assert!(McpErrorCode::Transient.is_retryable());
        assert!(McpErrorCode::RateLimited.is_retryable());
        assert!(McpErrorCode::ServerError.is_retryable());
        assert!(!McpErrorCode::InvalidInput.is_retryable());
        assert!(!McpErrorCode::AuthFailure.is_retryable());
        assert!(!McpErrorCode::NotFound.is_retryable());
        assert!(!McpErrorCode::PolicyBlocked.is_retryable());
    }

    #[test]
    fn mcp_error_code_method() {
        let err = McpError::ToolCall {
            server_id: "s".into(),
            tool_name: "t".into(),
            message: "e".into(),
            code: McpErrorCode::RateLimited,
        };
        assert_eq!(err.code(), Some(McpErrorCode::RateLimited));

        let timeout = McpError::Timeout {
            server_id: "s".into(),
            tool_name: "t".into(),
            timeout_secs: 30,
        };
        assert_eq!(timeout.code(), Some(McpErrorCode::Transient));

        let policy = McpError::PolicyViolation("denied".into());
        assert_eq!(policy.code(), Some(McpErrorCode::PolicyBlocked));
    }

    #[test]
    fn server_not_found_display() {
        let err = McpError::ServerNotFound {
            server_id: "missing".into(),
        };
        assert_eq!(err.to_string(), "server 'missing' not found");
    }

    #[test]
    fn tool_not_found_display() {
        let err = McpError::ToolNotFound {
            server_id: "fs".into(),
            tool_name: "delete".into(),
        };
        assert_eq!(err.to_string(), "tool 'delete' not found on server 'fs'");
    }

    #[test]
    fn server_already_connected_display() {
        let err = McpError::ServerAlreadyConnected {
            server_id: "github".into(),
        };
        assert_eq!(err.to_string(), "server 'github' is already connected");
    }

    #[test]
    fn timeout_error_display() {
        let err = McpError::Timeout {
            server_id: "slow".into(),
            tool_name: "query".into(),
            timeout_secs: 30,
        };
        assert_eq!(err.to_string(), "tool call timed out after 30s: slow/query");
    }

    #[test]
    fn http_auth_error_display_and_code() {
        let err = McpError::HttpAuth {
            server_id: "remote".into(),
            status: 401,
        };
        assert_eq!(
            err.to_string(),
            "HTTP 401 from MCP server 'remote': authentication or authorization failed"
        );
        assert_eq!(err.code(), Some(McpErrorCode::AuthFailure));
        assert!(!err.code().unwrap().is_retryable());
    }

    #[test]
    fn http_auth_403_not_retryable() {
        let err = McpError::HttpAuth {
            server_id: "api".into(),
            status: 403,
        };
        assert_eq!(err.code(), Some(McpErrorCode::AuthFailure));
        assert!(!err.code().unwrap().is_retryable());
    }

    #[test]
    fn http_auth_404_not_retryable() {
        let err = McpError::HttpAuth {
            server_id: "srv".into(),
            status: 404,
        };
        assert_eq!(err.code(), Some(McpErrorCode::AuthFailure));
        assert!(!err.code().unwrap().is_retryable());
    }

    #[test]
    fn http_auth_410_not_retryable() {
        let err = McpError::HttpAuth {
            server_id: "srv".into(),
            status: 410,
        };
        assert_eq!(err.code(), Some(McpErrorCode::AuthFailure));
        assert!(!err.code().unwrap().is_retryable());
    }

    #[test]
    fn http_auth_422_not_retryable() {
        let err = McpError::HttpAuth {
            server_id: "srv".into(),
            status: 422,
        };
        assert_eq!(err.code(), Some(McpErrorCode::AuthFailure));
        assert!(!err.code().unwrap().is_retryable());
    }

    #[test]
    fn handshake_timeout_has_initialize_tool_name() {
        let err = McpError::Timeout {
            server_id: "my-server".into(),
            tool_name: "initialize".into(),
            timeout_secs: 10,
        };
        assert_eq!(
            err.to_string(),
            "tool call timed out after 10s: my-server/initialize"
        );
        assert_eq!(err.code(), Some(McpErrorCode::Transient));
    }

    #[test]
    fn list_tools_timeout_has_tools_list_tool_name() {
        let err = McpError::Timeout {
            server_id: "my-server".into(),
            tool_name: "tools/list".into(),
            timeout_secs: 30,
        };
        assert_eq!(
            err.to_string(),
            "tool call timed out after 30s: my-server/tools/list"
        );
        assert_eq!(err.code(), Some(McpErrorCode::Transient));
    }

    // The tests below pin `code()` branches that the tests above do not reach.

    #[test]
    fn connection_error_is_transient_and_retryable() {
        let err = McpError::Connection {
            server_id: "github".into(),
            message: "refused".into(),
        };
        assert_eq!(err.code(), Some(McpErrorCode::Transient));
        assert!(err.code().unwrap().is_retryable());
    }

    #[test]
    fn not_found_variants_map_to_not_found() {
        let server = McpError::ServerNotFound {
            server_id: "missing".into(),
        };
        assert_eq!(server.code(), Some(McpErrorCode::NotFound));

        let tool = McpError::ToolNotFound {
            server_id: "fs".into(),
            tool_name: "delete".into(),
        };
        assert_eq!(tool.code(), Some(McpErrorCode::NotFound));
    }

    #[test]
    fn policy_variants_map_to_policy_blocked() {
        let ssrf = McpError::SsrfBlocked {
            url: "http://internal".into(),
            addr: "10.0.0.1".into(),
        };
        assert_eq!(
            ssrf.to_string(),
            "SSRF blocked: URL 'http://internal' resolves to private/reserved IP 10.0.0.1"
        );
        assert_eq!(ssrf.code(), Some(McpErrorCode::PolicyBlocked));

        let command = McpError::CommandNotAllowed {
            command: "rm".into(),
        };
        assert_eq!(command.to_string(), "MCP command 'rm' not allowed");
        assert_eq!(command.code(), Some(McpErrorCode::PolicyBlocked));

        let env = McpError::EnvVarBlocked {
            var_name: "LD_PRELOAD".into(),
        };
        assert_eq!(
            env.to_string(),
            "env var 'LD_PRELOAD' is blocked for MCP server processes"
        );
        assert_eq!(env.code(), Some(McpErrorCode::PolicyBlocked));
    }

    #[test]
    fn oauth_variants_map_to_auth_failure() {
        let oauth = McpError::OAuthError {
            server_id: "remote".into(),
            message: "invalid_grant".into(),
        };
        assert_eq!(
            oauth.to_string(),
            "OAuth error for server 'remote': invalid_grant"
        );
        assert_eq!(oauth.code(), Some(McpErrorCode::AuthFailure));

        // A callback timeout is classified as an auth failure, not as transient.
        let callback = McpError::OAuthCallbackTimeout {
            server_id: "remote".into(),
            timeout_secs: 120,
        };
        assert_eq!(
            callback.to_string(),
            "OAuth callback timed out for server 'remote' after 120s"
        );
        assert_eq!(callback.code(), Some(McpErrorCode::AuthFailure));
        assert!(!callback.code().unwrap().is_retryable());
    }

    #[test]
    fn invalid_url_and_locked_tool_list_map_to_invalid_input() {
        let url = McpError::InvalidUrl {
            url: "::".into(),
            message: "no scheme".into(),
        };
        assert_eq!(url.to_string(), "invalid URL '::': no scheme");
        assert_eq!(url.code(), Some(McpErrorCode::InvalidInput));

        let locked = McpError::ToolListLocked {
            server_id: "fs".into(),
        };
        assert_eq!(
            locked.to_string(),
            "tool list refresh rejected for 'fs': list is locked after initial connect"
        );
        assert_eq!(locked.code(), Some(McpErrorCode::InvalidInput));
    }

    #[test]
    fn embedding_error_maps_to_server_error() {
        let err = McpError::Embedding("model unavailable".into());
        assert_eq!(err.to_string(), "embedding error: model unavailable");
        assert_eq!(err.code(), Some(McpErrorCode::ServerError));
        assert!(err.code().unwrap().is_retryable());
    }

    #[test]
    fn lifecycle_and_infrastructure_errors_have_no_code() {
        let shutting_down = McpError::ManagerShuttingDown {
            server_id: "fs".into(),
        };
        assert_eq!(
            shutting_down.to_string(),
            "MCP manager is shutting down (server 'fs')"
        );
        assert_eq!(shutting_down.code(), None);

        let already = McpError::ServerAlreadyConnected {
            server_id: "github".into(),
        };
        assert_eq!(already.code(), None);

        // Both conversions go through the `#[from]` impls on the enum.
        let int: McpError = u8::try_from(300_u32).unwrap_err().into();
        assert!(int.to_string().starts_with("integer conversion: "));
        assert_eq!(int.code(), None);

        let json: McpError = serde_json::from_str::<serde_json::Value>("{")
            .unwrap_err()
            .into();
        assert!(json.to_string().starts_with("JSON error: "));
        assert_eq!(json.code(), None);
    }

    #[test]
    fn error_code_serializes_as_snake_case() {
        let encoded = serde_json::to_string(&McpErrorCode::RateLimited).unwrap();
        assert_eq!(encoded, "\"rate_limited\"");

        let decoded: McpErrorCode = serde_json::from_str("\"policy_blocked\"").unwrap();
        assert_eq!(decoded, McpErrorCode::PolicyBlocked);
    }
}