converge_core/traits/error.rs
1// Copyright 2024-2026 Reflective Labs
2// SPDX-License-Identifier: MIT
3
4//! # Capability Error Infrastructure
5//!
6//! This module defines the shared error classification interface for all capability
7//! boundary traits. It provides a uniform way to categorize, classify, and handle
8//! errors from external capabilities (LLM providers, vector stores, event stores, etc.).
9//!
10//! ## Design Philosophy
11//!
12//! - **Shared classification, distinct types:** Each capability defines its own error
13//! enum (e.g., `LlmError`, `RecallError`, `StoreError`), but all implement
14//! [`CapabilityError`] for uniform handling.
15//!
16//! - **Transient vs retryable distinction:**
17//! - `is_transient()` = the underlying condition may clear without changing the request
18//! - `is_retryable()` = it makes sense to retry given typical idempotency guarantees
19//! - These often overlap but are semantically different. A transient error (server
20//! temporarily overloaded) is usually retryable. But some retryable errors (conflict
21//! after optimistic locking) are not transient—the condition won't clear on its own.
22//!
23//! - **Category enables generic handling:** [`ErrorCategory`] allows middleware (retry
24//! policies, circuit breakers, rate limiters) to operate generically without knowing
25//! the specific capability or error type.
26//!
27//! ## Usage
28//!
29//! Capability error types implement [`CapabilityError`]:
30//!
31//! ```ignore
32//! impl CapabilityError for LlmError {
33//! fn category(&self) -> ErrorCategory {
34//! match self {
35//! LlmError::RateLimited { .. } => ErrorCategory::RateLimit,
36//! LlmError::Timeout { .. } => ErrorCategory::Timeout,
37//! LlmError::AuthDenied { .. } => ErrorCategory::Auth,
38//! // ...
39//! }
40//! }
41//!
42//! fn is_transient(&self) -> bool {
43//! matches!(self, LlmError::RateLimited { .. } | LlmError::Timeout { .. })
44//! }
45//!
46//! fn is_retryable(&self) -> bool {
47//! self.is_transient() // Often the same, but can differ
48//! }
49//!
50//! fn retry_after(&self) -> Option<Duration> {
51//! match self {
52//! LlmError::RateLimited { retry_after } => Some(*retry_after),
53//! _ => None,
54//! }
55//! }
56//! }
57//! ```
58//!
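//! Generic retry logic can then work across capabilities. The sketch below is
//! deliberately simplified: a production loop should also cap the number of attempts
//! and apply a default backoff when `retry_after()` returns `None`.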
60//!
61//! ```ignore
62//! async fn with_retry<T, E: CapabilityError>(
63//! mut f: impl FnMut() -> Result<T, E>
64//! ) -> Result<T, E> {
65//! loop {
66//! match f() {
67//! Ok(v) => return Ok(v),
68//! Err(e) if e.is_retryable() => {
69//! if let Some(delay) = e.retry_after() {
70//! sleep(delay).await;
71//! }
72//! continue;
73//! }
74//! Err(e) => return Err(e),
75//! }
76//! }
77//! }
78//! ```
79
80use serde::{Deserialize, Serialize};
81use std::time::Duration;
82
83/// Classification of error conditions for generic handling.
84///
85/// This enum enables middleware (retry policies, circuit breakers, rate limiters,
86/// alerting) to operate generically without knowing the specific capability or
87/// error type.
88///
89/// # Categories
90///
91/// - [`Timeout`](Self::Timeout) - Operation exceeded time limit
92/// - [`RateLimit`](Self::RateLimit) - Too many requests, backoff required
93/// - [`Auth`](Self::Auth) - Authentication or authorization failure
94/// - [`InvalidInput`](Self::InvalidInput) - Bad request parameters
95/// - [`NotFound`](Self::NotFound) - Requested resource doesn't exist
96/// - [`Conflict`](Self::Conflict) - Resource state conflict (optimistic locking, etc.)
97/// - [`Unavailable`](Self::Unavailable) - Service temporarily unavailable
98/// - [`InvariantViolation`](Self::InvariantViolation) - System invariant broken (Converge axiom violation)
99/// - [`Internal`](Self::Internal) - Unexpected internal error
100#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
101pub enum ErrorCategory {
102 /// Operation exceeded time limit.
103 ///
104 /// Typically transient and retryable.
105 Timeout,
106
107 /// Too many requests, backoff required.
108 ///
109 /// Transient by nature. Check `retry_after()` for suggested delay.
110 RateLimit,
111
112 /// Authentication or authorization failure.
113 ///
114 /// Usually NOT transient—credentials need to be refreshed or permissions granted.
115 Auth,
116
117 /// Bad request parameters.
118 ///
119 /// NOT transient or retryable—the request itself is invalid.
120 InvalidInput,
121
122 /// Requested resource doesn't exist.
123 ///
124 /// NOT transient unless the resource might be created by another process.
125 NotFound,
126
127 /// Resource state conflict (optimistic locking, etc.).
128 ///
129 /// May be retryable (re-fetch and retry with new version) but not transient.
130 Conflict,
131
132 /// Service temporarily unavailable.
133 ///
134 /// Transient and retryable. Backend is down or unreachable.
135 Unavailable,
136
137 /// System invariant broken (Converge axiom violation).
138 ///
139 /// NEVER retryable. Indicates a bug or corruption. Should alert immediately.
140 InvariantViolation,
141
142 /// Unexpected internal error.
143 ///
144 /// May or may not be transient. Often indicates bugs or unhandled edge cases.
145 Internal,
146}
147
148impl std::fmt::Display for ErrorCategory {
149 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150 match self {
151 Self::Timeout => write!(f, "timeout"),
152 Self::RateLimit => write!(f, "rate_limit"),
153 Self::Auth => write!(f, "auth"),
154 Self::InvalidInput => write!(f, "invalid_input"),
155 Self::NotFound => write!(f, "not_found"),
156 Self::Conflict => write!(f, "conflict"),
157 Self::Unavailable => write!(f, "unavailable"),
158 Self::InvariantViolation => write!(f, "invariant_violation"),
159 Self::Internal => write!(f, "internal"),
160 }
161 }
162}
163
164/// Shared classification interface for capability errors.
165///
166/// All capability-specific error types (e.g., `LlmError`, `RecallError`, `StoreError`)
167/// implement this trait to enable uniform error handling across the system.
168///
169/// # Semantic Distinction: Transient vs Retryable
170///
171/// - **`is_transient()`**: The underlying condition may clear without changing the request.
172/// Examples: rate limiting (quota resets), timeout (server was busy), network blip.
173///
174/// - **`is_retryable()`**: It makes sense to retry the operation given typical idempotency.
175/// Examples: transient errors are usually retryable, but also conflicts (re-fetch and retry
176/// with updated version), or certain auth errors (token expired, can refresh).
177///
178/// These often overlap but serve different purposes:
179/// - A circuit breaker cares about transient errors (to detect unhealthy backends).
180/// - A retry loop cares about retryable errors (to know whether to attempt again).
181///
182/// # Implementation Notes
183///
184/// All implementations must also implement `std::error::Error` and be `Send + Sync`
185/// to ensure thread-safe error handling in async contexts.
186///
187/// # Example
188///
189/// ```ignore
190/// impl CapabilityError for MyError {
191/// fn category(&self) -> ErrorCategory {
192/// match self {
193/// Self::TimedOut => ErrorCategory::Timeout,
194/// Self::BadInput(_) => ErrorCategory::InvalidInput,
195/// // ...
196/// }
197/// }
198///
199/// fn is_transient(&self) -> bool {
200/// matches!(self.category(), ErrorCategory::Timeout | ErrorCategory::Unavailable)
201/// }
202///
203/// fn is_retryable(&self) -> bool {
204/// self.is_transient() || matches!(self.category(), ErrorCategory::Conflict)
205/// }
206///
207/// fn retry_after(&self) -> Option<Duration> {
208/// None // Override when rate limit info available
209/// }
210/// }
211/// ```
212pub trait CapabilityError: std::error::Error + Send + Sync {
213 /// Returns the category of this error for generic handling.
214 ///
215 /// Categories enable middleware to operate without knowing specific error types.
216 fn category(&self) -> ErrorCategory;
217
218 /// Returns `true` if the underlying condition may clear without changing the request.
219 ///
220 /// Transient errors indicate temporary conditions like rate limiting, network
221 /// issues, or service overload. Circuit breakers use this to detect unhealthy backends.
222 fn is_transient(&self) -> bool;
223
224 /// Returns `true` if retrying the operation makes sense given typical idempotency.
225 ///
226 /// Retryable errors include transient errors, but also cases like conflicts where
227 /// re-fetching and retrying with updated state may succeed.
228 fn is_retryable(&self) -> bool;
229
230 /// Returns the suggested delay before retrying, if known.
231 ///
232 /// Primarily used for rate limiting where the backend specifies a backoff period.
233 /// Returns `None` if no specific delay is suggested.
234 fn retry_after(&self) -> Option<Duration>;
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Each category must render as its expected lowercase label.
    #[test]
    fn error_category_display() {
        let expected_labels = [
            (ErrorCategory::Timeout, "timeout"),
            (ErrorCategory::RateLimit, "rate_limit"),
            (ErrorCategory::Auth, "auth"),
            (ErrorCategory::InvalidInput, "invalid_input"),
            (ErrorCategory::NotFound, "not_found"),
            (ErrorCategory::Conflict, "conflict"),
            (ErrorCategory::Unavailable, "unavailable"),
            (ErrorCategory::InvariantViolation, "invariant_violation"),
            (ErrorCategory::Internal, "internal"),
        ];
        for (category, label) in expected_labels {
            assert_eq!(category.to_string(), label);
        }
    }

    /// Identical variants compare equal; different variants do not.
    #[test]
    fn error_category_equality() {
        let timeout = ErrorCategory::Timeout;
        assert_eq!(timeout, ErrorCategory::Timeout);
        assert_ne!(timeout, ErrorCategory::RateLimit);
    }

    /// Serializing to JSON and back must yield the same variant for every category.
    #[test]
    fn error_category_serde_roundtrip() {
        for original in [
            ErrorCategory::Timeout,
            ErrorCategory::RateLimit,
            ErrorCategory::Auth,
            ErrorCategory::InvalidInput,
            ErrorCategory::NotFound,
            ErrorCategory::Conflict,
            ErrorCategory::Unavailable,
            ErrorCategory::InvariantViolation,
            ErrorCategory::Internal,
        ] {
            let encoded = serde_json::to_string(&original).unwrap();
            let decoded = serde_json::from_str::<ErrorCategory>(&encoded).unwrap();
            assert_eq!(original, decoded);
        }
    }

    /// Minimal `CapabilityError` implementor whose answers are driven by its fields.
    #[derive(Debug)]
    struct StubError {
        category: ErrorCategory,
        transient: bool,
    }

    /// Builds a `StubError` fixture.
    fn stub(category: ErrorCategory, transient: bool) -> StubError {
        StubError {
            category,
            transient,
        }
    }

    impl std::fmt::Display for StubError {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "test error: {}", self.category)
        }
    }

    impl std::error::Error for StubError {}

    impl CapabilityError for StubError {
        fn category(&self) -> ErrorCategory {
            self.category
        }

        fn is_transient(&self) -> bool {
            self.transient
        }

        // The stub treats "retryable" and "transient" as the same flag.
        fn is_retryable(&self) -> bool {
            self.transient
        }

        // Only the rate-limit stub advertises a delay.
        fn retry_after(&self) -> Option<Duration> {
            match self.category {
                ErrorCategory::RateLimit => Some(Duration::from_secs(5)),
                _ => None,
            }
        }
    }

    /// A transient rate-limit stub reports every trait method consistently.
    #[test]
    fn capability_error_trait_implementation() {
        let rate_limited = stub(ErrorCategory::RateLimit, true);

        assert_eq!(rate_limited.category(), ErrorCategory::RateLimit);
        assert!(rate_limited.is_transient());
        assert!(rate_limited.is_retryable());
        assert_eq!(rate_limited.retry_after(), Some(Duration::from_secs(5)));
    }

    /// A non-transient invalid-input stub is not retryable and suggests no delay.
    #[test]
    fn non_transient_error_not_retryable() {
        let invalid = stub(ErrorCategory::InvalidInput, false);

        assert!(!invalid.is_transient());
        assert!(!invalid.is_retryable());
        assert!(invalid.retry_after().is_none());
    }
}
338}