Skip to main content

converge_core/traits/
error.rs

1// Copyright 2024-2026 Reflective Labs
2// SPDX-License-Identifier: MIT
3
4//! # Capability Error Infrastructure
5//!
6//! This module defines the shared error classification interface for all capability
7//! boundary traits. It provides a uniform way to categorize, classify, and handle
8//! errors from external capabilities (LLM providers, vector stores, event stores, etc.).
9//!
10//! ## Design Philosophy
11//!
12//! - **Shared classification, distinct types:** Each capability defines its own error
13//!   enum (e.g., `LlmError`, `RecallError`, `StoreError`), but all implement
14//!   [`CapabilityError`] for uniform handling.
15//!
16//! - **Transient vs retryable distinction:**
17//!   - `is_transient()` = the underlying condition may clear without changing the request
18//!   - `is_retryable()` = it makes sense to retry given typical idempotency guarantees
19//!   - These often overlap but are semantically different. A transient error (server
20//!     temporarily overloaded) is usually retryable. But some retryable errors (conflict
21//!     after optimistic locking) are not transient—the condition won't clear on its own.
22//!
23//! - **Category enables generic handling:** [`ErrorCategory`] allows middleware (retry
24//!   policies, circuit breakers, rate limiters) to operate generically without knowing
25//!   the specific capability or error type.
26//!
27//! ## Usage
28//!
29//! Capability error types implement [`CapabilityError`]:
30//!
31//! ```ignore
32//! impl CapabilityError for LlmError {
33//!     fn category(&self) -> ErrorCategory {
34//!         match self {
35//!             LlmError::RateLimited { .. } => ErrorCategory::RateLimit,
36//!             LlmError::Timeout { .. } => ErrorCategory::Timeout,
37//!             LlmError::AuthDenied { .. } => ErrorCategory::Auth,
38//!             // ...
39//!         }
40//!     }
41//!
42//!     fn is_transient(&self) -> bool {
43//!         matches!(self, LlmError::RateLimited { .. } | LlmError::Timeout { .. })
44//!     }
45//!
46//!     fn is_retryable(&self) -> bool {
47//!         self.is_transient() // Often the same, but can differ
48//!     }
49//!
50//!     fn retry_after(&self) -> Option<Duration> {
51//!         match self {
52//!             LlmError::RateLimited { retry_after } => Some(*retry_after),
53//!             _ => None,
54//!         }
55//!     }
56//! }
57//! ```
58//!
59//! Generic retry logic can then work across capabilities (sketch only; real code should cap attempts):
60//!
61//! ```ignore
62//! async fn with_retry<T, E: CapabilityError>(
63//!     mut f: impl FnMut() -> Result<T, E>
64//! ) -> Result<T, E> {
65//!     loop {
66//!         match f() {
67//!             Ok(v) => return Ok(v),
68//!             Err(e) if e.is_retryable() => {
69//!                 if let Some(delay) = e.retry_after() {
70//!                     sleep(delay).await;
71//!                 }
72//!                 continue;
73//!             }
74//!             Err(e) => return Err(e),
75//!         }
76//!     }
77//! }
78//! ```
79
80use serde::{Deserialize, Serialize};
81use std::time::Duration;
82
83/// Classification of error conditions for generic handling.
84///
85/// This enum enables middleware (retry policies, circuit breakers, rate limiters,
86/// alerting) to operate generically without knowing the specific capability or
87/// error type.
88///
89/// # Categories
90///
91/// - [`Timeout`](Self::Timeout) - Operation exceeded time limit
92/// - [`RateLimit`](Self::RateLimit) - Too many requests, backoff required
93/// - [`Auth`](Self::Auth) - Authentication or authorization failure
94/// - [`InvalidInput`](Self::InvalidInput) - Bad request parameters
95/// - [`NotFound`](Self::NotFound) - Requested resource doesn't exist
96/// - [`Conflict`](Self::Conflict) - Resource state conflict (optimistic locking, etc.)
97/// - [`Unavailable`](Self::Unavailable) - Service temporarily unavailable
98/// - [`InvariantViolation`](Self::InvariantViolation) - System invariant broken (Converge axiom violation)
99/// - [`Internal`](Self::Internal) - Unexpected internal error
100#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
101pub enum ErrorCategory {
102    /// Operation exceeded time limit.
103    ///
104    /// Typically transient and retryable.
105    Timeout,
106
107    /// Too many requests, backoff required.
108    ///
109    /// Transient by nature. Check `retry_after()` for suggested delay.
110    RateLimit,
111
112    /// Authentication or authorization failure.
113    ///
114    /// Usually NOT transient—credentials need to be refreshed or permissions granted.
115    Auth,
116
117    /// Bad request parameters.
118    ///
119    /// NOT transient or retryable—the request itself is invalid.
120    InvalidInput,
121
122    /// Requested resource doesn't exist.
123    ///
124    /// NOT transient unless the resource might be created by another process.
125    NotFound,
126
127    /// Resource state conflict (optimistic locking, etc.).
128    ///
129    /// May be retryable (re-fetch and retry with new version) but not transient.
130    Conflict,
131
132    /// Service temporarily unavailable.
133    ///
134    /// Transient and retryable. Backend is down or unreachable.
135    Unavailable,
136
137    /// System invariant broken (Converge axiom violation).
138    ///
139    /// NEVER retryable. Indicates a bug or corruption. Should alert immediately.
140    InvariantViolation,
141
142    /// Unexpected internal error.
143    ///
144    /// May or may not be transient. Often indicates bugs or unhandled edge cases.
145    Internal,
146}
147
148impl std::fmt::Display for ErrorCategory {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        match self {
151            Self::Timeout => write!(f, "timeout"),
152            Self::RateLimit => write!(f, "rate_limit"),
153            Self::Auth => write!(f, "auth"),
154            Self::InvalidInput => write!(f, "invalid_input"),
155            Self::NotFound => write!(f, "not_found"),
156            Self::Conflict => write!(f, "conflict"),
157            Self::Unavailable => write!(f, "unavailable"),
158            Self::InvariantViolation => write!(f, "invariant_violation"),
159            Self::Internal => write!(f, "internal"),
160        }
161    }
162}
163
164/// Shared classification interface for capability errors.
165///
166/// All capability-specific error types (e.g., `LlmError`, `RecallError`, `StoreError`)
167/// implement this trait to enable uniform error handling across the system.
168///
169/// # Semantic Distinction: Transient vs Retryable
170///
171/// - **`is_transient()`**: The underlying condition may clear without changing the request.
172///   Examples: rate limiting (quota resets), timeout (server was busy), network blip.
173///
174/// - **`is_retryable()`**: It makes sense to retry the operation given typical idempotency.
175///   Examples: transient errors are usually retryable, but also conflicts (re-fetch and retry
176///   with updated version), or certain auth errors (token expired, can refresh).
177///
178/// These often overlap but serve different purposes:
179/// - A circuit breaker cares about transient errors (to detect unhealthy backends).
180/// - A retry loop cares about retryable errors (to know whether to attempt again).
181///
182/// # Implementation Notes
183///
184/// All implementations must also implement `std::error::Error` and be `Send + Sync`
185/// to ensure thread-safe error handling in async contexts.
186///
187/// # Example
188///
189/// ```ignore
190/// impl CapabilityError for MyError {
191///     fn category(&self) -> ErrorCategory {
192///         match self {
193///             Self::TimedOut => ErrorCategory::Timeout,
194///             Self::BadInput(_) => ErrorCategory::InvalidInput,
195///             // ...
196///         }
197///     }
198///
199///     fn is_transient(&self) -> bool {
200///         matches!(self.category(), ErrorCategory::Timeout | ErrorCategory::Unavailable)
201///     }
202///
203///     fn is_retryable(&self) -> bool {
204///         self.is_transient() || matches!(self.category(), ErrorCategory::Conflict)
205///     }
206///
207///     fn retry_after(&self) -> Option<Duration> {
208///         None // Override when rate limit info available
209///     }
210/// }
211/// ```
212pub trait CapabilityError: std::error::Error + Send + Sync {
213    /// Returns the category of this error for generic handling.
214    ///
215    /// Categories enable middleware to operate without knowing specific error types.
216    fn category(&self) -> ErrorCategory;
217
218    /// Returns `true` if the underlying condition may clear without changing the request.
219    ///
220    /// Transient errors indicate temporary conditions like rate limiting, network
221    /// issues, or service overload. Circuit breakers use this to detect unhealthy backends.
222    fn is_transient(&self) -> bool;
223
224    /// Returns `true` if retrying the operation makes sense given typical idempotency.
225    ///
226    /// Retryable errors include transient errors, but also cases like conflicts where
227    /// re-fetching and retrying with updated state may succeed.
228    fn is_retryable(&self) -> bool;
229
230    /// Returns the suggested delay before retrying, if known.
231    ///
232    /// Primarily used for rate limiting where the backend specifies a backoff period.
233    /// Returns `None` if no specific delay is suggested.
234    fn retry_after(&self) -> Option<Duration>;
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Every category paired with the label its `Display` impl must produce.
    /// Shared by the display and serde tests so neither can miss a variant
    /// that the other covers.
    const LABELLED: [(ErrorCategory, &str); 9] = [
        (ErrorCategory::Timeout, "timeout"),
        (ErrorCategory::RateLimit, "rate_limit"),
        (ErrorCategory::Auth, "auth"),
        (ErrorCategory::InvalidInput, "invalid_input"),
        (ErrorCategory::NotFound, "not_found"),
        (ErrorCategory::Conflict, "conflict"),
        (ErrorCategory::Unavailable, "unavailable"),
        (ErrorCategory::InvariantViolation, "invariant_violation"),
        (ErrorCategory::Internal, "internal"),
    ];

    #[test]
    fn error_category_display() {
        for (cat, label) in LABELLED {
            assert_eq!(cat.to_string(), label);
        }
    }

    #[test]
    fn error_category_equality() {
        assert_eq!(ErrorCategory::Timeout, ErrorCategory::Timeout);
        assert_ne!(ErrorCategory::Timeout, ErrorCategory::RateLimit);
    }

    #[test]
    fn error_category_serde_roundtrip() {
        for (cat, _) in LABELLED {
            let json = serde_json::to_string(&cat).unwrap();
            let back: ErrorCategory = serde_json::from_str(&json).unwrap();
            assert_eq!(cat, back);
        }
    }

    /// Minimal `CapabilityError` implementor whose classification is fully
    /// driven by its fields. `transient` and `retryable` are independent so
    /// the tests can express errors that are retryable without being
    /// transient.
    #[derive(Debug)]
    struct TestError {
        cat: ErrorCategory,
        transient: bool,
        retryable: bool,
    }

    impl std::fmt::Display for TestError {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "test error: {}", self.cat)
        }
    }

    impl std::error::Error for TestError {}

    impl CapabilityError for TestError {
        fn category(&self) -> ErrorCategory {
            self.cat
        }
        fn is_transient(&self) -> bool {
            self.transient
        }
        fn is_retryable(&self) -> bool {
            self.retryable
        }
        fn retry_after(&self) -> Option<Duration> {
            // Only rate-limit errors carry a backoff hint in this test double.
            if self.cat == ErrorCategory::RateLimit {
                Some(Duration::from_secs(5))
            } else {
                None
            }
        }
    }

    #[test]
    fn capability_error_trait_implementation() {
        let err = TestError {
            cat: ErrorCategory::RateLimit,
            transient: true,
            retryable: true,
        };
        assert_eq!(err.category(), ErrorCategory::RateLimit);
        assert!(err.is_transient());
        assert!(err.is_retryable());
        assert_eq!(err.retry_after(), Some(Duration::from_secs(5)));
    }

    #[test]
    fn non_transient_error_not_retryable() {
        let err = TestError {
            cat: ErrorCategory::InvalidInput,
            transient: false,
            retryable: false,
        };
        assert!(!err.is_transient());
        assert!(!err.is_retryable());
        assert!(err.retry_after().is_none());
    }

    /// Covers the case the trait docs single out: a conflict is worth
    /// retrying (after re-fetching) even though it will not clear on its own.
    #[test]
    fn conflict_is_retryable_but_not_transient() {
        let err = TestError {
            cat: ErrorCategory::Conflict,
            transient: false,
            retryable: true,
        };
        assert_eq!(err.category(), ErrorCategory::Conflict);
        assert!(!err.is_transient());
        assert!(err.is_retryable());
        assert!(err.retry_after().is_none());
    }
}