multi_llm/
error.rs

1//! Error types for LLM operations.
2//!
3//! This module provides structured error handling for multi-llm operations,
4//! including categorization, severity levels, and retry guidance.
5//!
6//! # Error Types
7//!
8//! The main error type is [`LlmError`], which covers all failure modes:
9//! - Configuration errors (missing API keys, invalid settings)
10//! - Request failures (network issues, provider errors)
11//! - Rate limiting and timeouts
12//! - Authentication failures
13//! - Token limit exceeded
14//! - Tool execution failures
15//!
16//! # Error Handling Example
17//!
18//! ```rust,no_run
19//! use multi_llm::{LlmError, LlmResult};
20//!
21//! fn handle_error(err: LlmError) {
22//!     // Check if we should retry
23//!     if err.is_retryable() {
24//!         println!("Retryable error: {}", err);
25//!         // Implement retry logic...
26//!     }
27//!
28//!     // Get user-friendly message
29//!     let user_msg = err.user_message();
30//!     println!("Tell user: {}", user_msg);
31//!
32//!     // Check error category for routing
33//!     match err.category() {
34//!         multi_llm::error::ErrorCategory::Transient => {
35//!             println!("Temporary issue, try again later");
36//!         }
37//!         multi_llm::error::ErrorCategory::Client => {
38//!             println!("Fix the request and try again");
39//!         }
40//!         _ => {
41//!             println!("System issue, contact support");
42//!         }
43//!     }
44//! }
45//! ```
46//!
47//! # Result Type
48//!
49//! Use [`LlmResult<T>`] as a convenient alias for `Result<T, LlmError>`:
50//!
51//! ```rust
52//! use multi_llm::LlmResult;
53//!
54//! fn my_function() -> LlmResult<String> {
55//!     Ok("Success".to_string())
56//! }
57//! ```
58
59use crate::logging::{log_error, log_warn};
60use thiserror::Error;
61
62// ============================================================================
63// Error categorization types
64// ============================================================================
65
/// High-level categorization of errors for routing and handling decisions.
///
/// Use [`LlmError::category()`] to get the category for any error.
///
/// The type is `Copy` and implements `Eq` + `Hash`, so it can be used directly
/// as a key in maps and sets (for example, to count errors per category).
/// [`ErrorCategory::External`] is the [`Default`] value.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching on it.
///
/// # Example
///
/// ```rust,no_run
/// use multi_llm::{LlmError, error::ErrorCategory};
///
/// fn should_alert_ops(err: &LlmError) -> bool {
///     matches!(err.category(), ErrorCategory::Internal | ErrorCategory::External)
/// }
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[non_exhaustive]
pub enum ErrorCategory {
    /// Expected business logic outcomes (not typically errors).
    ///
    /// These are "errors" that represent normal application flow,
    /// like "user not found" when checking if a user exists.
    BusinessLogic,

    /// External service failures (LLM providers, network issues).
    ///
    /// The LLM provider or network had an issue. May be transient
    /// or indicate a provider outage.
    ///
    /// This is the default category.
    #[default]
    External,

    /// Internal system errors (bugs, invariant violations).
    ///
    /// Something went wrong in the code itself. These should be
    /// logged and investigated.
    Internal,

    /// Client errors (invalid input, authentication, configuration).
    ///
    /// The caller made a mistake that they can fix (wrong API key,
    /// invalid parameters, etc.).
    Client,

    /// Temporary failures that should be retried.
    ///
    /// Rate limits, timeouts, and other transient issues. Retry
    /// with exponential backoff.
    Transient,
}
113
/// Severity level for logging and alerting decisions.
///
/// Use [`LlmError::severity()`] to get the severity for any error.
///
/// The type is `Copy` and implements `Eq` + `Hash`, so it can be used as a
/// map or set key (for example, to bucket error counts by severity).
/// [`ErrorSeverity::Error`] is the [`Default`] value. No ordering is derived;
/// compare against specific variants rather than using `<` / `>`.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[non_exhaustive]
pub enum ErrorSeverity {
    /// System is unusable or data integrity is at risk.
    ///
    /// Requires immediate attention. Page on-call if configured.
    Critical,

    /// Action failed but system is stable.
    ///
    /// Should be logged and investigated but not urgent.
    ///
    /// This is the default severity.
    #[default]
    Error,

    /// Unexpected but recoverable situation.
    ///
    /// Worth logging for monitoring but may not require action.
    Warning,

    /// Expected failure (e.g., not found, validation error).
    ///
    /// Normal operation, log at info/debug level.
    Info,
}
141
142/// User-facing error categories for conversation flow control.
143///
144/// When a tool execution fails, this category helps the LLM understand
145/// how to respond to the user and what actions might help.
146///
147/// # Example
148///
149/// ```rust
150/// use multi_llm::{ToolResult, error::UserErrorCategory};
151///
152/// // User needs to complete a prerequisite first
153/// let result = ToolResult {
154///     tool_call_id: "call_123".to_string(),
155///     content: "Please log in first".to_string(),
156///     is_error: true,
157///     error_category: Some(UserErrorCategory::WorkflowDependency),
158/// };
159/// ```
160#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
161#[non_exhaustive]
162pub enum UserErrorCategory {
163    /// User needs to complete a prerequisite action.
164    ///
165    /// Example: "You need to log in before accessing your profile."
166    WorkflowDependency,
167
168    /// Request is missing required context/parameters.
169    ///
170    /// Example: "Please specify which city you want weather for."
171    MissingContext,
172
173    /// Requested item/resource not found.
174    ///
175    /// Example: "I couldn't find a user with that email address."
176    NotFound,
177
178    /// Attempting to create something that already exists.
179    ///
180    /// Example: "An account with that email already exists."
181    Duplicate,
182
183    /// Input validation failed.
184    ///
185    /// Example: "That doesn't look like a valid email address."
186    Validation,
187
188    /// Technical/system error - don't expose details to user.
189    ///
190    /// Example: "Something went wrong. Please try again later."
191    Technical,
192}
193
194// ============================================================================
195// LLM Error types
196// ============================================================================
197
/// Convenient result type for LLM operations.
///
/// Alias for `Result<T, LlmError>`. Use this throughout your application
/// for consistent error handling.
///
/// Because this is a plain type alias (not a new type), an `LlmResult<T>` is
/// interchangeable with `std::result::Result<T, LlmError>` and supports the
/// `?` operator and every `Result` combinator.
///
/// # Example
///
/// ```rust
/// use multi_llm::LlmResult;
///
/// fn process_response(text: &str) -> LlmResult<String> {
///     if text.is_empty() {
///         return Err(multi_llm::LlmError::response_parsing_error("Empty response"));
///     }
///     Ok(text.to_uppercase())
/// }
/// ```
pub type LlmResult<T> = std::result::Result<T, LlmError>;
216
/// Errors that can occur during LLM operations.
///
/// This enum covers all error conditions you might encounter when using multi-llm.
/// Each variant includes relevant context and can be:
/// - Categorized via [`category()`](Self::category)
/// - Assessed for severity via [`severity()`](Self::severity)
/// - Checked for retryability via [`is_retryable()`](Self::is_retryable)
/// - Converted to user-friendly messages via [`user_message()`](Self::user_message)
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute
/// and includes the variant's fields; prefer
/// [`user_message()`](Self::user_message) for text shown to end users.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching on it.
///
/// # Creating Errors
///
/// Use the constructor methods which automatically log the error:
///
/// ```rust
/// use multi_llm::LlmError;
///
/// // These methods log automatically
/// let err = LlmError::configuration_error("Missing API key");
/// let err = LlmError::rate_limit_exceeded(60);
/// let err = LlmError::timeout(30);
/// ```
///
/// # Error Categories
///
/// The columns mirror [`category()`](Self::category),
/// [`severity()`](Self::severity) and [`is_retryable()`](Self::is_retryable).
///
/// | Variant | Category | Severity | Retryable |
/// |---------|----------|----------|-----------|
/// | `UnsupportedProvider` | Client | Error | No |
/// | `ConfigurationError` | Client | Error | No |
/// | `RequestFailed` | External | Error | Yes |
/// | `ResponseParsingError` | External | Warning | No |
/// | `RateLimitExceeded` | Transient | Warning | Yes |
/// | `Timeout` | Transient | Warning | Yes |
/// | `AuthenticationFailed` | Client | Error | No |
/// | `TokenLimitExceeded` | Client | Info | No |
/// | `ToolExecutionFailed` | External | Error | No |
/// | `SchemaValidationFailed` | Client | Warning | No |
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum LlmError {
    /// The specified provider is not supported.
    ///
    /// Supported providers: "anthropic", "openai", "ollama", "lmstudio"
    #[error("Provider not supported: {provider}")]
    UnsupportedProvider {
        /// The provider name that was requested.
        provider: String,
    },

    /// Provider configuration is invalid or incomplete.
    ///
    /// Common causes:
    /// - Missing API key for providers that require one
    /// - Invalid base URL format
    /// - Incompatible configuration values
    #[error("Provider configuration error: {message}")]
    ConfigurationError {
        /// Description of the configuration problem.
        message: String,
    },

    /// The HTTP request to the provider failed.
    ///
    /// This is a general failure that may be retryable. Check the source
    /// error for more details about the underlying cause.
    #[error("Request failed: {message}")]
    RequestFailed {
        /// Description of the failure.
        message: String,
        /// The underlying error, if available.
        ///
        /// Marked `#[source]`, so when present it is what
        /// `std::error::Error::source()` returns for this variant.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Failed to parse the provider's response.
    ///
    /// The provider returned a response, but it couldn't be parsed.
    /// This might indicate a provider API change or malformed response.
    #[error("Response parsing failed: {message}")]
    ResponseParsingError {
        /// Details about the parsing failure.
        message: String,
    },

    /// Provider rate limit exceeded.
    ///
    /// The provider is throttling requests. Wait the indicated time
    /// before retrying. Consider implementing exponential backoff.
    #[error("Rate limit exceeded, retry after {retry_after_seconds}s")]
    RateLimitExceeded {
        /// Recommended wait time before retrying, in seconds.
        retry_after_seconds: u64,
    },

    /// Request timed out.
    ///
    /// The provider didn't respond within the configured timeout.
    /// This is usually retryable but may indicate an overloaded provider.
    #[error("Request timed out after {timeout_seconds}s")]
    Timeout {
        /// The timeout duration that was exceeded, in seconds.
        timeout_seconds: u64,
    },

    /// Authentication with the provider failed.
    ///
    /// Check your API key or credentials. This is not retryable without
    /// fixing the authentication.
    #[error("Authentication failed: {message}")]
    AuthenticationFailed {
        /// Details about the authentication failure.
        message: String,
    },

    /// Request exceeds the model's token limit.
    ///
    /// The combined input (messages + tools) is too large for the model's
    /// context window. Reduce the input size or use a model with larger context.
    #[error("Token limit exceeded: {current} > {max}")]
    TokenLimitExceeded {
        /// The actual token count of the request.
        current: usize,
        /// The maximum allowed tokens for the model.
        max: usize,
    },

    /// A tool execution failed.
    ///
    /// The tool was called but couldn't complete successfully.
    /// Check the message for details about why the tool failed.
    #[error("Tool execution failed: {tool_name} - {message}")]
    ToolExecutionFailed {
        /// The name of the tool that failed.
        tool_name: String,
        /// Details about the failure.
        message: String,
    },

    /// Response doesn't match the requested JSON schema.
    ///
    /// When using structured output, the model's response didn't conform
    /// to the provided JSON schema. May require a clearer prompt or
    /// different schema design.
    #[error("JSON schema validation failed: {message}")]
    SchemaValidationFailed {
        /// Details about the validation failure.
        message: String,
    },
}
365
366impl LlmError {
367    /// Get the error category for routing and handling decisions.
368    ///
369    /// Use this to determine how to handle different types of errors:
370    /// - `Client`: Fix the request (invalid input, auth, config)
371    /// - `External`: Provider issue, may need ops attention
372    /// - `Transient`: Retry with backoff
373    ///
374    /// # Example
375    ///
376    /// ```rust,no_run
377    /// use multi_llm::{LlmError, error::ErrorCategory};
378    ///
379    /// fn handle(err: LlmError) {
380    ///     match err.category() {
381    ///         ErrorCategory::Transient => {
382    ///             // Implement retry logic
383    ///         }
384    ///         ErrorCategory::Client => {
385    ///             // User can fix this, show helpful message
386    ///         }
387    ///         _ => {
388    ///             // Log for investigation
389    ///         }
390    ///     }
391    /// }
392    /// ```
393    pub fn category(&self) -> ErrorCategory {
394        match self {
395            Self::UnsupportedProvider { .. } => ErrorCategory::Client,
396            Self::ConfigurationError { .. } => ErrorCategory::Client,
397            Self::RequestFailed { .. } => ErrorCategory::External,
398            Self::ResponseParsingError { .. } => ErrorCategory::External,
399            Self::RateLimitExceeded { .. } => ErrorCategory::Transient,
400            Self::Timeout { .. } => ErrorCategory::Transient,
401            Self::AuthenticationFailed { .. } => ErrorCategory::Client,
402            Self::TokenLimitExceeded { .. } => ErrorCategory::Client,
403            Self::ToolExecutionFailed { .. } => ErrorCategory::External,
404            Self::SchemaValidationFailed { .. } => ErrorCategory::Client,
405        }
406    }
407
408    /// Get the error severity for logging and alerting.
409    ///
410    /// Use this to determine logging level and whether to alert on-call.
411    pub fn severity(&self) -> ErrorSeverity {
412        match self {
413            Self::UnsupportedProvider { .. } => ErrorSeverity::Error,
414            Self::ConfigurationError { .. } => ErrorSeverity::Error,
415            Self::RequestFailed { .. } => ErrorSeverity::Error,
416            Self::ResponseParsingError { .. } => ErrorSeverity::Warning,
417            Self::RateLimitExceeded { .. } => ErrorSeverity::Warning,
418            Self::Timeout { .. } => ErrorSeverity::Warning,
419            Self::AuthenticationFailed { .. } => ErrorSeverity::Error,
420            Self::TokenLimitExceeded { .. } => ErrorSeverity::Info,
421            Self::ToolExecutionFailed { .. } => ErrorSeverity::Error,
422            Self::SchemaValidationFailed { .. } => ErrorSeverity::Warning,
423        }
424    }
425
426    /// Whether this error is transient and should trigger a retry.
427    ///
428    /// Returns `true` for:
429    /// - Rate limit exceeded
430    /// - Timeouts
431    /// - General request failures (may be network issues)
432    ///
433    /// Implement exponential backoff when retrying these errors.
434    pub fn is_retryable(&self) -> bool {
435        matches!(
436            self,
437            Self::RateLimitExceeded { .. } | Self::Timeout { .. } | Self::RequestFailed { .. }
438        )
439    }
440
441    /// Convert to a user-friendly message suitable for display.
442    ///
443    /// Returns a message that's safe to show to end users - technical
444    /// details and internal information are stripped or generalized.
445    ///
446    /// # Example
447    ///
448    /// ```rust
449    /// use multi_llm::LlmError;
450    ///
451    /// let err = LlmError::rate_limit_exceeded(60);
452    /// let msg = err.user_message();
453    /// // "Service is busy. Please wait 60 seconds and try again"
454    /// ```
455    pub fn user_message(&self) -> String {
456        match self {
457            Self::UnsupportedProvider { .. } => {
458                "The requested AI provider is not supported".to_string()
459            }
460            Self::ConfigurationError { .. } => {
461                "AI service configuration issue. Please check your settings".to_string()
462            }
463            Self::RequestFailed { .. } => {
464                "Unable to communicate with AI service. Please try again".to_string()
465            }
466            Self::ResponseParsingError { .. } => {
467                "Received an invalid response from AI service".to_string()
468            }
469            Self::RateLimitExceeded {
470                retry_after_seconds,
471            } => {
472                format!("Service is busy. Please wait {retry_after_seconds} seconds and try again")
473            }
474            Self::Timeout { .. } => "Request timed out. Please try again".to_string(),
475            Self::AuthenticationFailed { .. } => {
476                "Authentication failed. Please check your credentials".to_string()
477            }
478            Self::TokenLimitExceeded { .. } => {
479                "Your request is too long. Please shorten it and try again".to_string()
480            }
481            Self::ToolExecutionFailed { .. } => {
482                "Unable to execute the requested action".to_string()
483            }
484            Self::SchemaValidationFailed { .. } => "Response format validation failed".to_string(),
485        }
486    }
487
488    // =========================================================================
489    // Constructor methods with automatic logging
490    // =========================================================================
491    //
492    // These methods automatically log the error at the appropriate level.
493    // Use them instead of constructing variants directly.
494
495    /// Create an unsupported provider error (logs at ERROR level).
496    pub fn unsupported_provider(provider: impl Into<String>) -> Self {
497        let provider = provider.into();
498        log_error!(
499            provider = %provider,
500            error_type = "unsupported_provider",
501            "Unsupported LLM provider requested"
502        );
503        Self::UnsupportedProvider { provider }
504    }
505
506    pub fn configuration_error(message: impl Into<String>) -> Self {
507        let message = message.into();
508        log_error!(
509            error_type = "configuration_error",
510            message = %message,
511            "LLM configuration validation failed"
512        );
513        Self::ConfigurationError { message }
514    }
515
516    pub fn request_failed(
517        message: impl Into<String>,
518        source: Option<Box<dyn std::error::Error + Send + Sync>>,
519    ) -> Self {
520        let message = message.into();
521        log_error!(
522            error_type = "request_failed",
523            message = %message,
524            has_source = source.is_some(),
525            "LLM request execution failed"
526        );
527        Self::RequestFailed { message, source }
528    }
529
530    pub fn response_parsing_error(message: impl Into<String>) -> Self {
531        let message = message.into();
532        log_warn!(
533            error_type = "response_parsing_error",
534            message = %message,
535            "LLM response format invalid"
536        );
537        Self::ResponseParsingError { message }
538    }
539
540    pub fn rate_limit_exceeded(retry_after_seconds: u64) -> Self {
541        log_warn!(
542            error_type = "rate_limit_exceeded",
543            retry_after_seconds = retry_after_seconds,
544            "LLM provider rate limit exceeded"
545        );
546        Self::RateLimitExceeded {
547            retry_after_seconds,
548        }
549    }
550
551    pub fn timeout(timeout_seconds: u64) -> Self {
552        log_warn!(
553            error_type = "timeout",
554            timeout_seconds = timeout_seconds,
555            "LLM request timed out"
556        );
557        Self::Timeout { timeout_seconds }
558    }
559
560    pub fn authentication_failed(message: impl Into<String>) -> Self {
561        let message = message.into();
562        log_error!(
563            error_type = "authentication_failed",
564            message = %message,
565            "LLM provider authentication failed"
566        );
567        Self::AuthenticationFailed { message }
568    }
569
570    pub fn token_limit_exceeded(current: usize, max: usize) -> Self {
571        log_warn!(
572            error_type = "token_limit_exceeded",
573            current_tokens = current,
574            max_tokens = max,
575            "Request exceeds LLM token limit"
576        );
577        Self::TokenLimitExceeded { current, max }
578    }
579
580    pub fn tool_execution_failed(tool_name: impl Into<String>, message: impl Into<String>) -> Self {
581        let tool_name = tool_name.into();
582        let message = message.into();
583        log_error!(
584            error_type = "tool_execution_failed",
585            tool_name = %tool_name,
586            message = %message,
587            "LLM tool execution failed"
588        );
589        Self::ToolExecutionFailed { tool_name, message }
590    }
591
592    pub fn schema_validation_failed(message: impl Into<String>) -> Self {
593        let message = message.into();
594        log_warn!(
595            error_type = "schema_validation_failed",
596            message = %message,
597            "LLM response schema validation failed"
598        );
599        Self::SchemaValidationFailed { message }
600    }
601}