multi_llm/error.rs
1//! Error types for LLM operations.
2//!
3//! This module provides structured error handling for multi-llm operations,
4//! including categorization, severity levels, and retry guidance.
5//!
6//! # Error Types
7//!
8//! The main error type is [`LlmError`], which covers all failure modes:
9//! - Configuration errors (missing API keys, invalid settings)
10//! - Request failures (network issues, provider errors)
11//! - Rate limiting and timeouts
12//! - Authentication failures
13//! - Token limit exceeded
14//! - Tool execution failures
15//!
16//! # Error Handling Example
17//!
18//! ```rust,no_run
19//! use multi_llm::{LlmError, LlmResult};
20//!
21//! fn handle_error(err: LlmError) {
22//! // Check if we should retry
23//! if err.is_retryable() {
24//! println!("Retryable error: {}", err);
25//! // Implement retry logic...
26//! }
27//!
28//! // Get user-friendly message
29//! let user_msg = err.user_message();
30//! println!("Tell user: {}", user_msg);
31//!
32//! // Check error category for routing
33//! match err.category() {
34//! multi_llm::error::ErrorCategory::Transient => {
35//! println!("Temporary issue, try again later");
36//! }
37//! multi_llm::error::ErrorCategory::Client => {
38//! println!("Fix the request and try again");
39//! }
40//! _ => {
41//! println!("System issue, contact support");
42//! }
43//! }
44//! }
45//! ```
46//!
47//! # Result Type
48//!
49//! Use [`LlmResult<T>`] as a convenient alias for `Result<T, LlmError>`:
50//!
51//! ```rust
52//! use multi_llm::LlmResult;
53//!
54//! fn my_function() -> LlmResult<String> {
55//! Ok("Success".to_string())
56//! }
57//! ```
58
59use crate::logging::{log_error, log_warn};
60use thiserror::Error;
61
62// ============================================================================
63// Error categorization types
64// ============================================================================
65
/// High-level categorization of errors for routing and handling decisions.
///
/// Use [`LlmError::category()`] to get the category for any error.
///
/// # Example
///
/// ```rust,no_run
/// use multi_llm::{LlmError, error::ErrorCategory};
///
/// fn should_alert_ops(err: &LlmError) -> bool {
///     matches!(err.category(), ErrorCategory::Internal | ErrorCategory::External)
/// }
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[non_exhaustive]
pub enum ErrorCategory {
    /// Expected business logic outcomes (not typically errors).
    ///
    /// These are "errors" that represent normal application flow,
    /// like "user not found" when checking if a user exists.
    BusinessLogic,

    /// External service failures (LLM providers, network issues).
    ///
    /// The LLM provider or network had an issue. May be transient
    /// or indicate a provider outage.
    ///
    /// This is the [`Default`] category.
    #[default]
    External,

    /// Internal system errors (bugs, invariant violations).
    ///
    /// Something went wrong in the code itself. These should be
    /// logged and investigated.
    Internal,

    /// Client errors (invalid input, authentication, configuration).
    ///
    /// The caller made a mistake that they can fix (wrong API key,
    /// invalid parameters, etc.).
    Client,

    /// Temporary failures that should be retried.
    ///
    /// Rate limits, timeouts, and other transient issues. Retry
    /// with exponential backoff.
    Transient,
}
113
/// Severity level for logging and alerting decisions.
///
/// Use [`LlmError::severity()`] to get the severity for any error.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[non_exhaustive]
pub enum ErrorSeverity {
    /// System is unusable or data integrity is at risk.
    ///
    /// Requires immediate attention. Page on-call if configured.
    Critical,

    /// Action failed but system is stable.
    ///
    /// Should be logged and investigated but not urgent.
    ///
    /// This is the [`Default`] severity.
    #[default]
    Error,

    /// Unexpected but recoverable situation.
    ///
    /// Worth logging for monitoring but may not require action.
    Warning,

    /// Expected failure (e.g., not found, validation error).
    ///
    /// Normal operation, log at info/debug level.
    Info,
}
141
/// User-facing error categories for conversation flow control.
///
/// When a tool execution fails, this category helps the LLM understand
/// how to respond to the user and what actions might help.
///
/// Serializable so it can travel inside tool results.
///
/// # Example
///
/// ```rust
/// use multi_llm::{ToolResult, error::UserErrorCategory};
///
/// // User needs to complete a prerequisite first
/// let result = ToolResult {
///     tool_call_id: "call_123".to_string(),
///     content: "Please log in first".to_string(),
///     is_error: true,
///     error_category: Some(UserErrorCategory::WorkflowDependency),
/// };
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub enum UserErrorCategory {
    /// User needs to complete a prerequisite action.
    ///
    /// Example: "You need to log in before accessing your profile."
    WorkflowDependency,

    /// Request is missing required context/parameters.
    ///
    /// Example: "Please specify which city you want weather for."
    MissingContext,

    /// Requested item/resource not found.
    ///
    /// Example: "I couldn't find a user with that email address."
    NotFound,

    /// Attempting to create something that already exists.
    ///
    /// Example: "An account with that email already exists."
    Duplicate,

    /// Input validation failed.
    ///
    /// Example: "That doesn't look like a valid email address."
    Validation,

    /// Technical/system error - don't expose details to user.
    ///
    /// Example: "Something went wrong. Please try again later."
    Technical,
}
193
194// ============================================================================
195// LLM Error types
196// ============================================================================
197
/// Convenient result type for LLM operations.
///
/// Alias for `Result<T, LlmError>`. Use this throughout your application
/// for consistent error handling.
///
/// # Example
///
/// ```rust
/// use multi_llm::LlmResult;
///
/// fn process_response(text: &str) -> LlmResult<String> {
///     if text.is_empty() {
///         return Err(multi_llm::LlmError::response_parsing_error("Empty response"));
///     }
///     Ok(text.to_uppercase())
/// }
/// ```
pub type LlmResult<T> = std::result::Result<T, LlmError>;
216
/// Errors that can occur during LLM operations.
///
/// This enum covers all error conditions you might encounter when using multi-llm.
/// Each variant includes relevant context and can be:
/// - Categorized via [`category()`](Self::category)
/// - Assessed for severity via [`severity()`](Self::severity)
/// - Checked for retryability via [`is_retryable()`](Self::is_retryable)
/// - Converted to user-friendly messages via [`user_message()`](Self::user_message)
///
/// # Creating Errors
///
/// Use the constructor methods which automatically log the error:
///
/// ```rust
/// use multi_llm::LlmError;
///
/// // These methods log automatically
/// let err = LlmError::configuration_error("Missing API key");
/// let err = LlmError::rate_limit_exceeded(60);
/// let err = LlmError::timeout(30);
/// ```
///
/// # Error Categories
///
/// | Variant | Category | Retryable |
/// |---------|----------|-----------|
/// | `UnsupportedProvider` | Client | No |
/// | `ConfigurationError` | Client | No |
/// | `RequestFailed` | External | Yes |
/// | `ResponseParsingError` | External | No |
/// | `RateLimitExceeded` | Transient | Yes |
/// | `Timeout` | Transient | Yes |
/// | `AuthenticationFailed` | Client | No |
/// | `TokenLimitExceeded` | Client | No |
/// | `ToolExecutionFailed` | External | No |
/// | `SchemaValidationFailed` | Client | No |
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum LlmError {
    /// The specified provider is not supported.
    ///
    /// Supported providers: "anthropic", "openai", "ollama", "lmstudio"
    #[error("Provider not supported: {provider}")]
    UnsupportedProvider {
        /// The provider name that was requested.
        provider: String,
    },

    /// Provider configuration is invalid or incomplete.
    ///
    /// Common causes:
    /// - Missing API key for providers that require one
    /// - Invalid base URL format
    /// - Incompatible configuration values
    #[error("Provider configuration error: {message}")]
    ConfigurationError {
        /// Description of the configuration problem.
        message: String,
    },

    /// The HTTP request to the provider failed.
    ///
    /// This is a general failure that may be retryable ([`is_retryable()`](Self::is_retryable)
    /// returns `true` for it). Check the source error for more details
    /// about the underlying cause.
    #[error("Request failed: {message}")]
    RequestFailed {
        /// Description of the failure.
        message: String,
        /// The underlying error, if available. Exposed via
        /// `std::error::Error::source()` through the `#[source]` attribute.
        #[source]
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },

    /// Failed to parse the provider's response.
    ///
    /// The provider returned a response, but it couldn't be parsed.
    /// This might indicate a provider API change or malformed response.
    #[error("Response parsing failed: {message}")]
    ResponseParsingError {
        /// Details about the parsing failure.
        message: String,
    },

    /// Provider rate limit exceeded.
    ///
    /// The provider is throttling requests. Wait the indicated time
    /// before retrying. Consider implementing exponential backoff.
    #[error("Rate limit exceeded, retry after {retry_after_seconds}s")]
    RateLimitExceeded {
        /// Recommended wait time before retrying.
        retry_after_seconds: u64,
    },

    /// Request timed out.
    ///
    /// The provider didn't respond within the configured timeout.
    /// This is usually retryable but may indicate an overloaded provider.
    #[error("Request timed out after {timeout_seconds}s")]
    Timeout {
        /// The timeout duration that was exceeded.
        timeout_seconds: u64,
    },

    /// Authentication with the provider failed.
    ///
    /// Check your API key or credentials. This is not retryable without
    /// fixing the authentication.
    #[error("Authentication failed: {message}")]
    AuthenticationFailed {
        /// Details about the authentication failure.
        message: String,
    },

    /// Request exceeds the model's token limit.
    ///
    /// The combined input (messages + tools) is too large for the model's
    /// context window. Reduce the input size or use a model with larger context.
    #[error("Token limit exceeded: {current} > {max}")]
    TokenLimitExceeded {
        /// The actual token count of the request.
        current: usize,
        /// The maximum allowed tokens for the model.
        max: usize,
    },

    /// A tool execution failed.
    ///
    /// The tool was called but couldn't complete successfully.
    /// Check the message for details about why the tool failed.
    #[error("Tool execution failed: {tool_name} - {message}")]
    ToolExecutionFailed {
        /// The name of the tool that failed.
        tool_name: String,
        /// Details about the failure.
        message: String,
    },

    /// Response doesn't match the requested JSON schema.
    ///
    /// When using structured output, the model's response didn't conform
    /// to the provided JSON schema. May require a clearer prompt or
    /// different schema design.
    #[error("JSON schema validation failed: {message}")]
    SchemaValidationFailed {
        /// Details about the validation failure.
        message: String,
    },
}
365
366impl LlmError {
367 /// Get the error category for routing and handling decisions.
368 ///
369 /// Use this to determine how to handle different types of errors:
370 /// - `Client`: Fix the request (invalid input, auth, config)
371 /// - `External`: Provider issue, may need ops attention
372 /// - `Transient`: Retry with backoff
373 ///
374 /// # Example
375 ///
376 /// ```rust,no_run
377 /// use multi_llm::{LlmError, error::ErrorCategory};
378 ///
379 /// fn handle(err: LlmError) {
380 /// match err.category() {
381 /// ErrorCategory::Transient => {
382 /// // Implement retry logic
383 /// }
384 /// ErrorCategory::Client => {
385 /// // User can fix this, show helpful message
386 /// }
387 /// _ => {
388 /// // Log for investigation
389 /// }
390 /// }
391 /// }
392 /// ```
393 pub fn category(&self) -> ErrorCategory {
394 match self {
395 Self::UnsupportedProvider { .. } => ErrorCategory::Client,
396 Self::ConfigurationError { .. } => ErrorCategory::Client,
397 Self::RequestFailed { .. } => ErrorCategory::External,
398 Self::ResponseParsingError { .. } => ErrorCategory::External,
399 Self::RateLimitExceeded { .. } => ErrorCategory::Transient,
400 Self::Timeout { .. } => ErrorCategory::Transient,
401 Self::AuthenticationFailed { .. } => ErrorCategory::Client,
402 Self::TokenLimitExceeded { .. } => ErrorCategory::Client,
403 Self::ToolExecutionFailed { .. } => ErrorCategory::External,
404 Self::SchemaValidationFailed { .. } => ErrorCategory::Client,
405 }
406 }
407
408 /// Get the error severity for logging and alerting.
409 ///
410 /// Use this to determine logging level and whether to alert on-call.
411 pub fn severity(&self) -> ErrorSeverity {
412 match self {
413 Self::UnsupportedProvider { .. } => ErrorSeverity::Error,
414 Self::ConfigurationError { .. } => ErrorSeverity::Error,
415 Self::RequestFailed { .. } => ErrorSeverity::Error,
416 Self::ResponseParsingError { .. } => ErrorSeverity::Warning,
417 Self::RateLimitExceeded { .. } => ErrorSeverity::Warning,
418 Self::Timeout { .. } => ErrorSeverity::Warning,
419 Self::AuthenticationFailed { .. } => ErrorSeverity::Error,
420 Self::TokenLimitExceeded { .. } => ErrorSeverity::Info,
421 Self::ToolExecutionFailed { .. } => ErrorSeverity::Error,
422 Self::SchemaValidationFailed { .. } => ErrorSeverity::Warning,
423 }
424 }
425
426 /// Whether this error is transient and should trigger a retry.
427 ///
428 /// Returns `true` for:
429 /// - Rate limit exceeded
430 /// - Timeouts
431 /// - General request failures (may be network issues)
432 ///
433 /// Implement exponential backoff when retrying these errors.
434 pub fn is_retryable(&self) -> bool {
435 matches!(
436 self,
437 Self::RateLimitExceeded { .. } | Self::Timeout { .. } | Self::RequestFailed { .. }
438 )
439 }
440
441 /// Convert to a user-friendly message suitable for display.
442 ///
443 /// Returns a message that's safe to show to end users - technical
444 /// details and internal information are stripped or generalized.
445 ///
446 /// # Example
447 ///
448 /// ```rust
449 /// use multi_llm::LlmError;
450 ///
451 /// let err = LlmError::rate_limit_exceeded(60);
452 /// let msg = err.user_message();
453 /// // "Service is busy. Please wait 60 seconds and try again"
454 /// ```
455 pub fn user_message(&self) -> String {
456 match self {
457 Self::UnsupportedProvider { .. } => {
458 "The requested AI provider is not supported".to_string()
459 }
460 Self::ConfigurationError { .. } => {
461 "AI service configuration issue. Please check your settings".to_string()
462 }
463 Self::RequestFailed { .. } => {
464 "Unable to communicate with AI service. Please try again".to_string()
465 }
466 Self::ResponseParsingError { .. } => {
467 "Received an invalid response from AI service".to_string()
468 }
469 Self::RateLimitExceeded {
470 retry_after_seconds,
471 } => {
472 format!("Service is busy. Please wait {retry_after_seconds} seconds and try again")
473 }
474 Self::Timeout { .. } => "Request timed out. Please try again".to_string(),
475 Self::AuthenticationFailed { .. } => {
476 "Authentication failed. Please check your credentials".to_string()
477 }
478 Self::TokenLimitExceeded { .. } => {
479 "Your request is too long. Please shorten it and try again".to_string()
480 }
481 Self::ToolExecutionFailed { .. } => {
482 "Unable to execute the requested action".to_string()
483 }
484 Self::SchemaValidationFailed { .. } => "Response format validation failed".to_string(),
485 }
486 }
487
488 // =========================================================================
489 // Constructor methods with automatic logging
490 // =========================================================================
491 //
492 // These methods automatically log the error at the appropriate level.
493 // Use them instead of constructing variants directly.
494
    /// Create an unsupported provider error (logs at ERROR level).
    ///
    /// `provider` is the provider name that was requested but is not supported.
    pub fn unsupported_provider(provider: impl Into<String>) -> Self {
        let provider = provider.into();
        log_error!(
            provider = %provider,
            error_type = "unsupported_provider",
            "Unsupported LLM provider requested"
        );
        Self::UnsupportedProvider { provider }
    }
505
    /// Create a provider configuration error (logs at ERROR level).
    ///
    /// `message` describes the configuration problem (e.g. missing API key).
    pub fn configuration_error(message: impl Into<String>) -> Self {
        let message = message.into();
        log_error!(
            error_type = "configuration_error",
            message = %message,
            "LLM configuration validation failed"
        );
        Self::ConfigurationError { message }
    }
515
    /// Create a request failure error (logs at ERROR level).
    ///
    /// `message` describes the failure; `source` optionally carries the
    /// underlying error, which is exposed via `std::error::Error::source()`.
    pub fn request_failed(
        message: impl Into<String>,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    ) -> Self {
        let message = message.into();
        log_error!(
            error_type = "request_failed",
            message = %message,
            has_source = source.is_some(),
            "LLM request execution failed"
        );
        Self::RequestFailed { message, source }
    }
529
    /// Create a response parsing error (logs at WARN level).
    ///
    /// `message` gives details about why the provider response could not
    /// be parsed.
    pub fn response_parsing_error(message: impl Into<String>) -> Self {
        let message = message.into();
        log_warn!(
            error_type = "response_parsing_error",
            message = %message,
            "LLM response format invalid"
        );
        Self::ResponseParsingError { message }
    }
539
    /// Create a rate-limit error (logs at WARN level).
    ///
    /// `retry_after_seconds` is the recommended wait before retrying.
    pub fn rate_limit_exceeded(retry_after_seconds: u64) -> Self {
        log_warn!(
            error_type = "rate_limit_exceeded",
            retry_after_seconds = retry_after_seconds,
            "LLM provider rate limit exceeded"
        );
        Self::RateLimitExceeded {
            retry_after_seconds,
        }
    }
550
    /// Create a timeout error (logs at WARN level).
    ///
    /// `timeout_seconds` is the timeout duration that was exceeded.
    pub fn timeout(timeout_seconds: u64) -> Self {
        log_warn!(
            error_type = "timeout",
            timeout_seconds = timeout_seconds,
            "LLM request timed out"
        );
        Self::Timeout { timeout_seconds }
    }
559
    /// Create an authentication error (logs at ERROR level).
    ///
    /// `message` gives details about the authentication failure.
    pub fn authentication_failed(message: impl Into<String>) -> Self {
        let message = message.into();
        log_error!(
            error_type = "authentication_failed",
            message = %message,
            "LLM provider authentication failed"
        );
        Self::AuthenticationFailed { message }
    }
569
    /// Create a token-limit error (logs at WARN level).
    ///
    /// `current` is the actual token count of the request; `max` is the
    /// model's limit.
    ///
    /// NOTE(review): this logs at WARN while `severity()` classifies the
    /// variant as `Info` — confirm the mismatch is intended.
    pub fn token_limit_exceeded(current: usize, max: usize) -> Self {
        log_warn!(
            error_type = "token_limit_exceeded",
            current_tokens = current,
            max_tokens = max,
            "Request exceeds LLM token limit"
        );
        Self::TokenLimitExceeded { current, max }
    }
579
    /// Create a tool execution error (logs at ERROR level).
    ///
    /// `tool_name` is the tool that failed; `message` describes why.
    pub fn tool_execution_failed(tool_name: impl Into<String>, message: impl Into<String>) -> Self {
        let tool_name = tool_name.into();
        let message = message.into();
        log_error!(
            error_type = "tool_execution_failed",
            tool_name = %tool_name,
            message = %message,
            "LLM tool execution failed"
        );
        Self::ToolExecutionFailed { tool_name, message }
    }
591
    /// Create a schema validation error (logs at WARN level).
    ///
    /// `message` gives details about how the response violated the
    /// requested JSON schema.
    pub fn schema_validation_failed(message: impl Into<String>) -> Self {
        let message = message.into();
        log_warn!(
            error_type = "schema_validation_failed",
            message = %message,
            "LLM response schema validation failed"
        );
        Self::SchemaValidationFailed { message }
    }
601}