codeprism_mcp/error_handler.rs

//! Comprehensive error handling for the MCP server
//!
//! This module provides centralized error handling, recovery mechanisms,
//! and integration with observability systems for production reliability.
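//!
//! A minimal usage sketch (marked `ignore`: illustrative only, shown without the
//! enclosing async function, assumes a Tokio runtime, and mirrors the unit tests
//! at the bottom of this file):
//!
//! ```ignore
//! let handler = McpErrorHandler::new();
//! // Retries and circuit breaking are applied by the underlying resilience manager.
//! let value: McpResult<i32> = handler
//!     .execute_with_recovery("demo_operation", || async { Ok::<i32, McpError>(42) })
//!     .await;
//! assert_eq!(value.unwrap(), 42);
//! ```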

use crate::protocol::{JsonRpcError, JsonRpcResponse};
use codeprism_core::{
    resilience::CircuitBreakerConfig, CircuitState, Error as CoreError, ErrorContext,
    ErrorSeverity, HealthMonitor, MetricsCollector, PerformanceMonitor, RecoveryStrategy,
    ResilienceManager, RetryConfig,
};
use std::sync::Arc;
use tokio::sync::RwLock;
use tracing::{debug, error, info, warn};

/// Enhanced error type for MCP operations
#[derive(Debug, Clone, thiserror::Error)]
pub enum McpError {
    /// Core codeprism error
    #[error("Core error: {0}")]
    Core(#[from] CoreError),

    /// JSON-RPC protocol error
    #[error("Protocol error: {0}")]
    Protocol(String),

    /// Tool execution error
    #[error("Tool execution error: {tool_name}: {message}")]
    ToolExecution {
        tool_name: String,
        message: String,
        context: Option<ErrorContext>,
    },

    /// Resource operation error
    #[error("Resource error: {resource_type}: {message}")]
    Resource {
        resource_type: String,
        message: String,
    },

    /// Prompt generation error
    #[error("Prompt error: {prompt_name}: {message}")]
    Prompt {
        prompt_name: String,
        message: String,
    },

    /// Cancellation error
    #[error("Operation cancelled: {operation}")]
    Cancelled {
        operation: String,
        reason: Option<String>,
    },

    /// Timeout error
    #[error("Operation timed out: {operation} (timeout: {timeout_ms}ms)")]
    Timeout { operation: String, timeout_ms: u64 },

    /// Rate limiting error
    #[error("Rate limit exceeded for operation: {operation}")]
    RateLimit {
        operation: String,
        retry_after_ms: u64,
    },
}
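
// Construction sketch (illustrative operation names and values; the variants
// and fields are exactly those declared above):
//
//     let timeout = McpError::Timeout {
//         operation: "repository_scan".to_string(),
//         timeout_ms: 30_000,
//     };
//     let rate_limited = McpError::RateLimit {
//         operation: "search_symbols".to_string(),
//         retry_after_ms: 500,
//     };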

impl McpError {
    /// Get the severity level of this error
    pub fn severity(&self) -> ErrorSeverity {
        match self {
            Self::Core(core_error) => core_error.severity(),
            Self::Protocol(_) => ErrorSeverity::Error,
            Self::ToolExecution { .. } => ErrorSeverity::Warning,
            Self::Resource { .. } => ErrorSeverity::Error,
            Self::Prompt { .. } => ErrorSeverity::Warning,
            Self::Cancelled { .. } => ErrorSeverity::Info,
            Self::Timeout { .. } => ErrorSeverity::Warning,
            Self::RateLimit { .. } => ErrorSeverity::Warning,
        }
    }

    /// Get the recovery strategy for this error
    pub fn recovery_strategy(&self) -> RecoveryStrategy {
        match self {
            Self::Core(core_error) => core_error.recovery_strategy(),
            Self::Protocol(_) => RecoveryStrategy::UserIntervention,
            Self::ToolExecution { .. } => RecoveryStrategy::Fallback,
            Self::Resource { .. } => RecoveryStrategy::Retry,
            Self::Prompt { .. } => RecoveryStrategy::Fallback,
            Self::Cancelled { .. } => RecoveryStrategy::UserIntervention,
            Self::Timeout { .. } => RecoveryStrategy::Retry,
            Self::RateLimit { .. } => RecoveryStrategy::Retry,
        }
    }

    /// Check if this error should trigger a retry
    pub fn should_retry(&self) -> bool {
        matches!(self.recovery_strategy(), RecoveryStrategy::Retry)
    }

    /// Get JSON-RPC error code
    pub fn json_rpc_code(&self) -> i32 {
        match self {
            Self::Core(core_error) => core_error.error_code(),
            Self::Protocol(_) => JsonRpcError::INVALID_REQUEST,
            Self::ToolExecution { .. } => -32100, // Custom tool error code
            Self::Resource { .. } => -32101,      // Custom resource error code
            Self::Prompt { .. } => -32102,        // Custom prompt error code
            Self::Cancelled { .. } => -32015,     // Request cancelled
            Self::Timeout { .. } => -32012,       // Request timeout
            Self::RateLimit { .. } => -32016,     // Rate limit exceeded
        }
    }

    /// Get error type name as string for serialization
    pub fn error_type_name(&self) -> &'static str {
        match self {
            Self::Core(_) => "Core",
            Self::Protocol(_) => "Protocol",
            Self::ToolExecution { .. } => "ToolExecution",
            Self::Resource { .. } => "Resource",
            Self::Prompt { .. } => "Prompt",
            Self::Cancelled { .. } => "Cancelled",
            Self::Timeout { .. } => "Timeout",
            Self::RateLimit { .. } => "RateLimit",
        }
    }

    /// Convert to JSON-RPC error
    pub fn to_json_rpc_error(&self) -> JsonRpcError {
        JsonRpcError::new(
            self.json_rpc_code(),
            self.to_string(),
            Some(serde_json::json!({
                "severity": format!("{:?}", self.severity()),
                "recovery_strategy": format!("{:?}", self.recovery_strategy()),
                "error_type": self.error_type_name(),
            })),
        )
    }
}
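
// Mapping sketch: how one error flows through the helpers above (the tool name
// and message are illustrative; the expected values follow directly from the
// match arms in this impl):
//
//     let err = McpError::ToolExecution {
//         tool_name: "analyze_complexity".to_string(),
//         message: "analysis exceeded memory budget".to_string(),
//         context: None,
//     };
//     assert_eq!(err.severity(), ErrorSeverity::Warning);
//     assert!(matches!(err.recovery_strategy(), RecoveryStrategy::Fallback));
//     assert!(!err.should_retry());
//     assert_eq!(err.json_rpc_code(), -32100);
//     let rpc_error = err.to_json_rpc_error(); // carries severity, strategy, and type as data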

/// Result type for MCP operations
pub type McpResult<T> = Result<T, McpError>;

/// Comprehensive error handler for the MCP server
pub struct McpErrorHandler {
    metrics_collector: MetricsCollector,
    health_monitor: HealthMonitor,
    #[allow(dead_code)] // TODO: Will be used for performance monitoring
    performance_monitor: PerformanceMonitor,
    resilience_manager: ResilienceManager,
    circuit_states: Arc<RwLock<std::collections::HashMap<String, CircuitState>>>,
}

impl McpErrorHandler {
    /// Create a new MCP error handler
    pub fn new() -> Self {
        let metrics_collector = MetricsCollector::new();
        let health_monitor = HealthMonitor::new(metrics_collector.clone());
        let performance_monitor = PerformanceMonitor::new(metrics_collector.clone());

        let retry_config = RetryConfig::new(3, std::time::Duration::from_millis(100))
            .with_max_delay(std::time::Duration::from_secs(5))
            .with_backoff_multiplier(2.0)
            .with_jitter(true);

        let circuit_config = CircuitBreakerConfig {
            failure_threshold: 5,
            success_threshold: 3,
            recovery_timeout: std::time::Duration::from_secs(30),
            time_window: std::time::Duration::from_secs(60),
        };

        let resilience_manager = ResilienceManager::new(retry_config, circuit_config);

        Self {
            metrics_collector,
            health_monitor,
            performance_monitor,
            resilience_manager,
            circuit_states: Arc::new(RwLock::new(std::collections::HashMap::new())),
        }
    }
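
    // Rough effect of the configuration above: retries back off exponentially
    // from about 100ms (100ms, 200ms, 400ms, ... with jitter, capped at 5s)
    // for the small attempt budget passed to `RetryConfig::new`, and the
    // circuit breaker opens after 5 failures within a 60s window, waits 30s
    // before probing, and closes again after 3 successes. The precise
    // semantics are defined by `codeprism_core::resilience`; this note only
    // restates the values chosen here.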

    /// Handle an error with comprehensive logging and metrics
    pub async fn handle_error(&self, error: &McpError, operation: Option<&str>) {
        // Record error in metrics
        let core_error = match error {
            McpError::Core(e) => e.clone(),
            _ => CoreError::other(error.to_string()),
        };
        self.metrics_collector.record_error(&core_error, operation);

        // Update circuit breaker state if needed
        if matches!(
            error.severity(),
            ErrorSeverity::Error | ErrorSeverity::Critical
        ) {
            if let Some(op) = operation {
                let mut states = self.circuit_states.write().await;
                let current_state = self.resilience_manager.circuit_state();
                states.insert(op.to_string(), current_state.clone());
                self.health_monitor.update_circuit_state(op, current_state);
            }
        }

        // Log error with appropriate level
        match error.severity() {
            ErrorSeverity::Info => info!(
                error = %error,
                operation = operation,
                severity = ?error.severity(),
                "Informational error"
            ),
            ErrorSeverity::Warning => warn!(
                error = %error,
                operation = operation,
                severity = ?error.severity(),
                recovery_strategy = ?error.recovery_strategy(),
                "Warning: recoverable error"
            ),
            ErrorSeverity::Error => error!(
                error = %error,
                operation = operation,
                severity = ?error.severity(),
                recovery_strategy = ?error.recovery_strategy(),
                "Error: significant issue encountered"
            ),
            ErrorSeverity::Critical => {
                error!(
                    error = %error,
                    operation = operation,
                    severity = ?error.severity(),
                    recovery_strategy = ?error.recovery_strategy(),
                    "CRITICAL: system stability at risk"
                );

                // Trigger alert/notification system here if available
                self.trigger_critical_alert(error, operation).await;
            }
        }
    }
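
    // Call-site sketch (the operation name is illustrative; the tests below do
    // the same thing with a `ToolExecution` error):
    //
    //     let err = McpError::Timeout {
    //         operation: "index_repository".to_string(),
    //         timeout_ms: 30_000,
    //     };
    //     handler.handle_error(&err, Some("index_repository")).await;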

    /// Execute an operation with comprehensive error handling and recovery
    pub async fn execute_with_recovery<F, Fut, T>(
        &self,
        operation_name: &str,
        operation: F,
    ) -> McpResult<T>
    where
        F: Fn() -> Fut + Clone,
        Fut: std::future::Future<Output = McpResult<T>>,
    {
        // Execute with the resilience manager first, timing the whole attempt
        let start = std::time::Instant::now();
        let resilience_result = self
            .resilience_manager
            .execute(|| {
                let op = operation.clone();
                async move {
                    match op().await {
                        Ok(value) => Ok(value),
                        Err(mcp_error) => {
                            let core_error = match &mcp_error {
                                McpError::Core(e) => e.clone(),
                                _ => CoreError::other(mcp_error.to_string()),
                            };
                            Err(core_error)
                        }
                    }
                }
            })
            .await;

        // Convert back to McpResult and record performance
        let result = match resilience_result {
            Ok(value) => {
                // Record the successful operation with its measured duration
                self.metrics_collector
                    .record_success(operation_name, start.elapsed());
                Ok(value)
            }
            Err(core_error) => Err(McpError::Core(core_error)),
        };

        match &result {
            Ok(_) => {
                debug!(
                    operation = operation_name,
                    "Operation completed successfully"
                );
            }
            Err(error) => {
                self.handle_error(error, Some(operation_name)).await;
            }
        }

        result
    }
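
    // Note on error types: a non-`Core` error returned by `operation` is
    // converted to `CoreError::other(..)` before entering the resilience layer
    // and resurfaces as `McpError::Core`, so callers matching on the original
    // variant (e.g. `McpError::Protocol`) will not see it after recovery.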

    /// Execute operation with graceful degradation
    pub async fn execute_with_fallback<F, Fut, T, FB, FutB>(
        &self,
        operation_name: &str,
        operation: F,
        fallback: FB,
    ) -> T
    where
        F: Fn() -> Fut + Clone,
        Fut: std::future::Future<Output = McpResult<T>>,
        FB: Fn() -> FutB,
        FutB: std::future::Future<Output = T>,
    {
        match self.execute_with_recovery(operation_name, operation).await {
            Ok(result) => result,
            Err(error) => {
                warn!(
                    operation = operation_name,
                    error = %error,
                    "Operation failed, using fallback"
                );
                fallback().await
            }
        }
    }

    /// Get health status
    pub fn get_health_status(&self) -> codeprism_core::HealthCheckResult {
        self.health_monitor.health_check()
    }

    /// Get metrics snapshot
    pub fn get_metrics(&self) -> codeprism_core::MetricsSnapshot {
        self.metrics_collector.get_metrics_snapshot()
    }

    /// Check if system is healthy
    pub fn is_healthy(&self) -> bool {
        self.resilience_manager.is_healthy()
    }

    /// Convert MCP error to JSON-RPC response
    pub fn error_to_response(
        &self,
        error: &McpError,
        request_id: serde_json::Value,
    ) -> JsonRpcResponse {
        JsonRpcResponse {
            jsonrpc: "2.0".to_string(),
            id: request_id,
            result: None,
            error: Some(error.to_json_rpc_error()),
        }
    }
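
    // Dispatch sketch (illustrative; `dispatch` is a hypothetical request
    // handler and `request_id` the ID parsed from the incoming message):
    //
    //     let response = match dispatch(&request).await {
    //         Ok(response) => response,
    //         Err(err) => handler.error_to_response(&err, request_id.clone()),
    //     };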

    /// Handle partial results for large operations
    pub async fn handle_partial_operation<T>(
        &self,
        operation_name: &str,
        total_items: usize,
        processed_items: usize,
        error: &McpError,
    ) -> McpResult<Option<T>> {
        let completion_rate = (processed_items as f64 / total_items as f64) * 100.0;

        match error.recovery_strategy() {
            RecoveryStrategy::Degrade => {
                if completion_rate >= 80.0 {
                    warn!(
                        operation = operation_name,
                        completion_rate = completion_rate,
                        error = %error,
                        "Operation completed with degraded results"
                    );
                    Ok(None) // Return partial success
                } else {
                    error!(
                        operation = operation_name,
                        completion_rate = completion_rate,
                        error = %error,
                        "Operation failed with insufficient completion rate"
                    );
                    Err(error.clone())
                }
            }
            _ => Err(error.clone()),
        }
    }
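
    // Caller sketch (illustrative): a batch operation that processed 85 of 100
    // items before hitting `err` asks whether a partial result is acceptable.
    // Only errors whose recovery strategy is `Degrade`, at 80% completion or
    // more, yield `Ok(None)`; everything else is propagated.
    //
    //     match handler
    //         .handle_partial_operation::<IndexSummary>("index_files", 100, 85, &err)
    //         .await
    //     {
    //         Ok(_) => { /* report partial success */ }
    //         Err(e) => { /* propagate the failure */ }
    //     }
    //
    // `IndexSummary` is a hypothetical result type used only for this sketch.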

    /// Trigger critical alert (placeholder for notification system)
    async fn trigger_critical_alert(&self, error: &McpError, operation: Option<&str>) {
        // In a real implementation, this would integrate with:
        // - PagerDuty, Slack, email alerts
        // - Monitoring systems like Prometheus/Grafana
        // - Incident management systems

        error!(
            alert_type = "CRITICAL_ERROR",
            error = %error,
            operation = operation,
            timestamp = %chrono::Utc::now(),
            "CRITICAL ALERT: Manual intervention required"
        );

        // Example: Send to external monitoring system
        // monitoring_client.send_alert(AlertLevel::Critical, error, operation).await;
    }

    /// Create error context for better tracing
    pub fn create_context(
        &self,
        request_id: Option<String>,
        operation: Option<String>,
    ) -> ErrorContext {
        let mut context = ErrorContext::new();

        if let Some(id) = request_id {
            context = context.with_request_id(id);
        }

        if let Some(op) = operation {
            context = context.with_operation(op);
        }

        // Add system metrics as context
        let health = self.get_health_status();
        context = context.with_metadata(
            "system_health".to_string(),
            serde_json::to_value(health.status).unwrap_or_default(),
        );

        context
    }
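
    // Usage sketch: attach request/operation identifiers plus a health snapshot
    // before reporting an error ("req-123" mirrors the unit test below; the
    // tool name and message are illustrative):
    //
    //     let context = handler.create_context(
    //         Some("req-123".to_string()),
    //         Some("analyze_repository".to_string()),
    //     );
    //     let err = mcp_tool_error!("analyze_repository", "worker crashed", context);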
}

impl Default for McpErrorHandler {
    fn default() -> Self {
        Self::new()
    }
}

/// Helper macro: unwrap a `Result<T, CoreError>`, routing any error through the
/// given handler (logging and metrics) before returning it as an `McpError`.
/// Must be used inside an async function that returns `McpResult<T>`.
#[macro_export]
macro_rules! mcp_try {
    ($expr:expr, $handler:expr, $operation:expr) => {
        match $expr {
            Ok(value) => value,
            Err(error) => {
                let mcp_error = McpError::Core(error);
                $handler.handle_error(&mcp_error, Some($operation)).await;
                return Err(mcp_error);
            }
        }
    };
}

/// Helper macro: construct an `McpError::ToolExecution` from a tool name and
/// message, with an optional `ErrorContext` as the third argument.
#[macro_export]
macro_rules! mcp_tool_error {
    ($tool_name:expr, $message:expr) => {
        McpError::ToolExecution {
            tool_name: $tool_name.to_string(),
            message: $message.to_string(),
            context: None,
        }
    };
    ($tool_name:expr, $message:expr, $context:expr) => {
        McpError::ToolExecution {
            tool_name: $tool_name.to_string(),
            message: $message.to_string(),
            context: Some($context),
        }
    };
}
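
// Usage sketch for the two macros above (illustrative; assumes an async fn
// returning `McpResult<T>`, a handler in scope, and a hypothetical fallible
// call `load_graph()` that returns a core `Result`):
//
//     let graph = mcp_try!(load_graph(), handler, "load_graph");
//     if graph.is_empty() {
//         return Err(mcp_tool_error!("trace_path", "graph is empty"));
//     }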

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_mcp_error_severity() {
        let error = McpError::Protocol("test error".to_string());
        assert_eq!(error.severity(), ErrorSeverity::Error);

        let error = McpError::Cancelled {
            operation: "test_op".to_string(),
            reason: None,
        };
        assert_eq!(error.severity(), ErrorSeverity::Info);
    }

    #[test]
    fn test_mcp_error_json_rpc_conversion() {
        let error = McpError::ToolExecution {
            tool_name: "test_tool".to_string(),
            message: "test error".to_string(),
            context: None,
        };

        let json_rpc_error = error.to_json_rpc_error();
        assert_eq!(json_rpc_error.code, -32100);
        assert!(json_rpc_error.message.contains("test error"));
    }

    #[tokio::test]
    async fn test_error_handler_creation() {
        let handler = McpErrorHandler::new();
        assert!(handler.is_healthy());
    }

    #[tokio::test]
    async fn test_execute_with_recovery_success() {
        let handler = McpErrorHandler::new();

        let result = handler
            .execute_with_recovery("test_op", || async { Ok::<i32, McpError>(42) })
            .await;

        assert!(result.is_ok());
        assert_eq!(result.unwrap(), 42);
    }

    #[tokio::test]
    async fn test_execute_with_recovery_failure() {
        let handler = McpErrorHandler::new();

        let result = handler
            .execute_with_recovery("test_op", || async {
                Err::<i32, McpError>(McpError::Protocol("test error".to_string()))
            })
            .await;

        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_execute_with_fallback() {
        let handler = McpErrorHandler::new();

        let result = handler
            .execute_with_fallback(
                "test_op",
                || async { Err::<i32, McpError>(McpError::Protocol("test error".to_string())) },
                || async { 100 },
            )
            .await;

        assert_eq!(result, 100);
    }

    #[tokio::test]
    async fn test_error_handling_and_metrics() {
        let handler = McpErrorHandler::new();

        let error = McpError::ToolExecution {
            tool_name: "test_tool".to_string(),
            message: "test error".to_string(),
            context: None,
        };

        handler.handle_error(&error, Some("test_operation")).await;

        let metrics = handler.get_metrics();
        // The handler was just created, so the reported uptime should be small
        assert!(metrics.uptime_seconds < 365 * 24 * 3600); // Sanity bound: less than a year
    }

    #[test]
    fn test_error_context_creation() {
        let handler = McpErrorHandler::new();

        let context = handler.create_context(
            Some("req-123".to_string()),
            Some("test_operation".to_string()),
        );

        assert_eq!(context.request_id, Some("req-123".to_string()));
        assert_eq!(context.operation, Some("test_operation".to_string()));
        assert!(!context.metadata.is_empty());
    }

    #[tokio::test]
    async fn test_partial_operation_handling() {
        let handler = McpErrorHandler::new();

        let error = McpError::Core(CoreError::indexing("partial failure"));

        // `CoreError::indexing` does not map to the `Degrade` recovery strategy
        // here, so even a high completion rate (85%) is rejected rather than
        // treated as a partial success.
        let result = handler
            .handle_partial_operation::<()>("test_op", 100, 85, &error)
            .await;
        assert!(result.is_err());

        // A low completion rate (50%) is rejected as well.
        let result = handler
            .handle_partial_operation::<()>("test_op", 100, 50, &error)
            .await;
        assert!(result.is_err());
    }
}