use crate::core::timeout_detector::{OperationType, TIMEOUT_DETECTOR};
use crate::utils::current_timestamp;
use anyhow::Result;
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use serde_json::Value;
/// A single recorded failure together with every recovery attempt made for it.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ExecutionError {
    /// Identifier assigned by `ErrorRecoveryManager::record_error`
    /// (formatted as `e{index}_{timestamp % 10000}`).
    pub id: String,
    /// Value of `current_timestamp()` when the error was recorded.
    pub timestamp: u64,
    /// Category used to look up recovery strategies and the operation type.
    pub error_type: ErrorType,
    /// Message supplied by the caller that recorded the error.
    pub message: String,
    /// Caller-supplied context captured alongside the error.
    pub context: ErrorContext,
    /// Recovery attempts, in the order they were recorded.
    pub recovery_attempts: Vec<RecoveryAttempt>,
    /// Set to `true` once a successful recovery attempt has been recorded.
    pub resolved: bool,
}
/// Category of a recorded error.
///
/// `ErrorRecoveryManager` uses this as the key for its recovery-strategy
/// table and for its mapping to `OperationType`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ErrorType {
    ToolExecution,
    ApiCall,
    FileSystem,
    Network,
    Validation,
    CircuitBreaker,
    Timeout,
    PermissionDenied,
    InvalidArguments,
    ResourceNotFound,
    /// Catch-all category.
    Other,
}
/// Caller-supplied context stored with an [`ExecutionError`].
///
/// NOTE(review): this file only stores these values and never reads them, so
/// the per-field notes below are inferred from the field names — confirm
/// against the call sites.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ErrorContext {
    /// Presumably the index of the conversation turn in which the error occurred.
    pub conversation_turn: usize,
    /// Presumably the user input being processed, if any.
    pub user_input: Option<String>,
    /// Presumably the name of the tool involved, if any.
    pub tool_name: Option<String>,
    /// Presumably the arguments passed to that tool, as JSON.
    pub tool_args: Option<Value>,
    /// Presumably the size of the API request (units not shown here).
    pub api_request_size: Option<usize>,
    /// Presumably the size of the context at the time (units not shown here).
    pub context_size: Option<usize>,
    /// Optional stack trace text.
    pub stack_trace: Option<String>,
}
/// One attempt to recover from an [`ExecutionError`].
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RecoveryAttempt {
    /// Value of `current_timestamp()` when the attempt was recorded.
    pub timestamp: u64,
    /// Strategy that was tried.
    pub strategy: RecoveryStrategy,
    /// Whether the attempt succeeded; a successful attempt marks the owning
    /// error as resolved.
    pub success: bool,
    /// Caller-supplied description of the outcome.
    pub result: String,
    /// Optional context size after the attempt — presumably only meaningful
    /// for strategies that shrink the context; confirm with callers.
    pub new_context_size: Option<usize>,
}
/// A way of attempting to recover from an error.
///
/// `ErrorRecoveryManager::new` registers `RetryWithBackoff` followed by
/// `AlternativeTool` for `ErrorType::ToolExecution`, and `RetryWithBackoff`
/// followed by `ContextReset` for `ErrorType::ApiCall`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum RecoveryStrategy {
    /// Retry after a delay.
    RetryWithBackoff {
        /// Delay in milliseconds.
        delay_ms: u64,
        /// Attempt counter; the defaults built in `new` start at 1.
        attempt_number: usize,
    },
    /// Retry with a reduced request.
    SimplifyRequest {
        /// Names of the parameters that were removed.
        removed_parameters: Vec<String>,
    },
    /// Switch to a different tool.
    AlternativeTool {
        original_tool: String,
        alternative_tool: String,
    },
    /// Reset the context while keeping selected data.
    ContextReset {
        /// Data carried over across the reset.
        preserved_data: IndexMap<String, Value>,
    },
    /// Hand the problem to a human.
    ManualIntervention,
}
/// Records execution errors and their recovery attempts, and maps error
/// categories to recovery strategies and timeout-detector operation types.
pub struct ErrorRecoveryManager {
    /// Every error recorded so far, in insertion order; entries are never
    /// removed by the methods in this file.
    errors: Vec<ExecutionError>,
    /// Default recovery strategies per error category.
    recovery_strategies: IndexMap<ErrorType, Vec<RecoveryStrategy>>,
    /// Operation type handed to `TIMEOUT_DETECTOR` for each error category.
    operation_type_mapping: IndexMap<ErrorType, OperationType>,
}
impl Default for ErrorRecoveryManager {
fn default() -> Self {
Self::new()
}
}
impl ErrorRecoveryManager {
pub fn new() -> Self {
let mut recovery_strategies = IndexMap::with_capacity(2);
let mut operation_type_mapping = IndexMap::with_capacity(11);
recovery_strategies.insert(
ErrorType::ToolExecution,
vec![
RecoveryStrategy::RetryWithBackoff {
delay_ms: 1000,
attempt_number: 1,
},
RecoveryStrategy::AlternativeTool {
original_tool: String::new(),
alternative_tool: String::new(),
},
],
);
recovery_strategies.insert(
ErrorType::ApiCall,
vec![
RecoveryStrategy::RetryWithBackoff {
delay_ms: 2000,
attempt_number: 1,
},
RecoveryStrategy::ContextReset {
preserved_data: IndexMap::new(),
},
],
);
operation_type_mapping.insert(ErrorType::ToolExecution, OperationType::ToolExecution);
operation_type_mapping.insert(ErrorType::ApiCall, OperationType::ApiCall);
operation_type_mapping.insert(ErrorType::Network, OperationType::NetworkRequest);
operation_type_mapping.insert(ErrorType::FileSystem, OperationType::FileOperation);
operation_type_mapping.insert(ErrorType::Validation, OperationType::Processing);
operation_type_mapping.insert(ErrorType::CircuitBreaker, OperationType::ToolExecution);
operation_type_mapping.insert(ErrorType::Timeout, OperationType::Processing);
operation_type_mapping.insert(ErrorType::PermissionDenied, OperationType::Processing);
operation_type_mapping.insert(ErrorType::InvalidArguments, OperationType::Processing);
operation_type_mapping.insert(ErrorType::ResourceNotFound, OperationType::FileOperation);
operation_type_mapping.insert(ErrorType::Other, OperationType::Processing);
Self {
errors: Vec::with_capacity(16), recovery_strategies,
operation_type_mapping,
}
}
/// Records a new, unresolved error and returns its identifier.
///
/// The identifier has the form `e{index}_{suffix}`, where `index` is the
/// number of errors recorded before this one and `suffix` is the recording
/// timestamp modulo 10000. Because errors are never removed by this type,
/// the index alone keeps identifiers unique within one manager.
///
/// The clock is read exactly once so the identifier suffix and the stored
/// `timestamp` always derive from the same instant.
pub fn record_error(
    &mut self,
    error_type: ErrorType,
    message: String,
    context: ErrorContext,
) -> String {
    let timestamp = current_timestamp();
    let error_id = format!("e{}_{}", self.errors.len(), timestamp % 10000);
    self.errors.push(ExecutionError {
        id: error_id.clone(),
        timestamp,
        error_type,
        message,
        context,
        recovery_attempts: Vec::with_capacity(2),
        resolved: false,
    });
    error_id
}
/// Appends a recovery attempt to the error whose id equals `error_id`.
///
/// A successful attempt marks that error as resolved; an unsuccessful one
/// leaves the flag untouched. If no error has the given id, the attempt is
/// silently dropped.
#[inline]
pub fn record_recovery_attempt(
    &mut self,
    error_id: &str,
    strategy: RecoveryStrategy,
    success: bool,
    result: String,
    new_context_size: Option<usize>,
) {
    let entry = RecoveryAttempt {
        timestamp: current_timestamp(),
        strategy,
        success,
        result,
        new_context_size,
    };
    for recorded in &mut self.errors {
        if recorded.id == error_id {
            recorded.recovery_attempts.push(entry);
            // Only ever flips the flag to `true`; a failed attempt does not
            // un-resolve an error.
            recorded.resolved |= success;
            return;
        }
    }
}
/// Returns the strategies registered for `error_type`, or an empty slice
/// when none are registered.
#[inline]
pub fn get_recovery_strategies(&self, error_type: &ErrorType) -> &[RecoveryStrategy] {
    match self.recovery_strategies.get(error_type) {
        Some(registered) => registered.as_slice(),
        None => &[],
    }
}
/// Builds a context-preservation plan from the supplied counters.
///
/// More than five errors is treated as critical: the plan recommends
/// selective retention followed by a context reset. Otherwise the plan
/// recommends no action with low urgency. `context_size` is copied into the
/// plan and does not influence the recommendation.
pub fn generate_context_preservation_plan(
    &self,
    context_size: usize,
    error_count: usize,
) -> ContextPreservationPlan {
    let (recommended_strategies, urgency) = if error_count > 5 {
        (
            vec![
                PreservationStrategy::SelectiveRetention {
                    preserve_decisions: true,
                    preserve_errors: true,
                },
                PreservationStrategy::ContextReset {
                    preserve_session_data: true,
                },
            ],
            Urgency::Critical,
        )
    } else {
        (vec![PreservationStrategy::NoAction], Urgency::Low)
    };

    ContextPreservationPlan {
        current_context_size: context_size,
        error_count,
        recommended_strategies,
        urgency,
    }
}
/// Summarises every recorded error.
///
/// Reports the resolved/unresolved split, a per-category count (ordered by
/// first occurrence), the mean number of recovery attempts per error, and
/// clones of up to the five most recent errors, newest first. With no
/// recorded errors every counter is zero and the average is `0.0`.
pub fn get_error_statistics(&self) -> ErrorStatistics {
    let total_errors = self.errors.len();

    let mut errors_by_type = IndexMap::new();
    let mut resolved_errors = 0usize;
    let mut attempt_total = 0usize;
    for recorded in &self.errors {
        if recorded.resolved {
            resolved_errors += 1;
        }
        attempt_total += recorded.recovery_attempts.len();
        *errors_by_type.entry(recorded.error_type).or_insert(0) += 1;
    }

    // Guard the division so an empty history reports 0.0 rather than NaN.
    let avg_recovery_attempts = if total_errors == 0 {
        0.0
    } else {
        attempt_total as f64 / total_errors as f64
    };

    let recent_errors: Vec<_> = self.errors.iter().rev().take(5).cloned().collect();

    ErrorStatistics {
        total_errors,
        resolved_errors,
        unresolved_errors: total_errors - resolved_errors,
        errors_by_type,
        avg_recovery_attempts,
        recent_errors,
    }
}
/// Returns `true` when at least three errors of `error_type` were recorded
/// less than `time_window_seconds` before the current `current_timestamp()`.
///
/// The age of each error is computed with a saturating subtraction: if a
/// stored timestamp is ahead of the current reading (for example after the
/// clock was adjusted backwards), the error is treated as having age zero
/// instead of triggering an integer underflow.
pub fn detect_error_pattern(&self, error_type: &ErrorType, time_window_seconds: u64) -> bool {
    /// Number of matching errors inside the window that counts as a pattern.
    const PATTERN_THRESHOLD: usize = 3;

    let now = current_timestamp();
    let recent_errors = self
        .errors
        .iter()
        .filter(|e| {
            e.error_type == *error_type
                && now.saturating_sub(e.timestamp) < time_window_seconds
        })
        .count();
    recent_errors >= PATTERN_THRESHOLD
}
/// Returns the `OperationType` mapped to `error_type`, falling back to
/// `OperationType::Processing` when the category has no mapping.
pub fn get_operation_type(&self, error_type: &ErrorType) -> OperationType {
    match self.operation_type_mapping.get(error_type) {
        Some(mapped) => mapped.clone(),
        None => OperationType::Processing,
    }
}
/// Runs `operation` through `TIMEOUT_DETECTOR.execute_with_timeout_retry`,
/// using the operation type mapped to `error_type`.
///
/// `_context` is accepted but not used by this method, and nothing is
/// recorded in this manager's error list here. The result is whatever the
/// timeout detector returns.
pub async fn execute_with_recovery<F, Fut, T>(
    &mut self,
    operation_id: String,
    error_type: ErrorType,
    _context: ErrorContext,
    operation: F,
) -> Result<T, anyhow::Error>
where
    F: Fn() -> Fut,
    Fut: Future<Output = Result<T, anyhow::Error>>,
{
    let mapped_kind = self.get_operation_type(&error_type);
    let outcome = TIMEOUT_DETECTOR.execute_with_timeout_retry(operation_id, mapped_kind, operation);
    outcome.await
}
pub async fn should_retry_operation(
&self,
error_type: &ErrorType,
error: &anyhow::Error,
attempt: u32,
) -> bool {
let operation_type = self.get_operation_type(error_type);
TIMEOUT_DETECTOR
.should_retry(&operation_type, error, attempt)
.await
}
/// Returns the statistics reported by `TIMEOUT_DETECTOR.get_stats()`.
pub async fn get_timeout_stats(&self) -> crate::core::timeout_detector::TimeoutStats {
    TIMEOUT_DETECTOR.get_stats().await
}
/// Installs `config` on `TIMEOUT_DETECTOR` for the operation type mapped to
/// `error_type`.
///
/// Several error categories share one operation type (for example
/// `Validation`, `Timeout` and `Other` all map to `Processing`), so the
/// configuration is set for the shared operation type, not for the single
/// category passed in.
pub async fn configure_timeout_for_error_type(
    &self,
    error_type: ErrorType,
    config: crate::core::timeout_detector::TimeoutConfig,
) {
    let mapped_kind = self.get_operation_type(&error_type);
    let pending = TIMEOUT_DETECTOR.set_config(mapped_kind, config);
    pending.await;
}
/// Builds a context-preservation plan and enriches it with timeout
/// statistics from `TIMEOUT_DETECTOR`.
///
/// * `timeout_rate` is timed-out operations divided by total operations,
///   or `0.0` when no operations have been counted.
/// * `retry_success_rate` is successful retries divided by retry attempts,
///   or `1.0` when no retries have been counted.
///
/// The urgency of the embedded base plan is escalated from those rates: a
/// timeout rate above 30% makes it `Critical`; otherwise a retry success
/// rate below 50% raises `Low` to `High`. Urgency is never lowered, so a
/// base plan that is already `Critical` stays `Critical`. The recommended
/// strategies of the base plan are left unchanged.
pub async fn generate_enhanced_recovery_plan(
    &self,
    context_size: usize,
    error_count: usize,
) -> EnhancedContextPreservationPlan {
    let timeout_stats = self.get_timeout_stats().await;
    let mut base_plan = self.generate_context_preservation_plan(context_size, error_count);

    let timeout_rate = if timeout_stats.total_operations > 0 {
        timeout_stats.timed_out_operations as f64 / timeout_stats.total_operations as f64
    } else {
        0.0
    };
    let retry_success_rate = if timeout_stats.total_retry_attempts > 0 {
        timeout_stats.successful_retries as f64 / timeout_stats.total_retry_attempts as f64
    } else {
        1.0
    };

    // Apply the adjustment to the returned plan; previously the adjusted
    // value was computed and then discarded.
    if timeout_rate > 0.3 {
        base_plan.urgency = Urgency::Critical;
    } else if retry_success_rate < 0.5 && matches!(base_plan.urgency, Urgency::Low) {
        base_plan.urgency = Urgency::High;
    }

    EnhancedContextPreservationPlan {
        base_plan,
        timeout_rate,
        retry_success_rate,
        timeout_stats,
    }
}
/// Returns the number of errors recorded so far, resolved or not.
pub fn error_count(&self) -> usize {
    self.errors.len()
}
}
/// Recommendation produced by
/// `ErrorRecoveryManager::generate_context_preservation_plan`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ContextPreservationPlan {
    /// Context size supplied by the caller (copied through unchanged).
    pub current_context_size: usize,
    /// Error count supplied by the caller.
    pub error_count: usize,
    /// Strategies to apply, in the order they were generated.
    pub recommended_strategies: Vec<PreservationStrategy>,
    /// How urgently the plan should be acted on.
    pub urgency: Urgency,
}
/// An action recommended inside a [`ContextPreservationPlan`].
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum PreservationStrategy {
    /// Keep only selected parts of the context.
    SelectiveRetention {
        preserve_decisions: bool,
        preserve_errors: bool,
    },
    /// Reset the context.
    ContextReset {
        preserve_session_data: bool,
    },
    /// Leave the context as it is.
    NoAction,
}
/// Urgency level attached to a [`ContextPreservationPlan`].
///
/// The type derives no ordering traits, so levels cannot be compared with
/// `<` / `>`; match on the variants instead.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum Urgency {
    Low,
    High,
    Critical,
}
/// Aggregate view returned by `ErrorRecoveryManager::get_error_statistics`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ErrorStatistics {
    /// Number of errors recorded.
    pub total_errors: usize,
    /// Number of errors flagged as resolved.
    pub resolved_errors: usize,
    /// `total_errors - resolved_errors`.
    pub unresolved_errors: usize,
    /// Error count per category, ordered by first occurrence.
    pub errors_by_type: IndexMap<ErrorType, usize>,
    /// Mean number of recovery attempts per error (`0.0` when there are none).
    pub avg_recovery_attempts: f64,
    /// Up to five most recent errors, newest first.
    pub recent_errors: Vec<ExecutionError>,
}
/// A [`ContextPreservationPlan`] combined with timeout-detector statistics,
/// as returned by `ErrorRecoveryManager::generate_enhanced_recovery_plan`.
#[derive(Clone, Debug)]
pub struct EnhancedContextPreservationPlan {
    /// The underlying context-preservation plan.
    pub base_plan: ContextPreservationPlan,
    /// Timed-out operations divided by total operations (`0.0` when there
    /// have been no operations).
    pub timeout_rate: f64,
    /// Successful retries divided by retry attempts (`1.0` when there have
    /// been no retries).
    pub retry_success_rate: f64,
    /// The raw statistics the two rates were derived from.
    pub timeout_stats: crate::core::timeout_detector::TimeoutStats,
}