Skip to main content

symbi_runtime/types/
error.rs

//! Error types and recovery strategies for the Agent Runtime System
2
use std::time::Duration;
use thiserror::Error;

use super::{AgentId, MessageId, PolicyId, RequestId};
7
8/// Main runtime error type
9#[derive(Error, Debug, Clone)]
10pub enum RuntimeError {
11    #[error("Configuration error: {0}")]
12    Configuration(#[from] ConfigError),
13
14    #[error("Resource error: {0}")]
15    Resource(#[from] ResourceError),
16
17    #[error("Security error: {0}")]
18    Security(#[from] SecurityError),
19
20    #[error("Communication error: {0}")]
21    Communication(#[from] CommunicationError),
22
23    #[error("Policy error: {0}")]
24    Policy(#[from] PolicyError),
25
26    #[error("Sandbox error: {0}")]
27    Sandbox(#[from] SandboxError),
28
29    #[error("Scheduler error: {0}")]
30    Scheduler(#[from] SchedulerError),
31
32    #[error("Lifecycle error: {0}")]
33    Lifecycle(#[from] LifecycleError),
34
35    #[error("Audit error: {0}")]
36    Audit(#[from] AuditError),
37
38    #[error("Error handler error: {0}")]
39    ErrorHandler(#[from] ErrorHandlerError),
40
41    #[error("Internal error: {0}")]
42    Internal(String),
43}
44
45/// Configuration-related errors
46#[derive(Error, Debug, Clone)]
47pub enum ConfigError {
48    #[error("Invalid configuration: {0}")]
49    Invalid(String),
50
51    #[error("Missing required field: {0}")]
52    MissingField(String),
53
54    #[error("Configuration file not found: {0}")]
55    FileNotFound(String),
56
57    #[error("Failed to parse configuration: {0}")]
58    ParseError(String),
59}
60
61/// Resource management errors
62#[derive(Error, Debug, Clone)]
63pub enum ResourceError {
64    #[error("Insufficient resources: {0}")]
65    Insufficient(String),
66
67    #[error("Resource allocation failed for agent {agent_id}: {reason}")]
68    AllocationFailed { agent_id: AgentId, reason: Box<str> },
69
70    #[error("Resource limit exceeded: {0}")]
71    LimitExceeded(String),
72
73    #[error("Resource not found: {0}")]
74    NotFound(String),
75
76    #[error("Resource monitoring failed: {0}")]
77    MonitoringFailed(String),
78
79    #[error("Agent not found: {agent_id}")]
80    AgentNotFound { agent_id: AgentId },
81
82    #[error("System is shutting down")]
83    ShuttingDown,
84
85    #[error("Allocation already exists for agent: {agent_id}")]
86    AllocationExists { agent_id: AgentId },
87
88    #[error("Insufficient resources for requirements: {requirements:?}")]
89    InsufficientResources { requirements: Box<str> },
90
91    #[error("Policy error: {0}")]
92    PolicyError(String),
93
94    #[error("Policy violation: {reason}")]
95    PolicyViolation { reason: Box<str> },
96
97    #[error("Resource allocation queued: {reason}")]
98    AllocationQueued { reason: Box<str> },
99
100    #[error("Escalation required: {reason}")]
101    EscalationRequired { reason: Box<str> },
102}
103
104/// Security-related errors
105#[derive(Error, Debug, Clone)]
106pub enum SecurityError {
107    #[error("Authentication failed: {0}")]
108    AuthenticationFailed(String),
109
110    #[error("Authorization denied: {0}")]
111    AuthorizationDenied(String),
112
113    #[error("Encryption failed: {0}")]
114    EncryptionFailed(String),
115
116    #[error("Signature verification failed: {0}")]
117    SignatureVerificationFailed(String),
118
119    #[error("Policy violation: {0}")]
120    PolicyViolation(String),
121
122    #[error("Sandbox breach detected: {0}")]
123    SandboxBreach(String),
124
125    #[error("Key management error: {0}")]
126    KeyManagement(String),
127}
128
129/// Communication system errors
130#[derive(Error, Debug, Clone)]
131pub enum CommunicationError {
132    #[error("Message delivery failed for message {message_id}: {reason}")]
133    DeliveryFailed {
134        message_id: MessageId,
135        reason: Box<str>,
136    },
137
138    #[error("Connection failed: {0}")]
139    ConnectionFailed(String),
140
141    #[error("Message timeout: {0}")]
142    Timeout(String),
143
144    #[error("Invalid message format: {0}")]
145    InvalidFormat(String),
146
147    #[error("Rate limit exceeded: {0}")]
148    RateLimitExceeded(String),
149
150    #[error("Channel not found: {0}")]
151    ChannelNotFound(String),
152
153    #[error("Message too large: {size} bytes, max allowed: {max_size} bytes")]
154    MessageTooLarge { size: usize, max_size: usize },
155
156    #[error("Communication system is shutting down")]
157    ShuttingDown,
158
159    #[error("Event processing failed: {reason}")]
160    EventProcessingFailed { reason: Box<str> },
161
162    #[error("Agent not registered: {agent_id}")]
163    AgentNotRegistered { agent_id: AgentId },
164
165    #[error("Message not found: {message_id}")]
166    MessageNotFound { message_id: MessageId },
167
168    #[error("Request timeout: request {request_id} timed out after {timeout:?}")]
169    RequestTimeout {
170        request_id: RequestId,
171        timeout: Duration,
172    },
173
174    #[error("Request cancelled: {request_id}")]
175    RequestCancelled { request_id: RequestId },
176
177    #[error("Policy denied: {reason}")]
178    PolicyDenied { reason: Box<str> },
179}
180
181/// Policy enforcement errors
182#[derive(Error, Debug, Clone)]
183pub enum PolicyError {
184    #[error("Policy not found: {policy_id}")]
185    NotFound { policy_id: PolicyId },
186
187    #[error("Policy not found: {id}")]
188    PolicyNotFound { id: PolicyId },
189
190    #[error("Policy evaluation failed: {0}")]
191    EvaluationFailed(String),
192
193    #[error("Policy compilation failed: {0}")]
194    CompilationFailed(String),
195
196    #[error("Policy conflict detected: {0}")]
197    Conflict(String),
198
199    #[error("Policy engine unavailable: {0}")]
200    EngineUnavailable(String),
201
202    #[error("Invalid policy: {reason}")]
203    InvalidPolicy { reason: Box<str> },
204}
205
206/// Sandbox orchestration errors
207#[derive(Error, Debug, Clone)]
208pub enum SandboxError {
209    #[error("Sandbox creation failed: {0}")]
210    CreationFailed(String),
211
212    #[error("Sandbox execution failed: {0}")]
213    ExecutionFailed(String),
214
215    #[error("Sandbox not found: {0}")]
216    NotFound(String),
217
218    #[error("Sandbox not found: {id}")]
219    SandboxNotFound { id: String },
220
221    #[error("Snapshot not found: {id}")]
222    SnapshotNotFound { id: String },
223
224    #[error("Sandbox termination failed: {0}")]
225    TerminationFailed(String),
226
227    #[error("Sandbox monitoring failed: {0}")]
228    MonitoringFailed(String),
229
230    #[error("Unsupported security tier: {0}")]
231    UnsupportedTier(String),
232}
233
234/// Scheduler errors
235#[derive(Error, Debug, Clone)]
236pub enum SchedulerError {
237    #[error("Agent scheduling failed for {agent_id}: {reason}")]
238    SchedulingFailed { agent_id: AgentId, reason: Box<str> },
239
240    #[error("Agent not found: {agent_id}")]
241    AgentNotFound { agent_id: AgentId },
242
243    #[error("Scheduler overloaded: {0}")]
244    Overloaded(Box<str>),
245
246    #[error("Invalid priority: {0}")]
247    InvalidPriority(String),
248
249    #[error("Scheduler shutdown in progress")]
250    ShuttingDown,
251
252    #[error("Serialization failed: {0}")]
253    SerializationFailed(String),
254}
255
256impl From<serde_json::Error> for SchedulerError {
257    fn from(error: serde_json::Error) -> Self {
258        SchedulerError::SerializationFailed(format!("JSON serialization error: {}", error))
259    }
260}
261
262/// Lifecycle management errors
263#[derive(Error, Debug, Clone)]
264pub enum LifecycleError {
265    #[error("Agent initialization failed for {agent_id}: {reason}")]
266    InitializationFailed { agent_id: AgentId, reason: String },
267
268    #[error("Agent execution failed for {agent_id}: {reason}")]
269    ExecutionFailed { agent_id: AgentId, reason: String },
270
271    #[error("Agent termination failed for {agent_id}: {reason}")]
272    TerminationFailed { agent_id: AgentId, reason: String },
273
274    #[error("Invalid state transition from {from} to {to}")]
275    InvalidStateTransition { from: String, to: String },
276
277    #[error("DSL parsing failed: {0}")]
278    DslParsingFailed(String),
279
280    #[error("Agent not found: {agent_id}")]
281    AgentNotFound { agent_id: AgentId },
282
283    #[error("Event processing failed: {reason}")]
284    EventProcessingFailed { reason: String },
285
286    #[error("System is shutting down")]
287    ShuttingDown,
288
289    #[error("Resource exhausted: {reason}")]
290    ResourceExhausted { reason: String },
291}
292
293/// Audit trail errors
294#[derive(Error, Debug, Clone)]
295pub enum AuditError {
296    #[error("Audit logging failed: {0}")]
297    LoggingFailed(String),
298
299    #[error("Audit verification failed: {0}")]
300    VerificationFailed(String),
301
302    #[error("Audit query failed: {0}")]
303    QueryFailed(String),
304
305    #[error("Audit storage full: {0}")]
306    StorageFull(String),
307
308    #[error("Audit trail corrupted: {0}")]
309    Corrupted(String),
310
311    #[error("Record not found: {id}")]
312    RecordNotFound { id: String },
313
314    #[error("Export failed: {reason}")]
315    ExportFailed { reason: String },
316
317    #[error("Unsupported format: {format}")]
318    UnsupportedFormat { format: String },
319}
320
321/// Error handler errors
322#[derive(Error, Debug, Clone)]
323pub enum ErrorHandlerError {
324    #[error("Configuration error: {reason}")]
325    ConfigurationError { reason: String },
326
327    #[error("Event processing failed: {reason}")]
328    EventProcessingFailed { reason: String },
329
330    #[error("Error handler is shutting down")]
331    ShuttingDown,
332}
333
334/// Recovery strategies for different types of errors
335#[derive(Debug, Clone)]
336pub enum RecoveryStrategy {
337    /// Retry the operation with exponential backoff
338    Retry {
339        max_attempts: u32,
340        backoff: Duration,
341    },
342    /// Restart the agent, optionally preserving state
343    Restart { preserve_state: bool },
344    /// Failover to a backup agent
345    Failover { backup_agent: Option<AgentId> },
346    /// Terminate the agent with cleanup
347    Terminate { cleanup: bool },
348    /// Manual intervention required
349    Manual { reason: String },
350    /// No recovery possible
351    None,
352}
353
354impl Default for RecoveryStrategy {
355    fn default() -> Self {
356        RecoveryStrategy::Retry {
357            max_attempts: 3,
358            backoff: Duration::from_secs(1),
359        }
360    }
361}
362
363/// Error recovery configuration
364#[derive(Debug, Clone)]
365pub struct ErrorRecoveryConfig {
366    pub default_strategy: RecoveryStrategy,
367    pub max_recovery_attempts: u32,
368    pub recovery_timeout: Duration,
369    pub escalation_threshold: u32,
370}
371
372impl Default for ErrorRecoveryConfig {
373    fn default() -> Self {
374        Self {
375            default_strategy: RecoveryStrategy::default(),
376            max_recovery_attempts: 5,
377            recovery_timeout: Duration::from_secs(300), // 5 minutes
378            escalation_threshold: 10,
379        }
380    }
381}
382
383/// Error context for better debugging and recovery
384#[derive(Debug, Clone)]
385pub struct ErrorContext {
386    pub agent_id: Option<AgentId>,
387    pub operation: String,
388    pub timestamp: std::time::SystemTime,
389    pub recovery_attempts: u32,
390    pub additional_info: std::collections::HashMap<String, String>,
391}
392
393impl ErrorContext {
394    pub fn new(operation: String) -> Self {
395        Self {
396            agent_id: None,
397            operation,
398            timestamp: std::time::SystemTime::now(),
399            recovery_attempts: 0,
400            additional_info: std::collections::HashMap::new(),
401        }
402    }
403
404    pub fn with_agent(mut self, agent_id: AgentId) -> Self {
405        self.agent_id = Some(agent_id);
406        self
407    }
408
409    pub fn with_info(mut self, key: String, value: String) -> Self {
410        self.additional_info.insert(key, value);
411        self
412    }
413
414    pub fn increment_attempts(&mut self) {
415        self.recovery_attempts += 1;
416    }
417}
418
419/// Result type with error context
420pub type RuntimeResult<T> = Result<T, RuntimeError>;
421
422/// Trait for error recovery
423pub trait ErrorRecovery {
424    fn get_recovery_strategy(&self, error: &RuntimeError) -> RecoveryStrategy;
425    fn should_retry(&self, error: &RuntimeError, attempts: u32) -> bool;
426    fn escalate_error(&self, error: &RuntimeError, context: &ErrorContext);
427}