// symbi_runtime/types/error.rs

//! Error types and recovery strategies for the Agent Runtime System
2
3use std::time::Duration;
4use thiserror::Error;
5
6use super::{AgentId, MessageId, PolicyId, RequestId};
7
8/// Main runtime error type
9#[derive(Error, Debug, Clone)]
10pub enum RuntimeError {
11    #[error("Configuration error: {0}")]
12    Configuration(#[from] ConfigError),
13
14    #[error("Resource error: {0}")]
15    Resource(#[from] ResourceError),
16
17    #[error("Security error: {0}")]
18    Security(#[from] SecurityError),
19
20    #[error("Communication error: {0}")]
21    Communication(#[from] CommunicationError),
22
23    #[error("Policy error: {0}")]
24    Policy(#[from] PolicyError),
25
26    #[error("Sandbox error: {0}")]
27    Sandbox(#[from] SandboxError),
28
29    #[error("Scheduler error: {0}")]
30    Scheduler(#[from] SchedulerError),
31
32    #[error("Lifecycle error: {0}")]
33    Lifecycle(#[from] LifecycleError),
34
35    #[error("Audit error: {0}")]
36    Audit(#[from] AuditError),
37
38    #[error("Error handler error: {0}")]
39    ErrorHandler(#[from] ErrorHandlerError),
40
41    #[error("Internal error: {0}")]
42    Internal(String),
43}
44
45/// Configuration-related errors
46#[derive(Error, Debug, Clone)]
47pub enum ConfigError {
48    #[error("Invalid configuration: {0}")]
49    Invalid(String),
50
51    #[error("Missing required field: {0}")]
52    MissingField(String),
53
54    #[error("Configuration file not found: {0}")]
55    FileNotFound(String),
56
57    #[error("Failed to parse configuration: {0}")]
58    ParseError(String),
59}
60
61/// Resource management errors
62#[derive(Error, Debug, Clone)]
63pub enum ResourceError {
64    #[error("Insufficient resources: {0}")]
65    Insufficient(String),
66
67    #[error("Resource allocation failed for agent {agent_id}: {reason}")]
68    AllocationFailed { agent_id: AgentId, reason: String },
69
70    #[error("Resource limit exceeded: {0}")]
71    LimitExceeded(String),
72
73    #[error("Resource not found: {0}")]
74    NotFound(String),
75
76    #[error("Resource monitoring failed: {0}")]
77    MonitoringFailed(String),
78
79    #[error("Agent not found: {agent_id}")]
80    AgentNotFound { agent_id: AgentId },
81
82    #[error("System is shutting down")]
83    ShuttingDown,
84
85    #[error("Allocation already exists for agent: {agent_id}")]
86    AllocationExists { agent_id: AgentId },
87
88    #[error("Insufficient resources for requirements: {requirements:?}")]
89    InsufficientResources { requirements: String },
90
91    #[error("Policy error: {0}")]
92    PolicyError(String),
93
94    #[error("Policy violation: {reason}")]
95    PolicyViolation { reason: String },
96
97    #[error("Resource allocation queued: {reason}")]
98    AllocationQueued { reason: String },
99
100    #[error("Escalation required: {reason}")]
101    EscalationRequired { reason: String },
102}
103
104/// Security-related errors
105#[derive(Error, Debug, Clone)]
106pub enum SecurityError {
107    #[error("Authentication failed: {0}")]
108    AuthenticationFailed(String),
109
110    #[error("Authorization denied: {0}")]
111    AuthorizationDenied(String),
112
113    #[error("Encryption failed: {0}")]
114    EncryptionFailed(String),
115
116    #[error("Signature verification failed: {0}")]
117    SignatureVerificationFailed(String),
118
119    #[error("Policy violation: {0}")]
120    PolicyViolation(String),
121
122    #[error("Sandbox breach detected: {0}")]
123    SandboxBreach(String),
124
125    #[error("Key management error: {0}")]
126    KeyManagement(String),
127}
128
129/// Communication system errors
130#[derive(Error, Debug, Clone)]
131pub enum CommunicationError {
132    #[error("Message delivery failed for message {message_id}: {reason}")]
133    DeliveryFailed {
134        message_id: MessageId,
135        reason: String,
136    },
137
138    #[error("Connection failed: {0}")]
139    ConnectionFailed(String),
140
141    #[error("Message timeout: {0}")]
142    Timeout(String),
143
144    #[error("Invalid message format: {0}")]
145    InvalidFormat(String),
146
147    #[error("Rate limit exceeded: {0}")]
148    RateLimitExceeded(String),
149
150    #[error("Channel not found: {0}")]
151    ChannelNotFound(String),
152
153    #[error("Message too large: {size} bytes, max allowed: {max_size} bytes")]
154    MessageTooLarge { size: usize, max_size: usize },
155
156    #[error("Communication system is shutting down")]
157    ShuttingDown,
158
159    #[error("Event processing failed: {reason}")]
160    EventProcessingFailed { reason: String },
161
162    #[error("Agent not registered: {agent_id}")]
163    AgentNotRegistered { agent_id: AgentId },
164
165    #[error("Message not found: {message_id}")]
166    MessageNotFound { message_id: MessageId },
167
168    #[error("Request timeout: request {request_id} timed out after {timeout:?}")]
169    RequestTimeout {
170        request_id: RequestId,
171        timeout: Duration,
172    },
173
174    #[error("Request cancelled: {request_id}")]
175    RequestCancelled { request_id: RequestId },
176}
177
178/// Policy enforcement errors
179#[derive(Error, Debug, Clone)]
180pub enum PolicyError {
181    #[error("Policy not found: {policy_id}")]
182    NotFound { policy_id: PolicyId },
183
184    #[error("Policy not found: {id}")]
185    PolicyNotFound { id: PolicyId },
186
187    #[error("Policy evaluation failed: {0}")]
188    EvaluationFailed(String),
189
190    #[error("Policy compilation failed: {0}")]
191    CompilationFailed(String),
192
193    #[error("Policy conflict detected: {0}")]
194    Conflict(String),
195
196    #[error("Policy engine unavailable: {0}")]
197    EngineUnavailable(String),
198
199    #[error("Invalid policy: {reason}")]
200    InvalidPolicy { reason: String },
201}
202
203/// Sandbox orchestration errors
204#[derive(Error, Debug, Clone)]
205pub enum SandboxError {
206    #[error("Sandbox creation failed: {0}")]
207    CreationFailed(String),
208
209    #[error("Sandbox execution failed: {0}")]
210    ExecutionFailed(String),
211
212    #[error("Sandbox not found: {0}")]
213    NotFound(String),
214
215    #[error("Sandbox not found: {id}")]
216    SandboxNotFound { id: String },
217
218    #[error("Snapshot not found: {id}")]
219    SnapshotNotFound { id: String },
220
221    #[error("Sandbox termination failed: {0}")]
222    TerminationFailed(String),
223
224    #[error("Sandbox monitoring failed: {0}")]
225    MonitoringFailed(String),
226
227    #[error("Unsupported security tier: {0}")]
228    UnsupportedTier(String),
229}
230
231/// Scheduler errors
232#[derive(Error, Debug, Clone)]
233pub enum SchedulerError {
234    #[error("Agent scheduling failed for {agent_id}: {reason}")]
235    SchedulingFailed { agent_id: AgentId, reason: String },
236
237    #[error("Agent not found: {agent_id}")]
238    AgentNotFound { agent_id: AgentId },
239
240    #[error("Scheduler overloaded: {0}")]
241    Overloaded(String),
242
243    #[error("Invalid priority: {0}")]
244    InvalidPriority(String),
245
246    #[error("Scheduler shutdown in progress")]
247    ShuttingDown,
248
249    #[error("Serialization failed: {0}")]
250    SerializationFailed(String),
251}
252
253impl From<serde_json::Error> for SchedulerError {
254    fn from(error: serde_json::Error) -> Self {
255        SchedulerError::SerializationFailed(format!("JSON serialization error: {}", error))
256    }
257}
258
259/// Lifecycle management errors
260#[derive(Error, Debug, Clone)]
261pub enum LifecycleError {
262    #[error("Agent initialization failed for {agent_id}: {reason}")]
263    InitializationFailed { agent_id: AgentId, reason: String },
264
265    #[error("Agent execution failed for {agent_id}: {reason}")]
266    ExecutionFailed { agent_id: AgentId, reason: String },
267
268    #[error("Agent termination failed for {agent_id}: {reason}")]
269    TerminationFailed { agent_id: AgentId, reason: String },
270
271    #[error("Invalid state transition from {from} to {to}")]
272    InvalidStateTransition { from: String, to: String },
273
274    #[error("DSL parsing failed: {0}")]
275    DslParsingFailed(String),
276
277    #[error("Agent not found: {agent_id}")]
278    AgentNotFound { agent_id: AgentId },
279
280    #[error("Event processing failed: {reason}")]
281    EventProcessingFailed { reason: String },
282
283    #[error("System is shutting down")]
284    ShuttingDown,
285
286    #[error("Resource exhausted: {reason}")]
287    ResourceExhausted { reason: String },
288}
289
290/// Audit trail errors
291#[derive(Error, Debug, Clone)]
292pub enum AuditError {
293    #[error("Audit logging failed: {0}")]
294    LoggingFailed(String),
295
296    #[error("Audit verification failed: {0}")]
297    VerificationFailed(String),
298
299    #[error("Audit query failed: {0}")]
300    QueryFailed(String),
301
302    #[error("Audit storage full: {0}")]
303    StorageFull(String),
304
305    #[error("Audit trail corrupted: {0}")]
306    Corrupted(String),
307
308    #[error("Record not found: {id}")]
309    RecordNotFound { id: String },
310
311    #[error("Export failed: {reason}")]
312    ExportFailed { reason: String },
313
314    #[error("Unsupported format: {format}")]
315    UnsupportedFormat { format: String },
316}
317
318/// Error handler errors
319#[derive(Error, Debug, Clone)]
320pub enum ErrorHandlerError {
321    #[error("Configuration error: {reason}")]
322    ConfigurationError { reason: String },
323
324    #[error("Event processing failed: {reason}")]
325    EventProcessingFailed { reason: String },
326
327    #[error("Error handler is shutting down")]
328    ShuttingDown,
329}
330
331/// Recovery strategies for different types of errors
332#[derive(Debug, Clone)]
333pub enum RecoveryStrategy {
334    /// Retry the operation with exponential backoff
335    Retry {
336        max_attempts: u32,
337        backoff: Duration,
338    },
339    /// Restart the agent, optionally preserving state
340    Restart { preserve_state: bool },
341    /// Failover to a backup agent
342    Failover { backup_agent: Option<AgentId> },
343    /// Terminate the agent with cleanup
344    Terminate { cleanup: bool },
345    /// Manual intervention required
346    Manual { reason: String },
347    /// No recovery possible
348    None,
349}
350
351impl Default for RecoveryStrategy {
352    fn default() -> Self {
353        RecoveryStrategy::Retry {
354            max_attempts: 3,
355            backoff: Duration::from_secs(1),
356        }
357    }
358}
359
360/// Error recovery configuration
361#[derive(Debug, Clone)]
362pub struct ErrorRecoveryConfig {
363    pub default_strategy: RecoveryStrategy,
364    pub max_recovery_attempts: u32,
365    pub recovery_timeout: Duration,
366    pub escalation_threshold: u32,
367}
368
369impl Default for ErrorRecoveryConfig {
370    fn default() -> Self {
371        Self {
372            default_strategy: RecoveryStrategy::default(),
373            max_recovery_attempts: 5,
374            recovery_timeout: Duration::from_secs(300), // 5 minutes
375            escalation_threshold: 10,
376        }
377    }
378}
379
380/// Error context for better debugging and recovery
381#[derive(Debug, Clone)]
382pub struct ErrorContext {
383    pub agent_id: Option<AgentId>,
384    pub operation: String,
385    pub timestamp: std::time::SystemTime,
386    pub recovery_attempts: u32,
387    pub additional_info: std::collections::HashMap<String, String>,
388}
389
390impl ErrorContext {
391    pub fn new(operation: String) -> Self {
392        Self {
393            agent_id: None,
394            operation,
395            timestamp: std::time::SystemTime::now(),
396            recovery_attempts: 0,
397            additional_info: std::collections::HashMap::new(),
398        }
399    }
400
401    pub fn with_agent(mut self, agent_id: AgentId) -> Self {
402        self.agent_id = Some(agent_id);
403        self
404    }
405
406    pub fn with_info(mut self, key: String, value: String) -> Self {
407        self.additional_info.insert(key, value);
408        self
409    }
410
411    pub fn increment_attempts(&mut self) {
412        self.recovery_attempts += 1;
413    }
414}
415
416/// Result type with error context
417pub type RuntimeResult<T> = Result<T, RuntimeError>;
418
419/// Trait for error recovery
420pub trait ErrorRecovery {
421    fn get_recovery_strategy(&self, error: &RuntimeError) -> RecoveryStrategy;
422    fn should_retry(&self, error: &RuntimeError, attempts: u32) -> bool;
423    fn escalate_error(&self, error: &RuntimeError, context: &ErrorContext);
424}