Skip to main content

symbi_runtime/types/
error.rs

1//! Error types and recovery strategies for the Agent Runtime System
2
3use std::time::Duration;
4use thiserror::Error;
5
6use super::{AgentId, MessageId, PolicyId, RequestId};
7
8/// Main runtime error type
9#[derive(Error, Debug, Clone)]
10pub enum RuntimeError {
11    #[error("Configuration error: {0}")]
12    Configuration(#[from] ConfigError),
13
14    #[error("Resource error: {0}")]
15    Resource(#[from] ResourceError),
16
17    #[error("Security error: {0}")]
18    Security(#[from] SecurityError),
19
20    #[error("Communication error: {0}")]
21    Communication(#[from] CommunicationError),
22
23    #[error("Policy error: {0}")]
24    Policy(#[from] PolicyError),
25
26    #[error("Sandbox error: {0}")]
27    Sandbox(#[from] SandboxError),
28
29    #[error("Scheduler error: {0}")]
30    Scheduler(#[from] SchedulerError),
31
32    #[error("Lifecycle error: {0}")]
33    Lifecycle(#[from] LifecycleError),
34
35    #[error("Audit error: {0}")]
36    Audit(#[from] AuditError),
37
38    #[error("Error handler error: {0}")]
39    ErrorHandler(#[from] ErrorHandlerError),
40
41    #[error("Internal error: {0}")]
42    Internal(String),
43
44    #[error("Authentication failed: {0}")]
45    Authentication(String),
46}
47
48/// Configuration-related errors
49#[derive(Error, Debug, Clone)]
50pub enum ConfigError {
51    #[error("Invalid configuration: {0}")]
52    Invalid(String),
53
54    #[error("Missing required field: {0}")]
55    MissingField(String),
56
57    #[error("Configuration file not found: {0}")]
58    FileNotFound(String),
59
60    #[error("Failed to parse configuration: {0}")]
61    ParseError(String),
62}
63
64/// Resource management errors
65#[derive(Error, Debug, Clone)]
66pub enum ResourceError {
67    #[error("Insufficient resources: {0}")]
68    Insufficient(String),
69
70    #[error("Resource allocation failed for agent {agent_id}: {reason}")]
71    AllocationFailed { agent_id: AgentId, reason: Box<str> },
72
73    #[error("Resource limit exceeded: {0}")]
74    LimitExceeded(String),
75
76    #[error("Resource not found: {0}")]
77    NotFound(String),
78
79    #[error("Resource monitoring failed: {0}")]
80    MonitoringFailed(String),
81
82    #[error("Agent not found: {agent_id}")]
83    AgentNotFound { agent_id: AgentId },
84
85    #[error("System is shutting down")]
86    ShuttingDown,
87
88    #[error("Allocation already exists for agent: {agent_id}")]
89    AllocationExists { agent_id: AgentId },
90
91    #[error("Insufficient resources for requirements: {requirements:?}")]
92    InsufficientResources { requirements: Box<str> },
93
94    #[error("Policy error: {0}")]
95    PolicyError(String),
96
97    #[error("Policy violation: {reason}")]
98    PolicyViolation { reason: Box<str> },
99
100    #[error("Resource allocation queued: {reason}")]
101    AllocationQueued { reason: Box<str> },
102
103    #[error("Escalation required: {reason}")]
104    EscalationRequired { reason: Box<str> },
105}
106
107/// Security-related errors
108#[derive(Error, Debug, Clone)]
109pub enum SecurityError {
110    #[error("Authentication failed: {0}")]
111    AuthenticationFailed(String),
112
113    #[error("Authorization denied: {0}")]
114    AuthorizationDenied(String),
115
116    #[error("Encryption failed: {0}")]
117    EncryptionFailed(String),
118
119    #[error("Signature verification failed: {0}")]
120    SignatureVerificationFailed(String),
121
122    #[error("Policy violation: {0}")]
123    PolicyViolation(String),
124
125    #[error("Sandbox breach detected: {0}")]
126    SandboxBreach(String),
127
128    #[error("Key management error: {0}")]
129    KeyManagement(String),
130}
131
132/// Communication system errors
133#[derive(Error, Debug, Clone)]
134pub enum CommunicationError {
135    #[error("Message delivery failed for message {}: {reason}", .message_id.map(|m| m.to_string()).unwrap_or_else(|| "<unknown>".to_string()))]
136    DeliveryFailed {
137        /// ID of the affected message, if known. `None` for transport-level
138        /// failures that occur before a message was assigned an ID (e.g. a
139        /// remote HTTP request that never reached the bus).
140        message_id: Option<MessageId>,
141        reason: Box<str>,
142    },
143
144    #[error("Connection failed: {0}")]
145    ConnectionFailed(String),
146
147    #[error("Message timeout: {0}")]
148    Timeout(String),
149
150    #[error("Invalid message format: {0}")]
151    InvalidFormat(String),
152
153    #[error("Rate limit exceeded: {0}")]
154    RateLimitExceeded(String),
155
156    #[error("Channel not found: {0}")]
157    ChannelNotFound(String),
158
159    #[error("Message too large: {size} bytes, max allowed: {max_size} bytes")]
160    MessageTooLarge { size: usize, max_size: usize },
161
162    #[error("Communication system is shutting down")]
163    ShuttingDown,
164
165    #[error("Event processing failed: {reason}")]
166    EventProcessingFailed { reason: Box<str> },
167
168    #[error("Agent not registered: {agent_id}")]
169    AgentNotRegistered { agent_id: AgentId },
170
171    #[error("Message not found: {message_id}")]
172    MessageNotFound { message_id: MessageId },
173
174    #[error("Request timeout: request {request_id} timed out after {timeout:?}")]
175    RequestTimeout {
176        request_id: RequestId,
177        timeout: Duration,
178    },
179
180    #[error("Request cancelled: {request_id}")]
181    RequestCancelled { request_id: RequestId },
182
183    #[error("Policy denied: {reason}")]
184    PolicyDenied { reason: Box<str> },
185
186    #[error("Signature verification failed for message {message_id}: {reason}")]
187    SignatureInvalid {
188        message_id: MessageId,
189        reason: Box<str>,
190    },
191}
192
193/// Policy enforcement errors
194#[derive(Error, Debug, Clone)]
195pub enum PolicyError {
196    #[error("Policy not found: {policy_id}")]
197    NotFound { policy_id: PolicyId },
198
199    #[error("Policy not found: {id}")]
200    PolicyNotFound { id: PolicyId },
201
202    #[error("Policy evaluation failed: {0}")]
203    EvaluationFailed(String),
204
205    #[error("Policy compilation failed: {0}")]
206    CompilationFailed(String),
207
208    #[error("Policy conflict detected: {0}")]
209    Conflict(String),
210
211    #[error("Policy engine unavailable: {0}")]
212    EngineUnavailable(String),
213
214    #[error("Invalid policy: {reason}")]
215    InvalidPolicy { reason: Box<str> },
216}
217
218/// Sandbox orchestration errors
219#[derive(Error, Debug, Clone)]
220pub enum SandboxError {
221    #[error("Sandbox creation failed: {0}")]
222    CreationFailed(String),
223
224    #[error("Sandbox execution failed: {0}")]
225    ExecutionFailed(String),
226
227    #[error("Sandbox not found: {0}")]
228    NotFound(String),
229
230    #[error("Sandbox not found: {id}")]
231    SandboxNotFound { id: String },
232
233    #[error("Snapshot not found: {id}")]
234    SnapshotNotFound { id: String },
235
236    #[error("Sandbox termination failed: {0}")]
237    TerminationFailed(String),
238
239    #[error("Sandbox monitoring failed: {0}")]
240    MonitoringFailed(String),
241
242    #[error("Unsupported security tier: {0}")]
243    UnsupportedTier(String),
244}
245
246/// Scheduler errors
247#[derive(Error, Debug, Clone)]
248pub enum SchedulerError {
249    #[error("Agent scheduling failed for {agent_id}: {reason}")]
250    SchedulingFailed { agent_id: AgentId, reason: Box<str> },
251
252    #[error("Agent not found: {agent_id}")]
253    AgentNotFound { agent_id: AgentId },
254
255    #[error("Scheduler overloaded: {0}")]
256    Overloaded(Box<str>),
257
258    #[error("Invalid priority: {0}")]
259    InvalidPriority(String),
260
261    #[error("Scheduler shutdown in progress")]
262    ShuttingDown,
263
264    #[error("Serialization failed: {0}")]
265    SerializationFailed(String),
266}
267
268impl From<serde_json::Error> for SchedulerError {
269    fn from(error: serde_json::Error) -> Self {
270        SchedulerError::SerializationFailed(format!("JSON serialization error: {}", error))
271    }
272}
273
274/// Lifecycle management errors
275#[derive(Error, Debug, Clone)]
276pub enum LifecycleError {
277    #[error("Agent initialization failed for {agent_id}: {reason}")]
278    InitializationFailed { agent_id: AgentId, reason: String },
279
280    #[error("Agent execution failed for {agent_id}: {reason}")]
281    ExecutionFailed { agent_id: AgentId, reason: String },
282
283    #[error("Agent termination failed for {agent_id}: {reason}")]
284    TerminationFailed { agent_id: AgentId, reason: String },
285
286    #[error("Invalid state transition from {from} to {to}")]
287    InvalidStateTransition { from: String, to: String },
288
289    #[error("DSL parsing failed: {0}")]
290    DslParsingFailed(String),
291
292    #[error("Agent not found: {agent_id}")]
293    AgentNotFound { agent_id: AgentId },
294
295    #[error("Event processing failed: {reason}")]
296    EventProcessingFailed { reason: String },
297
298    #[error("System is shutting down")]
299    ShuttingDown,
300
301    #[error("Resource exhausted: {reason}")]
302    ResourceExhausted { reason: String },
303}
304
305/// Audit trail errors
306#[derive(Error, Debug, Clone)]
307pub enum AuditError {
308    #[error("Audit logging failed: {0}")]
309    LoggingFailed(String),
310
311    #[error("Audit verification failed: {0}")]
312    VerificationFailed(String),
313
314    #[error("Audit query failed: {0}")]
315    QueryFailed(String),
316
317    #[error("Audit storage full: {0}")]
318    StorageFull(String),
319
320    #[error("Audit trail corrupted: {0}")]
321    Corrupted(String),
322
323    #[error("Record not found: {id}")]
324    RecordNotFound { id: String },
325
326    #[error("Export failed: {reason}")]
327    ExportFailed { reason: String },
328
329    #[error("Unsupported format: {format}")]
330    UnsupportedFormat { format: String },
331}
332
333/// Error handler errors
334#[derive(Error, Debug, Clone)]
335pub enum ErrorHandlerError {
336    #[error("Configuration error: {reason}")]
337    ConfigurationError { reason: String },
338
339    #[error("Event processing failed: {reason}")]
340    EventProcessingFailed { reason: String },
341
342    #[error("Error handler is shutting down")]
343    ShuttingDown,
344}
345
346/// Recovery strategies for different types of errors
347#[derive(Debug, Clone)]
348pub enum RecoveryStrategy {
349    /// Retry the operation with exponential backoff
350    Retry {
351        max_attempts: u32,
352        backoff: Duration,
353    },
354    /// Restart the agent, optionally preserving state
355    Restart { preserve_state: bool },
356    /// Failover to a backup agent
357    Failover { backup_agent: Option<AgentId> },
358    /// Terminate the agent with cleanup
359    Terminate { cleanup: bool },
360    /// Manual intervention required
361    Manual { reason: String },
362    /// No recovery possible
363    None,
364}
365
366impl Default for RecoveryStrategy {
367    fn default() -> Self {
368        RecoveryStrategy::Retry {
369            max_attempts: 3,
370            backoff: Duration::from_secs(1),
371        }
372    }
373}
374
375/// Error recovery configuration
376#[derive(Debug, Clone)]
377pub struct ErrorRecoveryConfig {
378    pub default_strategy: RecoveryStrategy,
379    pub max_recovery_attempts: u32,
380    pub recovery_timeout: Duration,
381    pub escalation_threshold: u32,
382}
383
384impl Default for ErrorRecoveryConfig {
385    fn default() -> Self {
386        Self {
387            default_strategy: RecoveryStrategy::default(),
388            max_recovery_attempts: 5,
389            recovery_timeout: Duration::from_secs(300), // 5 minutes
390            escalation_threshold: 10,
391        }
392    }
393}
394
395/// Error context for better debugging and recovery
396#[derive(Debug, Clone)]
397pub struct ErrorContext {
398    pub agent_id: Option<AgentId>,
399    pub operation: String,
400    pub timestamp: std::time::SystemTime,
401    pub recovery_attempts: u32,
402    pub additional_info: std::collections::HashMap<String, String>,
403}
404
405impl ErrorContext {
406    pub fn new(operation: String) -> Self {
407        Self {
408            agent_id: None,
409            operation,
410            timestamp: std::time::SystemTime::now(),
411            recovery_attempts: 0,
412            additional_info: std::collections::HashMap::new(),
413        }
414    }
415
416    pub fn with_agent(mut self, agent_id: AgentId) -> Self {
417        self.agent_id = Some(agent_id);
418        self
419    }
420
421    pub fn with_info(mut self, key: String, value: String) -> Self {
422        self.additional_info.insert(key, value);
423        self
424    }
425
426    pub fn increment_attempts(&mut self) {
427        self.recovery_attempts += 1;
428    }
429}
430
431/// Result type with error context
432pub type RuntimeResult<T> = Result<T, RuntimeError>;
433
434/// Trait for error recovery
435pub trait ErrorRecovery {
436    fn get_recovery_strategy(&self, error: &RuntimeError) -> RecoveryStrategy;
437    fn should_retry(&self, error: &RuntimeError, attempts: u32) -> bool;
438    fn escalate_error(&self, error: &RuntimeError, context: &ErrorContext);
439}