actr_runtime/
error.rs

1//! Runtime layer error definition
2//!
3//! Error handling follows microservices patterns rather than classic Actor supervision:
4//! - Application errors: Propagated as RPC responses (handled by caller)
5//! - Framework errors: Classified as Transient/Permanent/Poison
6//!
7//! See docs/3.11-production-readiness.zh.md for complete error handling strategy.
8
9use thiserror::Error;
10
/// Errors surfaced by the runtime layer, organised along gRPC status lines.
///
/// # Error Classification
///
/// - **Transient**: short-lived failures that are safe to retry
///   (UNAVAILABLE, DEADLINE_EXCEEDED)
/// - **Permanent**: the request or the system state must be fixed first;
///   do NOT retry (NOT_FOUND, INVALID_ARGUMENT)
/// - **Poison**: corrupted messages that need manual intervention
///   (decode failures)
///
/// # Design Philosophy
///
/// Classic Actor systems (Erlang/Akka) rely on Supervision trees. Actor-RTC
/// instead treats each Actr as a microservice unit:
/// - Retry logic is owned by the caller, not by the framework
/// - Errors are propagated explicitly instead of being hidden behind restarts
/// - Poison messages are routed to a Dead Letter Queue
#[derive(Debug, Error)]
pub enum RuntimeError {
    // ---------- Transient errors (retryable) ----------
    /// The service cannot be reached right now (gRPC UNAVAILABLE).
    ///
    /// **Transient**: connection lost, peer overloaded, temporary resource
    /// exhaustion.
    /// **Caller should**: retry with exponential backoff.
    #[error("Service unavailable: {message}")]
    Unavailable {
        /// Human-readable description of the failure.
        message: String,
        /// Actor the request was aimed at, when known.
        target: Option<actr_protocol::ActrId>,
    },

    /// The request did not complete in time (gRPC DEADLINE_EXCEEDED).
    ///
    /// **Transient**: network delay, slow peer response.
    /// **Caller should**: retry with a longer timeout, or give up.
    #[error("Deadline exceeded: {message}")]
    DeadlineExceeded {
        /// Human-readable description of the failure.
        message: String,
        /// Timeout associated with the request, in milliseconds.
        timeout_ms: u64,
    },

    // ---------- Permanent errors (not retryable) ----------
    /// The target Actor does not exist (gRPC NOT_FOUND).
    ///
    /// **Permanent**: Actor is not registered in the signaling server.
    /// **Caller should**: NOT retry; run service discovery first.
    #[error("Actor not found: {actor_id:?}")]
    NotFound {
        /// Identifier of the Actor that could not be located.
        actor_id: actr_protocol::ActrId,
        /// Additional detail (not part of the `Display` output).
        message: String,
    },

    /// The request carried an invalid argument (gRPC INVALID_ARGUMENT).
    ///
    /// **Permanent**: malformed request, validation failure.
    /// **Caller should**: NOT retry; fix the request.
    #[error("Invalid argument: {0}")]
    InvalidArgument(String),

    /// A precondition for the operation does not hold
    /// (gRPC FAILED_PRECONDITION).
    ///
    /// **Permanent**: system state is incompatible with the operation.
    /// **Caller should**: NOT retry; fix the system state first.
    #[error("Failed precondition: {0}")]
    FailedPrecondition(String),

    /// The caller is not allowed to do this (gRPC PERMISSION_DENIED).
    ///
    /// **Permanent**: ACL check failed.
    /// **Caller should**: NOT retry; check authorization.
    #[error("Permission denied: {0}")]
    PermissionDenied(String),

    // ---------- Poison message errors (DLQ) ----------
    /// A Protobuf payload could not be decoded.
    ///
    /// **Poison**: the message is corrupted and cannot be processed.
    /// **Framework**: move it to the Dead Letter Queue and log the raw bytes.
    #[error("Protobuf decode failed: {message}")]
    DecodeFailure {
        /// Human-readable description of the decode failure.
        message: String,
        /// Undecodable payload, kept for manual analysis when available.
        raw_bytes: Option<Vec<u8>>,
    },

    // ---------- Internal errors ----------
    /// Something went wrong inside the framework (gRPC INTERNAL).
    ///
    /// **Severity**: High - points at a framework bug or a panic.
    /// **Framework**: log the stack trace and capture the panic info.
    #[error("Internal error: {message}")]
    Internal {
        /// Human-readable description of the failure.
        message: String,
        /// Panic details when the error originated from a handler panic.
        panic_info: Option<String>,
    },

    /// A mailbox operation failed.
    ///
    /// **Severity**: Critical - SQLite database issue.
    /// **Framework**: trigger an alert; manual intervention may be needed.
    #[error("Mailbox error: {0}")]
    MailboxError(String),

    // ---------- Legacy errors (to be migrated) ----------
    /// Invalid or missing configuration.
    #[error("Configuration error: {0}")]
    ConfigurationError(String),

    /// Failure while initializing the runtime.
    #[error("Initialization error: {0}")]
    InitializationError(String),

    /// Failure while shutting the runtime down.
    #[error("Shutdown error: {0}")]
    ShutdownError(String),

    /// Wrapped standard-library I/O error.
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// Wrapped `serde_json` error.
    #[error("JSON error: {0}")]
    JsonError(#[from] serde_json::Error),

    /// Wrapped protocol-layer error.
    #[error("Protocol error: {0}")]
    ProtocolError(#[from] actr_protocol::ProtocolError),

    /// Any other error, carried as an `anyhow::Error`.
    #[error("Other error: {0}")]
    Other(#[from] anyhow::Error),
}
139
140impl From<crate::transport::error::NetworkError> for RuntimeError {
141    fn from(err: crate::transport::error::NetworkError) -> Self {
142        // Map network errors to appropriate RuntimeError variants
143        use crate::transport::error::NetworkError;
144        match err {
145            // Transient errors
146            NetworkError::ConnectionError(_)
147            | NetworkError::SignalingError(_)
148            | NetworkError::WebRtcError(_)
149            | NetworkError::NetworkUnreachableError(_)
150            | NetworkError::ResourceExhaustedError(_)
151            | NetworkError::NatTraversalError(_)
152            | NetworkError::IceError(_)
153            | NetworkError::WebSocketError(_) => RuntimeError::Unavailable {
154                message: err.to_string(),
155                target: None,
156            },
157
158            // Timeout errors
159            NetworkError::TimeoutError(_) => RuntimeError::DeadlineExceeded {
160                message: err.to_string(),
161                timeout_ms: 0,
162            },
163
164            // Not found errors
165            NetworkError::ConnectionNotFound(_)
166            | NetworkError::ChannelNotFound(_)
167            | NetworkError::NoRoute(_) => RuntimeError::NotFound {
168                actor_id: actr_protocol::ActrId::default(),
169                message: err.to_string(),
170            },
171
172            // Invalid argument errors
173            NetworkError::InvalidArgument(_) | NetworkError::InvalidOperation(_) => {
174                RuntimeError::InvalidArgument(err.to_string())
175            }
176
177            // Permanent configuration errors
178            NetworkError::ConfigurationError(_) => {
179                RuntimeError::ConfigurationError(err.to_string())
180            }
181
182            // Permission errors
183            NetworkError::AuthenticationError(_) | NetworkError::PermissionError(_) => {
184                RuntimeError::PermissionDenied(err.to_string())
185            }
186
187            // Decode/encode failures → poison messages
188            NetworkError::DeserializationError(msg) => RuntimeError::DecodeFailure {
189                message: msg,
190                raw_bytes: None,
191            },
192
193            // Other errors
194            NetworkError::ProtocolError(_)
195            | NetworkError::SerializationError(_)
196            | NetworkError::DataChannelError(_)
197            | NetworkError::BroadcastError(_)
198            | NetworkError::DtlsError(_)
199            | NetworkError::StunTurnError(_)
200            | NetworkError::ServiceDiscoveryError(_)
201            | NetworkError::NotImplemented(_)
202            | NetworkError::ChannelClosed(_)
203            | NetworkError::ConnectionClosed(_)
204            | NetworkError::SendError(_)
205            | NetworkError::IoError(_)
206            | NetworkError::UrlParseError(_)
207            | NetworkError::JsonError(_)
208            | NetworkError::Timeout(_)
209            | NetworkError::Other(_) => RuntimeError::Other(anyhow::anyhow!("{err}")),
210        }
211    }
212}
213
214impl RuntimeError {
215    /// Error classification for retry decision
216    ///
217    /// Follows gRPC status code semantics:
218    /// - Transient: Safe to retry (UNAVAILABLE, DEADLINE_EXCEEDED)
219    /// - Permanent: Do NOT retry (NOT_FOUND, INVALID_ARGUMENT, etc.)
220    /// - Poison: Needs manual intervention (DecodeFailure)
221    pub fn classification(&self) -> ErrorClassification {
222        match self {
223            // Transient errors
224            RuntimeError::Unavailable { .. } | RuntimeError::DeadlineExceeded { .. } => {
225                ErrorClassification::Transient
226            }
227
228            // Permanent errors
229            RuntimeError::NotFound { .. }
230            | RuntimeError::InvalidArgument(_)
231            | RuntimeError::FailedPrecondition(_)
232            | RuntimeError::PermissionDenied(_)
233            | RuntimeError::ConfigurationError(_)
234            | RuntimeError::InitializationError(_) => ErrorClassification::Permanent,
235
236            // Poison messages
237            RuntimeError::DecodeFailure { .. } => ErrorClassification::Poison,
238
239            // Internal errors (may be transient or permanent, depends on context)
240            RuntimeError::Internal { .. } | RuntimeError::MailboxError(_) => {
241                ErrorClassification::Internal
242            }
243
244            // Legacy errors - default to permanent
245            RuntimeError::ShutdownError(_)
246            | RuntimeError::IoError(_)
247            | RuntimeError::JsonError(_)
248            | RuntimeError::ProtocolError(_)
249            | RuntimeError::Other(_) => ErrorClassification::Permanent,
250        }
251    }
252
253    /// Check if error is retryable (Transient classification)
254    ///
255    /// Caller should use exponential backoff for retry.
256    pub fn is_retryable(&self) -> bool {
257        matches!(
258            self.classification(),
259            ErrorClassification::Transient | ErrorClassification::Internal
260        )
261    }
262
263    /// Check if error requires Dead Letter Queue
264    ///
265    /// Poison messages cannot be processed and need manual intervention.
266    pub fn requires_dlq(&self) -> bool {
267        matches!(self.classification(), ErrorClassification::Poison)
268    }
269
270    /// Get gRPC-style status code name
271    ///
272    /// For logging and metrics (compatible with gRPC status codes).
273    pub fn status_code(&self) -> &'static str {
274        match self {
275            RuntimeError::Unavailable { .. } => "UNAVAILABLE",
276            RuntimeError::DeadlineExceeded { .. } => "DEADLINE_EXCEEDED",
277            RuntimeError::NotFound { .. } => "NOT_FOUND",
278            RuntimeError::InvalidArgument(_) => "INVALID_ARGUMENT",
279            RuntimeError::FailedPrecondition(_) => "FAILED_PRECONDITION",
280            RuntimeError::PermissionDenied(_) => "PERMISSION_DENIED",
281            RuntimeError::DecodeFailure { .. } => "DATA_LOSS",
282            RuntimeError::Internal { .. } => "INTERNAL",
283            RuntimeError::MailboxError(_) => "INTERNAL",
284            RuntimeError::ConfigurationError(_) => "FAILED_PRECONDITION",
285            RuntimeError::InitializationError(_) => "FAILED_PRECONDITION",
286            RuntimeError::ShutdownError(_) => "UNAVAILABLE",
287            RuntimeError::IoError(_) => "INTERNAL",
288            RuntimeError::JsonError(_) => "INTERNAL",
289            RuntimeError::ProtocolError(_) => "INTERNAL",
290            RuntimeError::Other(_) => "UNKNOWN",
291        }
292    }
293
294    /// Get error severity (1-10, 10 is most critical)
295    ///
296    /// Used for alerting thresholds and monitoring.
297    pub fn severity(&self) -> u8 {
298        match self {
299            // Critical: System cannot function
300            RuntimeError::ConfigurationError(_) | RuntimeError::InitializationError(_) => 10,
301
302            // High: Data loss or corruption
303            RuntimeError::MailboxError(_) | RuntimeError::DecodeFailure { .. } => 9,
304
305            // Medium-High: Internal errors, may indicate bugs
306            RuntimeError::Internal { .. } => 8,
307
308            // Medium: Access control
309            RuntimeError::PermissionDenied(_) => 7,
310
311            // Medium-Low: Client errors
312            RuntimeError::NotFound { .. }
313            | RuntimeError::InvalidArgument(_)
314            | RuntimeError::FailedPrecondition(_) => 5,
315
316            // Low: Transient failures
317            RuntimeError::Unavailable { .. } | RuntimeError::DeadlineExceeded { .. } => 3,
318
319            // Very Low: Expected errors
320            RuntimeError::ShutdownError(_) => 2,
321
322            // Minimal: Infrastructure
323            RuntimeError::IoError(_) | RuntimeError::JsonError(_) => 1,
324
325            // Unknown
326            RuntimeError::ProtocolError(_) | RuntimeError::Other(_) => 4,
327        }
328    }
329
330    /// Check if error requires system shutdown
331    ///
332    /// Only fatal configuration/initialization errors should shutdown.
333    pub fn requires_system_shutdown(&self) -> bool {
334        matches!(
335            self,
336            RuntimeError::ConfigurationError(_) | RuntimeError::InitializationError(_)
337        )
338    }
339
340    /// Get error category for metrics
341    ///
342    /// Used in Prometheus labels: `errors_total{category="unavailable"}`
343    pub fn category(&self) -> &'static str {
344        match self {
345            RuntimeError::Unavailable { .. } => "unavailable",
346            RuntimeError::DeadlineExceeded { .. } => "timeout",
347            RuntimeError::NotFound { .. } => "not_found",
348            RuntimeError::InvalidArgument(_) => "invalid_argument",
349            RuntimeError::FailedPrecondition(_) => "failed_precondition",
350            RuntimeError::PermissionDenied(_) => "permission_denied",
351            RuntimeError::DecodeFailure { .. } => "decode_failure",
352            RuntimeError::Internal { .. } => "internal",
353            RuntimeError::MailboxError(_) => "mailbox",
354            RuntimeError::ConfigurationError(_) => "configuration",
355            RuntimeError::InitializationError(_) => "initialization",
356            RuntimeError::ShutdownError(_) => "shutdown",
357            RuntimeError::IoError(_) => "io",
358            RuntimeError::JsonError(_) => "json",
359            RuntimeError::ProtocolError(_) => "protocol",
360            RuntimeError::Other(_) => "other",
361        }
362    }
363}
364
/// Coarse error class used to drive retry decisions.
///
/// The type is `Copy`, comparable and hashable, so it can be used directly as
/// a key in maps and sets (for example, per-class counters).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ErrorClassification {
    /// Transient: temporary failure, safe to retry.
    Transient,
    /// Permanent: requires a system state fix, do NOT retry.
    Permanent,
    /// Poison: corrupted message, requires manual intervention (DLQ).
    Poison,
    /// Internal: framework error, may be transient or permanent.
    Internal,
}
377
378/// Runtime result type
379pub type RuntimeResult<T> = Result<T, RuntimeError>;