actr_runtime/
error.rs

//! Runtime layer error definitions
//!
//! Error handling follows microservice patterns rather than classic Actor supervision:
//! - Application errors: propagated as RPC responses (handled by the caller)
//! - Framework errors: classified as Transient/Permanent/Poison
//!
//! See docs/3.11-production-readiness.zh.md for the complete error handling strategy.
8
9use thiserror::Error;
10
/// Errors surfaced by the runtime layer, organised along gRPC-style status lines.
///
/// # Error Classification
///
/// - **Transient**: short-lived failures the caller may retry
///   (UNAVAILABLE, DEADLINE_EXCEEDED)
/// - **Permanent**: retrying does not help until the request or the system state
///   is fixed (NOT_FOUND, INVALID_ARGUMENT)
/// - **Poison**: corrupted messages that need manual intervention (decode failures)
///
/// # Design Philosophy
///
/// Classic Actor systems (Erlang/Akka) rely on supervision trees. Actor-RTC instead
/// treats every Actr as a microservice unit:
/// - Retry logic belongs to the caller, not to the framework
/// - Errors are propagated explicitly instead of being hidden by transparent restarts
/// - Poison messages are routed to a Dead Letter Queue
#[derive(Debug, Error)]
pub enum RuntimeError {
    // ---------- Transient errors (retryable) ----------
    /// The service cannot be reached right now (gRPC UNAVAILABLE).
    ///
    /// **Transient**: lost connection, overloaded peer, temporary resource exhaustion.
    /// **Caller should**: retry with exponential backoff.
    #[error("Service unavailable: {message}")]
    Unavailable {
        /// Human-readable description of the failure.
        message: String,
        /// Actor the request was addressed to, when known.
        target: Option<actr_protocol::ActrId>,
    },

    /// The request did not complete within its deadline (gRPC DEADLINE_EXCEEDED).
    ///
    /// **Transient**: network delay or a slow peer.
    /// **Caller should**: retry with a longer timeout, or give up.
    #[error("Deadline exceeded: {message}")]
    DeadlineExceeded {
        /// Human-readable description of the failure.
        message: String,
        /// Timeout value in milliseconds associated with this error.
        timeout_ms: u64,
    },

    // ---------- Permanent errors (not retryable) ----------
    /// The target Actor could not be found (gRPC NOT_FOUND).
    ///
    /// **Permanent**: the Actor is not registered with the signaling server.
    /// **Caller should**: NOT retry; run service discovery first.
    ///
    /// Note that only `actor_id` appears in the `Display` output; `message` is
    /// carried as data for callers that inspect the variant.
    #[error("Actor not found: {actor_id:?}")]
    NotFound {
        /// Identifier of the Actor that was looked up.
        actor_id: actr_protocol::ActrId,
        /// Additional detail about the lookup failure.
        message: String,
    },

    /// The request carried an invalid argument (gRPC INVALID_ARGUMENT).
    ///
    /// **Permanent**: malformed request or validation failure.
    /// **Caller should**: NOT retry; fix the request.
    #[error("Invalid argument: {0}")]
    InvalidArgument(String),

    /// A precondition of the operation does not hold (gRPC FAILED_PRECONDITION).
    ///
    /// **Permanent**: the system state is incompatible with the operation.
    /// **Caller should**: NOT retry; fix the system state first.
    #[error("Failed precondition: {0}")]
    FailedPrecondition(String),

    /// The caller is not allowed to perform the operation (gRPC PERMISSION_DENIED).
    ///
    /// **Permanent**: the ACL check failed.
    /// **Caller should**: NOT retry; check authorization.
    #[error("Permission denied: {0}")]
    PermissionDenied(String),

    // ---------- Poison message errors (DLQ) ----------
    /// A Protobuf payload could not be decoded.
    ///
    /// **Poison**: the message is corrupted and cannot be processed.
    /// **Framework**: move it to the Dead Letter Queue and log the raw bytes.
    #[error("Protobuf decode failed: {message}")]
    DecodeFailure {
        /// Human-readable description of the decode failure.
        message: String,
        /// Raw payload kept for manual analysis, when available.
        raw_bytes: Option<Vec<u8>>,
    },

    // ---------- Internal errors ----------
    /// An error inside the framework itself (gRPC INTERNAL).
    ///
    /// **Severity**: high - points at a framework bug or a panic.
    /// **Framework**: log the stack trace and capture the panic info.
    #[error("Internal error: {message}")]
    Internal {
        /// Human-readable description of the failure.
        message: String,
        /// Panic details when the error was caused by a handler panic.
        panic_info: Option<String>,
    },

    /// A mailbox operation failed.
    ///
    /// **Severity**: critical - SQLite database issue.
    /// **Framework**: trigger an alert; manual intervention may be required.
    #[error("Mailbox error: {0}")]
    MailboxError(String),

    // ---------- Legacy errors (to be migrated) ----------
    /// Configuration error.
    #[error("Configuration error: {0}")]
    ConfigurationError(String),

    /// Initialization error.
    #[error("Initialization error: {0}")]
    InitializationError(String),

    /// Shutdown error.
    #[error("Shutdown error: {0}")]
    ShutdownError(String),

    /// I/O error, converted automatically from [`std::io::Error`].
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// JSON error, converted automatically from `serde_json::Error`.
    #[error("JSON error: {0}")]
    JsonError(#[from] serde_json::Error),

    /// Protocol error, converted automatically from `actr_protocol::ProtocolError`.
    #[error("Protocol error: {0}")]
    ProtocolError(#[from] actr_protocol::ProtocolError),

    /// Any other error, converted automatically from `anyhow::Error`.
    #[error("Other error: {0}")]
    Other(#[from] anyhow::Error),
}
139
140impl From<crate::transport::error::NetworkError> for RuntimeError {
141    fn from(err: crate::transport::error::NetworkError) -> Self {
142        // Map network errors to appropriate RuntimeError variants
143        use crate::transport::error::NetworkError;
144        match err {
145            // Transient errors
146            NetworkError::ConnectionError(_)
147            | NetworkError::SignalingError(_)
148            | NetworkError::WebRtcError(_)
149            | NetworkError::NetworkUnreachableError(_)
150            | NetworkError::ResourceExhaustedError(_)
151            | NetworkError::NatTraversalError(_)
152            | NetworkError::IceError(_)
153            | NetworkError::WebSocketError(_) => RuntimeError::Unavailable {
154                message: err.to_string(),
155                target: None,
156            },
157
158            // Timeout errors
159            NetworkError::TimeoutError(_) => RuntimeError::DeadlineExceeded {
160                message: err.to_string(),
161                timeout_ms: 0,
162            },
163
164            // Not found errors
165            NetworkError::ConnectionNotFound(_)
166            | NetworkError::ChannelNotFound(_)
167            | NetworkError::NoRoute(_) => RuntimeError::NotFound {
168                actor_id: actr_protocol::ActrId::default(),
169                message: err.to_string(),
170            },
171
172            // Invalid argument errors
173            NetworkError::InvalidArgument(_) | NetworkError::InvalidOperation(_) => {
174                RuntimeError::InvalidArgument(err.to_string())
175            }
176
177            // Permanent configuration errors
178            NetworkError::ConfigurationError(_) => {
179                RuntimeError::ConfigurationError(err.to_string())
180            }
181
182            // Permission errors
183            NetworkError::AuthenticationError(_) | NetworkError::PermissionError(_) => {
184                RuntimeError::PermissionDenied(err.to_string())
185            }
186
187            // Decode/encode failures → poison messages
188            NetworkError::DeserializationError(msg) => RuntimeError::DecodeFailure {
189                message: msg,
190                raw_bytes: None,
191            },
192
193            // Other errors
194            NetworkError::ProtocolError(_)
195            | NetworkError::SerializationError(_)
196            | NetworkError::DataChannelError(_)
197            | NetworkError::BroadcastError(_)
198            | NetworkError::DtlsError(_)
199            | NetworkError::StunTurnError(_)
200            | NetworkError::ServiceDiscoveryError(_)
201            | NetworkError::NotImplemented(_)
202            | NetworkError::ChannelClosed(_)
203            | NetworkError::SendError(_)
204            | NetworkError::IoError(_)
205            | NetworkError::UrlParseError(_)
206            | NetworkError::JsonError(_)
207            | NetworkError::Other(_) => RuntimeError::Other(anyhow::anyhow!("{err}")),
208        }
209    }
210}
211
212impl RuntimeError {
213    /// Error classification for retry decision
214    ///
215    /// Follows gRPC status code semantics:
216    /// - Transient: Safe to retry (UNAVAILABLE, DEADLINE_EXCEEDED)
217    /// - Permanent: Do NOT retry (NOT_FOUND, INVALID_ARGUMENT, etc.)
218    /// - Poison: Needs manual intervention (DecodeFailure)
219    pub fn classification(&self) -> ErrorClassification {
220        match self {
221            // Transient errors
222            RuntimeError::Unavailable { .. } | RuntimeError::DeadlineExceeded { .. } => {
223                ErrorClassification::Transient
224            }
225
226            // Permanent errors
227            RuntimeError::NotFound { .. }
228            | RuntimeError::InvalidArgument(_)
229            | RuntimeError::FailedPrecondition(_)
230            | RuntimeError::PermissionDenied(_)
231            | RuntimeError::ConfigurationError(_)
232            | RuntimeError::InitializationError(_) => ErrorClassification::Permanent,
233
234            // Poison messages
235            RuntimeError::DecodeFailure { .. } => ErrorClassification::Poison,
236
237            // Internal errors (may be transient or permanent, depends on context)
238            RuntimeError::Internal { .. } | RuntimeError::MailboxError(_) => {
239                ErrorClassification::Internal
240            }
241
242            // Legacy errors - default to permanent
243            RuntimeError::ShutdownError(_)
244            | RuntimeError::IoError(_)
245            | RuntimeError::JsonError(_)
246            | RuntimeError::ProtocolError(_)
247            | RuntimeError::Other(_) => ErrorClassification::Permanent,
248        }
249    }
250
251    /// Check if error is retryable (Transient classification)
252    ///
253    /// Caller should use exponential backoff for retry.
254    pub fn is_retryable(&self) -> bool {
255        matches!(
256            self.classification(),
257            ErrorClassification::Transient | ErrorClassification::Internal
258        )
259    }
260
261    /// Check if error requires Dead Letter Queue
262    ///
263    /// Poison messages cannot be processed and need manual intervention.
264    pub fn requires_dlq(&self) -> bool {
265        matches!(self.classification(), ErrorClassification::Poison)
266    }
267
268    /// Get gRPC-style status code name
269    ///
270    /// For logging and metrics (compatible with gRPC status codes).
271    pub fn status_code(&self) -> &'static str {
272        match self {
273            RuntimeError::Unavailable { .. } => "UNAVAILABLE",
274            RuntimeError::DeadlineExceeded { .. } => "DEADLINE_EXCEEDED",
275            RuntimeError::NotFound { .. } => "NOT_FOUND",
276            RuntimeError::InvalidArgument(_) => "INVALID_ARGUMENT",
277            RuntimeError::FailedPrecondition(_) => "FAILED_PRECONDITION",
278            RuntimeError::PermissionDenied(_) => "PERMISSION_DENIED",
279            RuntimeError::DecodeFailure { .. } => "DATA_LOSS",
280            RuntimeError::Internal { .. } => "INTERNAL",
281            RuntimeError::MailboxError(_) => "INTERNAL",
282            RuntimeError::ConfigurationError(_) => "FAILED_PRECONDITION",
283            RuntimeError::InitializationError(_) => "FAILED_PRECONDITION",
284            RuntimeError::ShutdownError(_) => "UNAVAILABLE",
285            RuntimeError::IoError(_) => "INTERNAL",
286            RuntimeError::JsonError(_) => "INTERNAL",
287            RuntimeError::ProtocolError(_) => "INTERNAL",
288            RuntimeError::Other(_) => "UNKNOWN",
289        }
290    }
291
292    /// Get error severity (1-10, 10 is most critical)
293    ///
294    /// Used for alerting thresholds and monitoring.
295    pub fn severity(&self) -> u8 {
296        match self {
297            // Critical: System cannot function
298            RuntimeError::ConfigurationError(_) | RuntimeError::InitializationError(_) => 10,
299
300            // High: Data loss or corruption
301            RuntimeError::MailboxError(_) | RuntimeError::DecodeFailure { .. } => 9,
302
303            // Medium-High: Internal errors, may indicate bugs
304            RuntimeError::Internal { .. } => 8,
305
306            // Medium: Access control
307            RuntimeError::PermissionDenied(_) => 7,
308
309            // Medium-Low: Client errors
310            RuntimeError::NotFound { .. }
311            | RuntimeError::InvalidArgument(_)
312            | RuntimeError::FailedPrecondition(_) => 5,
313
314            // Low: Transient failures
315            RuntimeError::Unavailable { .. } | RuntimeError::DeadlineExceeded { .. } => 3,
316
317            // Very Low: Expected errors
318            RuntimeError::ShutdownError(_) => 2,
319
320            // Minimal: Infrastructure
321            RuntimeError::IoError(_) | RuntimeError::JsonError(_) => 1,
322
323            // Unknown
324            RuntimeError::ProtocolError(_) | RuntimeError::Other(_) => 4,
325        }
326    }
327
328    /// Check if error requires system shutdown
329    ///
330    /// Only fatal configuration/initialization errors should shutdown.
331    pub fn requires_system_shutdown(&self) -> bool {
332        matches!(
333            self,
334            RuntimeError::ConfigurationError(_) | RuntimeError::InitializationError(_)
335        )
336    }
337
338    /// Get error category for metrics
339    ///
340    /// Used in Prometheus labels: `errors_total{category="unavailable"}`
341    pub fn category(&self) -> &'static str {
342        match self {
343            RuntimeError::Unavailable { .. } => "unavailable",
344            RuntimeError::DeadlineExceeded { .. } => "timeout",
345            RuntimeError::NotFound { .. } => "not_found",
346            RuntimeError::InvalidArgument(_) => "invalid_argument",
347            RuntimeError::FailedPrecondition(_) => "failed_precondition",
348            RuntimeError::PermissionDenied(_) => "permission_denied",
349            RuntimeError::DecodeFailure { .. } => "decode_failure",
350            RuntimeError::Internal { .. } => "internal",
351            RuntimeError::MailboxError(_) => "mailbox",
352            RuntimeError::ConfigurationError(_) => "configuration",
353            RuntimeError::InitializationError(_) => "initialization",
354            RuntimeError::ShutdownError(_) => "shutdown",
355            RuntimeError::IoError(_) => "io",
356            RuntimeError::JsonError(_) => "json",
357            RuntimeError::ProtocolError(_) => "protocol",
358            RuntimeError::Other(_) => "other",
359        }
360    }
361}
362
/// Coarse error class that drives retry and routing decisions.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ErrorClassification {
    /// Temporary failure; retrying is safe.
    Transient,
    /// Retrying will not help; the system state must be fixed first.
    Permanent,
    /// Corrupted message; needs manual intervention (Dead Letter Queue).
    Poison,
    /// Framework error; may turn out to be transient or permanent.
    Internal,
}
375
376/// Runtime result type
377pub type RuntimeResult<T> = Result<T, RuntimeError>;