Skip to main content

harn_vm/value/
error.rs

1use harn_lexer::Span;
2
3use super::VmValue;
4
/// Describes how many arguments a callable is willing to accept.
///
/// Carried by [`VmError::ArityMismatch`] so the rendered error can spell out
/// the exact signature contract that the call site broke.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArityExpect {
    /// Precisely N parameters: no defaults and no rest parameter.
    Exact(usize),
    /// Anywhere in `min..=max`: some parameters carry defaults, but the
    /// upper bound is still fixed.
    Range { min: usize, max: usize },
    /// N or more parameters; surplus arguments are collected into a rest
    /// list. Used for `print` / `log` / variadics.
    AtLeast(usize),
}

impl std::fmt::Display for ArityExpect {
    /// Renders the bound the way arity errors quote it: `2`, `2..=3`, or
    /// `at least 2`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The enum is `Copy`, so matching on the value binds plain `usize`s.
        match *self {
            Self::AtLeast(n) => write!(f, "at least {n}"),
            Self::Range { min, max } => write!(f, "{min}..={max}"),
            Self::Exact(n) => write!(f, "{n}"),
        }
    }
}
28
/// Payload for [`VmError::ArityMismatch`]: a call supplied a number of
/// arguments that the callee does not accept.
#[derive(Debug, Clone)]
pub struct ArityMismatchError {
    /// Name of the callee; quoted in the rendered message.
    pub callee: String,
    /// Argument-count bound the callee accepts.
    pub expected: ArityExpect,
    /// Argument count reported as actually supplied.
    pub got: usize,
    /// Optional source span; when present the rendered message is suffixed
    /// with its byte range.
    pub span: Option<Span>,
}
36
/// Which deadlock diagnostic a [`DeadlockError`] reports. Each variant maps
/// to its own stable `HARN-ORC-*` code.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeadlockDiagnostic {
    /// Reported with code `HARN-ORC-011`.
    SelfDeadlock,
    /// Reported with code `HARN-ORC-012`.
    WaitForGraph,
}

impl DeadlockDiagnostic {
    /// Stable diagnostic code that prefixes the rendered deadlock message.
    fn code(self) -> &'static str {
        const SELF_DEADLOCK: &str = "HARN-ORC-011";
        const WAIT_FOR_GRAPH: &str = "HARN-ORC-012";

        match self {
            DeadlockDiagnostic::WaitForGraph => WAIT_FOR_GRAPH,
            DeadlockDiagnostic::SelfDeadlock => SELF_DEADLOCK,
        }
    }
}
51
52/// Payload for [`VmError::Deadlock`]. `kind` is the primitive kind
53/// (`"mutex"`, `"channel"`) or `"task"`; `key` is the primitive key or task
54/// id; `detail` names the specific footgun.
55#[derive(Debug, Clone)]
56pub struct DeadlockError {
57    pub diagnostic: DeadlockDiagnostic,
58    pub kind: String,
59    pub key: String,
60    pub detail: String,
61}
62
63impl DeadlockError {
64    pub(crate) fn self_deadlock(
65        kind: impl Into<String>,
66        key: impl Into<String>,
67        detail: impl Into<String>,
68    ) -> Self {
69        Self {
70            diagnostic: DeadlockDiagnostic::SelfDeadlock,
71            kind: kind.into(),
72            key: key.into(),
73            detail: detail.into(),
74        }
75    }
76
77    pub(crate) fn wait_for_graph(
78        kind: impl Into<String>,
79        key: impl Into<String>,
80        detail: impl Into<String>,
81    ) -> Self {
82        Self {
83            diagnostic: DeadlockDiagnostic::WaitForGraph,
84            kind: kind.into(),
85            key: key.into(),
86            detail: detail.into(),
87        }
88    }
89}
90
/// Payload for [`VmError::ArgTypeMismatch`]: an argument value did not
/// satisfy the declared parameter type.
#[derive(Debug, Clone)]
pub struct ArgTypeMismatchError {
    /// Name of the callee; quoted in the rendered message.
    pub callee: String,
    /// Name of the offending parameter.
    pub param: String,
    /// Pretty-printed type expression the parameter expects.
    pub expected: String,
    /// Runtime type name of the value that was supplied
    /// (`VmValue::type_name`).
    pub got: &'static str,
    /// Optional source span; when present the rendered message is suffixed
    /// with its byte range.
    pub span: Option<Span>,
}
99
/// Errors surfaced by the VM. `Display` renders the user-facing message for
/// each variant.
#[derive(Debug, Clone)]
pub enum VmError {
    StackUnderflow,
    /// Rendered as "too many nested calls".
    StackOverflow,
    /// Carries the name of the variable that could not be resolved.
    UndefinedVariable(String),
    /// Carries the name of the builtin that could not be resolved.
    UndefinedBuiltin(String),
    /// Carries the name of the immutable binding that was assigned to.
    ImmutableAssignment(String),
    TypeError(String),
    Runtime(String),
    DivisionByZero,
    /// A thrown value.
    Thrown(VmValue),
    /// Thrown with error category for structured error handling.
    CategorizedError {
        message: String,
        category: ErrorCategory,
    },
    /// The named daemon reached its `event_queue_capacity`.
    DaemonQueueFull {
        daemon_id: String,
        capacity: usize,
    },
    /// A deterministic, provably-unresolvable deadlock caught before the
    /// VM would block forever (Rust's borrow checker prevents data races but
    /// not deadlocks; this is the Go-runtime "all goroutines asleep" analogue
    /// for the cases we can prove). Boxed — like [`VmError::ArityMismatch`] —
    /// so the rare multi-`String` payload doesn't enlarge `VmError` on the
    /// pervasive `Result<VmValue, VmError>` hot path. The payload's
    /// [`DeadlockDiagnostic`] selects the stable code: `HARN-ORC-011` for a
    /// self-deadlock, `HARN-ORC-012` for a wait-for-graph deadlock.
    Deadlock(Box<DeadlockError>),
    /// NOTE(review): presumably a control-flow signal carrying a function's
    /// return value rather than a genuine failure — confirm against the
    /// interpreter loop.
    Return(VmValue),
    /// Carries the offending opcode byte (rendered in hex).
    InvalidInstruction(u8),
    /// Wrong number of arguments at a call site. Distinct from
    /// [`VmError::TypeError`] so the runtime can match-and-recover (and
    /// so error UX renders `expected 2..=3 got 1` consistently).
    ArityMismatch(Box<ArityMismatchError>),
    /// Argument value did not satisfy the declared parameter type.
    /// `expected` is a pretty-printed type expression; `got` is the value's
    /// runtime type name (`VmValue::type_name`). Used for both
    /// user-defined function parameters (with declared types) and
    /// registry-known builtin parameters.
    ArgTypeMismatch(Box<ArgTypeMismatchError>),
}
140
/// Error categories for structured error handling in agent orchestration.
///
/// Each category has a stable snake_case name (see [`ErrorCategory::as_str`])
/// that [`ErrorCategory::parse`] maps back to the variant.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ErrorCategory {
    /// Network/connection timeout.
    Timeout,
    /// Authentication/authorization failure.
    Auth,
    /// Rate limit exceeded (HTTP 429 / quota).
    RateLimit,
    /// Upstream provider is overloaded (HTTP 503 / 529).
    /// Distinct from `RateLimit`: the client hasn't exceeded a quota — the
    /// provider is shedding load and will recover on its own.
    Overloaded,
    /// Provider-side 5xx error (500, 502) that isn't specifically overload.
    ServerError,
    /// Network-level transient failure (connection reset, DNS hiccup,
    /// partial stream) — retryable but not provider-status-coded.
    TransientNetwork,
    /// LLM output failed schema validation. Retryable via `schema_retries`.
    SchemaValidation,
    /// LLM streaming response was aborted mid-stream because the partial
    /// JSON content could not conceivably satisfy `output_schema`. Surfaced
    /// by `llm_call` when `schema_stream_abort` is on (the default for
    /// schema-bearing calls). Consumes one `schema_retries` budget slot;
    /// the retry replays the prompt with a corrective nudge that cites
    /// the abort path + reason.
    SchemaStreamAborted,
    /// Tool execution failure.
    ToolError,
    /// Tool was rejected by the host (not permitted / not in allowlist).
    ToolRejected,
    /// Outbound network egress was blocked by policy.
    EgressBlocked,
    /// Operation was cancelled.
    Cancelled,
    /// Resource not found.
    NotFound,
    /// Circuit breaker is open.
    CircuitOpen,
    /// LLM cost or token budget would be exceeded.
    BudgetExceeded,
    /// Generic/unclassified error; also what unknown names parse to.
    Generic,
}

impl ErrorCategory {
    /// Every category that `parse` recognises by name. `Generic` is left out
    /// on purpose: it is the fallback for anything not listed here.
    const NAMED: [ErrorCategory; 15] = [
        ErrorCategory::Timeout,
        ErrorCategory::Auth,
        ErrorCategory::RateLimit,
        ErrorCategory::Overloaded,
        ErrorCategory::ServerError,
        ErrorCategory::TransientNetwork,
        ErrorCategory::SchemaValidation,
        ErrorCategory::SchemaStreamAborted,
        ErrorCategory::ToolError,
        ErrorCategory::ToolRejected,
        ErrorCategory::EgressBlocked,
        ErrorCategory::Cancelled,
        ErrorCategory::NotFound,
        ErrorCategory::CircuitOpen,
        ErrorCategory::BudgetExceeded,
    ];

    /// Stable snake_case name of the category.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Timeout => "timeout",
            Self::Auth => "auth",
            Self::RateLimit => "rate_limit",
            Self::Overloaded => "overloaded",
            Self::ServerError => "server_error",
            Self::TransientNetwork => "transient_network",
            Self::SchemaValidation => "schema_validation",
            Self::SchemaStreamAborted => "schema_stream_aborted",
            Self::ToolError => "tool_error",
            Self::ToolRejected => "tool_rejected",
            Self::EgressBlocked => "egress_blocked",
            Self::Cancelled => "cancelled",
            Self::NotFound => "not_found",
            Self::CircuitOpen => "circuit_open",
            Self::BudgetExceeded => "budget_exceeded",
            Self::Generic => "generic",
        }
    }

    /// Inverse of [`ErrorCategory::as_str`]. The comparison is exact
    /// (case-sensitive); any unrecognised name yields `Generic`.
    pub fn parse(s: &str) -> Self {
        Self::NAMED
            .iter()
            .find(|candidate| candidate.as_str() == s)
            .cloned()
            .unwrap_or(Self::Generic)
    }

    /// Whether an error of this category is worth retrying for a transient
    /// provider-side reason. Agent loops consult this to decide whether to
    /// back off and retry vs surface the error to the user.
    pub fn is_transient(&self) -> bool {
        matches!(
            *self,
            Self::Timeout
                | Self::RateLimit
                | Self::Overloaded
                | Self::ServerError
                | Self::TransientNetwork
        )
    }
}
243
244/// Create a categorized error conveniently.
245pub fn categorized_error(message: impl Into<String>, category: ErrorCategory) -> VmError {
246    VmError::CategorizedError {
247        message: message.into(),
248        category,
249    }
250}
251
252/// Extract error category from a VmError.
253///
254/// Classification priority:
255/// 1. Explicit CategorizedError variant (set by throw_error or internal code)
256/// 2. Thrown dict with a "category" field (user-created structured errors)
257/// 3. HTTP status code extraction (standard, unambiguous)
258/// 4. Deadline exceeded (VM-internal)
259/// 5. Fallback to Generic
260pub fn error_to_category(err: &VmError) -> ErrorCategory {
261    match err {
262        VmError::CategorizedError { category, .. } => category.clone(),
263        VmError::Thrown(VmValue::Dict(d)) => d
264            .get("category")
265            .map(|v| ErrorCategory::parse(&v.display()))
266            .unwrap_or(ErrorCategory::Generic),
267        VmError::Thrown(VmValue::String(s)) => classify_error_message(s),
268        VmError::Runtime(msg) => classify_error_message(msg),
269        // A deadlock is permanently non-retryable and not provider-related —
270        // `Generic` is the correct "surface it, don't back off" bucket.
271        VmError::Deadlock(_) => ErrorCategory::Generic,
272        _ => ErrorCategory::Generic,
273    }
274}
275
276/// Classify an error message using HTTP status codes and well-known patterns.
277/// Prefers unambiguous signals (status codes) over substring heuristics.
278pub fn classify_error_message(msg: &str) -> ErrorCategory {
279    // 1. HTTP status codes — most reliable signal
280    if let Some(cat) = classify_by_http_status(msg) {
281        return cat;
282    }
283    // 2. Well-known error identifiers from major APIs
284    //    (Anthropic, OpenAI, and standard HTTP patterns)
285    let lower = msg.to_lowercase();
286    if lower.contains("cancelled") || lower.contains("canceled") {
287        return ErrorCategory::Cancelled;
288    }
289    if msg.contains("Deadline exceeded") || msg.contains("context deadline exceeded") {
290        return ErrorCategory::Timeout;
291    }
292    if msg.contains("overloaded_error") {
293        // Anthropic overloaded_error surfaces as HTTP 529.
294        return ErrorCategory::Overloaded;
295    }
296    if msg.contains("api_error") {
297        // Anthropic catch-all server-side error.
298        return ErrorCategory::ServerError;
299    }
300    if msg.contains("insufficient_quota") || msg.contains("billing_hard_limit_reached") {
301        // OpenAI-specific quota error types.
302        return ErrorCategory::RateLimit;
303    }
304    if msg.contains("invalid_api_key") || msg.contains("authentication_error") {
305        return ErrorCategory::Auth;
306    }
307    if msg.contains("not_found_error") || msg.contains("model_not_found") {
308        return ErrorCategory::NotFound;
309    }
310    if msg.contains("circuit_open") {
311        return ErrorCategory::CircuitOpen;
312    }
313    // Network-level transient patterns (pre-HTTP-status, pre-provider-framing).
314    if lower.contains("connection reset")
315        || lower.contains("connection refused")
316        || lower.contains("connection closed")
317        || lower.contains("broken pipe")
318        || lower.contains("dns error")
319        || lower.contains("stream error")
320        || lower.contains("unexpected eof")
321    {
322        return ErrorCategory::TransientNetwork;
323    }
324    ErrorCategory::Generic
325}
326
327/// Classify errors by HTTP status code if one appears in the message.
328/// This is the most reliable classification method since status codes
329/// are standardized (RFC 9110) and unambiguous.
330fn classify_by_http_status(msg: &str) -> Option<ErrorCategory> {
331    // Extract 3-digit HTTP status codes from common patterns:
332    // "HTTP 429", "status 429", "429 Too Many", "error: 401"
333    for code in extract_http_status_codes(msg) {
334        return Some(match code {
335            401 | 403 => ErrorCategory::Auth,
336            404 | 410 => ErrorCategory::NotFound,
337            408 | 504 | 522 | 524 => ErrorCategory::Timeout,
338            429 => ErrorCategory::RateLimit,
339            503 | 529 => ErrorCategory::Overloaded,
340            500 | 502 => ErrorCategory::ServerError,
341            _ => continue,
342        });
343    }
344    None
345}
346
/// Collect every standalone three-digit number in `msg` that falls in the
/// HTTP error range `400..=599`, in order of appearance.
///
/// "Standalone" means the three ASCII digits are not flanked by another
/// digit on either side, so `14290` contributes nothing.
fn extract_http_status_codes(msg: &str) -> Vec<u16> {
    let raw = msg.as_bytes();
    raw.windows(3)
        .enumerate()
        // Only windows made purely of ASCII digits are candidates.
        .filter(|(_, triple)| triple.iter().all(u8::is_ascii_digit))
        // Discard windows that are a slice of a longer digit run.
        .filter(|&(at, _)| {
            let digit_before = at > 0 && raw[at - 1].is_ascii_digit();
            let digit_after = raw.get(at + 3).map_or(false, |b| b.is_ascii_digit());
            !digit_before && !digit_after
        })
        // Three decimal digits always fit in a u16 (max 999).
        .map(|(_, triple)| {
            triple
                .iter()
                .fold(0u16, |acc, digit| acc * 10 + u16::from(digit - b'0'))
        })
        .filter(|code| (400..=599).contains(code))
        .collect()
}
371
372impl std::fmt::Display for VmError {
373    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
374        match self {
375            VmError::StackUnderflow => write!(f, "Stack underflow"),
376            VmError::StackOverflow => write!(f, "Stack overflow: too many nested calls"),
377            VmError::UndefinedVariable(n) => write!(f, "Undefined variable: {n}"),
378            VmError::UndefinedBuiltin(n) => write!(f, "Undefined builtin: {n}"),
379            VmError::ImmutableAssignment(n) => {
380                write!(f, "Cannot assign to immutable binding: {n}")
381            }
382            VmError::TypeError(msg) => write!(f, "Type error: {msg}"),
383            VmError::Runtime(msg) => write!(f, "Runtime error: {msg}"),
384            VmError::DivisionByZero => write!(f, "Division by zero"),
385            VmError::Thrown(v) => write!(f, "Thrown: {}", v.display()),
386            VmError::CategorizedError { message, category } => {
387                write!(f, "Error [{}]: {}", category.as_str(), message)
388            }
389            VmError::DaemonQueueFull {
390                daemon_id,
391                capacity,
392            } => write!(
393                f,
394                "Daemon queue full: daemon '{daemon_id}' reached its event_queue_capacity of {capacity}"
395            ),
396            VmError::Deadlock(err) => match err.diagnostic {
397                DeadlockDiagnostic::SelfDeadlock => write!(
398                    f,
399                    "{}: deadlock detected: {} ({} '{}') — this wait can never complete and would block forever",
400                    err.diagnostic.code(),
401                    err.detail,
402                    err.kind,
403                    err.key
404                ),
405                DeadlockDiagnostic::WaitForGraph => write!(
406                    f,
407                    "{}: wait-for deadlock detected: {} ({} '{}') — no active task can make progress",
408                    err.diagnostic.code(),
409                    err.detail,
410                    err.kind,
411                    err.key
412                ),
413            },
414            VmError::Return(_) => write!(f, "Return from function"),
415            VmError::InvalidInstruction(op) => write!(f, "Invalid instruction: 0x{op:02x}"),
416            VmError::ArityMismatch(err) => {
417                let arg_word = match err.expected {
418                    ArityExpect::Exact(1) | ArityExpect::AtLeast(1) => "argument",
419                    _ => "arguments",
420                };
421                write!(
422                    f,
423                    "Arity mismatch: '{}' expects {} {}, got {}{}",
424                    err.callee,
425                    err.expected,
426                    arg_word,
427                    err.got,
428                    fmt_span_suffix(&err.span)
429                )
430            }
431            VmError::ArgTypeMismatch(err) => {
432                write!(
433                    f,
434                    "Type error: '{}' parameter `{}` expects {}, got {}{}",
435                    err.callee,
436                    err.param,
437                    err.expected,
438                    err.got,
439                    fmt_span_suffix(&err.span)
440                )
441            }
442        }
443    }
444}
445
446fn fmt_span_suffix(span: &Option<Span>) -> String {
447    match span {
448        Some(s) => format!(" (at byte {}..{})", s.start, s.end),
449        None => String::new(),
450    }
451}
452
// Empty impl: the trait's provided methods are used as-is (no `source()`
// override), which lets `VmError` be used wherever a `std::error::Error`
// is expected.
impl std::error::Error for VmError {}
454
#[cfg(test)]
mod tests {
    use super::*;

    /// Both spellings of "cancelled" must classify as `Cancelled`.
    #[test]
    fn classifies_cancelled_messages() {
        assert_eq!(
            classify_error_message("Bridge: operation cancelled"),
            ErrorCategory::Cancelled
        );
        assert_eq!(
            classify_error_message("operation canceled by host"),
            ErrorCategory::Cancelled
        );
    }

    /// A standalone status code drives classification; digits embedded in a
    /// longer number must not.
    #[test]
    fn classifies_by_http_status_code() {
        assert_eq!(
            classify_error_message("HTTP 429 Too Many Requests"),
            ErrorCategory::RateLimit
        );
        assert_eq!(
            classify_error_message("upstream returned status 503"),
            ErrorCategory::Overloaded
        );
        assert_eq!(
            classify_error_message("request 14290 failed"),
            ErrorCategory::Generic
        );
    }

    #[test]
    fn deadlock_renders_with_stable_code() {
        let err = VmError::Deadlock(Box::new(DeadlockError::self_deadlock(
            "mutex",
            "__default__",
            "re-entrant acquire",
        )));
        assert!(
            err.to_string().starts_with("HARN-ORC-011"),
            "deadlock Display must carry the stable code: {err}"
        );
    }

    /// The wait-for-graph diagnostic has its own stable code and wording.
    #[test]
    fn wait_for_graph_deadlock_renders_with_stable_code() {
        let err = VmError::Deadlock(Box::new(DeadlockError::wait_for_graph(
            "channel",
            "results",
            "all tasks blocked on receive",
        )));
        let rendered = err.to_string();
        assert!(
            rendered.starts_with("HARN-ORC-012"),
            "wait-for deadlock Display must carry the stable code: {rendered}"
        );
        assert!(
            rendered.contains("wait-for deadlock detected"),
            "wait-for deadlock Display must name the diagnostic: {rendered}"
        );
    }

    #[test]
    fn deadlock_maps_to_generic_category() {
        let err = VmError::Deadlock(Box::new(DeadlockError::self_deadlock(
            "task",
            "task_1",
            "self-join",
        )));
        let category = error_to_category(&err);
        assert_eq!(category, ErrorCategory::Generic);
        assert!(
            !category.is_transient(),
            "a deadlock must not be treated as a retryable transient error"
        );
    }
}