Skip to main content

harn_vm/value/
error.rs

1use harn_lexer::Span;
2
3use super::VmValue;
4
/// How many arguments a callable is willing to accept.
///
/// Carried by [`VmError::ArityMismatch`] so the rendered error can state the
/// exact signature contract the caller broke.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArityExpect {
    /// A fixed count: N parameters with no defaults and no rest parameter.
    Exact(usize),
    /// An inclusive `min..=max` window: some parameters have defaults, but
    /// the upper bound is still fixed.
    Range { min: usize, max: usize },
    /// N or more: surplus arguments are collected into a rest list, as with
    /// `print` / `log` and other variadics.
    AtLeast(usize),
}

impl std::fmt::Display for ArityExpect {
    /// Renders the bound the way arity diagnostics show it: `N`,
    /// `min..=max`, or `at least N`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `ArityExpect` is `Copy`, so match on the value and bind plain
        // `usize`s rather than references.
        match *self {
            Self::Exact(n) => write!(f, "{n}"),
            Self::Range { min, max } => write!(f, "{min}..={max}"),
            Self::AtLeast(n) => write!(f, "at least {n}"),
        }
    }
}
28
29#[derive(Debug, Clone)]
30pub struct ArityMismatchError {
31    pub callee: String,
32    pub expected: ArityExpect,
33    pub got: usize,
34    pub span: Option<Span>,
35}
36
/// Which deadlock diagnostic a deadlock error reports. Each variant maps to
/// its own stable diagnostic code.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeadlockDiagnostic {
    /// Reported under code `HARN-ORC-011`.
    SelfDeadlock,
    /// Reported under code `HARN-ORC-012`.
    WaitForGraph,
}

impl DeadlockDiagnostic {
    /// Stable diagnostic code that prefixes the rendered error message.
    fn code(self) -> &'static str {
        const SELF_DEADLOCK_CODE: &str = "HARN-ORC-011";
        const WAIT_FOR_GRAPH_CODE: &str = "HARN-ORC-012";

        match self {
            DeadlockDiagnostic::SelfDeadlock => SELF_DEADLOCK_CODE,
            DeadlockDiagnostic::WaitForGraph => WAIT_FOR_GRAPH_CODE,
        }
    }
}
51
52/// Payload for [`VmError::Deadlock`]. `kind` is the primitive kind
53/// (`"mutex"`, `"channel"`) or `"task"`; `key` is the primitive key or task
54/// id; `detail` names the specific footgun.
55#[derive(Debug, Clone)]
56pub struct DeadlockError {
57    pub diagnostic: DeadlockDiagnostic,
58    pub kind: String,
59    pub key: String,
60    pub detail: String,
61}
62
63impl DeadlockError {
64    pub(crate) fn self_deadlock(
65        kind: impl Into<String>,
66        key: impl Into<String>,
67        detail: impl Into<String>,
68    ) -> Self {
69        Self {
70            diagnostic: DeadlockDiagnostic::SelfDeadlock,
71            kind: kind.into(),
72            key: key.into(),
73            detail: detail.into(),
74        }
75    }
76
77    pub(crate) fn wait_for_graph(
78        kind: impl Into<String>,
79        key: impl Into<String>,
80        detail: impl Into<String>,
81    ) -> Self {
82        Self {
83            diagnostic: DeadlockDiagnostic::WaitForGraph,
84            kind: kind.into(),
85            key: key.into(),
86            detail: detail.into(),
87        }
88    }
89}
90
91#[derive(Debug, Clone)]
92pub struct ArgTypeMismatchError {
93    pub callee: String,
94    pub param: String,
95    pub expected: String,
96    pub got: &'static str,
97    pub span: Option<Span>,
98}
99
100#[derive(Debug, Clone)]
101pub enum VmError {
102    StackUnderflow,
103    StackOverflow,
104    UndefinedVariable(String),
105    UndefinedBuiltin(String),
106    ImmutableAssignment(String),
107    TypeError(String),
108    Runtime(String),
109    DivisionByZero,
110    Thrown(VmValue),
111    /// Thrown with error category for structured error handling.
112    CategorizedError {
113        message: String,
114        category: ErrorCategory,
115    },
116    DaemonQueueFull {
117        daemon_id: String,
118        capacity: usize,
119    },
120    /// A deterministic, provably-unresolvable self-deadlock caught before the
121    /// VM would block forever (Rust's borrow checker prevents data races but
122    /// not deadlocks; this is the Go-runtime "all goroutines asleep" analogue
123    /// for the cases we can prove). Boxed — like [`VmError::ArityMismatch`] —
124    /// so the rare three-`String` payload doesn't enlarge `VmError` on the
125    /// pervasive `Result<VmValue, VmError>` hot path. Carries `HARN-ORC-011`.
126    Deadlock(Box<DeadlockError>),
127    Return(VmValue),
128    InvalidInstruction(u8),
129    /// Wrong number of arguments at a call site. Distinct from
130    /// [`VmError::TypeError`] so the runtime can match-and-recover (and
131    /// so error UX renders `expected 2..=3 got 1` consistently).
132    ArityMismatch(Box<ArityMismatchError>),
133    /// Argument value did not satisfy the declared parameter type.
134    /// `expected` is a pretty-printed type expression; `got` is the value's
135    /// runtime type name (`VmValue::type_name`). Used for both
136    /// user-defined function parameters (with declared types) and
137    /// registry-known builtin parameters.
138    ArgTypeMismatch(Box<ArgTypeMismatchError>),
139}
140
/// Coarse error classes used for structured error handling in agent
/// orchestration.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ErrorCategory {
    /// A network or connection timeout.
    Timeout,
    /// Authentication or authorization failed.
    Auth,
    /// A rate limit was exceeded (HTTP 429 / quota).
    RateLimit,
    /// The upstream provider is overloaded (HTTP 503 / 529).
    /// Unlike `RateLimit`, the client has not exceeded a quota — the
    /// provider is shedding load and will recover on its own.
    Overloaded,
    /// A provider-side 5xx error (500, 502) that is not specifically
    /// overload.
    ServerError,
    /// A transient network-level failure (connection reset, DNS hiccup,
    /// partial stream) — retryable, but not provider-status-coded.
    TransientNetwork,
    /// LLM output failed schema validation. Retryable via `schema_retries`.
    SchemaValidation,
    /// An LLM streaming response was aborted mid-stream because the partial
    /// JSON content could not conceivably satisfy `output_schema`. Surfaced
    /// by `llm_call` when `schema_stream_abort` is on (the default for
    /// schema-bearing calls). Consumes one `schema_retries` budget slot;
    /// the retry replays the prompt with a corrective nudge that cites the
    /// abort path + reason.
    SchemaStreamAborted,
    /// A tool failed while executing.
    ToolError,
    /// The host rejected the tool (not permitted / not in allowlist).
    ToolRejected,
    /// Policy blocked outbound network egress.
    EgressBlocked,
    /// The operation was cancelled.
    Cancelled,
    /// The channel was closed before the operation could complete.
    ChannelClosed,
    /// The resource was not found.
    NotFound,
    /// The circuit breaker is open.
    CircuitOpen,
    /// An LLM cost or token budget would be exceeded.
    BudgetExceeded,
    /// Generic / unclassified error.
    Generic,
}

impl ErrorCategory {
    /// Every category except the `Generic` fallback. `parse` searches this
    /// table, comparing against each entry's `as_str` name.
    const NAMED: [Self; 16] = [
        Self::Timeout,
        Self::Auth,
        Self::RateLimit,
        Self::Overloaded,
        Self::ServerError,
        Self::TransientNetwork,
        Self::SchemaValidation,
        Self::SchemaStreamAborted,
        Self::ToolError,
        Self::ToolRejected,
        Self::EgressBlocked,
        Self::Cancelled,
        Self::ChannelClosed,
        Self::NotFound,
        Self::CircuitOpen,
        Self::BudgetExceeded,
    ];

    /// The category's snake_case string name.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Timeout => "timeout",
            Self::Auth => "auth",
            Self::RateLimit => "rate_limit",
            Self::Overloaded => "overloaded",
            Self::ServerError => "server_error",
            Self::TransientNetwork => "transient_network",
            Self::SchemaValidation => "schema_validation",
            Self::SchemaStreamAborted => "schema_stream_aborted",
            Self::ToolError => "tool_error",
            Self::ToolRejected => "tool_rejected",
            Self::EgressBlocked => "egress_blocked",
            Self::Cancelled => "cancelled",
            Self::ChannelClosed => "channel_closed",
            Self::NotFound => "not_found",
            Self::CircuitOpen => "circuit_open",
            Self::BudgetExceeded => "budget_exceeded",
            Self::Generic => "generic",
        }
    }

    /// Inverse of [`ErrorCategory::as_str`]. The match is exact and
    /// case-sensitive; any unrecognized string (and `"generic"` itself)
    /// yields [`ErrorCategory::Generic`].
    pub fn parse(s: &str) -> Self {
        Self::NAMED
            .iter()
            .find(|candidate| candidate.as_str() == s)
            .cloned()
            .unwrap_or(Self::Generic)
    }

    /// Whether an error of this category is worth retrying for a transient
    /// provider-side reason. Agent loops consult this to choose between
    /// backing off and retrying or surfacing the error to the user.
    pub fn is_transient(&self) -> bool {
        matches!(
            self,
            Self::Timeout
                | Self::RateLimit
                | Self::Overloaded
                | Self::ServerError
                | Self::TransientNetwork
        )
    }
}
247
248/// Create a categorized error conveniently.
249pub fn categorized_error(message: impl Into<String>, category: ErrorCategory) -> VmError {
250    VmError::CategorizedError {
251        message: message.into(),
252        category,
253    }
254}
255
256/// Extract error category from a VmError.
257///
258/// Classification priority:
259/// 1. Explicit CategorizedError variant (set by throw_error or internal code)
260/// 2. Thrown dict with a "category" field (user-created structured errors)
261/// 3. HTTP status code extraction (standard, unambiguous)
262/// 4. Deadline exceeded (VM-internal)
263/// 5. Fallback to Generic
264pub fn error_to_category(err: &VmError) -> ErrorCategory {
265    match err {
266        VmError::CategorizedError { category, .. } => category.clone(),
267        VmError::Thrown(VmValue::Dict(d)) => d
268            .get("category")
269            .map(|v| ErrorCategory::parse(&v.display()))
270            .unwrap_or(ErrorCategory::Generic),
271        VmError::Thrown(VmValue::String(s)) => classify_error_message(s),
272        VmError::Runtime(msg) => classify_error_message(msg),
273        // A deadlock is permanently non-retryable and not provider-related —
274        // `Generic` is the correct "surface it, don't back off" bucket.
275        VmError::Deadlock(_) => ErrorCategory::Generic,
276        _ => ErrorCategory::Generic,
277    }
278}
279
280/// Classify an error message using HTTP status codes and well-known patterns.
281/// Prefers unambiguous signals (status codes) over substring heuristics.
282pub fn classify_error_message(msg: &str) -> ErrorCategory {
283    // 1. HTTP status codes — most reliable signal
284    if let Some(cat) = classify_by_http_status(msg) {
285        return cat;
286    }
287    // 2. Well-known error identifiers from major APIs
288    //    (Anthropic, OpenAI, and standard HTTP patterns)
289    let lower = msg.to_lowercase();
290    if lower.contains("cancelled") || lower.contains("canceled") {
291        return ErrorCategory::Cancelled;
292    }
293    if msg.contains("ChannelClosed") || lower.contains("channel closed") {
294        return ErrorCategory::ChannelClosed;
295    }
296    if msg.contains("Deadline exceeded") || msg.contains("context deadline exceeded") {
297        return ErrorCategory::Timeout;
298    }
299    if msg.contains("overloaded_error") {
300        // Anthropic overloaded_error surfaces as HTTP 529.
301        return ErrorCategory::Overloaded;
302    }
303    if msg.contains("api_error") {
304        // Anthropic catch-all server-side error.
305        return ErrorCategory::ServerError;
306    }
307    if msg.contains("insufficient_quota") || msg.contains("billing_hard_limit_reached") {
308        // OpenAI-specific quota error types.
309        return ErrorCategory::RateLimit;
310    }
311    if msg.contains("invalid_api_key") || msg.contains("authentication_error") {
312        return ErrorCategory::Auth;
313    }
314    if msg.contains("not_found_error") || msg.contains("model_not_found") {
315        return ErrorCategory::NotFound;
316    }
317    // OpenRouter reports an unknown model as HTTP 400 with the body
318    // "<id> is not a valid model ID" — no status-code or typed-error signal
319    // that `classify_by_http_status` / the checks above can latch onto. Map
320    // the prose to NotFound so it lines up with Cerebras's 404 path (and with
321    // `errors::is_model_unavailable`'s reason taxonomy).
322    if lower.contains("is not a valid model id") || lower.contains("invalid model id") {
323        return ErrorCategory::NotFound;
324    }
325    if msg.contains("circuit_open") {
326        return ErrorCategory::CircuitOpen;
327    }
328    // Network-level transient patterns (pre-HTTP-status, pre-provider-framing).
329    if lower.contains("connection reset")
330        || lower.contains("connection refused")
331        || lower.contains("connection closed")
332        || lower.contains("broken pipe")
333        || lower.contains("dns error")
334        || lower.contains("stream error")
335        || lower.contains("unexpected eof")
336    {
337        return ErrorCategory::TransientNetwork;
338    }
339    ErrorCategory::Generic
340}
341
342/// Classify errors by HTTP status code if one appears in the message.
343/// This is the most reliable classification method since status codes
344/// are standardized (RFC 9110) and unambiguous.
345fn classify_by_http_status(msg: &str) -> Option<ErrorCategory> {
346    // Extract 3-digit HTTP status codes from common patterns:
347    // "HTTP 429", "status 429", "429 Too Many", "error: 401"
348    for code in extract_http_status_codes(msg) {
349        return Some(match code {
350            401 | 403 => ErrorCategory::Auth,
351            404 | 410 => ErrorCategory::NotFound,
352            408 | 504 | 522 | 524 => ErrorCategory::Timeout,
353            429 => ErrorCategory::RateLimit,
354            503 | 529 => ErrorCategory::Overloaded,
355            500 | 502 => ErrorCategory::ServerError,
356            _ => continue,
357        });
358    }
359    None
360}
361
/// Extract plausible HTTP status codes from an error message.
///
/// Returns, in order of appearance, every run of exactly three ASCII digits
/// (not adjacent to another digit on either side) whose value lies in
/// `400..=599`.
fn extract_http_status_codes(msg: &str) -> Vec<u16> {
    let bytes = msg.as_bytes();
    bytes
        .windows(3)
        .enumerate()
        .filter_map(|(start, run)| {
            if !run.iter().all(u8::is_ascii_digit) {
                return None;
            }
            // Reject runs that are a slice of a longer number.
            let digit_before = start > 0 && bytes[start - 1].is_ascii_digit();
            let digit_after = matches!(bytes.get(start + 3), Some(b) if b.is_ascii_digit());
            if digit_before || digit_after {
                return None;
            }
            // Three decimal digits top out at 999, well inside `u16`.
            Some(
                run.iter()
                    .fold(0u16, |acc, &digit| acc * 10 + u16::from(digit - b'0')),
            )
        })
        .filter(|code| (400..=599).contains(code))
        .collect()
}
386
387impl std::fmt::Display for VmError {
388    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
389        match self {
390            VmError::StackUnderflow => write!(f, "Stack underflow"),
391            VmError::StackOverflow => write!(f, "Stack overflow: too many nested calls"),
392            VmError::UndefinedVariable(n) => write!(f, "Undefined variable: {n}"),
393            VmError::UndefinedBuiltin(n) => write!(f, "Undefined builtin: {n}"),
394            VmError::ImmutableAssignment(n) => {
395                write!(f, "Cannot assign to immutable binding: {n}")
396            }
397            VmError::TypeError(msg) => write!(f, "Type error: {msg}"),
398            VmError::Runtime(msg) => write!(f, "Runtime error: {msg}"),
399            VmError::DivisionByZero => write!(f, "Division by zero"),
400            VmError::Thrown(v) => write!(f, "Thrown: {}", v.display()),
401            VmError::CategorizedError { message, category } => {
402                write!(f, "Error [{}]: {}", category.as_str(), message)
403            }
404            VmError::DaemonQueueFull {
405                daemon_id,
406                capacity,
407            } => write!(
408                f,
409                "Daemon queue full: daemon '{daemon_id}' reached its event_queue_capacity of {capacity}"
410            ),
411            VmError::Deadlock(err) => match err.diagnostic {
412                DeadlockDiagnostic::SelfDeadlock => write!(
413                    f,
414                    "{}: deadlock detected: {} ({} '{}') — this wait can never complete and would block forever",
415                    err.diagnostic.code(),
416                    err.detail,
417                    err.kind,
418                    err.key
419                ),
420                DeadlockDiagnostic::WaitForGraph => write!(
421                    f,
422                    "{}: wait-for deadlock detected: {} ({} '{}') — no active task can make progress",
423                    err.diagnostic.code(),
424                    err.detail,
425                    err.kind,
426                    err.key
427                ),
428            },
429            VmError::Return(_) => write!(f, "Return from function"),
430            VmError::InvalidInstruction(op) => write!(f, "Invalid instruction: 0x{op:02x}"),
431            VmError::ArityMismatch(err) => {
432                let arg_word = match err.expected {
433                    ArityExpect::Exact(1) | ArityExpect::AtLeast(1) => "argument",
434                    _ => "arguments",
435                };
436                write!(
437                    f,
438                    "Arity mismatch: '{}' expects {} {}, got {}{}",
439                    err.callee,
440                    err.expected,
441                    arg_word,
442                    err.got,
443                    fmt_span_suffix(&err.span)
444                )
445            }
446            VmError::ArgTypeMismatch(err) => {
447                write!(
448                    f,
449                    "Type error: '{}' parameter `{}` expects {}, got {}{}",
450                    err.callee,
451                    err.param,
452                    err.expected,
453                    err.got,
454                    fmt_span_suffix(&err.span)
455                )
456            }
457        }
458    }
459}
460
461fn fmt_span_suffix(span: &Option<Span>) -> String {
462    match span {
463        Some(s) => format!(" (at byte {}..{})", s.start, s.end),
464        None => String::new(),
465    }
466}
467
468impl std::error::Error for VmError {}
469
#[cfg(test)]
mod tests {
    use super::*;

    /// Both spellings ("cancelled" / "canceled") classify as `Cancelled`.
    #[test]
    fn classifies_cancelled_messages() {
        for message in ["Bridge: operation cancelled", "operation canceled by host"] {
            assert_eq!(classify_error_message(message), ErrorCategory::Cancelled);
        }
    }

    /// OpenRouter reports an unknown model as HTTP 400 + prose. The 400 is
    /// not classified by status, so the prose substring must lift it to
    /// NotFound to match Cerebras's 404 path.
    #[test]
    fn classifies_openrouter_invalid_model_id_as_not_found() {
        let provider_message =
            "openrouter API error: qwen/qwen3-coder-bogus is not a valid model ID";
        assert_eq!(
            classify_error_message(provider_message),
            ErrorCategory::NotFound
        );

        let short_form = "invalid model id supplied";
        assert_eq!(classify_error_message(short_form), ErrorCategory::NotFound);
    }

    /// The rendered self-deadlock message starts with its stable code.
    #[test]
    fn deadlock_renders_with_stable_code() {
        let payload = DeadlockError::self_deadlock("mutex", "__default__", "re-entrant acquire");
        let err = VmError::Deadlock(Box::new(payload));
        let rendered = err.to_string();
        assert!(
            rendered.starts_with("HARN-ORC-011"),
            "deadlock Display must carry the stable code: {err}"
        );
    }

    /// A deadlock lands in `Generic`, which is not a transient category.
    #[test]
    fn deadlock_maps_to_generic_category() {
        let payload = DeadlockError::self_deadlock("task", "task_1", "self-join");
        let err = VmError::Deadlock(Box::new(payload));

        let category = error_to_category(&err);
        assert_eq!(category, ErrorCategory::Generic);
        assert!(
            !category.is_transient(),
            "a deadlock must not be treated as a retryable transient error"
        );
    }
}