Skip to main content

harn_vm/value/
error.rs

1use harn_lexer::Span;
2
3use super::VmValue;
4
/// How many arguments a callable is willing to accept.
///
/// Carried by [`VmError::ArityMismatch`] so the rendered message can quote the
/// precise signature contract that the call site broke.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArityExpect {
    /// Precisely N parameters — no defaults and no rest parameter.
    Exact(usize),
    /// Between `min` and `max` inclusive: some parameters carry defaults, but
    /// the upper bound is still fixed.
    Range { min: usize, max: usize },
    /// N or more parameters; any surplus arguments are collected into a rest
    /// list. Used for `print` / `log` / variadics.
    AtLeast(usize),
}

impl std::fmt::Display for ArityExpect {
    /// Renders the bound the way it appears inside an arity error: `N`,
    /// `min..=max`, or `at least N`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            Self::Exact(count) => write!(f, "{count}"),
            Self::Range { min: lo, max: hi } => write!(f, "{lo}..={hi}"),
            Self::AtLeast(floor) => write!(f, "at least {floor}"),
        }
    }
}
28
/// Payload for [`VmError::ArityMismatch`]: a call that supplied the wrong
/// number of arguments.
#[derive(Debug, Clone)]
pub struct ArityMismatchError {
    /// Name of the callable, quoted in the rendered message.
    pub callee: String,
    /// The argument-count bound the callee accepts.
    pub expected: ArityExpect,
    /// Number of arguments that were actually supplied.
    pub got: usize,
    /// Source span associated with the error, when known; rendered as a
    /// byte-range suffix on the message.
    pub span: Option<Span>,
}
36
/// Which kind of deadlock was diagnosed. Each kind maps to its own stable
/// diagnostic code.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeadlockDiagnostic {
    SelfDeadlock,
    WaitForGraph,
}

impl DeadlockDiagnostic {
    /// Stable diagnostic code that prefixes the rendered deadlock message.
    fn code(self) -> &'static str {
        const SELF_DEADLOCK_CODE: &str = "HARN-ORC-011";
        const WAIT_FOR_GRAPH_CODE: &str = "HARN-ORC-012";

        match self {
            DeadlockDiagnostic::WaitForGraph => WAIT_FOR_GRAPH_CODE,
            DeadlockDiagnostic::SelfDeadlock => SELF_DEADLOCK_CODE,
        }
    }
}
51
/// Payload for [`VmError::Deadlock`]. `kind` is the primitive kind
/// (`"mutex"`, `"channel"`) or `"task"`; `key` is the primitive key or task
/// id; `detail` names the specific footgun.
#[derive(Debug, Clone)]
pub struct DeadlockError {
    /// Which deadlock diagnosis this is; selects the stable code and the
    /// message wording used when the error is rendered.
    pub diagnostic: DeadlockDiagnostic,
    /// Primitive kind (`"mutex"`, `"channel"`) or `"task"`.
    pub kind: String,
    /// Primitive key or task id.
    pub key: String,
    /// Short description of the specific footgun.
    pub detail: String,
}
62
63impl DeadlockError {
64    pub(crate) fn self_deadlock(
65        kind: impl Into<String>,
66        key: impl Into<String>,
67        detail: impl Into<String>,
68    ) -> Self {
69        Self {
70            diagnostic: DeadlockDiagnostic::SelfDeadlock,
71            kind: kind.into(),
72            key: key.into(),
73            detail: detail.into(),
74        }
75    }
76
77    pub(crate) fn wait_for_graph(
78        kind: impl Into<String>,
79        key: impl Into<String>,
80        detail: impl Into<String>,
81    ) -> Self {
82        Self {
83            diagnostic: DeadlockDiagnostic::WaitForGraph,
84            kind: kind.into(),
85            key: key.into(),
86            detail: detail.into(),
87        }
88    }
89}
90
/// Payload for [`VmError::ArgTypeMismatch`]: an argument whose value did not
/// satisfy the declared parameter type.
#[derive(Debug, Clone)]
pub struct ArgTypeMismatchError {
    /// Name of the callable, quoted in the rendered message.
    pub callee: String,
    /// Name of the parameter that received the mismatched argument.
    pub param: String,
    /// Pretty-printed type expression the parameter expects.
    pub expected: String,
    /// Runtime type name of the value that was supplied.
    pub got: &'static str,
    /// Source span associated with the error, when known; rendered as a
    /// byte-range suffix on the message.
    pub span: Option<Span>,
}
99
/// Error type of the VM. It travels on the pervasive
/// `Result<VmValue, VmError>` path, so the larger, rarer payloads are boxed to
/// keep the enum itself small.
#[derive(Debug, Clone)]
pub enum VmError {
    StackUnderflow,
    StackOverflow,
    UndefinedVariable(String),
    UndefinedBuiltin(String),
    ImmutableAssignment(String),
    TypeError(String),
    Runtime(String),
    DivisionByZero,
    Thrown(VmValue),
    /// Thrown with error category for structured error handling.
    CategorizedError {
        message: String,
        category: ErrorCategory,
    },
    DaemonQueueFull {
        daemon_id: String,
        capacity: usize,
    },
    /// A deterministic, provably-unresolvable deadlock — a self-deadlock or a
    /// wait-for-graph deadlock, per [`DeadlockDiagnostic`] — caught before the
    /// VM would block forever (Rust's borrow checker prevents data races but
    /// not deadlocks; this is the Go-runtime "all goroutines asleep" analogue
    /// for the cases we can prove). Boxed — like [`VmError::ArityMismatch`] —
    /// so the rare three-`String` payload doesn't enlarge `VmError` on the
    /// pervasive `Result<VmValue, VmError>` hot path. Renders with the stable
    /// code `HARN-ORC-011` (self-deadlock) or `HARN-ORC-012` (wait-for-graph).
    Deadlock(Box<DeadlockError>),
    Return(VmValue),
    InvalidInstruction(u8),
    /// Wrong number of arguments at a call site. Distinct from
    /// [`VmError::TypeError`] so the runtime can match-and-recover (and
    /// so error UX renders `expected 2..=3 got 1` consistently).
    ArityMismatch(Box<ArityMismatchError>),
    /// Argument value did not satisfy the declared parameter type.
    /// `expected` is a pretty-printed type expression; `got` is the value's
    /// runtime type name (`VmValue::type_name`). Used for both
    /// user-defined function parameters (with declared types) and
    /// registry-known builtin parameters.
    ArgTypeMismatch(Box<ArgTypeMismatchError>),
}
140
141impl VmError {
142    /// The `VmValue` a `catch` binding (or a `parallel settle` result) observes
143    /// for this error: the raw thrown value for [`VmError::Thrown`] (so a
144    /// structured error — e.g. a `{category, message}` dict from `throw_error` —
145    /// keeps its shape and category), otherwise the rendered message.
146    ///
147    /// Single source of truth for VM-error-to-value lowering so every seam that
148    /// surfaces a caught error to Harn (`try`/`catch` via `handle_error`,
149    /// `parallel settle`) exposes identical, structure-preserving values. Before
150    /// this was shared, `parallel settle` stringified errors via `to_string()`,
151    /// so a categorized error thrown in a settle branch lost its category (a
152    /// `cancelled`/`internal` fault that must propagate looked `generic`).
153    pub fn thrown_value(&self) -> VmValue {
154        match self {
155            VmError::Thrown(v) => v.clone(),
156            other => VmValue::String(arcstr::ArcStr::from(other.to_string())),
157        }
158    }
159}
160
/// Coarse classification of errors, used for structured error handling in
/// agent orchestration.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ErrorCategory {
    /// Network/connection timeout.
    Timeout,
    /// Authentication/authorization failure.
    Auth,
    /// Rate limit exceeded (HTTP 429 / quota).
    RateLimit,
    /// Upstream provider is overloaded (HTTP 503 / 529).
    /// Unlike `RateLimit`, the client hasn't exceeded a quota — the provider
    /// is shedding load and will recover on its own.
    Overloaded,
    /// Provider-side 5xx error (500, 502) that isn't specifically overload.
    ServerError,
    /// Network-level transient failure (connection reset, DNS hiccup,
    /// partial stream) — retryable but not provider-status-coded.
    TransientNetwork,
    /// LLM output failed schema validation. Retryable via `schema_retries`.
    SchemaValidation,
    /// LLM streaming response was aborted mid-stream because the partial
    /// JSON content could not conceivably satisfy `output_schema`. Surfaced
    /// by `llm_call` when `schema_stream_abort` is on (the default for
    /// schema-bearing calls). Consumes one `schema_retries` budget slot;
    /// the retry replays the prompt with a corrective nudge that cites
    /// the abort path + reason.
    SchemaStreamAborted,
    /// Tool execution failure.
    ToolError,
    /// Tool was rejected by the host (not permitted / not in allowlist).
    ToolRejected,
    /// Outbound network egress was blocked by policy.
    EgressBlocked,
    /// Operation was cancelled.
    Cancelled,
    /// Channel was closed before the operation could complete.
    ChannelClosed,
    /// Resource not found.
    NotFound,
    /// Circuit breaker is open.
    CircuitOpen,
    /// LLM cost or token budget would be exceeded.
    BudgetExceeded,
    /// An internal engine/wiring bug — an undefined builtin, corrupt bytecode,
    /// or another VM invariant violation that no amount of retrying or model
    /// reasoning can fix. Distinct from `Generic` so callers (notably the agent
    /// loop) can re-raise it loudly instead of folding it into a tool-error
    /// observation and marching on to a `done` status. This is the category
    /// that keeps a mis-wired builtin (e.g. a `#[harn_builtin]` def missing
    /// from its install array) from shipping silently inert.
    Internal,
    /// Generic/unclassified error.
    Generic,
}

impl ErrorCategory {
    /// Stable snake_case name of the category; the inverse of [`Self::parse`].
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Timeout => "timeout",
            Self::Auth => "auth",
            Self::RateLimit => "rate_limit",
            Self::Overloaded => "overloaded",
            Self::ServerError => "server_error",
            Self::TransientNetwork => "transient_network",
            Self::SchemaValidation => "schema_validation",
            Self::SchemaStreamAborted => "schema_stream_aborted",
            Self::ToolError => "tool_error",
            Self::ToolRejected => "tool_rejected",
            Self::EgressBlocked => "egress_blocked",
            Self::Cancelled => "cancelled",
            Self::ChannelClosed => "channel_closed",
            Self::NotFound => "not_found",
            Self::CircuitOpen => "circuit_open",
            Self::BudgetExceeded => "budget_exceeded",
            Self::Internal => "internal",
            Self::Generic => "generic",
        }
    }

    /// Maps a category name back to its variant. The match is exact and
    /// case-sensitive; any unrecognised name (including `"generic"` itself)
    /// yields [`Self::Generic`].
    pub fn parse(s: &str) -> Self {
        match s {
            "timeout" => Self::Timeout,
            "auth" => Self::Auth,
            "rate_limit" => Self::RateLimit,
            "overloaded" => Self::Overloaded,
            "server_error" => Self::ServerError,
            "transient_network" => Self::TransientNetwork,
            "schema_validation" => Self::SchemaValidation,
            "schema_stream_aborted" => Self::SchemaStreamAborted,
            "tool_error" => Self::ToolError,
            "tool_rejected" => Self::ToolRejected,
            "egress_blocked" => Self::EgressBlocked,
            "cancelled" => Self::Cancelled,
            "channel_closed" => Self::ChannelClosed,
            "not_found" => Self::NotFound,
            "circuit_open" => Self::CircuitOpen,
            "budget_exceeded" => Self::BudgetExceeded,
            "internal" => Self::Internal,
            _ => Self::Generic,
        }
    }

    /// True only for [`Self::Internal`]: an engine/wiring bug that must be
    /// surfaced rather than retried or swallowed as a recoverable failure.
    pub fn is_internal(&self) -> bool {
        *self == Self::Internal
    }

    /// True when an error of this category is worth retrying for a transient
    /// provider-side reason. Agent loops consult this to decide whether to
    /// back off and retry or surface the error to the user.
    pub fn is_transient(&self) -> bool {
        matches!(
            self,
            Self::Timeout
                | Self::RateLimit
                | Self::Overloaded
                | Self::ServerError
                | Self::TransientNetwork
        )
    }
}
283
284/// Create a categorized error conveniently.
285pub fn categorized_error(message: impl Into<String>, category: ErrorCategory) -> VmError {
286    VmError::CategorizedError {
287        message: message.into(),
288        category,
289    }
290}
291
292/// Extract error category from a VmError.
293///
294/// Classification priority:
295/// 1. Explicit CategorizedError variant (set by throw_error or internal code)
296/// 2. Thrown dict with a "category" field (user-created structured errors)
297/// 3. HTTP status code extraction (standard, unambiguous)
298/// 4. Deadline exceeded (VM-internal)
299/// 5. Fallback to Generic
300pub fn error_to_category(err: &VmError) -> ErrorCategory {
301    match err {
302        VmError::CategorizedError { category, .. } => category.clone(),
303        VmError::Thrown(VmValue::Dict(d)) => d
304            .get("category")
305            .map(|v| ErrorCategory::parse(&v.display()))
306            .unwrap_or(ErrorCategory::Generic),
307        VmError::Thrown(VmValue::String(s)) => classify_error_message(s),
308        VmError::Runtime(msg) => classify_error_message(msg),
309        // Engine/wiring bugs: an undefined builtin (declared but not installed,
310        // or a typo in stdlib/host code) or corrupt bytecode. No retry or model
311        // reasoning fixes these, so they get their own category the agent loop
312        // re-raises instead of swallowing.
313        VmError::UndefinedBuiltin(_) | VmError::InvalidInstruction(_) => ErrorCategory::Internal,
314        // A deadlock is permanently non-retryable and not provider-related —
315        // `Generic` is the correct "surface it, don't back off" bucket.
316        VmError::Deadlock(_) => ErrorCategory::Generic,
317        _ => ErrorCategory::Generic,
318    }
319}
320
321/// Classify an error message using HTTP status codes and well-known patterns.
322/// Prefers unambiguous signals (status codes) over substring heuristics.
323pub fn classify_error_message(msg: &str) -> ErrorCategory {
324    // 1. HTTP status codes — most reliable signal
325    if let Some(cat) = classify_by_http_status(msg) {
326        return cat;
327    }
328    // 2. Internal engine/wiring bug surfaced as a plain message. Some call
329    //    sites build `Runtime("Undefined builtin: …")` strings instead of the
330    //    structured `VmError::UndefinedBuiltin` variant; classify both the same
331    //    so the agent loop re-raises rather than swallows.
332    if msg.contains("Undefined builtin") {
333        return ErrorCategory::Internal;
334    }
335    // 3. Well-known error identifiers from major APIs
336    //    (Anthropic, OpenAI, and standard HTTP patterns)
337    let lower = msg.to_lowercase();
338    if lower.contains("cancelled") || lower.contains("canceled") {
339        return ErrorCategory::Cancelled;
340    }
341    if msg.contains("ChannelClosed") || lower.contains("channel closed") {
342        return ErrorCategory::ChannelClosed;
343    }
344    if msg.contains("Deadline exceeded") || msg.contains("context deadline exceeded") {
345        return ErrorCategory::Timeout;
346    }
347    if msg.contains("overloaded_error") {
348        // Anthropic overloaded_error surfaces as HTTP 529.
349        return ErrorCategory::Overloaded;
350    }
351    if msg.contains("api_error") {
352        // Anthropic catch-all server-side error.
353        return ErrorCategory::ServerError;
354    }
355    if msg.contains("insufficient_quota") || msg.contains("billing_hard_limit_reached") {
356        // OpenAI-specific quota error types.
357        return ErrorCategory::RateLimit;
358    }
359    if msg.contains("invalid_api_key") || msg.contains("authentication_error") {
360        return ErrorCategory::Auth;
361    }
362    if msg.contains("not_found_error") || msg.contains("model_not_found") {
363        return ErrorCategory::NotFound;
364    }
365    // OpenRouter reports an unknown model as HTTP 400 with the body
366    // "<id> is not a valid model ID" — no status-code or typed-error signal
367    // that `classify_by_http_status` / the checks above can latch onto. Map
368    // the prose to NotFound so it lines up with Cerebras's 404 path (and with
369    // `errors::is_model_unavailable`'s reason taxonomy).
370    if lower.contains("is not a valid model id") || lower.contains("invalid model id") {
371        return ErrorCategory::NotFound;
372    }
373    if msg.contains("circuit_open") {
374        return ErrorCategory::CircuitOpen;
375    }
376    // Network-level transient patterns (pre-HTTP-status, pre-provider-framing).
377    if lower.contains("connection reset")
378        || lower.contains("connection refused")
379        || lower.contains("connection closed")
380        || lower.contains("broken pipe")
381        || lower.contains("dns error")
382        || lower.contains("stream error")
383        || lower.contains("unexpected eof")
384    {
385        return ErrorCategory::TransientNetwork;
386    }
387    ErrorCategory::Generic
388}
389
390/// Classify errors by HTTP status code if one appears in the message.
391/// This is the most reliable classification method since status codes
392/// are standardized (RFC 9110) and unambiguous.
393fn classify_by_http_status(msg: &str) -> Option<ErrorCategory> {
394    // Extract 3-digit HTTP status codes from common patterns:
395    // "HTTP 429", "status 429", "429 Too Many", "error: 401"
396    for code in extract_http_status_codes(msg) {
397        return Some(match code {
398            401 | 403 => ErrorCategory::Auth,
399            404 | 410 => ErrorCategory::NotFound,
400            408 | 504 | 522 | 524 => ErrorCategory::Timeout,
401            429 => ErrorCategory::RateLimit,
402            503 | 529 => ErrorCategory::Overloaded,
403            500 | 502 => ErrorCategory::ServerError,
404            _ => continue,
405        });
406    }
407    None
408}
409
/// Collect plausible HTTP error status codes from an error message.
///
/// A candidate is a run of exactly three ASCII digits — not preceded or
/// followed by another digit, so slices of longer numbers are ignored — whose
/// value lies in `400..=599`. Codes are returned in order of appearance.
fn extract_http_status_codes(msg: &str) -> Vec<u16> {
    let bytes = msg.as_bytes();
    bytes
        .windows(3)
        .enumerate()
        .filter(|(_, triple)| triple.iter().all(u8::is_ascii_digit))
        .filter(|&(start, _)| {
            // Reject triples that are merely a slice of a longer digit run.
            let digit_before = start > 0 && bytes[start - 1].is_ascii_digit();
            let digit_after = matches!(bytes.get(start + 3), Some(b) if b.is_ascii_digit());
            !digit_before && !digit_after
        })
        .map(|(_, triple)| {
            triple
                .iter()
                .fold(0u16, |value, digit| value * 10 + u16::from(digit - b'0'))
        })
        .filter(|code| (400..=599).contains(code))
        .collect()
}
434
435impl std::fmt::Display for VmError {
436    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
437        match self {
438            VmError::StackUnderflow => write!(f, "Stack underflow"),
439            VmError::StackOverflow => write!(f, "Stack overflow: too many nested calls"),
440            VmError::UndefinedVariable(n) => write!(f, "Undefined variable: {n}"),
441            VmError::UndefinedBuiltin(n) => write!(f, "Undefined builtin: {n}"),
442            VmError::ImmutableAssignment(n) => {
443                write!(f, "Cannot assign to immutable binding: {n}")
444            }
445            VmError::TypeError(msg) => write!(f, "Type error: {msg}"),
446            VmError::Runtime(msg) => write!(f, "Runtime error: {msg}"),
447            VmError::DivisionByZero => write!(f, "Division by zero"),
448            VmError::Thrown(v) => write!(f, "Thrown: {}", v.display()),
449            VmError::CategorizedError { message, category } => {
450                write!(f, "Error [{}]: {}", category.as_str(), message)
451            }
452            VmError::DaemonQueueFull {
453                daemon_id,
454                capacity,
455            } => write!(
456                f,
457                "Daemon queue full: daemon '{daemon_id}' reached its event_queue_capacity of {capacity}"
458            ),
459            VmError::Deadlock(err) => match err.diagnostic {
460                DeadlockDiagnostic::SelfDeadlock => write!(
461                    f,
462                    "{}: deadlock detected: {} ({} '{}') — this wait can never complete and would block forever",
463                    err.diagnostic.code(),
464                    err.detail,
465                    err.kind,
466                    err.key
467                ),
468                DeadlockDiagnostic::WaitForGraph => write!(
469                    f,
470                    "{}: wait-for deadlock detected: {} ({} '{}') — no active task can make progress",
471                    err.diagnostic.code(),
472                    err.detail,
473                    err.kind,
474                    err.key
475                ),
476            },
477            VmError::Return(_) => write!(f, "Return from function"),
478            VmError::InvalidInstruction(op) => write!(f, "Invalid instruction: 0x{op:02x}"),
479            VmError::ArityMismatch(err) => {
480                let arg_word = match err.expected {
481                    ArityExpect::Exact(1) | ArityExpect::AtLeast(1) => "argument",
482                    _ => "arguments",
483                };
484                write!(
485                    f,
486                    "Arity mismatch: '{}' expects {} {}, got {}{}",
487                    err.callee,
488                    err.expected,
489                    arg_word,
490                    err.got,
491                    fmt_span_suffix(&err.span)
492                )
493            }
494            VmError::ArgTypeMismatch(err) => {
495                write!(
496                    f,
497                    "Type error: '{}' parameter `{}` expects {}, got {}{}",
498                    err.callee,
499                    err.param,
500                    err.expected,
501                    err.got,
502                    fmt_span_suffix(&err.span)
503                )
504            }
505        }
506    }
507}
508
509fn fmt_span_suffix(span: &Option<Span>) -> String {
510    match span {
511        Some(s) => format!(" (at byte {}..{})", s.start, s.end),
512        None => String::new(),
513    }
514}
515
// Empty impl: `VmError` relies on the trait's default methods, with the
// required `Debug` (derived) and `Display` (implemented above) supplying the
// message.
impl std::error::Error for VmError {}
517
/// Unit tests pinning message classification and the stable deadlock codes.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn classifies_cancelled_messages() {
        assert_eq!(
            classify_error_message("Bridge: operation cancelled"),
            ErrorCategory::Cancelled
        );
        assert_eq!(
            classify_error_message("operation canceled by host"),
            ErrorCategory::Cancelled
        );
    }

    #[test]
    fn classifies_undefined_builtin_as_internal() {
        // Structured variant (dispatch table miss / uninstalled builtin).
        assert_eq!(
            error_to_category(&VmError::UndefinedBuiltin("__host_agent_foo".into())),
            ErrorCategory::Internal
        );
        // Corrupt bytecode / compiler-VM opcode drift.
        assert_eq!(
            error_to_category(&VmError::InvalidInstruction(200)),
            ErrorCategory::Internal
        );
        // Stringly form: some call sites build a `Runtime("Undefined builtin: …")`
        // message instead of the structured variant — both must classify the same.
        assert_eq!(
            error_to_category(&VmError::Runtime(
                "Undefined builtin: __host_agent_foo (did you mean `bar`?)".into()
            )),
            ErrorCategory::Internal
        );
        assert_eq!(
            classify_error_message("Undefined builtin: __host_agent_foo"),
            ErrorCategory::Internal
        );
        // Internal errors are never treated as transient/retryable.
        assert!(!ErrorCategory::Internal.is_transient());
        assert!(ErrorCategory::Internal.is_internal());
        // Round-trips through the string form the agent loop compares against.
        assert_eq!(ErrorCategory::Internal.as_str(), "internal");
        assert_eq!(ErrorCategory::parse("internal"), ErrorCategory::Internal);
    }

    #[test]
    fn classifies_openrouter_invalid_model_id_as_not_found() {
        // OpenRouter reports an unknown model as HTTP 400 + prose. The 400 is
        // not classified by status, so the prose substring must lift it to
        // NotFound to match Cerebras's 404 path.
        assert_eq!(
            classify_error_message(
                "openrouter API error: qwen/qwen3-coder-bogus is not a valid model ID"
            ),
            ErrorCategory::NotFound
        );
        assert_eq!(
            classify_error_message("invalid model id supplied"),
            ErrorCategory::NotFound
        );
    }

    #[test]
    fn deadlock_renders_with_stable_code() {
        let err = VmError::Deadlock(Box::new(DeadlockError::self_deadlock(
            "mutex",
            "__default__",
            "re-entrant acquire",
        )));
        assert!(
            err.to_string().starts_with("HARN-ORC-011"),
            "deadlock Display must carry the stable code: {err}"
        );
    }

    #[test]
    fn wait_for_graph_deadlock_renders_with_its_own_stable_code() {
        // The wait-for-graph diagnosis has a distinct code from self-deadlock;
        // pin it so the two cannot silently collapse into one.
        let err = VmError::Deadlock(Box::new(DeadlockError::wait_for_graph(
            "channel",
            "results",
            "every task is blocked on receive",
        )));
        assert!(
            err.to_string().starts_with("HARN-ORC-012"),
            "wait-for-graph Display must carry its own stable code: {err}"
        );
    }

    #[test]
    fn deadlock_maps_to_generic_category() {
        let err = VmError::Deadlock(Box::new(DeadlockError::self_deadlock(
            "task",
            "task_1",
            "self-join",
        )));
        let category = error_to_category(&err);
        assert_eq!(category, ErrorCategory::Generic);
        assert!(
            !category.is_transient(),
            "a deadlock must not be treated as a retryable transient error"
        );
    }
}