Skip to main content

harn_vm/value/
error.rs

1use harn_lexer::Span;
2
3use super::VmValue;
4
/// How many arguments a callable is willing to accept.
///
/// Carried inside [`VmError::ArityMismatch`] so the rendered message can
/// spell out the exact signature contract the call site broke.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArityExpect {
    /// Precisely this many parameters: no defaults and no rest parameter.
    Exact(usize),
    /// Between `min` and `max` inclusive: some parameters carry defaults,
    /// but the upper bound is fixed.
    Range { min: usize, max: usize },
    /// This many parameters or more; surplus arguments land in a rest list.
    /// Used for `print` / `log` / variadics.
    AtLeast(usize),
}

impl std::fmt::Display for ArityExpect {
    /// Renders the bound as it appears in arity error messages:
    /// `N`, `min..=max`, or `at least N`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            Self::Exact(count) => write!(f, "{count}"),
            Self::Range { min, max } => write!(f, "{min}..={max}"),
            Self::AtLeast(floor) => write!(f, "at least {floor}"),
        }
    }
}
28
29#[derive(Debug, Clone)]
30pub struct ArityMismatchError {
31    pub callee: String,
32    pub expected: ArityExpect,
33    pub got: usize,
34    pub span: Option<Span>,
35}
36
/// Which flavour of deadlock was detected; selects both the diagnostic code
/// and the wording used when a [`VmError::Deadlock`] is displayed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeadlockDiagnostic {
    /// Reported as "this wait can never complete and would block forever".
    SelfDeadlock,
    /// Reported as "no active task can make progress".
    WaitForGraph,
}

impl DeadlockDiagnostic {
    /// Diagnostic code that prefixes the rendered deadlock message.
    fn code(self) -> &'static str {
        match self {
            DeadlockDiagnostic::SelfDeadlock => "HARN-ORC-011",
            DeadlockDiagnostic::WaitForGraph => "HARN-ORC-012",
        }
    }
}
51
52/// Payload for [`VmError::Deadlock`]. `kind` is the primitive kind
53/// (`"mutex"`, `"channel"`) or `"task"`; `key` is the primitive key or task
54/// id; `detail` names the specific footgun.
55#[derive(Debug, Clone)]
56pub struct DeadlockError {
57    pub diagnostic: DeadlockDiagnostic,
58    pub kind: String,
59    pub key: String,
60    pub detail: String,
61}
62
63impl DeadlockError {
64    pub(crate) fn self_deadlock(
65        kind: impl Into<String>,
66        key: impl Into<String>,
67        detail: impl Into<String>,
68    ) -> Self {
69        Self {
70            diagnostic: DeadlockDiagnostic::SelfDeadlock,
71            kind: kind.into(),
72            key: key.into(),
73            detail: detail.into(),
74        }
75    }
76
77    pub(crate) fn wait_for_graph(
78        kind: impl Into<String>,
79        key: impl Into<String>,
80        detail: impl Into<String>,
81    ) -> Self {
82        Self {
83            diagnostic: DeadlockDiagnostic::WaitForGraph,
84            kind: kind.into(),
85            key: key.into(),
86            detail: detail.into(),
87        }
88    }
89}
90
91#[derive(Debug, Clone)]
92pub struct ArgTypeMismatchError {
93    pub callee: String,
94    pub param: String,
95    pub expected: String,
96    pub got: &'static str,
97    pub span: Option<Span>,
98}
99
100#[derive(Debug, Clone)]
101pub enum VmError {
102    StackUnderflow,
103    StackOverflow,
104    UndefinedVariable(String),
105    UndefinedBuiltin(String),
106    ImmutableAssignment(String),
107    TypeError(String),
108    Runtime(String),
109    DivisionByZero,
110    Thrown(VmValue),
111    /// Thrown with error category for structured error handling.
112    CategorizedError {
113        message: String,
114        category: ErrorCategory,
115    },
116    DaemonQueueFull {
117        daemon_id: String,
118        capacity: usize,
119    },
120    /// A deterministic, provably-unresolvable self-deadlock caught before the
121    /// VM would block forever (Rust's borrow checker prevents data races but
122    /// not deadlocks; this is the Go-runtime "all goroutines asleep" analogue
123    /// for the cases we can prove). Boxed — like [`VmError::ArityMismatch`] —
124    /// so the rare three-`String` payload doesn't enlarge `VmError` on the
125    /// pervasive `Result<VmValue, VmError>` hot path. Carries `HARN-ORC-011`.
126    Deadlock(Box<DeadlockError>),
127    Return(VmValue),
128    InvalidInstruction(u8),
129    /// Wrong number of arguments at a call site. Distinct from
130    /// [`VmError::TypeError`] so the runtime can match-and-recover (and
131    /// so error UX renders `expected 2..=3 got 1` consistently).
132    ArityMismatch(Box<ArityMismatchError>),
133    /// Argument value did not satisfy the declared parameter type.
134    /// `expected` is a pretty-printed type expression; `got` is the value's
135    /// runtime type name (`VmValue::type_name`). Used for both
136    /// user-defined function parameters (with declared types) and
137    /// registry-known builtin parameters.
138    ArgTypeMismatch(Box<ArgTypeMismatchError>),
139}
140
/// Coarse classification of errors, used for structured error handling in
/// agent orchestration.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ErrorCategory {
    /// A network or connection timeout.
    Timeout,
    /// Authentication or authorization failed.
    Auth,
    /// A rate limit was exceeded (HTTP 429 / quota).
    RateLimit,
    /// The upstream provider is overloaded (HTTP 503 / 529). Unlike
    /// `RateLimit`, the client has not exceeded a quota — the provider is
    /// shedding load and will recover on its own.
    Overloaded,
    /// A provider-side 5xx error (500, 502) that is not specifically
    /// overload.
    ServerError,
    /// A transient network-level failure (connection reset, DNS hiccup,
    /// partial stream): retryable, but not provider-status-coded.
    TransientNetwork,
    /// LLM output failed schema validation. Retryable via `schema_retries`.
    SchemaValidation,
    /// An LLM streaming response was aborted mid-stream because the partial
    /// JSON content could not conceivably satisfy `output_schema`. Surfaced
    /// by `llm_call` when `schema_stream_abort` is on (the default for
    /// schema-bearing calls). Consumes one `schema_retries` budget slot; the
    /// retry replays the prompt with a corrective nudge that cites the
    /// abort path + reason.
    SchemaStreamAborted,
    /// A tool failed while executing.
    ToolError,
    /// The host rejected the tool (not permitted / not in allowlist).
    ToolRejected,
    /// Policy blocked outbound network egress.
    EgressBlocked,
    /// The operation was cancelled.
    Cancelled,
    /// The channel closed before the operation could complete.
    ChannelClosed,
    /// The requested resource was not found.
    NotFound,
    /// The circuit breaker is open.
    CircuitOpen,
    /// The LLM cost or token budget would be exceeded.
    BudgetExceeded,
    /// Generic / unclassified error.
    Generic,
}

impl ErrorCategory {
    /// Returns the snake_case identifier for this category. This is the
    /// same spelling that [`ErrorCategory::parse`] accepts.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Timeout => "timeout",
            Self::Auth => "auth",
            Self::RateLimit => "rate_limit",
            Self::Overloaded => "overloaded",
            Self::ServerError => "server_error",
            Self::TransientNetwork => "transient_network",
            Self::SchemaValidation => "schema_validation",
            Self::SchemaStreamAborted => "schema_stream_aborted",
            Self::ToolError => "tool_error",
            Self::ToolRejected => "tool_rejected",
            Self::EgressBlocked => "egress_blocked",
            Self::Cancelled => "cancelled",
            Self::ChannelClosed => "channel_closed",
            Self::NotFound => "not_found",
            Self::CircuitOpen => "circuit_open",
            Self::BudgetExceeded => "budget_exceeded",
            Self::Generic => "generic",
        }
    }

    /// Maps a snake_case identifier back to its category. Matching is exact
    /// and case-sensitive; any unrecognised string yields `Generic`.
    pub fn parse(s: &str) -> Self {
        match s {
            "timeout" => Self::Timeout,
            "auth" => Self::Auth,
            "rate_limit" => Self::RateLimit,
            "overloaded" => Self::Overloaded,
            "server_error" => Self::ServerError,
            "transient_network" => Self::TransientNetwork,
            "schema_validation" => Self::SchemaValidation,
            "schema_stream_aborted" => Self::SchemaStreamAborted,
            "tool_error" => Self::ToolError,
            "tool_rejected" => Self::ToolRejected,
            "egress_blocked" => Self::EgressBlocked,
            "cancelled" => Self::Cancelled,
            "channel_closed" => Self::ChannelClosed,
            "not_found" => Self::NotFound,
            "circuit_open" => Self::CircuitOpen,
            "budget_exceeded" => Self::BudgetExceeded,
            _ => Self::Generic,
        }
    }

    /// Reports whether this category is worth retrying for a transient
    /// provider-side reason. Agent loops consult this to decide between
    /// backing off and retrying versus surfacing the error to the user.
    pub fn is_transient(&self) -> bool {
        matches!(
            self,
            Self::Timeout
                | Self::RateLimit
                | Self::Overloaded
                | Self::ServerError
                | Self::TransientNetwork
        )
    }
}
247
248/// Create a categorized error conveniently.
249pub fn categorized_error(message: impl Into<String>, category: ErrorCategory) -> VmError {
250    VmError::CategorizedError {
251        message: message.into(),
252        category,
253    }
254}
255
256/// Extract error category from a VmError.
257///
258/// Classification priority:
259/// 1. Explicit CategorizedError variant (set by throw_error or internal code)
260/// 2. Thrown dict with a "category" field (user-created structured errors)
261/// 3. HTTP status code extraction (standard, unambiguous)
262/// 4. Deadline exceeded (VM-internal)
263/// 5. Fallback to Generic
264pub fn error_to_category(err: &VmError) -> ErrorCategory {
265    match err {
266        VmError::CategorizedError { category, .. } => category.clone(),
267        VmError::Thrown(VmValue::Dict(d)) => d
268            .get("category")
269            .map(|v| ErrorCategory::parse(&v.display()))
270            .unwrap_or(ErrorCategory::Generic),
271        VmError::Thrown(VmValue::String(s)) => classify_error_message(s),
272        VmError::Runtime(msg) => classify_error_message(msg),
273        // A deadlock is permanently non-retryable and not provider-related —
274        // `Generic` is the correct "surface it, don't back off" bucket.
275        VmError::Deadlock(_) => ErrorCategory::Generic,
276        _ => ErrorCategory::Generic,
277    }
278}
279
280/// Classify an error message using HTTP status codes and well-known patterns.
281/// Prefers unambiguous signals (status codes) over substring heuristics.
282pub fn classify_error_message(msg: &str) -> ErrorCategory {
283    // 1. HTTP status codes — most reliable signal
284    if let Some(cat) = classify_by_http_status(msg) {
285        return cat;
286    }
287    // 2. Well-known error identifiers from major APIs
288    //    (Anthropic, OpenAI, and standard HTTP patterns)
289    let lower = msg.to_lowercase();
290    if lower.contains("cancelled") || lower.contains("canceled") {
291        return ErrorCategory::Cancelled;
292    }
293    if msg.contains("ChannelClosed") || lower.contains("channel closed") {
294        return ErrorCategory::ChannelClosed;
295    }
296    if msg.contains("Deadline exceeded") || msg.contains("context deadline exceeded") {
297        return ErrorCategory::Timeout;
298    }
299    if msg.contains("overloaded_error") {
300        // Anthropic overloaded_error surfaces as HTTP 529.
301        return ErrorCategory::Overloaded;
302    }
303    if msg.contains("api_error") {
304        // Anthropic catch-all server-side error.
305        return ErrorCategory::ServerError;
306    }
307    if msg.contains("insufficient_quota") || msg.contains("billing_hard_limit_reached") {
308        // OpenAI-specific quota error types.
309        return ErrorCategory::RateLimit;
310    }
311    if msg.contains("invalid_api_key") || msg.contains("authentication_error") {
312        return ErrorCategory::Auth;
313    }
314    if msg.contains("not_found_error") || msg.contains("model_not_found") {
315        return ErrorCategory::NotFound;
316    }
317    if msg.contains("circuit_open") {
318        return ErrorCategory::CircuitOpen;
319    }
320    // Network-level transient patterns (pre-HTTP-status, pre-provider-framing).
321    if lower.contains("connection reset")
322        || lower.contains("connection refused")
323        || lower.contains("connection closed")
324        || lower.contains("broken pipe")
325        || lower.contains("dns error")
326        || lower.contains("stream error")
327        || lower.contains("unexpected eof")
328    {
329        return ErrorCategory::TransientNetwork;
330    }
331    ErrorCategory::Generic
332}
333
334/// Classify errors by HTTP status code if one appears in the message.
335/// This is the most reliable classification method since status codes
336/// are standardized (RFC 9110) and unambiguous.
337fn classify_by_http_status(msg: &str) -> Option<ErrorCategory> {
338    // Extract 3-digit HTTP status codes from common patterns:
339    // "HTTP 429", "status 429", "429 Too Many", "error: 401"
340    for code in extract_http_status_codes(msg) {
341        return Some(match code {
342            401 | 403 => ErrorCategory::Auth,
343            404 | 410 => ErrorCategory::NotFound,
344            408 | 504 | 522 | 524 => ErrorCategory::Timeout,
345            429 => ErrorCategory::RateLimit,
346            503 | 529 => ErrorCategory::Overloaded,
347            500 | 502 => ErrorCategory::ServerError,
348            _ => continue,
349        });
350    }
351    None
352}
353
/// Collects plausible HTTP error status codes from an error message.
///
/// A candidate is a run of exactly three ASCII digits that is neither
/// preceded nor followed by another digit. Only values in `400..=599` are
/// kept, in order of appearance.
fn extract_http_status_codes(msg: &str) -> Vec<u16> {
    let bytes = msg.as_bytes();
    bytes
        .windows(3)
        .enumerate()
        .filter_map(|(start, triple)| {
            if !triple.iter().all(u8::is_ascii_digit) {
                return None;
            }
            // Reject triples that are a slice of a longer number.
            let digit_before = start > 0 && bytes[start - 1].is_ascii_digit();
            let digit_after = bytes
                .get(start + 3)
                .map_or(false, |next| next.is_ascii_digit());
            if digit_before || digit_after {
                return None;
            }
            let code = triple
                .iter()
                .fold(0u16, |acc, digit| acc * 10 + u16::from(*digit - b'0'));
            if (400..=599).contains(&code) {
                Some(code)
            } else {
                None
            }
        })
        .collect()
}
378
379impl std::fmt::Display for VmError {
380    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
381        match self {
382            VmError::StackUnderflow => write!(f, "Stack underflow"),
383            VmError::StackOverflow => write!(f, "Stack overflow: too many nested calls"),
384            VmError::UndefinedVariable(n) => write!(f, "Undefined variable: {n}"),
385            VmError::UndefinedBuiltin(n) => write!(f, "Undefined builtin: {n}"),
386            VmError::ImmutableAssignment(n) => {
387                write!(f, "Cannot assign to immutable binding: {n}")
388            }
389            VmError::TypeError(msg) => write!(f, "Type error: {msg}"),
390            VmError::Runtime(msg) => write!(f, "Runtime error: {msg}"),
391            VmError::DivisionByZero => write!(f, "Division by zero"),
392            VmError::Thrown(v) => write!(f, "Thrown: {}", v.display()),
393            VmError::CategorizedError { message, category } => {
394                write!(f, "Error [{}]: {}", category.as_str(), message)
395            }
396            VmError::DaemonQueueFull {
397                daemon_id,
398                capacity,
399            } => write!(
400                f,
401                "Daemon queue full: daemon '{daemon_id}' reached its event_queue_capacity of {capacity}"
402            ),
403            VmError::Deadlock(err) => match err.diagnostic {
404                DeadlockDiagnostic::SelfDeadlock => write!(
405                    f,
406                    "{}: deadlock detected: {} ({} '{}') — this wait can never complete and would block forever",
407                    err.diagnostic.code(),
408                    err.detail,
409                    err.kind,
410                    err.key
411                ),
412                DeadlockDiagnostic::WaitForGraph => write!(
413                    f,
414                    "{}: wait-for deadlock detected: {} ({} '{}') — no active task can make progress",
415                    err.diagnostic.code(),
416                    err.detail,
417                    err.kind,
418                    err.key
419                ),
420            },
421            VmError::Return(_) => write!(f, "Return from function"),
422            VmError::InvalidInstruction(op) => write!(f, "Invalid instruction: 0x{op:02x}"),
423            VmError::ArityMismatch(err) => {
424                let arg_word = match err.expected {
425                    ArityExpect::Exact(1) | ArityExpect::AtLeast(1) => "argument",
426                    _ => "arguments",
427                };
428                write!(
429                    f,
430                    "Arity mismatch: '{}' expects {} {}, got {}{}",
431                    err.callee,
432                    err.expected,
433                    arg_word,
434                    err.got,
435                    fmt_span_suffix(&err.span)
436                )
437            }
438            VmError::ArgTypeMismatch(err) => {
439                write!(
440                    f,
441                    "Type error: '{}' parameter `{}` expects {}, got {}{}",
442                    err.callee,
443                    err.param,
444                    err.expected,
445                    err.got,
446                    fmt_span_suffix(&err.span)
447                )
448            }
449        }
450    }
451}
452
453fn fmt_span_suffix(span: &Option<Span>) -> String {
454    match span {
455        Some(s) => format!(" (at byte {}..{})", s.start, s.end),
456        None => String::new(),
457    }
458}
459
460impl std::error::Error for VmError {}
461
#[cfg(test)]
mod tests {
    use super::*;

    /// Both the British and American spellings classify as `Cancelled`.
    #[test]
    fn classifies_cancelled_messages() {
        for message in ["Bridge: operation cancelled", "operation canceled by host"] {
            assert_eq!(classify_error_message(message), ErrorCategory::Cancelled);
        }
    }

    /// The rendered self-deadlock message leads with its diagnostic code.
    #[test]
    fn deadlock_renders_with_stable_code() {
        let payload = DeadlockError::self_deadlock("mutex", "__default__", "re-entrant acquire");
        let err = VmError::Deadlock(Box::new(payload));
        let rendered = err.to_string();
        assert!(
            rendered.starts_with("HARN-ORC-011"),
            "deadlock Display must carry the stable code: {err}"
        );
    }

    /// Deadlocks fall into the non-transient `Generic` bucket.
    #[test]
    fn deadlock_maps_to_generic_category() {
        let payload = DeadlockError::self_deadlock("task", "task_1", "self-join");
        let err = VmError::Deadlock(Box::new(payload));
        let category = error_to_category(&err);
        assert_eq!(category, ErrorCategory::Generic);
        assert!(
            !category.is_transient(),
            "a deadlock must not be treated as a retryable transient error"
        );
    }
}
505}