Skip to main content

heartbit_core/
error.rs

1//! Error type for all heartbit-core fallible operations.
2
3use std::time::Duration;
4
5use crate::types::TokenUsage;
6use thiserror::Error;
7
8/// Top-level error type for the heartbit-core crate.
9///
10/// All fallible public APIs return `Result<T, Error>`. Callers should match on
11/// specific variants rather than converting to strings so that retry logic and
12/// error reporting remain precise.
13///
14/// ## Retryable variants
15///
16/// The following variants indicate transient conditions that callers *may* retry:
17/// - [`Error::Http`] — network-level failures (connection reset, timeout, …)
18/// - [`Error::Api`] with `status >= 500` or `status == 429`
19/// - [`Error::TenantOverloaded`] — back off and retry when capacity is available
20/// - [`Error::CircuitOpen`] — retry after the `until` instant
21///
22/// ## Token accounting
23///
24/// [`Error::WithPartialUsage`] wraps any other variant and carries the token
25/// usage accumulated before the failure. Inspect it with [`Error::partial_usage`]
26/// to charge tokens even on error.
27#[derive(Error, Debug)]
28pub enum Error {
29    /// An HTTP-level error from the `reqwest` client (network failure, TLS error, etc.).
30    ///
31    /// Potentially retryable depending on the underlying cause.
32    #[error("HTTP request failed: {0}")]
33    Http(#[from] reqwest::Error),
34
35    /// JSON serialization or deserialization failed.
36    ///
37    /// Indicates a protocol mismatch or a malformed API response. Not retryable.
38    #[error("JSON serialization/deserialization failed: {0}")]
39    Json(#[from] serde_json::Error),
40
41    /// The LLM API returned a non-2xx HTTP status code.
42    ///
43    /// `status == 429` is rate-limited (retryable). `status >= 500` is a
44    /// server error (retryable). `status == 400` / `401` / `403` are not
45    /// retryable without changing the request.
46    #[error("API error ({status}): {message}")]
47    Api {
48        /// HTTP status code returned by the API.
49        status: u16,
50        /// Human-readable error message from the response body.
51        message: String,
52    },
53
54    /// A general agent-level error not covered by a more specific variant.
55    ///
56    /// Produced by tool execution failures, orchestrator logic errors, and
57    /// other agent-layer problems.
58    #[error("Agent error: {0}")]
59    Agent(String),
60
61    /// Authentication or authorization failure.
62    ///
63    /// Typically indicates a missing or invalid API key. Not retryable without
64    /// supplying valid credentials.
65    #[error("Authentication error: {0}")]
66    Auth(String),
67
68    /// The agent loop reached its configured maximum turn count without finishing.
69    ///
70    /// Not retryable — callers should increase `max_turns` or redesign the task.
71    #[error("Max turns ({0}) exceeded")]
72    MaxTurnsExceeded(usize),
73
74    /// The LLM response was cut off because `max_tokens` was reached.
75    ///
76    /// The agent loop surfaces this as an error when truncation is fatal. Callers
77    /// can increase `max_tokens` or compress context and retry.
78    #[error("Response truncated (max_tokens reached)")]
79    Truncated,
80
81    /// The agent run exceeded the configured wall-clock timeout.
82    ///
83    /// Potentially retryable with a longer timeout or a simpler task.
84    #[error("Run timed out after {0:?}")]
85    RunTimeout(Duration),
86
87    /// An error originating from the Model Context Protocol (MCP) client or server.
88    ///
89    /// Covers handshake failures, protocol violations, and tool call errors
90    /// returned by remote MCP servers.
91    #[error("MCP error: {0}")]
92    Mcp(String),
93
94    /// An error from the Agent-to-Agent (A2A) protocol layer.
95    ///
96    /// Returned when communicating with remote A2A agents fails.
97    #[error("A2A error: {0}")]
98    A2a(String),
99
100    /// An error in configuration parsing or validation.
101    ///
102    /// Produced by `HeartbitConfig` deserialization and by builder `build()` calls
103    /// that detect invalid combinations of options.
104    #[error("Configuration error: {0}")]
105    Config(String),
106
107    /// A persistence-layer error (e.g., PostgreSQL task-store failure).
108    ///
109    /// Potentially retryable on transient connection errors.
110    #[error("Store error: {0}")]
111    Store(String),
112
113    /// An error in the agent memory subsystem (recall, store, prune, etc.).
114    #[error("Memory error: {0}")]
115    Memory(String),
116
117    /// An error in the knowledge-base subsystem (indexing, chunking, search).
118    #[error("Knowledge error: {0}")]
119    Knowledge(String),
120
121    /// A guardrail denied or errored during a request.
122    ///
123    /// Produced when a [`crate::Guardrail`] hook returns `Deny` or when the
124    /// guardrail itself fails. The message contains the denial reason.
125    #[error("Guardrail error: {0}")]
126    Guardrail(String),
127
128    /// An error in the daemon execution path (Kafka consumer, dispatcher, etc.).
129    #[error("Daemon error: {0}")]
130    Daemon(String),
131
132    /// An error in the sensor pipeline (RSS, webhook, schedule triggers).
133    #[error("Sensor error: {0}")]
134    Sensor(String),
135
136    /// The agent exceeded its token budget before completing.
137    ///
138    /// `used` is the total tokens consumed; `limit` is the configured cap.
139    /// Not retryable without either increasing the budget or reducing the task.
140    #[error("Token budget exceeded: used {used}, limit {limit}")]
141    BudgetExceeded {
142        /// Total tokens consumed before the budget was exhausted.
143        used: u64,
144        /// The configured token budget that was exceeded.
145        limit: u64,
146    },
147
148    /// An error in the WebSocket/session channel layer.
149    #[error("Channel error: {0}")]
150    Channel(String),
151
152    /// An error originating from the Telegram bot adapter.
153    #[error("Telegram error: {0}")]
154    Telegram(String),
155
156    /// A kill switch was activated, terminating the agent run immediately.
157    ///
158    /// Produced by the kill-switch guardrail when a prohibited pattern is detected.
159    #[error("Kill switch activated: {0}")]
160    KillSwitch(String),
161
162    /// The agent attempted a filesystem operation that violates the sandbox policy.
163    ///
164    /// Produced by `CorePathPolicy::check_path` or the Landlock sandbox.
165    #[error("Sandbox violation: {0}")]
166    Sandbox(String),
167
168    /// The tenant has reached its maximum concurrent-request capacity.
169    ///
170    /// Retryable: callers should back off and retry after a delay.
171    #[error("tenant {tenant_id} overloaded: in_flight={in_flight}, cap={cap}")]
172    TenantOverloaded {
173        /// The tenant identifier that is overloaded.
174        tenant_id: String,
175        /// Number of requests currently in flight for this tenant.
176        in_flight: usize,
177        /// Maximum allowed concurrent requests for this tenant.
178        cap: usize,
179    },
180
181    /// The LLM provider's circuit breaker is open; requests are being shed.
182    ///
183    /// Retryable: callers should retry after the `until` instant has passed.
184    #[error("circuit breaker open: retry after {until:?} (prev open duration: {prev_duration:?})")]
185    CircuitOpen {
186        /// The instant after which requests should be retried.
187        until: std::time::Instant,
188        /// How long the circuit was open in the previous open window.
189        prev_duration: std::time::Duration,
190    },
191
192    /// Wraps another error with partial token usage accumulated before failure.
193    ///
194    /// Used by `AgentRunner::execute` to surface tokens consumed before an error.
195    /// Inspect partial usage with [`Error::partial_usage`]. Re-wrapping an existing
196    /// `WithPartialUsage` replaces the usage rather than nesting.
197    #[error("{source}")]
198    WithPartialUsage {
199        /// The underlying error that caused the agent run to abort.
200        #[source]
201        source: Box<Error>,
202        /// Token usage accumulated before the error occurred.
203        usage: TokenUsage,
204    },
205}
206
207impl Error {
208    /// Wrap this error with partial token usage data.
209    ///
210    /// If `self` is already `WithPartialUsage`, the inner error is unwrapped
211    /// first to prevent nesting. The new `usage` replaces the old one.
212    pub fn with_partial_usage(self, usage: TokenUsage) -> Self {
213        let inner = match self {
214            Error::WithPartialUsage { source, .. } => *source,
215            other => other,
216        };
217        Error::WithPartialUsage {
218            source: Box::new(inner),
219            usage,
220        }
221    }
222
223    /// Wrap this error with the sum of `prior` usage and the error's own partial usage.
224    ///
225    /// Shorthand for `e.with_partial_usage(prior + e.partial_usage())`.
226    pub fn accumulate_usage(self, prior: TokenUsage) -> Self {
227        let mut usage = prior;
228        usage += self.partial_usage();
229        self.with_partial_usage(usage)
230    }
231
232    /// Extract partial token usage from this error.
233    /// Returns `TokenUsage::default()` for errors that don't carry usage data.
234    pub fn partial_usage(&self) -> TokenUsage {
235        match self {
236            Error::WithPartialUsage { usage, .. } => *usage,
237            _ => TokenUsage::default(),
238        }
239    }
240}
241
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Display output of individual variants -------------------------------

    #[test]
    fn error_display_messages() {
        let api = Error::Api {
            status: 429,
            message: String::from("rate limited"),
        };
        assert_eq!(api.to_string(), "API error (429): rate limited");

        let max_turns = Error::MaxTurnsExceeded(10);
        assert_eq!(max_turns.to_string(), "Max turns (10) exceeded");

        let truncated = Error::Truncated;
        assert_eq!(
            truncated.to_string(),
            "Response truncated (max_tokens reached)"
        );
    }

    #[test]
    fn error_auth_display_message() {
        let rendered = Error::Auth(String::from("invalid token")).to_string();
        assert_eq!(rendered, "Authentication error: invalid token");
    }

    #[test]
    fn error_mcp_display_message() {
        let rendered = Error::Mcp(String::from("connection refused")).to_string();
        assert_eq!(rendered, "MCP error: connection refused");
    }

    #[test]
    fn error_a2a_display_message() {
        let rendered = Error::A2a(String::from("agent not found")).to_string();
        assert_eq!(rendered, "A2A error: agent not found");
    }

    #[test]
    fn error_store_display_message() {
        let rendered = Error::Store(String::from("connection refused")).to_string();
        assert_eq!(rendered, "Store error: connection refused");
    }

    #[test]
    fn error_memory_display_message() {
        let rendered = Error::Memory(String::from("not found")).to_string();
        assert_eq!(rendered, "Memory error: not found");
    }

    #[test]
    fn error_knowledge_display_message() {
        let rendered = Error::Knowledge(String::from("file not found")).to_string();
        assert_eq!(rendered, "Knowledge error: file not found");
    }

    #[test]
    fn error_guardrail_display_message() {
        let rendered = Error::Guardrail(String::from("PII detected in output")).to_string();
        assert_eq!(rendered, "Guardrail error: PII detected in output");
    }

    #[test]
    fn error_daemon_display_message() {
        let rendered = Error::Daemon(String::from("broker connection refused")).to_string();
        assert_eq!(rendered, "Daemon error: broker connection refused");
    }

    #[test]
    fn error_sensor_display_message() {
        let rendered = Error::Sensor(String::from("RSS feed unreachable")).to_string();
        assert_eq!(rendered, "Sensor error: RSS feed unreachable");
    }

    #[test]
    fn error_channel_display_message() {
        let rendered = Error::Channel(String::from("connection closed")).to_string();
        assert_eq!(rendered, "Channel error: connection closed");
    }

    #[test]
    fn error_telegram_display_message() {
        let rendered = Error::Telegram(String::from("bot token invalid")).to_string();
        assert_eq!(rendered, "Telegram error: bot token invalid");
    }

    #[test]
    fn error_run_timeout_display_message() {
        let rendered = Error::RunTimeout(Duration::from_secs(30)).to_string();
        assert_eq!(rendered, "Run timed out after 30s");
    }

    // ---- Partial-usage wrapping ----------------------------------------------

    #[test]
    fn run_timeout_with_partial_usage() {
        let consumed = TokenUsage {
            input_tokens: 200,
            output_tokens: 100,
            ..Default::default()
        };
        let wrapped = Error::RunTimeout(Duration::from_secs(60)).with_partial_usage(consumed);

        // The wrapper is transparent for display purposes.
        assert_eq!(wrapped.to_string(), "Run timed out after 60s");

        let reported = wrapped.partial_usage();
        assert_eq!(reported.input_tokens, 200);
        assert_eq!(reported.output_tokens, 100);
    }

    #[test]
    fn with_partial_usage_wraps_error() {
        let consumed = TokenUsage {
            input_tokens: 100,
            output_tokens: 50,
            ..Default::default()
        };
        let wrapped = Error::MaxTurnsExceeded(5).with_partial_usage(consumed);

        assert_eq!(wrapped.to_string(), "Max turns (5) exceeded");

        let reported = wrapped.partial_usage();
        assert_eq!(reported.input_tokens, 100);
        assert_eq!(reported.output_tokens, 50);
    }

    #[test]
    fn with_partial_usage_unwraps_existing() {
        let first_usage = TokenUsage {
            input_tokens: 50,
            output_tokens: 25,
            ..Default::default()
        };
        let second_usage = TokenUsage {
            input_tokens: 100,
            output_tokens: 50,
            ..Default::default()
        };

        // Wrap twice: the second call must replace the first wrapper rather
        // than nest inside it.
        let err = Error::MaxTurnsExceeded(5)
            .with_partial_usage(first_usage)
            .with_partial_usage(second_usage);

        // Exactly one WithPartialUsage layer, carrying the second usage.
        match &err {
            Error::WithPartialUsage { source, usage } => {
                let inner_is_max_turns = matches!(**source, Error::MaxTurnsExceeded(5));
                assert!(
                    inner_is_max_turns,
                    "inner error should be MaxTurnsExceeded, got: {source}"
                );
                assert_eq!(usage.input_tokens, 100);
                assert_eq!(usage.output_tokens, 50);
            }
            other => panic!("expected WithPartialUsage, got: {other}"),
        }
    }

    #[test]
    fn error_budget_exceeded_display_message() {
        let rendered = Error::BudgetExceeded {
            used: 150_000,
            limit: 100_000,
        }
        .to_string();
        assert_eq!(rendered, "Token budget exceeded: used 150000, limit 100000");
    }

    #[test]
    fn budget_exceeded_with_partial_usage() {
        let consumed = TokenUsage {
            input_tokens: 100_000,
            output_tokens: 50_000,
            ..Default::default()
        };
        let over_budget = Error::BudgetExceeded {
            used: 150_000,
            limit: 100_000,
        };
        let wrapped = over_budget.with_partial_usage(consumed);

        assert_eq!(
            wrapped.to_string(),
            "Token budget exceeded: used 150000, limit 100000"
        );

        let reported = wrapped.partial_usage();
        assert_eq!(reported.input_tokens, 100_000);
        assert_eq!(reported.output_tokens, 50_000);
    }

    #[test]
    fn partial_usage_returns_default_for_plain_errors() {
        // A variant without attached usage reports the all-default usage.
        let reported = Error::Truncated.partial_usage();
        assert_eq!(reported, TokenUsage::default());
    }
}