heartbit_core/error.rs
1//! Error type for all heartbit-core fallible operations.
2
3use std::time::Duration;
4
5use crate::types::TokenUsage;
6use thiserror::Error;
7
8/// Top-level error type for the heartbit-core crate.
9///
10/// All fallible public APIs return `Result<T, Error>`. Callers should match on
11/// specific variants rather than converting to strings so that retry logic and
12/// error reporting remain precise.
13///
14/// ## Retryable variants
15///
16/// The following variants indicate transient conditions that callers *may* retry:
17/// - [`Error::Http`] — network-level failures (connection reset, timeout, …)
18/// - [`Error::Api`] with `status >= 500` or `status == 429`
19/// - [`Error::TenantOverloaded`] — back off and retry when capacity is available
20/// - [`Error::CircuitOpen`] — retry after the `until` instant
21///
22/// ## Token accounting
23///
24/// [`Error::WithPartialUsage`] wraps any other variant and carries the token
25/// usage accumulated before the failure. Inspect it with [`Error::partial_usage`]
26/// to charge tokens even on error.
27#[derive(Error, Debug)]
28pub enum Error {
29 /// An HTTP-level error from the `reqwest` client (network failure, TLS error, etc.).
30 ///
31 /// Potentially retryable depending on the underlying cause.
32 #[error("HTTP request failed: {0}")]
33 Http(#[from] reqwest::Error),
34
35 /// JSON serialization or deserialization failed.
36 ///
37 /// Indicates a protocol mismatch or a malformed API response. Not retryable.
38 #[error("JSON serialization/deserialization failed: {0}")]
39 Json(#[from] serde_json::Error),
40
41 /// The LLM API returned a non-2xx HTTP status code.
42 ///
43 /// `status == 429` is rate-limited (retryable). `status >= 500` is a
44 /// server error (retryable). `status == 400` / `401` / `403` are not
45 /// retryable without changing the request.
46 #[error("API error ({status}): {message}")]
47 Api {
48 /// HTTP status code returned by the API.
49 status: u16,
50 /// Human-readable error message from the response body.
51 message: String,
52 },
53
54 /// A general agent-level error not covered by a more specific variant.
55 ///
56 /// Produced by tool execution failures, orchestrator logic errors, and
57 /// other agent-layer problems.
58 #[error("Agent error: {0}")]
59 Agent(String),
60
61 /// Authentication or authorization failure.
62 ///
63 /// Typically indicates a missing or invalid API key. Not retryable without
64 /// supplying valid credentials.
65 #[error("Authentication error: {0}")]
66 Auth(String),
67
68 /// The agent loop reached its configured maximum turn count without finishing.
69 ///
70 /// Not retryable — callers should increase `max_turns` or redesign the task.
71 #[error("Max turns ({0}) exceeded")]
72 MaxTurnsExceeded(usize),
73
74 /// The LLM response was cut off because `max_tokens` was reached.
75 ///
76 /// The agent loop surfaces this as an error when truncation is fatal. Callers
77 /// can increase `max_tokens` or compress context and retry.
78 #[error("Response truncated (max_tokens reached)")]
79 Truncated,
80
81 /// The agent run exceeded the configured wall-clock timeout.
82 ///
83 /// Potentially retryable with a longer timeout or a simpler task.
84 #[error("Run timed out after {0:?}")]
85 RunTimeout(Duration),
86
87 /// An error originating from the Model Context Protocol (MCP) client or server.
88 ///
89 /// Covers handshake failures, protocol violations, and tool call errors
90 /// returned by remote MCP servers.
91 #[error("MCP error: {0}")]
92 Mcp(String),
93
94 /// An error from the Agent-to-Agent (A2A) protocol layer.
95 ///
96 /// Returned when communicating with remote A2A agents fails.
97 #[error("A2A error: {0}")]
98 A2a(String),
99
100 /// An error in configuration parsing or validation.
101 ///
102 /// Produced by `HeartbitConfig` deserialization and by builder `build()` calls
103 /// that detect invalid combinations of options.
104 #[error("Configuration error: {0}")]
105 Config(String),
106
107 /// A persistence-layer error (e.g., PostgreSQL task-store failure).
108 ///
109 /// Potentially retryable on transient connection errors.
110 #[error("Store error: {0}")]
111 Store(String),
112
113 /// An error in the agent memory subsystem (recall, store, prune, etc.).
114 #[error("Memory error: {0}")]
115 Memory(String),
116
117 /// An error in the knowledge-base subsystem (indexing, chunking, search).
118 #[error("Knowledge error: {0}")]
119 Knowledge(String),
120
121 /// A guardrail denied or errored during a request.
122 ///
123 /// Produced when a [`crate::Guardrail`] hook returns `Deny` or when the
124 /// guardrail itself fails. The message contains the denial reason.
125 #[error("Guardrail error: {0}")]
126 Guardrail(String),
127
128 /// An error in the daemon execution path (Kafka consumer, dispatcher, etc.).
129 #[error("Daemon error: {0}")]
130 Daemon(String),
131
132 /// An error in the sensor pipeline (RSS, webhook, schedule triggers).
133 #[error("Sensor error: {0}")]
134 Sensor(String),
135
136 /// The agent exceeded its token budget before completing.
137 ///
138 /// `used` is the total tokens consumed; `limit` is the configured cap.
139 /// Not retryable without either increasing the budget or reducing the task.
140 #[error("Token budget exceeded: used {used}, limit {limit}")]
141 BudgetExceeded {
142 /// Total tokens consumed before the budget was exhausted.
143 used: u64,
144 /// The configured token budget that was exceeded.
145 limit: u64,
146 },
147
148 /// An error in the WebSocket/session channel layer.
149 #[error("Channel error: {0}")]
150 Channel(String),
151
152 /// An error originating from the Telegram bot adapter.
153 #[error("Telegram error: {0}")]
154 Telegram(String),
155
156 /// A kill switch was activated, terminating the agent run immediately.
157 ///
158 /// Produced by the kill-switch guardrail when a prohibited pattern is detected.
159 #[error("Kill switch activated: {0}")]
160 KillSwitch(String),
161
162 /// The agent attempted a filesystem operation that violates the sandbox policy.
163 ///
164 /// Produced by `CorePathPolicy::check_path` or the Landlock sandbox.
165 #[error("Sandbox violation: {0}")]
166 Sandbox(String),
167
168 /// The tenant has reached its maximum concurrent-request capacity.
169 ///
170 /// Retryable: callers should back off and retry after a delay.
171 #[error("tenant {tenant_id} overloaded: in_flight={in_flight}, cap={cap}")]
172 TenantOverloaded {
173 /// The tenant identifier that is overloaded.
174 tenant_id: String,
175 /// Number of requests currently in flight for this tenant.
176 in_flight: usize,
177 /// Maximum allowed concurrent requests for this tenant.
178 cap: usize,
179 },
180
181 /// The LLM provider's circuit breaker is open; requests are being shed.
182 ///
183 /// Retryable: callers should retry after the `until` instant has passed.
184 #[error("circuit breaker open: retry after {until:?} (prev open duration: {prev_duration:?})")]
185 CircuitOpen {
186 /// The instant after which requests should be retried.
187 until: std::time::Instant,
188 /// How long the circuit was open in the previous open window.
189 prev_duration: std::time::Duration,
190 },
191
192 /// Wraps another error with partial token usage accumulated before failure.
193 ///
194 /// Used by `AgentRunner::execute` to surface tokens consumed before an error.
195 /// Inspect partial usage with [`Error::partial_usage`]. Re-wrapping an existing
196 /// `WithPartialUsage` replaces the usage rather than nesting.
197 #[error("{source}")]
198 WithPartialUsage {
199 /// The underlying error that caused the agent run to abort.
200 #[source]
201 source: Box<Error>,
202 /// Token usage accumulated before the error occurred.
203 usage: TokenUsage,
204 },
205}
206
207impl Error {
208 /// Wrap this error with partial token usage data.
209 ///
210 /// If `self` is already `WithPartialUsage`, the inner error is unwrapped
211 /// first to prevent nesting. The new `usage` replaces the old one.
212 pub fn with_partial_usage(self, usage: TokenUsage) -> Self {
213 let inner = match self {
214 Error::WithPartialUsage { source, .. } => *source,
215 other => other,
216 };
217 Error::WithPartialUsage {
218 source: Box::new(inner),
219 usage,
220 }
221 }
222
223 /// Wrap this error with the sum of `prior` usage and the error's own partial usage.
224 ///
225 /// Shorthand for `e.with_partial_usage(prior + e.partial_usage())`.
226 pub fn accumulate_usage(self, prior: TokenUsage) -> Self {
227 let mut usage = prior;
228 usage += self.partial_usage();
229 self.with_partial_usage(usage)
230 }
231
232 /// Extract partial token usage from this error.
233 /// Returns `TokenUsage::default()` for errors that don't carry usage data.
234 pub fn partial_usage(&self) -> TokenUsage {
235 match self {
236 Error::WithPartialUsage { usage, .. } => *usage,
237 _ => TokenUsage::default(),
238 }
239 }
240}
241
#[cfg(test)]
mod tests {
    //! Unit tests for [`Error`]: the rendered `Display` text of each variant
    //! and the partial-usage helpers.

    use super::*;

    #[test]
    fn error_display_messages() {
        let err = Error::Api {
            status: 429,
            message: "rate limited".into(),
        };
        assert_eq!(err.to_string(), "API error (429): rate limited");

        let err = Error::MaxTurnsExceeded(10);
        assert_eq!(err.to_string(), "Max turns (10) exceeded");

        let err = Error::Truncated;
        assert_eq!(err.to_string(), "Response truncated (max_tokens reached)");
    }

    #[test]
    fn error_agent_display_message() {
        let err = Error::Agent("tool execution failed".into());
        assert_eq!(err.to_string(), "Agent error: tool execution failed");
    }

    #[test]
    fn error_auth_display_message() {
        let err = Error::Auth("invalid token".into());
        assert_eq!(err.to_string(), "Authentication error: invalid token");
    }

    #[test]
    fn error_mcp_display_message() {
        let err = Error::Mcp("connection refused".into());
        assert_eq!(err.to_string(), "MCP error: connection refused");
    }

    #[test]
    fn error_a2a_display_message() {
        let err = Error::A2a("agent not found".into());
        assert_eq!(err.to_string(), "A2A error: agent not found");
    }

    #[test]
    fn error_config_display_message() {
        let err = Error::Config("missing api_key".into());
        assert_eq!(err.to_string(), "Configuration error: missing api_key");
    }

    #[test]
    fn error_store_display_message() {
        let err = Error::Store("connection refused".into());
        assert_eq!(err.to_string(), "Store error: connection refused");
    }

    #[test]
    fn error_memory_display_message() {
        let err = Error::Memory("not found".into());
        assert_eq!(err.to_string(), "Memory error: not found");
    }

    #[test]
    fn error_knowledge_display_message() {
        let err = Error::Knowledge("file not found".into());
        assert_eq!(err.to_string(), "Knowledge error: file not found");
    }

    #[test]
    fn error_guardrail_display_message() {
        let err = Error::Guardrail("PII detected in output".into());
        assert_eq!(err.to_string(), "Guardrail error: PII detected in output");
    }

    #[test]
    fn error_daemon_display_message() {
        let err = Error::Daemon("broker connection refused".into());
        assert_eq!(err.to_string(), "Daemon error: broker connection refused");
    }

    #[test]
    fn error_sensor_display_message() {
        let err = Error::Sensor("RSS feed unreachable".into());
        assert_eq!(err.to_string(), "Sensor error: RSS feed unreachable");
    }

    #[test]
    fn error_channel_display_message() {
        let err = Error::Channel("connection closed".into());
        assert_eq!(err.to_string(), "Channel error: connection closed");
    }

    #[test]
    fn error_telegram_display_message() {
        let err = Error::Telegram("bot token invalid".into());
        assert_eq!(err.to_string(), "Telegram error: bot token invalid");
    }

    #[test]
    fn error_kill_switch_display_message() {
        let err = Error::KillSwitch("prohibited pattern".into());
        assert_eq!(err.to_string(), "Kill switch activated: prohibited pattern");
    }

    #[test]
    fn error_sandbox_display_message() {
        let err = Error::Sandbox("path outside workspace".into());
        assert_eq!(err.to_string(), "Sandbox violation: path outside workspace");
    }

    #[test]
    fn error_tenant_overloaded_display_message() {
        let err = Error::TenantOverloaded {
            tenant_id: "acme".into(),
            in_flight: 8,
            cap: 8,
        };
        assert_eq!(
            err.to_string(),
            "tenant acme overloaded: in_flight=8, cap=8"
        );
    }

    #[test]
    fn error_circuit_open_display_message() {
        let err = Error::CircuitOpen {
            until: std::time::Instant::now(),
            prev_duration: Duration::from_secs(5),
        };
        let rendered = err.to_string();
        // The `Instant` debug output is platform-specific, so only the fixed
        // prefix and suffix of the message are checked.
        assert!(
            rendered.starts_with("circuit breaker open: retry after "),
            "unexpected prefix: {rendered}"
        );
        assert!(
            rendered.ends_with(" (prev open duration: 5s)"),
            "unexpected suffix: {rendered}"
        );
    }

    #[test]
    fn error_run_timeout_display_message() {
        let err = Error::RunTimeout(Duration::from_secs(30));
        assert_eq!(err.to_string(), "Run timed out after 30s");
    }

    #[test]
    fn run_timeout_with_partial_usage() {
        let usage = TokenUsage {
            input_tokens: 200,
            output_tokens: 100,
            ..Default::default()
        };
        let err = Error::RunTimeout(Duration::from_secs(60)).with_partial_usage(usage);
        assert_eq!(err.to_string(), "Run timed out after 60s");
        let partial = err.partial_usage();
        assert_eq!(partial.input_tokens, 200);
        assert_eq!(partial.output_tokens, 100);
    }

    #[test]
    fn with_partial_usage_wraps_error() {
        let usage = TokenUsage {
            input_tokens: 100,
            output_tokens: 50,
            ..Default::default()
        };
        let err = Error::MaxTurnsExceeded(5).with_partial_usage(usage);
        assert_eq!(err.to_string(), "Max turns (5) exceeded");
        let partial = err.partial_usage();
        assert_eq!(partial.input_tokens, 100);
        assert_eq!(partial.output_tokens, 50);
    }

    #[test]
    fn with_partial_usage_unwraps_existing() {
        let inner_usage = TokenUsage {
            input_tokens: 50,
            output_tokens: 25,
            ..Default::default()
        };
        let outer_usage = TokenUsage {
            input_tokens: 100,
            output_tokens: 50,
            ..Default::default()
        };
        // First wrap
        let err = Error::MaxTurnsExceeded(5).with_partial_usage(inner_usage);
        // Second wrap should unwrap the first, not nest
        let err = err.with_partial_usage(outer_usage);

        // Should be exactly one layer of WithPartialUsage
        match &err {
            Error::WithPartialUsage { source, usage } => {
                assert!(
                    matches!(**source, Error::MaxTurnsExceeded(5)),
                    "inner error should be MaxTurnsExceeded, got: {source}"
                );
                assert_eq!(usage.input_tokens, 100);
                assert_eq!(usage.output_tokens, 50);
            }
            other => panic!("expected WithPartialUsage, got: {other}"),
        }
    }

    #[test]
    fn accumulate_usage_on_plain_error_wraps_with_prior() {
        let prior = TokenUsage {
            input_tokens: 100,
            output_tokens: 50,
            ..Default::default()
        };
        // A plain error contributes `TokenUsage::default()` to the sum.
        let mut expected = prior;
        expected += TokenUsage::default();

        let err = Error::Truncated.accumulate_usage(prior);
        assert_eq!(err.to_string(), "Response truncated (max_tokens reached)");
        assert_eq!(err.partial_usage(), expected);
        assert!(
            matches!(&err, Error::WithPartialUsage { source, .. } if matches!(**source, Error::Truncated)),
            "expected a single WithPartialUsage layer around Truncated, got: {err:?}"
        );
    }

    #[test]
    fn accumulate_usage_combines_prior_with_existing_usage() {
        let inner_usage = TokenUsage {
            input_tokens: 50,
            output_tokens: 25,
            ..Default::default()
        };
        let prior = TokenUsage {
            input_tokens: 100,
            output_tokens: 50,
            ..Default::default()
        };
        let mut expected = prior;
        expected += inner_usage;

        let err = Error::MaxTurnsExceeded(5)
            .with_partial_usage(inner_usage)
            .accumulate_usage(prior);

        // The result must stay a single wrapper layer carrying the combined usage.
        match &err {
            Error::WithPartialUsage { source, usage } => {
                assert!(
                    matches!(**source, Error::MaxTurnsExceeded(5)),
                    "inner error should be MaxTurnsExceeded, got: {source}"
                );
                assert_eq!(*usage, expected);
            }
            other => panic!("expected WithPartialUsage, got: {other}"),
        }
    }

    #[test]
    fn error_budget_exceeded_display_message() {
        let err = Error::BudgetExceeded {
            used: 150000,
            limit: 100000,
        };
        assert_eq!(
            err.to_string(),
            "Token budget exceeded: used 150000, limit 100000"
        );
    }

    #[test]
    fn budget_exceeded_with_partial_usage() {
        let usage = TokenUsage {
            input_tokens: 100000,
            output_tokens: 50000,
            ..Default::default()
        };
        let err = Error::BudgetExceeded {
            used: 150000,
            limit: 100000,
        }
        .with_partial_usage(usage);
        assert_eq!(
            err.to_string(),
            "Token budget exceeded: used 150000, limit 100000"
        );
        let partial = err.partial_usage();
        assert_eq!(partial.input_tokens, 100000);
        assert_eq!(partial.output_tokens, 50000);
    }

    #[test]
    fn partial_usage_returns_default_for_plain_errors() {
        let err = Error::Truncated;
        let partial = err.partial_usage();
        assert_eq!(partial, TokenUsage::default());
    }
}