rig_compose/reliability.rs
1//! Reliability primitives for tool dispatch loops.
2//!
3//! This module groups three small, host-driven utilities that downstream
4//! agents combine when wrapping a normalized tool loop:
5//!
6//! 1. [`RetryClass`] + [`RetryClassifier`] — turn a [`KernelError`] into a
7//! deterministic "should I retry this call?" verdict. The default impl
8//! ([`DefaultRetryClassifier`]) covers every existing
9//! `KernelError` variant; hosts can supply a custom classifier when they
10//! layer in transport-specific errors (timeouts, rate limits, etc.) by
11//! chaining or overriding.
12//! 2. [`ToolCallFingerprint`] — a stable, content-addressed hash of a
13//! [`ToolInvocation`] (tool name + canonical JSON args). Used to detect
14//! repeated calls and group retry attempts.
15//! 3. [`HistoryEntry`] + [`repair_history`] — deterministic coalescing of
16//! a raw `(invocation, outcome)` sequence into the smallest history the
17//! model should see. Multiple retries of the same fingerprint collapse to
18//! a single canonical entry; the host stays in control of how many
19//! physical retries actually happened.
20//!
21//! These primitives are intentionally synchronous and infallible: they
22//! operate on already-materialized invocations and outcomes, never on live
23//! transports.
24//!
25//! # Example
26//!
27//! ```no_run
28//! use rig_compose::{
29//! DefaultRetryClassifier, HistoryEntry, KernelError, RetryClass,
30//! RetryClassifier, ToolInvocation, repair_history,
31//! };
32//! use serde_json::json;
33//!
34//! let classifier = DefaultRetryClassifier;
35//! let inv = ToolInvocation::new("search", json!({"q": "rig"})).expect("valid");
36//! let history = vec![
37//! HistoryEntry::Failed {
38//! invocation: inv.clone(),
39//! class: classifier.classify(&KernelError::ToolFailed("timeout".into())),
40//! message: "timeout".into(),
41//! },
42//! HistoryEntry::Completed {
43//! invocation: inv,
44//! output: json!({"hits": 3}),
45//! },
46//! ];
47//! let repaired = repair_history(&history);
48//! assert_eq!(repaired.len(), 1);
49//! assert!(matches!(repaired[0], HistoryEntry::Completed { .. }));
50//! ```
51
52use std::collections::hash_map::DefaultHasher;
53use std::hash::{Hash, Hasher};
54
55use serde_json::Value;
56
57use crate::normalizer::{ToolInvocation, ToolInvocationResult};
58use crate::registry::KernelError;
59
60// ── Retry classification ─────────────────────────────────────────────────────
61
/// Deterministic verdict on whether an errored tool invocation may be retried.
///
/// `Transient` means the failure was likely environmental (network blip, flaky
/// dependency) and a retry has a real chance of succeeding. `Permanent` means
/// the inputs or policy are wrong and retrying would just waste the budget.
///
/// The verdict is a small `Copy` value that also derives `Eq` and `Hash`, so
/// it can be stored inside history entries, compared directly, or used as a
/// map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RetryClass {
    /// Retry is allowed; the failure is likely environmental.
    Transient,
    /// Retry is forbidden; the failure is intrinsic to the inputs or policy.
    Permanent,
}
74
/// Classify a [`KernelError`] as transient or permanent.
///
/// Hosts can implement this trait to inject transport-specific knowledge
/// (e.g. mapping HTTP 5xx onto `Transient` and 4xx onto `Permanent`). The
/// crate ships [`DefaultRetryClassifier`] as a starting point that covers
/// every current `KernelError` variant.
///
/// Implementors must be `Send + Sync`, so a single classifier value can be
/// shared between threads.
pub trait RetryClassifier: Send + Sync {
    /// Return the retry verdict for `error`.
    ///
    /// The error is only borrowed; the caller keeps ownership and can still
    /// render or propagate it after classification.
    fn classify(&self, error: &KernelError) -> RetryClass;
}
85
/// Default classifier covering every [`KernelError`] variant.
///
/// The mapping is conservative: anything that *could* be a flake (the tool
/// body errored, a skill body errored) is `Transient`. Everything that
/// signals a permanent disagreement (auth, missing names, invalid args,
/// budget exhaustion, dispatch termination, JSON parse errors) is
/// `Permanent`.
///
/// This is a field-less unit struct: it carries no configuration, and it
/// derives `Copy` and `Default`, so it can be created inline wherever a
/// classifier is needed.
#[derive(Debug, Clone, Copy, Default)]
pub struct DefaultRetryClassifier;
95
96impl RetryClassifier for DefaultRetryClassifier {
97 fn classify(&self, error: &KernelError) -> RetryClass {
98 match error {
99 // Body errors — likely environmental, may succeed on retry.
100 KernelError::ToolFailed(_) | KernelError::SkillFailed(_) => RetryClass::Transient,
101
102 // Configuration / policy / argument errors — retrying with the
103 // same inputs cannot help.
104 KernelError::ToolNotFound(_)
105 | KernelError::ToolNotAuthorised(_)
106 | KernelError::SkillNotFound(_)
107 | KernelError::ToolNotApplicable(_)
108 | KernelError::InvalidArgument(_)
109 | KernelError::NormalizerFailed(_)
110 | KernelError::ToolDispatchTerminated(_)
111 | KernelError::BudgetFailed(_)
112 | KernelError::Serde(_) => RetryClass::Permanent,
113 }
114 }
115}
116
117// ── Fingerprints ─────────────────────────────────────────────────────────────
118
119/// Stable content hash of a [`ToolInvocation`].
120///
121/// Two invocations with the same tool name and the same canonical JSON
122/// arguments produce the same fingerprint, regardless of which dispatch
123/// attempt produced them. Uses `std::collections::hash_map::DefaultHasher`,
124/// so values are stable within a single process run but should not be
125/// persisted across versions.
126///
127/// Determinism relies on `serde_json::Map`'s default key ordering
128/// (`BTreeMap`-backed). If a downstream crate enables the `preserve_order`
129/// feature globally, fingerprints will no longer be argument-order-independent.
130#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
131pub struct ToolCallFingerprint(pub u64);
132
133impl ToolInvocation {
134 /// Return a stable fingerprint over `(name, canonical(args))`.
135 ///
136 /// Suitable as a hash-map key to group retry attempts of the same
137 /// logical call, or to detect a stuck-loop pattern where the model
138 /// keeps reissuing identical calls.
139 pub fn fingerprint(&self) -> ToolCallFingerprint {
140 let mut hasher = DefaultHasher::new();
141 self.name.hash(&mut hasher);
142 // Serializing through serde_json gives a canonical form because
143 // `serde_json::Map` is BTreeMap-backed without `preserve_order`.
144 canonicalize_value(&self.args).to_string().hash(&mut hasher);
145 ToolCallFingerprint(hasher.finish())
146 }
147}
148
149/// Re-serialize `value` to a canonical form for hashing.
150///
151/// `serde_json::Value::to_string` is already deterministic when the
152/// `preserve_order` feature is off, but going through a normalize step
153/// makes that contract explicit and gives us a single hook to add float
154/// canonicalization later if needed.
155fn canonicalize_value(value: &Value) -> Value {
156 match value {
157 Value::Array(items) => Value::Array(items.iter().map(canonicalize_value).collect()),
158 Value::Object(map) => {
159 let mut out = serde_json::Map::new();
160 for (key, inner) in map {
161 out.insert(key.clone(), canonicalize_value(inner));
162 }
163 Value::Object(out)
164 }
165 other => other.clone(),
166 }
167}
168
169// ── History repair ───────────────────────────────────────────────────────────
170
/// One entry in a tool-call history slice fed back to the model.
///
/// Both variants carry the [`ToolInvocation`] they describe, so every entry
/// can be grouped by that invocation's fingerprint.
#[derive(Debug, Clone, PartialEq)]
pub enum HistoryEntry {
    /// The invocation completed and produced `output`.
    Completed {
        /// The invocation that ran.
        invocation: ToolInvocation,
        /// The JSON result the tool returned.
        output: Value,
    },
    /// The invocation failed; `class` records the retry verdict and
    /// `message` carries the error rendering.
    Failed {
        /// The invocation that failed.
        invocation: ToolInvocation,
        /// Retry verdict from a [`RetryClassifier`].
        class: RetryClass,
        /// `error.to_string()` of the underlying [`KernelError`].
        message: String,
    },
}
192
193impl HistoryEntry {
194 /// Return the fingerprint of this entry's invocation.
195 pub fn fingerprint(&self) -> ToolCallFingerprint {
196 match self {
197 HistoryEntry::Completed { invocation, .. }
198 | HistoryEntry::Failed { invocation, .. } => invocation.fingerprint(),
199 }
200 }
201
202 /// Convenience: build a `Completed` entry from a [`ToolInvocationResult`].
203 pub fn completed(result: ToolInvocationResult) -> Self {
204 HistoryEntry::Completed {
205 invocation: result.invocation,
206 output: result.output,
207 }
208 }
209
210 /// Convenience: classify `error` with `classifier` and build a `Failed`
211 /// entry that records the verdict and the error rendering.
212 pub fn failed<C: RetryClassifier>(
213 invocation: ToolInvocation,
214 error: &KernelError,
215 classifier: &C,
216 ) -> Self {
217 HistoryEntry::Failed {
218 invocation,
219 class: classifier.classify(error),
220 message: error.to_string(),
221 }
222 }
223}
224
225/// Deterministically coalesce a tool-call history.
226///
227/// The repair rule is:
228///
229/// 1. Walk `entries` in order, grouping by [`ToolCallFingerprint`].
230/// 2. For each group, if **any** entry is [`HistoryEntry::Completed`],
231/// keep the **first** completion (idempotent: once we have a real
232/// answer, later retries don't change the story).
233/// 3. Otherwise keep the **last** [`HistoryEntry::Failed`] for that
234/// fingerprint (most recent verdict wins for terminal failures).
235/// 4. Emit results in **first-occurrence order** of each fingerprint.
236///
237/// The transform is total, deterministic, and idempotent
238/// (`repair_history(repair_history(x)) == repair_history(x)`).
239pub fn repair_history(entries: &[HistoryEntry]) -> Vec<HistoryEntry> {
240 // Track first-seen position per fingerprint so output preserves order,
241 // and the chosen entry index per fingerprint.
242 let mut order: Vec<ToolCallFingerprint> = Vec::new();
243 let mut chosen: std::collections::HashMap<ToolCallFingerprint, usize> =
244 std::collections::HashMap::new();
245 let mut has_completed: std::collections::HashSet<ToolCallFingerprint> =
246 std::collections::HashSet::new();
247
248 for (idx, entry) in entries.iter().enumerate() {
249 let fp = entry.fingerprint();
250 if let std::collections::hash_map::Entry::Vacant(slot) = chosen.entry(fp) {
251 order.push(fp);
252 slot.insert(idx);
253 if matches!(entry, HistoryEntry::Completed { .. }) {
254 has_completed.insert(fp);
255 }
256 continue;
257 }
258 match entry {
259 HistoryEntry::Completed { .. } => {
260 // Rule 2: keep the *first* completion. If we already have
261 // one chosen and it's a completion, skip. If the chosen
262 // one is a failure, replace it.
263 if !has_completed.contains(&fp) {
264 chosen.insert(fp, idx);
265 has_completed.insert(fp);
266 }
267 }
268 HistoryEntry::Failed { .. } => {
269 // Rule 3: last failure wins, but only if we don't already
270 // have a completion locked in.
271 if !has_completed.contains(&fp) {
272 chosen.insert(fp, idx);
273 }
274 }
275 }
276 }
277
278 order
279 .into_iter()
280 .filter_map(|fp| chosen.get(&fp).and_then(|&i| entries.get(i)).cloned())
281 .collect()
282}
283
284// ── Tests ────────────────────────────────────────────────────────────────────
285
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing
)]
mod tests {
    //! Unit tests for the classifier, fingerprints, and history repair.

    use super::*;
    use serde_json::json;

    /// Build a valid invocation or abort the test.
    fn inv(name: &str, args: Value) -> ToolInvocation {
        ToolInvocation::new(name, args).unwrap()
    }

    /// Build a `Failed` entry for a copy of `invocation`.
    fn failed(invocation: &ToolInvocation, class: RetryClass, message: &str) -> HistoryEntry {
        HistoryEntry::Failed {
            invocation: invocation.clone(),
            class,
            message: message.into(),
        }
    }

    /// Build a `Completed` entry for a copy of `invocation`.
    fn completed(invocation: &ToolInvocation, output: Value) -> HistoryEntry {
        HistoryEntry::Completed {
            invocation: invocation.clone(),
            output,
        }
    }

    // ── Classifier ───────────────────────────────────────────────────────

    #[test]
    fn default_classifier_marks_tool_failed_transient() {
        let classifier = DefaultRetryClassifier;
        let body_failures = [
            KernelError::ToolFailed("boom".into()),
            KernelError::SkillFailed("boom".into()),
        ];
        for failure in &body_failures {
            assert_eq!(classifier.classify(failure), RetryClass::Transient);
        }
    }

    #[test]
    fn default_classifier_marks_policy_errors_permanent() {
        let classifier = DefaultRetryClassifier;
        let policy_errors = [
            KernelError::ToolNotFound("x".into()),
            KernelError::ToolNotAuthorised("x".into()),
            KernelError::SkillNotFound("x".into()),
            KernelError::ToolNotApplicable("x".into()),
            KernelError::InvalidArgument("x".into()),
            KernelError::NormalizerFailed("x".into()),
            KernelError::ToolDispatchTerminated("x".into()),
            KernelError::BudgetFailed("x".into()),
        ];
        for err in policy_errors {
            let verdict = classifier.classify(&err);
            assert_eq!(verdict, RetryClass::Permanent, "{err:?}");
        }
    }

    // ── Fingerprints ────────────────────────────────────────────────────

    #[test]
    fn fingerprint_is_stable_for_same_invocation() {
        let build = || inv("search", json!({"q": "rig", "limit": 5}));
        assert_eq!(build().fingerprint(), build().fingerprint());
    }

    #[test]
    fn fingerprint_is_order_independent_for_object_args() {
        let q_first = inv("search", json!({"q": "rig", "limit": 5})).fingerprint();
        let limit_first = inv("search", json!({"limit": 5, "q": "rig"})).fingerprint();
        assert_eq!(q_first, limit_first);
    }

    #[test]
    fn fingerprint_differs_when_args_differ() {
        let rig = inv("search", json!({"q": "rig"})).fingerprint();
        let tokio = inv("search", json!({"q": "tokio"})).fingerprint();
        assert_ne!(rig, tokio);
    }

    #[test]
    fn fingerprint_differs_when_tool_name_differs() {
        let search = inv("search", json!({"q": "rig"})).fingerprint();
        let lookup = inv("lookup", json!({"q": "rig"})).fingerprint();
        assert_ne!(search, lookup);
    }

    // ── History repair ──────────────────────────────────────────────────

    #[test]
    fn repair_keeps_first_completion_after_retries() {
        let call = inv("search", json!({"q": "rig"}));
        let repaired = repair_history(&[
            failed(&call, RetryClass::Transient, "timeout"),
            completed(&call, json!({"hits": 1})),
            completed(&call, json!({"hits": 99})),
        ]);

        assert_eq!(repaired.len(), 1);
        match &repaired[0] {
            HistoryEntry::Completed { output, .. } => assert_eq!(output, &json!({"hits": 1})),
            other => panic!("expected Completed, got {other:?}"),
        }
    }

    #[test]
    fn repair_keeps_last_failure_when_no_completion() {
        let call = inv("search", json!({"q": "rig"}));
        let repaired = repair_history(&[
            failed(&call, RetryClass::Transient, "first"),
            failed(&call, RetryClass::Permanent, "last"),
        ]);

        assert_eq!(repaired.len(), 1);
        match &repaired[0] {
            HistoryEntry::Failed { message, class, .. } => {
                assert_eq!(message, "last");
                assert_eq!(*class, RetryClass::Permanent);
            }
            other => panic!("expected Failed, got {other:?}"),
        }
    }

    #[test]
    fn repair_preserves_first_occurrence_order_across_fingerprints() {
        let a = inv("a", json!({"k": 1}));
        let b = inv("b", json!({"k": 2}));
        let repaired = repair_history(&[
            completed(&a, json!(null)),
            completed(&b, json!(null)),
            completed(&a, json!("ignored")),
        ]);

        assert_eq!(repaired.len(), 2);
        // First entry is the first occurrence of `a`.
        assert_eq!(
            repaired[0].fingerprint(),
            inv("a", json!({"k": 1})).fingerprint()
        );
        assert_eq!(
            repaired[1].fingerprint(),
            inv("b", json!({"k": 2})).fingerprint()
        );
    }

    #[test]
    fn repair_is_idempotent() {
        let call = inv("search", json!({"q": "rig"}));
        let once = repair_history(&[
            failed(&call, RetryClass::Transient, "x"),
            completed(&call, json!({"ok": true})),
        ]);
        let twice = repair_history(&once);
        assert_eq!(once, twice);
    }

    #[test]
    fn repair_on_empty_history_returns_empty() {
        let repaired = repair_history(&[]);
        assert!(repaired.is_empty());
    }

    #[test]
    fn history_entry_failed_helper_records_classifier_verdict() {
        let error = KernelError::ToolFailed("flake".into());
        let entry = HistoryEntry::failed(
            inv("search", json!({"q": "rig"})),
            &error,
            &DefaultRetryClassifier,
        );
        match entry {
            HistoryEntry::Failed { class, message, .. } => {
                assert_eq!(class, RetryClass::Transient);
                assert!(message.contains("flake"));
            }
            other => panic!("expected Failed, got {other:?}"),
        }
    }
}
487}