1use serde::{Deserialize, Serialize};
2use serde_json::{Value, json};
3use time::OffsetDateTime;
4
5use crate::events::{ThreadId, TurnId};
6
/// Error classification carried by the reliability records in this module.
///
/// Variants are serialized with `snake_case` names (for example
/// `ProviderError` becomes `provider_error`). The derived `PartialOrd`/`Ord`
/// follow declaration order, so reordering variants changes comparisons.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[serde(rename_all = "snake_case")]
pub enum ReliabilityErrorClass {
    /// Serialized as `invalid_arguments`.
    InvalidArguments,
    /// Serialized as `unexpected_environment`.
    UnexpectedEnvironment,
    /// Serialized as `provider_error`.
    ProviderError,
    /// Serialized as `timeout`.
    Timeout,
    /// Serialized as `policy_denied`.
    PolicyDenied,
    /// Serialized as `user_aborted`.
    UserAborted,
    /// Serialized as `verifier_failed`.
    VerifierFailed,
    /// Serialized as `unknown`.
    Unknown,
}
19
/// Decision stored in the `decision` field of [`ReliabilityRetryRecorded`].
///
/// Variants are serialized with `snake_case` names.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ReliabilityRetryDecision {
    /// Serialized as `retry`.
    Retry,
    /// Serialized as `do_not_retry`.
    DoNotRetry,
    /// Serialized as `exhausted`.
    Exhausted,
}
27
/// Decision stored in the `decision` field of [`ReliabilityLimitRecorded`].
///
/// Variants are serialized with `snake_case` names.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ReliabilityLimitDecision {
    /// Serialized as `continue`.
    Continue,
    /// Serialized as `stop_turn`.
    StopTurn,
    /// Serialized as `request_continuation`.
    RequestContinuation,
}
35
/// Identifies which limit a [`ReliabilityLimitRecorded`] refers to.
///
/// Variants are serialized with `snake_case` names.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ReliabilityLimitKind {
    /// Serialized as `consecutive_tool_failures`.
    ConsecutiveToolFailures,
    /// Serialized as `tool_failures_per_turn`.
    ToolFailuresPerTurn,
    /// Serialized as `model_calls_per_turn`.
    ModelCallsPerTurn,
    /// Serialized as `provider_attempts`.
    ProviderAttempts,
}
44
/// Free-form detail text attached to a reliability record.
///
/// Field names are serialized in `camelCase`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityDetails {
    /// Detail text.
    pub message: String,
    /// Flag set to `true` by [`ReliabilityDetails::redacted`]; deserializes as
    /// `false` when the field is absent from the input.
    #[serde(default)]
    pub redacted: bool,
}
52
53impl ReliabilityDetails {
54 pub fn redacted(message: impl AsRef<str>) -> Self {
55 Self {
56 message: redact_secret_like_text(message.as_ref()),
57 redacted: true,
58 }
59 }
60}
61
/// Identifiers describing where a reliability record originated.
///
/// Field names are serialized in `camelCase` (`threadId`, `turnId`, `toolId`,
/// `toolName`, ...). Each optional field is omitted from the output when it is
/// `None` and deserializes as `None` when absent.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityContext {
    /// Thread identifier; always serialized.
    pub thread_id: ThreadId,
    /// Turn identifier; always serialized.
    pub turn_id: TurnId,
    /// Optional tool identifier (the tests in this file use a tool-call id).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tool_id: Option<String>,
    /// Optional tool name.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tool_name: Option<String>,
    /// Optional provider name.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub provider: Option<String>,
    /// Optional model name.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
}
76
/// Record of a single classified failure.
///
/// Field names are serialized in `camelCase`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityFailureRecorded {
    /// Where the failure originated.
    pub context: ReliabilityContext,
    /// Classification of the failure.
    pub error_class: ReliabilityErrorClass,
    /// Detail text for the failure.
    pub details: ReliabilityDetails,
    /// Record timestamp, serialized as an RFC 3339 string.
    #[serde(with = "time::serde::rfc3339")]
    pub timestamp: OffsetDateTime,
}
86
/// Record of a retry decision.
///
/// Field names are serialized in `camelCase` (`errorClass`, `maxAttempts`,
/// `delayMs`, ...).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityRetryRecorded {
    /// Where the retried operation originated.
    pub context: ReliabilityContext,
    /// Classification of the error that triggered the decision.
    pub error_class: ReliabilityErrorClass,
    /// The retry decision that was taken.
    pub decision: ReliabilityRetryDecision,
    /// Attempt number this record refers to.
    // NOTE(review): appears to be 1-based, as in `provider_retry_delay_ms` — confirm with the producer.
    pub attempt: u32,
    /// Attempt budget recorded alongside `attempt`.
    pub max_attempts: u32,
    /// Optional delay in milliseconds; omitted from the output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub delay_ms: Option<u64>,
    /// Detail text for the record.
    pub details: ReliabilityDetails,
    /// Record timestamp, serialized as an RFC 3339 string.
    #[serde(with = "time::serde::rfc3339")]
    pub timestamp: OffsetDateTime,
}
101
/// Record of a limit check and the decision taken for it.
///
/// Field names are serialized in `camelCase` (`errorClass`, `limitKind`, ...).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityLimitRecorded {
    /// Where the limit was evaluated.
    pub context: ReliabilityContext,
    /// Classification of the associated error.
    pub error_class: ReliabilityErrorClass,
    /// Which limit this record refers to.
    pub limit_kind: ReliabilityLimitKind,
    /// Decision taken for the limit.
    pub decision: ReliabilityLimitDecision,
    /// Counter value recorded for the limit.
    pub current: u32,
    /// Configured limit value recorded next to `current`.
    pub limit: u32,
    /// Detail text for the record.
    pub details: ReliabilityDetails,
    /// Record timestamp, serialized as an RFC 3339 string.
    #[serde(with = "time::serde::rfc3339")]
    pub timestamp: OffsetDateTime,
}
115
/// Record of a named numeric metric sample.
///
/// Field names are serialized in `camelCase`. Only `PartialEq` is derived
/// (not `Eq`) because `value` is an `f64`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityMetricRecorded {
    /// Where the metric was recorded.
    pub context: ReliabilityContext,
    /// Metric name (the tests in this file use `timeout_count`).
    pub metric: String,
    /// Metric value.
    pub value: f64,
    /// Optional error classification; omitted from the output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub error_class: Option<ReliabilityErrorClass>,
    /// Record timestamp, serialized as an RFC 3339 string.
    #[serde(with = "time::serde::rfc3339")]
    pub timestamp: OffsetDateTime,
}
127
/// Retry policy settings for provider requests.
///
/// Field names are serialized in `camelCase`
/// (`providerRetryMaxAttempts`, `providerRetryInitialBackoffMs`, ...).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityRequestPolicy {
    /// Attempt budget for provider retries.
    // NOTE(review): not read anywhere in this file — confirm whether the
    // initial attempt counts towards this number.
    pub provider_retry_max_attempts: u32,
    /// Base delay in milliseconds used by `provider_retry_delay_ms`.
    pub provider_retry_initial_backoff_ms: u64,
    /// Multiplier applied per attempt by `provider_retry_delay_ms`, which
    /// clamps it to at least 1.
    pub provider_retry_backoff_factor: u32,
    // NOTE(review): not read in this file; presumably enables retrying when the
    // provider returns an empty body — confirm against the caller.
    pub retry_empty_provider_body: bool,
    /// Status codes associated with provider retries. Omitted from the output
    /// when empty and deserialized as an empty list when absent.
    // NOTE(review): presumably HTTP status codes that trigger a retry — confirm.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub provider_retry_status_codes: Vec<u16>,
}
138
139impl Default for ReliabilityRequestPolicy {
140 fn default() -> Self {
141 Self {
142 provider_retry_max_attempts: 3,
143 provider_retry_initial_backoff_ms: 1_000,
144 provider_retry_backoff_factor: 2,
145 retry_empty_provider_body: true,
146 provider_retry_status_codes: vec![429, 500, 502, 503, 504],
147 }
148 }
149}
150
151pub fn provider_retry_delay_ms(policy: &ReliabilityRequestPolicy, attempt: u32) -> u64 {
152 let factor = policy.provider_retry_backoff_factor.max(1) as u64;
153 policy
154 .provider_retry_initial_backoff_ms
155 .saturating_mul(factor.saturating_pow(attempt.saturating_sub(1)))
156}
157
/// Builds the retry cause label for a status code, e.g. `429` becomes
/// `"status_429"`.
pub fn provider_retry_status_cause(status: u16) -> String {
    let mut cause = String::from("status_");
    cause.push_str(&status.to_string());
    cause
}
161
162pub fn provider_retry_metadata(
163 attempt: u32,
164 cause: &str,
165 policy: &ReliabilityRequestPolicy,
166) -> Value {
167 json!({
168 "kind": "reliability_retry_attempt",
169 "errorClass": ReliabilityErrorClass::ProviderError,
170 "decision": ReliabilityRetryDecision::Retry,
171 "attempt": attempt,
172 "delayMs": provider_retry_delay_ms(policy, attempt),
173 "cause": cause,
174 })
175}
176
/// Replaces whitespace-separated tokens that look like credentials with
/// `[redacted]`.
///
/// The input is split on whitespace and re-joined with single spaces, so runs
/// of whitespace (including newlines) are collapsed. Matching is ASCII
/// case-insensitive and ignores leading punctuation such as quotes or
/// brackets. A token is redacted when it
///
/// * starts with `sk-`, `bearer`, or `authorization:`;
/// * contains `api_key=`, `apikey=`, or `token=`; or
/// * was announced by a preceding marker: the token after `bearer`, the two
///   tokens (scheme and credential) after a bare `authorization:`, or the one
///   token after `authorization:<scheme>`.
///
/// The last rule keeps credentials without a recognizable prefix (JWTs, Basic
/// auth blobs) out of the output. It deliberately errs on the side of
/// over-redaction: an ordinary word following one of these markers is masked
/// as well. This is a best-effort heuristic, not a guarantee that every secret
/// is removed.
fn redact_secret_like_text(input: &str) -> String {
    const PLACEHOLDER: &str = "[redacted]";

    // Number of upcoming tokens to mask because an earlier marker announced them.
    let mut pending = 0usize;

    input
        .split_whitespace()
        .map(|part| {
            // Strip leading quotes/brackets so `"sk-..."` or `(Bearer` still match.
            let lower = part
                .trim_start_matches(|c: char| !c.is_ascii_alphanumeric())
                .to_ascii_lowercase();

            let announced = pending > 0;
            pending = pending.saturating_sub(1);

            let authorization_rest = lower.strip_prefix("authorization:");
            let is_bearer = lower.starts_with("bearer");
            match authorization_rest {
                // Bare header name: the scheme and the credential follow.
                Some("") => pending = pending.max(2),
                // Scheme glued to the header name: only the credential follows.
                Some(_) => pending = pending.max(1),
                None if is_bearer => pending = pending.max(1),
                None => {}
            }

            let looks_secret = authorization_rest.is_some()
                || is_bearer
                || lower.starts_with("sk-")
                || lower.contains("api_key=")
                || lower.contains("apikey=")
                || lower.contains("token=");

            if announced || looks_secret {
                PLACEHOLDER
            } else {
                part
            }
        })
        .collect::<Vec<_>>()
        .join(" ")
}
197
#[cfg(test)]
mod tests {
    use super::*;
    use crate::events::{EventSource, RoderEvent};

    /// Fixed instant shared by every fixture (Unix timestamp 1_700_000_000).
    fn timestamp() -> OffsetDateTime {
        OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap()
    }

    /// Fully populated context shared by every fixture.
    fn context() -> ReliabilityContext {
        ReliabilityContext {
            thread_id: "thread-a".to_owned(),
            turn_id: "turn-a".to_owned(),
            tool_id: Some("tool-call-1".to_owned()),
            tool_name: Some("read_file".to_owned()),
            provider: Some("openai".to_owned()),
            model: Some("gpt-5.5".to_owned()),
        }
    }

    #[test]
    fn reliability_provider_retry_fixture_serializes_redacted_context() {
        let details =
            ReliabilityDetails::redacted("provider 429 Authorization: Bearer sk-secret-token");
        let record = ReliabilityRetryRecorded {
            context: context(),
            error_class: ReliabilityErrorClass::ProviderError,
            decision: ReliabilityRetryDecision::Retry,
            attempt: 1,
            max_attempts: 3,
            delay_ms: Some(1_000),
            details,
            timestamp: timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        assert_eq!(encoded["errorClass"], "provider_error");
        assert_eq!(encoded["decision"], "retry");

        let encoded_context = &encoded["context"];
        assert_eq!(encoded_context["threadId"], "thread-a");
        assert_eq!(encoded_context["turnId"], "turn-a");
        assert_eq!(encoded_context["provider"], "openai");
        assert_eq!(encoded_context["model"], "gpt-5.5");

        // The raw secret must not survive anywhere in the rendered payload.
        let rendered = serde_json::to_string(&encoded).unwrap();
        assert!(!rendered.contains("sk-secret-token"));
    }

    #[test]
    fn provider_retry_metadata_is_classified_and_redacted() {
        let policy = ReliabilityRequestPolicy {
            provider_retry_initial_backoff_ms: 250,
            provider_retry_backoff_factor: 3,
            ..ReliabilityRequestPolicy::default()
        };
        let cause = provider_retry_status_cause(429);

        let metadata = provider_retry_metadata(2, &cause, &policy);

        assert_eq!(metadata["kind"], "reliability_retry_attempt");
        assert_eq!(metadata["errorClass"], "provider_error");
        assert_eq!(metadata["decision"], "retry");
        assert_eq!(metadata["attempt"], 2);
        // 250 ms * 3^(2 - 1)
        assert_eq!(metadata["delayMs"], 750);
        assert_eq!(metadata["cause"], "status_429");
    }

    #[test]
    fn reliability_tool_validation_failure_fixture_serializes() {
        let record = ReliabilityFailureRecorded {
            context: context(),
            error_class: ReliabilityErrorClass::InvalidArguments,
            details: ReliabilityDetails::redacted("missing required field path"),
            timestamp: timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        let decoded: ReliabilityFailureRecorded = serde_json::from_value(encoded).unwrap();

        assert_eq!(decoded.error_class, ReliabilityErrorClass::InvalidArguments);
        assert_eq!(decoded.context.tool_name.as_deref(), Some("read_file"));
    }

    #[test]
    fn reliability_failure_limit_stop_fixture_serializes() {
        let record = ReliabilityLimitRecorded {
            context: context(),
            error_class: ReliabilityErrorClass::InvalidArguments,
            limit_kind: ReliabilityLimitKind::ConsecutiveToolFailures,
            decision: ReliabilityLimitDecision::StopTurn,
            current: 5,
            limit: 5,
            details: ReliabilityDetails::redacted("tool failure limit reached"),
            timestamp: timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        assert_eq!(encoded["limitKind"], "consecutive_tool_failures");
        assert_eq!(encoded["decision"], "stop_turn");
    }

    #[test]
    fn reliability_timeout_fixture_serializes() {
        let record = ReliabilityMetricRecorded {
            context: context(),
            metric: "timeout_count".to_owned(),
            value: 1.0,
            error_class: Some(ReliabilityErrorClass::Timeout),
            timestamp: timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        assert_eq!(encoded["metric"], "timeout_count");
        assert_eq!(encoded["errorClass"], "timeout");
    }

    #[test]
    fn reliability_unknown_error_fixture_serializes() {
        let toolless_context = ReliabilityContext {
            tool_id: None,
            tool_name: None,
            ..context()
        };
        let record = ReliabilityFailureRecorded {
            context: toolless_context,
            error_class: ReliabilityErrorClass::Unknown,
            details: ReliabilityDetails::redacted("panic converted into unknown harness error"),
            timestamp: timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        assert_eq!(encoded["errorClass"], "unknown");

        let encoded_context = &encoded["context"];
        // `None` fields are skipped entirely rather than emitted as null.
        assert!(encoded_context.get("toolId").is_none());
        assert_eq!(encoded_context["threadId"], "thread-a");
        assert_eq!(encoded_context["turnId"], "turn-a");
    }

    #[test]
    fn reliability_events_expose_kind_source_and_turn_scope() {
        let record = ReliabilityLimitRecorded {
            context: context(),
            error_class: ReliabilityErrorClass::InvalidArguments,
            limit_kind: ReliabilityLimitKind::ToolFailuresPerTurn,
            decision: ReliabilityLimitDecision::RequestContinuation,
            current: 10,
            limit: 10,
            details: ReliabilityDetails::redacted("tool failures per turn reached"),
            timestamp: timestamp(),
        };
        let event = RoderEvent::ReliabilityLimitRecorded(record);

        assert_eq!(event.kind(), "reliability.limit");
        assert_eq!(event.source(), EventSource::Core);
        assert_eq!(event.thread_id().map(String::as_str), Some("thread-a"));
        assert_eq!(event.turn_id().map(String::as_str), Some("turn-a"));
    }
}