Skip to main content

roder_api/
reliability.rs

1use serde::{Deserialize, Serialize};
2use serde_json::{Value, json};
3use time::OffsetDateTime;
4
5use crate::events::{ThreadId, TurnId};
6
7#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
8#[serde(rename_all = "snake_case")]
9pub enum ReliabilityErrorClass {
10    InvalidArguments,
11    UnexpectedEnvironment,
12    ProviderError,
13    Timeout,
14    PolicyDenied,
15    UserAborted,
16    VerifierFailed,
17    Unknown,
18}
19
20#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
21#[serde(rename_all = "snake_case")]
22pub enum ReliabilityRetryDecision {
23    Retry,
24    DoNotRetry,
25    Exhausted,
26}
27
28#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
29#[serde(rename_all = "snake_case")]
30pub enum ReliabilityLimitDecision {
31    Continue,
32    StopTurn,
33    RequestContinuation,
34}
35
36#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
37#[serde(rename_all = "snake_case")]
38pub enum ReliabilityLimitKind {
39    ConsecutiveToolFailures,
40    ToolFailuresPerTurn,
41    ModelCallsPerTurn,
42    ProviderAttempts,
43}
44
45#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
46#[serde(rename_all = "camelCase")]
47pub struct ReliabilityDetails {
48    pub message: String,
49    #[serde(default)]
50    pub redacted: bool,
51}
52
53impl ReliabilityDetails {
54    pub fn redacted(message: impl AsRef<str>) -> Self {
55        Self {
56            message: redact_secret_like_text(message.as_ref()),
57            redacted: true,
58        }
59    }
60}
61
62#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
63#[serde(rename_all = "camelCase")]
64pub struct ReliabilityContext {
65    pub thread_id: ThreadId,
66    pub turn_id: TurnId,
67    #[serde(default, skip_serializing_if = "Option::is_none")]
68    pub tool_id: Option<String>,
69    #[serde(default, skip_serializing_if = "Option::is_none")]
70    pub tool_name: Option<String>,
71    #[serde(default, skip_serializing_if = "Option::is_none")]
72    pub provider: Option<String>,
73    #[serde(default, skip_serializing_if = "Option::is_none")]
74    pub model: Option<String>,
75}
76
77#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
78#[serde(rename_all = "camelCase")]
79pub struct ReliabilityFailureRecorded {
80    pub context: ReliabilityContext,
81    pub error_class: ReliabilityErrorClass,
82    pub details: ReliabilityDetails,
83    #[serde(with = "time::serde::rfc3339")]
84    pub timestamp: OffsetDateTime,
85}
86
87#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
88#[serde(rename_all = "camelCase")]
89pub struct ReliabilityRetryRecorded {
90    pub context: ReliabilityContext,
91    pub error_class: ReliabilityErrorClass,
92    pub decision: ReliabilityRetryDecision,
93    pub attempt: u32,
94    pub max_attempts: u32,
95    #[serde(default, skip_serializing_if = "Option::is_none")]
96    pub delay_ms: Option<u64>,
97    pub details: ReliabilityDetails,
98    #[serde(with = "time::serde::rfc3339")]
99    pub timestamp: OffsetDateTime,
100}
101
102#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
103#[serde(rename_all = "camelCase")]
104pub struct ReliabilityLimitRecorded {
105    pub context: ReliabilityContext,
106    pub error_class: ReliabilityErrorClass,
107    pub limit_kind: ReliabilityLimitKind,
108    pub decision: ReliabilityLimitDecision,
109    pub current: u32,
110    pub limit: u32,
111    pub details: ReliabilityDetails,
112    #[serde(with = "time::serde::rfc3339")]
113    pub timestamp: OffsetDateTime,
114}
115
116#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
117#[serde(rename_all = "camelCase")]
118pub struct ReliabilityMetricRecorded {
119    pub context: ReliabilityContext,
120    pub metric: String,
121    pub value: f64,
122    #[serde(default, skip_serializing_if = "Option::is_none")]
123    pub error_class: Option<ReliabilityErrorClass>,
124    #[serde(with = "time::serde::rfc3339")]
125    pub timestamp: OffsetDateTime,
126}
127
128#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
129#[serde(rename_all = "camelCase")]
130pub struct ReliabilityRequestPolicy {
131    pub provider_retry_max_attempts: u32,
132    pub provider_retry_initial_backoff_ms: u64,
133    pub provider_retry_backoff_factor: u32,
134    pub retry_empty_provider_body: bool,
135    #[serde(default, skip_serializing_if = "Vec::is_empty")]
136    pub provider_retry_status_codes: Vec<u16>,
137}
138
139impl Default for ReliabilityRequestPolicy {
140    fn default() -> Self {
141        Self {
142            provider_retry_max_attempts: 3,
143            provider_retry_initial_backoff_ms: 1_000,
144            provider_retry_backoff_factor: 2,
145            retry_empty_provider_body: true,
146            provider_retry_status_codes: vec![429, 500, 502, 503, 504],
147        }
148    }
149}
150
151pub fn provider_retry_delay_ms(policy: &ReliabilityRequestPolicy, attempt: u32) -> u64 {
152    let factor = policy.provider_retry_backoff_factor.max(1) as u64;
153    policy
154        .provider_retry_initial_backoff_ms
155        .saturating_mul(factor.saturating_pow(attempt.saturating_sub(1)))
156}
157
/// Formats a status code as a retry cause label, e.g. `429` becomes `status_429`.
pub fn provider_retry_status_cause(status: u16) -> String {
    let mut cause = String::from("status_");
    cause.push_str(&status.to_string());
    cause
}
161
162pub fn provider_retry_metadata(
163    attempt: u32,
164    cause: &str,
165    policy: &ReliabilityRequestPolicy,
166) -> Value {
167    json!({
168        "kind": "reliability_retry_attempt",
169        "errorClass": ReliabilityErrorClass::ProviderError,
170        "decision": ReliabilityRetryDecision::Retry,
171        "attempt": attempt,
172        "delayMs": provider_retry_delay_ms(policy, attempt),
173        "cause": cause,
174    })
175}
176
/// Replaces whitespace-separated tokens that look like credentials with
/// `[redacted]`.
///
/// A token is replaced when, compared ASCII case-insensitively, it
/// - starts with `sk-`, `bearer`, or `authorization:`;
/// - contains `api_key=`, `apikey=`, or `token=`; or
/// - follows an auth marker: the two tokens after `authorization:` (scheme and
///   credentials) or the one token after `bearer`.
///
/// The last rule keeps a credential from leaking when it does not itself match
/// a known pattern, as in `Authorization: Bearer abc123` or
/// `Authorization: Basic <base64>`. It deliberately errs toward over-redaction:
/// an ordinary word that directly follows a marker is replaced as well.
///
/// Tokens are re-joined with single spaces, so runs of whitespace (including
/// newlines) collapse to one space.
fn redact_secret_like_text(input: &str) -> String {
    const PLACEHOLDER: &str = "[redacted]";

    // Number of upcoming tokens that must be redacted because of an earlier marker.
    let mut pending: u8 = 0;
    let mut scrubbed = Vec::new();

    for part in input.split_whitespace() {
        let lower = part.to_ascii_lowercase();

        // How many tokens after this one belong to the credential it introduces.
        let follow_up: u8 = if lower.starts_with("authorization:") {
            2
        } else if lower.starts_with("bearer") {
            1
        } else {
            0
        };
        let looks_secret = lower.starts_with("sk-")
            || lower.contains("api_key=")
            || lower.contains("apikey=")
            || lower.contains("token=");

        let redact = looks_secret || follow_up > 0 || pending > 0;
        // Consume one pending slot, then extend the window if this token is a marker.
        pending = pending.saturating_sub(1).max(follow_up);

        scrubbed.push(if redact { PLACEHOLDER } else { part });
    }

    scrubbed.join(" ")
}
197
#[cfg(test)]
mod tests {
    use super::*;
    use crate::events::{EventSource, RoderEvent};

    /// Fully populated context shared by the fixtures in this module.
    fn sample_context() -> ReliabilityContext {
        ReliabilityContext {
            thread_id: "thread-a".to_owned(),
            turn_id: "turn-a".to_owned(),
            tool_id: Some("tool-call-1".to_owned()),
            tool_name: Some("read_file".to_owned()),
            provider: Some("openai".to_owned()),
            model: Some("gpt-5.5".to_owned()),
        }
    }

    /// Fixed instant so serialized fixtures are deterministic.
    fn fixed_timestamp() -> OffsetDateTime {
        OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap()
    }

    #[test]
    fn reliability_provider_retry_fixture_serializes_redacted_context() {
        let details =
            ReliabilityDetails::redacted("provider 429 Authorization: Bearer sk-secret-token");
        let record = ReliabilityRetryRecorded {
            context: sample_context(),
            error_class: ReliabilityErrorClass::ProviderError,
            decision: ReliabilityRetryDecision::Retry,
            attempt: 1,
            max_attempts: 3,
            delay_ms: Some(1_000),
            details,
            timestamp: fixed_timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        assert_eq!(encoded["errorClass"], "provider_error");
        assert_eq!(encoded["decision"], "retry");

        let encoded_context = &encoded["context"];
        assert_eq!(encoded_context["threadId"], "thread-a");
        assert_eq!(encoded_context["turnId"], "turn-a");
        assert_eq!(encoded_context["provider"], "openai");
        assert_eq!(encoded_context["model"], "gpt-5.5");

        // The raw secret must not survive anywhere in the rendered payload.
        let rendered = serde_json::to_string(&encoded).unwrap();
        assert!(!rendered.contains("sk-secret-token"));
    }

    #[test]
    fn provider_retry_metadata_is_classified_and_redacted() {
        let policy = ReliabilityRequestPolicy {
            provider_retry_initial_backoff_ms: 250,
            provider_retry_backoff_factor: 3,
            ..ReliabilityRequestPolicy::default()
        };
        let cause = provider_retry_status_cause(429);

        let metadata = provider_retry_metadata(2, &cause, &policy);

        assert_eq!(metadata["kind"], "reliability_retry_attempt");
        assert_eq!(metadata["errorClass"], "provider_error");
        assert_eq!(metadata["decision"], "retry");
        assert_eq!(metadata["attempt"], 2);
        // Second attempt: 250 ms * 3^(2 - 1).
        assert_eq!(metadata["delayMs"], 750);
        assert_eq!(metadata["cause"], "status_429");
    }

    #[test]
    fn reliability_tool_validation_failure_fixture_serializes() {
        let record = ReliabilityFailureRecorded {
            context: sample_context(),
            error_class: ReliabilityErrorClass::InvalidArguments,
            details: ReliabilityDetails::redacted("missing required field path"),
            timestamp: fixed_timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        let decoded: ReliabilityFailureRecorded = serde_json::from_value(encoded).unwrap();

        assert_eq!(
            decoded.error_class,
            ReliabilityErrorClass::InvalidArguments
        );
        assert_eq!(decoded.context.tool_name.as_deref(), Some("read_file"));
    }

    #[test]
    fn reliability_failure_limit_stop_fixture_serializes() {
        let record = ReliabilityLimitRecorded {
            context: sample_context(),
            error_class: ReliabilityErrorClass::InvalidArguments,
            limit_kind: ReliabilityLimitKind::ConsecutiveToolFailures,
            decision: ReliabilityLimitDecision::StopTurn,
            current: 5,
            limit: 5,
            details: ReliabilityDetails::redacted("tool failure limit reached"),
            timestamp: fixed_timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        assert_eq!(encoded["limitKind"], "consecutive_tool_failures");
        assert_eq!(encoded["decision"], "stop_turn");
    }

    #[test]
    fn reliability_timeout_fixture_serializes() {
        let record = ReliabilityMetricRecorded {
            context: sample_context(),
            metric: "timeout_count".to_owned(),
            value: 1.0,
            error_class: Some(ReliabilityErrorClass::Timeout),
            timestamp: fixed_timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        assert_eq!(encoded["metric"], "timeout_count");
        assert_eq!(encoded["errorClass"], "timeout");
    }

    #[test]
    fn reliability_unknown_error_fixture_serializes() {
        // Drop the tool fields to check that `None` values are omitted.
        let context_without_tool = ReliabilityContext {
            tool_id: None,
            tool_name: None,
            ..sample_context()
        };
        let record = ReliabilityFailureRecorded {
            context: context_without_tool,
            error_class: ReliabilityErrorClass::Unknown,
            details: ReliabilityDetails::redacted("panic converted into unknown harness error"),
            timestamp: fixed_timestamp(),
        };

        let encoded = serde_json::to_value(&record).unwrap();
        assert_eq!(encoded["errorClass"], "unknown");

        let encoded_context = &encoded["context"];
        assert!(encoded_context.get("toolId").is_none());
        assert_eq!(encoded_context["threadId"], "thread-a");
        assert_eq!(encoded_context["turnId"], "turn-a");
    }

    #[test]
    fn reliability_events_expose_kind_source_and_turn_scope() {
        let record = ReliabilityLimitRecorded {
            context: sample_context(),
            error_class: ReliabilityErrorClass::InvalidArguments,
            limit_kind: ReliabilityLimitKind::ToolFailuresPerTurn,
            decision: ReliabilityLimitDecision::RequestContinuation,
            current: 10,
            limit: 10,
            details: ReliabilityDetails::redacted("tool failures per turn reached"),
            timestamp: fixed_timestamp(),
        };
        let event = RoderEvent::ReliabilityLimitRecorded(record);

        assert_eq!(event.kind(), "reliability.limit");
        assert_eq!(event.source(), EventSource::Core);
        assert_eq!(event.thread_id().map(String::as_str), Some("thread-a"));
        assert_eq!(event.turn_id().map(String::as_str), Some("turn-a"));
    }
}