zag_agent/usage_limits.rs
1//! Shared types and helpers for provider usage-limit detection and auto-resume.
2//!
3//! When a provider's CLI hits an upstream rate / usage / weekly limit, zag wants
4//! to (a) detect the limit, (b) compute when it resets, (c) schedule a resume
5//! attempt at that moment, and (d) record the lifecycle in the session log via
6//! [`crate::session_log::LogEventKind::UsageLimitHit`] / `UsageLimitResumed` /
7//! `UsageLimitResumeFailed`.
8//!
9//! Each provider has its own detector module (e.g.
10//! `providers/claude/usage_limits.rs`). All detectors return [`UsageLimit`] so
11//! the scheduler in `zag-orch` can treat the four providers uniformly.
12
13use chrono::{DateTime, Duration, Utc};
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16
17/// A successful detection of an upstream usage / rate limit.
18#[derive(Debug, Clone)]
19pub struct UsageLimit {
20 pub provider: &'static str,
21 pub scope: UsageLimitScope,
22 /// When usage resets, if the provider gave us something parseable.
23 /// `None` means "we don't know" — `compute_resume_at` will fall back to
24 /// `default_fallback_secs`.
25 pub reset_at: Option<DateTime<Utc>>,
26 /// The exact substring or JSON snippet that matched. Recorded into the
27 /// session log so future maintainers can see why detection fired even
28 /// after the upstream format has drifted.
29 pub raw: String,
30}
31
32#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
33#[serde(rename_all = "snake_case")]
34pub enum UsageLimitScope {
35 /// Single-turn / short window (e.g. Copilot `rate_limited`).
36 Session,
37 /// Weekly cap (Claude `weekly`, Copilot `user_weekly_rate_limited`).
38 Weekly,
39 /// Account-wide global cap.
40 Global,
41 /// Per-day quota (Gemini `*PerDay`).
42 Daily,
43 /// Provider didn't surface enough info to classify.
44 Unknown,
45}
46
47impl UsageLimitScope {
48 pub fn as_str(self) -> &'static str {
49 match self {
50 UsageLimitScope::Session => "session",
51 UsageLimitScope::Weekly => "weekly",
52 UsageLimitScope::Global => "global",
53 UsageLimitScope::Daily => "daily",
54 UsageLimitScope::Unknown => "unknown",
55 }
56 }
57}
58
59/// Top-level `[usage_limits]` config block. Loaded from `zag.toml`.
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct UsageLimitConfig {
62 /// Master switch. Detection always runs; this gates auto-resume scheduling.
63 #[serde(default = "default_true")]
64 pub enabled: bool,
65 /// Message injected into the session when the timer fires. Default `"Continue"`.
66 #[serde(default = "default_resume_message")]
67 pub resume_message: String,
68 /// Hard cap on how long a single wait can be. Default 24h.
69 #[serde(default = "default_max_wait_secs")]
70 pub max_wait_secs: u64,
71 /// Used when the provider didn't tell us a reset time. Default 1h.
72 /// On self-retrigger (resume failed because limit still active), the cycle
73 /// just runs again — eventually the window passes.
74 #[serde(default = "default_fallback_secs")]
75 pub default_fallback_secs: u64,
76 /// Jitter added on top of the computed reset time, to spread retries.
77 /// Default 30s.
78 #[serde(default = "default_jitter_secs")]
79 pub jitter_secs: u64,
80 /// Maximum auto-resume attempts within a single foreground `zag exec` or
81 /// `zag spawn` invocation. Default 12 — with the default 1h fallback this
82 /// caps a stuck batch at ~12h. Set to 0 to disable the cap.
83 #[serde(default = "default_max_attempts")]
84 pub max_attempts: u32,
85 /// Per-provider overrides keyed by provider name.
86 #[serde(default)]
87 pub providers: HashMap<String, UsageLimitProviderOverride>,
88}
89
90impl Default for UsageLimitConfig {
91 fn default() -> Self {
92 Self {
93 enabled: true,
94 resume_message: default_resume_message(),
95 max_wait_secs: default_max_wait_secs(),
96 default_fallback_secs: default_fallback_secs(),
97 jitter_secs: default_jitter_secs(),
98 max_attempts: default_max_attempts(),
99 providers: HashMap::new(),
100 }
101 }
102}
103
104/// Per-provider override. Any unset field falls back to the top-level value.
105#[derive(Debug, Clone, Default, Serialize, Deserialize)]
106pub struct UsageLimitProviderOverride {
107 pub enabled: Option<bool>,
108 pub resume_message: Option<String>,
109 pub fallback_secs: Option<u64>,
110 /// User-supplied regex sources OR'd into the provider's default patterns.
111 /// Provider detectors compile these once via `OnceLock`.
112 #[serde(default)]
113 pub extra_patterns: Vec<String>,
114}
115
/// Serde default for `UsageLimitConfig::enabled`: on unless configured off.
fn default_true() -> bool {
    true
}
/// Serde default for `UsageLimitConfig::resume_message`.
fn default_resume_message() -> String {
    String::from("Continue")
}
/// Serde default for `UsageLimitConfig::max_wait_secs`: 24 hours.
fn default_max_wait_secs() -> u64 {
    24 * 60 * 60
}
/// Serde default for `UsageLimitConfig::default_fallback_secs`: 1 hour.
fn default_fallback_secs() -> u64 {
    60 * 60
}
/// Serde default for `UsageLimitConfig::jitter_secs`: 30 seconds.
fn default_jitter_secs() -> u64 {
    30
}
/// Serde default for `UsageLimitConfig::max_attempts`: 12 attempts.
fn default_max_attempts() -> u32 {
    12
}
134
135impl UsageLimitConfig {
136 /// True if auto-resume should be attempted for `provider`.
137 pub fn enabled_for(&self, provider: &str) -> bool {
138 if !self.enabled {
139 return false;
140 }
141 self.providers
142 .get(provider)
143 .and_then(|o| o.enabled)
144 .unwrap_or(true)
145 }
146
147 /// Effective resume message for `provider`, honoring overrides.
148 pub fn resume_message_for(&self, provider: &str) -> &str {
149 self.providers
150 .get(provider)
151 .and_then(|o| o.resume_message.as_deref())
152 .unwrap_or(&self.resume_message)
153 }
154
155 /// Effective fallback duration (seconds) for `provider`.
156 pub fn fallback_secs_for(&self, provider: &str) -> u64 {
157 self.providers
158 .get(provider)
159 .and_then(|o| o.fallback_secs)
160 .unwrap_or(self.default_fallback_secs)
161 }
162
163 /// User-supplied additional patterns for `provider`, empty slice if none.
164 pub fn extra_patterns_for(&self, provider: &str) -> &[String] {
165 self.providers
166 .get(provider)
167 .map(|o| o.extra_patterns.as_slice())
168 .unwrap_or(&[])
169 }
170}
171
172/// Compute the moment zag should attempt to resume.
173///
174/// Returns `(scheduled_at, fallback_used)`. `fallback_used` is true when
175/// `hit.reset_at` was `None` and we substituted `fallback_secs`. The result is
176/// always clamped to `now + max_wait_secs` so a malformed epoch can never
177/// pin a wait into next century.
178pub fn compute_resume_at(hit: &UsageLimit, cfg: &UsageLimitConfig) -> (DateTime<Utc>, bool) {
179 let now = Utc::now();
180 let max_wait = Duration::seconds(cfg.max_wait_secs as i64);
181 let jitter = Duration::seconds(cfg.jitter_secs as i64);
182
183 let (target, fallback_used) = match hit.reset_at {
184 Some(t) => (t, false),
185 None => {
186 let fb = cfg.fallback_secs_for(hit.provider) as i64;
187 (now + Duration::seconds(fb), true)
188 }
189 };
190
191 // If the reset is in the past, clamp to "now + jitter" — gives the upstream
192 // a beat to settle before we retry.
193 let after_clamp = if target < now {
194 now + jitter
195 } else {
196 target + jitter
197 };
198
199 let capped = if after_clamp > now + max_wait {
200 now + max_wait
201 } else {
202 after_clamp
203 };
204
205 (capped, fallback_used)
206}
207
208/// Build a [`crate::session_log::LogEventKind::UsageLimitHit`] from a detected
209/// `UsageLimit`. Single source of truth for the scheduled-resume path — every
210/// site that knows a resume timer is going to fire (relay, foreground
211/// auto-resume loop) calls this so the wire shape can never drift.
212///
213/// `scheduled_resume_at` and `fallback_used` come from [`compute_resume_at`].
214/// `incident_id` is provided by the caller so it can be stitched into the
215/// matching `UsageLimitResumed` / `UsageLimitResumeFailed` events.
216pub fn log_event_hit(
217 hit: &UsageLimit,
218 incident_id: &str,
219 scheduled_resume_at: Option<DateTime<Utc>>,
220 fallback_used: bool,
221) -> crate::session_log::LogEventKind {
222 crate::session_log::LogEventKind::UsageLimitHit {
223 provider: hit.provider.to_string(),
224 scope: hit.scope.as_str().to_string(),
225 reset_at: hit.reset_at.map(|t| t.to_rfc3339()),
226 scheduled_resume_at: scheduled_resume_at.map(|t| t.to_rfc3339()),
227 fallback_used,
228 incident_id: incident_id.to_string(),
229 raw: Some(hit.raw.clone()),
230 }
231}
232
233/// Build a `UsageLimitHit` log event for orphan log-only detections (e.g. the
234/// Codex TUI line parser) where no auto-resume scheduler is involved.
235/// Generates a fresh incident id; scheduling fields are left empty because
236/// nothing is scheduled.
237pub fn to_log_event_hit(hit: UsageLimit) -> crate::session_log::LogEventKind {
238 log_event_hit(&hit, &uuid::Uuid::new_v4().to_string(), None, false)
239}
240
241#[cfg(test)]
242#[path = "usage_limits_tests.rs"]
243mod tests;