Skip to main content

zag_agent/
usage_limits.rs

1//! Shared types and helpers for provider usage-limit detection and auto-resume.
2//!
3//! When a provider's CLI hits an upstream rate / usage / weekly limit, zag wants
4//! to (a) detect the limit, (b) compute when it resets, (c) schedule a resume
5//! attempt at that moment, and (d) record the lifecycle in the session log via
6//! [`crate::session_log::LogEventKind::UsageLimitHit`] / `UsageLimitResumed` /
7//! `UsageLimitResumeFailed`.
8//!
9//! Each provider has its own detector module (e.g.
10//! `providers/claude/usage_limits.rs`). All detectors return [`UsageLimit`] so
11//! the scheduler in `zag-orch` can treat the four providers uniformly.
12
13use chrono::{DateTime, Duration, Utc};
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16
/// A successful detection of an upstream usage / rate limit.
///
/// This is the common value every provider detector returns. It is the input
/// to [`compute_resume_at`] (scheduling) and [`log_event_hit`] (session log).
#[derive(Debug, Clone)]
pub struct UsageLimit {
    /// Name of the provider that reported the limit. Also used as the lookup
    /// key for per-provider overrides in [`UsageLimitConfig`].
    pub provider: &'static str,
    /// Which kind of limit window was hit.
    pub scope: UsageLimitScope,
    /// When usage resets, if the provider gave us something parseable.
    /// `None` means "we don't know" — `compute_resume_at` will fall back to
    /// [`UsageLimitConfig::fallback_secs_for`] for this provider.
    pub reset_at: Option<DateTime<Utc>>,
    /// The exact substring or JSON snippet that matched. Recorded into the
    /// session log so future maintainers can see why detection fired even
    /// after the upstream format has drifted.
    pub raw: String,
}
31
/// Classification of the window a detected limit applies to.
///
/// Serialized with snake_case variant names (`"session"`, `"weekly"`, ...),
/// which are the same strings [`UsageLimitScope::as_str`] returns.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum UsageLimitScope {
    /// Single-turn / short window (e.g. Copilot `rate_limited`).
    Session,
    /// Weekly cap (Claude `weekly`, Copilot `user_weekly_rate_limited`).
    Weekly,
    /// Account-wide global cap.
    Global,
    /// Per-day quota (Gemini `*PerDay`).
    Daily,
    /// Provider didn't surface enough info to classify.
    Unknown,
}
46
47impl UsageLimitScope {
48    pub fn as_str(self) -> &'static str {
49        match self {
50            UsageLimitScope::Session => "session",
51            UsageLimitScope::Weekly => "weekly",
52            UsageLimitScope::Global => "global",
53            UsageLimitScope::Daily => "daily",
54            UsageLimitScope::Unknown => "unknown",
55        }
56    }
57}
58
/// Top-level `[usage_limits]` config block. Loaded from `zag.toml`.
///
/// Every field has a serde default, so an empty or partial block
/// deserializes to the same values as [`UsageLimitConfig::default`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UsageLimitConfig {
    /// Master switch. Detection always runs; this gates auto-resume scheduling.
    /// When `false`, `enabled_for` returns `false` for every provider
    /// regardless of per-provider overrides. Default `true`.
    #[serde(default = "default_true")]
    pub enabled: bool,
    /// Message injected into the session when the timer fires. Default `"Continue"`.
    #[serde(default = "default_resume_message")]
    pub resume_message: String,
    /// Hard cap, in seconds, on how long a single wait can be. Default 24h
    /// (86 400 s). `compute_resume_at` never schedules later than
    /// `now + max_wait_secs`.
    #[serde(default = "default_max_wait_secs")]
    pub max_wait_secs: u64,
    /// Wait, in seconds, used when the provider didn't tell us a reset time.
    /// Default 1h (3 600 s).
    /// On self-retrigger (resume failed because limit still active), the cycle
    /// just runs again — eventually the window passes.
    #[serde(default = "default_fallback_secs")]
    pub default_fallback_secs: u64,
    /// Extra delay, in seconds, added on top of the computed reset time, to
    /// spread retries. Default 30s.
    /// NOTE(review): `compute_resume_at` adds this as a fixed offset, not a
    /// randomized one — confirm that is the intended meaning of "jitter".
    #[serde(default = "default_jitter_secs")]
    pub jitter_secs: u64,
    /// Maximum auto-resume attempts within a single foreground `zag exec` or
    /// `zag spawn` invocation. Default 12 — with the default 1h fallback this
    /// caps a stuck batch at ~12h. Set to 0 to disable the cap.
    #[serde(default = "default_max_attempts")]
    pub max_attempts: u32,
    /// Per-provider overrides keyed by provider name. Defaults to an empty map.
    #[serde(default)]
    pub providers: HashMap<String, UsageLimitProviderOverride>,
}
89
90impl Default for UsageLimitConfig {
91    fn default() -> Self {
92        Self {
93            enabled: true,
94            resume_message: default_resume_message(),
95            max_wait_secs: default_max_wait_secs(),
96            default_fallback_secs: default_fallback_secs(),
97            jitter_secs: default_jitter_secs(),
98            max_attempts: default_max_attempts(),
99            providers: HashMap::new(),
100        }
101    }
102}
103
/// Per-provider override. Any unset field falls back to the top-level value.
///
/// Stored in [`UsageLimitConfig::providers`], keyed by provider name.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct UsageLimitProviderOverride {
    /// Per-provider auto-resume switch. `None` means "enabled". The top-level
    /// `enabled = false` still wins over `Some(true)` here.
    pub enabled: Option<bool>,
    /// Replacement for the top-level `resume_message`; `None` inherits it.
    pub resume_message: Option<String>,
    /// Replacement for the top-level `default_fallback_secs`, in seconds;
    /// `None` inherits it.
    pub fallback_secs: Option<u64>,
    /// User-supplied regex sources OR'd into the provider's default patterns.
    /// Provider detectors compile these once via `OnceLock`.
    #[serde(default)]
    pub extra_patterns: Vec<String>,
}
115
/// Serde default for `enabled`: auto-resume is on unless configured otherwise.
fn default_true() -> bool {
    true
}

/// Serde default for `resume_message`.
fn default_resume_message() -> String {
    String::from("Continue")
}

/// Serde default for `max_wait_secs`: 24 hours, expressed in seconds.
fn default_max_wait_secs() -> u64 {
    24 * 60 * 60
}

/// Serde default for `default_fallback_secs`: 1 hour, expressed in seconds.
fn default_fallback_secs() -> u64 {
    60 * 60
}

/// Serde default for `jitter_secs`: 30 seconds.
fn default_jitter_secs() -> u64 {
    30
}

/// Serde default for `max_attempts`: 12 resume attempts.
fn default_max_attempts() -> u32 {
    12
}
134
135impl UsageLimitConfig {
136    /// True if auto-resume should be attempted for `provider`.
137    pub fn enabled_for(&self, provider: &str) -> bool {
138        if !self.enabled {
139            return false;
140        }
141        self.providers
142            .get(provider)
143            .and_then(|o| o.enabled)
144            .unwrap_or(true)
145    }
146
147    /// Effective resume message for `provider`, honoring overrides.
148    pub fn resume_message_for(&self, provider: &str) -> &str {
149        self.providers
150            .get(provider)
151            .and_then(|o| o.resume_message.as_deref())
152            .unwrap_or(&self.resume_message)
153    }
154
155    /// Effective fallback duration (seconds) for `provider`.
156    pub fn fallback_secs_for(&self, provider: &str) -> u64 {
157        self.providers
158            .get(provider)
159            .and_then(|o| o.fallback_secs)
160            .unwrap_or(self.default_fallback_secs)
161    }
162
163    /// User-supplied additional patterns for `provider`, empty slice if none.
164    pub fn extra_patterns_for(&self, provider: &str) -> &[String] {
165        self.providers
166            .get(provider)
167            .map(|o| o.extra_patterns.as_slice())
168            .unwrap_or(&[])
169    }
170}
171
172/// Compute the moment zag should attempt to resume.
173///
174/// Returns `(scheduled_at, fallback_used)`. `fallback_used` is true when
175/// `hit.reset_at` was `None` and we substituted `fallback_secs`. The result is
176/// always clamped to `now + max_wait_secs` so a malformed epoch can never
177/// pin a wait into next century.
178pub fn compute_resume_at(hit: &UsageLimit, cfg: &UsageLimitConfig) -> (DateTime<Utc>, bool) {
179    let now = Utc::now();
180    let max_wait = Duration::seconds(cfg.max_wait_secs as i64);
181    let jitter = Duration::seconds(cfg.jitter_secs as i64);
182
183    let (target, fallback_used) = match hit.reset_at {
184        Some(t) => (t, false),
185        None => {
186            let fb = cfg.fallback_secs_for(hit.provider) as i64;
187            (now + Duration::seconds(fb), true)
188        }
189    };
190
191    // If the reset is in the past, clamp to "now + jitter" — gives the upstream
192    // a beat to settle before we retry.
193    let after_clamp = if target < now {
194        now + jitter
195    } else {
196        target + jitter
197    };
198
199    let capped = if after_clamp > now + max_wait {
200        now + max_wait
201    } else {
202        after_clamp
203    };
204
205    (capped, fallback_used)
206}
207
208/// Build a [`crate::session_log::LogEventKind::UsageLimitHit`] from a detected
209/// `UsageLimit`. Single source of truth for the scheduled-resume path — every
210/// site that knows a resume timer is going to fire (relay, foreground
211/// auto-resume loop) calls this so the wire shape can never drift.
212///
213/// `scheduled_resume_at` and `fallback_used` come from [`compute_resume_at`].
214/// `incident_id` is provided by the caller so it can be stitched into the
215/// matching `UsageLimitResumed` / `UsageLimitResumeFailed` events.
216pub fn log_event_hit(
217    hit: &UsageLimit,
218    incident_id: &str,
219    scheduled_resume_at: Option<DateTime<Utc>>,
220    fallback_used: bool,
221) -> crate::session_log::LogEventKind {
222    crate::session_log::LogEventKind::UsageLimitHit {
223        provider: hit.provider.to_string(),
224        scope: hit.scope.as_str().to_string(),
225        reset_at: hit.reset_at.map(|t| t.to_rfc3339()),
226        scheduled_resume_at: scheduled_resume_at.map(|t| t.to_rfc3339()),
227        fallback_used,
228        incident_id: incident_id.to_string(),
229        raw: Some(hit.raw.clone()),
230    }
231}
232
233/// Build a `UsageLimitHit` log event for orphan log-only detections (e.g. the
234/// Codex TUI line parser) where no auto-resume scheduler is involved.
235/// Generates a fresh incident id; scheduling fields are left empty because
236/// nothing is scheduled.
237pub fn to_log_event_hit(hit: UsageLimit) -> crate::session_log::LogEventKind {
238    log_event_hit(&hit, &uuid::Uuid::new_v4().to_string(), None, false)
239}
240
241#[cfg(test)]
242#[path = "usage_limits_tests.rs"]
243mod tests;