// File: ralph_workflow/reducer/event/agent.rs

// NOTE: split from reducer/event.rs to keep the main file under line limits.
use super::types::{default_timeout_output_kind, AgentErrorKind, TimeoutOutputKind};
use crate::agents::{AgentDrain, AgentRole};
use crate::common::domain_types::{AgentName, ModelName};
use crate::ChildProcessInfo;
use serde::{Deserialize, Serialize};
7
8/// Agent invocation and chain management events.
9///
10/// Events related to agent execution, fallback chains, model switching,
11/// rate limiting, and retry cycles. The agent chain provides fault tolerance
12/// through multiple fallback levels:
13///
14/// 1. Model level: Try different models for the same agent
15/// 2. Agent level: Switch to a fallback agent
16/// 3. Retry cycle: Start over with exponential backoff
17///
18/// # State Transitions
19///
20/// - `InvocationFailed(retriable=true)`: Advances to next model
21/// - `InvocationFailed(retriable=false)`: Typically switches to next agent (policy may vary by kind)
22/// - `RateLimited`: Typically immediate agent switch with prompt preservation
23/// - `ChainExhausted`: Starts new retry cycle
24/// - `InvocationSucceeded`: Clears continuation prompt
25#[derive(Clone, Serialize, Deserialize, Debug, PartialEq)]
26pub enum AgentEvent {
27    /// Agent invocation started.
28    InvocationStarted {
29        /// Compatibility role metadata for the active drain.
30        ///
31        /// Runtime routing is drain-owned; reducers use explicit drain state as the
32        /// authoritative consumer identity.
33        role: AgentRole,
34        /// The agent being invoked.
35        agent: AgentName,
36        /// The model being used, if specified.
37        model: Option<ModelName>,
38    },
39    /// Agent invocation succeeded.
40    InvocationSucceeded {
41        /// Compatibility role metadata for the active drain.
42        role: AgentRole,
43        /// The agent that succeeded.
44        agent: AgentName,
45    },
46    /// Agent invocation failed.
47    InvocationFailed {
48        /// Compatibility role metadata for the active drain.
49        role: AgentRole,
50        /// The agent that failed.
51        agent: AgentName,
52        /// The exit code from the agent process.
53        exit_code: i32,
54        /// The kind of error that occurred.
55        error_kind: AgentErrorKind,
56        /// Whether this error is retriable with the same agent.
57        retriable: bool,
58    },
59    /// Fallback triggered to switch to a different agent.
60    FallbackTriggered {
61        /// The role being fulfilled.
62        role: AgentRole,
63        /// The agent being switched from.
64        from_agent: AgentName,
65        /// The agent being switched to.
66        to_agent: AgentName,
67    },
68    /// Model fallback triggered within the same agent.
69    ModelFallbackTriggered {
70        /// The role being fulfilled.
71        role: AgentRole,
72        /// The agent whose model is changing.
73        agent: AgentName,
74        /// The model being switched from.
75        from_model: ModelName,
76        /// The model being switched to.
77        to_model: ModelName,
78    },
79    /// Retry cycle started (all agents exhausted, starting over).
80    RetryCycleStarted {
81        /// The role being retried.
82        role: AgentRole,
83        /// The cycle number starting.
84        cycle: u32,
85    },
86    /// Agent chain exhausted (no more agents/models to try).
87    ChainExhausted {
88        /// The role whose chain is exhausted.
89        role: AgentRole,
90    },
91    /// Agent chain initialized with available agents.
92    ChainInitialized {
93        /// The explicit runtime drain this chain is for.
94        drain: AgentDrain,
95        /// The agents available in this chain.
96        agents: Vec<AgentName>,
97        /// Per-agent model flag lists, parallel to `agents`.
98        ///
99        /// Each inner `Vec` contains model flags (e.g. `["-m opencode/glm-4.7-free"]`) for the
100        /// corresponding agent. An empty inner `Vec` means no model-level fallback for that agent
101        /// (treated as a single-model agent).
102        models_per_agent: Vec<Vec<String>>,
103        /// Maximum number of retry cycles allowed for this chain.
104        max_cycles: u32,
105        /// Base retry-cycle delay in milliseconds.
106        retry_delay_ms: u64,
107        /// Exponential backoff multiplier.
108        backoff_multiplier: f64,
109        /// Maximum backoff delay in milliseconds.
110        max_backoff_ms: u64,
111    },
112    /// Agent hit rate limit (429).
113    ///
114    /// Effects/executors emit this as a *fact* event. The reducer decides
115    /// whether/when to switch agents.
116    RateLimited {
117        /// The role being fulfilled.
118        role: AgentRole,
119        /// The agent that hit the rate limit.
120        agent: AgentName,
121        /// The prompt that was being executed when rate limit was hit.
122        /// This allows the next agent to continue the same work.
123        prompt_context: Option<String>,
124    },
125
126    /// Agent hit authentication failure (401/403).
127    ///
128    /// Effects/executors emit this as a *fact* event. The reducer decides
129    /// whether/when to switch agents.
130    AuthFailed {
131        /// The role being fulfilled.
132        role: AgentRole,
133        /// The agent that failed authentication.
134        agent: AgentName,
135    },
136
137    /// Agent hit an idle timeout.
138    ///
139    /// Emitted as a fact; the reducer decides retry vs fallback based on `output_kind`.
140    /// `NoResult` triggers immediate agent switch; `PartialResult` uses the same-agent
141    /// retry budget (same semantics as before this feature).
142    TimedOut {
143        /// The role being fulfilled.
144        role: AgentRole,
145        /// The agent that timed out.
146        agent: AgentName,
147        /// Whether the agent produced a result file before timing out.
148        #[serde(default = "default_timeout_output_kind")]
149        output_kind: TimeoutOutputKind,
150        /// Path to the agent's logfile (for context extraction on `PartialResult` retry).
151        ///
152        /// When `output_kind` is `PartialResult` and the agent has no session ID,
153        /// this path is used to extract context for the retry prompt.
154        #[serde(default)]
155        logfile_path: Option<String>,
156        /// Child process status when the timeout was enforced.
157        ///
158        /// `None` if no children existed or child checking was disabled.
159        /// When `Some`, contains the child count and cumulative CPU time at timeout.
160        #[serde(default)]
161        child_status_at_timeout: Option<ChildProcessInfo>,
162    },
163
164    /// Session established with agent.
165    ///
166    /// Emitted when an agent response includes a session ID that can be
167    /// used for XSD retry continuation. This enables reusing the same
168    /// session when retrying due to validation failures.
169    SessionEstablished {
170        /// The role this agent is fulfilling.
171        role: AgentRole,
172        /// The agent name.
173        agent: AgentName,
174        /// The session ID returned by the agent.
175        session_id: String,
176    },
177
178    /// XSD validation failed for agent output.
179    ///
180    /// Emitted when agent output cannot be parsed or fails XSD validation.
181    /// Distinct from `OutputValidationFailed` events in phase-specific enums,
182    /// this is the canonical XSD retry trigger that the reducer uses to
183    /// decide whether to retry with the same agent/session or advance the chain.
184    XsdValidationFailed {
185        /// The role whose output failed validation.
186        role: AgentRole,
187        /// The artifact type that failed validation.
188        artifact: crate::reducer::state::ArtifactType,
189        /// Error message from validation.
190        error: String,
191        /// Current XSD retry count for this artifact.
192        retry_count: u32,
193    },
194
195    /// Template rendering failed due to missing required variables or unresolved placeholders.
196    ///
197    /// Emitted when a prompt template cannot be rendered because required variables
198    /// are missing or unresolved placeholders (e.g., `{{VAR}}`) remain in the output.
199    /// The reducer decides fallback policy, typically switching to the next agent.
200    TemplateVariablesInvalid {
201        /// The role whose template failed to render.
202        role: AgentRole,
203        /// The name of the template that failed.
204        template_name: String,
205        /// Variables that were required but not provided.
206        missing_variables: Vec<String>,
207        /// Placeholder patterns that remain unresolved in the rendered output.
208        unresolved_placeholders: Vec<String>,
209    },
210
211    /// Timeout context written to temp file for session-less agent retry.
212    ///
213    /// Emitted when a timeout with meaningful output occurs but the agent doesn't
214    /// support session IDs. The prior context is extracted from the logfile and
215    /// written to a temp file for the retry prompt to reference.
216    TimeoutContextWritten {
217        /// The role this agent is fulfilling.
218        role: AgentRole,
219        /// Source logfile path the context was extracted from.
220        logfile_path: String,
221        /// Target temp file path where context was written.
222        context_path: String,
223    },
224
225    /// Connectivity probe succeeded (network is reachable).
226    ///
227    /// Emitted when a connectivity probe succeeds. The reducer processes this
228    /// to update ConnectivityState accordingly. If the pipeline was waiting
229    /// for connectivity verification, this clears that pending flag.
230    ConnectivityCheckSucceeded,
231
232    /// Connectivity probe failed (network is unreachable).
233    ///
234    /// Emitted when a connectivity probe fails. The reducer processes this
235    /// to update ConnectivityState. If the failure threshold is reached,
236    /// the pipeline enters offline mode.
237    ConnectivityCheckFailed,
238}