ralph_workflow/reducer/event/agent.rs
// NOTE: Split from `reducer/event.rs` to keep the main file under line limits.
2use super::types::{default_timeout_output_kind, AgentErrorKind, TimeoutOutputKind};
3use crate::agents::{AgentDrain, AgentRole};
4use crate::common::domain_types::{AgentName, ModelName};
5use crate::ChildProcessInfo;
6use serde::{Deserialize, Serialize};
7
8/// Agent invocation and chain management events.
9///
10/// Events related to agent execution, fallback chains, model switching,
11/// rate limiting, and retry cycles. The agent chain provides fault tolerance
12/// through multiple fallback levels:
13///
14/// 1. Model level: Try different models for the same agent
15/// 2. Agent level: Switch to a fallback agent
16/// 3. Retry cycle: Start over with exponential backoff
17///
18/// # State Transitions
19///
20/// - `InvocationFailed(retriable=true)`: Advances to next model
21/// - `InvocationFailed(retriable=false)`: Typically switches to next agent (policy may vary by kind)
22/// - `RateLimited`: Typically immediate agent switch with prompt preservation
23/// - `ChainExhausted`: Starts new retry cycle
24/// - `InvocationSucceeded`: Clears continuation prompt
25#[derive(Clone, Serialize, Deserialize, Debug, PartialEq)]
26pub enum AgentEvent {
27 /// Agent invocation started.
28 InvocationStarted {
29 /// Compatibility role metadata for the active drain.
30 ///
31 /// Runtime routing is drain-owned; reducers use explicit drain state as the
32 /// authoritative consumer identity.
33 role: AgentRole,
34 /// The agent being invoked.
35 agent: AgentName,
36 /// The model being used, if specified.
37 model: Option<ModelName>,
38 },
39 /// Agent invocation succeeded.
40 InvocationSucceeded {
41 /// Compatibility role metadata for the active drain.
42 role: AgentRole,
43 /// The agent that succeeded.
44 agent: AgentName,
45 },
46 /// Agent invocation failed.
47 InvocationFailed {
48 /// Compatibility role metadata for the active drain.
49 role: AgentRole,
50 /// The agent that failed.
51 agent: AgentName,
52 /// The exit code from the agent process.
53 exit_code: i32,
54 /// The kind of error that occurred.
55 error_kind: AgentErrorKind,
56 /// Whether this error is retriable with the same agent.
57 retriable: bool,
58 },
59 /// Fallback triggered to switch to a different agent.
60 FallbackTriggered {
61 /// The role being fulfilled.
62 role: AgentRole,
63 /// The agent being switched from.
64 from_agent: AgentName,
65 /// The agent being switched to.
66 to_agent: AgentName,
67 },
68 /// Model fallback triggered within the same agent.
69 ModelFallbackTriggered {
70 /// The role being fulfilled.
71 role: AgentRole,
72 /// The agent whose model is changing.
73 agent: AgentName,
74 /// The model being switched from.
75 from_model: ModelName,
76 /// The model being switched to.
77 to_model: ModelName,
78 },
79 /// Retry cycle started (all agents exhausted, starting over).
80 RetryCycleStarted {
81 /// The role being retried.
82 role: AgentRole,
83 /// The cycle number starting.
84 cycle: u32,
85 },
86 /// Agent chain exhausted (no more agents/models to try).
87 ChainExhausted {
88 /// The role whose chain is exhausted.
89 role: AgentRole,
90 },
91 /// Agent chain initialized with available agents.
92 ChainInitialized {
93 /// The explicit runtime drain this chain is for.
94 drain: AgentDrain,
95 /// The agents available in this chain.
96 agents: Vec<AgentName>,
97 /// Per-agent model flag lists, parallel to `agents`.
98 ///
99 /// Each inner `Vec` contains model flags (e.g. `["-m opencode/glm-4.7-free"]`) for the
100 /// corresponding agent. An empty inner `Vec` means no model-level fallback for that agent
101 /// (treated as a single-model agent).
102 models_per_agent: Vec<Vec<String>>,
103 /// Maximum number of retry cycles allowed for this chain.
104 max_cycles: u32,
105 /// Base retry-cycle delay in milliseconds.
106 retry_delay_ms: u64,
107 /// Exponential backoff multiplier.
108 backoff_multiplier: f64,
109 /// Maximum backoff delay in milliseconds.
110 max_backoff_ms: u64,
111 },
112 /// Agent hit rate limit (429).
113 ///
114 /// Effects/executors emit this as a *fact* event. The reducer decides
115 /// whether/when to switch agents.
116 RateLimited {
117 /// The role being fulfilled.
118 role: AgentRole,
119 /// The agent that hit the rate limit.
120 agent: AgentName,
121 /// The prompt that was being executed when rate limit was hit.
122 /// This allows the next agent to continue the same work.
123 prompt_context: Option<String>,
124 },
125
126 /// Agent hit authentication failure (401/403).
127 ///
128 /// Effects/executors emit this as a *fact* event. The reducer decides
129 /// whether/when to switch agents.
130 AuthFailed {
131 /// The role being fulfilled.
132 role: AgentRole,
133 /// The agent that failed authentication.
134 agent: AgentName,
135 },
136
137 /// Agent hit an idle timeout.
138 ///
139 /// Emitted as a fact; the reducer decides retry vs fallback based on `output_kind`.
140 /// `NoResult` triggers immediate agent switch; `PartialResult` uses the same-agent
141 /// retry budget (same semantics as before this feature).
142 TimedOut {
143 /// The role being fulfilled.
144 role: AgentRole,
145 /// The agent that timed out.
146 agent: AgentName,
147 /// Whether the agent produced a result file before timing out.
148 #[serde(default = "default_timeout_output_kind")]
149 output_kind: TimeoutOutputKind,
150 /// Path to the agent's logfile (for context extraction on `PartialResult` retry).
151 ///
152 /// When `output_kind` is `PartialResult` and the agent has no session ID,
153 /// this path is used to extract context for the retry prompt.
154 #[serde(default)]
155 logfile_path: Option<String>,
156 /// Child process status when the timeout was enforced.
157 ///
158 /// `None` if no children existed or child checking was disabled.
159 /// When `Some`, contains the child count and cumulative CPU time at timeout.
160 #[serde(default)]
161 child_status_at_timeout: Option<ChildProcessInfo>,
162 },
163
164 /// Session established with agent.
165 ///
166 /// Emitted when an agent response includes a session ID that can be
167 /// used for XSD retry continuation. This enables reusing the same
168 /// session when retrying due to validation failures.
169 SessionEstablished {
170 /// The role this agent is fulfilling.
171 role: AgentRole,
172 /// The agent name.
173 agent: AgentName,
174 /// The session ID returned by the agent.
175 session_id: String,
176 },
177
178 /// XSD validation failed for agent output.
179 ///
180 /// Emitted when agent output cannot be parsed or fails XSD validation.
181 /// Distinct from `OutputValidationFailed` events in phase-specific enums,
182 /// this is the canonical XSD retry trigger that the reducer uses to
183 /// decide whether to retry with the same agent/session or advance the chain.
184 XsdValidationFailed {
185 /// The role whose output failed validation.
186 role: AgentRole,
187 /// The artifact type that failed validation.
188 artifact: crate::reducer::state::ArtifactType,
189 /// Error message from validation.
190 error: String,
191 /// Current XSD retry count for this artifact.
192 retry_count: u32,
193 },
194
195 /// Template rendering failed due to missing required variables or unresolved placeholders.
196 ///
197 /// Emitted when a prompt template cannot be rendered because required variables
198 /// are missing or unresolved placeholders (e.g., `{{VAR}}`) remain in the output.
199 /// The reducer decides fallback policy, typically switching to the next agent.
200 TemplateVariablesInvalid {
201 /// The role whose template failed to render.
202 role: AgentRole,
203 /// The name of the template that failed.
204 template_name: String,
205 /// Variables that were required but not provided.
206 missing_variables: Vec<String>,
207 /// Placeholder patterns that remain unresolved in the rendered output.
208 unresolved_placeholders: Vec<String>,
209 },
210
211 /// Timeout context written to temp file for session-less agent retry.
212 ///
213 /// Emitted when a timeout with meaningful output occurs but the agent doesn't
214 /// support session IDs. The prior context is extracted from the logfile and
215 /// written to a temp file for the retry prompt to reference.
216 TimeoutContextWritten {
217 /// The role this agent is fulfilling.
218 role: AgentRole,
219 /// Source logfile path the context was extracted from.
220 logfile_path: String,
221 /// Target temp file path where context was written.
222 context_path: String,
223 },
224
225 /// Connectivity probe succeeded (network is reachable).
226 ///
227 /// Emitted when a connectivity probe succeeds. The reducer processes this
228 /// to update ConnectivityState accordingly. If the pipeline was waiting
229 /// for connectivity verification, this clears that pending flag.
230 ConnectivityCheckSucceeded,
231
232 /// Connectivity probe failed (network is unreachable).
233 ///
234 /// Emitted when a connectivity probe fails. The reducer processes this
235 /// to update ConnectivityState. If the failure threshold is reached,
236 /// the pipeline enters offline mode.
237 ConnectivityCheckFailed,
238}