ralph_workflow/reducer/event/agent.rs
1// NOTE: split from reducer/event.rs to keep the main file under line limits.
2use super::types::{default_timeout_output_kind, AgentErrorKind, TimeoutOutputKind};
3use crate::agents::{AgentDrain, AgentRole};
4use crate::executor::ChildProcessInfo;
5use serde::{Deserialize, Serialize};
6
7/// Agent invocation and chain management events.
8///
9/// Events related to agent execution, fallback chains, model switching,
10/// rate limiting, and retry cycles. The agent chain provides fault tolerance
11/// through multiple fallback levels:
12///
13/// 1. Model level: Try different models for the same agent
14/// 2. Agent level: Switch to a fallback agent
15/// 3. Retry cycle: Start over with exponential backoff
16///
17/// # State Transitions
18///
19/// - `InvocationFailed(retriable=true)`: Advances to next model
20/// - `InvocationFailed(retriable=false)`: Typically switches to next agent (policy may vary by kind)
21/// - `RateLimited`: Typically immediate agent switch with prompt preservation
22/// - `ChainExhausted`: Starts new retry cycle
23/// - `InvocationSucceeded`: Clears continuation prompt
24#[derive(Clone, Serialize, Deserialize, Debug)]
25pub enum AgentEvent {
26 /// Agent invocation started.
27 InvocationStarted {
28 /// Compatibility role metadata for the active drain.
29 ///
30 /// Runtime routing is drain-owned; reducers use explicit drain state as the
31 /// authoritative consumer identity.
32 role: AgentRole,
33 /// The agent being invoked.
34 agent: String,
35 /// The model being used, if specified.
36 model: Option<String>,
37 },
38 /// Agent invocation succeeded.
39 InvocationSucceeded {
40 /// Compatibility role metadata for the active drain.
41 role: AgentRole,
42 /// The agent that succeeded.
43 agent: String,
44 },
45 /// Agent invocation failed.
46 InvocationFailed {
47 /// Compatibility role metadata for the active drain.
48 role: AgentRole,
49 /// The agent that failed.
50 agent: String,
51 /// The exit code from the agent process.
52 exit_code: i32,
53 /// The kind of error that occurred.
54 error_kind: AgentErrorKind,
55 /// Whether this error is retriable with the same agent.
56 retriable: bool,
57 },
58 /// Fallback triggered to switch to a different agent.
59 FallbackTriggered {
60 /// The role being fulfilled.
61 role: AgentRole,
62 /// The agent being switched from.
63 from_agent: String,
64 /// The agent being switched to.
65 to_agent: String,
66 },
67 /// Model fallback triggered within the same agent.
68 ModelFallbackTriggered {
69 /// The role being fulfilled.
70 role: AgentRole,
71 /// The agent whose model is changing.
72 agent: String,
73 /// The model being switched from.
74 from_model: String,
75 /// The model being switched to.
76 to_model: String,
77 },
78 /// Retry cycle started (all agents exhausted, starting over).
79 RetryCycleStarted {
80 /// The role being retried.
81 role: AgentRole,
82 /// The cycle number starting.
83 cycle: u32,
84 },
85 /// Agent chain exhausted (no more agents/models to try).
86 ChainExhausted {
87 /// The role whose chain is exhausted.
88 role: AgentRole,
89 },
90 /// Agent chain initialized with available agents.
91 ChainInitialized {
92 /// The explicit runtime drain this chain is for.
93 drain: AgentDrain,
94 /// The agents available in this chain.
95 agents: Vec<String>,
96 /// Maximum number of retry cycles allowed for this chain.
97 max_cycles: u32,
98 /// Base retry-cycle delay in milliseconds.
99 retry_delay_ms: u64,
100 /// Exponential backoff multiplier.
101 backoff_multiplier: f64,
102 /// Maximum backoff delay in milliseconds.
103 max_backoff_ms: u64,
104 },
105 /// Agent hit rate limit (429).
106 ///
107 /// Effects/executors emit this as a *fact* event. The reducer decides
108 /// whether/when to switch agents.
109 RateLimited {
110 /// The role being fulfilled.
111 role: AgentRole,
112 /// The agent that hit the rate limit.
113 agent: String,
114 /// The prompt that was being executed when rate limit was hit.
115 /// This allows the next agent to continue the same work.
116 prompt_context: Option<String>,
117 },
118
119 /// Agent hit authentication failure (401/403).
120 ///
121 /// Effects/executors emit this as a *fact* event. The reducer decides
122 /// whether/when to switch agents.
123 AuthFailed {
124 /// The role being fulfilled.
125 role: AgentRole,
126 /// The agent that failed authentication.
127 agent: String,
128 },
129
130 /// Agent hit an idle timeout.
131 ///
132 /// Emitted as a fact; the reducer decides retry vs fallback based on `output_kind`.
133 /// `NoOutput` triggers immediate agent switch; `PartialOutput` uses the same-agent
134 /// retry budget (same semantics as before this feature).
135 TimedOut {
136 /// The role being fulfilled.
137 role: AgentRole,
138 /// The agent that timed out.
139 agent: String,
140 /// Whether the agent produced any output before timing out.
141 #[serde(default = "default_timeout_output_kind")]
142 output_kind: TimeoutOutputKind,
143 /// Path to the agent's logfile (for context extraction on `PartialOutput` retry).
144 ///
145 /// When `output_kind` is `PartialOutput` and the agent has no session ID,
146 /// this path is used to extract context for the retry prompt.
147 #[serde(default)]
148 logfile_path: Option<String>,
149 /// Child process status when the timeout was enforced.
150 ///
151 /// `None` if no children existed or child checking was disabled.
152 /// When `Some`, contains the child count and cumulative CPU time at timeout.
153 #[serde(default)]
154 child_status_at_timeout: Option<ChildProcessInfo>,
155 },
156
157 /// Session established with agent.
158 ///
159 /// Emitted when an agent response includes a session ID that can be
160 /// used for XSD retry continuation. This enables reusing the same
161 /// session when retrying due to validation failures.
162 SessionEstablished {
163 /// The role this agent is fulfilling.
164 role: AgentRole,
165 /// The agent name.
166 agent: String,
167 /// The session ID returned by the agent.
168 session_id: String,
169 },
170
171 /// XSD validation failed for agent output.
172 ///
173 /// Emitted when agent output cannot be parsed or fails XSD validation.
174 /// Distinct from `OutputValidationFailed` events in phase-specific enums,
175 /// this is the canonical XSD retry trigger that the reducer uses to
176 /// decide whether to retry with the same agent/session or advance the chain.
177 XsdValidationFailed {
178 /// The role whose output failed validation.
179 role: AgentRole,
180 /// The artifact type that failed validation.
181 artifact: crate::reducer::state::ArtifactType,
182 /// Error message from validation.
183 error: String,
184 /// Current XSD retry count for this artifact.
185 retry_count: u32,
186 },
187
188 /// Template rendering failed due to missing required variables or unresolved placeholders.
189 ///
190 /// Emitted when a prompt template cannot be rendered because required variables
191 /// are missing or unresolved placeholders (e.g., `{{VAR}}`) remain in the output.
192 /// The reducer decides fallback policy, typically switching to the next agent.
193 TemplateVariablesInvalid {
194 /// The role whose template failed to render.
195 role: AgentRole,
196 /// The name of the template that failed.
197 template_name: String,
198 /// Variables that were required but not provided.
199 missing_variables: Vec<String>,
200 /// Placeholder patterns that remain unresolved in the rendered output.
201 unresolved_placeholders: Vec<String>,
202 },
203
204 /// Timeout context written to temp file for session-less agent retry.
205 ///
206 /// Emitted when a timeout with meaningful output occurs but the agent doesn't
207 /// support session IDs. The prior context is extracted from the logfile and
208 /// written to a temp file for the retry prompt to reference.
209 TimeoutContextWritten {
210 /// The role this agent is fulfilling.
211 role: AgentRole,
212 /// Source logfile path the context was extracted from.
213 logfile_path: String,
214 /// Target temp file path where context was written.
215 context_path: String,
216 },
217}