1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
// NOTE: split from reducer/event.rs to keep the main file under line limits.
use super::types::{default_timeout_output_kind, AgentErrorKind, TimeoutOutputKind};
use crate::agents::{AgentDrain, AgentRole};
use crate::common::domain_types::{AgentName, ModelName};
use crate::ChildProcessInfo;
use serde::{Deserialize, Serialize};
/// Agent invocation and chain management events.
///
/// Events related to agent execution, fallback chains, model switching,
/// rate limiting, and retry cycles. The agent chain provides fault tolerance
/// through multiple fallback levels:
///
/// 1. Model level: Try different models for the same agent
/// 2. Agent level: Switch to a fallback agent
/// 3. Retry cycle: Start over with exponential backoff
///
/// # State Transitions
///
/// - `InvocationFailed(retriable=true)`: Advances to next model
/// - `InvocationFailed(retriable=false)`: Typically switches to next agent (policy may vary by kind)
/// - `RateLimited`: Typically immediate agent switch with prompt preservation
/// - `ChainExhausted`: Starts new retry cycle
/// - `InvocationSucceeded`: Clears continuation prompt
///
/// # Serialization
///
/// The enum derives `Serialize` and `Deserialize` without any rename attributes, so
/// variant and field names are used as-is by name-based formats (e.g. JSON). Renaming
/// a variant or field therefore changes the serialized representation.
///
/// Fields annotated with `#[serde(default)]` (see `TimedOut`) may be absent from
/// serialized input and are filled with a default value on deserialization.
// `Eq` is intentionally not derived: `ChainInitialized::backoff_multiplier` is an
// `f64`, which implements `PartialEq` but not `Eq`.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq)]
pub enum AgentEvent {
/// Agent invocation started.
InvocationStarted {
/// Compatibility role metadata for the active drain.
///
/// Runtime routing is drain-owned; reducers use explicit drain state as the
/// authoritative consumer identity.
role: AgentRole,
/// The agent being invoked.
agent: AgentName,
/// The model being used, if specified (`None` when no model was specified).
model: Option<ModelName>,
},
/// Agent invocation succeeded.
InvocationSucceeded {
/// Compatibility role metadata for the active drain.
role: AgentRole,
/// The agent that succeeded.
agent: AgentName,
},
/// Agent invocation failed.
///
/// See the "State Transitions" section on [`AgentEvent`] for how `retriable`
/// influences the next step in the chain.
InvocationFailed {
/// Compatibility role metadata for the active drain.
role: AgentRole,
/// The agent that failed.
agent: AgentName,
/// The exit code from the agent process.
exit_code: i32,
/// The kind of error that occurred.
error_kind: AgentErrorKind,
/// Whether this error is retriable with the same agent.
retriable: bool,
},
/// Fallback triggered to switch to a different agent.
FallbackTriggered {
/// The role being fulfilled.
role: AgentRole,
/// The agent being switched from.
from_agent: AgentName,
/// The agent being switched to.
to_agent: AgentName,
},
/// Model fallback triggered within the same agent.
ModelFallbackTriggered {
/// The role being fulfilled.
role: AgentRole,
/// The agent whose model is changing.
agent: AgentName,
/// The model being switched from.
from_model: ModelName,
/// The model being switched to.
to_model: ModelName,
},
/// Retry cycle started (all agents exhausted, starting over).
RetryCycleStarted {
/// The role being retried.
role: AgentRole,
/// The cycle number starting.
// NOTE(review): whether cycle numbering starts at 0 or 1 is not visible here —
// confirm against the reducer before relying on it.
cycle: u32,
},
/// Agent chain exhausted (no more agents/models to try).
ChainExhausted {
/// The role whose chain is exhausted.
role: AgentRole,
},
/// Agent chain initialized with available agents.
///
/// Unlike the other variants, this one is keyed by an explicit [`AgentDrain`]
/// rather than an [`AgentRole`].
ChainInitialized {
/// The explicit runtime drain this chain is for.
drain: AgentDrain,
/// The agents available in this chain.
agents: Vec<AgentName>,
/// Per-agent model flag lists, parallel to `agents`.
///
/// Each inner `Vec` contains model flags (e.g. `["-m opencode/glm-4.7-free"]`) for the
/// corresponding agent. An empty inner `Vec` means no model-level fallback for that agent
/// (treated as a single-model agent).
///
/// "Parallel" means the entry at index `i` belongs to `agents[i]`. The type
/// system does not enforce that both vectors have the same length.
models_per_agent: Vec<Vec<String>>,
/// Maximum number of retry cycles allowed for this chain.
max_cycles: u32,
/// Base retry-cycle delay in milliseconds.
retry_delay_ms: u64,
/// Exponential backoff multiplier.
///
/// Stored as `f64`, which is why [`AgentEvent`] derives `PartialEq` but not `Eq`.
backoff_multiplier: f64,
/// Maximum backoff delay in milliseconds.
max_backoff_ms: u64,
},
/// Agent hit rate limit (429).
///
/// Effects/executors emit this as a *fact* event. The reducer decides
/// whether/when to switch agents.
RateLimited {
/// The role being fulfilled.
role: AgentRole,
/// The agent that hit the rate limit.
agent: AgentName,
/// The prompt that was being executed when rate limit was hit.
/// This allows the next agent to continue the same work.
prompt_context: Option<String>,
},
/// Agent hit authentication failure (401/403).
///
/// Effects/executors emit this as a *fact* event. The reducer decides
/// whether/when to switch agents.
AuthFailed {
/// The role being fulfilled.
role: AgentRole,
/// The agent that failed authentication.
agent: AgentName,
},
/// Agent hit an idle timeout.
///
/// Emitted as a fact; the reducer decides retry vs fallback based on `output_kind`.
/// `NoResult` triggers immediate agent switch; `PartialResult` uses the same-agent
/// retry budget (same semantics as before this feature).
// NOTE(review): "this feature" presumably refers to the introduction of
// `output_kind`; the serde defaults on the fields below are consistent with
// fields that were added later — confirm against project history.
TimedOut {
/// The role being fulfilled.
role: AgentRole,
/// The agent that timed out.
agent: AgentName,
/// Whether the agent produced a result file before timing out.
///
/// When this field is missing from the serialized input, deserialization
/// fills it by calling `default_timeout_output_kind` (from `super::types`).
#[serde(default = "default_timeout_output_kind")]
output_kind: TimeoutOutputKind,
/// Path to the agent's logfile (for context extraction on `PartialResult` retry).
///
/// When `output_kind` is `PartialResult` and the agent has no session ID,
/// this path is used to extract context for the retry prompt.
///
/// Deserializes to `None` when the field is missing from the input.
#[serde(default)]
logfile_path: Option<String>,
/// Child process status when the timeout was enforced.
///
/// `None` if no children existed or child checking was disabled.
/// When `Some`, contains the child count and cumulative CPU time at timeout.
///
/// Deserializes to `None` when the field is missing from the input.
#[serde(default)]
child_status_at_timeout: Option<ChildProcessInfo>,
},
/// Session established with agent.
///
/// Emitted when an agent response includes a session ID that can be
/// used for XSD retry continuation. This enables reusing the same
/// session when retrying due to validation failures.
SessionEstablished {
/// The role this agent is fulfilling.
role: AgentRole,
/// The agent name.
agent: AgentName,
/// The session ID returned by the agent.
session_id: String,
},
/// XSD validation failed for agent output.
///
/// Emitted when agent output cannot be parsed or fails XSD validation.
/// Distinct from `OutputValidationFailed` events in phase-specific enums,
/// this is the canonical XSD retry trigger that the reducer uses to
/// decide whether to retry with the same agent/session or advance the chain.
XsdValidationFailed {
/// The role whose output failed validation.
role: AgentRole,
/// The artifact type that failed validation.
artifact: crate::reducer::state::ArtifactType,
/// Error message from validation.
error: String,
/// Current XSD retry count for this artifact.
retry_count: u32,
},
/// Template rendering failed due to missing required variables or unresolved placeholders.
///
/// Emitted when a prompt template cannot be rendered because required variables
/// are missing or unresolved placeholders (e.g., `{{VAR}}`) remain in the output.
/// The reducer decides fallback policy, typically switching to the next agent.
TemplateVariablesInvalid {
/// The role whose template failed to render.
role: AgentRole,
/// The name of the template that failed.
template_name: String,
/// Variables that were required but not provided.
missing_variables: Vec<String>,
/// Placeholder patterns that remain unresolved in the rendered output.
unresolved_placeholders: Vec<String>,
},
/// Timeout context written to temp file for session-less agent retry.
///
/// Emitted when a timeout with meaningful output occurs but the agent doesn't
/// support session IDs. The prior context is extracted from the logfile and
/// written to a temp file for the retry prompt to reference.
TimeoutContextWritten {
/// The role this agent is fulfilling.
role: AgentRole,
/// Source logfile path the context was extracted from.
logfile_path: String,
/// Target temp file path where context was written.
context_path: String,
},
/// Connectivity probe succeeded (network is reachable).
///
/// Emitted when a connectivity probe succeeds. The reducer processes this
/// to update `ConnectivityState` accordingly. If the pipeline was waiting
/// for connectivity verification, this clears that pending flag.
///
/// This is a unit variant: it carries no role or agent information.
ConnectivityCheckSucceeded,
/// Connectivity probe failed (network is unreachable).
///
/// Emitted when a connectivity probe fails. The reducer processes this
/// to update `ConnectivityState`. If the failure threshold is reached,
/// the pipeline enters offline mode.
///
/// This is a unit variant: it carries no role or agent information.
ConnectivityCheckFailed,
}