Skip to main content

ralph_workflow/reducer/event/
agent.rs

1// NOTE: split from reducer/event.rs to keep the main file under line limits.
2use super::types::{default_timeout_output_kind, AgentErrorKind, TimeoutOutputKind};
3use crate::agents::{AgentDrain, AgentRole};
4use crate::executor::ChildProcessInfo;
5use serde::{Deserialize, Serialize};
6
7/// Agent invocation and chain management events.
8///
9/// Events related to agent execution, fallback chains, model switching,
10/// rate limiting, and retry cycles. The agent chain provides fault tolerance
11/// through multiple fallback levels:
12///
13/// 1. Model level: Try different models for the same agent
14/// 2. Agent level: Switch to a fallback agent
15/// 3. Retry cycle: Start over with exponential backoff
16///
17/// # State Transitions
18///
19/// - `InvocationFailed(retriable=true)`: Advances to next model
20/// - `InvocationFailed(retriable=false)`: Typically switches to next agent (policy may vary by kind)
21/// - `RateLimited`: Typically immediate agent switch with prompt preservation
22/// - `ChainExhausted`: Starts new retry cycle
23/// - `InvocationSucceeded`: Clears continuation prompt
24#[derive(Clone, Serialize, Deserialize, Debug)]
25pub enum AgentEvent {
26    /// Agent invocation started.
27    InvocationStarted {
28        /// Compatibility role metadata for the active drain.
29        ///
30        /// Runtime routing is drain-owned; reducers use explicit drain state as the
31        /// authoritative consumer identity.
32        role: AgentRole,
33        /// The agent being invoked.
34        agent: String,
35        /// The model being used, if specified.
36        model: Option<String>,
37    },
38    /// Agent invocation succeeded.
39    InvocationSucceeded {
40        /// Compatibility role metadata for the active drain.
41        role: AgentRole,
42        /// The agent that succeeded.
43        agent: String,
44    },
45    /// Agent invocation failed.
46    InvocationFailed {
47        /// Compatibility role metadata for the active drain.
48        role: AgentRole,
49        /// The agent that failed.
50        agent: String,
51        /// The exit code from the agent process.
52        exit_code: i32,
53        /// The kind of error that occurred.
54        error_kind: AgentErrorKind,
55        /// Whether this error is retriable with the same agent.
56        retriable: bool,
57    },
58    /// Fallback triggered to switch to a different agent.
59    FallbackTriggered {
60        /// The role being fulfilled.
61        role: AgentRole,
62        /// The agent being switched from.
63        from_agent: String,
64        /// The agent being switched to.
65        to_agent: String,
66    },
67    /// Model fallback triggered within the same agent.
68    ModelFallbackTriggered {
69        /// The role being fulfilled.
70        role: AgentRole,
71        /// The agent whose model is changing.
72        agent: String,
73        /// The model being switched from.
74        from_model: String,
75        /// The model being switched to.
76        to_model: String,
77    },
78    /// Retry cycle started (all agents exhausted, starting over).
79    RetryCycleStarted {
80        /// The role being retried.
81        role: AgentRole,
82        /// The cycle number starting.
83        cycle: u32,
84    },
85    /// Agent chain exhausted (no more agents/models to try).
86    ChainExhausted {
87        /// The role whose chain is exhausted.
88        role: AgentRole,
89    },
90    /// Agent chain initialized with available agents.
91    ChainInitialized {
92        /// The explicit runtime drain this chain is for.
93        drain: AgentDrain,
94        /// The agents available in this chain.
95        agents: Vec<String>,
96        /// Maximum number of retry cycles allowed for this chain.
97        max_cycles: u32,
98        /// Base retry-cycle delay in milliseconds.
99        retry_delay_ms: u64,
100        /// Exponential backoff multiplier.
101        backoff_multiplier: f64,
102        /// Maximum backoff delay in milliseconds.
103        max_backoff_ms: u64,
104    },
105    /// Agent hit a rate limit (429).
106    ///
107    /// Effects/executors emit this as a *fact* event. The reducer decides
108    /// whether/when to switch agents.
109    RateLimited {
110        /// The role being fulfilled.
111        role: AgentRole,
112        /// The agent that hit the rate limit.
113        agent: String,
114        /// The prompt that was being executed when the rate limit was hit.
115        /// This allows the next agent to continue the same work.
116        prompt_context: Option<String>,
117    },
118
119    /// Agent hit an authentication failure (401/403).
120    ///
121    /// Effects/executors emit this as a *fact* event. The reducer decides
122    /// whether/when to switch agents.
123    AuthFailed {
124        /// The role being fulfilled.
125        role: AgentRole,
126        /// The agent that failed authentication.
127        agent: String,
128    },
129
130    /// Agent hit an idle timeout.
131    ///
132    /// Emitted as a fact; the reducer decides retry vs fallback based on `output_kind`.
133    /// `NoOutput` triggers immediate agent switch; `PartialOutput` uses the same-agent
134    /// retry budget (presumably the timeout handling that predates `output_kind` — confirm).
135    TimedOut {
136        /// The role being fulfilled.
137        role: AgentRole,
138        /// The agent that timed out.
139        agent: String,
140        /// Whether the agent produced any output before timing out (`default_timeout_output_kind` supplies the value when absent from serialized input).
141        #[serde(default = "default_timeout_output_kind")]
142        output_kind: TimeoutOutputKind,
143        /// Path to the agent's logfile (for context extraction on `PartialOutput` retry).
144        ///
145        /// When `output_kind` is `PartialOutput` and the agent has no session ID,
146        /// this path is used to extract context for the retry prompt. Deserializes to `None` when absent.
147        #[serde(default)]
148        logfile_path: Option<String>,
149        /// Child process status when the timeout was enforced.
150        ///
151        /// `None` if no children existed, child checking was disabled, or the field is absent from serialized input.
152        /// When `Some`, contains the child count and cumulative CPU time at timeout.
153        #[serde(default)]
154        child_status_at_timeout: Option<ChildProcessInfo>,
155    },
156
157    /// Session established with agent.
158    ///
159    /// Emitted when an agent response includes a session ID that can be
160    /// used for XSD retry continuation. This enables reusing the same
161    /// session when retrying due to validation failures.
162    SessionEstablished {
163        /// The role this agent is fulfilling.
164        role: AgentRole,
165        /// The agent name.
166        agent: String,
167        /// The session ID returned by the agent.
168        session_id: String,
169    },
170
171    /// XSD validation failed for agent output.
172    ///
173    /// Emitted when agent output cannot be parsed or fails XSD validation.
174    /// Distinct from `OutputValidationFailed` events in phase-specific enums,
175    /// this is the canonical XSD retry trigger that the reducer uses to
176    /// decide whether to retry with the same agent/session or advance the chain.
177    XsdValidationFailed {
178        /// The role whose output failed validation.
179        role: AgentRole,
180        /// The artifact type that failed validation.
181        artifact: crate::reducer::state::ArtifactType,
182        /// Error message from validation.
183        error: String,
184        /// Current XSD retry count for this artifact.
185        retry_count: u32,
186    },
187
188    /// Template rendering failed due to missing required variables or unresolved placeholders.
189    ///
190    /// Emitted when a prompt template cannot be rendered because required variables
191    /// are missing or unresolved placeholders (e.g., `{{VAR}}`) remain in the output.
192    /// The reducer decides fallback policy, typically switching to the next agent.
193    TemplateVariablesInvalid {
194        /// The role whose template failed to render.
195        role: AgentRole,
196        /// The name of the template that failed.
197        template_name: String,
198        /// Variables that were required but not provided.
199        missing_variables: Vec<String>,
200        /// Placeholder patterns that remain unresolved in the rendered output.
201        unresolved_placeholders: Vec<String>,
202    },
203
204    /// Timeout context written to temp file for session-less agent retry.
205    ///
206    /// Emitted when a timeout with meaningful output occurs but the agent doesn't
207    /// support session IDs. The prior context is extracted from the logfile and
208    /// written to a temp file for the retry prompt to reference.
209    TimeoutContextWritten {
210        /// The role this agent is fulfilling.
211        role: AgentRole,
212        /// Source logfile path the context was extracted from.
213        logfile_path: String,
214        /// Target temp file path where context was written.
215        context_path: String,
216    },
217}