Skip to main content

ralph_workflow/reducer/event/
agent.rs

1// NOTE: split from reducer/event.rs to keep the main file under line limits.
2use super::types::{default_timeout_output_kind, AgentErrorKind, TimeoutOutputKind};
3use crate::agents::{AgentDrain, AgentRole};
4use crate::common::domain_types::{AgentName, ModelName};
5use crate::ChildProcessInfo;
6use serde::{Deserialize, Serialize};
7
8/// Agent invocation and chain management events.
9///
10/// Events related to agent execution, fallback chains, model switching,
11/// rate limiting, and retry cycles. The agent chain provides fault tolerance
12/// through multiple fallback levels:
13///
14/// 1. Model level: Try different models for the same agent
15/// 2. Agent level: Switch to a fallback agent
16/// 3. Retry cycle: Start over with exponential backoff
17///
18/// # State Transitions
19///
20/// - `InvocationFailed(retriable=true)`: Advances to next model
21/// - `InvocationFailed(retriable=false)`: Typically switches to next agent (policy may vary by kind)
22/// - `RateLimited`: Typically immediate agent switch with prompt preservation
23/// - `ChainExhausted`: Starts new retry cycle
24/// - `InvocationSucceeded`: Clears continuation prompt
25#[derive(Clone, Serialize, Deserialize, Debug, PartialEq)]
26pub enum AgentEvent {
27    /// Agent invocation started.
28    InvocationStarted {
29        /// Compatibility role metadata for the active drain.
30        ///
31        /// Runtime routing is drain-owned; reducers use explicit drain state as the
32        /// authoritative consumer identity.
33        role: AgentRole,
34        /// The agent being invoked.
35        agent: AgentName,
36        /// The model being used, if specified.
37        model: Option<ModelName>,
38    },
39    /// Agent invocation succeeded.
40    InvocationSucceeded {
41        /// Compatibility role metadata for the active drain.
42        role: AgentRole,
43        /// The agent that succeeded.
44        agent: AgentName,
45    },
46    /// Agent invocation failed.
47    InvocationFailed {
48        /// Compatibility role metadata for the active drain.
49        role: AgentRole,
50        /// The agent that failed.
51        agent: AgentName,
52        /// The exit code from the agent process.
53        exit_code: i32,
54        /// The kind of error that occurred.
55        error_kind: AgentErrorKind,
56        /// Whether this error is retriable with the same agent.
57        retriable: bool,
58    },
59    /// Fallback triggered to switch to a different agent.
60    FallbackTriggered {
61        /// The role being fulfilled.
62        role: AgentRole,
63        /// The agent being switched from.
64        from_agent: AgentName,
65        /// The agent being switched to.
66        to_agent: AgentName,
67    },
68    /// Model fallback triggered within the same agent.
69    ModelFallbackTriggered {
70        /// The role being fulfilled.
71        role: AgentRole,
72        /// The agent whose model is changing.
73        agent: AgentName,
74        /// The model being switched from.
75        from_model: ModelName,
76        /// The model being switched to.
77        to_model: ModelName,
78    },
79    /// Retry cycle started (all agents exhausted, starting over).
80    RetryCycleStarted {
81        /// The role being retried.
82        role: AgentRole,
83        /// The cycle number starting.
84        cycle: u32,
85    },
86    /// Agent chain exhausted (no more agents/models to try).
87    ChainExhausted {
88        /// The role whose chain is exhausted.
89        role: AgentRole,
90    },
91    /// Agent chain initialized with available agents.
92    ChainInitialized {
93        /// The explicit runtime drain this chain is for.
94        drain: AgentDrain,
95        /// The agents available in this chain.
96        agents: Vec<AgentName>,
97        /// Maximum number of retry cycles allowed for this chain.
98        max_cycles: u32,
99        /// Base retry-cycle delay in milliseconds.
100        retry_delay_ms: u64,
101        /// Exponential backoff multiplier.
102        backoff_multiplier: f64,
103        /// Maximum backoff delay in milliseconds.
104        max_backoff_ms: u64,
105    },
106    /// Agent hit rate limit (429).
107    ///
108    /// Effects/executors emit this as a *fact* event. The reducer decides
109    /// whether/when to switch agents.
110    RateLimited {
111        /// The role being fulfilled.
112        role: AgentRole,
113        /// The agent that hit the rate limit.
114        agent: AgentName,
115        /// The prompt that was being executed when rate limit was hit.
116        /// This allows the next agent to continue the same work.
117        prompt_context: Option<String>,
118    },
119
120    /// Agent hit authentication failure (401/403).
121    ///
122    /// Effects/executors emit this as a *fact* event. The reducer decides
123    /// whether/when to switch agents.
124    AuthFailed {
125        /// The role being fulfilled.
126        role: AgentRole,
127        /// The agent that failed authentication.
128        agent: AgentName,
129    },
130
131    /// Agent hit an idle timeout.
132    ///
133    /// Emitted as a fact; the reducer decides retry vs fallback based on `output_kind`.
134    /// `NoOutput` triggers immediate agent switch; `PartialOutput` uses the same-agent
135    /// retry budget (same semantics as before this feature).
136    TimedOut {
137        /// The role being fulfilled.
138        role: AgentRole,
139        /// The agent that timed out.
140        agent: AgentName,
141        /// Whether the agent produced any output before timing out.
142        #[serde(default = "default_timeout_output_kind")]
143        output_kind: TimeoutOutputKind,
144        /// Path to the agent's logfile (for context extraction on `PartialOutput` retry).
145        ///
146        /// When `output_kind` is `PartialOutput` and the agent has no session ID,
147        /// this path is used to extract context for the retry prompt.
148        #[serde(default)]
149        logfile_path: Option<String>,
150        /// Child process status when the timeout was enforced.
151        ///
152        /// `None` if no children existed or child checking was disabled.
153        /// When `Some`, contains the child count and cumulative CPU time at timeout.
154        #[serde(default)]
155        child_status_at_timeout: Option<ChildProcessInfo>,
156    },
157
158    /// Session established with agent.
159    ///
160    /// Emitted when an agent response includes a session ID that can be
161    /// used for XSD retry continuation. This enables reusing the same
162    /// session when retrying due to validation failures.
163    SessionEstablished {
164        /// The role this agent is fulfilling.
165        role: AgentRole,
166        /// The agent name.
167        agent: AgentName,
168        /// The session ID returned by the agent.
169        session_id: String,
170    },
171
172    /// XSD validation failed for agent output.
173    ///
174    /// Emitted when agent output cannot be parsed or fails XSD validation.
175    /// Distinct from `OutputValidationFailed` events in phase-specific enums,
176    /// this is the canonical XSD retry trigger that the reducer uses to
177    /// decide whether to retry with the same agent/session or advance the chain.
178    XsdValidationFailed {
179        /// The role whose output failed validation.
180        role: AgentRole,
181        /// The artifact type that failed validation.
182        artifact: crate::reducer::state::ArtifactType,
183        /// Error message from validation.
184        error: String,
185        /// Current XSD retry count for this artifact.
186        retry_count: u32,
187    },
188
189    /// Template rendering failed due to missing required variables or unresolved placeholders.
190    ///
191    /// Emitted when a prompt template cannot be rendered because required variables
192    /// are missing or unresolved placeholders (e.g., `{{VAR}}`) remain in the output.
193    /// The reducer decides fallback policy, typically switching to the next agent.
194    TemplateVariablesInvalid {
195        /// The role whose template failed to render.
196        role: AgentRole,
197        /// The name of the template that failed.
198        template_name: String,
199        /// Variables that were required but not provided.
200        missing_variables: Vec<String>,
201        /// Placeholder patterns that remain unresolved in the rendered output.
202        unresolved_placeholders: Vec<String>,
203    },
204
205    /// Timeout context written to temp file for session-less agent retry.
206    ///
207    /// Emitted when a timeout with meaningful output occurs but the agent doesn't
208    /// support session IDs. The prior context is extracted from the logfile and
209    /// written to a temp file for the retry prompt to reference.
210    TimeoutContextWritten {
211        /// The role this agent is fulfilling.
212        role: AgentRole,
213        /// Source logfile path the context was extracted from.
214        logfile_path: String,
215        /// Target temp file path where context was written.
216        context_path: String,
217    },
218}