Skip to main content

ralph_workflow/reducer/event/
agent.rs

// NOTE: split from reducer/event.rs to keep the main file under line limits.
use super::types::{default_timeout_output_kind, AgentErrorKind, TimeoutOutputKind};
use crate::agents::AgentRole;
use crate::executor::ChildProcessInfo;
use serde::{Deserialize, Serialize};

7/// Agent invocation and chain management events.
8///
9/// Events related to agent execution, fallback chains, model switching,
10/// rate limiting, and retry cycles. The agent chain provides fault tolerance
11/// through multiple fallback levels:
12///
13/// 1. Model level: Try different models for the same agent
14/// 2. Agent level: Switch to a fallback agent
15/// 3. Retry cycle: Start over with exponential backoff
16///
17/// # State Transitions
18///
19/// - `InvocationFailed(retriable=true)`: Advances to next model
20/// - `InvocationFailed(retriable=false)`: Typically switches to next agent (policy may vary by kind)
21/// - `RateLimited`: Typically immediate agent switch with prompt preservation
22/// - `ChainExhausted`: Starts new retry cycle
23/// - `InvocationSucceeded`: Clears continuation prompt
24#[derive(Clone, Serialize, Deserialize, Debug)]
25pub enum AgentEvent {
26    /// Agent invocation started.
27    InvocationStarted {
28        /// The role this agent is fulfilling.
29        role: AgentRole,
30        /// The agent being invoked.
31        agent: String,
32        /// The model being used, if specified.
33        model: Option<String>,
34    },
35    /// Agent invocation succeeded.
36    InvocationSucceeded {
37        /// The role this agent fulfilled.
38        role: AgentRole,
39        /// The agent that succeeded.
40        agent: String,
41    },
42    /// Agent invocation failed.
43    InvocationFailed {
44        /// The role this agent was fulfilling.
45        role: AgentRole,
46        /// The agent that failed.
47        agent: String,
48        /// The exit code from the agent process.
49        exit_code: i32,
50        /// The kind of error that occurred.
51        error_kind: AgentErrorKind,
52        /// Whether this error is retriable with the same agent.
53        retriable: bool,
54    },
55    /// Fallback triggered to switch to a different agent.
56    FallbackTriggered {
57        /// The role being fulfilled.
58        role: AgentRole,
59        /// The agent being switched from.
60        from_agent: String,
61        /// The agent being switched to.
62        to_agent: String,
63    },
64    /// Model fallback triggered within the same agent.
65    ModelFallbackTriggered {
66        /// The role being fulfilled.
67        role: AgentRole,
68        /// The agent whose model is changing.
69        agent: String,
70        /// The model being switched from.
71        from_model: String,
72        /// The model being switched to.
73        to_model: String,
74    },
75    /// Retry cycle started (all agents exhausted, starting over).
76    RetryCycleStarted {
77        /// The role being retried.
78        role: AgentRole,
79        /// The cycle number starting.
80        cycle: u32,
81    },
82    /// Agent chain exhausted (no more agents/models to try).
83    ChainExhausted {
84        /// The role whose chain is exhausted.
85        role: AgentRole,
86    },
87    /// Agent chain initialized with available agents.
88    ChainInitialized {
89        /// The role this chain is for.
90        role: AgentRole,
91        /// The agents available in this chain.
92        agents: Vec<String>,
93        /// Maximum number of retry cycles allowed for this chain.
94        max_cycles: u32,
95        /// Base retry-cycle delay in milliseconds.
96        retry_delay_ms: u64,
97        /// Exponential backoff multiplier.
98        backoff_multiplier: f64,
99        /// Maximum backoff delay in milliseconds.
100        max_backoff_ms: u64,
101    },
102    /// Agent hit rate limit (429).
103    ///
104    /// Effects/executors emit this as a *fact* event. The reducer decides
105    /// whether/when to switch agents.
106    RateLimited {
107        /// The role being fulfilled.
108        role: AgentRole,
109        /// The agent that hit the rate limit.
110        agent: String,
111        /// The prompt that was being executed when rate limit was hit.
112        /// This allows the next agent to continue the same work.
113        prompt_context: Option<String>,
114    },
115
116    /// Agent hit authentication failure (401/403).
117    ///
118    /// Effects/executors emit this as a *fact* event. The reducer decides
119    /// whether/when to switch agents.
120    AuthFailed {
121        /// The role being fulfilled.
122        role: AgentRole,
123        /// The agent that failed authentication.
124        agent: String,
125    },
126
127    /// Agent hit an idle timeout.
128    ///
129    /// Emitted as a fact; the reducer decides retry vs fallback based on `output_kind`.
130    /// `NoOutput` triggers immediate agent switch; `PartialOutput` uses the same-agent
131    /// retry budget (same semantics as before this feature).
132    TimedOut {
133        /// The role being fulfilled.
134        role: AgentRole,
135        /// The agent that timed out.
136        agent: String,
137        /// Whether the agent produced any output before timing out.
138        #[serde(default = "default_timeout_output_kind")]
139        output_kind: TimeoutOutputKind,
140        /// Path to the agent's logfile (for context extraction on `PartialOutput` retry).
141        ///
142        /// When `output_kind` is `PartialOutput` and the agent has no session ID,
143        /// this path is used to extract context for the retry prompt.
144        #[serde(default)]
145        logfile_path: Option<String>,
146        /// Child process status when the timeout was enforced.
147        ///
148        /// `None` if no children existed or child checking was disabled.
149        /// When `Some`, contains the child count and cumulative CPU time at timeout.
150        #[serde(default)]
151        child_status_at_timeout: Option<ChildProcessInfo>,
152    },
153
154    /// Session established with agent.
155    ///
156    /// Emitted when an agent response includes a session ID that can be
157    /// used for XSD retry continuation. This enables reusing the same
158    /// session when retrying due to validation failures.
159    SessionEstablished {
160        /// The role this agent is fulfilling.
161        role: AgentRole,
162        /// The agent name.
163        agent: String,
164        /// The session ID returned by the agent.
165        session_id: String,
166    },
167
168    /// XSD validation failed for agent output.
169    ///
170    /// Emitted when agent output cannot be parsed or fails XSD validation.
171    /// Distinct from `OutputValidationFailed` events in phase-specific enums,
172    /// this is the canonical XSD retry trigger that the reducer uses to
173    /// decide whether to retry with the same agent/session or advance the chain.
174    XsdValidationFailed {
175        /// The role whose output failed validation.
176        role: AgentRole,
177        /// The artifact type that failed validation.
178        artifact: crate::reducer::state::ArtifactType,
179        /// Error message from validation.
180        error: String,
181        /// Current XSD retry count for this artifact.
182        retry_count: u32,
183    },
184
185    /// Template rendering failed due to missing required variables or unresolved placeholders.
186    ///
187    /// Emitted when a prompt template cannot be rendered because required variables
188    /// are missing or unresolved placeholders (e.g., `{{VAR}}`) remain in the output.
189    /// The reducer decides fallback policy, typically switching to the next agent.
190    TemplateVariablesInvalid {
191        /// The role whose template failed to render.
192        role: AgentRole,
193        /// The name of the template that failed.
194        template_name: String,
195        /// Variables that were required but not provided.
196        missing_variables: Vec<String>,
197        /// Placeholder patterns that remain unresolved in the rendered output.
198        unresolved_placeholders: Vec<String>,
199    },
200
201    /// Timeout context written to temp file for session-less agent retry.
202    ///
203    /// Emitted when a timeout with meaningful output occurs but the agent doesn't
204    /// support session IDs. The prior context is extracted from the logfile and
205    /// written to a temp file for the retry prompt to reference.
206    TimeoutContextWritten {
207        /// The role this agent is fulfilling.
208        role: AgentRole,
209        /// Source logfile path the context was extracted from.
210        logfile_path: String,
211        /// Target temp file path where context was written.
212        context_path: String,
213    },
214}