ralph_workflow/pipeline/
runner.rs

1//! Command execution helpers and fallback orchestration.
2
3use crate::agents::{validate_model_flag, AgentConfig, AgentRegistry, AgentRole, JsonParserType};
4use crate::common::split_command;
5use std::sync::Arc;
6
7use super::fallback::try_agent_with_retries;
8use super::fallback::TryAgentResult;
9use super::model_flag::resolve_model_with_provider;
10use super::prompt::PipelineRuntime;
11
/// Assemble the ordered, de-duplicated list of agents to attempt.
///
/// The primary agent always comes first. Each fallback is then appended in the
/// order given, unless it is the primary agent or has already been listed.
fn build_agents_to_try<'a>(fallbacks: &'a [&'a str], primary_agent: &'a str) -> Vec<&'a str> {
    fallbacks
        .iter()
        .copied()
        .fold(vec![primary_agent], |mut ordered, candidate| {
            let already_listed = candidate == primary_agent || ordered.contains(&candidate);
            if !already_listed {
                ordered.push(candidate);
            }
            ordered
        })
}
22
23/// Get CLI model/provider overrides based on role.
24fn get_cli_overrides(
25    role: AgentRole,
26    runtime: &PipelineRuntime<'_>,
27) -> (Option<String>, Option<String>) {
28    match role {
29        AgentRole::Developer => (
30            runtime.config.developer_model.clone(),
31            runtime.config.developer_provider.clone(),
32        ),
33        AgentRole::Reviewer => (
34            runtime.config.reviewer_model.clone(),
35            runtime.config.reviewer_provider.clone(),
36        ),
37        AgentRole::Commit => (None, None), // Commit role doesn't have CLI overrides
38    }
39}
40
41/// Context for building model flags.
42struct ModelFlagBuildContext<'a> {
43    agent_index: usize,
44    cli_model_override: Option<&'a String>,
45    cli_provider_override: Option<&'a String>,
46    agent_config: &'a AgentConfig,
47    agent_name: &'a str,
48    fallback_config: &'a crate::agents::fallback::FallbackConfig,
49    display_name: &'a str,
50    runtime: &'a PipelineRuntime<'a>,
51}
52
53/// Build the list of model flags to try for an agent.
54fn build_model_flags_list(ctx: &ModelFlagBuildContext<'_>) -> Vec<Option<String>> {
55    let mut model_flags_to_try: Vec<Option<String>> = Vec::new();
56
57    // CLI override takes highest priority for primary agent
58    // Provider override can modify the model's provider prefix
59    if ctx.agent_index == 0
60        && (ctx.cli_model_override.is_some() || ctx.cli_provider_override.is_some())
61    {
62        let resolved = resolve_model_with_provider(
63            ctx.cli_provider_override.map(std::string::String::as_str),
64            ctx.cli_model_override.map(std::string::String::as_str),
65            ctx.agent_config.model_flag.as_deref(),
66        );
67        if resolved.is_some() {
68            model_flags_to_try.push(resolved);
69        }
70    }
71
72    // Add the agent's default model (None means use agent's configured model_flag or no model)
73    if model_flags_to_try.is_empty() {
74        model_flags_to_try.push(None);
75    }
76
77    // Add provider fallback models for this agent
78    if ctx.fallback_config.has_provider_fallbacks(ctx.agent_name) {
79        let provider_fallbacks = ctx.fallback_config.get_provider_fallbacks(ctx.agent_name);
80        ctx.runtime.logger.info(&format!(
81            "Agent '{}' has {} provider fallback(s) configured",
82            ctx.display_name,
83            provider_fallbacks.len()
84        ));
85        for model in provider_fallbacks {
86            model_flags_to_try.push(Some(model.clone()));
87        }
88    }
89
90    model_flags_to_try
91}
92
93/// Build the command string for a specific model configuration.
94fn build_command_for_model(ctx: &TryModelContext<'_>, runtime: &PipelineRuntime<'_>) -> String {
95    let model_ref = ctx.model_flag.map(std::string::String::as_str);
96    // Enable yolo for ALL roles - this is an automated pipeline, not interactive.
97    // All agents need file write access to output their XML results.
98    let yolo = true;
99
100    if ctx.agent_index == 0 && ctx.cycle == 0 && ctx.model_index == 0 {
101        // For primary agent on first cycle, respect env var command overrides
102        match ctx.role {
103            AgentRole::Developer => runtime.config.developer_cmd.clone().unwrap_or_else(|| {
104                ctx.agent_config
105                    .build_cmd_with_model(true, true, true, model_ref)
106            }),
107            AgentRole::Reviewer => runtime.config.reviewer_cmd.clone().unwrap_or_else(|| {
108                ctx.agent_config
109                    .build_cmd_with_model(true, true, yolo, model_ref)
110            }),
111            AgentRole::Commit => runtime.config.commit_cmd.clone().unwrap_or_else(|| {
112                ctx.agent_config
113                    .build_cmd_with_model(true, true, yolo, model_ref)
114            }),
115        }
116    } else {
117        ctx.agent_config
118            .build_cmd_with_model(true, true, yolo, model_ref)
119    }
120}
121
122/// GLM-specific validation for print flag.
123///
124/// This validation only applies to CCS/Claude-based GLM agents that use the `-p` flag
125/// for non-interactive mode. OpenCode agents are excluded because they use
126/// `--auto-approve` for non-interactive mode instead.
127fn validate_glm_print_flag(
128    agent_name: &str,
129    agent_config: &AgentConfig,
130    cmd_str: &str,
131    agent_index: usize,
132    cycle: u32,
133    model_index: usize,
134    runtime: &PipelineRuntime<'_>,
135) {
136    // Skip validation for non-CCS/Claude GLM agents
137    // is_glm_like_agent only matches CCS/Claude-based GLM agents, not OpenCode
138    if !crate::agents::is_glm_like_agent(agent_name)
139        || agent_index != 0
140        || cycle != 0
141        || model_index != 0
142    {
143        return;
144    }
145
146    let cmd_argv = split_command(cmd_str).ok();
147    let has_print_flag = cmd_argv
148        .as_ref()
149        .is_some_and(|argv| argv.iter().any(|arg| arg == "-p"));
150    if !has_print_flag {
151        if agent_config.print_flag.is_empty() {
152            runtime.logger.warn(&format!(
153                "GLM agent '{agent_name}' is missing '-p' flag: print_flag is empty in configuration. \
154                 Add 'print_flag = \"-p\"' to [ccs] section in ~/.config/ralph-workflow.toml"
155            ));
156        } else {
157            runtime.logger.warn(&format!(
158                "GLM agent '{agent_name}' may be missing '-p' flag in command. Check configuration."
159            ));
160        }
161    }
162}
163
164/// Build label and logfile paths for execution.
165fn build_execution_metadata(
166    model_flag: Option<&String>,
167    display_name: &str,
168    base_label: &str,
169    agent_name: &str,
170    logfile_prefix: &str,
171    model_index: usize,
172) -> (String, String, String) {
173    let model_suffix = model_flag.map(|m| format!(" [{m}]")).unwrap_or_default();
174    let display_name_with_suffix = format!("{display_name}{model_suffix}");
175    let label = format!("{base_label} ({display_name_with_suffix})");
176    let logfile = super::logfile::build_logfile_path(logfile_prefix, agent_name, model_index);
177    (label, logfile, display_name_with_suffix)
178}
179
/// Outcome of attempting a single agent (or a single model of that agent).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TrySingleAgentResult {
    /// The agent succeeded; the whole run reports success.
    Success,
    /// Unrecoverable error: abort immediately with the carried exit code.
    Unrecoverable(i32),
    /// Stop trying this agent and move on to the next agent in the chain.
    Fallback,
    /// This model was not retried; continue with the agent's next model.
    /// Also returned once all of an agent's models have been tried.
    NoRetry,
}
192
193/// Context for trying a single model.
194struct TryModelContext<'a> {
195    agent_config: &'a AgentConfig,
196    agent_name: &'a str,
197    display_name: &'a str,
198    agent_index: usize,
199    cycle: u32,
200    model_index: usize,
201    role: AgentRole,
202    model_flag: Option<&'a String>,
203    base_label: &'a str,
204    prompt: &'a str,
205    logfile_prefix: &'a str,
206    fallback_config: &'a crate::agents::fallback::FallbackConfig,
207    output_validator: Option<crate::pipeline::fallback::OutputValidator>,
208    retry_timer: Arc<dyn crate::agents::RetryTimerProvider>,
209    workspace: &'a dyn crate::workspace::Workspace,
210}
211
212/// Try a single model configuration for an agent.
213fn try_single_model(
214    ctx: &TryModelContext<'_>,
215    runtime: &mut PipelineRuntime<'_>,
216) -> std::io::Result<TrySingleAgentResult> {
217    let mut parser_type = ctx.agent_config.json_parser;
218
219    if ctx.role == AgentRole::Reviewer {
220        if let Some(ref parser_override) = runtime.config.reviewer_json_parser {
221            parser_type = JsonParserType::parse(parser_override);
222            if ctx.agent_index == 0 && ctx.cycle == 0 && ctx.model_index == 0 {
223                runtime.logger.info(&format!(
224                    "Using JSON parser override '{parser_override}' for reviewer"
225                ));
226            }
227        }
228    }
229
230    let cmd_str = build_command_for_model(ctx, runtime);
231
232    validate_glm_print_flag(
233        ctx.agent_name,
234        ctx.agent_config,
235        &cmd_str,
236        ctx.agent_index,
237        ctx.cycle,
238        ctx.model_index,
239        runtime,
240    );
241
242    let (label, logfile, display_name_with_suffix) = build_execution_metadata(
243        ctx.model_flag,
244        ctx.display_name,
245        ctx.base_label,
246        ctx.agent_name,
247        ctx.logfile_prefix,
248        ctx.model_index,
249    );
250
251    let attempt_config = crate::pipeline::fallback::AgentAttemptConfig {
252        agent_name: ctx.agent_name,
253        model_flag: ctx.model_flag.map(std::string::String::as_str),
254        label: &label,
255        display_name: &display_name_with_suffix,
256        cmd_str: &cmd_str,
257        prompt: ctx.prompt,
258        logfile: &logfile,
259        logfile_prefix: ctx.logfile_prefix,
260        parser_type,
261        env_vars: &ctx.agent_config.env_vars,
262        model_index: ctx.model_index,
263        agent_index: ctx.agent_index,
264        cycle: ctx.cycle as usize,
265        fallback_config: ctx.fallback_config,
266        output_validator: ctx.output_validator,
267        retry_timer: Arc::clone(&ctx.retry_timer),
268        workspace: ctx.workspace,
269    };
270    let result = try_agent_with_retries(&attempt_config, runtime)?;
271
272    match result {
273        TryAgentResult::Success => Ok(TrySingleAgentResult::Success),
274        TryAgentResult::Unrecoverable(exit_code) => {
275            Ok(TrySingleAgentResult::Unrecoverable(exit_code))
276        }
277        TryAgentResult::Fallback => Ok(TrySingleAgentResult::Fallback),
278        TryAgentResult::NoRetry => Ok(TrySingleAgentResult::NoRetry),
279    }
280}
281
282/// Context for trying a single agent.
283struct TryAgentContext<'a> {
284    agent_name: &'a str,
285    agent_index: usize,
286    cycle: u32,
287    role: AgentRole,
288    base_label: &'a str,
289    prompt: &'a str,
290    logfile_prefix: &'a str,
291    cli_model_override: Option<&'a String>,
292    cli_provider_override: Option<&'a String>,
293    output_validator: Option<crate::pipeline::fallback::OutputValidator>,
294    retry_timer: Arc<dyn crate::agents::RetryTimerProvider>,
295    workspace: &'a dyn crate::workspace::Workspace,
296}
297
298/// Try a single agent with all its model configurations.
299fn try_single_agent(
300    ctx: &TryAgentContext<'_>,
301    runtime: &mut PipelineRuntime<'_>,
302    registry: &AgentRegistry,
303    fallback_config: &crate::agents::fallback::FallbackConfig,
304) -> std::io::Result<TrySingleAgentResult> {
305    let Some(agent_config) = registry.resolve_config(ctx.agent_name) else {
306        runtime.logger.warn(&format!(
307            "Agent '{}' not found in registry, skipping",
308            ctx.agent_name
309        ));
310        return Ok(TrySingleAgentResult::Fallback);
311    };
312
313    let display_name = registry.display_name(ctx.agent_name);
314    let model_ctx = ModelFlagBuildContext {
315        agent_index: ctx.agent_index,
316        cli_model_override: ctx.cli_model_override,
317        cli_provider_override: ctx.cli_provider_override,
318        agent_config: &agent_config,
319        agent_name: ctx.agent_name,
320        fallback_config,
321        display_name: &display_name,
322        runtime,
323    };
324    let model_flags_to_try = build_model_flags_list(&model_ctx);
325
326    if ctx.agent_index == 0 && ctx.cycle == 0 {
327        for model_flag in model_flags_to_try.iter().flatten() {
328            for warning in validate_model_flag(model_flag) {
329                runtime.logger.warn(&warning);
330            }
331        }
332    }
333
334    for (model_index, model_flag) in model_flags_to_try.iter().enumerate() {
335        let model_ctx = TryModelContext {
336            agent_config: &agent_config,
337            agent_name: ctx.agent_name,
338            display_name: &display_name,
339            agent_index: ctx.agent_index,
340            cycle: ctx.cycle,
341            model_index,
342            role: ctx.role,
343            model_flag: model_flag.as_ref(),
344            base_label: ctx.base_label,
345            prompt: ctx.prompt,
346            logfile_prefix: ctx.logfile_prefix,
347            fallback_config,
348            output_validator: ctx.output_validator,
349            retry_timer: Arc::clone(&ctx.retry_timer),
350            workspace: ctx.workspace,
351        };
352        let result = try_single_model(&model_ctx, runtime)?;
353
354        match result {
355            TrySingleAgentResult::Success => return Ok(TrySingleAgentResult::Success),
356            TrySingleAgentResult::Unrecoverable(exit_code) => {
357                return Ok(TrySingleAgentResult::Unrecoverable(exit_code))
358            }
359            TrySingleAgentResult::Fallback => return Ok(TrySingleAgentResult::Fallback),
360            TrySingleAgentResult::NoRetry => {}
361        }
362    }
363
364    Ok(TrySingleAgentResult::NoRetry)
365}
366
367/// Configuration for running with fallback.
368pub struct FallbackConfig<'a, 'b> {
369    pub role: AgentRole,
370    pub base_label: &'a str,
371    pub prompt: &'a str,
372    pub logfile_prefix: &'a str,
373    pub runtime: &'a mut PipelineRuntime<'b>,
374    pub registry: &'a AgentRegistry,
375    pub primary_agent: &'a str,
376    pub output_validator: Option<crate::pipeline::fallback::OutputValidator>,
377    pub workspace: &'a dyn crate::workspace::Workspace,
378}
379
380/// Run a command with automatic fallback to alternative agents on failure.
381///
382/// Includes an optional output validator callback that checks if the agent
383/// produced valid output after `exit_code=0`. If validation fails, triggers fallback.
384///
385/// This variant takes a `FallbackConfig` directly for cases where you need
386/// to specify an output validator.
387pub fn run_with_fallback_and_validator(
388    config: &mut FallbackConfig<'_, '_>,
389) -> std::io::Result<i32> {
390    run_with_fallback_internal(config)
391}
392
393/// Run a command with automatic fallback to alternative agents on failure.
394///
395/// Includes an optional output validator callback that checks if the agent
396/// produced valid output after `exit_code=0`. If validation fails, triggers fallback.
397fn run_with_fallback_internal(config: &mut FallbackConfig<'_, '_>) -> std::io::Result<i32> {
398    let fallback_config = config.registry.fallback_config();
399    let fallbacks = config.registry.available_fallbacks(config.role);
400    if fallback_config.has_fallbacks(config.role) {
401        config.runtime.logger.info(&format!(
402            "Agent fallback chain for {}: {}",
403            config.role,
404            fallbacks.join(", ")
405        ));
406    } else {
407        config.runtime.logger.info(&format!(
408            "No configured fallbacks for {}, using primary only",
409            config.role
410        ));
411    }
412
413    let agents_to_try = build_agents_to_try(&fallbacks, config.primary_agent);
414    let (cli_model_override, cli_provider_override) =
415        get_cli_overrides(config.role, config.runtime);
416
417    for cycle in 0..fallback_config.max_cycles {
418        if cycle > 0 {
419            let backoff_ms = fallback_config.calculate_backoff(cycle - 1);
420            config.runtime.logger.info(&format!(
421                "Cycle {}/{}: All agents exhausted, waiting {}ms before retry (exponential backoff)...",
422                cycle + 1,
423                fallback_config.max_cycles,
424                backoff_ms
425            ));
426            config
427                .registry
428                .retry_timer()
429                .sleep(std::time::Duration::from_millis(backoff_ms));
430        }
431
432        for (agent_index, agent_name) in agents_to_try.iter().enumerate() {
433            let ctx = TryAgentContext {
434                agent_name,
435                agent_index,
436                cycle,
437                role: config.role,
438                base_label: config.base_label,
439                prompt: config.prompt,
440                logfile_prefix: config.logfile_prefix,
441                cli_model_override: cli_model_override.as_ref(),
442                cli_provider_override: cli_provider_override.as_ref(),
443                output_validator: config.output_validator,
444                retry_timer: config.registry.retry_timer(),
445                workspace: config.workspace,
446            };
447            let result = try_single_agent(&ctx, config.runtime, config.registry, fallback_config)?;
448
449            match result {
450                TrySingleAgentResult::Success => return Ok(0),
451                TrySingleAgentResult::Unrecoverable(exit_code) => return Ok(exit_code),
452                TrySingleAgentResult::Fallback | TrySingleAgentResult::NoRetry => {}
453            }
454        }
455    }
456
457    config.runtime.logger.error(&format!(
458        "All agents exhausted after {} cycles with exponential backoff",
459        fallback_config.max_cycles
460    ));
461    Ok(1)
462}
463
464// ============================================================================
465// Session Continuation for XSD Retries
466// ============================================================================
467//
468// Session continuation allows XSD validation retries to continue the same
469// agent session, so the AI retains memory of its previous reasoning.
470//
471// DESIGN PRINCIPLE: Session continuation is an OPTIMIZATION, not a requirement.
472// It must be completely fault-tolerant:
473//
474// 1. If session continuation produces output (regardless of exit code) -> use it
475// 2. If it fails for ANY reason (segfault, crash, invalid session, I/O error,
476//    timeout, or any other failure) -> silently fall back to normal behavior
477//
478// The fallback chain must NEVER be affected by session continuation failures.
479// Even a segfaulting agent during session continuation must not break anything.
480//
481// IMPORTANT: Some AI agents have quirky behavior where they return non-zero exit
482// codes but still produce valid XML. For example, an agent might output valid XML
483// with status="partial" and then exit with code 1. We should still use that XML.
484// The caller is responsible for checking if valid XML exists in the log file.
485
/// Outcome of a session continuation attempt.
#[derive(Debug)]
pub enum SessionContinuationResult {
    /// The agent was invoked in the continued session.
    ///
    /// This does NOT imply the agent succeeded: the caller must inspect the
    /// log file for valid output, because some agents produce valid XML even
    /// when they return a non-zero exit code.
    Ran { exit_code: i32 },
    /// Session continuation was not attempted or failed to run.
    ///
    /// The caller should fall back to the normal fallback run.
    Fallback,
}
498
499/// Configuration for XSD retry with optional session continuation.
500pub struct XsdRetryConfig<'a, 'b> {
501    /// Agent role for the retry.
502    pub role: AgentRole,
503    /// Label for logging (e.g., "planning #1").
504    pub base_label: &'a str,
505    /// The prompt to send.
506    pub prompt: &'a str,
507    /// Log file prefix (e.g., ".agent/logs/planning_1").
508    pub logfile_prefix: &'a str,
509    /// Pipeline runtime for logging and timing.
510    pub runtime: &'a mut PipelineRuntime<'b>,
511    /// Agent registry for resolving agent configs.
512    pub registry: &'a AgentRegistry,
513    /// Primary agent name.
514    pub primary_agent: &'a str,
515    /// Optional session info from previous run.
516    /// If provided and valid, session continuation will be attempted first.
517    pub session_info: Option<&'a crate::pipeline::session::SessionInfo>,
518    /// Retry number (0 = first attempt, 1+ = XSD retries).
519    pub retry_num: usize,
520    /// Optional output validator to check if agent produced valid output.
521    /// Used by review phase to validate JSON output extraction.
522    pub output_validator: Option<crate::pipeline::fallback::OutputValidator>,
523    /// Workspace for file operations.
524    pub workspace: &'a dyn crate::workspace::Workspace,
525}
526
527/// Run an XSD retry with optional session continuation.
528///
529/// This function attempts session continuation first (if session info is available),
530/// and falls back to normal `run_with_fallback` if:
531/// - No session info is available
532/// - The agent doesn't support session continuation
533/// - Session continuation fails to even start (I/O error, panic, etc.)
534///
535/// # Important: Quirky Agent Behavior
536///
537/// Some AI agents return non-zero exit codes but still produce valid XML output.
538/// For example, an agent might output valid XML with status="partial" and then
539/// exit with code 1. This function does NOT treat non-zero exit codes as failures
540/// for session continuation - it returns the exit code and lets the caller check
541/// if valid XML was produced.
542///
543/// # Fault Tolerance
544///
545/// This function is designed to be completely fault-tolerant. Even if the agent
546/// segfaults during session continuation, this function will catch the error and
547/// fall back to normal behavior. The fallback chain is NEVER affected.
548///
549/// # Arguments
550///
551/// * `config` - XSD retry configuration
552///
553/// # Returns
554///
555/// * `Ok(exit_code)` - The agent's exit code (may be non-zero even with valid output)
556/// * `Err(_)` - I/O error (only from the fallback path, never from session continuation)
557pub fn run_xsd_retry_with_session(config: &mut XsdRetryConfig<'_, '_>) -> std::io::Result<i32> {
558    // Try session continuation first (if we have session info and it's a retry)
559    if config.retry_num > 0 {
560        if let Some(session_info) = config.session_info {
561            // Log session continuation attempt
562            config.runtime.logger.info(&format!(
563                "  Attempting session continuation with {} (session: {}...)",
564                session_info.agent_name,
565                &session_info.session_id[..8.min(session_info.session_id.len())]
566            ));
567            match try_session_continuation(config, session_info) {
568                SessionContinuationResult::Ran { exit_code } => {
569                    // Session continuation ran - agent was invoked and produced a log file
570                    // Return the exit code; the caller will check for valid XML
571                    // Even if exit_code != 0, there might be valid XML in the log
572                    config
573                        .runtime
574                        .logger
575                        .info("  Session continuation succeeded");
576                    return Ok(exit_code);
577                }
578                SessionContinuationResult::Fallback => {
579                    // Session continuation failed to start - fall through to normal behavior
580                    config
581                        .runtime
582                        .logger
583                        .warn("  Session continuation failed, falling back to new session");
584                }
585            }
586        } else {
587            config
588                .runtime
589                .logger
590                .warn("  No session info available for retry, starting new session");
591        }
592    }
593
594    // Normal fallback path (first attempt or session continuation failed to start)
595    let mut fallback_config = FallbackConfig {
596        role: config.role,
597        base_label: config.base_label,
598        prompt: config.prompt,
599        logfile_prefix: config.logfile_prefix,
600        runtime: config.runtime,
601        registry: config.registry,
602        primary_agent: config.primary_agent,
603        output_validator: config.output_validator,
604        workspace: config.workspace,
605    };
606    run_with_fallback_and_validator(&mut fallback_config)
607}
608
609/// Attempt session continuation with full fault tolerance.
610///
611/// This function catches ALL errors and returns `Fallback` instead of propagating them.
612/// Even segfaults are handled gracefully (they appear as non-zero exit codes or I/O errors).
613///
614/// # Returns
615///
616/// - `Ran { logfile, exit_code }` if the agent was successfully invoked (even if it crashed)
617/// - `Fallback` if session continuation couldn't even start
618fn try_session_continuation(
619    config: &mut XsdRetryConfig<'_, '_>,
620    session_info: &crate::pipeline::session::SessionInfo,
621) -> SessionContinuationResult {
622    // The agent name from session_info should already be the registry name
623    // (e.g., "ccs/glm", "opencode/anthropic/claude-sonnet-4") when passed from
624    // the calling code. For backwards compatibility and robustness, we still try
625    // to resolve it in case it's a sanitized name from an old log file.
626    let registry_name = config
627        .registry
628        .resolve_from_logfile_name(&session_info.agent_name)
629        .unwrap_or_else(|| session_info.agent_name.clone());
630
631    // Check if the agent supports session continuation
632    let agent_config = match config.registry.resolve_config(&registry_name) {
633        Some(cfg) => cfg,
634        None => {
635            // Agent not found - fall back silently
636            return SessionContinuationResult::Fallback;
637        }
638    };
639
640    if !agent_config.supports_session_continuation() {
641        // Agent doesn't support session continuation - fall back silently
642        return SessionContinuationResult::Fallback;
643    }
644
645    // Build the command with session continuation flag
646    let yolo = matches!(config.role, AgentRole::Developer);
647    let cmd_str = agent_config.build_cmd_with_session(
648        true, // output (JSON)
649        yolo, // yolo mode
650        true, // verbose
651        None, // model override
652        Some(&session_info.session_id),
653    );
654
655    // Build log file path - use a unique name to avoid overwriting previous logs
656    // Sanitize the agent name to avoid creating subdirectories from slashes
657    let sanitized_agent = super::logfile::sanitize_agent_name(&session_info.agent_name);
658    let logfile = format!(
659        "{}_{}_session_{}.log",
660        config.logfile_prefix, sanitized_agent, config.retry_num
661    );
662
663    // Log the attempt (debug level since this is an optimization)
664    if config.runtime.config.verbosity.is_debug() {
665        config.runtime.logger.info(&format!(
666            "  Attempting session continuation with {} (session: {})",
667            session_info.agent_name, session_info.session_id
668        ));
669    }
670
671    // Create the prompt command
672    let cmd = crate::pipeline::PromptCommand {
673        cmd_str: &cmd_str,
674        prompt: config.prompt,
675        label: &format!("{} (session)", config.base_label),
676        display_name: &session_info.agent_name,
677        logfile: &logfile,
678        parser_type: agent_config.json_parser,
679        env_vars: &agent_config.env_vars,
680    };
681
682    // Execute with full error handling - catch EVERYTHING
683    // Use catch_unwind to handle panics, and Result to handle I/O errors
684    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
685        crate::pipeline::run_with_prompt(&cmd, config.runtime)
686    }));
687
688    match result {
689        Ok(Ok(cmd_result)) => {
690            // Agent ran (even if it returned non-zero exit code)
691            // The caller will check if valid XML was produced
692            SessionContinuationResult::Ran {
693                exit_code: cmd_result.exit_code,
694            }
695        }
696        Ok(Err(_io_error)) => {
697            // I/O error during execution (e.g., couldn't spawn process)
698            // Fall back to normal behavior
699            SessionContinuationResult::Fallback
700        }
701        Err(_panic) => {
702            // Panic during execution (shouldn't happen, but handle it)
703            // Fall back to normal behavior
704            SessionContinuationResult::Fallback
705        }
706    }
707}
ralph_workflow/pipeline/runner.rs

ralph_workflow/pipeline/
runner.rs