ralph_workflow/pipeline/runner.rs

//! Command execution helpers and fallback orchestration.

use crate::agents::{validate_model_flag, AgentConfig, AgentRegistry, AgentRole, JsonParserType};
use crate::common::split_command;
use std::sync::Arc;

use super::fallback::{try_agent_with_retries, TryAgentResult};
use super::model_flag::resolve_model_with_provider;
use super::prompt::PipelineRuntime;

/// Build the deduplicated list of agents to try, with the primary agent first.
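///
/// A minimal sketch of the dedup behavior (illustrative; this function is
/// private, so the block below is not compiled as a doc-test):
///
/// ```ignore
/// let agents = build_agents_to_try(&["codex", "claude", "codex"], "claude");
/// assert_eq!(agents, vec!["claude", "codex"]);
/// ```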
fn build_agents_to_try<'a>(fallbacks: &'a [&'a str], primary_agent: &'a str) -> Vec<&'a str> {
    let mut agents_to_try: Vec<&'a str> = vec![primary_agent];
    for fb in fallbacks {
        if *fb != primary_agent && !agents_to_try.contains(fb) {
            agents_to_try.push(fb);
        }
    }
    agents_to_try
}

/// Get CLI model/provider overrides based on role.
fn get_cli_overrides(
    role: AgentRole,
    runtime: &PipelineRuntime<'_>,
) -> (Option<String>, Option<String>) {
    match role {
        AgentRole::Developer => (
            runtime.config.developer_model.clone(),
            runtime.config.developer_provider.clone(),
        ),
        AgentRole::Reviewer => (
            runtime.config.reviewer_model.clone(),
            runtime.config.reviewer_provider.clone(),
        ),
        AgentRole::Commit => (None, None), // The Commit role has no CLI overrides
    }
}

/// Context for building model flags.
struct ModelFlagBuildContext<'a> {
    agent_index: usize,
    cli_model_override: Option<&'a String>,
    cli_provider_override: Option<&'a String>,
    agent_config: &'a AgentConfig,
    agent_name: &'a str,
    fallback_config: &'a crate::agents::fallback::FallbackConfig,
    display_name: &'a str,
    runtime: &'a PipelineRuntime<'a>,
}

/// Build the prioritized list of model flags to try for an agent.
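///
/// Priority order: a CLI override (primary agent only, if given) replaces the
/// default slot; otherwise `None` (meaning the agent's configured `model_flag`);
/// then any configured provider fallbacks are appended.
///
/// A minimal sketch of the resulting order (illustrative; model names are
/// placeholders and this private function is not compiled as a doc-test):
///
/// ```ignore
/// // With no CLI override and two provider fallbacks configured, the result is:
/// // [None, Some("glm-4.6"), Some("glm-4.5")]
/// let flags = build_model_flags_list(&ctx);
/// ```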
fn build_model_flags_list(ctx: &ModelFlagBuildContext<'_>) -> Vec<Option<String>> {
    let mut model_flags_to_try: Vec<Option<String>> = Vec::new();

    // The CLI override takes highest priority for the primary agent.
    // A provider override can modify the model's provider prefix.
    if ctx.agent_index == 0
        && (ctx.cli_model_override.is_some() || ctx.cli_provider_override.is_some())
    {
        let resolved = resolve_model_with_provider(
            ctx.cli_provider_override.map(std::string::String::as_str),
            ctx.cli_model_override.map(std::string::String::as_str),
            ctx.agent_config.model_flag.as_deref(),
        );
        if resolved.is_some() {
            model_flags_to_try.push(resolved);
        }
    }

    // Add the agent's default model (None means use the agent's configured
    // model_flag, or no model at all).
    if model_flags_to_try.is_empty() {
        model_flags_to_try.push(None);
    }

    // Append provider fallback models for this agent.
    if ctx.fallback_config.has_provider_fallbacks(ctx.agent_name) {
        let provider_fallbacks = ctx.fallback_config.get_provider_fallbacks(ctx.agent_name);
        ctx.runtime.logger.info(&format!(
            "Agent '{}' has {} provider fallback(s) configured",
            ctx.display_name,
            provider_fallbacks.len()
        ));
        for model in provider_fallbacks {
            model_flags_to_try.push(Some(model.clone()));
        }
    }

    model_flags_to_try
}

/// Build the command string for a specific model configuration.
fn build_command_for_model(ctx: &TryModelContext<'_>, runtime: &PipelineRuntime<'_>) -> String {
    let model_ref = ctx.model_flag.map(std::string::String::as_str);
    // Yolo is always enabled for the Developer role; for Reviewer and Commit it
    // is only enabled in fix mode (detected via base_label starting with "fix").
    let yolo = matches!(ctx.role, AgentRole::Developer) || ctx.base_label.starts_with("fix");

    if ctx.agent_index == 0 && ctx.cycle == 0 && ctx.model_index == 0 {
        // For the primary agent on the first cycle, respect env var command overrides.
        match ctx.role {
            AgentRole::Developer => runtime.config.developer_cmd.clone().unwrap_or_else(|| {
                ctx.agent_config
                    .build_cmd_with_model(true, true, true, model_ref)
            }),
            AgentRole::Reviewer => runtime.config.reviewer_cmd.clone().unwrap_or_else(|| {
                ctx.agent_config
                    .build_cmd_with_model(true, true, yolo, model_ref)
            }),
            AgentRole::Commit => runtime.config.commit_cmd.clone().unwrap_or_else(|| {
                ctx.agent_config
                    .build_cmd_with_model(true, true, yolo, model_ref)
            }),
        }
    } else {
        ctx.agent_config
            .build_cmd_with_model(true, true, yolo, model_ref)
    }
}

/// GLM-specific validation for the print flag.
///
/// This validation only applies to CCS/Claude-based GLM agents that use the `-p` flag
/// for non-interactive mode. OpenCode agents are excluded because they use
/// `--auto-approve` for non-interactive mode instead.
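///
/// A minimal sketch of the warning path (illustrative; the command string is a
/// placeholder and this private function is not compiled as a doc-test):
///
/// ```ignore
/// // A GLM command without "-p" logs a warning, but only on the very first
/// // attempt (agent_index == 0, cycle == 0, model_index == 0):
/// validate_glm_print_flag("ccs/glm", &agent_config, "ccs-cli --verbose", 0, 0, 0, &runtime);
/// ```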
fn validate_glm_print_flag(
    agent_name: &str,
    agent_config: &AgentConfig,
    cmd_str: &str,
    agent_index: usize,
    cycle: u32,
    model_index: usize,
    runtime: &PipelineRuntime<'_>,
) {
    // Only validate CCS/Claude GLM agents, and only on the very first attempt.
    // is_glm_like_agent matches CCS/Claude-based GLM agents only, not OpenCode.
    if !crate::agents::is_glm_like_agent(agent_name)
        || agent_index != 0
        || cycle != 0
        || model_index != 0
    {
        return;
    }

    let cmd_argv = split_command(cmd_str).ok();
    let has_print_flag = cmd_argv
        .as_ref()
        .is_some_and(|argv| argv.iter().any(|arg| arg == "-p"));
    if !has_print_flag {
        if agent_config.print_flag.is_empty() {
            runtime.logger.warn(&format!(
                "GLM agent '{agent_name}' is missing '-p' flag: print_flag is empty in configuration. \
                 Add 'print_flag = \"-p\"' to the [ccs] section in ~/.config/ralph-workflow.toml"
            ));
        } else {
            runtime.logger.warn(&format!(
                "GLM agent '{agent_name}' may be missing the '-p' flag in its command. Check the configuration."
            ));
        }
    }
}

/// Build the label, logfile path, and suffixed display name for an execution attempt.
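///
/// A minimal sketch of the label formatting (illustrative; the logfile path
/// comes from `super::logfile::build_logfile_path`, so it is not asserted here):
///
/// ```ignore
/// let (label, _logfile, name) = build_execution_metadata(
///     Some(&"glm-4.6".to_string()), "GLM", "review #1", "ccs_glm", ".agent/logs/review_1", 0);
/// assert_eq!(label, "review #1 (GLM [glm-4.6])");
/// assert_eq!(name, "GLM [glm-4.6]");
/// ```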
fn build_execution_metadata(
    model_flag: Option<&String>,
    display_name: &str,
    base_label: &str,
    agent_name: &str,
    logfile_prefix: &str,
    model_index: usize,
) -> (String, String, String) {
    let model_suffix = model_flag.map(|m| format!(" [{m}]")).unwrap_or_default();
    let display_name_with_suffix = format!("{display_name}{model_suffix}");
    let label = format!("{base_label} ({display_name_with_suffix})");
    let logfile = super::logfile::build_logfile_path(logfile_prefix, agent_name, model_index);
    (label, logfile, display_name_with_suffix)
}

/// Result of trying a single agent.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TrySingleAgentResult {
    /// The agent succeeded - return success.
    Success,
    /// Unrecoverable error - abort immediately.
    Unrecoverable(i32),
    /// Fall back to the next agent.
    Fallback,
    /// Continue to the next model (no retry).
    NoRetry,
}

/// Context for trying a single model.
struct TryModelContext<'a> {
    agent_config: &'a AgentConfig,
    agent_name: &'a str,
    display_name: &'a str,
    agent_index: usize,
    cycle: u32,
    model_index: usize,
    role: AgentRole,
    model_flag: Option<&'a String>,
    base_label: &'a str,
    prompt: &'a str,
    logfile_prefix: &'a str,
    fallback_config: &'a crate::agents::fallback::FallbackConfig,
    output_validator: Option<crate::pipeline::fallback::OutputValidator>,
    retry_timer: Arc<dyn crate::agents::RetryTimerProvider>,
}

/// Try a single model configuration for an agent.
fn try_single_model(
    ctx: &TryModelContext<'_>,
    runtime: &mut PipelineRuntime<'_>,
) -> std::io::Result<TrySingleAgentResult> {
    let mut parser_type = ctx.agent_config.json_parser;

    if ctx.role == AgentRole::Reviewer {
        if let Some(ref parser_override) = runtime.config.reviewer_json_parser {
            parser_type = JsonParserType::parse(parser_override);
            if ctx.agent_index == 0 && ctx.cycle == 0 && ctx.model_index == 0 {
                runtime.logger.info(&format!(
                    "Using JSON parser override '{parser_override}' for reviewer"
                ));
            }
        }
    }

    let cmd_str = build_command_for_model(ctx, runtime);

    validate_glm_print_flag(
        ctx.agent_name,
        ctx.agent_config,
        &cmd_str,
        ctx.agent_index,
        ctx.cycle,
        ctx.model_index,
        runtime,
    );

    let (label, logfile, display_name_with_suffix) = build_execution_metadata(
        ctx.model_flag,
        ctx.display_name,
        ctx.base_label,
        ctx.agent_name,
        ctx.logfile_prefix,
        ctx.model_index,
    );

    let attempt_config = crate::pipeline::fallback::AgentAttemptConfig {
        agent_name: ctx.agent_name,
        model_flag: ctx.model_flag.map(std::string::String::as_str),
        label: &label,
        display_name: &display_name_with_suffix,
        cmd_str: &cmd_str,
        prompt: ctx.prompt,
        logfile: &logfile,
        logfile_prefix: ctx.logfile_prefix,
        parser_type,
        env_vars: &ctx.agent_config.env_vars,
        model_index: ctx.model_index,
        agent_index: ctx.agent_index,
        cycle: ctx.cycle as usize,
        fallback_config: ctx.fallback_config,
        output_validator: ctx.output_validator,
        retry_timer: Arc::clone(&ctx.retry_timer),
    };
    let result = try_agent_with_retries(&attempt_config, runtime)?;

    match result {
        TryAgentResult::Success => Ok(TrySingleAgentResult::Success),
        TryAgentResult::Unrecoverable(exit_code) => {
            Ok(TrySingleAgentResult::Unrecoverable(exit_code))
        }
        TryAgentResult::Fallback => Ok(TrySingleAgentResult::Fallback),
        TryAgentResult::NoRetry => Ok(TrySingleAgentResult::NoRetry),
    }
}

/// Context for trying a single agent.
struct TryAgentContext<'a> {
    agent_name: &'a str,
    agent_index: usize,
    cycle: u32,
    role: AgentRole,
    base_label: &'a str,
    prompt: &'a str,
    logfile_prefix: &'a str,
    cli_model_override: Option<&'a String>,
    cli_provider_override: Option<&'a String>,
    output_validator: Option<crate::pipeline::fallback::OutputValidator>,
    retry_timer: Arc<dyn crate::agents::RetryTimerProvider>,
}

/// Try a single agent with all of its model configurations.
fn try_single_agent(
    ctx: &TryAgentContext<'_>,
    runtime: &mut PipelineRuntime<'_>,
    registry: &AgentRegistry,
    fallback_config: &crate::agents::fallback::FallbackConfig,
) -> std::io::Result<TrySingleAgentResult> {
    let Some(agent_config) = registry.resolve_config(ctx.agent_name) else {
        runtime.logger.warn(&format!(
            "Agent '{}' not found in registry, skipping",
            ctx.agent_name
        ));
        return Ok(TrySingleAgentResult::Fallback);
    };

    let display_name = registry.display_name(ctx.agent_name);
    let flag_ctx = ModelFlagBuildContext {
        agent_index: ctx.agent_index,
        cli_model_override: ctx.cli_model_override,
        cli_provider_override: ctx.cli_provider_override,
        agent_config: &agent_config,
        agent_name: ctx.agent_name,
        fallback_config,
        display_name: &display_name,
        runtime,
    };
    let model_flags_to_try = build_model_flags_list(&flag_ctx);

    if ctx.agent_index == 0 && ctx.cycle == 0 {
        for model_flag in model_flags_to_try.iter().flatten() {
            for warning in validate_model_flag(model_flag) {
                runtime.logger.warn(&warning);
            }
        }
    }

    for (model_index, model_flag) in model_flags_to_try.iter().enumerate() {
        let model_ctx = TryModelContext {
            agent_config: &agent_config,
            agent_name: ctx.agent_name,
            display_name: &display_name,
            agent_index: ctx.agent_index,
            cycle: ctx.cycle,
            model_index,
            role: ctx.role,
            model_flag: model_flag.as_ref(),
            base_label: ctx.base_label,
            prompt: ctx.prompt,
            logfile_prefix: ctx.logfile_prefix,
            fallback_config,
            output_validator: ctx.output_validator,
            retry_timer: Arc::clone(&ctx.retry_timer),
        };
        let result = try_single_model(&model_ctx, runtime)?;

        match result {
            TrySingleAgentResult::Success => return Ok(TrySingleAgentResult::Success),
            TrySingleAgentResult::Unrecoverable(exit_code) => {
                return Ok(TrySingleAgentResult::Unrecoverable(exit_code))
            }
            TrySingleAgentResult::Fallback => return Ok(TrySingleAgentResult::Fallback),
            TrySingleAgentResult::NoRetry => {}
        }
    }

    Ok(TrySingleAgentResult::NoRetry)
}

/// Configuration for running with fallback.
pub struct FallbackConfig<'a, 'b> {
    pub role: AgentRole,
    pub base_label: &'a str,
    pub prompt: &'a str,
    pub logfile_prefix: &'a str,
    pub runtime: &'a mut PipelineRuntime<'b>,
    pub registry: &'a AgentRegistry,
    pub primary_agent: &'a str,
    pub output_validator: Option<crate::pipeline::fallback::OutputValidator>,
}

/// Run a command with automatic fallback to alternative agents on failure.
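///
/// A minimal usage sketch (illustrative; assumes `runtime`, `registry`, and
/// `prompt` were already constructed by the surrounding pipeline):
///
/// ```ignore
/// let exit = run_with_fallback(
///     AgentRole::Reviewer,
///     "review #1",
///     &prompt,
///     ".agent/logs/review_1",
///     &mut runtime,
///     &registry,
///     "ccs/glm",
/// )?;
/// // 0 means some agent in the chain succeeded; non-zero means an unrecoverable
/// // error occurred or every agent was exhausted.
/// ```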
#[cfg_attr(not(test), allow(dead_code))] // Used by tests
pub fn run_with_fallback(
    role: AgentRole,
    base_label: &str,
    prompt: &str,
    logfile_prefix: &str,
    runtime: &mut PipelineRuntime<'_>,
    registry: &AgentRegistry,
    primary_agent: &str,
) -> std::io::Result<i32> {
    let mut config = FallbackConfig {
        role,
        base_label,
        prompt,
        logfile_prefix,
        runtime,
        registry,
        primary_agent,
        output_validator: None,
    };
    run_with_fallback_internal(&mut config)
}

/// Run a command with automatic fallback to alternative agents on failure.
///
/// Includes an optional output validator callback that checks whether the agent
/// produced valid output after exiting with code 0. If validation fails, the
/// fallback chain is triggered.
///
/// This variant takes a `FallbackConfig` directly for cases where you need
/// to specify an output validator.
pub fn run_with_fallback_and_validator(
    config: &mut FallbackConfig<'_, '_>,
) -> std::io::Result<i32> {
    run_with_fallback_internal(config)
}

/// Run a command with automatic fallback to alternative agents on failure.
///
/// Includes an optional output validator callback that checks whether the agent
/// produced valid output after exiting with code 0. If validation fails, the
/// fallback chain is triggered.
fn run_with_fallback_internal(config: &mut FallbackConfig<'_, '_>) -> std::io::Result<i32> {
    let fallback_config = config.registry.fallback_config();
    let fallbacks = config.registry.available_fallbacks(config.role);
    if fallback_config.has_fallbacks(config.role) {
        config.runtime.logger.info(&format!(
            "Agent fallback chain for {}: {}",
            config.role,
            fallbacks.join(", ")
        ));
    } else {
        config.runtime.logger.info(&format!(
            "No configured fallbacks for {}, using primary only",
            config.role
        ));
    }

    let agents_to_try = build_agents_to_try(&fallbacks, config.primary_agent);
    let (cli_model_override, cli_provider_override) =
        get_cli_overrides(config.role, config.runtime);

    for cycle in 0..fallback_config.max_cycles {
        if cycle > 0 {
            let backoff_ms = fallback_config.calculate_backoff(cycle - 1);
            config.runtime.logger.info(&format!(
                "Cycle {}/{}: All agents exhausted, waiting {}ms before retry (exponential backoff)...",
                cycle + 1,
                fallback_config.max_cycles,
                backoff_ms
            ));
            config
                .registry
                .retry_timer()
                .sleep(std::time::Duration::from_millis(backoff_ms));
        }

        for (agent_index, agent_name) in agents_to_try.iter().enumerate() {
            let ctx = TryAgentContext {
                agent_name,
                agent_index,
                cycle,
                role: config.role,
                base_label: config.base_label,
                prompt: config.prompt,
                logfile_prefix: config.logfile_prefix,
                cli_model_override: cli_model_override.as_ref(),
                cli_provider_override: cli_provider_override.as_ref(),
                output_validator: config.output_validator,
                retry_timer: config.registry.retry_timer(),
            };
            let result = try_single_agent(&ctx, config.runtime, config.registry, fallback_config)?;

            match result {
                TrySingleAgentResult::Success => return Ok(0),
                TrySingleAgentResult::Unrecoverable(exit_code) => return Ok(exit_code),
                TrySingleAgentResult::Fallback | TrySingleAgentResult::NoRetry => {}
            }
        }
    }

    config.runtime.logger.error(&format!(
        "All agents exhausted after {} cycles with exponential backoff",
        fallback_config.max_cycles
    ));
    Ok(1)
}

// ============================================================================
// Session Continuation for XSD Retries
// ============================================================================
//
// Session continuation allows XSD validation retries to continue the same
// agent session, so the AI retains memory of its previous reasoning.
//
// DESIGN PRINCIPLE: Session continuation is an OPTIMIZATION, not a requirement.
// It must be completely fault-tolerant:
//
// 1. If session continuation produces output (regardless of exit code) -> use it
// 2. If it fails for ANY reason (segfault, crash, invalid session, I/O error,
//    timeout, or any other failure) -> silently fall back to normal behavior
//
// The fallback chain must NEVER be affected by session continuation failures.
// Even a segfaulting agent during session continuation must not break anything.
//
// IMPORTANT: Some AI agents have quirky behavior where they return non-zero exit
// codes but still produce valid XML. For example, an agent might output valid XML
// with status="partial" and then exit with code 1. We should still use that XML.
// The caller is responsible for checking whether valid XML exists in the log file.

/// Result of attempting session continuation.
#[derive(Debug)]
pub enum SessionContinuationResult {
    /// Session continuation ran (the agent was invoked).
    /// Contains the log file path for output extraction.
    /// NOTE: This does NOT mean the agent succeeded - the caller must check
    /// the log file for valid output. Some agents produce valid XML even
    /// when returning non-zero exit codes.
    #[allow(dead_code)] // logfile is kept for debugging/future use
    Ran { logfile: String, exit_code: i32 },
    /// Session continuation failed to run or was not attempted.
    /// The caller should fall back to normal `run_with_fallback`.
    Fallback,
}

/// Configuration for an XSD retry with optional session continuation.
pub struct XsdRetryConfig<'a, 'b> {
    /// Agent role for the retry.
    pub role: AgentRole,
    /// Label for logging (e.g., "planning #1").
    pub base_label: &'a str,
    /// The prompt to send.
    pub prompt: &'a str,
    /// Log file prefix (e.g., ".agent/logs/planning_1").
    pub logfile_prefix: &'a str,
    /// Pipeline runtime for logging and timing.
    pub runtime: &'a mut PipelineRuntime<'b>,
    /// Agent registry for resolving agent configs.
    pub registry: &'a AgentRegistry,
    /// Primary agent name.
    pub primary_agent: &'a str,
    /// Optional session info from a previous run.
    /// If provided and valid, session continuation will be attempted first.
    pub session_info: Option<&'a crate::pipeline::session::SessionInfo>,
    /// Retry number (0 = first attempt, 1+ = XSD retries).
    pub retry_num: usize,
    /// Optional output validator to check whether the agent produced valid output.
    /// Used by the review phase to validate JSON output extraction.
    pub output_validator: Option<crate::pipeline::fallback::OutputValidator>,
}

/// Run an XSD retry with optional session continuation.
///
/// This function attempts session continuation first (if session info is available),
/// and falls back to normal `run_with_fallback` if:
/// - No session info is available
/// - The agent doesn't support session continuation
/// - Session continuation fails to even start (I/O error, panic, etc.)
///
/// # Important: Quirky Agent Behavior
///
/// Some AI agents return non-zero exit codes but still produce valid XML output.
/// For example, an agent might output valid XML with status="partial" and then
/// exit with code 1. This function does NOT treat non-zero exit codes as failures
/// for session continuation - it returns the exit code and lets the caller check
/// whether valid XML was produced.
///
/// # Fault Tolerance
///
/// This function is designed to be completely fault-tolerant. Even if the agent
/// segfaults during session continuation, this function will catch the error and
/// fall back to normal behavior. The fallback chain is NEVER affected.
///
/// # Arguments
///
/// * `config` - XSD retry configuration
///
/// # Returns
///
/// * `Ok(exit_code)` - The agent's exit code (may be non-zero even with valid output)
/// * `Err(_)` - I/O error (only from the fallback path, never from session continuation)
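///
/// # Usage sketch
///
/// A minimal sketch (illustrative; assumes `runtime`, `registry`, `prompt`, and
/// `previous_session` were already produced by the surrounding pipeline):
///
/// ```ignore
/// let mut cfg = XsdRetryConfig {
///     role: AgentRole::Developer,
///     base_label: "planning #1",
///     prompt: &prompt,
///     logfile_prefix: ".agent/logs/planning_1",
///     runtime: &mut runtime,
///     registry: &registry,
///     primary_agent: "ccs/glm",
///     session_info: previous_session.as_ref(),
///     retry_num: 1,
///     output_validator: None,
/// };
/// let exit_code = run_xsd_retry_with_session(&mut cfg)?;
/// // Even a non-zero exit code may have produced valid XML in the log file;
/// // the caller must check before declaring failure.
/// ```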
pub fn run_xsd_retry_with_session(config: &mut XsdRetryConfig<'_, '_>) -> std::io::Result<i32> {
    // Try session continuation first (if we have session info and this is a retry)
    if config.retry_num > 0 {
        if let Some(session_info) = config.session_info {
            // Log the session continuation attempt
            config.runtime.logger.info(&format!(
                "  Attempting session continuation with {} (session: {}...)",
                session_info.agent_name,
                &session_info.session_id[..8.min(session_info.session_id.len())]
            ));
            match try_session_continuation(config, session_info) {
                SessionContinuationResult::Ran {
                    logfile: _,
                    exit_code,
                } => {
                    // Session continuation ran - the agent was invoked and produced a log file.
                    // Return the exit code; the caller will check for valid XML.
                    // Even if exit_code != 0, there might be valid XML in the log.
                    config
                        .runtime
                        .logger
                        .info("  Session continuation ran; caller will validate output");
                    return Ok(exit_code);
                }
                SessionContinuationResult::Fallback => {
                    // Session continuation failed to start - fall through to normal behavior
                    config
                        .runtime
                        .logger
                        .warn("  Session continuation failed, falling back to new session");
                }
            }
        } else {
            config
                .runtime
                .logger
                .warn("  No session info available for retry, starting new session");
        }
    }

    // Normal fallback path (first attempt, or session continuation failed to start)
    let mut fallback_config = FallbackConfig {
        role: config.role,
        base_label: config.base_label,
        prompt: config.prompt,
        logfile_prefix: config.logfile_prefix,
        runtime: config.runtime,
        registry: config.registry,
        primary_agent: config.primary_agent,
        output_validator: config.output_validator,
    };
    run_with_fallback_and_validator(&mut fallback_config)
}

/// Attempt session continuation with full fault tolerance.
///
/// This function catches ALL errors and returns `Fallback` instead of propagating them.
/// Even segfaults are handled gracefully (they surface as non-zero exit codes or I/O errors).
///
/// # Returns
///
/// - `Ran { logfile, exit_code }` if the agent was successfully invoked (even if it crashed)
/// - `Fallback` if session continuation couldn't even start
fn try_session_continuation(
    config: &mut XsdRetryConfig<'_, '_>,
    session_info: &crate::pipeline::session::SessionInfo,
) -> SessionContinuationResult {
    // The agent name from session_info should already be the registry name
    // (e.g., "ccs/glm", "opencode/anthropic/claude-sonnet-4") when passed from
    // the calling code. For backwards compatibility and robustness, we still try
    // to resolve it in case it's a sanitized name from an old log file.
    let registry_name = config
        .registry
        .resolve_from_logfile_name(&session_info.agent_name)
        .unwrap_or_else(|| session_info.agent_name.clone());

    // Check whether the agent supports session continuation
    let agent_config = match config.registry.resolve_config(&registry_name) {
        Some(cfg) => cfg,
        None => {
            // Agent not found - fall back silently
            return SessionContinuationResult::Fallback;
        }
    };

    if !agent_config.supports_session_continuation() {
        // Agent doesn't support session continuation - fall back silently
        return SessionContinuationResult::Fallback;
    }

    // Build the command with the session continuation flag
    let yolo = matches!(config.role, AgentRole::Developer);
    let cmd_str = agent_config.build_cmd_with_session(
        true, // output (JSON)
        yolo, // yolo mode
        true, // verbose
        None, // model override
        Some(&session_info.session_id),
    );

    // Build the log file path - use a unique name to avoid overwriting previous logs.
    // Sanitize the agent name to avoid creating subdirectories from slashes.
    let sanitized_agent = super::logfile::sanitize_agent_name(&session_info.agent_name);
    let logfile = format!(
        "{}_{}_session_{}.log",
        config.logfile_prefix, sanitized_agent, config.retry_num
    );

    // Log the attempt (only at debug verbosity, since this is an optimization)
    if config.runtime.config.verbosity.is_debug() {
        config.runtime.logger.info(&format!(
            "  Attempting session continuation with {} (session: {})",
            session_info.agent_name, session_info.session_id
        ));
    }

    // Create the prompt command
    let cmd = crate::pipeline::PromptCommand {
        cmd_str: &cmd_str,
        prompt: config.prompt,
        label: &format!("{} (session)", config.base_label),
        display_name: &session_info.agent_name,
        logfile: &logfile,
        parser_type: agent_config.json_parser,
        env_vars: &agent_config.env_vars,
    };

    // Execute with full error handling - catch EVERYTHING.
    // Use catch_unwind to handle panics, and Result to handle I/O errors.
    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
        crate::pipeline::run_with_prompt(&cmd, config.runtime)
    }));

    match result {
        Ok(Ok(cmd_result)) => {
            // The agent ran (even if it returned a non-zero exit code).
            // The caller will check whether valid XML was produced.
            SessionContinuationResult::Ran {
                logfile,
                exit_code: cmd_result.exit_code,
            }
        }
        Ok(Err(_io_error)) => {
            // I/O error during execution (e.g., couldn't spawn the process).
            // Fall back to normal behavior.
            SessionContinuationResult::Fallback
        }
        Err(_panic) => {
            // Panic during execution (shouldn't happen, but handle it).
            // Fall back to normal behavior.
            SessionContinuationResult::Fallback
        }
    }
}

#[cfg(test)]
mod tests {
    use std::sync::Mutex;

    static ENV_MUTEX: Mutex<()> = Mutex::new(());

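    /// RAII guard that restores an environment variable to its previous value
    /// (or removes it) when dropped. `set_multiple` serializes env mutation
    /// through `ENV_MUTEX`; note that the lock is released when `set_multiple`
    /// returns, not when the guards drop.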
    struct EnvGuard {
        key: &'static str,
        prev: Option<std::ffi::OsString>,
    }

    impl EnvGuard {
        fn set_multiple(vars: &[(&'static str, &str)]) -> Vec<Self> {
            let _lock = ENV_MUTEX
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            vars.iter()
                .map(|&(key, value)| {
                    let prev = std::env::var_os(key);
                    std::env::set_var(key, value);
                    Self { key, prev }
                })
                .collect()
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            match self.prev.take() {
                Some(v) => std::env::set_var(self.key, v),
                None => std::env::remove_var(self.key),
            }
        }
    }

    /// Test that environment variable sanitization works correctly.
    ///
    /// This regression test ensures that when running agents with empty `env_vars`,
    /// GLM/CCS environment variables from the parent shell are NOT passed to
    /// the subprocess. This is critical for preventing "Invalid API key" errors
    /// when switching between GLM (CCS) and standard Claude agents.
    ///
    /// The test:
    /// 1. Sets GLM-like environment variables in the test process
    /// 2. Creates a Command as it would be used for an agent with empty `env_vars`
    /// 3. Verifies that the problematic Anthropic env vars are cleared
    #[test]
    fn test_runner_sanitizes_anthropic_env_vars() {
        // Anthropic environment variables to sanitize
        const ANTHROPIC_ENV_VARS_TO_SANITIZE: &[&str] = &[
            "ANTHROPIC_API_KEY",
            "ANTHROPIC_BASE_URL",
            "ANTHROPIC_AUTH_TOKEN",
            "ANTHROPIC_MODEL",
            "ANTHROPIC_DEFAULT_HAIKU_MODEL",
            "ANTHROPIC_DEFAULT_OPUS_MODEL",
            "ANTHROPIC_DEFAULT_SONNET_MODEL",
        ];

        let _guard = EnvGuard::set_multiple(&[
            ("ANTHROPIC_API_KEY", "test-token-glm"),
            ("ANTHROPIC_BASE_URL", "https://glm.example.com"),
        ]);

        // Simulate running an agent with empty env_vars (like codex).
        // The ANTHROPIC_* vars should be sanitized from the parent environment.
        let mut cmd = std::process::Command::new("printenv");
        for &var in ANTHROPIC_ENV_VARS_TO_SANITIZE {
            cmd.env_remove(var);
        }

        // Execute the command and check that the GLM variables are NOT present
        let output = match cmd.output() {
            Ok(o) => o,
            Err(e) => {
                // printenv might not be available on all systems
                eprintln!("Skipping test: printenv not available ({e})");
                return;
            }
        };
        let stdout = String::from_utf8_lossy(&output.stdout);

        // The GLM-set variables should NOT be in the subprocess environment
        // (they were sanitized by env_remove)
        assert!(!stdout.contains("test-token-glm"));
        assert!(!stdout.contains("https://glm.example.com"));
    }

    #[test]
    fn test_runner_does_not_sanitize_explicit_env_vars() {
        // If an agent explicitly sets ANTHROPIC_API_KEY in its env_vars,
        // that value should NOT be sanitized.

        let mut cmd = std::process::Command::new("printenv");

        // Simulate an agent setting its own ANTHROPIC_API_KEY
        let agent_env_vars =
            std::collections::HashMap::from([("ANTHROPIC_API_KEY", "agent-specific-key")]);

        // First, sanitize all Anthropic vars
        for &var in &["ANTHROPIC_API_KEY", "ANTHROPIC_BASE_URL"] {
            cmd.env_remove(var);
        }

        // Then apply the agent's env_vars (which should NOT be sanitized)
        for (key, value) in &agent_env_vars {
            cmd.env(key, value);
        }

        let output = match cmd.output() {
            Ok(o) => o,
            Err(e) => {
                // printenv might not be available on all systems
                eprintln!("Skipping test: printenv not available ({e})");
                return;
            }
        };
        let stdout = String::from_utf8_lossy(&output.stdout);

        // The agent-specific key should be present
        assert!(stdout.contains("agent-specific-key"));
    }
}