// omne-cli 0.2.0 - Docs.rs

//! Node executor (plan Unit 11).
//!
//! `dispatch(node, ctx)` is the single entry point the DAG runner in
//! Unit 12 calls for each `ready()` node. Responsibilities:
//!
//! 1. Emit `node.started` before any subprocess spawn.
//! 2. Dispatch to the node's kind — bash, AI (command / prompt), or
//!    loop — and run the subprocess under a wall-clock deadline.
//! 3. For loops, iterate under a sentinel [`Scanner`] that watches for
//!    `BLOCKED` (always reserved) plus the user's `until` token;
//!    session lifecycle follows the Unit 0 spike (`--session-id` first
//!    iteration, `--resume` thereafter, UUID not ULID).
//! 4. If the node carries a `gate`, run the platform-current hook
//!    script under a 60s deadline after a successful node terminates.
//! 5. Emit exactly one terminal event (`node.completed` or
//!    `node.failed`) and, when a gate ran, a `gate.passed` event
//!    between the node's body completing and the terminal event.
//!
//! The executor is the sole owner of node-level deadline handling —
//! `claude_proc::ClaudeProcess` deliberately has no per-call timeout so
//! that this module can decide, for example, that a loop's wall-clock
//! budget covers all iterations rather than each one independently.

#![allow(dead_code)]

use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::sync::mpsc;
use std::thread;
use std::time::{Duration, Instant};

use ulid::Ulid;
use uuid::Uuid;

use crate::claude_proc::{self, CaptureMode, ChildKiller, ClaudeProcess, Session, SpawnOpts};
use crate::event_log::EventLog;
use crate::events::{
    ErrorKind, Event, GateMethod, GatePassed, Input, IterationStarted, NodeCompleted, NodeError,
    NodeFailed, NodeKind, NodeStarted,
};
use crate::pipe::{ExecutionKind, LoopBody, Node};
use crate::sentinel::{self, Scanner};
use crate::volume;

/// Default node wall-clock budget when a pipe node declares no
/// `timeout:`. Picked to match the memory-locked 1800s default called
/// out in the plan's "Open questions deferred to implementation" block.
///
/// 1800 seconds is 30 minutes. [`ExecutorContext::new`] copies this
/// value into `default_node_timeout`.
pub const DEFAULT_NODE_TIMEOUT: Duration = Duration::from_secs(1800);

/// Gate hook deadline, per plan R12.
///
/// [`ExecutorContext::new`] copies this value into `gate_timeout`;
/// callers that need a shorter deadline override that field instead of
/// this constant.
pub const GATE_TIMEOUT: Duration = Duration::from_secs(60);

/// Inputs the executor needs that span every dispatch call on one run.
///
/// Built once by Unit 12's `omne run` handler and threaded through every
/// node. Borrowed data everywhere; the executor never needs ownership.
/// All fields are public, so callers can adjust the optional knobs
/// (`default_model`, `claude_bin`, the two timeouts) after calling
/// [`ExecutorContext::new`].
pub struct ExecutorContext<'a> {
    /// `.omne/` volume root. Resolves worktree + events paths.
    pub volume_root: &'a Path,
    /// `run_id` (e.g. `feature-01arz3ndektsv4rrffq69g5fa0`).
    pub run_id: &'a str,
    /// Per-run worktree cwd (`.omne/wt/<run_id>`). All subprocesses
    /// spawn with this as their cwd.
    pub worktree: &'a Path,
    /// Append-only log; executor emits `node.*` + `gate.passed` through
    /// it.
    pub event_log: &'a EventLog,
    /// `--input k=v` pairs. Bash nodes export these as
    /// `OMNE_INPUT_<KEY>`; AI nodes receive them via the gate hook env
    /// and the pipe-level prompt template (not interpolated at the
    /// runner in v1 — the pipe body references them).
    ///
    /// NOTE(review): the bash path upper-cases each key before
    /// exporting it. The visible part of `run_gate` sets only
    /// `OMNE_RUN_ID`, `OMNE_NODE_ID`, `OMNE_GATE_NAME` and
    /// `OMNE_VOLUME_ROOT` — confirm whether inputs really reach the
    /// gate hook env.
    pub inputs: &'a [Input],
    /// Pipe-level `default_model:`. AI nodes inherit it unless they
    /// specify their own `model:`.
    pub default_model: Option<&'a str>,
    /// Optional `claude` binary override, mainly a test seam.
    pub claude_bin: Option<&'a Path>,
    /// Wall-clock budget when a node does not override `timeout:`.
    /// Defaults to [`DEFAULT_NODE_TIMEOUT`].
    pub default_node_timeout: Duration,
    /// Wall-clock budget for gate hook execution. Defaults to
    /// [`GATE_TIMEOUT`] (60s per plan R12). Exposed on the context so
    /// integration tests can drive a short gate-timeout path without
    /// waiting a full minute.
    pub gate_timeout: Duration,
}

impl<'a> ExecutorContext<'a> {
    /// Convenience constructor that fills in the plan-default wall-clock
    /// budgets. `worktree` is typically `.omne/wt/<run_id>` — the
    /// runner computes it once and threads it through.
    pub fn new(
        volume_root: &'a Path,
        run_id: &'a str,
        worktree: &'a Path,
        event_log: &'a EventLog,
        inputs: &'a [Input],
    ) -> Self {
        Self {
            volume_root,
            run_id,
            worktree,
            event_log,
            inputs,
            default_model: None,
            claude_bin: None,
            default_node_timeout: DEFAULT_NODE_TIMEOUT,
            gate_timeout: GATE_TIMEOUT,
        }
    }
}

/// Result of dispatching one node.
///
/// `Completed` and `Failed` map 1:1 to `node.completed` and
/// `node.failed` in the event log; the runner threads the variant
/// through `dag::Scheduler::mark`.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum NodeOutcome {
    /// The node body finished successfully and, if the node carries a
    /// gate, the gate did not fail either.
    Completed,
    /// The node body or its gate failed.
    Failed {
        /// Failure category (timeout, crash, blocked, gate failure, …).
        kind: ErrorKind,
        /// Optional human-readable detail, e.g. a stderr tail or the
        /// text of a dispatch error.
        message: Option<String>,
    },
}

/// Run a single node from start to terminal event.
///
/// Sequence: resolve the node's execution kind, append `node.started`,
/// run the body plus any attached gate, then append exactly one
/// terminal event (`node.completed` or `node.failed`). The returned
/// [`NodeOutcome`] mirrors that terminal event so the runner can feed
/// the scheduler without re-reading the event log.
///
/// # Errors
///
/// Returns [`DispatchError`] when the node has no execution kind, or
/// when appending `node.started` or the terminal event fails. Failures
/// inside the body or gate are reported through the `Ok` outcome.
pub fn dispatch(node: &Node, ctx: &ExecutorContext<'_>) -> Result<NodeOutcome, DispatchError> {
    let kind = match node.execution_kind() {
        Some(kind) => kind,
        None => {
            return Err(DispatchError::InvalidNode {
                node_id: node.id.clone(),
                reason: "node has no execution kind (validator should have caught this)".into(),
            })
        }
    };

    // Translate the pipe-level kind into its event-log counterpart.
    let started_kind = match kind {
        ExecutionKind::Bash => NodeKind::Bash,
        ExecutionKind::Loop => NodeKind::Loop,
        ExecutionKind::Command => NodeKind::Command,
        ExecutionKind::Prompt => NodeKind::Prompt,
    };

    let started = Event::NodeStarted(NodeStarted {
        id: new_event_id(),
        ts: iso_utc_now(),
        run_id: ctx.run_id.to_string(),
        node_id: node.id.clone(),
        kind: started_kind,
        name: None,
        model: effective_model(node, ctx).map(str::to_string),
    });
    ctx.event_log
        .append(&started)
        .map_err(DispatchError::from)?;

    // From here on, infrastructure errors in the body or gate are
    // folded into a failed outcome, so `node.started` is always paired
    // with exactly one terminal event. The only way to orphan it is
    // for `emit_terminal` itself to fail — at that point the event log
    // is unwritable and no compensation is possible.
    let outcome = body_with_gate(node, kind, ctx);
    emit_terminal(node, &outcome, ctx)?;
    Ok(outcome)
}

/// Execute the node body and, when it completes, its gate; the result
/// is always safe to emit as a terminal event.
///
/// Dispatch-level infrastructure errors never escape this function:
///
/// * an error from the body (failed spawn, event-log write, capture
///   I/O) becomes `Failed { kind: Crash }`;
/// * an error from the gate becomes `Failed { kind: GateFailed }`.
///
/// In both cases the error text is embedded in the message, so the
/// recorded event stream never contains a dangling `node.started`. The
/// gate is skipped when the body itself failed.
fn body_with_gate(node: &Node, kind: ExecutionKind, ctx: &ExecutorContext<'_>) -> NodeOutcome {
    let body_result = match kind {
        ExecutionKind::Loop => run_loop(node, ctx),
        ExecutionKind::Bash => run_bash(node, ctx),
        ExecutionKind::Command | ExecutionKind::Prompt => run_ai(node, ctx),
    };

    match body_result {
        Ok(NodeOutcome::Completed) => {}
        Ok(failed @ NodeOutcome::Failed { .. }) => return failed,
        Err(e) => {
            return NodeOutcome::Failed {
                kind: ErrorKind::Crash,
                message: Some(format!("dispatch error in node body: {e}")),
            }
        }
    }

    // Body completed: the gate, if any, decides the final outcome.
    let gate = match node.gate.as_deref() {
        Some(gate) => gate,
        None => return NodeOutcome::Completed,
    };
    run_gate(node, gate, ctx).unwrap_or_else(|e| NodeOutcome::Failed {
        kind: ErrorKind::GateFailed,
        message: Some(format!("dispatch error in gate: {e}")),
    })
}

// ── Bash path ──────────────────────────────────────────────────────

/// Run a `bash:` node in the per-run worktree.
///
/// Every `--input` pair is exported to the child as
/// `OMNE_INPUT_<UPPERCASED KEY>`. The shell runs under the node's
/// wall-clock budget with stdin closed, and its stdout is written to
/// the node's capture file. The exit is then classified by
/// `outcome_from_exit`.
///
/// # Errors
///
/// Returns [`DispatchError`] when the node has no `bash:` body, the
/// capture directory cannot be prepared, or the subprocess runner
/// itself fails.
fn run_bash(node: &Node, ctx: &ExecutorContext<'_>) -> Result<NodeOutcome, DispatchError> {
    let script = match node.bash.as_deref() {
        Some(script) => script,
        None => {
            return Err(DispatchError::InvalidNode {
                node_id: node.id.clone(),
                reason: "bash node missing bash: body".into(),
            })
        }
    };

    let capture = node_capture_path(ctx, &node.id);
    ensure_parent_dir(&capture)?;

    let mut shell = bash_command(script);
    shell.current_dir(ctx.worktree);
    shell.stdin(Stdio::null());
    shell.stdout(Stdio::piped());
    shell.stderr(Stdio::piped());
    shell.envs(ctx.inputs.iter().map(|input| {
        (
            format!("OMNE_INPUT_{}", input.key.to_uppercase()),
            &input.value,
        )
    }));

    let budget = node_timeout(node, ctx);
    let raw = run_command_with_timeout(&mut shell, budget, Some(&capture))?;
    Ok(outcome_from_exit(raw))
}

/// Build the Windows shell invocation for a `bash:` node body.
///
/// The returned command runs `cmd` with the single raw argument
/// `/S /C "<body>"`. The body is inserted verbatim; see the comment
/// inside for why Rust's argument escaper is bypassed.
#[cfg(windows)]
fn bash_command(body: &str) -> Command {
    use std::os::windows::process::CommandExt;
    let mut cmd = Command::new("cmd");
    // Mirror `tests/claude_proc_stub.rs`: `raw_arg` bypasses Rust's
    // default Windows argument escaper, which would double-escape any
    // embedded double quotes that `cmd /S /C "..."` then mis-parses.
    // `/S` tells `cmd.exe` to strip exactly the outer pair of quotes
    // before interpreting the rest, so the body is forwarded verbatim
    // (preserving `"` characters inside the body).
    //
    // We cannot neutralise cmd's own metacharacter re-parsing (`&`,
    // `|`, `%VAR%`) without switching to PowerShell or WSL — that is
    // documented deferred scope. The concrete bug this mitigates is
    // Rust-level quote mangling, which would silently corrupt bodies
    // containing double quotes regardless of cmd behaviour.
    cmd.raw_arg(format!("/S /C \"{body}\""));
    cmd
}

/// Build the non-Windows shell invocation for a `bash:` node body:
/// `sh -c <body>`, with the body passed as one argument.
#[cfg(not(windows))]
fn bash_command(body: &str) -> Command {
    let mut shell = Command::new("sh");
    shell.args(["-c", body]);
    shell
}

// ── AI path (command / prompt) ────────────────────────────────────

/// Run a single-shot AI node (`command:` or `prompt:`).
///
/// One `claude` invocation is made under the node's wall-clock budget,
/// with the capture file truncated first. No `until` tokens are
/// registered, so any sentinel hit is treated as `BLOCKED`.
///
/// # Errors
///
/// Returns [`DispatchError`] when the node carries no prompt or
/// command, the capture directory cannot be prepared, or the
/// invocation fails at the infrastructure level.
fn run_ai(node: &Node, ctx: &ExecutorContext<'_>) -> Result<NodeOutcome, DispatchError> {
    let prompt = ai_prompt_for(node)?;
    let capture = node_capture_path(ctx, &node.id);
    ensure_parent_dir(&capture)?;

    let opts = build_spawn_opts(node, ctx, prompt, None);
    let budget = node_timeout(node, ctx);
    let result = run_claude_iteration(&opts, &capture, CaptureMode::Truncate, budget, &[])?;

    let outcome = match result {
        ClaudeOutcome::CleanExit => NodeOutcome::Completed,
        // Only registered sentinel for non-loop AI nodes is BLOCKED.
        ClaudeOutcome::SentinelHit { .. } => NodeOutcome::Failed {
            kind: ErrorKind::Blocked,
            message: Some("assistant emitted BLOCKED".into()),
        },
        failure => claude_failure(failure, node, budget),
    };
    Ok(outcome)
}

/// Map the failure arms of [`ClaudeOutcome`] (`Timeout`, `HostMissing`,
/// `Crash`) onto [`NodeOutcome::Failed`].
///
/// `run_ai` and `run_loop` both call this so their error-message
/// shapes do not drift. Clean exits and sentinel hits mean different
/// things to each caller (the AI path treats any hit as BLOCKED; the
/// loop path discriminates BLOCKED vs `until`), so those arms stay
/// inline at the call sites.
///
/// # Panics
///
/// Panics if called with `CleanExit` or `SentinelHit`; callers must
/// handle those arms themselves.
fn claude_failure(outcome: ClaudeOutcome, node: &Node, budget: Duration) -> NodeOutcome {
    let (kind, message) = match outcome {
        ClaudeOutcome::Timeout => (
            ErrorKind::Timeout,
            format!(
                "node {} exceeded timeout of {}s",
                node.id,
                budget.as_secs()
            ),
        ),
        ClaudeOutcome::HostMissing => (
            ErrorKind::HostMissing,
            String::from("claude binary not found on PATH"),
        ),
        ClaudeOutcome::Crash { stderr_tail } => (ErrorKind::Crash, stderr_tail),
        ClaudeOutcome::CleanExit | ClaudeOutcome::SentinelHit { .. } => {
            unreachable!("claude_failure invoked on a non-failure outcome");
        }
    };
    NodeOutcome::Failed {
        kind,
        message: Some(message),
    }
}

/// Resolve the prompt text for a single-shot AI node.
///
/// `prompt:` wins over `command:`. A command is turned into
/// `/<command>` (v1 convention: commands map to Claude Code slash
/// commands).
///
/// # Errors
///
/// Returns `DispatchError::InvalidNode` when the node has neither
/// field. The reason text distinguishes a mis-routed loop node from an
/// empty AI node.
fn ai_prompt_for(node: &Node) -> Result<String, DispatchError> {
    let reason = match (&node.prompt, &node.command) {
        (Some(prompt), _) => return Ok(prompt.clone()),
        (None, Some(command)) => return Ok(format!("/{command}")),
        (None, None) if node.loop_.is_some() => {
            "loop body should route through run_loop, not run_ai"
        }
        (None, None) => "AI node carried neither prompt: nor command:",
    };
    Err(DispatchError::InvalidNode {
        node_id: node.id.clone(),
        reason: reason.into(),
    })
}

// ── Loop path ────────────────────────────────────────────────────────

fn run_loop(node: &Node, ctx: &ExecutorContext<'_>) -> Result<NodeOutcome, DispatchError> {
    let body = node
        .loop_
        .as_ref()
        .ok_or_else(|| DispatchError::InvalidNode {
            node_id: node.id.clone(),
            reason: "loop dispatch on a non-loop node".into(),
        })?;

    let prompt = loop_body_prompt(node, body)?;
    let capture_path = node_capture_path(ctx, &node.id);
    ensure_parent_dir(&capture_path)?;

    // Pre-allocate the session UUID when the loop wants a resumed
    // session. `claude --session-id` requires a UUID (spike-locked).
    let session_uuid = if !body.fresh_context {
        Some(Uuid::new_v4().to_string())
    } else {
        None
    };

    let until_tokens: Vec<String> = vec![body.until.clone()];
    let budget = node_timeout(node, ctx);
    let deadline = Instant::now() + budget;

    for iter in 1..=body.max_iterations {
        // Deadline first: do not mutate the capture file after the
        // wall-clock budget has expired. This prevents dangling
        // `--- iteration N ---` headers with no body below on
        // timeout: 0 / near-exhausted-budget runs.
        let remaining = deadline.saturating_duration_since(Instant::now());
        if remaining.is_zero() {
            return Ok(NodeOutcome::Failed {
                kind: ErrorKind::Timeout,
                message: Some(format!(
                    "loop {} exceeded timeout of {}s",
                    node.id,
                    budget.as_secs()
                )),
            });
        }

        // Iter 1 truncates any stale capture from a prior dispatch of
        // this node; subsequent iterations append below the iter-1
        // marker. StreamParser then opens the file with Append so its
        // writes land after our header.
        if iter == 1 {
            truncate_capture(&capture_path)?;
        }
        let byte_offset = write_iteration_marker(&capture_path, iter)?;
        ctx.event_log
            .append(&Event::IterationStarted(IterationStarted {
                id: new_event_id(),
                ts: iso_utc_now(),
                run_id: ctx.run_id.to_string(),
                node_id: node.id.clone(),
                iteration: iter,
                byte_offset,
            }))
            .map_err(DispatchError::from)?;

        let session = session_uuid.as_ref().map(|uuid| {
            if iter == 1 {
                Session::New(uuid.clone())
            } else {
                Session::Resume(uuid.clone())
            }
        });

        let opts = build_spawn_opts(node, ctx, prompt.clone(), session);

        let outcome = run_claude_iteration(
            &opts,
            &capture_path,
            CaptureMode::Append,
            remaining,
            &until_tokens,
        )?;

        match outcome {
            ClaudeOutcome::SentinelHit { token } => {
                // BLOCKED always takes priority over the user's
                // `until` token — even if a malformed pipe somehow
                // slipped `until: BLOCKED` past the validator.
                if token == sentinel::BLOCKED {
                    return Ok(NodeOutcome::Failed {
                        kind: ErrorKind::Blocked,
                        message: Some(format!(
                            "loop {} assistant emitted BLOCKED on iteration {iter}",
                            node.id
                        )),
                    });
                }
                if token == body.until {
                    return Ok(NodeOutcome::Completed);
                }
                // Scanner surfaces only BLOCKED or the registered
                // `until` token; no other token can reach here.
                return Err(DispatchError::InvalidNode {
                    node_id: node.id.clone(),
                    reason: format!("scanner returned unexpected token `{token}`"),
                });
            }
            ClaudeOutcome::CleanExit => {
                // Iteration ended without firing the `until` sentinel.
                // Continue to the next iteration.
            }
            failure => return Ok(claude_failure(failure, node, budget)),
        }
    }

    Ok(NodeOutcome::Failed {
        kind: ErrorKind::MaxIterationsExceeded,
        message: Some(format!(
            "loop {} exhausted {} iterations without matching `{}`",
            node.id, body.max_iterations, body.until
        )),
    })
}

/// Resolve the prompt text sent on every iteration of a loop.
///
/// The body's `prompt:` wins. Otherwise its `command:` is turned into
/// `/<command>`.
///
/// # Errors
///
/// Returns `DispatchError::InvalidNode` when the body has neither.
fn loop_body_prompt(node: &Node, body: &LoopBody) -> Result<String, DispatchError> {
    body.prompt
        .clone()
        .or_else(|| body.command.as_ref().map(|command| format!("/{command}")))
        .ok_or_else(|| DispatchError::InvalidNode {
            node_id: node.id.clone(),
            reason: "loop body carries neither prompt: nor command:".into(),
        })
}

/// Namespaced, machine-parseable iteration marker line. The `omne:`
/// prefix plus the trailing sentinel `===` make a collision with normal
/// assistant prose vanishingly unlikely (unlike the previous
/// `--- iteration N ---` form, which clashes with Markdown setext
/// headings and `git diff` hunk headers). Agents that still prefer
/// structured boundaries should read `iteration.started` from the event
/// log, which also carries the byte offset where each iteration's
/// content begins.
///
/// `write_iteration_marker` writes the complete line as
/// `=== omne:iteration:<N> ===`, preceded by a newline so the marker
/// always starts on a line of its own.
const ITERATION_MARKER_PREFIX: &str = "=== omne:iteration:";

/// Append an iteration marker line to `capture_path` and return the
/// byte offset at which the iteration's assistant output will start —
/// i.e. the file size immediately after the marker line is written.
/// Agents slice `[byte_offset_N .. byte_offset_{N+1})` to reconstruct a
/// single iteration's text without having to re-parse markers.
fn write_iteration_marker(capture_path: &Path, iter: u32) -> Result<u64, DispatchError> {
    let mut f = std::fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(capture_path)
        .map_err(|source| DispatchError::Io {
            path: capture_path.to_path_buf(),
            source,
        })?;
    writeln!(f, "\n{ITERATION_MARKER_PREFIX}{iter} ===").map_err(|source| DispatchError::Io {
        path: capture_path.to_path_buf(),
        source,
    })?;
    f.sync_data().map_err(|source| DispatchError::Io {
        path: capture_path.to_path_buf(),
        source,
    })?;
    let meta = std::fs::metadata(capture_path).map_err(|source| DispatchError::Io {
        path: capture_path.to_path_buf(),
        source,
    })?;
    Ok(meta.len())
}

/// Reset `capture_path` to an empty file, creating it when missing.
///
/// Loop iteration 1 calls this so that re-dispatching a failed loop
/// under the same `run_id` starts from a clean capture file and does
/// not stack on top of prior-run output.
///
/// # Errors
///
/// Returns `DispatchError::Io` (tagged with `capture_path`) when the
/// file cannot be created or truncated.
fn truncate_capture(capture_path: &Path) -> Result<(), DispatchError> {
    // `File::create` opens write-only, creating or truncating.
    match std::fs::File::create(capture_path) {
        Ok(_file) => Ok(()),
        Err(source) => Err(DispatchError::Io {
            path: capture_path.to_path_buf(),
            source,
        }),
    }
}

// ── Claude iteration runner (shared by AI + loop) ────────────────

/// Outcome of one `claude -p` subprocess invocation as interpreted by
/// the executor — the scanner hit, a timeout, a clean exit, or a crash.
///
/// `run_claude_iteration` checks the conditions in this order:
/// sentinel hit, timeout, stream error, unsuccessful exit status, and
/// only then clean exit.
enum ClaudeOutcome {
    /// The stream closed with no sentinel hit, no timeout, no stream
    /// error, and a successful exit status.
    CleanExit,
    /// The scanner matched `token` in the assistant output. The child
    /// is killed as soon as this happens.
    SentinelHit { token: String },
    /// The watchdog killed the child because the budget elapsed.
    Timeout,
    /// Spawning failed with `claude_proc::Error::HostMissing`.
    HostMissing,
    /// The stream reported an error or the child exited
    /// unsuccessfully. `stderr_tail` carries the tail of the child's
    /// stderr, prefixed with the stream error text when there was one.
    Crash { stderr_tail: String },
}

/// Run one `claude -p` invocation and interpret the stream line-by-line.
///
/// A background watchdog thread kills the child when `budget` elapses;
/// the main thread drains the parser, feeds each line to a
/// [`Scanner`], and returns as soon as a sentinel hits (or the stream
/// closes). `capture_mode` lets the loop controller preserve prior
/// iterations' text with [`CaptureMode::Append`].
///
/// The child is killed explicitly on a sentinel hit and on a stream
/// error. In both cases the stream is no longer being read, and the
/// watchdog is cancelled right after the loop, so nothing else would
/// bound the wait in `finish()`.
///
/// Outcome precedence: sentinel hit, then timeout, then stream error,
/// then unsuccessful exit status, then clean exit.
///
/// # Errors
///
/// Returns [`DispatchError`] when spawning fails for a reason other
/// than a missing host binary, when the process wrapper cannot be set
/// up, or when `finish()` fails. A missing binary is reported as
/// `Ok(ClaudeOutcome::HostMissing)`.
fn run_claude_iteration(
    opts: &SpawnOpts,
    capture_path: &Path,
    capture_mode: CaptureMode,
    budget: Duration,
    until_tokens: &[String],
) -> Result<ClaudeOutcome, DispatchError> {
    let child = match claude_proc::spawn(opts) {
        Ok(c) => c,
        Err(claude_proc::Error::HostMissing) => return Ok(ClaudeOutcome::HostMissing),
        Err(other) => return Err(DispatchError::from(other)),
    };

    let mut proc = ClaudeProcess::from_child_with_mode(child, capture_path, capture_mode)?;
    let killer = proc.killer();
    let (cancel_tx, cancel_rx) = mpsc::channel::<()>();
    let watchdog = spawn_watchdog(killer.clone(), cancel_rx, budget);

    let scanner = Scanner::new(until_tokens);
    let mut hit: Option<String> = None;
    let mut stream_error: Option<claude_proc::Error> = None;
    for line in proc.by_ref() {
        match line {
            Ok(al) => {
                if let Some(h) = scanner.feed(&al.text) {
                    hit = Some(h.token);
                    // Sentinel short-circuit: kill the child so the
                    // stream does not keep reading. Both `BLOCKED`
                    // (always reserved) and user `until` tokens end
                    // the iteration here.
                    let _ = killer.kill();
                    break;
                }
            }
            Err(e) => {
                // We stop draining the stream here and cancel the
                // watchdog just below. Without a kill, a child that is
                // still running would be waited on with no deadline.
                // The outcome is a crash either way, so killing does
                // not change how the iteration is classified.
                let _ = killer.kill();
                stream_error = Some(e);
                break;
            }
        }
    }

    // Signal the watchdog before waiting for the child so it doesn't
    // kill a naturally-finished process after the fact. Dropping the
    // sender signals "cancelled" via RecvTimeoutError::Disconnected.
    drop(cancel_tx);
    // A panicked watchdog is treated as "did not time out".
    let timed_out = watchdog.join().unwrap_or(false);

    let (status, stderr) = proc.finish().map_err(DispatchError::from)?;

    if let Some(token) = hit {
        return Ok(ClaudeOutcome::SentinelHit { token });
    }
    if timed_out {
        return Ok(ClaudeOutcome::Timeout);
    }
    if let Some(e) = stream_error {
        // Stream-level errors are rare — surface as a crash so the
        // runner can record the stderr tail alongside.
        return Ok(ClaudeOutcome::Crash {
            stderr_tail: format!("stream error: {e}\nstderr: {}", tail(&stderr, 1024)),
        });
    }
    if !status.success() {
        return Ok(ClaudeOutcome::Crash {
            stderr_tail: tail(&stderr, 1024),
        });
    }
    Ok(ClaudeOutcome::CleanExit)
}

/// Start a watchdog thread that kills `killer`'s child once `budget`
/// has elapsed, unless the caller cancels it first.
///
/// Cancellation protocol: the caller drops the sending half of the
/// channel, which the watchdog observes as
/// `RecvTimeoutError::Disconnected`. No caller sends a value today. If
/// one ever arrives it is also treated as "cancelled; don't kill".
///
/// Joining the handle yields `true` iff the budget elapsed and the
/// watchdog issued the kill.
fn spawn_watchdog(
    killer: ChildKiller,
    rx: mpsc::Receiver<()>,
    budget: Duration,
) -> thread::JoinHandle<bool> {
    thread::spawn(move || {
        // Only a genuine timeout counts; a disconnect or an explicit
        // message both mean the caller finished first.
        let budget_elapsed = matches!(
            rx.recv_timeout(budget),
            Err(mpsc::RecvTimeoutError::Timeout)
        );
        if budget_elapsed {
            let _ = killer.kill();
        }
        budget_elapsed
    })
}

fn build_spawn_opts(
    node: &Node,
    ctx: &ExecutorContext<'_>,
    prompt: String,
    session: Option<Session>,
) -> SpawnOpts {
    SpawnOpts {
        prompt,
        cwd: ctx.worktree.to_path_buf(),
        model: effective_model(node, ctx).map(str::to_string),
        allowed_tools: node.allowed_tools.clone(),
        session,
        extra_args: Vec::new(),
        bin: ctx.claude_bin.map(|p| p.to_path_buf()),
    }
}

/// Model to use for `node`: its own `model:` when set, otherwise the
/// pipe-level default from the context (which may also be absent).
fn effective_model<'a>(node: &'a Node, ctx: &'a ExecutorContext<'a>) -> Option<&'a str> {
    match node.model.as_deref() {
        Some(own) => Some(own),
        None => ctx.default_model,
    }
}

// ── Bash / gate shared subprocess runner ────────────────────────

/// Outcome of a subprocess we do not stream-parse — bash and gate hooks
/// both use this path.
struct RawExit {
    /// Exit status reported when the child was waited on. After a
    /// timeout this is the status of the killed child.
    status: std::process::ExitStatus,
    /// Everything the child wrote to stdout. Empty when the drain
    /// thread panicked or hit a read error.
    stdout: Vec<u8>,
    /// Everything the child wrote to stderr. Empty when the drain
    /// thread panicked or hit a read error.
    stderr: Vec<u8>,
    /// `true` when the wall-clock budget elapsed and the child was
    /// killed.
    timed_out: bool,
}

/// Configure `cmd` so its child starts in a fresh process group
/// (Unix). This is what lets [`kill_process_tree`] reach the whole
/// subtree when a timeout fires.
#[cfg(unix)]
fn set_new_process_group(cmd: &mut Command) {
    // A group id of `0` makes the child the leader of a new process
    // group whose id equals its own pid. `kill -KILL -$pid` (negative
    // pid) then signals every member of that group.
    std::os::unix::process::CommandExt::process_group(cmd, 0);
}

/// Windows counterpart of the Unix process-group setup: spawn the
/// child with the `CREATE_NEW_PROCESS_GROUP` creation flag so a later
/// tree-kill targets the right subtree.
#[cfg(windows)]
fn set_new_process_group(cmd: &mut Command) {
    use std::os::windows::process::CommandExt;
    // CREATE_NEW_PROCESS_GROUP = 0x00000200 — Win32 process creation
    // flag that dissociates the child from our console group so
    // `taskkill /T` walks the right tree.
    const CREATE_NEW_PROCESS_GROUP: u32 = 0x0000_0200;
    cmd.creation_flags(CREATE_NEW_PROCESS_GROUP);
}

/// Kill every process in the group led by `pid` (Unix).
///
/// Called when the wall-clock budget expires, so that backgrounded
/// grandchildren cannot keep the drained pipe handles open past the
/// timeout.
///
/// The work is delegated to the `kill` utility; no raw syscall is
/// made. The utility already copes with the edge cases (zombies,
/// permission errors, missing targets) that a minimal `libc::killpg`
/// wrapper would have to re-invent. Best-effort: a failure to run
/// `kill`, or a non-zero exit from it, is ignored.
#[cfg(unix)]
fn kill_process_tree(pid: u32) {
    // A negative pid addresses the whole process group; `--` stops
    // `kill` from parsing it as an option.
    let group_target = format!("-{pid}");

    let mut kill = Command::new("kill");
    kill.arg("-KILL").arg("--").arg(group_target);
    kill.stdin(Stdio::null())
        .stdout(Stdio::null())
        .stderr(Stdio::null());
    let _ = kill.status();
}

/// Windows counterpart of the Unix group kill: run
/// `taskkill /T /F /PID <pid>` with all standard streams detached.
/// Best-effort: the result of running `taskkill` is ignored.
#[cfg(windows)]
fn kill_process_tree(pid: u32) {
    let _ = Command::new("taskkill")
        .args(["/T", "/F", "/PID", &pid.to_string()])
        .stdin(Stdio::null())
        .stdout(Stdio::null())
        .stderr(Stdio::null())
        .status();
}

/// Spawn `cmd`, wait for it under a wall-clock `budget`, and collect
/// its output.
///
/// * The child is placed in its own process group / console group so a
///   timeout kill also reaches backgrounded grandchildren.
/// * stdout and stderr (when piped by the caller) are drained on
///   dedicated threads so a full pipe buffer never stalls the child.
/// * When the budget elapses, the whole tree is killed, the child is
///   reaped, and the result is flagged `timed_out`.
/// * When `capture_stdout_at` is given, the collected stdout is written
///   to that path (replacing any previous content), on timeout too.
///
/// # Errors
///
/// * `DispatchError::Spawn` — the child could not be started.
/// * `DispatchError::Wait` — waiting on the child failed. The child is
///   killed and reaped on a best-effort basis before returning, so a
///   failed wait does not leave a stray process behind.
/// * `DispatchError::Io` — the capture file could not be written.
fn run_command_with_timeout(
    cmd: &mut Command,
    budget: Duration,
    capture_stdout_at: Option<&Path>,
) -> Result<RawExit, DispatchError> {
    use wait_timeout::ChildExt;

    type Drain = thread::JoinHandle<std::io::Result<Vec<u8>>>;

    // Read `pipe` to EOF on its own thread.
    fn drain<R: Read + Send + 'static>(mut pipe: R) -> Drain {
        thread::spawn(move || -> std::io::Result<Vec<u8>> {
            let mut buf = Vec::new();
            pipe.read_to_end(&mut buf)?;
            Ok(buf)
        })
    }

    // Bytes gathered by a drain thread; a missing pipe, a panicked
    // thread, or a read error all yield an empty buffer.
    fn collect(handle: Option<Drain>) -> Vec<u8> {
        handle
            .and_then(|h| h.join().ok())
            .and_then(Result::ok)
            .unwrap_or_default()
    }

    // Place the child in its own process group / console group so a
    // timeout kill reaches backgrounded grandchildren too. Without
    // this, a bash body that spawns a background subprocess (`& sleep
    // 60` on Unix, or any native `start` / pipeline spawn on Windows)
    // inherits our stdout/stderr pipes; killing only the direct child
    // leaves grandchildren holding the pipe handles open and the
    // drain-thread join blocks until those grandchildren die
    // naturally — potentially forever.
    set_new_process_group(cmd);
    let mut child = cmd
        .spawn()
        .map_err(|source| DispatchError::Spawn { source })?;
    let child_pid = child.id();

    // Same pattern as `ClaudeProcess`: never let a full pipe buffer
    // block a long-running child.
    let stdout_thread = child.stdout.take().map(drain);
    let stderr_thread = child.stderr.take().map(drain);

    let waited = match child.wait_timeout(budget) {
        Ok(waited) => waited,
        Err(source) => {
            // `Child` is neither killed nor reaped on drop, so clean up
            // before bailing out. The drain threads are left detached;
            // they finish once the pipes close.
            kill_process_tree(child_pid);
            let _ = child.kill();
            let _ = child.wait();
            return Err(DispatchError::Wait { source });
        }
    };

    let (status, timed_out) = match waited {
        Some(status) => (status, false),
        None => {
            // Tree-kill first so grandchildren release the pipe
            // handles that would otherwise block the drain-thread join
            // forever. `child.kill()` is a belt-and-suspenders safety
            // net for the direct child on platforms where the
            // tree-kill utility is missing.
            kill_process_tree(child_pid);
            let _ = child.kill();
            let status = child
                .wait()
                .map_err(|source| DispatchError::Wait { source })?;
            (status, true)
        }
    };

    let stdout = collect(stdout_thread);
    let stderr = collect(stderr_thread);

    if let Some(cap) = capture_stdout_at {
        std::fs::write(cap, &stdout).map_err(|source| DispatchError::Io {
            path: cap.to_path_buf(),
            source,
        })?;
    }
    Ok(RawExit {
        status,
        stdout,
        stderr,
        timed_out,
    })
}

/// Classify the exit of a non-streamed subprocess.
///
/// * timed out → `Failed { Timeout }`; the message carries the stderr
///   tail as-is;
/// * successful exit status → `Completed`;
/// * anything else → `Failed { Crash }`; the message is the trimmed
///   stderr tail.
///
/// A timeout takes precedence over the exit status.
fn outcome_from_exit(raw: RawExit) -> NodeOutcome {
    // Lossy UTF-8 view of the stderr tail, built only on failure paths.
    let stderr_tail =
        || String::from_utf8_lossy(&tail_bytes(&raw.stderr, 1024)).into_owned();

    match (raw.timed_out, raw.status.success()) {
        (true, _) => NodeOutcome::Failed {
            kind: ErrorKind::Timeout,
            message: Some(format!(
                "subprocess killed after exceeding wall-clock budget; stderr: {}",
                stderr_tail()
            )),
        },
        (false, true) => NodeOutcome::Completed,
        (false, false) => NodeOutcome::Failed {
            kind: ErrorKind::Crash,
            message: Some(stderr_tail().trim().to_string()),
        },
    }
}

// ── Gate path ────────────────────────────────────────────────────

/// Run the hook script for `gate` on behalf of `node` and report the
/// result as a [`NodeOutcome`].
///
/// Recorded failures are returned as `Ok(NodeOutcome::Failed { .. })`:
/// `GateFailed` when the hook file is missing, resolves outside the
/// hooks directory, or exits unsuccessfully; `GateTimeout` when the
/// hook outlives `ctx.gate_timeout`. On success a `GatePassed` event
/// is appended to the event log before `Ok(NodeOutcome::Completed)` is
/// returned.
///
/// # Errors
///
/// Returns a [`DispatchError`] when the boundary check, the subprocess
/// run, or the event-log append itself fails.
fn run_gate(
    node: &Node,
    gate: &str,
    ctx: &ExecutorContext<'_>,
) -> Result<NodeOutcome, DispatchError> {
    let hook = gate_script_path(ctx.volume_root, gate);

    // The validator confirmed the hook existed at load time, but it may
    // have been removed since. Checking again turns a vanished hook
    // into a recorded `GateFailed` instead of a spawn-time
    // `DispatchError` that callers would need to special-case.
    if !hook.is_file() {
        let message = format!("gate hook {} missing at dispatch time", hook.display());
        return Ok(NodeOutcome::Failed {
            kind: ErrorKind::GateFailed,
            message: Some(message),
        });
    }

    // Defence in depth against path traversal: gate names containing
    // `/`, `\`, or `..` are already rejected by the validator, yet an
    // earlier bash node in the DAG could plant a file at a traversal
    // target after validation. Re-resolve the canonical path and make
    // sure it is still under `dist/hooks/`.
    if let Some(rejected) = enforce_gate_boundary(&hook, ctx.volume_root, gate)? {
        return Ok(rejected);
    }

    // Hooks execute from the volume root rather than the per-run
    // worktree: they are distro-level scripts that may need the whole
    // volume layout (e.g. `.omne/lib/cfg/`), and the worktree would
    // hide paths that only exist at the root.
    let mut command = gate_command(&hook);
    command
        .current_dir(ctx.volume_root)
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .env("OMNE_RUN_ID", ctx.run_id)
        .env("OMNE_NODE_ID", &node.id)
        .env("OMNE_GATE_NAME", gate)
        .env("OMNE_VOLUME_ROOT", ctx.volume_root);

    let exit = run_command_with_timeout(&mut command, ctx.gate_timeout, None)?;

    if exit.timed_out {
        let budget_secs = ctx.gate_timeout.as_secs();
        return Ok(NodeOutcome::Failed {
            kind: ErrorKind::GateTimeout,
            message: Some(format!("gate {gate} exceeded {}s budget", budget_secs)),
        });
    }

    if !exit.status.success() {
        let stderr_excerpt = String::from_utf8_lossy(&tail_bytes(&exit.stderr, 1024))
            .trim()
            .to_string();
        return Ok(NodeOutcome::Failed {
            kind: ErrorKind::GateFailed,
            message: Some(stderr_excerpt),
        });
    }

    // Only attach stdout to the event when something other than
    // whitespace was printed; empty output collapses to `None`.
    let stdout_excerpt = Some(
        String::from_utf8_lossy(&tail_bytes(&exit.stdout, 1024))
            .trim()
            .to_string(),
    )
    .filter(|text| !text.is_empty());

    let passed = Event::GatePassed(GatePassed {
        id: new_event_id(),
        ts: iso_utc_now(),
        run_id: ctx.run_id.to_string(),
        node_id: node.id.clone(),
        gate: gate.to_string(),
        method: GateMethod::Hook,
        stdout: stdout_excerpt,
    });
    ctx.event_log.append(&passed)?;

    Ok(NodeOutcome::Completed)
}

/// Check that `script` (which the caller has already seen on disk)
/// resolves to a location inside `<volume_root>/.omne/dist/hooks/`.
///
/// Yields `Ok(Some(NodeOutcome::Failed { .. }))` when the resolved path
/// escapes the hooks directory, letting the caller record a clean gate
/// failure rather than bubbling up a `DispatchError`. Yields `Ok(None)`
/// when the script is in bounds.
///
/// Both sides are `canonicalize`d, so a symlink pointing out of the
/// hooks directory is rejected as well. The check is best-effort: when
/// the hooks directory itself cannot be canonicalized (for example the
/// distro is not installed) the result is `Ok(None)` and the later
/// spawn is left to fail with its usual error.
///
/// # Errors
///
/// Returns `DispatchError::Io` when `script` cannot be canonicalized.
fn enforce_gate_boundary(
    script: &Path,
    volume_root: &Path,
    gate: &str,
) -> Result<Option<NodeOutcome>, DispatchError> {
    let hooks_root = volume::dist_dir(volume_root).join("hooks");

    if let Ok(resolved_hooks) = hooks_root.canonicalize() {
        let resolved_script = script.canonicalize().map_err(|source| DispatchError::Io {
            path: script.to_path_buf(),
            source,
        })?;

        if resolved_script.starts_with(&resolved_hooks) {
            return Ok(None);
        }

        let message = format!(
            "gate `{gate}` resolves outside {}: {}",
            resolved_hooks.display(),
            resolved_script.display()
        );
        return Ok(Some(NodeOutcome::Failed {
            kind: ErrorKind::GateFailed,
            message: Some(message),
        }));
    }

    // Hooks directory is unavailable: skip the check.
    Ok(None)
}

/// Location of the hook script for `gate`: the `hooks` directory under
/// `volume::dist_dir(volume_root)`, with the file named
/// `<gate>.<platform extension>`.
fn gate_script_path(volume_root: &Path, gate: &str) -> PathBuf {
    let mut script = volume::dist_dir(volume_root).join("hooks");
    script.push(format!("{gate}.{}", platform_hook_extension()));
    script
}

/// Build the Windows invocation for a gate hook:
/// `powershell -NoProfile -File <script>`.
#[cfg(windows)]
fn gate_command(script: &Path) -> Command {
    let mut powershell = Command::new("powershell");
    // `-ExecutionPolicy Bypass` is deliberately absent. Passing it would
    // silently override the user's AllSigned / RemoteSigned policy and
    // drop the OS signing check for every gate hook. Distro authors who
    // ship unsigned hooks should document the `Unblock-File` step in
    // their install guide instead of relying on the runner to bypass
    // the policy.
    powershell.args(["-NoProfile", "-File"]).arg(script);
    powershell
}

/// Build the non-Windows invocation for a gate hook: `sh <script>`.
#[cfg(not(windows))]
fn gate_command(script: &Path) -> Command {
    let mut shell = Command::new("sh");
    shell.arg(script.as_os_str());
    shell
}

/// File extension of gate hook scripts on the current platform:
/// `ps1` on Windows, `sh` everywhere else.
fn platform_hook_extension() -> &'static str {
    if cfg!(windows) {
        "ps1"
    } else {
        "sh"
    }
}

// ── Terminal event emission ─────────────────────────────────────

/// Append the terminal event for `node` to the event log:
/// `NodeCompleted` (with the wire-format capture path) for a completed
/// outcome, `NodeFailed` (with the error kind and optional message)
/// for a failed one.
///
/// # Errors
///
/// Returns a [`DispatchError`] when the event-log append fails.
fn emit_terminal(
    node: &Node,
    outcome: &NodeOutcome,
    ctx: &ExecutorContext<'_>,
) -> Result<(), DispatchError> {
    // Fields common to both terminal events.
    let id = new_event_id();
    let ts = iso_utc_now();
    let run_id = ctx.run_id.to_string();
    let node_id = node.id.clone();

    let terminal = match outcome {
        NodeOutcome::Failed { kind, message } => Event::NodeFailed(NodeFailed {
            id,
            ts,
            run_id,
            node_id,
            error: NodeError { kind: *kind },
            message: message.clone(),
        }),
        NodeOutcome::Completed => Event::NodeCompleted(NodeCompleted {
            id,
            ts,
            run_id,
            node_id,
            output_path: capture_output_path_wire(ctx.run_id, &node.id),
        }),
    };

    ctx.event_log.append(&terminal)?;
    Ok(())
}

// ── Helpers ──────────────────────────────────────────────────────

/// On-disk capture file for a node: `<node_id>.out` inside the
/// directory returned by `volume::nodes_dir` for the current run.
fn node_capture_path(ctx: &ExecutorContext<'_>, node_id: &str) -> PathBuf {
    let run_nodes_dir = volume::nodes_dir(ctx.volume_root, ctx.run_id);
    let file_name = format!("{node_id}.out");
    run_nodes_dir.join(file_name)
}

/// Forward-slash, volume-root-relative path used for the wire
/// `output_path` field on `node.completed`.
///
/// This is a thin delegate: the single source of truth is
/// [`volume::node_capture_wire_path`], so a layout change in
/// `volume.rs` propagates here automatically.
fn capture_output_path_wire(run_id: &str, node_id: &str) -> String {
    volume::node_capture_wire_path(run_id, node_id)
}

/// Create the parent directory of `path` (and any missing ancestors).
///
/// Does nothing when `path` has no parent or the parent is the empty
/// path (a bare file name).
///
/// # Errors
///
/// Returns `DispatchError::Io` carrying the parent directory when
/// `create_dir_all` fails.
fn ensure_parent_dir(path: &Path) -> Result<(), DispatchError> {
    match path.parent().filter(|dir| !dir.as_os_str().is_empty()) {
        Some(dir) => std::fs::create_dir_all(dir).map_err(|source| DispatchError::Io {
            path: dir.to_path_buf(),
            source,
        }),
        None => Ok(()),
    }
}

/// Wall-clock budget for `node`: its own `timeout` (in seconds) when
/// set, otherwise `ctx.default_node_timeout`.
fn node_timeout(node: &Node, ctx: &ExecutorContext<'_>) -> Duration {
    match node.timeout {
        Some(secs) => Duration::from_secs(secs),
        None => ctx.default_node_timeout,
    }
}

/// Fresh event identifier: a newly generated ULID rendered in
/// lowercase.
fn new_event_id() -> String {
    let mut id = Ulid::new().to_string();
    // ULID text is plain ASCII, so lowercasing in place is sufficient.
    id.make_ascii_lowercase();
    id
}

/// The present UTC instant rendered as `YYYY-MM-DDTHH:MM:SSZ`.
///
/// Formatting is delegated to [`crate::clock`] so that
/// `init::chrono_today` and `events.jsonl` timestamps rely on one
/// shared civil-calendar implementation.
fn iso_utc_now() -> String {
    let now = crate::clock::now_utc();
    now.format_iso_utc()
}

/// Last `max_bytes` bytes of `b` (or all of `b` when it is shorter),
/// advanced past any leading UTF-8 continuation bytes.
///
/// Doing the alignment here means a caller that later decodes the
/// result with `from_utf8_lossy` does not get a leading U+FFFD
/// replacement character from a multi-byte codepoint that was cut in
/// half. The result may therefore be shorter than `max_bytes`.
fn tail_bytes(b: &[u8], max_bytes: usize) -> Vec<u8> {
    if b.len() <= max_bytes {
        return b.to_vec();
    }

    let window = &b[b.len() - max_bytes..];
    // Continuation bytes have the bit pattern `10xxxxxx`
    // (0x80..=0xBF); drop them until a lead byte or ASCII byte starts
    // the window.
    let continuation_prefix = window
        .iter()
        .take_while(|&&byte| (byte & 0b1100_0000) == 0b1000_0000)
        .count();
    window[continuation_prefix..].to_vec()
}

/// Whitespace-trimmed, UTF-8-safe tail of `s`, at most `max_bytes`
/// bytes before trimming. Delegates to [`tail_bytes`] so the
/// boundary-alignment logic is kept in a single place.
fn tail(s: &str, max_bytes: usize) -> String {
    let window = tail_bytes(s.as_bytes(), max_bytes);
    let decoded = String::from_utf8_lossy(&window);
    decoded.trim().to_string()
}

// ── Errors ──────────────────────────────────────────────────────

/// Failure categories returned from [`dispatch`] itself (distinct from
/// `NodeOutcome::Failed`, which is a *recorded* node failure). These
/// are infrastructure errors that prevent the executor from even
/// deciding the node's outcome — event-log I/O, malformed pipe data
/// leaking past validation, stream-parser crashes, etc.
#[derive(Debug, thiserror::Error)]
pub enum DispatchError {
    /// The executor rejected the node identified by `node_id`;
    /// `reason` is the human-readable explanation.
    #[error("node `{node_id}` rejected by executor: {reason}")]
    InvalidNode { node_id: String, reason: String },

    /// Error reported by the event log. Converted automatically from
    /// `crate::event_log::Error` via `#[from]`.
    #[error("event log error: {0}")]
    EventLog(#[from] crate::event_log::Error),

    /// Error reported by the claude subprocess layer. Converted
    /// automatically from `claude_proc::Error` via `#[from]`.
    #[error("claude subprocess error: {0}")]
    ClaudeProc(#[from] claude_proc::Error),

    /// Path-bearing I/O error (capture-file writes, iteration-marker
    /// writes, directory creation). Spawn and wait failures use
    /// [`DispatchError::Spawn`] / [`DispatchError::Wait`] instead so
    /// the error message does not render as `I/O error on : ...` with
    /// an empty path.
    #[error("I/O error on {}: {source}", path.display())]
    Io {
        path: PathBuf,
        #[source]
        source: std::io::Error,
    },

    /// `Command::spawn` failed before the child ever started. No
    /// identifier for the command is recorded here; the caller can
    /// infer the originating subprocess family from context.
    #[error("failed to spawn subprocess: {source}")]
    Spawn {
        #[source]
        source: std::io::Error,
    },

    /// `wait_timeout` / `wait` on a live child reported an I/O error
    /// (EINTR left unhandled, process reaping race). Distinct from
    /// [`DispatchError::Spawn`] so telemetry can distinguish
    /// start-time failures from mid-flight failures.
    #[error("failed to wait on subprocess: {source}")]
    Wait {
        #[source]
        source: std::io::Error,
    },
}

#[cfg(test)]
mod tests {
    use super::*;

    // `YYYY-MM-DDTHH:MM:SSZ` is exactly 20 bytes, ends in `Z`, and has
    // a `T` separating date from time.
    #[test]
    fn iso_utc_now_is_plausible_shape() {
        let stamp = iso_utc_now();
        assert_eq!(stamp.len(), 20);
        assert!(stamp.ends_with('Z'));
        assert!(stamp.contains('T'));
    }

    // `tail` trims surrounding whitespace and never returns more than
    // the requested number of bytes.
    #[test]
    fn tail_trims_and_bounds() {
        assert_eq!(tail("   hi  ", 1024), "hi");

        let oversized = "x".repeat(5000);
        let clipped = tail(&oversized, 10);
        assert_eq!(clipped.len(), 10);
    }

    // The wire path is volume-root-relative and uses `/` on every
    // platform.
    #[test]
    fn output_path_wire_uses_forward_slashes() {
        let wire_path = capture_output_path_wire("feature-01abc", "research");
        assert!(!wire_path.contains('\\'));
        assert_eq!(
            wire_path,
            ".omne/var/runs/feature-01abc/nodes/research.out"
        );
    }
}