// haz-exec 0.1.0 - Docs.rs

//! Workspace-wide scheduler loop body and its public-surface
//! types.
//!
//! Three public types live here:
//!
//! - [`RunGraphOutcome`]: the per-task outcome map, per-task
//!   error map, and run-level diagnostic vector returned by a
//!   completed [`run_graph`] invocation.
//! - [`RuntimeInvariantViolation`]: the typed diagnostic shape
//!   for `EXEC-019` runtime cycles and `EXEC-020` runtime output
//!   overlaps.
//! - [`RunGraphError`]: reserved for future scheduler-level
//!   error variants; empty in the current revision (`EXEC-010`
//!   keeps single-task failures out of the top-level return).
//!
//! Plus the [`run_graph`] async function: the scheduler loop
//! itself. The loop composes the helpers from
//! [`crate::run_graph::state`], [`crate::run_graph::overlap`],
//! [`crate::run_graph::cycle`], [`crate::run_graph::cascade`],
//! and [`crate::run_graph::steps`] into the admission /
//! completion state machine described on [`run_graph`]'s
//! rustdoc.
//!
//! Re-exported by [`crate::run_graph`] so the existing
//! external paths (`haz_exec::run_graph::run_graph`, etc.)
//! continue to resolve unchanged.

use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::num::NonZeroUsize;

use futures::StreamExt;
use futures::stream::FuturesUnordered;
use haz_domain::mutex::Mutex;
use haz_domain::name::{ProjectName, TagName};
use haz_domain::path::CanonicalPath;
use haz_domain::task_id::TaskId;
use haz_vfs::WritableFilesystem;
use snafu::Snafu;

use crate::hold_set::HoldSet;
use crate::process::ProcessSpawner;
use crate::run_graph::cascade::{
    drain_ready_to_cancelled, emit_cascade_cancellations, emit_cascade_skips,
};
use crate::run_graph::cycle::{
    check_and_record_runtime_cycle_for_completion, skip_ready_cycle_members,
};
use crate::run_graph::overlap::check_and_record_output_overlap;
use crate::run_graph::state::{
    InFlightCounts, ReadyState, StreamHashAccumulator, precompute_task_tags, resolve_global_cap,
};
use crate::run_graph::steps::{
    InFlightCompletion, InFlightFuture, LookupStepOutcome, run_lookup_step, run_spawn_step,
};
use crate::run_task::{
    CancelledRecord, CompletedRecord, RunContext, RunObserver, RunOutcome, RunState, RunTaskError,
    SkipCause,
};

/// Outcome of a single [`run_graph`] invocation.
///
/// `outcomes` carries one entry per task that reached a terminal
/// state. Tasks that ran the lookup-then-spawn pipeline to the end
/// are recorded as [`RunOutcome::Completed`] wrapping the
/// underlying [`CompletedRecord`]; tasks the scheduler
/// reclassified or signalled after the run was cancelled are
/// recorded as [`RunOutcome::Cancelled`]. Tasks for which the
/// pipeline returned `Err` are absent from `outcomes` and present
/// in `task_errors` (a lookup-step result that arrives after the
/// cancel flag tripped is discarded instead, and the task is
/// recorded as cancelled). Tasks the cascade skipped per
/// `EXEC-010` are absent from both maps in this revision; a
/// follow-up commit lands the formal [`RunOutcome::Skipped`]
/// entries.
///
/// `invariant_violations` carries run-level diagnostics for
/// `EXEC-019` (runtime cycle) and `EXEC-020` (output overlap).
/// The Vec is empty on a clean run; a non-empty Vec means the
/// scheduler detected a workspace-level invariant violation
/// during the run. Partial per-task outcomes are preserved
/// alongside; [`crate::exit_code::exit_code_for`] consults both
/// maps and the violation Vec to classify the run for
/// `EXEC-021`.
///
/// [`CompletedRecord`]: crate::run_task::CompletedRecord
// NOTE(review): the cascade helpers are handed the outcomes map
// mutably, so the "skipped tasks are absent from both maps"
// sentence above may be stale -- confirm against
// `crate::run_graph::cascade` before relying on it.
#[derive(Debug)]
pub struct RunGraphOutcome {
    /// Per-task outcome in canonical `(ProjectName, TaskName)`
    /// order.
    pub outcomes: BTreeMap<TaskId, RunOutcome>,
    /// Per-task error captured when the lookup-then-spawn
    /// pipeline returned `Err` (failed cache lookup, spawn error,
    /// stream-read error, store error, etc.). The cascade treats
    /// an `Err` task the same way as an `Ok(Failed)` task: hard
    /// descendants are marked skip; unrelated subgraphs continue.
    pub task_errors: BTreeMap<TaskId, RunTaskError>,
    /// Run-level diagnostics for runtime DAG invariants
    /// (`EXEC-019` cycle, `EXEC-020` output overlap). Order is
    /// detection order: the scheduler appends one entry per
    /// detected violation as it discovers them.
    pub invariant_violations: Vec<RuntimeInvariantViolation>,
}

/// A workspace-level invariant the scheduler detected at runtime.
///
/// Each variant carries enough information to produce a
/// diagnostic shape-equivalent to the static `DAG-014` /
/// `DAG-016` error it is the runtime analogue of: a cycle's node
/// set and the offending edge; an overlap's two task identities
/// and the shared path.
///
/// Values are collected, in detection order, in
/// [`RunGraphOutcome::invariant_violations`]; they are never
/// returned as a [`RunGraphError`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RuntimeInvariantViolation {
    /// `EXEC-019` runtime cycle: a length-≥2 cycle in the union
    /// of hard, soft, and producer-matching edges, discovered
    /// when a newly-materialised output added a runtime
    /// producer-matching edge that closed the cycle.
    ///
    /// On detection the scheduler sets its sticky cancel flag
    /// and trips its internal cancel token.
    RuntimeCycle {
        /// Every task identified as a member of the cycle.
        nodes: BTreeSet<TaskId>,
        /// The newly-added producer-matching edge whose
        /// addition closed the cycle (predecessor, successor).
        offending_edge: (TaskId, TaskId),
    },
    /// `EXEC-020` runtime output overlap: two tasks have both
    /// materialised the same workspace-absolute path.
    ///
    /// On detection the scheduler sets its sticky cancel flag,
    /// which stops further admission.
    OutputOverlap {
        /// The task that claimed the path first (chronologically
        /// in the run's completion order).
        first_task: TaskId,
        /// The task whose materialisation discovered the
        /// pre-existing claim.
        second_task: TaskId,
        /// The path both tasks materialised.
        shared_path: CanonicalPath,
    },
}

/// Top-level error raised by [`run_graph`].
///
/// This revision defines no variants: a single task's failure
/// does not abort the run (per `EXEC-010`), and every failure
/// path the scheduler can currently produce is surfaced as a
/// per-task `RunTaskError` captured in
/// [`RunGraphOutcome::task_errors`] or as a typed
/// [`RuntimeInvariantViolation`] in
/// [`RunGraphOutcome::invariant_violations`]. The enum exists
/// so a future scheduler-level diagnostic (e.g. a mutex
/// acquisition error not attributable to a single task) can
/// grow without changing [`run_graph`]'s signature.
///
/// Because the enum has no variants it is uninhabited: no value
/// of this type can be constructed, so a
/// `Result<_, RunGraphError>` is always `Ok` in this revision.
#[derive(Debug, Snafu)]
// Keeps any context selectors Snafu generates for future variants
// crate-private; with no variants, none are generated today.
#[snafu(visibility(pub(crate)))]
pub enum RunGraphError {}

/// Drive every task of the validated graph to a terminal state,
/// subject to the workspace's concurrency caps and the mutex
/// compatibility rules of `EXEC-006` condition 3 + `EXEC-007`.
///
/// # Scheduling loop
///
/// 1. **Setup.** The workspace's `concurrency.default` is
///    resolved to a concrete cap and the per-task tag sets are
///    pre-computed.
/// 2. **Admission.** Each round walks `state.ready` in canonical
///    order and admits every task whose caps (`EXEC-004`,
///    `EXEC-005`) permit. Caps are the ONLY admission criterion;
///    the mutex check is deferred per `EXEC-007` step 1 ("no
///    mutex hold is taken during cache lookup"). An admitted
///    task fires [`RunObserver::on_task_started`] once (tracked
///    via `started`) and gets a lookup-step future pushed.
/// 3. **Lookup step.** The future runs [`cache_lookup_phase`].
///    On a hit it drives [`restore_from_hit`] inline (no mutex
///    interaction, per `MUTEX-007`) and reports the outcome. On
///    a miss it stops short and reports the data the scheduler
///    needs for the post-lookup mutex check.
/// 4. **Miss completion.** The scheduler evaluates the live
///    mutex hold set. When compatible, it acquires the mutex per
///    `MUTEX-006` and pushes a spawn-step future
///    ([`run_fresh`]). When incompatible (`MUTEX-005` /
///    `EXEC-007` step 3 incompatible branch), the slot is
///    released, the task goes back to `ready`, and the next
///    admission round re-evaluates it.
/// 5. **Spawn completion.** The scheduler releases the held
///    mutex and the slot, fires
///    [`RunObserver::on_task_finished`], records the outcome,
///    and either promotes hard successors to ready (succeeded)
///    or cascade-skips hard descendants (failed, per
///    `EXEC-010`).
/// 6. **Termination.** The loop ends when no futures are in
///    flight; tasks marked skip never enter `ready`, so the
///    emptiness condition is well-defined for any graph
///    including diamonds, fan-ins, and disconnected subgraphs.
///
/// # Cancellation
///
/// The scheduler works on an internal child of `ctx.cancel`.
/// Once the sticky cancel flag is set, admission is replaced by
/// draining the ready set into cancelled outcomes on every
/// iteration, while futures that are already in flight are still
/// awaited until none remain.
///
/// # Errors
///
/// This revision never returns `Err`: [`RunGraphError`] has no
/// variants. A single task's failure does not abort the run (per
/// `EXEC-010`) and is captured in
/// [`RunGraphOutcome::task_errors`]; runtime cycles (`EXEC-019`)
/// and runtime output overlaps (`EXEC-020`) are reported through
/// [`RunGraphOutcome::invariant_violations`]. The `Result` return
/// type leaves room for future scheduler-level diagnostics that
/// are not attributable to a single task.
///
/// # Panics
///
/// Panics if `ctx.graph` references a task whose project or
/// task name is absent from `ctx.workspace`. A validated graph
/// is built from the workspace's effective task set; a node
/// referring to a non-existent task would be a builder bug.
///
/// [`cache_lookup_phase`]: crate::run_task::cache_lookup_phase
/// [`restore_from_hit`]: crate::run_task::restore_from_hit
/// [`run_fresh`]: crate::run_task::run_fresh
pub async fn run_graph<F, S, O>(
    ctx: &RunContext<'_, F, S, O>,
    created_at_unix: u64,
) -> Result<RunGraphOutcome, RunGraphError>
where
    F: WritableFilesystem,
    S: ProcessSpawner,
    O: RunObserver,
{
    // EXEC-019 step 3 plumbing. The scheduler never watches the
    // caller's token directly; it watches a child of it. User
    // cancellation (`EXEC-012`) flows parent-to-child on its own,
    // and the scheduler trips the child itself when it detects a
    // runtime cycle. The reverse direction does not exist: an
    // observer holding a clone of `ctx.cancel` sees no
    // cancellation when the run aborts for a cycle.
    let internal_cancel = ctx.cancel.child_token();
    let internal_ctx = RunContext {
        cancel: &internal_cancel,
        fs: ctx.fs,
        cache: ctx.cache,
        spawner: ctx.spawner,
        observer: ctx.observer,
        workspace: ctx.workspace,
        graph: ctx.graph,
        host_env: ctx.host_env,
        algo: ctx.algo,
    };
    let mut sched = SchedulerState::new(&internal_ctx);

    loop {
        // Latch the sticky flag; once set it is never cleared and
        // the token is not consulted again.
        sched.cancelled = sched.cancelled || sched.ctx.cancel.is_cancelled();

        // A cancelled run stops admitting and instead turns
        // whatever is ready into cancelled outcomes.
        if sched.cancelled {
            sched.drain_cancelled();
        } else {
            sched.admit_ready();
        }

        if sched.in_flight.is_empty() {
            break;
        }

        match sched.next_completion().await {
            // The cancel token won the race; the next iteration
            // observes the flag and drains.
            None => {}
            Some(InFlightCompletion::Lookup { task, result }) => {
                sched.handle_lookup(task, result, created_at_unix);
            }
            Some(InFlightCompletion::Spawn { task, result }) => {
                sched.handle_spawn(task, result);
            }
        }
    }

    Ok(sched.into_outcome())
}

/// Aggregate of the per-run scheduler-local state.
///
/// Owns every piece of mutable bookkeeping the [`run_graph`] loop
/// reads or writes: the ready set, the in-flight counts, the
/// stream-hash accumulator, the mutex hold set, the per-task
/// hold metadata, the outcome / error / violation maps, the
/// EXEC-020 output-claim tracker, the EXEC-019 augmented edge
/// set, the sticky cancel flag, and the `FuturesUnordered` of
/// in-flight task futures.
///
/// Lives inside `scheduler.rs` (no `pub(super)`); the only
/// caller is [`run_graph`].
struct SchedulerState<'a, F, S, O>
where
    F: WritableFilesystem,
    S: ProcessSpawner,
    O: RunObserver,
{
    /// Run context every helper call and in-flight future borrows
    /// from. [`run_graph`] passes its internal context here, so
    /// `ctx.cancel` is the scheduler's child token rather than
    /// the caller's own token.
    ctx: &'a RunContext<'a, F, S, O>,
    /// Resolved `concurrency.default` (`EXEC-004`).
    global_cap: NonZeroUsize,
    /// Pre-computed per-task tag sets feeding the per-tag
    /// admission accounting (`EXEC-005`).
    task_tags: BTreeMap<TaskId, BTreeSet<TagName>>,
    /// Ready set, hard-edge index, cascade-skip closure
    /// (`EXEC-001`, `EXEC-010`, `EXEC-011`).
    ready_state: ReadyState,
    /// Global + per-tag in-flight counters (`EXEC-006`
    /// conditions 1, 2).
    counts: InFlightCounts,
    /// Captured `(stdout_hash, stderr_hash)` per terminated task
    /// (`CACHE-007`, `DAG-017`).
    accum: StreamHashAccumulator,
    /// Live mutex holds (`EXEC-006` condition 3, `MUTEX-001..007`).
    hold_set: HoldSet,
    /// Tasks that have already fired
    /// [`RunObserver::on_task_started`]. A task that re-enters
    /// the lifecycle after a `MUTEX-005` yield MUST NOT refire
    /// the event (S3).
    started: BTreeSet<TaskId>,
    /// Per-task hold metadata captured at acquire time so the
    /// spawn-step completion handler releases the right hold
    /// without re-deriving it from the workspace.
    spawn_step_holds: BTreeMap<TaskId, (ProjectName, Option<Mutex>)>,
    /// Per-task terminal outcomes.
    outcomes: BTreeMap<TaskId, RunOutcome>,
    /// Per-task errors captured when the lookup-then-spawn
    /// pipeline returned `Err`.
    task_errors: BTreeMap<TaskId, RunTaskError>,
    /// Run-level diagnostics for `EXEC-019` runtime cycle and
    /// `EXEC-020` runtime output overlap.
    invariant_violations: Vec<RuntimeInvariantViolation>,
    /// `EXEC-020` output-claim tracker: maps each materialised
    /// workspace-absolute path to the first task that claimed
    /// it. Lookup-only iteration semantics; `HashMap` because
    /// `CanonicalPath` implements `Hash + Eq`.
    output_claims: HashMap<CanonicalPath, TaskId>,
    /// `EXEC-019` augmented edge set: the static graph's edges
    /// plus every producer-matching edge discovered at runtime.
    augmented_edges: BTreeSet<(TaskId, TaskId)>,
    /// `FuturesUnordered` of in-flight lookup-step and spawn-
    /// step futures, all borrowing from `ctx`.
    in_flight: FuturesUnordered<InFlightFuture<'a>>,
    /// Sticky cancellation flag. Tripped by user cancel
    /// (`EXEC-012`), EXEC-020 overlap detection, or EXEC-019
    /// cycle detection. Once set, admission stops and the ready
    /// set drains into `RunCancelled` entries each iteration.
    /// Nothing in this file ever resets it to `false`.
    cancelled: bool,
}

impl<'a, F, S, O> SchedulerState<'a, F, S, O>
where
    F: WritableFilesystem,
    S: ProcessSpawner,
    O: RunObserver,
{
    /// Build the initial scheduler state for one run over `ctx`.
    ///
    /// Resolves the global cap, pre-computes the per-task tag
    /// sets, derives the ready state from the graph, and seeds
    /// the augmented edge set with the static graph's
    /// `(from, to)` pairs. Every other collection starts empty
    /// and the cancel flag starts cleared.
    fn new(ctx: &'a RunContext<'a, F, S, O>) -> Self {
        let global_cap = resolve_global_cap(&ctx.workspace.settings.concurrency);
        let task_tags = precompute_task_tags(ctx.workspace, ctx.graph);
        let ready_state = ReadyState::from_graph(ctx.graph);

        // EXEC-019 starts from the static edges; runtime
        // producer-matching edges are added as outputs appear.
        let mut augmented_edges: BTreeSet<(TaskId, TaskId)> = BTreeSet::new();
        for edge in ctx.graph.edges.iter() {
            augmented_edges.insert((edge.from.clone(), edge.to.clone()));
        }

        Self {
            ctx,
            global_cap,
            task_tags,
            ready_state,
            augmented_edges,
            counts: InFlightCounts::default(),
            accum: StreamHashAccumulator::default(),
            hold_set: HoldSet::default(),
            started: BTreeSet::new(),
            spawn_step_holds: BTreeMap::new(),
            outcomes: BTreeMap::new(),
            task_errors: BTreeMap::new(),
            invariant_violations: Vec::new(),
            output_claims: HashMap::new(),
            in_flight: FuturesUnordered::new(),
            cancelled: false,
        }
    }

    /// Turn everything currently in `ready_state.ready` into
    /// `RunCancelled` outcomes.
    ///
    /// Runs on every loop iteration once the cancel flag is set,
    /// so a successor promoted by a succeeded completion that
    /// lands after the cancel still surfaces as cancelled.
    fn drain_cancelled(&mut self) {
        let observer = self.ctx.observer;
        drain_ready_to_cancelled(observer, &mut self.ready_state, &mut self.outcomes);
    }

    /// Admission round (`EXEC-003`, `EXEC-004`, `EXEC-005`).
    ///
    /// Walks the ready set in canonical order and admits every
    /// task whose every cap permits. Mutex compatibility is NOT
    /// consulted here (`EXEC-007` step 1 / `MUTEX-007`); the
    /// lookup-step future runs first, and the post-lookup
    /// branch in [`Self::handle_lookup`] does the mutex check.
    ///
    /// A task that the caps reject stays in the ready set and is
    /// reconsidered on the next round; later candidates in the
    /// same round are still tried.
    ///
    /// # Panics
    ///
    /// Panics if a ready task has no precomputed tag set.
    fn admit_ready(&mut self) {
        // Snapshot the ready set: the loop body removes admitted
        // tasks from it, so it cannot be iterated in place.
        let candidates: Vec<TaskId> = self.ready_state.ready.iter().cloned().collect();
        for task in candidates {
            if self.ready_state.skip.contains(&task) {
                // Cannot occur given admission and completion
                // do not interleave on a single async task, but
                // kept defensive so the invariant is documented.
                self.ready_state.ready.remove(&task);
                continue;
            }
            let tags = self
                .task_tags
                .get(&task)
                .expect("ready task must have a precomputed tag set");
            if !self.counts.can_admit(
                tags,
                &self.ctx.workspace.settings.concurrency,
                self.global_cap,
            ) {
                continue;
            }
            self.ready_state.ready.remove(&task);
            self.counts.admit(tags);

            // EXEC-007 step 1 begins (no mutex hold). Fire
            // on_task_started exactly once per task even when
            // subsequent mutex contention causes the task to
            // yield and re-enter the lifecycle.
            if self.started.insert(task.clone()) {
                self.ctx.observer.on_task_started(&task);
            }

            // Each lookup future gets its own snapshot of the
            // stream hashes recorded so far. `task` is not needed
            // after this point, so it is moved into the future
            // rather than cloned.
            let preds_snapshot = self.accum.by_task.clone();
            self.in_flight
                .push(Box::pin(run_lookup_step(self.ctx, task, preds_snapshot)));
        }
    }

    /// Await the next completion, racing against the cancel
    /// token held in `self.ctx` (in [`run_graph`] that is the
    /// scheduler's internal child of the caller's token).
    /// Returns [`None`] if the cancel token fired before any
    /// completion; the caller should loop to the next iteration
    /// (which will observe `self.cancelled`). Returns [`Some`]
    /// with the completion otherwise.
    ///
    /// `in_flight` MUST be non-empty when called.
    ///
    /// # Panics
    ///
    /// Panics if `in_flight` is empty, i.e. the stream yields
    /// `None`.
    async fn next_completion(&mut self) -> Option<InFlightCompletion> {
        // Flag already latched: there is nothing left to race
        // against, so just wait for the next in-flight future.
        if self.cancelled {
            return Some(
                self.in_flight
                    .next()
                    .await
                    .expect("in_flight checked non-empty above"),
            );
        }
        tokio::select! {
            // `biased` polls the branches in the order written, so
            // a fired cancel token wins over a completion that is
            // ready at the same time.
            biased;
            () = self.ctx.cancel.cancelled() => {
                self.cancelled = true;
                None
            }
            next = self.in_flight.next() => {
                Some(next.expect("in_flight checked non-empty above"))
            }
        }
    }

    /// Handle a lookup-step future's completion. Branches on
    /// (i) cancel-fire mid-flight (`EXEC-013` step 1), (ii)
    /// lookup error, (iii) cache hit (no mutex per `MUTEX-007`),
    /// (iv) cache miss compatible (acquire mutex, dispatch
    /// spawn-step), (v) cache miss incompatible (`MUTEX-005`
    /// yield: return to ready).
    ///
    /// Every branch except (iv) releases the task's admission
    /// slot; in (iv) the slot stays held and is released by
    /// [`Self::handle_spawn`] when the spawn-step completes.
    /// `created_at_unix` is forwarded unchanged to the spawn-step.
    ///
    /// # Panics
    ///
    /// Panics if `task` has no precomputed tag set.
    fn handle_lookup(
        &mut self,
        task: TaskId,
        result: Result<LookupStepOutcome, RunTaskError>,
        created_at_unix: u64,
    ) {
        // Borrowed rather than cloned: the tag set is only ever
        // handed to `self.counts`, a field disjoint from
        // `self.task_tags`, and is never used after a call that
        // needs all of `self`.
        let tags = self
            .task_tags
            .get(&task)
            .expect("completed task must have a precomputed tag set");

        if self.cancelled {
            // EXEC-013 step 1 in flight: a task that was
            // admitted before the cancel-fire but whose
            // lookup-step completed after is reclassified as
            // cancelled. The result is discarded; no spawn-step
            // is dispatched. The cascade is emitted via the same
            // complete_failed mechanism Failed and Cancelled
            // use, so descendants land as UpstreamCancelled.
            self.counts.release(tags);
            let record = CancelledRecord::RunCancelled { task: task.clone() };
            self.ctx.observer.on_task_cancelled(&task, &record);
            let newly = self.ready_state.complete_failed(&task);
            emit_cascade_cancellations(self.ctx.observer, &mut self.outcomes, &task, newly);
            self.outcomes.insert(task, RunOutcome::Cancelled(record));
            return;
        }

        match result {
            Err(err) => {
                self.counts.release(tags);
                let newly = self.ready_state.complete_failed(&task);
                let cause = SkipCause::UpstreamErrored {
                    upstream: task.clone(),
                };
                emit_cascade_skips(self.ctx.observer, &mut self.outcomes, &cause, newly);
                self.task_errors.insert(task, err);
            }
            Ok(LookupStepOutcome::Hit(record)) => {
                self.counts.release(tags);
                self.ctx.observer.on_task_finished(&task, &record);
                self.accum.record(&task, &record);
                self.record_completion_invariants(task, record);
            }
            Ok(LookupStepOutcome::Miss {
                key,
                mutex,
                project_name,
            }) => {
                // EXEC-006 condition 3 / EXEC-007 step 3.
                if self.hold_set.compatible(&project_name, mutex.as_ref()) {
                    // MUTEX-006 "hold lifetime begins at spawn time".
                    self.hold_set.acquire(&project_name, mutex.as_ref());
                    self.spawn_step_holds
                        .insert(task.clone(), (project_name, mutex));
                    // `task` is not needed afterwards, so it is
                    // moved into the spawn-step future.
                    self.in_flight.push(Box::pin(run_spawn_step(
                        self.ctx,
                        task,
                        key,
                        created_at_unix,
                    )));
                } else {
                    // MUTEX-005 yield: release slot, return task
                    // to ready; next admission re-evaluates.
                    self.counts.release(tags);
                    self.ready_state.ready.insert(task);
                }
            }
        }
    }

    /// Handle a spawn-step future's completion. Branches on the
    /// per-task `RunState`: succeeded (run EXEC-020 / EXEC-019
    /// invariant checks), failed (cascade-skip hard descendants),
    /// cancelled (translate to `SignaledInFlight`, cascade
    /// `UpstreamCancelled`), or pipeline error (cascade-skip with
    /// `UpstreamErrored`).
    ///
    /// The mutex hold recorded for `task` (if any) and its
    /// admission slot are released before the result is examined,
    /// so they are freed on every branch.
    ///
    /// # Panics
    ///
    /// Panics if `task` has no precomputed tag set, or if a
    /// record in the `Cancelled` state carries no exit status.
    fn handle_spawn(&mut self, task: TaskId, result: Result<CompletedRecord, RunTaskError>) {
        // Borrowed rather than cloned: the tag set is only handed
        // to `self.counts`, a field disjoint from `self.task_tags`.
        let tags = self
            .task_tags
            .get(&task)
            .expect("completed task must have a precomputed tag set");
        // MUTEX-006: release at command termination, regardless
        // of success or failure.
        if let Some((project_name, mutex)) = self.spawn_step_holds.remove(&task) {
            self.hold_set.release(&project_name, mutex.as_ref());
        }
        self.counts.release(tags);
        match result {
            Ok(record) => match record.state {
                RunState::Succeeded => {
                    self.ctx.observer.on_task_finished(&task, &record);
                    self.accum.record(&task, &record);
                    self.record_completion_invariants(task, record);
                }
                RunState::Failed => {
                    self.ctx.observer.on_task_finished(&task, &record);
                    self.accum.record(&task, &record);
                    let newly = self.ready_state.complete_failed(&task);
                    let cause = SkipCause::UpstreamFailed {
                        upstream: task.clone(),
                    };
                    emit_cascade_skips(self.ctx.observer, &mut self.outcomes, &cause, newly);
                    self.outcomes.insert(task, RunOutcome::Completed(record));
                }
                RunState::Cancelled => {
                    // The single-task lifecycle saw the run's
                    // cancellation token fire and signalled the
                    // child. Translate to the run-graph aggregate
                    // view: emit SignaledInFlight and cascade hard
                    // descendants as UpstreamCancelled (EXEC-011
                    // for cancellation).
                    let cancelled_record = CancelledRecord::SignaledInFlight {
                        task: task.clone(),
                        exit_status: record
                            .exit_status
                            .expect("a cancelled fresh run always carries an exit status"),
                        stdout_hash: record.stdout_hash,
                        stderr_hash: record.stderr_hash,
                    };
                    self.ctx
                        .observer
                        .on_task_cancelled(&task, &cancelled_record);
                    let newly = self.ready_state.complete_failed(&task);
                    emit_cascade_cancellations(self.ctx.observer, &mut self.outcomes, &task, newly);
                    self.outcomes
                        .insert(task, RunOutcome::Cancelled(cancelled_record));
                }
            },
            Err(err) => {
                let newly = self.ready_state.complete_failed(&task);
                let cause = SkipCause::UpstreamErrored {
                    upstream: task.clone(),
                };
                emit_cascade_skips(self.ctx.observer, &mut self.outcomes, &cause, newly);
                self.task_errors.insert(task, err);
            }
        }
    }

    /// Apply the runtime invariant checks to a just-completed
    /// (`Succeeded`) `record` and then record the completion.
    ///
    /// The `EXEC-020` output-overlap check runs first, followed
    /// by the `EXEC-019` runtime-cycle check; both run on every
    /// call. Either finding sets the sticky cancel flag, and a
    /// cycle additionally trips the cancel token in `self.ctx`.
    /// The task is then marked succeeded and stored in the
    /// outcomes map. When a cycle was found, the ready members of
    /// that cycle are skipped last, after the completion has been
    /// recorded.
    fn record_completion_invariants(&mut self, task: TaskId, record: CompletedRecord) {
        let overlap_found = check_and_record_output_overlap(
            &mut self.output_claims,
            &mut self.invariant_violations,
            &task,
            &record.materialised_outputs,
        );
        self.cancelled |= overlap_found;

        let cycle = check_and_record_runtime_cycle_for_completion(
            &mut self.augmented_edges,
            &mut self.invariant_violations,
            self.ctx.workspace,
            &task,
            &record.materialised_outputs,
        );
        if cycle.is_some() {
            self.cancelled = true;
            self.ctx.cancel.cancel();
        }

        // Recording is the same with or without a cycle.
        self.ready_state.complete_succeeded(&task);
        self.outcomes.insert(task, RunOutcome::Completed(record));

        if let Some(cycle_nodes) = cycle {
            skip_ready_cycle_members(
                self.ctx.observer,
                &mut self.ready_state,
                &mut self.outcomes,
                &cycle_nodes,
            );
        }
    }

    fn into_outcome(self) -> RunGraphOutcome {
        RunGraphOutcome {
            outcomes: self.outcomes,
            task_errors: self.task_errors,
            invariant_violations: self.invariant_violations,
        }
    }
}

#[cfg(test)]
mod tests {
    use std::collections::BTreeSet;

    use haz_domain::settings::WorkspaceSettings;
    use tokio_util::sync::CancellationToken;

    use crate::mock_impl::{MockBehaviour, MockProcessSpawner, MockSpec};
    use crate::process::Signal;
    use crate::run_graph::scheduler::run_graph;
    use crate::run_graph::test_fixtures::*;
    use crate::run_task::{CancelledRecord, RunSource, RunState, SkipCause};

    /// `EXEC-001`: an empty graph terminates immediately. Nothing
    /// is recorded in any of the three result collections, no
    /// observer event fires, and nothing is spawned.
    #[tokio::test]
    async fn exec_001_empty_graph_terminates_with_empty_outcomes() {
        let ws = make_workspace(vec![], WorkspaceSettings::default());
        let g = make_graph(vec![], vec![]);
        let fixture = Fixture::new(ws, g);
        let spawner = MockProcessSpawner::new();
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let result = run_graph(&ctx, 1_700_000_000).await.unwrap();

        assert!(result.outcomes.is_empty());
        assert!(result.task_errors.is_empty());
        // No completion happened, so no invariant check can have
        // recorded a violation.
        assert!(result.invariant_violations.is_empty());
        assert!(observer.events().is_empty());
        assert!(spawner.spawns().is_empty());
    }

    /// A lone task with one default mock spec produces exactly
    /// one outcome: a succeeded fresh run, with no task errors.
    #[tokio::test]
    async fn single_task_succeeds_writes_outcome() {
        let project = make_project("p", BTreeSet::new(), vec![make_task("build")]);
        let workspace = make_workspace(vec![project], WorkspaceSettings::default());
        let build = tid("p", "build");
        let graph = make_graph(vec![build.clone()], vec![]);
        let fixture = Fixture::new(workspace, graph);

        let spawner = MockProcessSpawner::new();
        push_n_default_specs(&spawner, 1);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let run = run_graph(&ctx, 1).await.unwrap();

        assert_eq!(run.outcomes.len(), 1);
        let completed = completed_for(&run.outcomes, &build);
        assert_eq!(completed.state, RunState::Succeeded);
        assert_eq!(completed.source, RunSource::FreshRun);
        assert!(run.task_errors.is_empty());
    }

    /// `EXEC-002`: in the hard-edge chain `a -> b -> c`, all
    /// three tasks get an outcome and start in chain order.
    #[tokio::test]
    async fn exec_002_linear_chain_runs_in_topological_order() {
        let tasks = vec![make_task("a"), make_task("b"), make_task("c")];
        let project = make_project("p", BTreeSet::new(), tasks);
        let workspace = make_workspace(vec![project], WorkspaceSettings::default());

        let (a, b, c) = (tid("p", "a"), tid("p", "b"), tid("p", "c"));
        let graph = make_graph(
            vec![a.clone(), b.clone(), c.clone()],
            vec![h_edge(a.clone(), b.clone()), h_edge(b.clone(), c.clone())],
        );
        let fixture = Fixture::new(workspace, graph);

        let spawner = MockProcessSpawner::new();
        push_n_default_specs(&spawner, 3);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let run = run_graph(&ctx, 1).await.unwrap();

        assert_eq!(run.outcomes.len(), 3);
        assert_eq!(observer.started_order(), vec![a, b, c]);
    }

    /// Diamond `top -> {l, r} -> bot`: all four tasks get an
    /// outcome, `top` starts first and `bot` starts last.
    #[tokio::test]
    async fn diamond_dag_runs_branches_and_joins_correctly() {
        let tasks = vec![
            make_task("bot"),
            make_task("l"),
            make_task("r"),
            make_task("top"),
        ];
        let project = make_project("p", BTreeSet::new(), tasks);
        let workspace = make_workspace(vec![project], WorkspaceSettings::default());

        let bot = tid("p", "bot");
        let left = tid("p", "l");
        let right = tid("p", "r");
        let top = tid("p", "top");
        let graph = make_graph(
            vec![bot.clone(), left.clone(), right.clone(), top.clone()],
            vec![
                h_edge(top.clone(), left.clone()),
                h_edge(top.clone(), right.clone()),
                h_edge(left, bot.clone()),
                h_edge(right, bot.clone()),
            ],
        );
        let fixture = Fixture::new(workspace, graph);

        let spawner = MockProcessSpawner::new();
        push_n_default_specs(&spawner, 4);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let run = run_graph(&ctx, 1).await.unwrap();
        assert_eq!(run.outcomes.len(), 4);

        let start_order = observer.started_order();
        // `top` is the only initial root, so it starts first.
        assert_eq!(start_order.first(), Some(&top));
        // `bot` waits for both `l` and `r` to succeed, so in any
        // valid schedule it is the last task to start.
        assert_eq!(start_order.last(), Some(&bot));
    }

    /// `EXEC-004`: with a fixed global cap of 1, two independent
    /// tasks run one after the other: `b` starts only after `a`
    /// has finished.
    #[tokio::test]
    async fn exec_004_global_cap_one_serialises_independent_tasks() {
        let project = make_project("p", BTreeSet::new(), vec![make_task("a"), make_task("b")]);
        let workspace = make_workspace(vec![project], workspace_settings_with(fixed_cap(1)));
        let (a, b) = (tid("p", "a"), tid("p", "b"));
        let graph = make_graph(vec![a.clone(), b.clone()], vec![]);
        let fixture = Fixture::new(workspace, graph);

        let spawner = MockProcessSpawner::new();
        push_n_default_specs(&spawner, 2);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        run_graph(&ctx, 1).await.unwrap();

        // With cap 1, Started(b) must come AFTER Finished(a).
        // The three bindings below keep their names because the
        // assertion message captures them by name.
        let events = observer.events();
        let started_b = events
            .iter()
            .position(|event| matches!(event, Event::Started(t) if *t == b))
            .expect("b started");
        let finished_a = events
            .iter()
            .position(|event| matches!(event, Event::Finished(t, _, _) if *t == a))
            .expect("a finished");
        assert!(
            started_b > finished_a,
            "b started ({started_b}) must follow a finished ({finished_a}): {events:?}",
        );
    }

    #[tokio::test]
    async fn exec_004_global_cap_two_admits_three_independent_in_bursts() {
        let p = make_project(
            "p",
            BTreeSet::new(),
            vec![make_task("a"), make_task("b"), make_task("c")],
        );
        let ws = make_workspace(vec![p], workspace_settings_with(fixed_cap(2)));
        let g = make_graph(vec![tid("p", "a"), tid("p", "b"), tid("p", "c")], vec![]);
        let fixture = Fixture::new(ws, g);
        let spawner = MockProcessSpawner::new();
        push_n_default_specs(&spawner, 3);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        run_graph(&ctx, 1).await.unwrap();

        // With cap 2 and 3 tasks: c starts only after one of {a, b}
        // has finished.
        let events = observer.events();
        let started_c = events
            .iter()
            .position(|e| matches!(e, Event::Started(t) if *t == tid("p", "c")))
            .expect("c started");
        let any_finish_before_c = events[..started_c].iter().any(
            |e| matches!(e, Event::Finished(t, _, _) if *t == tid("p", "a") || *t == tid("p", "b")),
        );
        assert!(
            any_finish_before_c,
            "c starting at {started_c} must follow at least one finish: {events:?}",
        );
    }

    /// EXEC-005: a per-tag cap of 1 on `db` serialises two tasks
    /// that live in different projects but both carry that tag,
    /// even though the global cap (10) would admit both at once.
    #[tokio::test]
    async fn exec_005_per_tag_cap_serialises_tagged_tasks_across_projects() {
        let db_tagged = |project: &'static str| {
            make_project(
                project,
                BTreeSet::from([tag("db")]),
                vec![make_task("compute")],
            )
        };
        let projects = vec![db_tagged("pa"), db_tagged("pb")];
        let workspace = make_workspace(
            projects,
            workspace_settings_with_tag_cap(fixed_cap(10), "db", 1),
        );
        let graph = make_graph(vec![tid("pa", "compute"), tid("pb", "compute")], vec![]);
        let fixture = Fixture::new(workspace, graph);

        let spawner = MockProcessSpawner::new();
        push_n_default_specs(&spawner, 2);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        run_graph(&ctx, 1).await.unwrap();

        let (pa_compute, pb_compute) = (tid("pa", "compute"), tid("pb", "compute"));
        let events = observer.events();
        let started_pb = events
            .iter()
            .position(|ev| matches!(ev, Event::Started(task) if *task == pb_compute))
            .expect("pb:compute started");
        let finished_pa = events
            .iter()
            .position(|ev| matches!(ev, Event::Finished(task, ..) if *task == pa_compute))
            .expect("pa:compute finished");
        assert!(
            finished_pa < started_pb,
            "pb:compute ({started_pb}) must follow pa:compute finish ({finished_pa}): {events:?}",
        );
    }

    #[tokio::test]
    async fn exec_003_canonical_order_under_partial_slot_availability() {
        // Three independent tasks under cap=1; canonical
        // `(ProjectName, TaskName)` order is (p,a) < (p,b) < (p,c).
        // The node insertion order below is intentionally reverse
        // declaration order to prove the scheduler ignores it.
        let p = make_project(
            "p",
            BTreeSet::new(),
            vec![make_task("c"), make_task("a"), make_task("b")],
        );
        let ws = make_workspace(vec![p], workspace_settings_with(fixed_cap(1)));
        let g = make_graph(vec![tid("p", "c"), tid("p", "b"), tid("p", "a")], vec![]);
        let fixture = Fixture::new(ws, g);
        let spawner = MockProcessSpawner::new();
        push_n_default_specs(&spawner, 3);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        run_graph(&ctx, 1).await.unwrap();

        assert_eq!(
            observer.started_order(),
            vec![tid("p", "a"), tid("p", "b"), tid("p", "c")],
        );
    }

    #[tokio::test]
    async fn exec_010_task_failure_does_not_halt_unrelated_subgraph() {
        // Two independent subgraphs: a -> a_child and b -> b_child.
        // `a` fails; `a_child` must be skipped; `b` and `b_child`
        // must both complete.
        let p = make_project(
            "p",
            BTreeSet::new(),
            vec![
                make_task("a"),
                make_task("a_child"),
                make_task("b"),
                make_task("b_child"),
            ],
        );
        let ws = make_workspace(vec![p], WorkspaceSettings::default());
        let g = make_graph(
            vec![
                tid("p", "a"),
                tid("p", "a_child"),
                tid("p", "b"),
                tid("p", "b_child"),
            ],
            vec![
                h_edge(tid("p", "a"), tid("p", "a_child")),
                h_edge(tid("p", "b"), tid("p", "b_child")),
            ],
        );
        let fixture = Fixture::new(ws, g);
        let spawner = MockProcessSpawner::new();
        // Canonical admission order: a (fail), b (succeed),
        // then b_child (succeed). a_child never spawns.
        push_spec_with_exit(&spawner, 1);
        push_n_default_specs(&spawner, 2);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let result = run_graph(&ctx, 1).await.unwrap();

        assert_eq!(
            completed_for(&result.outcomes, &tid("p", "a")).state,
            RunState::Failed,
            "a should be Failed",
        );
        // a_child is cascade-skipped, surfaced as
        // RunOutcome::Skipped with `a` as the root cause
        // (EXEC-011 + S3).
        assert_eq!(
            skipped_for(&result.outcomes, &tid("p", "a_child")).cause,
            SkipCause::UpstreamFailed {
                upstream: tid("p", "a"),
            },
            "a_child cascade-skipped with root cause `a`",
        );
        assert_eq!(
            completed_for(&result.outcomes, &tid("p", "b")).state,
            RunState::Succeeded,
            "sibling b should succeed",
        );
        assert_eq!(
            completed_for(&result.outcomes, &tid("p", "b_child")).state,
            RunState::Succeeded,
            "sibling b_child should succeed",
        );
    }

    #[tokio::test]
    async fn exec_011_task_failure_cascades_to_hard_descendants() {
        // Chain root -> mid -> leaf. `root` fails; `mid` and
        // `leaf` are both cascade-skipped and surface as
        // `RunOutcome::Skipped` with `root` (NOT `mid`) as
        // the cause on `leaf` per S3's root-cause attribution.
        let p = make_project(
            "p",
            BTreeSet::new(),
            vec![make_task("root"), make_task("mid"), make_task("leaf")],
        );
        let ws = make_workspace(vec![p], WorkspaceSettings::default());
        let g = make_graph(
            vec![tid("p", "root"), tid("p", "mid"), tid("p", "leaf")],
            vec![
                h_edge(tid("p", "root"), tid("p", "mid")),
                h_edge(tid("p", "mid"), tid("p", "leaf")),
            ],
        );
        let fixture = Fixture::new(ws, g);
        let spawner = MockProcessSpawner::new();
        // Only one spawn happens (root's). Push one failing spec.
        push_spec_with_exit(&spawner, 2);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let result = run_graph(&ctx, 1).await.unwrap();

        assert_eq!(result.outcomes.len(), 3);
        assert_eq!(
            completed_for(&result.outcomes, &tid("p", "root")).state,
            RunState::Failed,
        );
        let cause = SkipCause::UpstreamFailed {
            upstream: tid("p", "root"),
        };
        assert_eq!(
            skipped_for(&result.outcomes, &tid("p", "mid")).cause,
            cause,
            "mid records root cause = root",
        );
        assert_eq!(
            skipped_for(&result.outcomes, &tid("p", "leaf")).cause,
            cause,
            "leaf records root cause = root (NOT mid)",
        );
        assert_eq!(spawner.spawns().len(), 1);
    }

    /// EXEC-010: a soft edge `a -(soft)-> b` does not make `a` a
    /// hard predecessor of `b`. `b` has hard in-degree 0, is ready
    /// from the start, and is not skipped when `a` fails because
    /// the cascade only walks hard edges.
    #[tokio::test]
    async fn exec_010_soft_edge_predecessor_failure_does_not_cascade() {
        let (task_a, task_b) = (tid("p", "a"), tid("p", "b"));

        let project = make_project("p", BTreeSet::new(), vec![make_task("a"), make_task("b")]);
        let workspace = make_workspace(vec![project], WorkspaceSettings::default());
        let nodes = vec![tid("p", "a"), tid("p", "b")];
        let soft_edges = vec![s_edge(tid("p", "a"), tid("p", "b"))];
        let fixture = Fixture::new(workspace, make_graph(nodes, soft_edges));

        let spawner = MockProcessSpawner::new();
        // Specs are consumed in canonical admission order (a, b):
        // the first makes `a` exit 1, the second lets `b` succeed.
        push_spec_with_exit(&spawner, 1);
        push_n_default_specs(&spawner, 1);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let report = run_graph(&ctx, 1).await.unwrap();
        let outcomes = &report.outcomes;

        assert_eq!(completed_for(outcomes, &task_a).state, RunState::Failed);
        assert_eq!(
            completed_for(outcomes, &task_b).state,
            RunState::Succeeded,
            "soft-edge successor must not be cascade-skipped",
        );
    }

    #[tokio::test]
    async fn exec_011_observer_emits_no_started_or_finished_for_skipped_tasks() {
        // Chain root -> mid -> leaf with root failing. The
        // observer event stream must contain exactly one
        // Started (root), one Finished (root, Failed,
        // FreshRun), and one Skipped per cascade-descendant
        // (mid, leaf). Skipped tasks NEVER fire Started or
        // Finished (EXEC-011 + S4).
        let p = make_project(
            "p",
            BTreeSet::new(),
            vec![make_task("root"), make_task("mid"), make_task("leaf")],
        );
        let ws = make_workspace(vec![p], WorkspaceSettings::default());
        let g = make_graph(
            vec![tid("p", "root"), tid("p", "mid"), tid("p", "leaf")],
            vec![
                h_edge(tid("p", "root"), tid("p", "mid")),
                h_edge(tid("p", "mid"), tid("p", "leaf")),
            ],
        );
        let fixture = Fixture::new(ws, g);
        let spawner = MockProcessSpawner::new();
        push_spec_with_exit(&spawner, 3);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let _ = run_graph(&ctx, 1).await.unwrap();

        let events = observer.events();
        let cause = SkipCause::UpstreamFailed {
            upstream: tid("p", "root"),
        };
        // Skipped events fire in canonical (ProjectName,
        // TaskName) order because the cascade closure is a
        // BTreeSet; with task names "mid" and "leaf", lex
        // ordering puts "leaf" first regardless of the
        // graph's chain shape.
        assert_eq!(
            events,
            vec![
                Event::Started(tid("p", "root")),
                Event::Finished(tid("p", "root"), RunState::Failed, RunSource::FreshRun),
                Event::Skipped(tid("p", "leaf"), cause.clone()),
                Event::Skipped(tid("p", "mid"), cause),
            ],
            "expected exactly one Started + Finished for root \
             and one Skipped per descendant in canonical order",
        );
    }

    #[tokio::test]
    async fn exec_011_diamond_cascade_records_each_descendant_once() {
        // Diamond: top -> left, top -> right, left -> bot,
        // right -> bot. `top` fails and the cascade reaches
        // `bot` via two paths. The outcomes map MUST contain
        // exactly one Skipped entry for `bot` (not two), and
        // the observer MUST fire on_task_skipped exactly once
        // for `bot`. This exercises the diamond-uniqueness
        // invariant of complete_failed's returned set
        // end-to-end (the unit test
        // `complete_failed_diamond_attributes_each_descendant_once`
        // covers it at the state level).
        let p = make_project(
            "p",
            BTreeSet::new(),
            vec![
                make_task("bot"),
                make_task("left"),
                make_task("right"),
                make_task("top"),
            ],
        );
        let ws = make_workspace(vec![p], WorkspaceSettings::default());
        let g = make_graph(
            vec![
                tid("p", "bot"),
                tid("p", "left"),
                tid("p", "right"),
                tid("p", "top"),
            ],
            vec![
                h_edge(tid("p", "top"), tid("p", "left")),
                h_edge(tid("p", "top"), tid("p", "right")),
                h_edge(tid("p", "left"), tid("p", "bot")),
                h_edge(tid("p", "right"), tid("p", "bot")),
            ],
        );
        let fixture = Fixture::new(ws, g);
        let spawner = MockProcessSpawner::new();
        push_spec_with_exit(&spawner, 4);
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let result = run_graph(&ctx, 1).await.unwrap();

        // 4 outcomes total: top Completed(Failed), left/right/bot Skipped.
        assert_eq!(result.outcomes.len(), 4);
        assert_eq!(
            completed_for(&result.outcomes, &tid("p", "top")).state,
            RunState::Failed,
        );
        let cause = SkipCause::UpstreamFailed {
            upstream: tid("p", "top"),
        };
        for descendant in [tid("p", "left"), tid("p", "right"), tid("p", "bot")] {
            assert_eq!(
                skipped_for(&result.outcomes, &descendant).cause,
                cause,
                "{descendant:?} should be Skipped with root cause top",
            );
        }

        // The observer fires on_task_skipped exactly once per
        // descendant. `bot` is reachable along two cascade
        // paths but must be reported once.
        let skipped_count_bot = observer
            .events()
            .iter()
            .filter(|e| matches!(e, Event::Skipped(t, _) if *t == tid("p", "bot")))
            .count();
        assert_eq!(
            skipped_count_bot, 1,
            "bot must fire on_task_skipped exactly once across both cascade paths",
        );
    }

    // ====================================================================
    // EXEC-012..015: cancellation
    // ====================================================================

    /// Returns a [`MockSpec`] with the [`MockBehaviour::OnKillOnly`]
    /// behaviour and `kill_exit_code` as its exit code; every other
    /// field keeps its default. Used for a child that ignores
    /// SIGTERM and only exits on SIGKILL.
    fn exit_on_kill_only_spec(kill_exit_code: i32) -> MockSpec {
        MockSpec {
            exit_code: kill_exit_code,
            behaviour: MockBehaviour::OnKillOnly,
            ..Default::default()
        }
    }

    #[tokio::test]
    async fn exec_013_cancel_before_admission_marks_all_ready_as_run_cancelled() {
        // Cap=1, three independent tasks. The token fires
        // BEFORE run_graph starts. EXEC-013 step 1: every
        // still-ready task that has not been started enters
        // the cancelled state; no spawn ever happens.
        let p = make_project(
            "p",
            BTreeSet::new(),
            vec![make_task("a"), make_task("b"), make_task("c")],
        );
        let ws = make_workspace(vec![p], workspace_settings_with(fixed_cap(1)));
        let g = make_graph(vec![tid("p", "a"), tid("p", "b"), tid("p", "c")], vec![]);
        let fixture = Fixture::new(ws, g);
        let spawner = MockProcessSpawner::new();
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        fixture.cancel.cancel();
        let result = run_graph(&ctx, 1).await.unwrap();

        assert_eq!(result.outcomes.len(), 3);
        for name in ["a", "b", "c"] {
            match cancelled_for(&result.outcomes, &tid("p", name)) {
                CancelledRecord::RunCancelled { task } => {
                    assert_eq!(task, &tid("p", name));
                }
                other => panic!("expected RunCancelled for {name}, got {other:?}"),
            }
        }
        assert!(
            spawner.spawns().is_empty(),
            "no task should have been spawned: {:?}",
            spawner.spawns(),
        );
        // Observer fires on_task_cancelled (Event::Cancelled)
        // for each task; on_task_started fires for none.
        let events = observer.events();
        assert!(
            events
                .iter()
                .all(|e| matches!(e, Event::Cancelled(_, CancelledRecord::RunCancelled { .. }))),
            "expected only Cancelled events, got {events:?}",
        );
        assert_eq!(events.len(), 3);
    }

    /// EXEC-013: cancelling while a task is in flight signals its
    /// child. The mock child exits on SIGTERM, so the outcome is
    /// `SignaledInFlight` and `Signal::Terminate` is the only
    /// signal delivered.
    #[tokio::test]
    async fn exec_013_cancel_mid_flight_signals_in_flight_task() {
        let solo = tid("p", "solo");

        let project = make_project("p", BTreeSet::new(), vec![make_task("solo")]);
        let workspace = make_workspace(vec![project], workspace_settings_with(fixed_cap(1)));
        let fixture = Fixture::new(workspace, make_graph(vec![tid("p", "solo")], vec![]));

        let spawner = MockProcessSpawner::new();
        spawner.push_spec(exit_on_terminate_spec(0));
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        // Drive the scheduler and the cancel trigger concurrently.
        // The 20 ms delay gives the scheduler time to admit the
        // task and block in the spawn step on the mock's wait
        // before the token fires.
        let token = fixture.cancel.clone();
        let (report, ()) = tokio::join!(run_graph(&ctx, 1), async move {
            tokio::time::sleep(std::time::Duration::from_millis(20)).await;
            token.cancel();
        });
        let report = report.unwrap();

        let other = cancelled_for(&report.outcomes, &solo);
        if let CancelledRecord::SignaledInFlight { task, .. } = other {
            assert_eq!(task, &solo);
        } else {
            panic!("expected SignaledInFlight, got {other:?}");
        }
        assert_eq!(spawner.spawns().len(), 1);

        // The child exits politely on SIGTERM before the grace
        // period runs out, so no SIGKILL is ever sent.
        assert_eq!(
            spawner.signals_for(0).unwrap(),
            vec![Signal::Terminate],
            "expected exactly one Terminate, got {:?}",
            spawner.signals_for(0),
        );
    }

    /// EXEC-014: a child that ignores SIGTERM is escalated to
    /// SIGKILL once the cancel grace period has elapsed. The grace
    /// is set to 0.05 (50 ms) so the test does not wait for the
    /// default 5 s.
    #[tokio::test]
    async fn exec_014_cancel_mid_flight_escalates_to_kill_after_grace() {
        let solo = tid("p", "solo");

        let project = make_project("p", BTreeSet::new(), vec![make_task("solo")]);
        let settings = workspace_settings_with_grace(fixed_cap(1), 0.05);
        let workspace = make_workspace(vec![project], settings);
        let fixture = Fixture::new(workspace, make_graph(vec![tid("p", "solo")], vec![]));

        let spawner = MockProcessSpawner::new();
        // Stubborn child: only SIGKILL releases the wait.
        spawner.push_spec(exit_on_kill_only_spec(137));
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let token = fixture.cancel.clone();
        let (report, ()) = tokio::join!(run_graph(&ctx, 1), async move {
            tokio::time::sleep(std::time::Duration::from_millis(20)).await;
            token.cancel();
        });
        let report = report.unwrap();

        let other = cancelled_for(&report.outcomes, &solo);
        assert!(
            matches!(other, CancelledRecord::SignaledInFlight { .. }),
            "expected SignaledInFlight, got {other:?}",
        );

        // SIGTERM did not end the wait, the grace timer fired, and
        // the executor escalated to SIGKILL.
        assert_eq!(
            spawner.signals_for(0).unwrap(),
            vec![Signal::Terminate, Signal::Kill],
        );
    }

    /// EXEC-014: with `cancel_grace = 0` the grace dance collapses
    /// and SIGTERM is followed by SIGKILL with no wait in between.
    /// The stubborn `OnKillOnly` child then exits on SIGKILL.
    #[tokio::test]
    async fn exec_014_cancel_grace_zero_sends_kill_immediately() {
        let solo = tid("p", "solo");

        let project = make_project("p", BTreeSet::new(), vec![make_task("solo")]);
        let settings = workspace_settings_with_grace(fixed_cap(1), 0.0);
        let workspace = make_workspace(vec![project], settings);
        let fixture = Fixture::new(workspace, make_graph(vec![tid("p", "solo")], vec![]));

        let spawner = MockProcessSpawner::new();
        spawner.push_spec(exit_on_kill_only_spec(137));
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let token = fixture.cancel.clone();
        let (report, ()) = tokio::join!(run_graph(&ctx, 1), async move {
            tokio::time::sleep(std::time::Duration::from_millis(20)).await;
            token.cancel();
        });
        let report = report.unwrap();

        let other = cancelled_for(&report.outcomes, &solo);
        assert!(
            matches!(other, CancelledRecord::SignaledInFlight { .. }),
            "expected SignaledInFlight, got {other:?}",
        );
        // Both signals are still recorded, in order.
        assert_eq!(
            spawner.signals_for(0).unwrap(),
            vec![Signal::Terminate, Signal::Kill],
        );
    }

    #[tokio::test]
    async fn exec_011_cancelled_task_cascades_descendants_as_upstream_cancelled() {
        // Chain root -> mid -> leaf. The cancel fires while
        // root is in-flight; root's per-future grace dance
        // signals the child, root ends as SignaledInFlight,
        // and the scheduler cascades hard descendants as
        // UpstreamCancelled per the cancellation arm of
        // EXEC-011.
        let p = make_project(
            "p",
            BTreeSet::new(),
            vec![make_task("root"), make_task("mid"), make_task("leaf")],
        );
        let ws = make_workspace(vec![p], workspace_settings_with(fixed_cap(1)));
        let g = make_graph(
            vec![tid("p", "root"), tid("p", "mid"), tid("p", "leaf")],
            vec![
                h_edge(tid("p", "root"), tid("p", "mid")),
                h_edge(tid("p", "mid"), tid("p", "leaf")),
            ],
        );
        let fixture = Fixture::new(ws, g);
        let spawner = MockProcessSpawner::new();
        spawner.push_spec(exit_on_terminate_spec(0));
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        let trigger_cancel = fixture.cancel.clone();
        let trigger = async move {
            tokio::time::sleep(std::time::Duration::from_millis(20)).await;
            trigger_cancel.cancel();
        };

        let (result, ()) = tokio::join!(run_graph(&ctx, 1), trigger);
        let result = result.unwrap();

        assert_eq!(result.outcomes.len(), 3);
        match cancelled_for(&result.outcomes, &tid("p", "root")) {
            CancelledRecord::SignaledInFlight { task, .. } => {
                assert_eq!(task, &tid("p", "root"));
            }
            other => panic!("expected SignaledInFlight for root, got {other:?}"),
        }
        for name in ["mid", "leaf"] {
            match cancelled_for(&result.outcomes, &tid("p", name)) {
                CancelledRecord::UpstreamCancelled { task, upstream } => {
                    assert_eq!(task, &tid("p", name));
                    assert_eq!(
                        upstream,
                        &tid("p", "root"),
                        "cascade attributes the root cancelled task to {name}",
                    );
                }
                other => {
                    panic!("expected UpstreamCancelled for {name}, got {other:?}")
                }
            }
        }
    }

    /// EXEC-015: a cancelled run must not store a cache entry.
    ///
    /// Two runs share one fixture. Run 1 is cancelled mid-flight;
    /// run 2 uses a fresh token, spawner, and observer and must
    /// report `RunSource::FreshRun` for the same task.
    #[tokio::test]
    async fn exec_015_cancelled_run_does_not_produce_cache_entry() {
        // Run 1 cancels mid-flight; EXEC-015 forbids the
        // cache store. Run 2 over the same fixture (fresh
        // never-cancelled token, fresh observer/spawner)
        // MUST be a cache miss (FreshRun), confirming the
        // cancelled run left nothing behind.
        let p = make_project("p", BTreeSet::new(), vec![make_task("solo")]);
        let ws = make_workspace(vec![p], workspace_settings_with(fixed_cap(1)));
        let g = make_graph(vec![tid("p", "solo")], vec![]);
        let fixture = Fixture::new(ws, g);

        // Run 1: cancel mid-flight. Scoped in its own block so
        // the run-1 spawner, observer, and context are dropped
        // before run 2 is set up.
        {
            let spawner1 = MockProcessSpawner::new();
            spawner1.push_spec(exit_on_terminate_spec(0));
            let observer1 = Recorder::default();
            let ctx1 = make_ctx(&fixture, &spawner1, &observer1);
            let trigger_cancel = fixture.cancel.clone();
            let trigger = async move {
                // Delay the cancel so the task is expected to be
                // in flight when the token fires.
                tokio::time::sleep(std::time::Duration::from_millis(20)).await;
                trigger_cancel.cancel();
            };
            let (run1, ()) = tokio::join!(run_graph(&ctx1, 1), trigger);
            let run1 = run1.unwrap();
            // Precondition for the rest of the test: run 1 really
            // was cancelled while the task was in flight.
            match cancelled_for(&run1.outcomes, &tid("p", "solo")) {
                CancelledRecord::SignaledInFlight { .. } => {}
                other => panic!("run 1 expected SignaledInFlight, got {other:?}"),
            }
        }

        // Run 2: fresh token (the fixture's was cancelled),
        // ExitImmediately mock, no cancel-trigger.
        // NOTE(review): `run_graph` is called with 2 here and 1
        // in run 1; presumably a per-run identifier. Confirm
        // against `run_graph`'s signature.
        let fresh_cancel = CancellationToken::new();
        let spawner2 = MockProcessSpawner::new();
        push_n_default_specs(&spawner2, 1);
        let observer2 = Recorder::default();
        let ctx2 = make_ctx_with_cancel(&fixture, &spawner2, &observer2, &fresh_cancel);
        let run2 = run_graph(&ctx2, 2).await.unwrap();

        let rec2 = completed_for(&run2.outcomes, &tid("p", "solo"));
        assert_eq!(
            rec2.source,
            RunSource::FreshRun,
            "run 2 must be a fresh run; a cache hit would mean run 1 stored an entry",
        );
        assert_eq!(rec2.state, RunState::Succeeded);
        // Exactly one fresh spawn occurred in run 2 (the
        // mock recorded it); run 1 also had one, but on a
        // separate spawner.
        assert_eq!(spawner2.spawns().len(), 1);
    }

    /// EXEC-010: cancellation only affects what is still in flight.
    /// Two independent tasks run under cap = 2. `fast` exits
    /// immediately and completes before the token fires; `slow`
    /// blocks until SIGTERM and is caught in flight. `fast` must
    /// keep its Completed outcome and `slow` must end as
    /// `SignaledInFlight`.
    #[tokio::test]
    async fn exec_010_cancel_one_subgraph_does_not_halt_another() {
        let (fast, slow) = (tid("p", "fast"), tid("p", "slow"));

        // The two tasks use different commands so their cache keys
        // do not collide (CACHE-001 content addressing).
        let tasks = vec![
            make_task_with("fast", &["echo", "fast"], None),
            make_task_with("slow", &["echo", "slow"], None),
        ];
        let project = make_project("p", BTreeSet::new(), tasks);
        let workspace = make_workspace(vec![project], workspace_settings_with(fixed_cap(2)));
        let graph = make_graph(vec![tid("p", "fast"), tid("p", "slow")], vec![]);
        let fixture = Fixture::new(workspace, graph);

        let spawner = MockProcessSpawner::new();
        // Admission is canonical lex order ("fast", then "slow")
        // and the mock hands out specs FIFO, so `fast`'s spec has
        // to be queued first.
        for spec in [MockSpec::default(), exit_on_terminate_spec(0)] {
            spawner.push_spec(spec);
        }
        let observer = Recorder::default();
        let ctx = make_ctx(&fixture, &spawner, &observer);

        // The 30 ms delay gives the scheduler time to admit both
        // tasks, finish `fast`, and have `slow` parked in its
        // spawn-step wait before the cancel arrives.
        let token = fixture.cancel.clone();
        let (report, ()) = tokio::join!(run_graph(&ctx, 1), async move {
            tokio::time::sleep(std::time::Duration::from_millis(30)).await;
            token.cancel();
        });
        let report = report.unwrap();

        assert_eq!(report.outcomes.len(), 2);
        assert_eq!(
            completed_for(&report.outcomes, &fast).state,
            RunState::Succeeded
        );
        let other = cancelled_for(&report.outcomes, &slow);
        assert!(
            matches!(other, CancelledRecord::SignaledInFlight { .. }),
            "expected SignaledInFlight for slow, got {other:?}",
        );
    }
}