// ktstr 0.17.0 - Docs.rs

use super::*;

/// One significant per-metric finding produced by [`compare_rows_by`].
///
/// `pairing_key` carries the dynamic identity the row pair joined
/// on — `scenario` plus every NON-slicing dimension's value. The
/// table renderer in [`compare_partitions`] decodes the key against
/// the slicing-dim list to produce a label like
/// `scenario/topology/work_type` (when topology + work_type are
/// pairing dims) or just `scenario` (when every other dim slices).
///
/// The `scenario` / `topology` / `work_type` fields carry the
/// matched B-side row's values verbatim for legacy-shape consumers
/// and test fixtures that pre-date the dimensional-slicing refactor.
/// New code should read [`Finding::pairing_key`] directly so the
/// slicing-dim variation stays visible.
///
/// `metric` is the registry entry the comparison ran against;
/// consumers read polarity, display unit, and name through it
/// directly without re-looking up [`metric_def`].
///
/// A `Finding` is only pushed for deltas that cleared both the
/// absolute and the relative gate; sub-threshold deltas are counted
/// in [`CompareReport::unchanged`] instead and never materialise
/// here.
#[derive(Debug, Clone, serde::Serialize)]
pub(crate) struct Finding {
    /// Key the A-side and B-side rows were joined on.
    pub pairing_key: PairingKey,
    /// Scenario name, copied from the B-side row of the pair.
    pub scenario: String,
    /// Topology label, copied from the B-side row of the pair.
    pub topology: String,
    /// Work type label, copied from the B-side row of the pair.
    pub work_type: String,
    /// Registry entry for the metric this finding is about.
    pub metric: &'static MetricDef,
    /// A-side metric value; a metric the row does not carry reads
    /// as `0.0`.
    pub val_a: f64,
    /// B-side metric value; a metric the row does not carry reads
    /// as `0.0`.
    pub val_b: f64,
    /// Signed raw difference `val_b - val_a` (NOT polarity-adjusted).
    pub delta: f64,
    /// `true` when `delta` points in the metric's "worse" direction:
    /// positive for higher-is-worse metrics, negative otherwise.
    /// `false` marks an improvement.
    pub is_regression: bool,
}

/// Aggregate result of comparing two row sets via [`compare_rows_by`].
///
/// `regressions` and `improvements` count significant entries in
/// `findings`; `unchanged` counts metrics that fell below the dual
/// gate; `excluded_pairs` counts matched row pairs where either
/// side is not a real pass — `fail`, `inconclusive`, and `skip`
/// rows all route here. The field name captures "excluded from
/// regression math" rather than encoding any of the three excluded
/// states, because the per-side disposition (which side, which
/// state) is recoverable from the individual `GauntletRow::is_*`
/// accessors when the operator drills in.
///
/// `new_in_b` counts B-side rows whose key has no match on the A
/// side; the converse is `removed_from_a`. The filter (when set)
/// applies to every counter, so excluded rows do not contribute.
///
/// `phase_deltas` and `unpaired_phases` carry the per-phase
/// comparison shape derived from
/// [`crate::assert::ScenarioStats::phases`] on each row pair. The
/// phase pass runs after the scalar-row pass via the same pairing
/// key; rows whose `phases` slice is empty on either side
/// contribute nothing here (single-phase scenarios skip the
/// per-phase view, falling back to the scalar findings already in
/// `findings`).
#[derive(Debug, Clone, Default, serde::Serialize)]
pub(crate) struct CompareReport {
    /// Number of entries in `findings` with `is_regression == true`.
    pub regressions: u32,
    /// Number of entries in `findings` with `is_regression == false`.
    pub improvements: u32,
    /// Number of (row pair, metric) comparisons whose delta failed
    /// the absolute gate or the relative gate. Comparisons where
    /// both sides read as (near) zero are skipped entirely and are
    /// not counted here.
    pub unchanged: u32,
    /// Matched row pairs dropped because either side is a fail,
    /// inconclusive, or skip row.
    pub excluded_pairs: u32,
    /// B-side rows (after filtering) with no A-side row under the
    /// same pairing key.
    pub new_in_b: u32,
    /// A-side rows (after filtering) with no B-side row under the
    /// same pairing key.
    pub removed_from_a: u32,
    /// One entry per significant scalar metric delta.
    pub findings: Vec<Finding>,
    /// Per-phase metric deltas for phases present on both sides.
    pub phase_deltas: Vec<PhaseDeltaRow>,
    /// Phase buckets present on exactly one side of a matched pair.
    pub unpaired_phases: Vec<UnpairedPhaseRow>,
}

/// Which side of an A/B comparison a row belongs to. Typed surface
/// for the per-phase rows so new code does not propagate the
/// `"A"` / `"B"` string-literal pattern the scalar-finding path
/// uses (those existing string-based call sites are kept as-is).
///
/// Use [`ComparePartition::as_str`] when the one-letter label is
/// needed for display.
#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize)]
pub(crate) enum ComparePartition {
    /// The first (A) partition of the comparison.
    A,
    /// The second (B) partition of the comparison.
    B,
}

impl ComparePartition {
    /// Render the side as the same one-letter label
    /// `render_side_label` produces for the scalar table headers,
    /// so the new per-phase tables and the existing scalar table
    /// share the same operator-facing identifier.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::A => "A",
            Self::B => "B",
        }
    }
}

/// One per-phase metric delta between matched A/B rows. Populated
/// when both rows carry a [`crate::assert::PhaseBucket`] at the
/// same `step_index` AND both buckets carry a value for the same
/// metric name. Generated by the per-phase pass, which runs on each
/// matched row pair right after the scalar-finding compare.
#[derive(Clone, Debug, serde::Serialize)]
pub(crate) struct PhaseDeltaRow {
    /// Same pairing key the scalar [`Finding`] uses, so the per-
    /// phase delta is unambiguously scoped to the matched row pair.
    pub pairing_key: PairingKey,
    /// 1-indexed phase encoding per the framework convention:
    /// `0` is BASELINE, `1..=N` are scenario Step ordinals.
    pub step_index: u16,
    /// Human-readable label mirroring
    /// [`crate::assert::PhaseBucket::label`] (`"BASELINE"` or
    /// `"Step[k-1]"`). The renderer prints this in the PHASE
    /// column.
    pub label: String,
    /// Registry entry the delta was computed against. Carries the
    /// `MetricKind` (Counter / Peak / Gauge / Timestamp) the
    /// phase aggregator used to fold the per-sample readings into
    /// the per-phase value, plus the `Polarity` the renderer uses
    /// to classify the delta direction.
    pub metric: &'static MetricDef,
    /// A-side phase-aggregated value (from
    /// `phase_bucket.metrics[metric.name]`).
    pub a: f64,
    /// B-side phase-aggregated value.
    pub b: f64,
    /// `b - a` for higher-is-worse metrics, `a - b` for
    /// lower-is-worse; matches the scalar [`Finding::delta`]
    /// polarity convention.
    ///
    /// NOTE(review): the scalar pass stores the raw `val_b - val_a`
    /// in [`Finding::delta`] regardless of polarity, so "polarity-
    /// flipped" and "matches the scalar convention" cannot both be
    /// true — confirm against the code that populates this field.
    /// The render-side threshold gate only reads `delta.abs()`, so
    /// it is unaffected by the sign convention either way.
    pub delta: f64,
    /// `true` when the delta exceeds the dual-gate threshold in
    /// the regression direction (per metric polarity).
    pub is_regression: bool,
}

/// One per-phase bucket present on exactly one side of the A/B
/// comparison. Generated when a matched row pair has phase
/// coverage asymmetry — e.g. A ran a 3-Step scenario and B ran a
/// 4-Step version, so B's Step\[3\] has no A counterpart. The
/// renderer surfaces these in a dedicated "Phase Coverage
/// Asymmetry" section so the operator sees explicitly which side
/// is missing data; silently dropping them would mask the
/// scenario-shape difference.
#[derive(Clone, Debug, serde::Serialize)]
pub(crate) struct UnpairedPhaseRow {
    /// Which side carries the orphan bucket.
    pub side: ComparePartition,
    /// Same pairing key the matched [`PhaseDeltaRow`]s use.
    pub pairing_key: PairingKey,
    /// Phase index of the orphan bucket; same encoding as
    /// [`PhaseDeltaRow::step_index`] (`0` is BASELINE, `1..=N` are
    /// scenario Step ordinals).
    pub step_index: u16,
    /// Human-readable phase label of the orphan bucket; same role
    /// as [`PhaseDeltaRow::label`].
    pub label: String,
    /// Per-metric values carried verbatim from the orphan bucket's
    /// [`crate::assert::PhaseBucket::metrics`] map; the renderer
    /// prints them one-sided with `—` (em dash) in the absent
    /// column.
    pub metrics: std::collections::BTreeMap<String, f64>,
}

/// Per-metric threshold policy driving `compare_rows` /
/// `compare_partitions`.
///
/// Resolution priority for a given metric's relative significance
/// threshold, highest first:
///
/// 1. `per_metric_percent[metric_name]` — explicit override for
///    this metric.
/// 2. `default_percent` — uniform override across every metric
///    not listed in the map (equivalent to the old `--threshold N`
///    CLI flag).
/// 3. The metric's built-in `default_rel` from the `METRICS`
///    registry — the "no policy" fallback.
///
/// Values in the struct are stored as PERCENT (e.g. `10.0` meaning
/// 10%), NOT fractions. [`Self::rel_threshold`] does the `/100.0`
/// conversion so every caller inside `compare_rows` reads a
/// fraction without re-deriving the division.
///
/// Note on the registry-fallback branch: the `default_rel` field
/// on `MetricDef` is already a FRACTION (e.g. `0.25` for 25%),
/// not a percent. `rel_threshold` returns it verbatim — it
/// does NOT divide by 100. Only the override branches
/// (per-metric map, `default_percent`) do the percent-to-fraction
/// conversion because their inputs are percents. This asymmetry
/// is deliberate so callers supplying CLI/file-based overrides
/// work in human-intuitive percent units while the registry
/// defaults (which already ship in fraction form) pass through
/// unchanged.
///
/// The struct is `serde::Serialize` / `serde::Deserialize` so
/// `cargo ktstr stats compare --policy <path>` can load a
/// JSON-persisted policy file. Default construction produces an
/// empty policy that uses every registry default; [`Self::uniform`]
/// reproduces the old `--threshold N` behaviour without any
/// per-metric override plumbing at the call site.
///
/// Deserialization shape: `#[serde(default)]` makes both fields
/// optional in the policy file (an absent field takes its
/// `Default` value, so `{}` is the empty policy), while
/// `deny_unknown_fields` turns a misspelled top-level field name
/// into a parse error instead of a silently ignored key.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
#[serde(default, deny_unknown_fields)]
pub struct ComparisonPolicy {
    /// Uniform override: when `Some(p)`, every metric whose name is
    /// NOT in [`Self::per_metric_percent`] uses `p / 100.0` as its
    /// relative threshold. `None` falls through to the registry
    /// `default_rel`. Stored as percent (e.g. `10.0` for 10%).
    pub default_percent: Option<f64>,
    /// Per-metric overrides keyed by metric name. Each value is a
    /// percent (e.g. `15.0` → 15%). An entry here takes precedence
    /// over both [`Self::default_percent`] and the registry
    /// `default_rel`.
    pub per_metric_percent: BTreeMap<String, f64>,
}

/// CLI-controlled rendering of the per-phase delta block in
/// `cargo ktstr stats compare`. Bundled as a struct so the
/// 5-flag clap surface threads through `compare_partitions` as
/// a single positional rather than five. Default value renders
/// every phase / every metric / every paired row — equivalent
/// to passing no phase flags.
///
/// The flags compose via AND on independent axes (block-level
/// suppression × phase-id × row-significance), with three
/// mutex constraints enforced at CLI parse time:
///
/// - `--no-phases` excludes every other phase flag (the whole
///   block is suppressed; refining what to render is a
///   contradiction).
/// - `--phases-only` excludes `--no-phases` (same reason).
/// - `--steps-only` excludes `--phase` (one of them collapses
///   to a single bucket; the other suppresses BASELINE — both
///   together are confused phrasing).
///
/// The 5 flags trigger renderer behaviour ONLY — the data
/// layer in `compare_rows_by` always emits the full set of
/// matched `PhaseDeltaRow`s and `UnpairedPhaseRow`s so
/// programmatic consumers of `CompareReport` see the
/// unfiltered surface. Filtering is render-time projection.
#[derive(Debug, Default, Clone)]
pub struct PhaseDisplayOptions {
    /// `--no-phases`: suppress the per-phase delta + unpaired
    /// tables entirely. The scalar findings table and footer
    /// render unchanged; the only effect is hiding the phase
    /// block (and the phase footer hint). Mutually exclusive
    /// with every other phase flag at CLI parse time.
    pub no_phases: bool,
    /// `--phases-only`: suppress the scalar findings table and
    /// the host-context delta; render ONLY the per-phase block.
    /// Useful for narrowing investigation to a phase regression
    /// when the scalar rollup is noise. Composes with
    /// `--steps-only`, `--phase`, and `--phase-threshold`.
    pub phases_only: bool,
    /// `--steps-only`: within the per-phase block, suppress
    /// the BASELINE bucket (`step_index == 0`); render only
    /// scenario Step buckets. Useful when the BASELINE settle
    /// window is dominated by scheduler startup transients.
    /// Mutually exclusive with `--phase`.
    pub steps_only: bool,
    /// `--phase <N>`: within the per-phase block, render only
    /// rows whose `step_index == N`. `0` selects BASELINE;
    /// `1..=N` selects scenario Step ordinals (1 → Step\[0\],
    /// 2 → Step\[1\], ...). Integer chosen over label so a label
    /// rename (`"Step[0]"` → `"Step:0"`) doesn't break operator
    /// CI invocations. Mutually exclusive with `--steps-only`.
    pub phase: Option<u16>,
    /// `--phase-threshold <PCT>`: render-side relative-delta
    /// gate for the per-phase pass. Suppresses paired rows
    /// where `|delta| / max(|a|, 1.0) < PCT / 100.0`. `0.0`
    /// shows every paired row with a non-NaN delta. When the flag
    /// is absent (`None`) the render gate is disabled — every
    /// paired row passes — and the threshold resolver defers to
    /// the scalar [`ComparisonPolicy`] chain (per-metric override
    /// → `default_percent` → registry `default_rel`). Independent
    /// from the scalar `--threshold` — the two passes have separate
    /// filters so an operator can widen the phase view without
    /// widening the scalar view (the diagnostic "show me every
    /// per-phase delta but only load-bearing scalar findings"
    /// use case).
    pub phase_threshold: Option<f64>,
}

impl PhaseDisplayOptions {
    /// Resolve the per-phase relative threshold for a given
    /// metric. Returns the override fraction when
    /// `phase_threshold` is set, else falls through to the
    /// `ComparisonPolicy` resolution the scalar pass uses. The
    /// `metric_name` + `default_rel` shape mirrors
    /// [`ComparisonPolicy::rel_threshold`] so the two surfaces
    /// stay symmetric.
    pub fn rel_threshold(
        &self,
        policy: &ComparisonPolicy,
        metric_name: &str,
        default_rel: f64,
    ) -> f64 {
        match self.phase_threshold {
            Some(pct) => pct / 100.0,
            None => policy.rel_threshold(metric_name, default_rel),
        }
    }

    /// True when a phase row at the given `step_index` should
    /// render under the current display flags. Combines the two
    /// step-axis predicates (`--phase <N>` filter and
    /// `--steps-only` BASELINE-suppressor) into a single
    /// row-level decision the renderer can apply uniformly
    /// across `PhaseDeltaRow` and `UnpairedPhaseRow` vecs.
    /// Returns `true` when no relevant flag is set (default
    /// path: every step renders).
    pub fn matches_phase(&self, step_index: u16) -> bool {
        if let Some(want) = self.phase
            && step_index != want
        {
            return false;
        }
        if self.steps_only && step_index == 0 {
            return false;
        }
        true
    }

    /// True when a [`PhaseDeltaRow`] passes the
    /// `--phase-threshold` relative-significance gate. Computes
    /// `|delta| / max(|a|, 1.0) >= phase_threshold / 100.0` —
    /// the `max(|a|, 1.0)` denominator floor prevents NaN from
    /// `a == 0.0` (the row that pairs a zero against any
    /// non-zero produces a delta of finite magnitude that
    /// should not divide by zero). Returns `true` when no
    /// flag is set (default path: every row passes; per the
    /// `--phase-threshold` clap doc — absence keeps every
    /// paired row in the rendered output).
    ///
    /// `pub(crate)` rather than `pub` because [`PhaseDeltaRow`]
    /// is `pub(crate)` — the row type is an internal renderer
    /// detail, not a public surface. External consumers reach
    /// per-row decisions through the rendered output, not by
    /// instantiating a `PhaseDeltaRow` themselves.
    pub(crate) fn passes_delta_threshold(&self, delta: &PhaseDeltaRow) -> bool {
        let Some(pct) = self.phase_threshold else {
            return true;
        };
        let denom = delta.a.abs().max(1.0);
        let rel = delta.delta.abs() / denom;
        rel >= pct / 100.0
    }
}

impl ComparisonPolicy {
    /// Empty policy — every metric uses its `METRICS` registry
    /// default. Equivalent to the old `--threshold None` CLI path.
    pub fn new() -> Self {
        Self::default()
    }

    /// Uniform override: every metric uses `percent / 100.0`.
    /// Mirrors the old `--threshold N` CLI behaviour; the CLI
    /// dispatch at `cargo-ktstr stats compare --threshold N`
    /// constructs a policy via this constructor.
    ///
    /// The value is NOT checked here; call [`Self::validate`] on the
    /// result when `percent` comes from user input.
    pub fn uniform(percent: f64) -> Self {
        Self {
            default_percent: Some(percent),
            per_metric_percent: BTreeMap::new(),
        }
    }

    /// Load a JSON-persisted policy from a file. Errors propagate
    /// the read / parse reason as an `anyhow::Error` with the file
    /// path in the context chain so a malformed `--policy path.json`
    /// surfaces an actionable message rather than a generic
    /// "invalid JSON."
    ///
    /// Validates after parsing via [`Self::validate`]: rejects
    /// negative or NaN thresholds (a misconfigured 10 vs -10 would
    /// invert the dual-gate logic at the `.abs() >= rel_thresh`
    /// check and silently classify every metric as significant)
    /// and rejects per-metric keys not registered in `METRICS`
    /// (a typo like `"wrost_spread"` would otherwise be silently
    /// ignored — the key simply never matches during resolution
    /// and the metric falls through to `default_percent`).
    ///
    /// # Errors
    ///
    /// Returns an error when the file cannot be read, when its
    /// contents do not deserialize into a policy, or when the parsed
    /// policy fails [`Self::validate`].
    pub fn load_json(path: &std::path::Path) -> anyhow::Result<Self> {
        use anyhow::Context;
        let data = std::fs::read_to_string(path)
            .with_context(|| format!("read comparison policy from {}", path.display()))?;
        let policy: ComparisonPolicy = serde_json::from_str(&data)
            .with_context(|| format!("parse comparison policy from {}", path.display()))?;
        policy
            .validate()
            .with_context(|| format!("validate comparison policy from {}", path.display()))?;
        Ok(policy)
    }

    /// Structural validation separate from parsing so both the
    /// `load_json` path and programmatic constructors (after
    /// [`Self::uniform`] with a user-supplied percent) can share
    /// one set of invariants without re-implementing checks at
    /// each call site. Called automatically by [`Self::load_json`];
    /// CLI dispatch should call it after constructing via
    /// [`Self::uniform`] to catch `--threshold -10` at the
    /// entry point rather than deep inside `compare_rows` where
    /// the dual-gate math silently misbehaves.
    ///
    /// Rejects:
    /// - NaN `default_percent` or NaN entries in
    ///   `per_metric_percent`. `NaN < 0.0` is `false`, so a plain
    ///   sign check lets NaN through; every later
    ///   `rel_delta < threshold` comparison is then `false` too, and
    ///   every delta that clears the absolute gate is reported as
    ///   significant.
    /// - Negative `default_percent` (nonsensical — thresholds are
    ///   absolute-value comparisons).
    /// - Negative entries in `per_metric_percent`.
    /// - Per-metric keys not in the `METRICS` registry (silent
    ///   typos would otherwise fall through to `default_percent`
    ///   unnoticed).
    ///
    /// # Errors
    ///
    /// Returns the first violated invariant as an `anyhow::Error`.
    /// For a per-metric entry the unknown-key check runs before the
    /// value checks.
    pub fn validate(&self) -> anyhow::Result<()> {
        if let Some(p) = self.default_percent {
            if p.is_nan() {
                anyhow::bail!(
                    "ComparisonPolicy: default_percent must be a number; got NaN. \
                     A NaN threshold makes every relative-gate comparison false, \
                     which would silently classify every delta that clears the \
                     absolute gate as significant."
                );
            }
            if p < 0.0 {
                anyhow::bail!(
                    "ComparisonPolicy: default_percent must be non-negative; got {p}. \
                     Thresholds are absolute-value comparisons — a negative value \
                     would invert the dual-gate logic and silently classify every \
                     delta as significant."
                );
            }
        }
        for (name, p) in &self.per_metric_percent {
            if !METRICS.iter().any(|m| m.name == name) {
                let known: Vec<&str> = METRICS.iter().map(|m| m.name).collect();
                anyhow::bail!(
                    "ComparisonPolicy: per_metric_percent contains unknown \
                     metric `{name}`. A typo in the key would silently fall \
                     through to default_percent. Registered metrics: {}",
                    known.join(", "),
                );
            }
            if p.is_nan() {
                anyhow::bail!(
                    "ComparisonPolicy: per_metric_percent[{name:?}] must be \
                     a number; got NaN",
                );
            }
            if *p < 0.0 {
                anyhow::bail!(
                    "ComparisonPolicy: per_metric_percent[{name:?}] must be \
                     non-negative; got {p}",
                );
            }
        }
        Ok(())
    }

    /// Resolve the mutually-exclusive `--threshold` / `--policy` CLI
    /// pair into a policy: `--threshold N` is sugar for a uniform N%
    /// default (validated via [`Self::validate`]); `--policy PATH`
    /// loads a per-metric JSON policy; neither falls through to the
    /// registry defaults. Shared by every subcommand that accepts the
    /// pair (`stats compare`, `perf-delta`) so the resolution rules —
    /// and the "exactly one of the two" contract — live in one place.
    ///
    /// Both flags set is rejected with an error. At the CLI call
    /// sites clap `conflicts_with` makes that unreachable, but this is
    /// a library entry point and must not panic on its inputs; the
    /// error is the defence-in-depth backstop.
    ///
    /// # Errors
    ///
    /// Returns an error when both flags are supplied, when the
    /// uniform threshold fails validation, or when loading the policy
    /// file fails (see [`Self::load_json`]).
    pub fn from_cli_flags(
        threshold: Option<f64>,
        policy: Option<&std::path::Path>,
    ) -> anyhow::Result<Self> {
        match (threshold, policy) {
            (Some(t), None) => {
                let p = Self::uniform(t);
                p.validate()?;
                Ok(p)
            }
            (None, Some(path)) => Self::load_json(path),
            (None, None) => Ok(Self::default()),
            (Some(_), Some(_)) => anyhow::bail!(
                "--threshold and --policy are mutually exclusive; use --policy \
                 for per-metric overrides"
            ),
        }
    }

    /// Resolve the relative threshold (as a fraction, e.g. `0.10`
    /// for 10%) for `metric_name` with `default_rel` as the
    /// registry-level fallback. Handles the percent→fraction
    /// conversion so `compare_rows_by` does not need to re-derive
    /// `p / 100.0` at every call site.
    ///
    /// Precedence: per-metric override, then `default_percent`, then
    /// `default_rel`. Only the two override branches divide by 100;
    /// `default_rel` is already a fraction and is returned verbatim.
    pub fn rel_threshold(&self, metric_name: &str, default_rel: f64) -> f64 {
        if let Some(p) = self.per_metric_percent.get(metric_name) {
            p / 100.0
        } else if let Some(p) = self.default_percent {
            p / 100.0
        } else {
            default_rel
        }
    }
}

/// Metric-by-metric comparison of two row sets, joined on a
/// [`PairingKey`] built from `pairing_dims`.
///
/// Pure function: no I/O, no globals. An A-side row and a B-side row
/// pair iff their [`PairingKey`]s (scenario plus the value of every
/// dimension in `pairing_dims`) are equal. Slicing dimensions are
/// left OUT of `pairing_dims`, so rows that differ only on a slicing
/// dimension still pair.
///
/// `filter`, when `Some(s)`, keeps a row only if `s` is a substring
/// of the joined `"scenario topology scheduler work_type"` string.
/// The scheduler is therefore searchable through the filter even
/// though it is part of the pairing key only when
/// `Dimension::Scheduler` is listed in `pairing_dims`; the same
/// scenario+topology+work_type pair can thus be compared across
/// scheduler binaries as long as the filter does not constrain it.
///
/// Row-pair accounting:
/// - A filtered-in B-side row with no A-side match bumps `new_in_b`.
/// - A filtered-in A-side row with no B-side match bumps
///   `removed_from_a` (separate pass over `rows_a`).
/// - A matched pair where either side is a fail, inconclusive, or
///   skip row bumps `excluded_pairs` and is kept out of the
///   regression math: such rows' metrics describe the failure mode
///   (short run, stalled workload, missing samples), not the
///   scheduler's behavior.
///
/// Rows rejected by the filter are not iterated for any counter.
/// The lookup structures themselves are built from the unfiltered
/// inputs (see the inline notes).
///
/// `policy` supplies the relative thresholds — per-metric override →
/// `default_percent` → registry `default_rel`, see
/// [`ComparisonPolicy`]. The absolute gate always uses the metric's
/// `default_abs`. A delta has to clear both gates to be significant.
pub(crate) fn compare_rows_by(
    rows_a: &[GauntletRow],
    rows_b: &[GauntletRow],
    pairing_dims: &[Dimension],
    filter: Option<&str>,
    policy: &ComparisonPolicy,
) -> CompareReport {
    // Substring filter shared by both passes. The haystack joins all
    // identity-bearing fields — slicing-dim values included — so an
    // operator can narrow by any visible field via `-E`. With no
    // filter set, no string is built at all.
    let passes_filter = |row: &GauntletRow| -> bool {
        let Some(needle) = filter else {
            return true;
        };
        let haystack = format!(
            "{} {} {} {}",
            row.scenario, row.topology, row.scheduler, row.work_type,
        );
        haystack.contains(needle)
    };

    // A pair is unusable for regression math when either row is a
    // skip (nothing ran), inconclusive (ran, but without enough
    // signal to evaluate), or a failure (telemetry dominated by the
    // failure mode rather than the scheduler).
    let is_unusable = |row: &GauntletRow| -> bool {
        row.is_fail() || row.is_inconclusive() || row.is_skip()
    };

    // Index the A side by pairing key so each B-side lookup is O(1).
    // `or_insert` keeps the first row seen for a key, matching the
    // first-match semantics of a linear `find`: duplicates are rare
    // (the averaging path yields unique keys and the `--no-average`
    // path bails earlier via `check_no_duplicate_pairing_keys`), and
    // when they do occur the earlier-iterated row wins.
    let mut a_index: HashMap<PairingKey, &GauntletRow> = HashMap::with_capacity(rows_a.len());
    for candidate in rows_a {
        a_index
            .entry(PairingKey::from_row(candidate, pairing_dims))
            .or_insert(candidate);
    }

    // Both per-metric lookups depend on the metric alone, so they are
    // computed once here, in `METRICS` order, instead of once per
    // (row, metric) combination.
    let rel_thresholds: Vec<f64> = METRICS
        .iter()
        .map(|m| policy.rel_threshold(m.name, m.default_rel))
        .collect();
    let suppressed: Vec<bool> = METRICS
        .iter()
        .map(|m| is_render_suppressed_component(m.name))
        .collect();

    let mut report = CompareReport::default();

    // First pass: walk the B side, pairing against the A index.
    for row_b in rows_b {
        if !passes_filter(row_b) {
            continue;
        }
        let key_b = PairingKey::from_row(row_b, pairing_dims);
        let row_a = match a_index.get(&key_b) {
            Some(&matched) => matched,
            None => {
                report.new_in_b += 1;
                continue;
            }
        };

        if is_unusable(row_a) || is_unusable(row_b) {
            report.excluded_pairs += 1;
            continue;
        }

        push_scalar_findings(
            &mut report,
            row_a,
            row_b,
            &key_b,
            &rel_thresholds,
            &suppressed,
        );
        push_phase_deltas(&mut report, row_a, row_b, &key_b, policy);
    }

    // Second pass: A-side rows with no B-side counterpart. The key
    // set is built from the UNFILTERED B side on purpose: the filter
    // looks at slicing-dim values too, so two rows sharing a pairing
    // key can disagree on filter membership, and a filtered-out B
    // row must still stop its same-key A row from being reported as
    // removed.
    let b_keys: HashSet<PairingKey> = rows_b
        .iter()
        .map(|row| PairingKey::from_row(row, pairing_dims))
        .collect();
    for row_a in rows_a {
        if !passes_filter(row_a) {
            continue;
        }
        let key_a = PairingKey::from_row(row_a, pairing_dims);
        if !b_keys.contains(&key_a) {
            report.removed_from_a += 1;
        }
    }

    report
}

/// Record the scalar per-metric outcome of one matched
/// `(row_a, row_b)` pair in `report`.
///
/// `rel_thresholds` and `suppressed` are parallel to `METRICS`:
/// entry `i` is the hoisted relative threshold / render-suppression
/// flag of the i-th metric, both built once by [`compare_rows_by`]
/// in `METRICS` order.
///
/// Per metric:
/// - suppressed metrics, and metrics that read as (near) zero on
///   both sides, are skipped without touching any counter;
/// - a delta that fails the absolute gate (`default_abs`) or the
///   relative gate bumps `report.unchanged`;
/// - anything else is significant: it bumps `report.regressions` or
///   `report.improvements` according to the metric's polarity and
///   pushes a [`Finding`] keyed by `key_b`.
fn push_scalar_findings(
    report: &mut CompareReport,
    row_a: &GauntletRow,
    row_b: &GauntletRow,
    key_b: &PairingKey,
    rel_thresholds: &[f64],
    suppressed: &[bool],
) {
    for (idx, metric) in METRICS.iter().enumerate() {
        // Rate components are internal plumbing: they stay in storage
        // for the cross-run re-pool but never show up in compare
        // output.
        if suppressed[idx] {
            continue;
        }

        // A metric the row does not carry reads as zero.
        let val_a = metric.read(row_a).unwrap_or(0.0);
        let val_b = metric.read(row_b).unwrap_or(0.0);
        let both_zero = val_a.abs() < f64::EPSILON && val_b.abs() < f64::EPSILON;
        if both_zero {
            continue;
        }

        let delta = val_b - val_a;
        // With a (near) zero baseline there is no meaningful ratio;
        // the relative delta is pinned to zero in that case.
        let rel_delta = if val_a.abs() > f64::EPSILON {
            (delta / val_a).abs()
        } else {
            0.0
        };

        // Dual gate: failing either one means "unchanged".
        let below_gate = delta.abs() < metric.default_abs || rel_delta < rel_thresholds[idx];
        if below_gate {
            report.unchanged += 1;
            continue;
        }

        // Direction of "worse" depends on the metric's polarity.
        let is_regression = if metric.higher_is_worse() {
            delta > 0.0
        } else {
            delta < 0.0
        };
        let counter = if is_regression {
            &mut report.regressions
        } else {
            &mut report.improvements
        };
        *counter += 1;

        // Identity strings are taken from the B-side row.
        let finding = Finding {
            pairing_key: key_b.clone(),
            scenario: row_b.scenario.clone(),
            topology: row_b.topology.clone(),
            work_type: row_b.work_type.clone(),
            metric,
            val_a,
            val_b,
            delta,
            is_regression,
        };
        report.findings.push(finding);
    }
}

/// Record the per-phase comparison for one matched `(row_a, row_b)`
/// pair in `report`. Runs on every paired row pair alongside the
/// scalar findings (see [`push_scalar_findings`]).
///
/// Both rows' `phases` are indexed by `step_index` and the union of
/// the two index sets is walked in ascending order:
///
/// - a step present on both sides yields one [`PhaseDeltaRow`] per
///   metric name that BOTH sides carry and that [`metric_def`]
///   resolves; render-suppressed components are skipped;
/// - a step present on one side only yields one
///   [`UnpairedPhaseRow`] tagged with that side.
///
/// When either row has an empty `phases` the function returns
/// without touching `report`. Single-phase scenarios therefore skip
/// the per-phase view entirely instead of flooding the unpaired
/// section (an empty A side against a populated B side would
/// otherwise orphan every B phase). This matches the "Empty when
/// scenario produced no periodic captures" semantic on
/// `ScenarioStats.phases`.
fn push_phase_deltas(
    report: &mut CompareReport,
    row_a: &GauntletRow,
    row_b: &GauntletRow,
    key_b: &PairingKey,
    policy: &ComparisonPolicy,
) {
    use std::collections::{BTreeMap, BTreeSet};

    // No phase data on one side: nothing to pair, nothing to orphan.
    if row_a.phases.is_empty() || row_b.phases.is_empty() {
        return;
    }

    let a_by_step: BTreeMap<u16, &crate::assert::PhaseBucket> = row_a
        .phases
        .iter()
        .map(|bucket| (bucket.step_index, bucket))
        .collect();
    let b_by_step: BTreeMap<u16, &crate::assert::PhaseBucket> = row_b
        .phases
        .iter()
        .map(|bucket| (bucket.step_index, bucket))
        .collect();
    let all_steps: BTreeSet<u16> = a_by_step.keys().chain(b_by_step.keys()).copied().collect();

    for step_index in all_steps {
        // Matched steps are handled inside the first arm; the match
        // only falls through with a value for one-sided steps.
        let (side, orphan) = match (a_by_step.get(&step_index), b_by_step.get(&step_index)) {
            (Some(pa), Some(pb)) => {
                for (metric_name, &val_a) in &pa.metrics {
                    // Rate components stay in `PhaseBucket.metrics`
                    // for the re-pool; only their rendered delta is
                    // dropped from the per-phase view.
                    if is_render_suppressed_component(metric_name) {
                        continue;
                    }
                    // A name present on one side only gets no
                    // synthetic delta.
                    let Some(&val_b) = pb.metrics.get(metric_name) else {
                        continue;
                    };
                    let Some(def) = metric_def(metric_name) else {
                        continue;
                    };

                    let delta = val_b - val_a;
                    let rel_thresh = policy.rel_threshold(def.name, def.default_rel);
                    // A (near-)zero baseline has no meaningful ratio;
                    // it reports 0.0, which sits under any positive
                    // relative threshold.
                    let rel_delta = if val_a.abs() > f64::EPSILON {
                        (delta / val_a).abs()
                    } else {
                        0.0
                    };

                    // Same dual gate as the scalar per-metric pass: a
                    // delta under the absolute floor OR under the
                    // relative threshold is never a regression, even
                    // when its direction matches the metric's
                    // polarity. The row is still pushed so
                    // programmatic consumers of `phase_deltas` see
                    // every paired comparison; the gate filters the
                    // classification only.
                    let below_dual_gate = delta.abs() < def.default_abs || rel_delta < rel_thresh;
                    let is_regression = !below_dual_gate
                        && if def.higher_is_worse() {
                            delta > 0.0
                        } else {
                            delta < 0.0
                        };

                    report.phase_deltas.push(PhaseDeltaRow {
                        pairing_key: key_b.clone(),
                        step_index,
                        label: pa.label.clone(),
                        metric: def,
                        a: val_a,
                        b: val_b,
                        delta,
                        is_regression,
                    });
                }
                continue;
            }
            (Some(only_a), None) => (ComparePartition::A, only_a),
            (None, Some(only_b)) => (ComparePartition::B, only_b),
            (None, None) => {
                unreachable!("step_index taken from union of a_by_step / b_by_step keys")
            }
        };

        report.unpaired_phases.push(UnpairedPhaseRow {
            side,
            pairing_key: key_b.clone(),
            step_index,
            label: orphan.label.clone(),
            metrics: metrics_without_suppressed(&orphan.metrics),
        });
    }
}

/// Print the dirty-build warning for a comparison to stderr, if
/// there is one.
///
/// The text comes from [`render_dirty_warning`], which scans
/// `commit` (project HEAD) and `kernel_commit` (kernel source tree
/// HEAD) on both sides' rows and lists each distinct `-dirty` value
/// per dimension. At most one block is printed; nothing is printed
/// when no row carries a `-dirty` suffix on either dimension. The
/// warning tells the operator that the comparison includes builds
/// whose source tree may not match the recorded HEAD.
///
/// Dirty runs reuse the same sidecar filename as their clean HEAD
/// (the variant hash excludes `commit` / `kernel_commit` per
/// `crate::test_support::sidecar`), so re-running the same test
/// from a dirty tree overwrites the previous record. Surfacing that
/// lets an operator decide whether to commit the working tree
/// before re-running for a reproducible comparison.
///
/// Collection and rendering live in [`render_dirty_warning`] so
/// unit tests can pin the rendered text without trapping `stderr`.
fn warn_on_dirty_builds(rows_a: &[GauntletRow], rows_b: &[GauntletRow]) {
    let Some(warning) = render_dirty_warning(rows_a, rows_b) else {
        return;
    };
    // The rendered block already ends in a newline.
    eprint!("{warning}");
}

/// Print the CPU-budget hazard warning for a comparison to stderr,
/// if there is one. The text is built by
/// [`render_overcommit_warning`]; this function only `eprint!`s it,
/// mirroring [`warn_on_dirty_builds`].
fn warn_on_overcommit(rows_a: &[GauntletRow], rows_b: &[GauntletRow], pairing_dims: &[Dimension]) {
    let Some(warning) = render_overcommit_warning(rows_a, rows_b, pairing_dims) else {
        return;
    };
    eprint!("{warning}");
}

/// Render the CPU-budget hazard warning for the filtered compare
/// sides. Returns `None` when neither hazard is present.
///
/// Two independent hazards are detected, both read from
/// [`GauntletRow::cpu_budget`] / [`GauntletRow::vcpus`] — the
/// consumers that make those fields load-bearing on the compare path:
///
/// - OVERCOMMIT (`cpu_budget < vcpus`): the host time-sliced that
///   run's vCPU threads, so its wake-latency / off-CPU / run-delay
///   timing metrics are host-contention artifacts, not scheduler
///   signal (see [`crate::vmm::host_topology::overcommit_warning`]).
///   Flagged whenever it appears on either side: comparing raw
///   timing from an overcommitted run is the silent-wrong-answer the
///   budget stamp exists to surface.
/// - MIXED BUDGET: one pairing group on a side holds more than one
///   distinct budget. [`group_and_average_by`] folds rows that share
///   a full [`PairingKey`], so this is exactly the set `--average`
///   would average together across budgets. It can only arise when
///   [`Dimension::CpuBudget`] is NOT a pairing dim (the operator
///   sliced on cpu-budget, dropping it from the key); when it IS a
///   pairing dim each budget keys its own group and is never folded.
///   Detection is per pairing group, NOT side-wide: rows with
///   different keys never average, so a side that merely spans
///   budgets across distinct groups is not flagged.
///
/// Skip rows (budget 0 -> `None` in [`sidecar_to_row`]) carry no
/// budget identity and are ignored by both checks.
///
/// The headline names the overcommit hazard whenever any side is
/// overcommitted, and the mixed-budget hazard otherwise. One detail
/// line per side and per hazard follows, side A before side B.
/// Rendering is split from emission so a unit test can pin the text
/// and the `None`-when-clean polarity without trapping stderr,
/// mirroring [`render_dirty_warning`].
pub(crate) fn render_overcommit_warning(
    rows_a: &[GauntletRow],
    rows_b: &[GauntletRow],
    pairing_dims: &[Dimension],
) -> Option<String> {
    use std::collections::{BTreeSet, HashMap};
    use std::fmt::Write;

    // Side-wide: every distinct (budget, vcpus) pair with
    // budget < vcpus. Rows missing either value are skipped.
    let overcommitted = |rows: &[GauntletRow]| -> BTreeSet<(u32, u32)> {
        rows.iter()
            .filter_map(|row| row.cpu_budget.zip(row.vcpus))
            .filter(|&(budget, vcpus)| budget < vcpus)
            .collect()
    };

    // Per pairing group: the union of budgets over every group that
    // holds more than one distinct budget. Always empty when
    // CpuBudget is itself a pairing dim.
    let budget_is_pairing_dim = pairing_dims.contains(&Dimension::CpuBudget);
    let mixed_folded = |rows: &[GauntletRow]| -> BTreeSet<u32> {
        if budget_is_pairing_dim {
            return BTreeSet::new();
        }
        let mut budgets_by_key: HashMap<PairingKey, BTreeSet<u32>> = HashMap::new();
        for row in rows {
            let Some(budget) = row.cpu_budget else {
                continue;
            };
            budgets_by_key
                .entry(PairingKey::from_row(row, pairing_dims))
                .or_default()
                .insert(budget);
        }
        budgets_by_key
            .into_values()
            .filter(|budgets| budgets.len() > 1)
            .flatten()
            .collect()
    };

    let over_a = overcommitted(rows_a);
    let over_b = overcommitted(rows_b);
    let mixed_a = mixed_folded(rows_a);
    let mixed_b = mixed_folded(rows_b);

    let any_overcommit = !(over_a.is_empty() && over_b.is_empty());
    let any_mixed = !(mixed_a.is_empty() && mixed_b.is_empty());
    if !any_overcommit && !any_mixed {
        return None;
    }

    let headline = if any_overcommit {
        // Host time-slicing actually occurred -> raw timing is confounded.
        "ktstr: WARNING: CPU-budget hazard in this comparison — a run was \
         host-overcommitted, so its guest-scheduler timing metrics \
         (wake-latency / off-CPU / run-delay) are host-contention-confounded. \
         Compare the overcommit-invariant worst_iterations_per_cpu_sec metric \
         (`stats compare --metric worst_iterations_per_cpu_sec`) instead of raw \
         timing."
    } else {
        // Mixed budgets with NO overcommit: no host contention, the hazard is
        // collapsing two different measurement conditions into one number.
        "ktstr: WARNING: CPU-budget hazard in this comparison — runs of \
         different CPU budgets share a pairing group, mixing two measurement \
         conditions. Slice with --cpu-budget, or compare the budget-invariant \
         worst_iterations_per_cpu_sec metric."
    };

    let mut out = String::new();
    out.push_str(headline);
    out.push('\n');

    for (label, over, mixed) in [("A", &over_a, &mixed_a), ("B", &over_b, &mixed_b)] {
        if !over.is_empty() {
            let pairs: Vec<String> = over
                .iter()
                .map(|(budget, vcpus)| format!("{budget}/{vcpus}"))
                .collect();
            // Writing into a `String` cannot fail.
            let _ = writeln!(
                out,
                "  side {label}: host-overcommitted run(s) [budget/vcpus]: {list}",
                list = pairs.join(", "),
            );
        }
        if !mixed.is_empty() {
            let budgets: Vec<String> = mixed.iter().map(|budget| budget.to_string()).collect();
            let _ = writeln!(
                out,
                "  side {label}: CPU budgets [{list}] share a pairing group — \
                 --average folds them into one mean (--no-average rejects them as \
                 duplicate keys); slice with --cpu-budget so cross-budget runs are \
                 not compared under one key",
                list = budgets.join(", "),
            );
        }
    }
    Some(out)
}

/// Build the dirty-builds warning block from row data.
///
/// Returns `None` when no row on either side carries a `-dirty`
/// suffix on either `commit` or `kernel_commit`. Otherwise returns
/// the full multi-line warning text — the body emitted to stderr by
/// [`warn_on_dirty_builds`] — terminated with a trailing newline so
/// the caller can `eprint!` it without further formatting.
///
/// Dimensions render in fixed order ("kernel source" before
/// "project") so the same dirty hashes always produce byte-identical
/// output across runs; values within each dimension are
/// `BTreeSet`-deduped so multiple rows sharing one dirty hash list
/// it once, and multiple distinct dirty hashes on one dimension list
/// in lex order.
pub(crate) fn render_dirty_warning(
    rows_a: &[GauntletRow],
    rows_b: &[GauntletRow],
) -> Option<String> {
    use std::collections::BTreeSet;
    use std::fmt::Write;

    let mut dirty_kernel: BTreeSet<&str> = BTreeSet::new();
    let mut dirty_project: BTreeSet<&str> = BTreeSet::new();
    for row in rows_a.iter().chain(rows_b.iter()) {
        // `ends_with` matches the producer contract: `detect_kernel_commit`
        // and `detect_project_commit` (sidecar.rs:851, :983) append
        // `-dirty` as a SUFFIX to the 7-char hex via
        // `format!("{short_hash}-dirty")`, so the dirty marker is
        // always tail-positioned. `contains` would also match a
        // hex hash that legitimately contains the substring `-dirty`
        // somewhere in the middle (impossible for the current
        // 7-char hex prefix, but a future commit-ish format change
        // would let a non-dirty value flag itself dirty under
        // `contains`).
        if let Some(c) = row.kernel_commit.as_deref()
            && c.ends_with("-dirty")
        {
            dirty_kernel.insert(c);
        }
        if let Some(c) = row.commit.as_deref()
            && c.ends_with("-dirty")
        {
            dirty_project.insert(c);
        }
    }

    if dirty_kernel.is_empty() && dirty_project.is_empty() {
        return None;
    }

    let mut out = String::new();
    writeln!(out, "warning: comparison includes dirty builds:").unwrap();
    for v in &dirty_kernel {
        writeln!(
            out,
            "  - kernel source: {v} (working tree may have changed since this run)"
        )
        .unwrap();
    }
    for v in &dirty_project {
        writeln!(
            out,
            "  - project: {v} (working tree may have changed since this run)"
        )
        .unwrap();
    }
    writeln!(
        out,
        "  Dirty runs overwrite previous results with the same HEAD."
    )
    .unwrap();
    writeln!(out, "  Commit changes for reproducible-ish comparisons.").unwrap();
    Some(out)
}

/// Render the actionable bail message emitted when one side's filter
/// matches zero sidecars in the pool.
///
/// The message always opens with the generic "check filters / run
/// `cargo ktstr stats list`" redirect. The function then inspects
/// WHY the filter may have matched nothing and appends up to four
/// kinds of `hint:` lines, in this order:
///
/// 1. **Dirty-form hint**: when the user passed
///    `--project-commit X` (or per-side / kernel-commit equivalent)
///    and the pool contains a row whose `commit` (or `kernel_commit`)
///    is `X-dirty`, append "did you mean `--project-commit X-dirty`?".
///    A clean-vs-dirty mismatch is the single most common cause of a
///    false-zero on the commit dims — `detect_project_commit` /
///    `detect_kernel_commit` append `-dirty` whenever HEAD-vs-index
///    or index-vs-worktree changes are observed, so an operator who
///    expected `abcdef1` but the recorded value is `abcdef1-dirty`
///    sees no rows match without realizing why. One hint is emitted
///    per matching filter value, project commits before kernel
///    commits.
///
/// 2. **Unknown run-source hint**: when the user passed
///    `--run-source X` (or per-side equivalent) and `X` is NOT
///    among the distinct `run_source` values present in the pool,
///    append a hint listing the actual values seen. The schema is
///    deliberately extensible (`"benchmark"` and other future tags
///    are valid), so this is a hint rather than a hard validator —
///    but a typo (`--run-source loca` for `local`, or `--run-source CI`
///    for `ci` since the values are case-sensitive) is the most
///    common cause of a false-zero on the source dim, and listing
///    the distinct values present is more actionable than asking
///    the operator to consult the schema doc.
///
/// 3. **Unknown cpu-budget hint**: the numeric counterpart of the
///    run-source hint. Fires when a `--cpu-budget` value is not among
///    the budgets present in the pool. Pool budgets are compared in
///    their canonical decimal rendering, so a non-canonical input
///    such as `032` is reported as not found. Rows without a budget
///    (skips) are excluded from the present set.
///
/// 4. **list-values redirect for commit dims**: when the user
///    populated any commit dimension (`project_commits` /
///    `kernel_commits`), suggest `cargo ktstr stats list-values`
///    specifically — that command emits the exact distinct values
///    present per dimension, which is more actionable than the
///    generic `stats list` which only shows top-level run keys.
///
/// `side` is `"A"` or `"B"` for diagnostic context. `filter` is the
/// per-side `RowFilter`. `rows` is the sidecar-derived row vec
/// (post-`sidecar_to_row` mapping, pre-filtering). `pool_len` is
/// the raw pool count for the "(N pooled)" diagnostic context.
pub(crate) fn zero_match_diagnostic(
    side: &str,
    filter: &RowFilter,
    rows: &[GauntletRow],
    pool_len: usize,
) -> String {
    use std::collections::BTreeSet;

    // Backtick-quote every item and join the results with ", ".
    fn backticked<T: std::fmt::Display>(items: impl IntoIterator<Item = T>) -> String {
        items
            .into_iter()
            .map(|item| format!("`{item}`"))
            .collect::<Vec<_>>()
            .join(", ")
    }

    // Append one dirty-form hint for each wanted value whose
    // `-dirty` form is recorded on some pool row. `flag` is the CLI
    // flag named in the hint; `recorded` reads the matching commit
    // field off a row.
    fn push_dirty_hints<'r, W: std::fmt::Display>(
        msg: &mut String,
        flag: &str,
        wanted: impl IntoIterator<Item = W>,
        rows: &'r [GauntletRow],
        recorded: impl Fn(&'r GauntletRow) -> Option<&'r str>,
    ) {
        for want in wanted {
            let dirty = format!("{want}-dirty");
            if rows.iter().any(|r| recorded(r) == Some(dirty.as_str())) {
                msg.push_str(&format!(
                    "\nhint: no rows match `{flag} {want}` but `{dirty}` exists in the pool — \
                     did you mean `{flag} {dirty}`?"
                ));
            }
        }
    }

    let mut msg = format!(
        "stats compare: {side} side filter matched 0 sidecars in \
         pool ({pool_len} pooled). Check the per-side filters or \
         confirm the runs exist with `cargo ktstr stats list`."
    );

    // Dirty-form hints, project dimension first, then kernel.
    push_dirty_hints(
        &mut msg,
        "--project-commit",
        &filter.project_commits,
        rows,
        |r| r.commit.as_deref(),
    );
    push_dirty_hints(
        &mut msg,
        "--kernel-commit",
        &filter.kernel_commits,
        rows,
        |r| r.kernel_commit.as_deref(),
    );

    // Unknown-run-source hint. Typos and wrong casing are the usual
    // cause. The schema is intentionally extensible, so this stays a
    // hint rather than a validator: the bail still fires and the
    // producer side remains free to emit any tag.
    if !filter.run_sources.is_empty() {
        let pool_run_sources: BTreeSet<&str> = rows
            .iter()
            .filter_map(|r| r.run_source.as_deref())
            .collect();
        let unknowns: Vec<&str> = filter
            .run_sources
            .iter()
            .map(String::as_str)
            .filter(|want| !pool_run_sources.contains(*want))
            .collect();
        if !unknowns.is_empty() {
            let unknown_list = backticked(&unknowns);
            // A `BTreeSet` iterates in ascending order, so the list
            // is already sorted.
            let present_list = if pool_run_sources.is_empty() {
                "(none — every row has `run_source: null`)".to_string()
            } else {
                backticked(&pool_run_sources)
            };
            msg.push_str(&format!(
                "\nhint: --run-source {unknown_list} not found in pool; \
                 distinct values present: {present_list}. Values are \
                 case-sensitive (`ci` ≠ `CI`)."
            ));
        }
    }

    // Unknown-cpu-budget hint. Filter values are compared as strings
    // against the canonical decimal form of each pool budget, so a
    // non-canonical input like `032` lists as not found. Rows with
    // `cpu_budget == None` contribute nothing.
    if !filter.cpu_budgets.is_empty() {
        let pool_budgets: BTreeSet<u32> = rows.iter().filter_map(|r| r.cpu_budget).collect();
        let present_strs: BTreeSet<String> = pool_budgets.iter().map(|b| b.to_string()).collect();
        let unknowns: Vec<&str> = filter
            .cpu_budgets
            .iter()
            .map(String::as_str)
            .filter(|want| !present_strs.contains(*want))
            .collect();
        if !unknowns.is_empty() {
            let unknown_list = backticked(&unknowns);
            // Listed in numeric order (from the `u32` set), not in
            // the lexical order of `present_strs`.
            let present_list = if pool_budgets.is_empty() {
                "(none — every row is a skip with no recorded budget)".to_string()
            } else {
                backticked(&pool_budgets)
            };
            msg.push_str(&format!(
                "\nhint: --cpu-budget {unknown_list} not found in pool; \
                 distinct budgets present: {present_list}."
            ));
        }
    }

    // list-values redirect: only when the operator narrowed on a
    // commit dimension. Without a commit filter the `stats list`
    // redirect at the top of the message is kept as the only
    // pointer — `list-values` would emit a long per-dimension dump
    // that isn't more actionable for a kernel/scheduler/topology
    // miss.
    let touched_commit_dim =
        !filter.project_commits.is_empty() || !filter.kernel_commits.is_empty();
    if touched_commit_dim {
        msg.push_str(
            "\nhint: run `cargo ktstr stats list-values` to see every \
             distinct commit value present in the pool — the specific \
             value the filter expected may not have a sidecar yet, or \
             may differ from what was recorded by \
             `detect_project_commit` / `detect_kernel_commit`.",
        );
    }
    msg
}

/// Resolved inputs for the `stats compare --runs` render phase.
///
/// Produced by [`prepare_partitioned_comparison`] — the validation,
/// pooling, partitioning, and averaging steps of [`compare_partitions`]
/// extracted into an owned bundle so the render half reads from one
/// destructure rather than a long flat prelude. Every field carries
/// the exact value the prior in-function prelude bound; the render
/// half computes labels and headers from these, then runs the four
/// print helpers.
///
/// A value of this type only exists when every gate in
/// [`prepare_partitioned_comparison`] passed, which is what backs
/// the "guaranteed non-empty" notes on the fields below.
struct PartitionedComparison {
    /// Dimensions on which `filter_a` differs from `filter_b` — the
    /// A/B contrast axes, as returned by `derive_slicing_dims`.
    /// Guaranteed non-empty (the empty case bails).
    slicing_dims: Vec<Dimension>,
    /// Dimensions NOT in `slicing_dims`, in canonical
    /// [`Dimension::ALL`] order — the join axes for pairing. Derived
    /// from `slicing_dims` via `Dimension::pairing_dims`.
    pairing_dims: Vec<Dimension>,
    /// Every sidecar under the runs root (or `--dir` override). With
    /// `--dir`, the archive source override has already been applied.
    /// Guaranteed non-empty (the empty pool bails).
    pool: Vec<crate::test_support::SidecarResult>,
    /// `pool` converted to rows, same length and iteration order.
    /// This is the unfiltered set: neither side's filter is applied.
    rows: Vec<GauntletRow>,
    /// A-side rows fed to [`compare_rows_by`]: averaged mean rows
    /// when `no_average` is false, the raw filtered rows otherwise.
    /// In the raw case the rows have already passed the
    /// duplicate-pairing-key check.
    rows_a_for_compare: Vec<GauntletRow>,
    /// B-side counterpart of `rows_a_for_compare`.
    rows_b_for_compare: Vec<GauntletRow>,
    /// A-side averaged groups when `no_average` is false; `None`
    /// under `--no-average`. Drives the per-group pass-count block.
    /// When `Some`, `rows_a_for_compare` holds a clone of each
    /// group's `row`, in the same order.
    avg_a: Option<Vec<AveragedGroup>>,
    /// B-side counterpart of `avg_a`.
    avg_b: Option<Vec<AveragedGroup>>,
    /// Post-typed-filter A-side contributor row count (pre-aggregation)
    /// — the "averaged across N runs" header numerator. Recorded
    /// before any averaging, so it is set in both modes.
    pre_agg_a: usize,
    /// B-side counterpart of `pre_agg_a`.
    pre_agg_b: usize,
}

/// Validate, pool, partition, and average the inputs for
/// [`compare_partitions`], returning the owned
/// [`PartitionedComparison`] bundle the render half destructures.
///
/// Bails with the same diagnostics, in the same order, as the
/// original in-function prelude:
///
/// 1. identical-rows gate (no slicing dimension),
/// 2. empty-pool gate,
/// 3. zero-match gate for side A, then side B,
/// 4. under `--no-average` only, the duplicate-pairing-key gate for
///    side A, then side B.
///
/// The multi-dim slicing warning and the dirty-build / overcommit
/// warnings are written to stderr from here so they precede the
/// render half's header lines, preserving output order.
fn prepare_partitioned_comparison(
    filter_a: &RowFilter,
    filter_b: &RowFilter,
    dir: Option<&std::path::Path>,
    no_average: bool,
) -> anyhow::Result<PartitionedComparison> {
    // Gate 1: the two filters must differ on at least one dimension.
    // With no difference the operator has expressed no contrast and
    // there is nothing to compare, so say which knob to turn.
    let slicing_dims = derive_slicing_dims(filter_a, filter_b);
    if slicing_dims.is_empty() {
        anyhow::bail!(
            "stats compare: A and B select identical rows. \
             Specify at least one per-side filter (e.g. \
             --a-kernel 6.14 --b-kernel 6.15) to define what \
             dimension separates the two sides."
        );
    }

    // Gate 2 is advisory. Slicing on several dimensions is still
    // well-defined — rows join on the remaining pairing dims and the
    // slicing-dim cross-product collapses into one A/B contrast —
    // but the result is harder to interpret, so list the dimensions
    // and let the operator confirm the cohort shape.
    let n = slicing_dims.len();
    if n > 1 {
        let dims = slicing_dims
            .iter()
            .map(|d| d.name())
            .collect::<Vec<&str>>()
            .join(", ");
        eprintln!(
            "warning: stats compare: slicing on {n} dimensions [{dims}]; \
             results compress multiple axes into a single A/B contrast."
        );
    }

    // Pairing dims are every dimension outside the slicing set, in
    // canonical [`Dimension::ALL`] order. The pairing key therefore
    // always has the shape `(scenario, *pairing_dims)`.
    let pairing_dims = Dimension::pairing_dims(&slicing_dims);

    // Pool every sidecar under the runs root, or under the
    // operator's `--dir` override. The full scan is acceptable for
    // the single-comparison-per-session workflow.
    //
    // Sidecars loaded through `--dir` get their `source` rewritten
    // to `"archive"` before row conversion. The producer-side
    // `"local"` / `"ci"` distinction only means something on the
    // host that wrote the files; once copied elsewhere the useful
    // classification is "came from elsewhere", which is what
    // `--run-source archive` queries. Reading from the default root
    // (no `--dir`) leaves the recorded values untouched.
    let root = match dir {
        Some(archive_dir) => archive_dir.to_path_buf(),
        None => crate::test_support::runs_root(),
    };
    let mut pool = crate::test_support::collect_pool(&root);
    if dir.is_some() {
        crate::test_support::apply_archive_source_override(&mut pool);
    }
    if pool.is_empty() {
        anyhow::bail!(
            "stats compare: no sidecar data found under {}. \
             Run `cargo ktstr test` to generate runs, or pass \
             --dir to point at an archived sidecar tree.",
            root.display(),
        );
    }
    let rows: Vec<GauntletRow> = pool.iter().map(sidecar_to_row).collect();

    // Partition: each side's filter runs against the same pool, so
    // a row lands on both sides whenever it satisfies both filters.
    // On a slicing dimension that is normally not the case: a row
    // admitted by `filter_a`'s value is in B only if `filter_b`
    // lists that value too.
    let rows_a = apply_row_filters(&rows, filter_a);
    let rows_b = apply_row_filters(&rows, filter_b);
    for (side, matched, filter) in [("A", &rows_a, filter_a), ("B", &rows_b, filter_b)] {
        if matched.is_empty() {
            anyhow::bail!(
                "{}",
                zero_match_diagnostic(side, filter, &rows, pool.len()),
            );
        }
    }

    warn_on_dirty_builds(&rows_a, &rows_b);
    warn_on_overcommit(&rows_a, &rows_b, &pairing_dims);

    // Contributor counts are taken before any averaging.
    let pre_agg_a = rows_a.len();
    let pre_agg_b = rows_b.len();

    let (rows_a_for_compare, rows_b_for_compare, avg_a, avg_b) = if no_average {
        // `--no-average` keeps every sidecar distinct. Duplicate
        // pairing keys are still rejected, with an error naming the
        // key, because compare_rows can't pair an A-row against
        // several B-rows sharing one key and would otherwise
        // silently latch onto the first match.
        check_no_duplicate_pairing_keys(&rows_a, &pairing_dims, "A")?;
        check_no_duplicate_pairing_keys(&rows_b, &pairing_dims, "B")?;
        (rows_a, rows_b, None, None)
    } else {
        // Default: fold same-pairing-key rows on each side into one
        // mean row. The groups are kept next to the mean rows.
        let groups_a = group_and_average_by(&rows_a, &pairing_dims);
        let groups_b = group_and_average_by(&rows_b, &pairing_dims);
        let means_a: Vec<GauntletRow> = groups_a.iter().map(|group| group.row.clone()).collect();
        let means_b: Vec<GauntletRow> = groups_b.iter().map(|group| group.row.clone()).collect();
        (means_a, means_b, Some(groups_a), Some(groups_b))
    };

    Ok(PartitionedComparison {
        slicing_dims,
        pairing_dims,
        pool,
        rows,
        rows_a_for_compare,
        rows_b_for_compare,
        avg_a,
        avg_b,
        pre_agg_a,
        pre_agg_b,
    })
}

/// Run an A/B comparison between two filter-defined partitions of
/// the sidecar pool, print the result to stdout, and return a
/// process-style exit code.
///
/// `filter_a` and `filter_b` are the per-side row filters that
/// define the A/B contrast. The dimensions on which the two
/// filters DIFFER are the SLICING dimensions; the dimensions on
/// which they AGREE (or on which both are unconstrained) are the
/// PAIRING dimensions. Two rows pair across the A/B sides iff
/// their dynamic [`PairingKey`] (scenario plus every pairing-dim
/// value) is equal — differences on the slicing axes ARE the
/// contrast, so the join ignores them and matches on everything
/// else.
///
/// `filter` and `policy` are handed unchanged to
/// [`compare_rows_by`].
///
/// `dir` overrides the default `runs_root()` for pool collection.
/// Pass `Some(path)` to compare archived sidecar trees copied off
/// a CI host; pass `None` to walk `target/ktstr/` (or
/// `CARGO_TARGET_DIR/ktstr/`).
///
/// Validation:
/// - Empty slicing-dim set (every dimension is identical between
///   A and B): bail with "specify at least one --a-X / --b-X to
///   define what to compare". This includes the no-flags-at-all
///   case (both filters are the empty default).
/// - Identical effective filters with at least one slicing dim is
///   a contradiction caught by clap-level construction; the
///   downstream check is "every value in filter_a appears in
///   filter_b on the same dim and vice versa", reported as
///   "A and B select identical rows" — symmetric to the empty
///   case.
/// - More than one slicing dimension prints a warning to stderr
///   ("warning: slicing on N dimensions; results compress
///   multiple axes into a single A/B contrast") but does NOT
///   bail — multi-dim slicing is a deliberate feature for
///   comparing e.g. (kernel A + scheduler A) against (kernel B +
///   scheduler B).
///
/// `no_average = false` (the default) groups every matching
/// sidecar within each side by pairing key and averages the
/// metrics across the group. `no_average = true` keeps each
/// sidecar row distinct; if multiple rows on one side share the
/// same pairing key the function bails with an actionable
/// "duplicate pairing keys" error rather than picking one
/// arbitrarily.
///
/// Output order: the slicing / pairing header lines, the
/// averaging header (only when averaging), the scalar findings
/// table, the per-phase block, the scalar summary, and finally
/// the host-context delta. Under `phase_opts.phases_only` the
/// scalar table, the summary and the host delta are all skipped.
///
/// Returns 0 on no regressions, 1 if regressions detected.
pub fn compare_partitions(
    filter_a: &RowFilter,
    filter_b: &RowFilter,
    filter: Option<&str>,
    policy: &ComparisonPolicy,
    dir: Option<&std::path::Path>,
    no_average: bool,
    phase_opts: &PhaseDisplayOptions,
) -> anyhow::Result<i32> {
    let prepared = prepare_partitioned_comparison(filter_a, filter_b, dir, no_average)?;

    let report = compare_rows_by(
        &prepared.rows_a_for_compare,
        &prepared.rows_b_for_compare,
        &prepared.pairing_dims,
        filter,
        policy,
    );

    // Column labels come from the slicing dims' filter values.
    // Single slicing dim: e.g. "6.14.2" / "6.15.0". Multi: e.g.
    // "6.14.2:scx_rusty" / "6.15.0:scx_alpha". >3 values per dim:
    // collapse to "A"/"B" to keep column headers readable.
    let label_a = render_side_label(filter_a, &prepared.slicing_dims, "A");
    let label_b = render_side_label(filter_b, &prepared.slicing_dims, "B");

    // Two header lines naming the slicing and pairing axes, so the
    // operator can confirm the comparison shape at a glance.
    let slice_names: Vec<&str> = prepared.slicing_dims.iter().map(|d| d.name()).collect();
    let pair_names: Vec<&str> = prepared.pairing_dims.iter().map(|d| d.name()).collect();
    // `scenario` is always part of the key; the separator is only
    // needed when further pairing dims follow it.
    let pair_separator = if pair_names.is_empty() { "" } else { ", " };
    println!("slicing dimensions: {}", slice_names.join(", "));
    println!(
        "pairing on: scenario{}{}",
        pair_separator,
        pair_names.join(", "),
    );

    if !no_average {
        let header =
            format_average_header(prepared.pre_agg_a, prepared.pre_agg_b, &label_a, &label_b);
        println!("{}", header);
    }

    // `--phases-only` asks for the per-phase block and nothing
    // else: the scalar table, the scalar summary and the host
    // delta are all gated on this one flag. The scalar comparison
    // itself already ran; only its rendering is hidden.
    let show_scalar = !phase_opts.phases_only;

    if show_scalar {
        print_scalar_findings_table(&report, &label_a, &label_b);
    }

    print_phase_block(&report, phase_opts, &label_a, &label_b);

    if show_scalar {
        // Regressions / improvements / unchanged, excluded pairs,
        // per-group pass counts, and new_in_b / removed_from_a —
        // all of which describe the scalar findings table.
        print_summary_block(&report, &prepared.avg_a, &prepared.avg_b, &label_a, &label_b);

        // Host-context delta. The helper re-applies the per-side
        // filters to the pool and takes the first sidecar carrying
        // host data on each side, so the delta reflects what
        // actually fed the comparison rather than the whole pool.
        print_host_context_delta(
            &prepared.pool,
            &prepared.rows,
            filter_a,
            filter_b,
            &label_a,
            &label_b,
        );
    }

    Ok(i32::from(report.regressions > 0))
}

/// Print the scalar findings table for `stats compare --runs` to
/// stdout: one row per entry in `report.findings`, with `label_a`
/// and `label_b` as the headers of the two value columns.
///
/// The `--phases-only` gate lives at the call site in
/// [`compare_partitions`]; once invoked this always prints, even
/// when there are no findings (the table then has a header only).
fn print_scalar_findings_table(report: &CompareReport, label_a: &str, label_b: &str) {
    use comfy_table::{Cell, Color};

    let mut table = crate::cli::new_table();
    table.set_header(vec!["TEST", "METRIC", label_a, label_b, "DELTA", "VERDICT"]);

    for finding in &report.findings {
        let verdict = if finding.is_regression {
            Cell::new("REGRESSION").fg(Color::Red)
        } else {
            Cell::new("improvement").fg(Color::Green)
        };
        // The key's segments are joined with `/`, so the label's
        // shape follows the number of pairing dims in play. The
        // "pairing on:" header line printed earlier tells the
        // operator what each segment means.
        let test_label = finding.pairing_key.0.join("/");
        let delta_text = format!("{:+.2}{}", finding.delta, finding.metric.display_unit);
        table.add_row(vec![
            Cell::new(test_label),
            Cell::new(finding.metric.name),
            Cell::new(format!("{:.2}", finding.val_a)),
            Cell::new(format!("{:.2}", finding.val_b)),
            Cell::new(delta_text),
            verdict,
        ]);
    }

    println!("{table}");
}

/// Print the per-phase delta block for `stats compare --runs`.
///
/// Nothing is printed when `--no-phases` was passed, when the
/// report carries neither `phase_deltas` nor `unpaired_phases`
/// (single-phase scenarios with no periodic captures leave both
/// empty), or when the CLI filters below remove every row.
///
/// CLI filters compose by AND on independent axes:
/// - `--phase <N>` keeps only the named step_index
/// - `--steps-only` suppresses BASELINE (step_index == 0)
/// - `--phase-threshold <PCT>` filters paired rows whose
///   `|delta| / max(|a|, 1.0)` is below `PCT / 100.0`
///
/// The filters are applied through
/// `PhaseDisplayOptions::matches_phase` (paired and unpaired rows)
/// and `PhaseDisplayOptions::passes_delta_threshold` (paired rows
/// only). Filtering is a render-time projection: the vecs on
/// `CompareReport` are only read here, never modified, so
/// programmatic consumers still see every row.
///
/// Output, in order: a `phase coverage:` heading, the paired-delta
/// table (if any rows survive), the one-sided-phase table (if any
/// rows survive), and a discovery hint that is shown only when no
/// phase-related flag was set.
fn print_phase_block(
    report: &CompareReport,
    phase_opts: &PhaseDisplayOptions,
    label_a: &str,
    label_b: &str,
) {
    use comfy_table::{Cell, Color};

    // Guard 1: block disabled outright, or nothing was recorded.
    let nothing_recorded = report.phase_deltas.is_empty() && report.unpaired_phases.is_empty();
    if phase_opts.no_phases || nothing_recorded {
        return;
    }

    let mut sorted_deltas: Vec<&PhaseDeltaRow> = report
        .phase_deltas
        .iter()
        .filter(|d| phase_opts.matches_phase(d.step_index) && phase_opts.passes_delta_threshold(d))
        .collect();
    let mut sorted_unpaired: Vec<&UnpairedPhaseRow> = report
        .unpaired_phases
        .iter()
        .filter(|u| phase_opts.matches_phase(u.step_index))
        .collect();

    // Guard 2: the filters removed everything.
    if sorted_deltas.is_empty() && sorted_unpaired.is_empty() {
        return;
    }

    // Counts for the footer hint, taken before the rendering loop
    // consumes the vec.
    let filtered_delta_total = sorted_deltas.len();
    let filtered_delta_regressions = sorted_deltas.iter().filter(|d| d.is_regression).count();

    println!();
    println!("phase coverage:");

    if !sorted_deltas.is_empty() {
        let mut phase_table = crate::cli::new_table();
        phase_table.set_header(vec![
            "PHASE", "TEST", "METRIC", label_a, label_b, "DELTA", "VERDICT",
        ]);
        // Order: step_index ascending, then pairing key, then
        // metric name. Leading with step_index gives the reader
        // the scenario's time order (BASELINE through Step[N]);
        // the remaining keys make the table stable across runs
        // with identical input.
        sorted_deltas.sort_by(|a, b| {
            (a.step_index, &a.pairing_key.0, a.metric.name).cmp(&(
                b.step_index,
                &b.pairing_key.0,
                b.metric.name,
            ))
        });
        for d in sorted_deltas {
            let verdict = if d.is_regression {
                Cell::new("REGRESSION").fg(Color::Red)
            } else {
                Cell::new("improvement").fg(Color::Green)
            };
            phase_table.add_row(vec![
                Cell::new(format!("{}: {}", d.step_index, d.label)),
                Cell::new(d.pairing_key.0.join("/")),
                Cell::new(d.metric.name),
                Cell::new(format!("{:.2}", d.a)),
                Cell::new(format!("{:.2}", d.b)),
                Cell::new(format!("{:+.2}{}", d.delta, d.metric.display_unit)),
                verdict,
            ]);
        }
        println!("{phase_table}");
    }

    if !sorted_unpaired.is_empty() {
        println!();
        println!("phase coverage asymmetry (one-sided phases):");
        let mut unpaired_table = crate::cli::new_table();
        unpaired_table.set_header(vec!["SIDE", "TEST", "PHASE", "METRIC", "VALUE"]);
        // Order: step_index, then side, then pairing key. Time
        // order comes first so missing data appears where it would
        // have occurred in the scenario instead of being grouped
        // by which side lacks it. Within one row the metrics are
        // emitted in the iteration order of `u.metrics`.
        sorted_unpaired.sort_by(|a, b| {
            (a.step_index, a.side.as_str(), &a.pairing_key.0).cmp(&(
                b.step_index,
                b.side.as_str(),
                &b.pairing_key.0,
            ))
        });
        for u in sorted_unpaired {
            let test_label = u.pairing_key.0.join("/");
            let phase_cell = format!("{}: {}", u.step_index, u.label);
            if u.metrics.is_empty() {
                // A phase bucket with no metrics is still shown,
                // with dashes in the metric / value columns: the
                // phase ran on one side but yielded no readable
                // metric data, which is itself a signal. Two paths
                // reach here: (1) capture landed but
                // MetricDef::read_sample returned None for every
                // registered metric on these samples; (2) the
                // phase's only metrics were suppressed Rate
                // components, which metrics_without_suppressed
                // drops from the unpaired row.
                unpaired_table.add_row(vec![
                    Cell::new(u.side.as_str()),
                    Cell::new(test_label),
                    Cell::new(phase_cell),
                    Cell::new("—"),
                    Cell::new("—"),
                ]);
                continue;
            }
            for (metric_name, &value) in &u.metrics {
                unpaired_table.add_row(vec![
                    Cell::new(u.side.as_str()),
                    Cell::new(&test_label),
                    Cell::new(&phase_cell),
                    Cell::new(metric_name),
                    Cell::new(format!("{value:.2}")),
                ]);
            }
        }
        println!("{unpaired_table}");
    }

    // Discovery hint: shown only on the default path. An operator
    // who already passed `--phase`, `--steps-only`,
    // `--phase-threshold`, or `--phases-only` knows the flags
    // exist; `--no-phases` returned at the top of this function.
    let any_flag_set = phase_opts.phases_only
        || phase_opts.steps_only
        || phase_opts.phase.is_some()
        || phase_opts.phase_threshold.is_some();
    if any_flag_set {
        return;
    }
    println!(
        "  phases: {filtered_delta_total} delta row(s) shown \
         ({filtered_delta_regressions} regression{plural}). \
         Filter with --phase N / --phases-only / --steps-only / \
         --phase-threshold P / --no-phases.",
        plural = if filtered_delta_regressions == 1 {
            ""
        } else {
            "s"
        },
    );
}

/// Print the scalar summary block for `stats compare --runs`.
///
/// Always prints a blank line followed by the
/// `summary: R regressions, I improvements, U unchanged` line.
/// Each of the following is then printed only when it applies:
/// - the excluded-pairs line, when `report.excluded_pairs > 0`;
/// - the per-group pass-count block, when BOTH `avg_a` and `avg_b`
///   are `Some` and the formatted block is non-empty;
/// - the "new in B" line, when `report.new_in_b > 0`;
/// - the "removed from A" line, when `report.removed_from_a > 0`.
///
/// Every line describes the scalar findings table. The
/// `--phases-only` gate is applied by the caller, so this function
/// prints unconditionally once invoked.
fn print_summary_block(
    report: &CompareReport,
    avg_a: &Option<Vec<AveragedGroup>>,
    avg_b: &Option<Vec<AveragedGroup>>,
    label_a: &str,
    label_b: &str,
) {
    println!();
    println!(
        "summary: {} regressions, {} improvements, {} unchanged",
        report.regressions, report.improvements, report.unchanged,
    );

    let excluded = report.excluded_pairs;
    if excluded > 0 {
        println!(
            "  {} pairing-key row pair(s) excluded from regression math because one \
             or both sides did not pass (failed, inconclusive, or skipped)",
            excluded,
        );
    }

    // The pass-count block needs the averaged groups from both
    // sides; if either is absent it is skipped entirely.
    if let Some(groups_a) = avg_a {
        if let Some(groups_b) = avg_b {
            let block = format_per_group_pass_counts(groups_a, groups_b, label_a, label_b);
            if !block.is_empty() {
                print!("{block}");
            }
        }
    }

    let added = report.new_in_b;
    if added > 0 {
        println!(
            "  {} row(s) new in '{}' (no matching key in '{}')",
            added, label_b, label_a,
        );
    }

    let removed = report.removed_from_a;
    if removed > 0 {
        println!(
            "  {} row(s) removed from '{}' (no matching key in '{}')",
            removed, label_a, label_b,
        );
    }
}

/// Print the host-context delta for `stats compare --runs`.
///
/// For each side, the representative host is the `host` of the
/// first sidecar in `pool` order whose row matches that side's
/// filter AND which carries host data (`host` is `Some`). Picking
/// from the filtered sidecars rather than the whole pool keeps the
/// delta tied to what actually fed the comparison. The two
/// optional hosts are rendered by [`format_host_delta`] and written
/// with `print!`.
///
/// `pool` and `rows` are walked in lockstep, so `rows[i]` must be
/// the row derived from `pool[i]`. The caller is expected to pass
/// slices of equal length; `zip` would silently stop at the shorter
/// one otherwise.
fn print_host_context_delta(
    pool: &[crate::test_support::SidecarResult],
    rows: &[GauntletRow],
    filter_a: &RowFilter,
    filter_b: &RowFilter,
    label_a: &str,
    label_b: &str,
) {
    // Pair each sidecar with its already-computed row so the filter
    // can run on the row without converting the sidecar again. Only
    // the first host per side is needed, so the search stops at the
    // first hit instead of collecting every matching sidecar into a
    // temporary Vec first.
    let first_host = |filter: &RowFilter| {
        pool.iter()
            .zip(rows.iter())
            .filter(|(_, row)| filter.matches(row))
            .find_map(|(sidecar, _)| sidecar.host.as_ref())
    };
    let host_a = first_host(filter_a);
    let host_b = first_host(filter_b);
    print!("{}", format_host_delta(host_a, host_b, label_a, label_b));
}

/// Reject a row set in which two or more rows share a pairing key.
///
/// Only relevant under `--no-average`, where each sidecar row stays
/// distinct and `compare_rows_by` would otherwise silently latch
/// onto whichever duplicate came first in iteration order.
///
/// Keys are built with `PairingKey::from_row(row, pairing_dims)`
/// and tallied in a `BTreeMap`, so when several keys are duplicated
/// the one reported is the smallest in key order, together with its
/// total occurrence count.
///
/// `side_label` names the side in the message; its lowercase form
/// is used to spell the suggested `--{side}-X` filter flag.
///
/// # Errors
///
/// Returns an error naming the offending key, so the operator can
/// either drop `--no-average` or add another per-side filter to
/// disambiguate.
pub(crate) fn check_no_duplicate_pairing_keys(
    rows: &[GauntletRow],
    pairing_dims: &[Dimension],
    side_label: &str,
) -> anyhow::Result<()> {
    let mut occurrences: BTreeMap<PairingKey, usize> = BTreeMap::new();
    for key in rows.iter().map(|row| PairingKey::from_row(row, pairing_dims)) {
        *occurrences.entry(key).or_default() += 1;
    }

    for (dup_key, count) in &occurrences {
        if *count <= 1 {
            continue;
        }
        anyhow::bail!(
            "stats compare --no-average: side {side_label} has {count} \
             sidecars with the same pairing key {key:?}. Either drop \
             --no-average to average them, or add another --{side}-X \
             filter to disambiguate.",
            key = dup_key.0,
            side = side_label.to_lowercase(),
        );
    }
    Ok(())
}

// NOTE: the paragraph below describes `format_host_delta`, which is
// defined further down in this file. It is kept here as a plain
// comment so that rustdoc does not fold it into the documentation
// of `format_average_header`.
//
// Render the host-context delta section of `stats compare --runs`
// as a block of text ready to `print!`. Extracted as a pure
// function of `(Option<&HostContext>, Option<&HostContext>, &str,
// &str)` so the five match arms can be unit-tested without
// fixturing a real run directory.
//
// The returned string is either empty (when both sides have no
// host data — nothing to print) or ends with a newline so callers
// can chain further output. Single-side cases print a clear
// "captured in X only, delta unavailable" message rather than
// silently suppressing the section — a mixed-tooling-version run
// comparison should surface the asymmetry.

/// Build the one-line averaging header, e.g.
/// `averaged across 3 runs (6.14.2) and 5 runs (6.15.0)`.
///
/// `compare_partitions` prints this line above the comparison
/// table whenever averaging is in effect (its `no_average`
/// argument is `false`).
///
/// Kept as a pure function of (`pre_agg_a`, `pre_agg_b`, `a`, `b`)
/// so the exact operator-visible string can be unit-tested without
/// capturing stdout.
///
/// `pre_agg_a` / `pre_agg_b` are the contributor row counts — the
/// number of rows remaining on each side after the per-side filter
/// and before any grouping — NOT the number of unique pairing keys
/// left after aggregation. The header answers "how many trials got
/// folded?", which is what the contributor count measures.
///
/// `a` / `b` are the side labels shown in parentheses.
pub(crate) fn format_average_header(
    pre_agg_a: usize,
    pre_agg_b: usize,
    a: &str,
    b: &str,
) -> String {
    format!(
        "averaged across {} runs ({}) and {} runs ({})",
        pre_agg_a, a, pre_agg_b, b,
    )
}

/// Build the per-group pass-count block that `print_summary_block`
/// prints below the summary line when both sides were averaged.
///
/// Pure function of (`avg_a`, `avg_b`, `a`, `b`) so the rendered
/// text can be unit-tested without capturing stdout.
///
/// Returns the empty string when neither side has any group.
/// Otherwise the result is, in order:
/// 1. a blank line, separating the block from the preceding
///    `summary:` section;
/// 2. a heading line;
/// 3. one line per (scenario, topology, work_type) group present
///    on either side, in ascending key order:
///    `  scenario/topology/work_type: {a}=N/M {b}=N/M`
///
/// `N/M` is `passes_observed/total_observed`. When any of the
/// skip / inconclusive / fail counters is non-zero, a parenthesised
/// breakdown such as ` (1 skip, 2 fail)` follows, listing only the
/// non-zero buckets so the common all-passed line stays terse. A
/// side that lacks the group renders as `-`, which names the
/// asymmetry directly instead of leaving the operator to infer it
/// from the summary counters.
///
/// The two-space indent matches the continuation lines of the
/// `summary:` block. Every emitted line ends with a newline.
///
/// NOTE(review): groups are keyed on the fixed (scenario,
/// topology, work_type) triple only. If one side can hold several
/// averaged groups that share this triple but differ on another
/// pairing dimension, the later one overwrites the earlier one
/// here — confirm against `group_and_average_by`.
pub(crate) fn format_per_group_pass_counts(
    avg_a: &[AveragedGroup],
    avg_b: &[AveragedGroup],
    a: &str,
    b: &str,
) -> String {
    type GroupKey<'g> = (&'g str, &'g str, &'g str);
    type SidePair<'g> = (Option<&'g AveragedGroup>, Option<&'g AveragedGroup>);

    /// Borrow the (scenario, topology, work_type) triple of a group.
    fn group_key(group: &AveragedGroup) -> GroupKey<'_> {
        (
            group.row.scenario.as_str(),
            group.row.topology.as_str(),
            group.row.work_type.as_str(),
        )
    }

    /// Render one side's `N/M` cell, or `-` when the side lacks the
    /// group.
    fn render_side(side: Option<&AveragedGroup>) -> String {
        let group = match side {
            Some(group) => group,
            None => return "-".to_string(),
        };
        let tally = format!("{}/{}", group.passes_observed, group.total_observed);
        // Skip / inconclusive / fail are reported separately, so
        // they are not hidden in the gap between passes and total.
        // Zero buckets are left out.
        let mut breakdown: Vec<String> = Vec::with_capacity(3);
        if group.skips_observed > 0 {
            breakdown.push(format!("{} skip", group.skips_observed));
        }
        if group.inconclusives_observed > 0 {
            breakdown.push(format!("{} inc", group.inconclusives_observed));
        }
        if group.failures_observed > 0 {
            breakdown.push(format!("{} fail", group.failures_observed));
        }
        if breakdown.is_empty() {
            tally
        } else {
            format!("{tally} ({})", breakdown.join(", "))
        }
    }

    // Slot 0 of each entry holds the A-side group, slot 1 the
    // B-side group; the BTreeMap keeps the lines in key order.
    let mut by_key: BTreeMap<GroupKey<'_>, SidePair<'_>> = BTreeMap::new();
    for group in avg_a {
        by_key.entry(group_key(group)).or_default().0 = Some(group);
    }
    for group in avg_b {
        by_key.entry(group_key(group)).or_default().1 = Some(group);
    }
    if by_key.is_empty() {
        return String::new();
    }

    let mut out = String::from("\n");
    out.push_str(
        "per-group pass counts (passes/total + skip/inconc/fail breakdown when non-zero):\n",
    );
    for ((scn, topo, wt), (ka, kb)) in by_key {
        let pa = render_side(ka);
        let pb = render_side(kb);
        out.push_str(&format!("  {scn}/{topo}/{wt}: {a}={pa} {b}={pb}\n"));
    }
    out
}

/// Build the host-context section of `stats compare --runs` as a
/// string ready to `print!`.
///
/// `host_a` / `host_b` are the representative hosts of the two
/// sides (either may be absent); `a` / `b` are the side labels
/// quoted in the message.
///
/// Result by case:
/// - neither side has host data: the empty string;
/// - only one side has host data:
///   `host: captured in '<label>' only, delta unavailable`, so the
///   asymmetry is reported instead of silently dropped;
/// - both present and `ha.diff(hb)` is empty:
///   `host: identical between '<a>' and '<b>'`, with
///   ` (arch: <arch>)` appended only when both sides carry an
///   `arch` and the two are equal;
/// - both present and the diff is non-empty: a
///   `host delta ('<a>' → '<b>'):` heading followed by the diff
///   text as returned by `diff`.
///
/// Every non-empty result starts with a newline, and the fixed
/// messages end with one. NOTE(review): in the diff case the
/// trailing newline depends on the text `diff` returns — confirm
/// it is newline-terminated.
pub(crate) fn format_host_delta(
    host_a: Option<&crate::host_context::HostContext>,
    host_b: Option<&crate::host_context::HostContext>,
    a: &str,
    b: &str,
) -> String {
    // Settle the cases with zero or one host first; only the
    // two-host case needs a diff.
    let (ha, hb) = match (host_a, host_b) {
        (None, None) => return String::new(),
        (Some(_), None) => {
            return format!("\nhost: captured in '{a}' only, delta unavailable\n");
        }
        (None, Some(_)) => {
            return format!("\nhost: captured in '{b}' only, delta unavailable\n");
        }
        (Some(ha), Some(hb)) => (ha, hb),
    };

    let delta = ha.diff(hb);
    if !delta.is_empty() {
        return format!("\nhost delta ('{a}' → '{b}'):\n{delta}");
    }

    // Identical hosts. Name the architecture only when both sides
    // report one and they agree, so the operator sees WHAT is
    // identical. If either side has no arch (older archive, or the
    // probe failed), use the bare message: a one-sided hint would
    // suggest that the silent side disagreed.
    if let (Some(arch_a), Some(arch_b)) = (ha.arch.as_deref(), hb.arch.as_deref()) {
        if arch_a == arch_b {
            return format!("\nhost: identical between '{a}' and '{b}' (arch: {arch_a})\n");
        }
    }
    format!("\nhost: identical between '{a}' and '{b}'\n")
}