// ktstr 0.4.12 - Docs.rs

//! Group, aggregate, and render the comparison between two
//! [`CtprofSnapshot`]s.
//!
//! Design summary: the per-thread profiler emits
//! one snapshot per run. Comparison groups threads within each
//! snapshot by a single axis (pcomm, cgroup, comm, or
//! comm-exact — see [`GroupBy`]; [`GroupBy::All`] renders the
//! cgroup, pcomm, and comm axes one after another), aggregates
//! every metric per the rule on its [`CtprofMetricDef`], then
//! matches groups across the two snapshots and emits one row per
//! `(group, metric)` pair. Groups present on only one side
//! surface as unmatched entries rather than imaginary
//! zero-valued rows — a row is missing because the process did
//! not exist, not because it did zero work.
//!
//! No judgment labels. The comparison prints raw numbers and
//! percent delta; interpretation (regression vs improvement) is
//! scheduler-specific and left to the user. This mirrors the
//! no-label principle for the broader stats comparison pipeline
//! (see the `stats.rs` module doc).

use std::collections::BTreeMap;
use std::fmt;
use std::path::Path;
use std::sync::LazyLock;

use anyhow::Context;
use regex::Regex;

use crate::ctprof::{
    CgroupCpuStats, CgroupMemoryStats, CgroupPidsStats, CgroupStats, CtprofSnapshot, Psi, PsiHalf,
    PsiResource, ThreadState,
};

/// Grouping key for the ctprof compare.
///
/// Derives [`clap::ValueEnum`], so each variant can be selected
/// on the command line (the variant docs below refer to the flag
/// as `--group-by`).
///
/// The default is [`GroupBy::Pcomm`] — aggregate every thread
/// belonging to the same process name together with token-based
/// pattern normalization, so ephemeral worker pools whose pcomm
/// differs only by digit-suffix collapse across snapshots. The
/// other variants exist for operators who want to slice along a
/// different axis: `Cgroup` groups by cgroup path (useful for
/// container-per-workload deployments); `Comm` groups by thread
/// name across every process with the same token-based pattern
/// normalization (so `tokio-worker-{0..N}` collapse into one
/// `tokio-worker-{N}` bucket and `kworker/0:1H-events_highpri`,
/// `kworker/1:0H-events_highpri`, ... collapse into one
/// `kworker/{N}:{N}H-events_highpri` bucket); `CommExact` groups
/// by literal thread name (useful when distinct token values
/// carry meaning that the normalizer would erase, e.g. tracking
/// each per-CPU `kworker/u8:N` independently). `All` is not an
/// axis of its own: it renders the `Cgroup`, `Pcomm`, and `Comm`
/// groupings one after another in a single run.
#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
pub enum GroupBy {
    /// Group by process name (`pcomm`). Default grouping: pcomm
    /// is the leader thread's `task->comm`, read from
    /// `/proc/<tgid>/comm` at capture time (see
    /// [`crate::ctprof::ThreadState::pcomm`]). Per-thread `comm`
    /// values, by contrast, can drift over a process's lifetime
    /// (worker threads reset their comm under load, `taskset`
    /// toggles names, etc.); the leader's comm is the
    /// per-process identity captured at snapshot time and stays
    /// constant across that snapshot. Pcomm grouping is therefore
    /// the most reliable axis for "give me the per-process
    /// resource picture", which is why it's the default.
    ///
    /// Naive pcomm grouping has one common failure mode: workers
    /// with digit suffixes (`worker-0`, `worker-1`, ...) each
    /// land in their own bucket and the per-pool aggregate gets
    /// scattered across N rows. Token-based pattern
    /// normalization handles this: pcomms that produce the same
    /// skeleton under [`pattern_key`]'s normalizer cluster into
    /// one bucket whose internal join key is the skeleton. The
    /// normalizer splits each pcomm on a separator class
    /// (`[.\-_/:@+\[\]\s]+`) and classifies each token as
    /// pure-digit (`{N}`), hex-like (`{H}`),
    /// alpha-prefix-plus-digits (`prefix{N}`),
    /// digits-plus-alpha-suffix (`{N}suffix`), or literal —
    /// identical rules to the [`Comm`](Self::Comm) axis.
    /// Singleton buckets revert to the literal pcomm so a lone
    /// process stays ungrouped instead of advertising a
    /// `worker-{N}` pattern that no other process shares.
    /// Display labels are generated by `grex` for buckets with
    /// ≥ 2 distinct member pcomms; the rendered label is a
    /// regex showing the shared prefix + alternation, while the
    /// join key remains deterministic across snapshots. Disable
    /// normalization with [`CompareOptions::no_thread_normalize`]
    /// to group by literal `pcomm`.
    Pcomm,
    /// Group by cgroup path. Cgroup-level enrichment is surfaced
    /// in the output alongside the aggregated thread metrics.
    Cgroup,
    /// Group by thread name pattern across every process. Threads
    /// whose names produce the same skeleton under
    /// [`pattern_key`]'s token-based normalizer cluster into one
    /// bucket whose internal join key is the skeleton. The
    /// normalizer splits each comm on a separator class
    /// (`[.\-_/:@+\[\]\s]+`) and classifies each token as pure-digit
    /// (`{N}`), hex-like (`{H}`), alpha-prefix-plus-digits
    /// (`prefix{N}`), digits-plus-alpha-suffix (`{N}suffix`), or
    /// literal. Singleton buckets revert to the literal thread
    /// name so a lone worker stays ungrouped.
    /// Display labels are generated by `grex` for buckets with ≥2
    /// distinct member names; the rendered label is a regex
    /// showing the shared prefix + alternation, while the join key
    /// remains deterministic across snapshots. Disable
    /// normalization with
    /// [`CompareOptions::no_thread_normalize`].
    Comm,
    /// Group by literal thread name (`comm`) — exact match, no
    /// pattern aggregation. Use this when distinct token values
    /// carry meaning the normalizer would erase, e.g. tracking each
    /// per-CPU `kworker/u8:N` independently rather than collapsing
    /// the fleet into one `kworker/u{N}:{N}` bucket.
    ///
    /// Distinct from `--group-by comm --no-thread-normalize`:
    /// this variant ONLY disables thread-axis normalization,
    /// leaving the smaps_rollup pcomm keying still normalized
    /// (per [`collect_smaps_rollup`]). The
    /// `--no-thread-normalize` flag, by contrast, disables
    /// normalization across every name-family axis (Comm, Pcomm,
    /// AND smaps_rollup). Pick `CommExact` when you want literal
    /// thread names but still want smaps to join across
    /// snapshots; pick `Comm + no_thread_normalize` when you
    /// also want literal smaps PID identity.
    CommExact,
    /// Run all three pattern-aware axes (Cgroup → Pcomm → Comm)
    /// and render each as a labeled block. Gives a comprehensive
    /// at-a-glance summary without re-running with different
    /// `--group-by` flags. Each axis gets its own `## Primary
    /// metrics` section, independently truncated by `--limit`.
    All,
}

/// Options controlling [`compare`].
///
/// The struct is `#[non_exhaustive]`: code outside this crate
/// cannot build it with a struct literal. Start from
/// [`CompareOptions::default`] and assign the public fields that
/// need to differ.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct CompareOptions {
    /// Axis along which threads are grouped before aggregation.
    /// Wrapped in [`GroupByOrDefault`] so the derived `Default`
    /// on this struct picks [`GroupBy::Pcomm`].
    pub group_by: GroupByOrDefault,
    /// Glob patterns that collapse dynamic cgroup path segments
    /// to a canonical form before grouping. Applied in listed
    /// order; each pattern that matches a thread's cgroup path
    /// rewrites the matched segments with the literal portions
    /// of the pattern. See [`flatten_cgroup_path`] for the
    /// rewrite rule and examples.
    ///
    /// Independent of [`Self::no_cg_normalize`] — explicit
    /// glob patterns apply first; auto-normalization (token-based)
    /// runs after, gated by `no_cg_normalize`.
    pub cgroup_flatten: Vec<String>,
    /// When true, disable token-based pattern normalization
    /// across every name-family axis: [`GroupBy::Comm`],
    /// [`GroupBy::Pcomm`], AND the smaps_rollup keying in
    /// [`collect_smaps_rollup`] (which keys by
    /// `pattern_key(&t.pcomm)` under default normalization, but
    /// reverts to literal `pcomm[tgid]` when this flag is set so
    /// each PID stays attributable).
    ///
    /// Under this flag: threads / processes group by their
    /// literal name; smaps rows preserve their per-PID identity.
    /// The pure-digit/hex/alpha+digits placeholders never fire on
    /// any of those axes. Mirror of [`Self::no_cg_normalize`] for
    /// the thread / process axes. Has no effect under
    /// [`GroupBy::CommExact`] (already literal) or
    /// [`GroupBy::Cgroup`].
    ///
    /// NOTE(review): the [`GroupBy::CommExact`] doc states that
    /// smaps_rollup keying stays normalized under that grouping
    /// unless this flag is set, so "no effect" above presumably
    /// refers to the thread-grouping axis only — confirm against
    /// [`collect_smaps_rollup`].
    pub no_thread_normalize: bool,
    /// When true, disable token-based pattern normalization for
    /// cgroup-path grouping ([`GroupBy::Cgroup`]). Cgroup paths
    /// group by their literal post-flatten path (no Layer 1, 2,
    /// or 3 substitutions). Explicit `cgroup_flatten` glob
    /// patterns still apply. Has no effect under other groupings.
    pub no_cg_normalize: bool,
    /// Multi-key sort spec for the diff rows. When non-empty,
    /// overrides the default `delta_pct desc` sort. Each
    /// [`SortKey`] names one metric from
    /// [`CTPROF_METRICS`] or [`CTPROF_DERIVED_METRICS`]
    /// and a direction; groups rank by the tuple
    /// (`metric_1_delta`, `metric_2_delta`, ...) under
    /// lexicographic order with per-key direction. Within a
    /// group, rows appear in registry order. The sort
    /// composes with [`Self::group_by`]: groups are formed under
    /// the chosen axis (pcomm / cgroup / comm / comm-exact) and
    /// then ranked by their aggregated metric values, so the
    /// same `sort_by` spec works under every grouping. See
    /// [`parse_sort_by`] for the CLI string parser.
    pub sort_by: Vec<SortKey>,
}

/// One key in a multi-key `--sort-by` spec. Names a metric from
/// [`CTPROF_METRICS`] or [`CTPROF_DERIVED_METRICS`] and
/// the sort direction for that key. Direction defaults to
/// descending (largest delta first) so the common operator
/// request — "show me the biggest regressions first" — is the
/// unmarked form.
///
/// Both fields are `Copy` (`&'static str` and `bool`), which is
/// what lets the struct derive `Copy`.
///
/// NOTE(review): the `thread_count` registry description uses
/// the spelling `--sort-by thread_count:desc`; presumably that
/// parses to `SortKey { metric: "thread_count", descending: true }`
/// — confirm against [`parse_sort_by`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SortKey {
    /// Metric name. Holds one of the [`CTPROF_METRICS`] or
    /// [`CTPROF_DERIVED_METRICS`] entries' `name` fields
    /// verbatim — [`parse_sort_by`] looks up the input string in
    /// either registry and stores the matched `&'static str`, so
    /// this never carries an allocation. Equality against a
    /// registry `name` is by content (`str::eq`); both sides
    /// reference the same `&'static str` from the registry, so
    /// the byte-by-byte comparison succeeds in O(name.len())
    /// without any heap access. The two registries are disjoint
    /// (the `registry_and_derived_names_disjoint` test pins
    /// this) so a `metric` value resolves unambiguously to one
    /// or the other.
    pub metric: &'static str,
    /// True for descending (largest first), false for ascending
    /// (smallest first).
    pub descending: bool,
}

/// Newtype wrapper around [`GroupBy`] that defaults to
/// [`GroupBy::Pcomm`]. Separate type so `CompareOptions::default()`
/// does not need to spell out every field: [`CompareOptions`]
/// derives `Default`, which needs every field type to implement
/// `Default`, and this wrapper supplies that impl for the
/// `group_by` field.
///
/// The inner [`GroupBy`] is public (`.0`); a `From<GroupBy>` impl
/// allows `GroupBy::Cgroup.into()` at construction sites.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GroupByOrDefault(pub GroupBy);

impl Default for GroupByOrDefault {
    fn default() -> Self {
        Self(GroupBy::Pcomm)
    }
}

impl From<GroupBy> for GroupByOrDefault {
    fn from(g: GroupBy) -> Self {
        Self(g)
    }
}

/// Aggregation rule for a single metric.
///
/// Encoded as an enum rather than a trait object so the registry
/// table ([`CTPROF_METRICS`]) can live in static memory. Each
/// variant's accessor returns the typed
/// [`crate::metric_types`] newtype that matches the reduction
/// — the reader and rule are paired by construction so a new
/// metric cannot register a peak field against a sum reducer
/// (`SumNs(|t| t.wait_max)` fails to compile because `wait_max`
/// is `PeakNs`, not `MonotonicNs`).
///
/// Each variant maps 1:1 to a marker trait in
/// [`crate::metric_types`]: `Sum*` variants take a [`Summable`]
/// type, `Max*` variants take a [`Maxable`] type that is NOT
/// also `Summable` (counters use `Sum*` even though they
/// implement both — registering a counter as `Max*` would mask
/// the sum semantics with the per-contributor maximum), `Range*`
/// variants take a [`Rangeable`] type, `Mode*` variants take a
/// [`Modeable`] type or a primitive that the dispatch coerces to
/// `String`, and [`AggRule::Affinity`] takes the dedicated
/// [`crate::metric_types::CpuSet`] for the affinity-summary
/// reduction.
///
/// Every payload is a plain `fn(&ThreadState) -> _` pointer,
/// which is what allows the enum to derive `Copy`.
///
/// [`Summable`]: crate::metric_types::Summable
/// [`Maxable`]: crate::metric_types::Maxable
/// [`Rangeable`]: crate::metric_types::Rangeable
/// [`Modeable`]: crate::metric_types::Modeable
#[derive(Debug, Clone, Copy)]
pub enum AggRule {
    /// Sum across the group of a [`MonotonicCount`] field. Used
    /// for unitless cumulative counters (`nr_wakeups`,
    /// `voluntary_csw`, `minflt`, syscall counts, …). The
    /// dispatch routes through
    /// [`crate::metric_types::Summable::sum_across`] which uses
    /// `saturating_add` per the no-wraparound contract.
    ///
    /// [`MonotonicCount`]: crate::metric_types::MonotonicCount
    SumCount(fn(&ThreadState) -> crate::metric_types::MonotonicCount),
    /// Sum across the group of a [`MonotonicNs`] field. Used for
    /// cumulative-time counters in nanoseconds (`run_time_ns`,
    /// `wait_time_ns`, `wait_sum`, `voluntary_sleep_ns`,
    /// `block_sum`, `iowait_sum`, `core_forceidle_sum`).
    ///
    /// [`MonotonicNs`]: crate::metric_types::MonotonicNs
    SumNs(fn(&ThreadState) -> crate::metric_types::MonotonicNs),
    /// Sum across the group of a [`ClockTicks`] field. Used for
    /// USER_HZ-scaled cumulative time counters
    /// (`utime_clock_ticks`, `stime_clock_ticks`).
    ///
    /// [`ClockTicks`]: crate::metric_types::ClockTicks
    SumTicks(fn(&ThreadState) -> crate::metric_types::ClockTicks),
    /// Sum across the group of a [`Bytes`] field. Used for
    /// IEC-binary-scaled byte counters (`allocated_bytes`,
    /// `deallocated_bytes`, `rchar`, `wchar`, `read_bytes`,
    /// `write_bytes`, `cancelled_write_bytes`).
    ///
    /// [`Bytes`]: crate::metric_types::Bytes
    SumBytes(fn(&ThreadState) -> crate::metric_types::Bytes),
    /// Maximum across the group of a [`PeakNs`] field — the
    /// kernel `*_max` schedstats (`wait_max`, `sleep_max`,
    /// `block_max`, `exec_max`, `slice_max`). Each thread
    /// already carries its own lifetime max-seen value from the
    /// kernel's scheduler call sites (e.g. `update_se` in
    /// `kernel/sched/fair.c` for `exec_max`; see
    /// `struct sched_statistics` in `include/linux/sched.h`).
    /// Group-level reduction takes the largest across members so
    /// a row surfaces the worst single window any thread in the
    /// group has ever experienced. Summing per-thread maxes
    /// would conflate "one thread with a 1s spike" with "1000
    /// threads with 1ms spikes each" — `PeakNs` therefore does
    /// NOT implement `Summable`, and trying to register one as
    /// `SumNs` is a compile error.
    ///
    /// [`PeakNs`]: crate::metric_types::PeakNs
    MaxPeak(fn(&ThreadState) -> crate::metric_types::PeakNs),
    /// Maximum across the group of a [`PeakBytes`] field — the
    /// byte-typed twin of [`AggRule::MaxPeak`]. Used for
    /// taskstats-sourced lifetime memory watermarks
    /// (`hiwater_rss`, `hiwater_vm`).
    /// `xacct_add_tsk` (`kernel/tsacct.c::xacct_add_tsk`, lines
    /// 99-104) reads the watermark out of the SHARED `mm_struct`
    /// via `get_mm_hiwater_rss(mm)` / `get_mm_hiwater_vm(mm)`, so
    /// sibling threads of the same tgid all report the same
    /// value; cross-thread Max within a single process is a no-op.
    /// Cross-PROCESS Max (e.g. under `--group-by pcomm` when the
    /// bucket spans multiple parent processes) is the meaningful
    /// reduction: it picks the largest watermark any tgid in the
    /// bucket reported. Routes through the IEC binary auto-scale
    /// ladder ([`crate::metric_types::ScaleLadder::Bytes`]) so a
    /// 7.5 GiB watermark renders as `7.500GiB` instead of
    /// dominating the table with raw byte counts. Summing
    /// watermarks would over-count shared address-space mappings
    /// across sibling threads N-fold — `PeakBytes` does NOT
    /// implement `Summable`.
    ///
    /// [`PeakBytes`]: crate::metric_types::PeakBytes
    MaxPeakBytes(fn(&ThreadState) -> crate::metric_types::PeakBytes),
    /// Maximum across the group of a [`GaugeNs`] field —
    /// instantaneous-time gauges where summing is meaningless.
    /// `fair_slice_ns` is the per-thread CURRENT scheduler slice
    /// (stale under SCHED_EXT — see field doc) read at capture
    /// time, not a high-water value. Summing instantaneous
    /// gauges produces a number with no physical meaning — N
    /// nearly-identical instantaneous values sum to `N * gauge`
    /// regardless of group composition, drowning the signal.
    /// Max instead surfaces "the longest current slice any
    /// thread in the bucket is running with", which IS the
    /// signal a user comparing two snapshots cares about.
    ///
    /// [`GaugeNs`]: crate::metric_types::GaugeNs
    MaxGaugeNs(fn(&ThreadState) -> crate::metric_types::GaugeNs),
    /// Maximum across the group of a [`GaugeCount`] field —
    /// leader-deduped structural counts. `nr_threads` is
    /// populated only on the tgid leader (`tid == tgid`) and
    /// zero on every non-leader thread of the same process; see
    /// `capture_thread_at_with_tally`. Sum across a comm- or
    /// cgroup-bucketed group would render 0 for any bucket
    /// whose leader fell elsewhere because non-leader members
    /// each contribute 0. Max instead reads through to the
    /// leader's value, surfacing "the largest process
    /// represented in this bucket" regardless of which axis the
    /// bucket is built around. The row count already covers
    /// "how many threads are here", so the structural field's
    /// value adds new information rather than restating the row
    /// count.
    ///
    /// [`GaugeCount`]: crate::metric_types::GaugeCount
    MaxGaugeCount(fn(&ThreadState) -> crate::metric_types::GaugeCount),
    /// Ordinal i32 ([`OrdinalI32`]), aggregated as the observed
    /// [min, max] range.
    /// Used for signed-domain ordinals (`nice`, `priority`,
    /// `processor`). Delta math uses the midpoint of each range
    /// as the scalar; output prints both the range and the
    /// delta. The dispatch routes through
    /// [`crate::metric_types::Rangeable::range_across`] and
    /// widens to `i64` for [`Aggregated::OrdinalRange`].
    ///
    /// [`OrdinalI32`]: crate::metric_types::OrdinalI32
    RangeI32(fn(&ThreadState) -> crate::metric_types::OrdinalI32),
    /// Ordinal u32 ([`OrdinalU32`]), aggregated as the observed
    /// [min, max] range.
    /// Used for unsigned-domain ordinals (`rt_priority`,
    /// kernel-typed `unsigned int`). Same shape as
    /// [`AggRule::RangeI32`] but the inner width matches the
    /// kernel-side `unsigned int` declaration; the dispatch
    /// widens the resulting `u32` to `i64` for
    /// [`Aggregated::OrdinalRange`].
    ///
    /// [`OrdinalU32`]: crate::metric_types::OrdinalU32
    RangeU32(fn(&ThreadState) -> crate::metric_types::OrdinalU32),
    /// Categorical string, aggregated as the mode (most-frequent
    /// value). Used for `policy` (string-valued
    /// [`crate::metric_types::CategoricalString`]). Delta is
    /// textual: "same" if both modes agree, "differs" otherwise
    /// — there is no arithmetic on a policy name. The dispatch
    /// routes through
    /// [`crate::metric_types::Modeable::mode_across`].
    Mode(fn(&ThreadState) -> crate::metric_types::CategoricalString),
    /// Categorical char, aggregated as the mode. Used for
    /// `state` (single-letter task state from
    /// `/proc/<tid>/status`). The dispatch coerces the `char`
    /// to a `String` via `to_string()` before reducing — `char`
    /// itself is NOT [`Modeable`] (only
    /// [`crate::metric_types::CategoricalString`] is), so this
    /// variant exists to keep the registry's accessor type
    /// matching the `ThreadState` field type without forcing the
    /// field into a wrapper. If a second char-valued metric
    /// appears, promote both fields to a dedicated
    /// `CategoricalChar` wrapper rather than continuing the
    /// ad-hoc coercion (mirrors the `CategoricalBool`
    /// promotion guidance on [`AggRule::ModeBool`]).
    ///
    /// [`Modeable`]: crate::metric_types::Modeable
    ModeChar(fn(&ThreadState) -> char),
    /// Categorical bool, aggregated as the mode. Used for
    /// `ext_enabled` (sched_ext class membership). Same shape as
    /// [`AggRule::ModeChar`]: the dispatch coerces via
    /// `to_string()` so `"true"`/`"false"` participate in the
    /// mode reduction. If a second bool-valued metric appears,
    /// promote both fields to a dedicated `CategoricalBool`
    /// wrapper rather than continuing the ad-hoc coercion.
    ///
    /// Tiebreak skew (FA-2): the lex-smallest-wins tiebreak
    /// inside `Modeable::mode_across` makes `"false"` (`'f'`,
    /// 0x66) win an equal-count tie against `"true"` (`'t'`,
    /// 0x74). This matches the legacy pre-phase-3 behavior —
    /// the old `to_string()` coercion fed the same string pair
    /// through the same lex-tiebreak — but is worth flagging
    /// explicitly: a 50/50 sched_ext-on/off bucket renders
    /// `false` as the mode rather than picking the more
    /// "informative" `true`. Operators reading a `false` mode
    /// in a heterogeneous bucket should check the `count/total`
    /// fraction.
    ModeBool(fn(&ThreadState) -> bool),
    /// CPU affinity set, aggregated as the num_cpus range across
    /// the group plus a uniform-cpuset rendering when every
    /// thread shared the same allowed set. Used for
    /// `cpu_affinity`. The accessor returns
    /// [`crate::metric_types::CpuSet`]; the dispatch unwraps to
    /// `Vec<u32>` for the [`AffinitySummary`] reduction.
    ///
    /// Unlike the `Sum*` / `Max*` / `Range*` / `Mode*` rules,
    /// `Affinity` does NOT route through a
    /// [`crate::metric_types`] trait method — its reduction
    /// produces an [`AffinitySummary`] (num_cpus range +
    /// uniform-cpuset flag), not a homogeneous `CpuSet`, so the
    /// inline aggregator in [`aggregate`] walks the per-thread
    /// `Vec<u32>` directly. A future `Affinable` trait could
    /// fold the body into [`crate::metric_types`] but the
    /// summary type is single-use today.
    ///
    /// Type-system bypass caveat (FA-1), which applies to every
    /// typed-accessor variant of this enum and not only to
    /// `Affinity`: the typed `AggRule`
    /// shape catches "wrong wrapper" mistakes
    /// (`SumNs(|t| t.wait_max)` fails to compile because
    /// `wait_max` is `PeakNs`), but a closure body that
    /// actively MISWRAPS the underlying field — e.g.
    /// `SumNs(|t| MonotonicNs(t.wait_max.0))` — laundering a
    /// peak through the sum wrapper still type-checks. Don't
    /// do that. The wrapper category is load-bearing; the type
    /// system catches the variant mismatch but cannot
    /// inspect the inside of an arbitrary closure.
    Affinity(fn(&ThreadState) -> crate::metric_types::CpuSet),
}

/// One metric exposed by the comparison pipeline.
///
/// The auto-scale ladder for the rendered cell is derived from
/// [`AggRule::ladder`] at render time — there is no separate
/// `unit` tag on the metric def. A registry entry that pairs an
/// AggRule variant with a category-mismatched ladder fails at
/// compile time (the ladder mapping is a closed match on the
/// variant, not a free-form string).
///
/// Every field is `Copy` (`&'static` references, a `bool`, and
/// `Copy` enums), so entries can sit in the `static`
/// [`CTPROF_METRICS`] slice and be copied out freely.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct CtprofMetricDef {
    /// Metric name: the ASCII short-form identifier (e.g.
    /// `"nice"`, `"priority"`). [`SortKey::metric`] stores this
    /// same `&'static str` when a `--sort-by` key names the
    /// metric.
    pub name: &'static str,
    /// Per-group reduction for this metric, paired with the
    /// accessor that reads the value off a [`ThreadState`]. See
    /// [`AggRule`] for the available reductions.
    pub rule: AggRule,
    /// Scheduler-class scope for the metric. `None` means
    /// class-agnostic — every task class accumulates the value
    /// (e.g. `nr_migrations`). Concrete spellings:
    /// - `"cfs-only"` — incremented strictly inside CFS-class
    ///   call paths (`kernel/sched/fair.c`), zero under
    ///   SCHED_EXT / SCHED_FIFO / SCHED_RR / SCHED_DEADLINE /
    ///   SCHED_IDLE. Examples: `nr_wakeups_affine`,
    ///   `nr_wakeups_affine_attempts`, `nr_failed_migrations_*`,
    ///   `nr_forced_migrations`, `slice_max`.
    /// - `"fair-policy"` — emitted only when
    ///   `fair_policy(p->policy)` returns true. Per
    ///   `kernel/sched/sched.h:194,203`, that admits
    ///   SCHED_NORMAL, SCHED_BATCH, AND SCHED_EXT (under
    ///   CONFIG_SCHED_CLASS_EXT). Zero under SCHED_FIFO/RR/DL/IDLE.
    ///   Example: `fair_slice_ns`.
    /// - `"non-ext"` — written by the schedstats sleep/wait
    ///   family wrappers `__update_stats_enqueue_sleeper`
    ///   (kernel/sched/stats.c:48) and `__update_stats_wait_end`
    ///   (kernel/sched/stats.c:21), called from fair.c, rt.c,
    ///   deadline.c but NOT ext.c — i.e. CFS/RT/DL accumulate,
    ///   sched_ext bypasses. Examples: `wait_sum`, `wait_count`,
    ///   `wait_max`, `voluntary_sleep_ns`, `sleep_max`,
    ///   `block_sum`, `block_max`, `iowait_sum`, `iowait_count`.
    pub sched_class: Option<&'static str>,
    /// Kernel CONFIG options that gate the metric. `&[]` means
    /// no gating (always populated when the source path runs).
    /// One element typically; multi-element when more than one
    /// gate is required (e.g. `core_forceidle_sum` requires
    /// CONFIG_SCHED_CORE AND CONFIG_SCHEDSTATS). Concrete
    /// spellings match the literal `Kconfig` symbol so an
    /// operator can `grep CONFIG_X /boot/config-$(uname -r)` to
    /// confirm. Verified gates:
    /// - `"CONFIG_SCHEDSTATS"` — gates every `__schedstat_*` /
    ///   `schedstat_*` macro call. Off → the macro is
    ///   `do { } while (0)` per `kernel/sched/stats.h:75-82`.
    /// - `"CONFIG_SCHED_INFO"` — gates the lighter-weight
    ///   `sched_info_*` accounting (`run_time_ns`,
    ///   `wait_time_ns`, `timeslices`); the schedstat file is
    ///   gated by `sched_info_on()` at
    ///   `proc_pid_schedstat` (fs/proc/base.c:511-523).
    /// - `"CONFIG_SCHED_CORE"` — gates the core-scheduling
    ///   subsystem (`__account_forceidle_time`).
    /// - `"CONFIG_SCHED_CLASS_EXT"` — gates the sched_ext
    ///   class. When off, no task can land on ext, so
    ///   `ext_enabled` reads false uniformly.
    /// - `"CONFIG_TASK_DELAY_ACCT"` — gates the delayacct
    ///   accounting path that populates the taskstats genetlink
    ///   delay-family fields (`cpu_delay_*`, `blkio_delay_*`,
    ///   etc.).
    /// - `"CONFIG_TASK_IO_ACCOUNTING"` — gates the per-task
    ///   I/O accounting fields exposed by `/proc/<tid>/io`
    ///   (`rchar`, `wchar`, `syscr`, `syscw`, `read_bytes`,
    ///   `write_bytes`, `cancelled_write_bytes`). The kernel
    ///   emits all 7 fields under one `do_io_accounting` call,
    ///   and CONFIG_TASK_IO_ACCOUNTING `depends on`
    ///   CONFIG_TASK_XACCT in `init/Kconfig` — so from the
    ///   procfs-reader perspective the file is all-or-nothing.
    pub config_gates: &'static [&'static str],
    /// True for kernel counters that are exposed in `/proc`
    /// but never incremented anywhere in the kernel tree —
    /// always reads zero. Operators reading the rendered table
    /// see the `[dead]` flag and stop chasing the always-zero
    /// cell. The registry is currently empty of `is_dead: true`
    /// entries: the previously-registered dead counters
    /// (`nr_wakeups_idle`, `nr_wakeups_passive`,
    /// `nr_migrations_cold`) were dropped from `ThreadState`
    /// and the registry; the kernel still emits the lines so
    /// the parser silently ignores them. The flag remains as
    /// infrastructure: a future kernel that resurrects a dead
    /// counter (or exposes a new always-zero one) registers
    /// with `is_dead: true` and the `[dead]` rendering path
    /// fires.
    pub is_dead: bool,
    /// One-line operator-facing description of what this metric
    /// counts. Surfaced by the `ctprof metric-list`
    /// subcommand alongside the bracketed tag suffix so an
    /// operator scanning a rendered table can map an unfamiliar
    /// metric name to its semantics without leaving the CLI.
    /// Plain ASCII. "Cumulative" is load-bearing — use it to
    /// distinguish counters from gauges; the [`AggRule`] only
    /// names the per-group reduction, not the per-thread
    /// counter shape.
    pub description: &'static str,
    /// Section this metric belongs to for the `--sections`
    /// per-row filter. Most rows tag [`Section::Primary`];
    /// taskstats-sourced rows (the eight delay-accounting
    /// categories plus the two memory watermarks) carry
    /// [`Section::TaskstatsDelay`] so an operator can scope
    /// the rendered table down to (or away from) the taskstats
    /// rows. The primary-table emitter checks
    /// [`DisplayOptions::is_section_enabled`] per row before
    /// rendering — `--sections taskstats-delay` keeps only
    /// taskstats rows, `--sections primary` excludes them, and
    /// either alone keeps the primary table open. The default
    /// (empty filter) renders every row regardless of section.
    pub section: Section,
}

/// Registry of per-thread metrics. Order here is the default
/// display order for rows that have no numeric delta to sort by
/// (ties fall back to registry order). Names are the ASCII
/// short-form used in capture code; long-form display is the
/// same — no translation layer.
///
/// **PSI is intentionally not in this registry.** Each
/// [`AggRule`] variant's accessor takes `&ThreadState` and
/// returns a [`crate::metric_types`] newtype (or a primitive
/// the dispatch coerces via `to_string()` for `ModeChar` /
/// `ModeBool`); only per-thread data fits that signature, while
/// Pressure Stall Information is per-snapshot (host-level) and
/// per-cgroup. PSI surfaces in dedicated secondary tables
/// under "## Host pressure / ..." and "## Pressure / ..."
/// headers, rendered by [`write_diff`] / `write_show` directly
/// rather than via [`AggRule`]. See [`Psi`] / [`PsiResource`] /
/// [`PsiHalf`] for the data model.
pub static CTPROF_METRICS: &[CtprofMetricDef] = &[
    // structural: group population count
    CtprofMetricDef {
        name: "thread_count",
        rule: AggRule::SumCount(|_| crate::metric_types::MonotonicCount(1)),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Number of threads in this group. Each thread contributes 1; the sum is the group population. Useful for --sort-by thread_count:desc to find groups where thread count changed the most.",
        section: Section::Primary,
    },
    // identity / structural (non-numeric aggregation)
    CtprofMetricDef {
        name: "policy",
        rule: AggRule::Mode(|t| t.policy.clone()),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Scheduling policy (SCHED_OTHER, SCHED_FIFO, SCHED_RR, SCHED_BATCH, SCHED_IDLE, SCHED_DEADLINE, SCHED_EXT).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "nice",
        rule: AggRule::RangeI32(|t| t.nice),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Nice value (-20..19); CFS priority knob.",
        section: Section::Primary,
    },
    // `task_prio()` value from `/proc/<tid>/stat` field 18.
    // Per-thread ordinal — aggregate as an ordinal range
    // (`AggRule::RangeI32`, mirroring `nice` directly above),
    // not Sum. Kernel ranges per
    // `task_prio()` at `kernel/sched/syscalls.c:170`:
    // CFS=[0..39], RT=[-2..-100], DL=-101 — see the field
    // doc on [`ThreadState::priority`].
    CtprofMetricDef {
        name: "priority",
        rule: AggRule::RangeI32(|t| t.priority),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Kernel task priority from /proc/<tid>/stat field 18 (CFS=[0..39], RT=[-2..-100], DL=-101).",
        section: Section::Primary,
    },
    // Real-time scheduler priority from `/proc/<tid>/stat`
    // field 40. Bounded 0..99 in practice (SCHED_FIFO /
    // SCHED_RR range); zero for non-RT tasks. Aggregated as an
    // ordinal range (`AggRule::RangeU32`) to surface the spread
    // across a group, like `nice` and `priority`.
    CtprofMetricDef {
        name: "rt_priority",
        rule: AggRule::RangeU32(|t| t.rt_priority),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Real-time scheduler priority (0..99); 0 for non-RT tasks.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "cpu_affinity",
        rule: AggRule::Affinity(|t| t.cpu_affinity.clone()),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Set of CPUs the task is allowed to run on (sched_getaffinity result).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "processor",
        rule: AggRule::RangeI32(|t| t.processor),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Last CPU the task ran on.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "state",
        rule: AggRule::ModeChar(|t| t.state),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Task state letter (R running, S sleeping, D uninterruptible, Z zombie, T stopped).",
        section: Section::Primary,
    },
    // `ext_enabled` reflects whether the task is currently on
    // the sched_ext class. Gated by CONFIG_SCHED_CLASS_EXT —
    // when off, no task can land on ext, so the field reads
    // `false` uniformly across every thread.
    CtprofMetricDef {
        name: "ext_enabled",
        rule: AggRule::ModeBool(|t| t.ext_enabled),
        sched_class: None,
        config_gates: &["CONFIG_SCHED_CLASS_EXT"],
        is_dead: false,
        description: "Whether the task is currently dispatched on the sched_ext class.",
        section: Section::Primary,
    },
    // Process-wide thread count (`signal_struct->nr_threads`)
    // from `/proc/<tid>/status` `Threads:`. Capture-side
    // populates only on tid == tgid threads (leader dedup), so
    // every non-leader thread carries 0 — Sum across a group
    // would render 0 for any bucket whose leader is not part of
    // the bucket (e.g. `--group-by comm` puts non-leader threads
    // in their own comm bucket). `Max` answers "largest process
    // represented in this bucket"; the `thread_count` row already
    // covers "how many threads are here". Identity/structural rather
    // than counter — placement here mirrors `state` and
    // `ext_enabled` (per-thread snapshots, not deltas).
    CtprofMetricDef {
        name: "nr_threads",
        rule: AggRule::MaxGaugeCount(|t| t.nr_threads),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Process-wide thread count (signal_struct->nr_threads); leader-only.",
        section: Section::Primary,
    },
    // scheduling
    // `run_time_ns` from `/proc/<tid>/schedstat` field 1 —
    // gated by CONFIG_SCHED_INFO via `sched_info_on()` at
    // `proc_pid_schedstat` (fs/proc/base.c:511-523).
    CtprofMetricDef {
        name: "run_time_ns",
        rule: AggRule::SumNs(|t| t.run_time_ns),
        sched_class: None,
        config_gates: &["CONFIG_SCHED_INFO"],
        is_dead: false,
        description: "Cumulative on-CPU time, ns; /proc/<tid>/schedstat field 1.",
        section: Section::Primary,
    },
    // `wait_time_ns` from `/proc/<tid>/schedstat` field 2 —
    // gated by CONFIG_SCHED_INFO via `sched_info_on()` at
    // `proc_pid_schedstat` (fs/proc/base.c:511-523).
    CtprofMetricDef {
        name: "wait_time_ns",
        rule: AggRule::SumNs(|t| t.wait_time_ns),
        sched_class: None,
        config_gates: &["CONFIG_SCHED_INFO"],
        is_dead: false,
        description: "Cumulative time waiting on the runqueue, ns; schedstat field 2.",
        section: Section::Primary,
    },
    // `timeslices` from `/proc/<tid>/schedstat` field 3 —
    // same gate as `wait_time_ns`.
    CtprofMetricDef {
        name: "timeslices",
        rule: AggRule::SumCount(|t| t.timeslices),
        sched_class: None,
        config_gates: &["CONFIG_SCHED_INFO"],
        is_dead: false,
        description: "Number of times the task was run on a CPU; schedstat field 3.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "voluntary_csw",
        rule: AggRule::SumCount(|t| t.voluntary_csw),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Voluntary context switches (task gave up the CPU itself).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "nonvoluntary_csw",
        rule: AggRule::SumCount(|t| t.nonvoluntary_csw),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Involuntary context switches (task was preempted).",
        section: Section::Primary,
    },
    // `nr_wakeups`, `_local`, `_remote`, `_sync`, `_migrate`
    // are class-agnostic — `__schedstat_inc` from
    // `kernel/sched/core.c::ttwu_stat` (e.g. line 3614 for the
    // base counter) fires for every task class. The macro
    // expands to `do { } while (0)` under !CONFIG_SCHEDSTATS
    // per `kernel/sched/stats.h:75-82`.
    CtprofMetricDef {
        name: "nr_wakeups",
        rule: AggRule::SumCount(|t| t.nr_wakeups),
        sched_class: None,
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Total wakeups via try_to_wake_up().",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "nr_wakeups_local",
        rule: AggRule::SumCount(|t| t.nr_wakeups_local),
        sched_class: None,
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Wakeups landed on the same CPU as the waker.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "nr_wakeups_remote",
        rule: AggRule::SumCount(|t| t.nr_wakeups_remote),
        sched_class: None,
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Wakeups landed on a different CPU than the waker.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "nr_wakeups_sync",
        rule: AggRule::SumCount(|t| t.nr_wakeups_sync),
        sched_class: None,
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "WF_SYNC wakeups (synchronous wakeup hint to scheduler).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "nr_wakeups_migrate",
        rule: AggRule::SumCount(|t| t.nr_wakeups_migrate),
        sched_class: None,
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Wakeups where the task migrated to a different CPU than its prior one (WF_MIGRATED); distinct from nr_wakeups_remote (waker CPU != target CPU).",
        section: Section::Primary,
    },
    // `nr_wakeups_affine`, `_attempts` are CFS-only —
    // `kernel/sched/fair.c::wake_affine` calls
    // `schedstat_inc(p->stats.nr_wakeups_affine_attempts)` at
    // line 7604 and the matching `_affine` increment at line
    // 7609. Both expand only under CFS task lifetime, so a
    // task on SCHED_EXT / SCHED_FIFO / SCHED_RR / SCHED_DL
    // never accumulates them.
    CtprofMetricDef {
        name: "nr_wakeups_affine",
        rule: AggRule::SumCount(|t| t.nr_wakeups_affine),
        sched_class: Some("cfs-only"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Wakeups that succeeded under the wake_affine() heuristic.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "nr_wakeups_affine_attempts",
        rule: AggRule::SumCount(|t| t.nr_wakeups_affine_attempts),
        sched_class: Some("cfs-only"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "wake_affine() attempts; success rate = nr_wakeups_affine / attempts.",
        section: Section::Primary,
    },
    // `nr_migrations` is incremented unconditionally at
    // `kernel/sched/core.c:3283` (`p->se.nr_migrations++`) — no
    // schedstat macro, no class gating. Always populated.
    CtprofMetricDef {
        name: "nr_migrations",
        rule: AggRule::SumCount(|t| t.nr_migrations),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Cumulative cross-CPU migrations of the task.",
        section: Section::Primary,
    },
    // `nr_forced_migrations` is set by
    // `kernel/sched/fair.c:9775` (`schedstat_inc`) inside
    // CFS-only load-balancing.
    CtprofMetricDef {
        name: "nr_forced_migrations",
        rule: AggRule::SumCount(|t| t.nr_forced_migrations),
        sched_class: Some("cfs-only"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Migrations forced by the CFS load balancer.",
        section: Section::Primary,
    },
    // `nr_failed_migrations_*` family — all CFS-only,
    // incremented in `kernel/sched/fair.c::can_migrate_task`
    // (lines 9701, 9735, 9761, 9942).
    CtprofMetricDef {
        name: "nr_failed_migrations_affine",
        rule: AggRule::SumCount(|t| t.nr_failed_migrations_affine),
        sched_class: Some("cfs-only"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Load-balancer migrations rejected for cpu-affinity reasons.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "nr_failed_migrations_running",
        rule: AggRule::SumCount(|t| t.nr_failed_migrations_running),
        sched_class: Some("cfs-only"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Load-balancer migrations rejected because the task was running.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "nr_failed_migrations_hot",
        rule: AggRule::SumCount(|t| t.nr_failed_migrations_hot),
        sched_class: Some("cfs-only"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Load-balancer migrations rejected because the task was cache-hot.",
        section: Section::Primary,
    },
    // `wait_sum` / `wait_count` / `wait_max` — written by
    // `__update_stats_wait_end` (`kernel/sched/stats.c:21`),
    // which is called from `update_stats_wait_end_fair`
    // (kernel/sched/fair.c:1426), `update_stats_wait_end_dl`
    // (kernel/sched/deadline.c:2114), and
    // `update_stats_wait_end_rt` (kernel/sched/rt.c:1282) —
    // i.e. CFS, RT, AND DL classes accumulate. Sched_ext bypasses
    // these wrappers, so the counters stay at zero for SCHED_EXT
    // tasks. Tagged `non-ext`. Expanded to a no-op under
    // !CONFIG_SCHEDSTATS via the schedstat macros at
    // `kernel/sched/stats.h:75-82`.
    CtprofMetricDef {
        name: "wait_sum",
        rule: AggRule::SumNs(|t| t.wait_sum),
        sched_class: Some("non-ext"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Cumulative time the task waited on the runqueue, ns.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "wait_count",
        rule: AggRule::SumCount(|t| t.wait_count),
        sched_class: Some("non-ext"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Number of distinct runqueue-wait intervals the task accumulated.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "wait_max",
        rule: AggRule::MaxPeak(|t| t.wait_max),
        sched_class: Some("non-ext"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Longest single runqueue-wait interval observed, ns.",
        section: Section::Primary,
    },
    // `voluntary_sleep_ns` / `sleep_max` / `block_sum` /
    // `block_max` / `iowait_sum` / `iowait_count` — written by
    // `__update_stats_enqueue_sleeper` (kernel/sched/stats.c:48),
    // which is called from `update_stats_enqueue_sleeper_fair`
    // (kernel/sched/fair.c:1452),
    // `update_stats_enqueue_sleeper_dl`
    // (kernel/sched/deadline.c:2122), and
    // `update_stats_enqueue_sleeper_rt`
    // (kernel/sched/rt.c:1252). Same shape as the wait_* family
    // above: CFS+RT+DL accumulate, sched_ext bypasses, so the
    // counters stay at zero for SCHED_EXT tasks. Tagged `non-ext`.
    // Expanded to a no-op under !CONFIG_SCHEDSTATS via the
    // schedstat macros at `kernel/sched/stats.h:75-82`.
    // `voluntary_sleep_ns` is the capture-side normalization of
    // the kernel's `sum_sleep_runtime` — the raw value
    // double-counts block under sleep, so capture subtracts
    // `sum_block_runtime` before storing.
    CtprofMetricDef {
        name: "voluntary_sleep_ns",
        rule: AggRule::SumNs(|t| t.voluntary_sleep_ns),
        sched_class: Some("non-ext"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Pure voluntary sleep time (TASK_INTERRUPTIBLE only), ns; capture-side normalized as sum_sleep_runtime - sum_block_runtime so the kernel's sleep/block double-count is stripped before delta math.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "sleep_max",
        rule: AggRule::MaxPeak(|t| t.sleep_max),
        sched_class: Some("non-ext"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Longest single sleep interval observed, ns.",
        section: Section::Primary,
    },
    // No `sleep_count` metric: the kernel does not emit that
    // counter — the wake-side tally is captured by `nr_wakeups`
    // already.
    CtprofMetricDef {
        name: "block_sum",
        rule: AggRule::SumNs(|t| t.block_sum),
        sched_class: Some("non-ext"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Cumulative time the task spent blocked (TASK_UNINTERRUPTIBLE), ns.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "block_max",
        rule: AggRule::MaxPeak(|t| t.block_max),
        sched_class: Some("non-ext"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Longest single uninterruptible-block interval observed, ns.",
        section: Section::Primary,
    },
    // No `block_count` metric: the kernel emits no per-event
    // counter for `sum_block_runtime` (unlike `wait_sum/wait_count`
    // and `iowait_sum/iowait_count` pairs).
    CtprofMetricDef {
        name: "iowait_sum",
        rule: AggRule::SumNs(|t| t.iowait_sum),
        sched_class: Some("non-ext"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Cumulative time the task spent in iowait, ns.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "iowait_count",
        rule: AggRule::SumCount(|t| t.iowait_count),
        sched_class: Some("non-ext"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Number of distinct iowait intervals the task accumulated.",
        section: Section::Primary,
    },
    // delayacct_blkio_ticks (the procfs USER_HZ-ticks delivery
    // of the same delay-accounting block-I/O bucket) was removed
    // because `blkio_delay_total_ns` from the taskstats genetlink
    // path supersedes it: same kernel data via the same
    // CONFIG_TASK_DELAY_ACCT gate, but ns precision instead of
    // USER_HZ truncation, no procfs round-trip, and one row in
    // the rendered registry instead of two. ktstr always runs as
    // root (CAP_NET_ADMIN is implicit), so the procfs fallback
    // bought no extra coverage.
    // `exec_max` is set inside `update_se`
    // (`kernel/sched/fair.c:1335`), guarded by
    // `if (schedstat_enabled())`. Reachable from sched_ext via
    // `update_curr_common` (`kernel/sched/ext.c:1355`), so
    // class-agnostic at runtime, gated only by CONFIG_SCHEDSTATS.
    CtprofMetricDef {
        name: "exec_max",
        rule: AggRule::MaxPeak(|t| t.exec_max),
        sched_class: None,
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Longest single uninterrupted on-CPU run observed, ns.",
        section: Section::Primary,
    },
    // `slice_max` is part of the CFS-class statistics struct.
    // Per the kernel-field-semantics audit, zero under
    // sched_ext / RT / DL because the populating call sites
    // live in CFS-class entry points.
    CtprofMetricDef {
        name: "slice_max",
        rule: AggRule::MaxPeak(|t| t.slice_max),
        sched_class: Some("cfs-only"),
        config_gates: &["CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Longest CFS slice the task was granted, ns.",
        section: Section::Primary,
    },
    // Cumulative core-scheduling forced-idle time, ns. Counter
    // (Sum). Increment is class-agnostic: `__account_forceidle_time()`
    // at `kernel/sched/cputime.c:244` does a plain
    // `__schedstat_add(p->stats.core_forceidle_sum, delta)` on
    // whichever task is running on each SMT sibling, called
    // from `__sched_core_account_forceidle()` in
    // `kernel/sched/core_sched.c:287`. Real gating is at
    // build/rq level: CONFIG_SCHED_CORE + CONFIG_SCHEDSTATS +
    // `core_forceidle_count > 0`. See [`ThreadState::core_forceidle_sum`]
    // for the full caller chain.
    // Auto_scale ns ladder takes ns → µs → ms → s. Lives next
    // to `slice_max` because both relate to scheduler-decision
    // moments rather than wait/sleep accumulation.
    CtprofMetricDef {
        name: "core_forceidle_sum",
        rule: AggRule::SumNs(|t| t.core_forceidle_sum),
        sched_class: None,
        config_gates: &["CONFIG_SCHED_CORE", "CONFIG_SCHEDSTATS"],
        is_dead: false,
        description: "Cumulative time this task forced its SMT sibling idle, ns (core scheduling).",
        section: Section::Primary,
    },
    // Current scheduler slice in ns (stale under SCHED_EXT —
    // see field doc) from `/proc/<tid>/sched`'s `slice` line.
    // Per-thread instantaneous gauge (NOT a high-water counter
    // — `slice_max` directly above is the historical max).
    // Aggregating across a group via Max surfaces the longest
    // current slice any thread is running with — Sum would
    // multiply a near-identical value across the group and
    // obscure the signal. Name `fair_slice_ns` mirrors the
    // kernel emission gate `fair_policy(p->policy)` at
    // `kernel/sched/debug.c:1363`, which (per
    // `kernel/sched/sched.h:194,203`) accepts SCHED_NORMAL,
    // SCHED_BATCH, AND SCHED_EXT under CONFIG_SCHED_CLASS_EXT.
    CtprofMetricDef {
        name: "fair_slice_ns",
        rule: AggRule::MaxGaugeNs(|t| t.fair_slice_ns),
        sched_class: Some("fair-policy"),
        config_gates: &[],
        is_dead: false,
        description: "Current scheduler slice, ns; snapshot from /proc/<tid>/sched (stale under sched_ext).",
        section: Section::Primary,
    },
    // memory
    CtprofMetricDef {
        name: "allocated_bytes",
        rule: AggRule::SumBytes(|t| t.allocated_bytes),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "jemalloc per-thread allocated bytes (TSD thread_allocated counter).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "deallocated_bytes",
        rule: AggRule::SumBytes(|t| t.deallocated_bytes),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "jemalloc per-thread deallocated bytes (TSD thread_deallocated counter).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "minflt",
        rule: AggRule::SumCount(|t| t.minflt),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Minor page faults (resolved without I/O).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "majflt",
        rule: AggRule::SumCount(|t| t.majflt),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Major page faults (required disk I/O to resolve).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "utime_clock_ticks",
        rule: AggRule::SumTicks(|t| t.utime_clock_ticks),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "User-mode CPU time, USER_HZ ticks; /proc/<tid>/stat field 14.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "stime_clock_ticks",
        rule: AggRule::SumTicks(|t| t.stime_clock_ticks),
        sched_class: None,
        config_gates: &[],
        is_dead: false,
        description: "Kernel-mode CPU time, USER_HZ ticks; /proc/<tid>/stat field 15.",
        section: Section::Primary,
    },
    // I/O — `/proc/<tid>/io` is emitted by
    // `do_io_accounting` (`fs/proc/base.c`) under a single
    // `CONFIG_TASK_IO_ACCOUNTING` gate, and CONFIG_TASK_IO_ACCOUNTING
    // `depends on` CONFIG_TASK_XACCT in init/Kconfig — so from
    // the capture-pipeline perspective the file is
    // all-or-nothing. All 7 fields (the six directly below plus
    // `cancelled_write_bytes`) share the same
    // `CONFIG_TASK_IO_ACCOUNTING` gate.
    CtprofMetricDef {
        name: "rchar",
        rule: AggRule::SumBytes(|t| t.rchar),
        sched_class: None,
        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
        is_dead: false,
        description: "Bytes read at the read syscall layer (incl. cached / pagecache hits).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "wchar",
        rule: AggRule::SumBytes(|t| t.wchar),
        sched_class: None,
        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
        is_dead: false,
        description: "Bytes written at the write syscall layer (incl. pagecache / writeback).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "syscr",
        rule: AggRule::SumCount(|t| t.syscr),
        sched_class: None,
        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
        is_dead: false,
        description: "Number of read syscalls.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "syscw",
        rule: AggRule::SumCount(|t| t.syscw),
        sched_class: None,
        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
        is_dead: false,
        description: "Number of write syscalls.",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "read_bytes",
        rule: AggRule::SumBytes(|t| t.read_bytes),
        sched_class: None,
        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
        is_dead: false,
        description: "Bytes that hit the storage device on read (excludes pagecache hits).",
        section: Section::Primary,
    },
    CtprofMetricDef {
        name: "write_bytes",
        rule: AggRule::SumBytes(|t| t.write_bytes),
        sched_class: None,
        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
        is_dead: false,
        description: "Bytes that hit the storage device on write (post-writeback).",
        section: Section::Primary,
    },
    // `cancelled_write_bytes` from `/proc/<tid>/io` 7th line.
    // `task_io_account_cancelled_write` (kernel
    // include/linux/task_io_accounting_ops.h:39-42) increments
    // `current->ioac.cancelled_write_bytes` from
    // `folio_account_cleaned` (mm/page-writeback.c:2628) when a
    // dirty folio is reclaimed without writeback (truncate /
    // inode invalidation), so the per-thread value records on
    // the truncating task — not necessarily the original writer.
    // Group-level Sum is meaningful (total cancelled-write
    // bytes for the bucket); per-thread `write_bytes -
    // cancelled_write_bytes` is NOT a derived metric because
    // the two counters track distinct parties — see the field
    // doc on ThreadState::cancelled_write_bytes.
    CtprofMetricDef {
        name: "cancelled_write_bytes",
        rule: AggRule::SumBytes(|t| t.cancelled_write_bytes),
        sched_class: None,
        config_gates: &["CONFIG_TASK_IO_ACCOUNTING"],
        is_dead: false,
        description: "Bytes the kernel deaccounted from a prior dirty-write because the page was reclaimed without writeback (truncate / inode invalidation); recorded on the truncating task, not the writer. Per-thread `write_bytes - cancelled_write_bytes` is NOT a valid derivation — see field doc.",
        section: Section::Primary,
    },
    // taskstats — captured via the kernel's genetlink TASKSTATS
    // family ([`crate::taskstats`]). Two field families share the
    // CONFIG_TASKSTATS netlink-family gate but differ in the
    // per-family kconfig:
    //
    //   - delay-accounting fields (cpu/blkio/swapin/freepages/
    //     thrashing/compact/wpcopy/irq × count/total/max/min,
    //     32 entries) are gated on CONFIG_TASKSTATS +
    //     CONFIG_TASK_DELAY_ACCT (the per-task counters in
    //     `kernel/delayacct.c`); the runtime `delayacct=on` toggle
    //     (sysctl `kernel.task_delayacct` or boot param
    //     `delayacct`) is a separate condition that must hold for
    //     the counters to actually update.
    //   - memory-watermark fields (hiwater_rss_bytes,
    //     hiwater_vm_bytes) are gated on CONFIG_TASKSTATS +
    //     CONFIG_TASK_XACCT (the extended-accounting path in
    //     `kernel/tsacct.c::xacct_add_tsk`); they do NOT respond
    //     to the `delayacct=on` toggle.
    //
    // Calling the netlink family additionally requires
    // `CAP_NET_ADMIN`. Any failed gate / missing cap collapses
    // the affected fields to zero per the best-effort capture
    // contract.
    //
    // CPU-delay block: cpu_count + cpu_delay_total are RACY —
    // updated by the sched_info path without a lock, so a reader
    // may observe count or total advance ahead of the other.
    // (cpu_delay_max / cpu_delay_min are PeakNs lifetime
    // watermarks updated at delayacct path entries; same race
    // window in principle, but the watermark semantics already
    // mask brief skew.)
    CtprofMetricDef {
        name: "cpu_delay_count",
        rule: AggRule::SumCount(|t| t.cpu_delay_count),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Number of off-CPU windows the task waited for the runqueue to schedule it (taskstats cpu_count). RACY: count + total are not updated atomically.",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "cpu_delay_total_ns",
        rule: AggRule::SumNs(|t| t.cpu_delay_total_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Cumulative ns the task waited on the runqueue (taskstats cpu_delay_total). Distinct from `wait_sum` (schedstat) which captures the same wait-for-CPU bucket via a different code path. RACY (see cpu_delay_count).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "cpu_delay_max_ns",
        rule: AggRule::MaxPeak(|t| t.cpu_delay_max_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Longest single CPU-wait window observed, ns (taskstats cpu_delay_max).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "cpu_delay_min_ns",
        rule: AggRule::MaxPeak(|t| t.cpu_delay_min_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Shortest non-zero CPU-wait window observed, ns (taskstats cpu_delay_min). Sentinel 0 means \"no events observed\" — compare against cpu_delay_count.",
        section: Section::TaskstatsDelay,
    },
    // Block-I/O delay block: serializes through `task->delays->lock`
    // so count + total are atomic (unlike cpu_*).
    CtprofMetricDef {
        name: "blkio_delay_count",
        rule: AggRule::SumCount(|t| t.blkio_delay_count),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Number of synchronous block-I/O wait windows (taskstats blkio_count).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "blkio_delay_total_ns",
        rule: AggRule::SumNs(|t| t.blkio_delay_total_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Cumulative ns waiting on synchronous block I/O (taskstats blkio_delay_total). Distinct from `iowait_sum` (schedstat).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "blkio_delay_max_ns",
        rule: AggRule::MaxPeak(|t| t.blkio_delay_max_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Longest single block-I/O wait observed, ns (taskstats blkio_delay_max).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "blkio_delay_min_ns",
        rule: AggRule::MaxPeak(|t| t.blkio_delay_min_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Shortest non-zero block-I/O wait observed, ns (taskstats blkio_delay_min). Sentinel 0 means \"no events observed\".",
        section: Section::TaskstatsDelay,
    },
    // Swap-in delay block: OVERLAPS with thrashing_* — every
    // thrashing event is also a swapin event from the syscall
    // layer. Do not sum swapin and thrashing.
    CtprofMetricDef {
        name: "swapin_delay_count",
        rule: AggRule::SumCount(|t| t.swapin_delay_count),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Number of swap-in wait windows (taskstats swapin_count). OVERLAPS with thrashing_delay_count — do not sum.",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "swapin_delay_total_ns",
        rule: AggRule::SumNs(|t| t.swapin_delay_total_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Cumulative ns waiting for swap-in to complete (taskstats swapin_delay_total).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "swapin_delay_max_ns",
        rule: AggRule::MaxPeak(|t| t.swapin_delay_max_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Longest single swap-in wait observed, ns (taskstats swapin_delay_max).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "swapin_delay_min_ns",
        rule: AggRule::MaxPeak(|t| t.swapin_delay_min_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Shortest non-zero swap-in wait observed, ns (taskstats swapin_delay_min). Sentinel 0 means \"no events observed\".",
        section: Section::TaskstatsDelay,
    },
    // Direct memory reclaim (free-pages) block.
    CtprofMetricDef {
        name: "freepages_delay_count",
        rule: AggRule::SumCount(|t| t.freepages_delay_count),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Number of direct-reclaim wait windows (taskstats freepages_count).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "freepages_delay_total_ns",
        rule: AggRule::SumNs(|t| t.freepages_delay_total_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Cumulative ns waiting in direct memory reclaim (taskstats freepages_delay_total).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "freepages_delay_max_ns",
        rule: AggRule::MaxPeak(|t| t.freepages_delay_max_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Longest single direct-reclaim wait observed, ns (taskstats freepages_delay_max).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "freepages_delay_min_ns",
        rule: AggRule::MaxPeak(|t| t.freepages_delay_min_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Shortest non-zero direct-reclaim wait observed, ns (taskstats freepages_delay_min). Sentinel 0 means \"no events observed\".",
        section: Section::TaskstatsDelay,
    },
    // Thrashing block: OVERLAPS with swapin_* (see above).
    CtprofMetricDef {
        name: "thrashing_delay_count",
        rule: AggRule::SumCount(|t| t.thrashing_delay_count),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Number of thrashing wait windows (taskstats thrashing_count). OVERLAPS with swapin_delay_count — do not sum.",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "thrashing_delay_total_ns",
        rule: AggRule::SumNs(|t| t.thrashing_delay_total_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Cumulative ns waiting under thrashing pressure (taskstats thrashing_delay_total).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "thrashing_delay_max_ns",
        rule: AggRule::MaxPeak(|t| t.thrashing_delay_max_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Longest single thrashing wait observed, ns (taskstats thrashing_delay_max).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "thrashing_delay_min_ns",
        rule: AggRule::MaxPeak(|t| t.thrashing_delay_min_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Shortest non-zero thrashing wait observed, ns (taskstats thrashing_delay_min). Sentinel 0 means \"no events observed\".",
        section: Section::TaskstatsDelay,
    },
    // Memory compaction block.
    CtprofMetricDef {
        name: "compact_delay_count",
        rule: AggRule::SumCount(|t| t.compact_delay_count),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Number of memory-compaction wait windows (taskstats compact_count).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "compact_delay_total_ns",
        rule: AggRule::SumNs(|t| t.compact_delay_total_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Cumulative ns waiting on memory compaction (taskstats compact_delay_total).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "compact_delay_max_ns",
        rule: AggRule::MaxPeak(|t| t.compact_delay_max_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Longest single compaction wait observed, ns (taskstats compact_delay_max).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "compact_delay_min_ns",
        rule: AggRule::MaxPeak(|t| t.compact_delay_min_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Shortest non-zero compaction wait observed, ns (taskstats compact_delay_min). Sentinel 0 means \"no events observed\".",
        section: Section::TaskstatsDelay,
    },
    // Write-protect-copy (CoW) fault block.
    CtprofMetricDef {
        name: "wpcopy_delay_count",
        rule: AggRule::SumCount(|t| t.wpcopy_delay_count),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Number of write-protect-copy (CoW) fault wait windows (taskstats wpcopy_count).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "wpcopy_delay_total_ns",
        rule: AggRule::SumNs(|t| t.wpcopy_delay_total_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Cumulative ns waiting on write-protect-copy faults (taskstats wpcopy_delay_total).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "wpcopy_delay_max_ns",
        rule: AggRule::MaxPeak(|t| t.wpcopy_delay_max_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Longest single write-protect-copy fault wait observed, ns (taskstats wpcopy_delay_max).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "wpcopy_delay_min_ns",
        rule: AggRule::MaxPeak(|t| t.wpcopy_delay_min_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Shortest non-zero write-protect-copy fault wait observed, ns (taskstats wpcopy_delay_min). Sentinel 0 means \"no events observed\".",
        section: Section::TaskstatsDelay,
    },
    // IRQ-handler delay block. Updates from `delayacct_irq` in
    // `kernel/delayacct.c` — counts kernel-IRQ time charged to
    // the task.
    CtprofMetricDef {
        name: "irq_delay_count",
        rule: AggRule::SumCount(|t| t.irq_delay_count),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Number of IRQ-handler windows charged to the task (taskstats irq_count).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "irq_delay_total_ns",
        rule: AggRule::SumNs(|t| t.irq_delay_total_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Cumulative ns of IRQ handling charged to the task (taskstats irq_delay_total).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "irq_delay_max_ns",
        rule: AggRule::MaxPeak(|t| t.irq_delay_max_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Longest single IRQ-handler window observed, ns (taskstats irq_delay_max).",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "irq_delay_min_ns",
        rule: AggRule::MaxPeak(|t| t.irq_delay_min_ns),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"],
        is_dead: false,
        description: "Shortest non-zero IRQ-handler window observed, ns (taskstats irq_delay_min). Sentinel 0 means \"no events observed\".",
        section: Section::TaskstatsDelay,
    },
    // Lifetime memory watermarks. Updates from `xacct_add_tsk` in
    // `kernel/tsacct.c` — kB → bytes conversion happens at parse
    // time in `crate::taskstats::parse_taskstats_payload`. Gated
    // on CONFIG_TASK_XACCT (the "extended accounting" path), NOT
    // CONFIG_TASK_DELAY_ACCT — `xacct_add_tsk` lives behind
    // `CONFIG_TASK_XACCT` while delayacct is the parallel
    // `CONFIG_TASK_DELAY_ACCT` subsystem; the two are
    // independently selectable.
    CtprofMetricDef {
        name: "hiwater_rss_bytes",
        rule: AggRule::MaxPeakBytes(|t| t.hiwater_rss_bytes),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_XACCT"],
        is_dead: false,
        description: "Lifetime high-watermark of resident-set size, bytes (taskstats hiwater_rss). Distinct from smaps_rollup_kb[\"Rss\"] which is the CURRENT RSS.",
        section: Section::TaskstatsDelay,
    },
    CtprofMetricDef {
        name: "hiwater_vm_bytes",
        rule: AggRule::MaxPeakBytes(|t| t.hiwater_vm_bytes),
        sched_class: None,
        config_gates: &["CONFIG_TASKSTATS", "CONFIG_TASK_XACCT"],
        is_dead: false,
        description: "Lifetime high-watermark of virtual-memory size, bytes (taskstats hiwater_vm).",
        section: Section::TaskstatsDelay,
    },
];

// ---------------------------------------------------------------------------
// Derived metrics
// ---------------------------------------------------------------------------

/// Value produced by a derived-metric computation.
///
/// Every derivation yields an `f64`. A floating-point carrier is
/// used because the registered derivations span four different
/// value ranges:
/// - `[0, 1]` ratios: `cpu_efficiency`, `affine_success_ratio`,
///   `involuntary_csw_ratio`.
/// - `[0, ∞)` ratios: `disk_io_fraction` (readahead can pull more
///   block-device bytes than the syscall requested, so the ratio
///   exceeds 1.0 in practice).
/// - `[0, ∞)` per-event means: `avg_wait_ns`, `avg_slice_ns`,
///   `avg_iowait_ns` — a sum divided by a count, both
///   non-negative.
/// - `(-∞, ∞)` signed differences: `live_heap_estimate` =
///   `allocated_bytes - deallocated_bytes`, which goes negative
///   when the deallocation total exceeds the allocation total (a
///   freelist drains memory allocated before capture began, or
///   the per-thread TSD counters were sampled mid-update on a
///   thread that has just released a large arena).
///
/// The unit is not encoded in the value type. The auto-scale
/// ladder is declared per derivation on
/// [`DerivedMetricDef::ladder`], so the renderer picks the right
/// magnitude (ns / Bytes / unitless) per row whether the value is
/// positive, zero, negative, fractional, or in the millions. The
/// `is_ratio` flag on [`DerivedMetricDef`] selects between the
/// auto-scaled path (e.g. `1.500ms`, `7.500GiB`) and the raw
/// three-decimal path (`0.873` for ratios).
///
/// Sign preservation: the [`auto_scale`] step uses `abs()` for
/// the threshold check but carries the original signed value
/// into the scaled output, and [`format_derived_value_cell`] /
/// [`format_derived_delta_cell`] both render with `{value:.2}`
/// or `{value:.3}` formatters that keep the explicit `-` on
/// negatives. The [`auto_scale_preserves_sign_on_negative_input`]
/// regression test pins this for the Bytes and ns ladders.
#[derive(Debug, Clone, Copy, PartialEq)]
#[non_exhaustive]
pub enum DerivedValue {
    /// Floating-point value. How it renders is decided by the
    /// [`DerivedMetricDef::ladder`] + [`DerivedMetricDef::is_ratio`]
    /// pair: ratios format with three decimals (`0.873`,
    /// `+0.100`), while ladder-bearing values
    /// ([`ScaleLadder::Ns`] / [`ScaleLadder::Bytes`] / etc.)
    /// go through the same auto-scale ladders the main table
    /// uses.
    Scalar(f64),
}

impl DerivedValue {
    /// Unwrap the carried `f64`. Convenience for delta math done
    /// by consumers of [`DerivedRow`].
    pub fn as_f64(&self) -> f64 {
        // `Scalar` is the only variant today, so the pattern is
        // irrefutable; adding a variant turns this into a compile
        // error rather than a silent fallthrough.
        let Self::Scalar(value) = *self;
        value
    }
}

/// Definition of a derived metric: a function that consumes the
/// already-aggregated input metrics for a group and produces a
/// single scalar (with its own unit and operator-facing
/// description).
///
/// The compute fn returns `None` when an input metric is missing
/// from the group's metrics map (capture-side gated by a kernel
/// CONFIG that wasn't enabled, or jemalloc not linked) OR when
/// the formula would divide by zero. The renderer surfaces a
/// `None` cell as `-` so the operator can distinguish "not
/// computable" from "computed as zero".
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct DerivedMetricDef {
    /// Registry name of the derived metric (e.g. `avg_wait_ns`,
    /// `cpu_efficiency`).
    pub name: &'static str,
    /// Auto-scale ladder for the cell. [`ScaleLadder::None`] for
    /// ratio rows (renders as a bare three-decimal scalar with
    /// no suffix), [`ScaleLadder::Ns`] / [`ScaleLadder::Bytes`] /
    /// etc. for unit-bearing derivations. The same closed-match
    /// dispatch [`AggRule::ladder`] feeds.
    pub ladder: ScaleLadder,
    /// Operator-facing one-line description; surfaced by the
    /// `ctprof metric-list` subcommand.
    pub description: &'static str,
    /// Names of input metrics from [`CTPROF_METRICS`]. Pure
    /// documentation — surfaces in the `metric-list` output so
    /// the operator sees what each derivation depends on. The
    /// `compute` fn looks its inputs up by name itself and does
    /// not read this list.
    pub inputs: &'static [&'static str],
    /// Render-shape flag for dimensionless quantities. When true,
    /// the renderer (1) suppresses the `%` (delta_pct) column,
    /// (2) renders the value as `N.NNN` with three decimals
    /// instead of routing through the auto-scale ladder, and
    /// (3) renders the delta as `+/-N.NNN` (no scaled unit
    /// suffix).
    ///
    /// The `[0, 1]` interval is the common case where this flag
    /// applies: `cpu_efficiency`, `affine_success_ratio`, and
    /// `involuntary_csw_ratio` all live in `[0, 1]`. Delta on a
    /// `[0, 1]` ratio already reads as percentage points
    /// (0.5 → 0.6 = +0.100 = +10pp). Also printing
    /// `delta / baseline` in the `%` column would show `+20%`
    /// for that same change — a percentage of a percentage that
    /// double-encodes the signal — so the column is suppressed.
    ///
    /// `disk_io_fraction` (range `[0, ∞)`) carries `is_ratio: true`
    /// for the rendering shape but does NOT satisfy the
    /// percentage-points interpretation: a value of 1.5 is
    /// possible (readahead pulls more block-device bytes than
    /// the syscall requested), so a delta of +0.100 reads as
    /// "ratio rose by 0.1" rather than "ratio rose by 10
    /// percentage points." The render shape is still correct
    /// (suppress `%`, three decimals, no auto-scale) — only the
    /// pp interpretation is invalid.
    pub is_ratio: bool,
    /// The computation. Pulls input scalars from the group's
    /// metrics map via `Aggregated::numeric()` and produces the
    /// derived scalar, or `None` when the value is not
    /// computable (missing input or zero denominator). Stored as
    /// a plain `fn` pointer, so registry entries must be
    /// non-capturing.
    pub compute: fn(&BTreeMap<String, Aggregated>) -> Option<DerivedValue>,
    /// Section this derived metric belongs to for the
    /// `--sections` per-row filter, mirroring
    /// [`CtprofMetricDef::section`]. Most derivations tag
    /// [`Section::Derived`]; the 9 derivations whose inputs are
    /// taskstats fields (the eight `avg_*_delay_ns` averages
    /// plus `total_offcpu_delay_ns`) tag
    /// [`Section::TaskstatsDelay`] so an operator running
    /// `--sections taskstats-delay` gets a full taskstats view
    /// — the 34 raw rows AND the 9 derivations that depend on
    /// them — without dragging in unrelated derived metrics.
    /// The `## Derived metrics` table emitter checks
    /// [`DisplayOptions::is_section_enabled`] per row before
    /// rendering, and the outer-table gate opens whenever EITHER
    /// section in the rendered set is enabled.
    pub section: Section,
}

/// Helper: look up `name` in the group's metrics map and return
/// its `Aggregated::numeric()` projection. Yields `None` when the
/// metric is absent from the map or when `numeric()` itself
/// returns `None`.
fn input_scalar(metrics: &BTreeMap<String, Aggregated>, name: &str) -> Option<f64> {
    let aggregated = metrics.get(name)?;
    aggregated.numeric()
}

/// Helper: plain quotient `numerator / denominator` over two
/// registry inputs. Yields `None` when either input is missing
/// OR the denominator equals zero, so the renderer shows `-`
/// instead of NaN/inf. Most derived metrics are a simple
/// two-input quotient and route through here.
fn ratio_compute(
    metrics: &BTreeMap<String, Aggregated>,
    numerator: &str,
    denominator: &str,
) -> Option<DerivedValue> {
    let top = input_scalar(metrics, numerator)?;
    let bottom = input_scalar(metrics, denominator)?;
    // Only divide when the denominator is non-zero.
    (bottom != 0.0).then(|| DerivedValue::Scalar(top / bottom))
}

/// Helper: share-of-total ratio `num / (num + addend)`, for
/// derivations whose denominator is the sum of two registry
/// inputs. Yields `None` when either input is missing OR the
/// summed denominator is zero. Callers: `cpu_efficiency`
/// (run / (run + wait)) and `involuntary_csw_ratio`
/// (nvcsw / (vcsw + nvcsw)).
fn ratio_of_sum_compute(
    metrics: &BTreeMap<String, Aggregated>,
    numerator: &str,
    addend: &str,
) -> Option<DerivedValue> {
    let part = input_scalar(metrics, numerator)?;
    let total = part + input_scalar(metrics, addend)?;
    if total != 0.0 {
        Some(DerivedValue::Scalar(part / total))
    } else {
        None
    }
}

/// Registry of derived metrics. Each entry consumes one or more
/// already-aggregated input metrics from
/// [`CTPROF_METRICS`] and produces a single scalar with its
/// own unit. Each entry's `description` string states the
/// formula; the `//` comments between entries carry the
/// rationale for grouping and for the overlap handling.
///
/// Every `compute` value is a non-capturing closure, which is
/// what lets it coerce to the plain `fn` pointer the
/// [`DerivedMetricDef::compute`] field requires.
pub static CTPROF_DERIVED_METRICS: &[DerivedMetricDef] = &[
    // -- schedstat / procfs derivations (Section::Derived) --
    DerivedMetricDef {
        name: "affine_success_ratio",
        ladder: ScaleLadder::None,
        description: "wake_affine() success ratio: nr_wakeups_affine / nr_wakeups_affine_attempts.",
        inputs: &["nr_wakeups_affine", "nr_wakeups_affine_attempts"],
        is_ratio: true,
        compute: |m| ratio_compute(m, "nr_wakeups_affine", "nr_wakeups_affine_attempts"),
        section: Section::Derived,
    },
    DerivedMetricDef {
        name: "avg_wait_ns",
        ladder: ScaleLadder::Ns,
        description: "Average runqueue-wait duration per scheduling event: wait_sum / wait_count (ns/event).",
        inputs: &["wait_sum", "wait_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "wait_sum", "wait_count"),
        section: Section::Derived,
    },
    // `voluntary_sleep_sum` derived metric was removed when
    // `voluntary_sleep_ns` became a first-class capture field.
    // The kernel's `sum_sleep_runtime - sum_block_runtime`
    // computation now happens at capture time inside
    // `capture_thread_at_with_tally` so every consumer reads the
    // pre-normalized value without re-deriving.
    DerivedMetricDef {
        name: "cpu_efficiency",
        ladder: ScaleLadder::None,
        description: "Fraction of total scheduler-tracked time spent on-CPU: run_time_ns / (run_time_ns + wait_time_ns).",
        inputs: &["run_time_ns", "wait_time_ns"],
        is_ratio: true,
        compute: |m| ratio_of_sum_compute(m, "run_time_ns", "wait_time_ns"),
        section: Section::Derived,
    },
    DerivedMetricDef {
        name: "avg_slice_ns",
        ladder: ScaleLadder::Ns,
        description: "Average on-CPU slice length per timeslice: run_time_ns / timeslices (ns/timeslice).",
        inputs: &["run_time_ns", "timeslices"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "run_time_ns", "timeslices"),
        section: Section::Derived,
    },
    // The numerator (`nonvoluntary_csw`) is passed first: the
    // helper computes `first / (first + second)`.
    DerivedMetricDef {
        name: "involuntary_csw_ratio",
        ladder: ScaleLadder::None,
        description: "Fraction of context switches that were preemptions: nonvoluntary_csw / (voluntary_csw + nonvoluntary_csw).",
        inputs: &["nonvoluntary_csw", "voluntary_csw"],
        is_ratio: true,
        compute: |m| ratio_of_sum_compute(m, "nonvoluntary_csw", "voluntary_csw"),
        section: Section::Derived,
    },
    DerivedMetricDef {
        name: "disk_io_fraction",
        ladder: ScaleLadder::None,
        description: "Fraction of read syscall bytes that hit storage: read_bytes / rchar. Typically <= 1.0 but can exceed when readahead pulls more block-device bytes than the syscall requested.",
        inputs: &["read_bytes", "rchar"],
        is_ratio: true,
        compute: |m| ratio_compute(m, "read_bytes", "rchar"),
        section: Section::Derived,
    },
    // The only signed derivation: a difference rather than a
    // quotient, so there is no zero-denominator case and `None`
    // arises only from a missing input.
    DerivedMetricDef {
        name: "live_heap_estimate",
        ladder: ScaleLadder::Bytes,
        description: "jemalloc live-heap estimate: allocated_bytes - deallocated_bytes. Signed: negative when deallocations dominate (freelist drains memory allocated before capture, or sampled mid-update on a thread that just released a large arena). Renders with explicit `-` and the IEC binary suffix (e.g. `-1.907MiB`).",
        inputs: &["allocated_bytes", "deallocated_bytes"],
        is_ratio: false,
        compute: |m| {
            let alloc = input_scalar(m, "allocated_bytes")?;
            let dealloc = input_scalar(m, "deallocated_bytes")?;
            Some(DerivedValue::Scalar(alloc - dealloc))
        },
        section: Section::Derived,
    },
    DerivedMetricDef {
        name: "avg_iowait_ns",
        ladder: ScaleLadder::Ns,
        description: "Average iowait interval per iowait event: iowait_sum / iowait_count (ns/event).",
        inputs: &["iowait_sum", "iowait_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "iowait_sum", "iowait_count"),
        section: Section::Derived,
    },
    // -- taskstats per-category averages (delay_total / count) --
    //
    // One average per delay-accounting category. Same shape as
    // avg_wait_ns / avg_iowait_ns above (sum-over-count quotient,
    // ns ladder, non-ratio). The category-specific caveats from
    // the registry (cpu RACY, swapin/thrashing OVERLAP, sentinel
    // semantics) carry forward into the description so an operator
    // reading `metric-list` for the derived row sees the same
    // gating discipline they get for the raw count/total fields.
    // A zero count makes `ratio_compute` return `None`, so a
    // category with no events renders `-` rather than 0.
    DerivedMetricDef {
        name: "avg_cpu_delay_ns",
        ladder: ScaleLadder::Ns,
        description: "Average CPU-wait per scheduling event: cpu_delay_total_ns / cpu_delay_count (ns/event). RACY: the kernel updates count + total via the lockless sched_info path, so a concurrent reader may observe one ahead of the other; the quotient is approximate at the sub-event scale and stable at the integrated scale.",
        inputs: &["cpu_delay_total_ns", "cpu_delay_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "cpu_delay_total_ns", "cpu_delay_count"),
        section: Section::TaskstatsDelay,
    },
    DerivedMetricDef {
        name: "avg_blkio_delay_ns",
        ladder: ScaleLadder::Ns,
        description: "Average synchronous block-I/O wait per event: blkio_delay_total_ns / blkio_delay_count (ns/event). Distinct from avg_iowait_ns (schedstat) — this travels through the delayacct path and is the canonical delay-accounting block-I/O reading.",
        inputs: &["blkio_delay_total_ns", "blkio_delay_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "blkio_delay_total_ns", "blkio_delay_count"),
        section: Section::TaskstatsDelay,
    },
    DerivedMetricDef {
        name: "avg_swapin_delay_ns",
        ladder: ScaleLadder::Ns,
        description: "Average swap-in wait per event: swapin_delay_total_ns / swapin_delay_count (ns/event). OVERLAPS with thrashing — every thrashing event is also a swapin event from the syscall layer; do not sum the two averages or the underlying totals directly.",
        inputs: &["swapin_delay_total_ns", "swapin_delay_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "swapin_delay_total_ns", "swapin_delay_count"),
        section: Section::TaskstatsDelay,
    },
    DerivedMetricDef {
        name: "avg_freepages_delay_ns",
        ladder: ScaleLadder::Ns,
        description: "Average direct-reclaim wait per event: freepages_delay_total_ns / freepages_delay_count (ns/event).",
        inputs: &["freepages_delay_total_ns", "freepages_delay_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "freepages_delay_total_ns", "freepages_delay_count"),
        section: Section::TaskstatsDelay,
    },
    DerivedMetricDef {
        name: "avg_thrashing_delay_ns",
        ladder: ScaleLadder::Ns,
        description: "Average thrashing wait per event: thrashing_delay_total_ns / thrashing_delay_count (ns/event). OVERLAPS with swapin (see avg_swapin_delay_ns).",
        inputs: &["thrashing_delay_total_ns", "thrashing_delay_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "thrashing_delay_total_ns", "thrashing_delay_count"),
        section: Section::TaskstatsDelay,
    },
    DerivedMetricDef {
        name: "avg_compact_delay_ns",
        ladder: ScaleLadder::Ns,
        description: "Average memory-compaction wait per event: compact_delay_total_ns / compact_delay_count (ns/event).",
        inputs: &["compact_delay_total_ns", "compact_delay_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "compact_delay_total_ns", "compact_delay_count"),
        section: Section::TaskstatsDelay,
    },
    DerivedMetricDef {
        name: "avg_wpcopy_delay_ns",
        ladder: ScaleLadder::Ns,
        description: "Average write-protect-copy fault wait per event: wpcopy_delay_total_ns / wpcopy_delay_count (ns/event).",
        inputs: &["wpcopy_delay_total_ns", "wpcopy_delay_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "wpcopy_delay_total_ns", "wpcopy_delay_count"),
        section: Section::TaskstatsDelay,
    },
    DerivedMetricDef {
        name: "avg_irq_delay_ns",
        ladder: ScaleLadder::Ns,
        description: "Average IRQ-handler window per event: irq_delay_total_ns / irq_delay_count (ns/event).",
        inputs: &["irq_delay_total_ns", "irq_delay_count"],
        is_ratio: false,
        compute: |m| ratio_compute(m, "irq_delay_total_ns", "irq_delay_count"),
        section: Section::TaskstatsDelay,
    },
    // -- taskstats off-CPU rollup --
    //
    // Sum of every meaningful off-CPU delay category. Combines
    // cpu (runqueue wait), blkio (sync I/O wait), freepages
    // (direct reclaim), compact (compaction), wpcopy (CoW fault),
    // irq (IRQ-handler windows), and the LARGER of (swapin,
    // thrashing) — the two share the same syscall-layer event,
    // so summing both would double-count a thrashing-induced
    // swapin. `?` propagates `None` as soon as any one of the
    // eight inputs is missing (per the description below:
    // CONFIG_TASK_DELAY_ACCT off, the runtime toggle off, or a
    // kernel that predates a bucket), so a partial sum is never
    // reported; `.max()` over the overlap pair picks the
    // dominant signal.
    DerivedMetricDef {
        name: "total_offcpu_delay_ns",
        ladder: ScaleLadder::Ns,
        description: "Sum of all off-CPU delay-accounting buckets, ns: cpu + blkio + freepages + compact + wpcopy + irq + max(swapin, thrashing). The swapin/thrashing pair is OR'd with .max() rather than summed because the two share syscall-layer events (every thrashing event is also a swapin). Returns `-` when any input is missing (CONFIG_TASK_DELAY_ACCT off, runtime toggle off, or kernel older than the bucket's introduction version).",
        inputs: &[
            "cpu_delay_total_ns",
            "blkio_delay_total_ns",
            "swapin_delay_total_ns",
            "freepages_delay_total_ns",
            "thrashing_delay_total_ns",
            "compact_delay_total_ns",
            "wpcopy_delay_total_ns",
            "irq_delay_total_ns",
        ],
        is_ratio: false,
        compute: |m| {
            let cpu = input_scalar(m, "cpu_delay_total_ns")?;
            let blkio = input_scalar(m, "blkio_delay_total_ns")?;
            let swapin = input_scalar(m, "swapin_delay_total_ns")?;
            let freepages = input_scalar(m, "freepages_delay_total_ns")?;
            let thrashing = input_scalar(m, "thrashing_delay_total_ns")?;
            let compact = input_scalar(m, "compact_delay_total_ns")?;
            let wpcopy = input_scalar(m, "wpcopy_delay_total_ns")?;
            let irq = input_scalar(m, "irq_delay_total_ns")?;
            // Count the overlapping swapin/thrashing pair once,
            // using whichever total is larger.
            let mem_overlap = swapin.max(thrashing);
            Some(DerivedValue::Scalar(
                cpu + blkio + freepages + compact + wpcopy + irq + mem_overlap,
            ))
        },
        section: Section::TaskstatsDelay,
    },
];

/// Return a metric's display name: the bare registry name, with
/// no gating tags attached.
///
/// This function does no formatting and no allocation — it hands
/// back the `&'static str` stored in [`CtprofMetricDef::name`]
/// unchanged.
///
/// Gating tags are rendered separately by [`metric_tags`], which
/// returns them as their own string (empty when the metric has
/// no tags). Tags emit in stable order: `sched_class` first (one
/// slot), then `[dead]` if `is_dead`, then each `config_gate` in
/// registry-declared order.
///
/// Compact rendering of gates: each `config_gate` is stripped of
/// its `CONFIG_` prefix before emission so the rendered cell
/// stays scannable in narrow tables. The data field
/// [`CtprofMetricDef::config_gates`] keeps the full `CONFIG_X`
/// spelling so an operator can grep their kconfig directly.
/// Examples (name from this function, tags from [`metric_tags`]):
/// - `wait_sum` gated on `CONFIG_SCHEDSTATS` → name `wait_sum`,
///   tags `[SCHEDSTATS]`
/// - `core_forceidle_sum` gated on `CONFIG_SCHED_CORE` and
///   `CONFIG_SCHEDSTATS` → name `core_forceidle_sum`, tags
///   `[SCHED_CORE] [SCHEDSTATS]`
///
/// `sched_class` values are short already (e.g. `cfs-only`,
/// `fair-policy`, `non-ext`) and are wrapped in brackets as-is.
///
/// NOTE(review): how callers combine the name with the tag
/// string is not visible from this function — confirm at the
/// render call sites.
///
/// Pure formatting layer — does not interpret tag values; the
/// metric's own [`CtprofMetricDef::sched_class`] /
/// [`CtprofMetricDef::config_gates`] / [`CtprofMetricDef::is_dead`]
/// docs are the source of truth for what each spelling means.
pub fn metric_display_name(metric: &CtprofMetricDef) -> &'static str {
    metric.name
}

pub fn metric_tags(metric: &CtprofMetricDef) -> String {
    let mut out = String::new();
    if let Some(class) = metric.sched_class {
        out.push('[');
        out.push_str(class);
        out.push(']');
    }
    if metric.is_dead {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str("[dead]");
    }
    for gate in metric.config_gates {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push('[');
        let short = gate.strip_prefix("CONFIG_").unwrap_or(gate);
        out.push_str(short);
        out.push(']');
    }
    out
}

/// Aggregated metric value for a single [`ThreadGroup`].
///
/// Carries both a numeric projection (used for delta math and
/// sort order) and a display form. Not every rule produces a
/// numeric — the categorical rules
/// ([`AggRule::Mode`] / [`AggRule::ModeChar`] /
/// [`AggRule::ModeBool`]) aggregate to a string, which has no
/// scalar — so the numeric is optional and rows without one
/// fall to the bottom of the default sort.
#[derive(Debug, Clone)]
pub enum Aggregated {
    /// Group-wide sum produced by the
    /// [`AggRule::SumCount`] / [`AggRule::SumNs`] /
    /// [`AggRule::SumTicks`] / [`AggRule::SumBytes`] rules. The
    /// dispatch unwraps the typed newtype's inner `u64` after
    /// the [`crate::metric_types::Summable::sum_across`]
    /// reduction; storage stays u64 to preserve full precision
    /// across the entire schedstats / byte / tick range with
    /// no lossy cast at aggregation time. Phase 4 will read
    /// the registry's `unit` tag (not the wrapper type) at
    /// render time to pick the auto-scale ladder.
    Sum(u64),
    /// Group-wide maximum produced by the
    /// [`AggRule::MaxPeak`] / [`AggRule::MaxGaugeNs`] /
    /// [`AggRule::MaxGaugeCount`] rules. Distinct variant from
    /// `Sum` so a downstream consumer that wants to surface
    /// "the worst single thread" rather than "the
    /// summed-across-threads value" can match without name-
    /// matching against the metric registry. Storage is u64 to
    /// preserve full ns precision across the entire schedstats
    /// range (no `as f64` lossy cast at aggregation time).
    Max(u64),
    /// Group-wide `[min, max]` interval produced by the
    /// [`AggRule::RangeI32`] / [`AggRule::RangeU32`] rules.
    /// Both bounds widen to `i64` at the dispatch boundary
    /// (`i64::from(OrdinalI32.0)` / `i64::from(OrdinalU32.0)`)
    /// — `OrdinalI32` carries a signed kernel-side range
    /// (`nice` includes negative values) and `OrdinalU32` fits
    /// into `i64` losslessly, so a single signed scalar
    /// represents both ordinal widths without losing the sign
    /// from `OrdinalI32` or wrapping the magnitude from
    /// `OrdinalU32`. Delta math takes the midpoint
    /// (`(min + max) / 2`) so a one-sided shift surfaces in
    /// the rendered delta column.
    OrdinalRange {
        /// Lower bound of the interval.
        min: i64,
        /// Upper bound of the interval.
        max: i64,
    },
    /// Categorical result from the mode-style rules
    /// ([`AggRule::Mode`] / [`AggRule::ModeChar`] /
    /// [`AggRule::ModeBool`]). Has no scalar projection
    /// ([`Aggregated::numeric`] returns `None`). Rendered as
    /// `value` alone when `count == total`, otherwise as
    /// `value (count/total)`.
    Mode {
        /// The category, already in string form.
        value: String,
        /// NOTE(review): presumably the number of threads that
        /// reported `value` — confirm against the aggregation code.
        count: usize,
        /// NOTE(review): presumably the number of threads the
        /// mode was taken over — confirm against the aggregation
        /// code.
        total: usize,
    },
    /// CPU-affinity summary for the group; see
    /// [`AffinitySummary`] for the field semantics and rendering.
    Affinity(AffinitySummary),
}

/// CPU-affinity aggregation result.
///
/// `uniform` is `Some(cpus)` when every thread in the group shared
/// the same allowed set; otherwise heterogeneous and the renderer
/// emits "N-M cpus (mixed)" (or "N cpus (mixed)" when
/// `min_cpus == max_cpus`).
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct AffinitySummary {
    /// Lower bound on the number of allowed CPUs in the group.
    /// Together with `max_cpus` it feeds the midpoint used as the
    /// scalar projection in [`Aggregated::numeric`].
    pub min_cpus: usize,
    /// Upper bound on the number of allowed CPUs in the group.
    pub max_cpus: usize,
    /// The shared allowed-CPU set when the group is uniform;
    /// `None` when the group is heterogeneous.
    pub uniform: Option<Vec<u32>>,
}

impl Aggregated {
    /// Scalar projection for delta math.
    ///
    /// - `Sum` / `Max`: the stored `u64` as `f64`.
    /// - `OrdinalRange`: the midpoint `(min + max) / 2`, which
    ///   keeps a shift on either bound visible in the delta
    ///   without privileging one end.
    /// - `Affinity`: the midpoint of `min_cpus` and `max_cpus`
    ///   (number of allowed CPUs). This is `Some` for
    ///   heterogeneous groups too — only the display form
    ///   distinguishes uniform from mixed.
    /// - `Mode`: `None`; a categorical value has no meaningful
    ///   scalar.
    pub fn numeric(&self) -> Option<f64> {
        let midpoint = |lo: f64, hi: f64| (lo + hi) / 2.0;
        match self {
            Aggregated::Sum(v) | Aggregated::Max(v) => Some(*v as f64),
            Aggregated::OrdinalRange { min, max } => Some(midpoint(*min as f64, *max as f64)),
            Aggregated::Mode { .. } => None,
            // When the group is uniform `min_cpus == max_cpus`, so
            // the midpoint is simply the CPU count.
            Aggregated::Affinity(s) => Some(midpoint(s.min_cpus as f64, s.max_cpus as f64)),
        }
    }
}

/// Human-readable rendering of an aggregated value: raw integers
/// for `Sum` / `Max`, `min..max` (or a single number when the
/// bounds coincide) for `OrdinalRange`, `value (count/total)` (or
/// just `value` when unanimous) for `Mode`, and a CPU-count summary
/// for `Affinity`.
impl fmt::Display for Aggregated {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Aggregated::Sum(v) | Aggregated::Max(v) => write!(f, "{v}"),
            // Degenerate interval collapses to one number.
            Aggregated::OrdinalRange { min, max } if min == max => write!(f, "{min}"),
            Aggregated::OrdinalRange { min, max } => write!(f, "{min}..{max}"),
            // Unanimous mode needs no fraction suffix.
            Aggregated::Mode {
                value,
                count,
                total,
            } if count == total => write!(f, "{value}"),
            Aggregated::Mode {
                value,
                count,
                total,
            } => write!(f, "{value} ({count}/{total})"),
            Aggregated::Affinity(summary) => match &summary.uniform {
                // Uniform group: show the count plus the concrete set.
                Some(cpus) => write!(f, "{} cpus ({})", cpus.len(), format_cpu_range(cpus)),
                // Mixed sets that happen to share a size.
                None if summary.min_cpus == summary.max_cpus => {
                    write!(f, "{} cpus (mixed)", summary.min_cpus)
                }
                None => write!(
                    f,
                    "{}-{} cpus (mixed)",
                    summary.min_cpus, summary.max_cpus
                ),
            },
        }
    }
}

/// Aggregated metrics for every thread matched by one group key.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ThreadGroup {
    /// The group key this bucket was built under.
    /// NOTE(review): presumably identical to the map key the
    /// group is stored under by `build_groups` — confirm.
    pub key: String,
    /// Thread count for the group; forwarded into the
    /// `thread_count_a` / `thread_count_b` columns of the
    /// comparison rows.
    pub thread_count: usize,
    /// Metric name → aggregated value. Entries are created for
    /// every registered metric; absent keys signal a missed
    /// aggregation step, not a skip.
    pub metrics: BTreeMap<String, Aggregated>,
    /// Only populated when grouping by cgroup — carries the cgroup
    /// v2 enrichment counters (cpu.stat, memory.current) for that
    /// path. Nested here so the renderer can surface them
    /// alongside the thread-metric rows without a second lookup.
    pub cgroup_stats: Option<CgroupStats>,
    /// Distinct member literals contained in this bucket, sorted
    /// ascending. The field carries `comm` literals under
    /// [`GroupBy::Comm`] and `pcomm` literals under
    /// [`GroupBy::Pcomm`] — both groupings feed the grex
    /// display-label path the same way (each pattern-aware bucket
    /// renders a regex over the union of its members across
    /// baseline + candidate). Empty Vec for groupings that
    /// render the join key directly: [`GroupBy::Cgroup`],
    /// [`GroupBy::CommExact`], or pattern-aware groupings under
    /// [`CompareOptions::no_thread_normalize`] where the join key
    /// IS the literal name and there is nothing to expand into a
    /// regex.
    pub members: Vec<String>,
    /// Average start_time_clock_ticks across group members.
    /// Lower = older = the group has been alive longer on average.
    pub avg_start_ticks: u64,
}

/// One row in the comparison table: `(group, metric)` pair with
/// aggregated values from both sides.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct DiffRow {
    /// Internal join key — deterministic across snapshots and
    /// stable for tests / programmatic consumers. For pattern-
    /// aggregated rows ([`GroupBy::Comm`] or [`GroupBy::Pcomm`]
    /// with bucket size ≥ 2 under default normalization), this is
    /// the token-normalized skeleton the bucket clusters on (e.g.
    /// `kworker/{N}:{N}-mm_percpu_wq` for Comm,
    /// `worker-{N}` for Pcomm); for every other grouping
    /// (CommExact, Cgroup, or pattern-aware grouping under
    /// [`CompareOptions::no_thread_normalize`]) it equals the
    /// rendered display key.
    pub group_key: String,
    /// Thread count of the baseline-side group.
    pub thread_count_a: usize,
    /// Thread count of the candidate-side group.
    pub thread_count_b: usize,
    /// Relative uptime % for this group (candidate side).
    /// 100% = as long-lived as the oldest group, 0% = just spawned.
    pub uptime_pct: Option<f64>,
    /// Sort-by metric cell: "baseline → candidate (delta%)" for
    /// the metric specified by --sort-by. Same value for every
    /// row in a group. None when no --sort-by is set.
    pub sort_by_cell: Option<String>,
    /// Sort metric's delta for this group (for coloring the SortBy column).
    pub sort_by_delta: Option<f64>,
    /// Registry name of the metric this row reports. A
    /// `&'static str` borrowed from the metric registry, so rows
    /// carry it without allocating.
    pub metric_name: &'static str,
    /// Auto-scale ladder for the row's value/delta cells. Sourced
    /// from `metric.rule.ladder()` at build time so the format
    /// dispatch stays a closed match (no string-keyed
    /// pass-through branch).
    pub metric_ladder: ScaleLadder,
    /// Aggregated value from the baseline snapshot.
    pub baseline: Aggregated,
    /// Aggregated value from the candidate snapshot.
    pub candidate: Aggregated,
    /// Signed candidate − baseline for numeric-capable rules.
    pub delta: Option<f64>,
    /// `delta / baseline` as a fraction. `None` when baseline is
    /// zero or the row has no numeric projection.
    pub delta_pct: Option<f64>,
    /// Operator-facing rendering of the group key. Equals
    /// `group_key` for non-pattern groupings; for [`GroupBy::Comm`]
    /// or [`GroupBy::Pcomm`] pattern buckets containing ≥ 2
    /// distinct member literals, this carries a grex-generated
    /// regex over the union of baseline+candidate members so the
    /// operator sees exactly which names landed in the bucket.
    pub display_key: String,
}

impl DiffRow {
    /// Ranking value for the "biggest absolute delta %" ordering.
    ///
    /// Three tiers, checked in order:
    /// 1. `delta_pct` present → `|delta_pct|`.
    /// 2. only `delta` present (percent undefined, e.g. a zero
    ///    baseline) → `|delta| * 1e9`, inflated so a visible
    ///    change from zero outranks percent-based rows.
    /// 3. neither present (no numeric projection) →
    ///    `f64::NEG_INFINITY`, so the row sinks to the bottom.
    fn sort_key(&self) -> f64 {
        match (self.delta_pct, self.delta) {
            (Some(pct), _) => pct.abs(),
            (None, Some(delta)) => delta.abs() * 1e9,
            (None, None) => f64::NEG_INFINITY,
        }
    }
}

/// Apply a multi-key sort to `rows` per `sort_keys`. Computes a
/// per-group sort tuple by looking up the requested metrics'
/// deltas from the existing rows, ranks groups lexicographically
/// (with per-key direction), then orders rows by
/// `(group_rank, metric_registry_idx)` so rows for a given
/// group cluster together in registry order. Mirrors the default
/// sort's stability guarantee (within a group, registry order is
/// preserved; across groups, deterministic by tuple).
/// `derived_rows` is ordered by the same group ranks, with
/// [`CTPROF_DERIVED_METRICS`] order inside each group.
///
/// Missing values (a group has no row for the named metric, or
/// the row's `delta` is `None` because the metric is categorical
/// — even though [`parse_sort_by`] now rejects categorical
/// metrics at the CLI boundary, a programmatic caller can still
/// construct a [`SortKey`] over a `Mode*` metric directly) are
/// treated as `f64::NEG_INFINITY` for descending sort and
/// `f64::INFINITY` for ascending sort — they sink to the bottom
/// either way.
///
/// NaN deltas are ordered deterministically via
/// [`f64::total_cmp`] rather than being treated as equal to
/// everything: "NaN equals any value" is not a total order, and
/// the slice sort is allowed to panic or produce an unspecified
/// order when handed one. Non-NaN values compare exactly as
/// `partial_cmp` orders them.
///
/// Caller must supply at least one sort key — an empty slice is a
/// programming error (the empty-spec case is handled at the
/// caller via the `sort_by.is_empty()` branch in
/// [`compare`] / `write_show`).
fn sort_diff_rows_by_keys(
    rows: &mut [DiffRow],
    derived_rows: &mut [DerivedRow],
    sort_keys: &[SortKey],
) {
    debug_assert!(
        !sort_keys.is_empty(),
        "sort_diff_rows_by_keys called with empty sort_keys; \
         caller must short-circuit before invoking the multi-key \
         sort path",
    );
    use std::cmp::Ordering;
    use std::collections::BTreeMap;

    /// Total order over deltas: `partial_cmp` wherever it is
    /// defined, `total_cmp` only when a NaN is involved.
    fn cmp_delta(a: f64, b: f64) -> Ordering {
        a.partial_cmp(&b).unwrap_or_else(|| a.total_cmp(&b))
    }

    // metric name → registry index (for stable within-group
    // ordering after sort). Both sides are `&'static str` so this
    // map is allocation-free at the key layer.
    let metric_idx: BTreeMap<&'static str, usize> = CTPROF_METRICS
        .iter()
        .enumerate()
        .map(|(i, m)| (m.name, i))
        .collect();
    let derived_idx: BTreeMap<&'static str, usize> = CTPROF_DERIVED_METRICS
        .iter()
        .enumerate()
        .map(|(i, m)| (m.name, i))
        .collect();

    // group_key → (metric_name → delta). Group keys are borrowed
    // from the rows for the duration of the ranking phase, so no
    // per-row `String` clone is needed. Every group gets an entry
    // (possibly empty) even when none of its rows has a numeric
    // delta, so the map's key set doubles as the set of unique
    // groups. Derived deltas populate the same map after the
    // primary ones; sort_by treats primary and derived names
    // uniformly for ranking.
    let mut group_metrics: BTreeMap<&str, BTreeMap<&'static str, f64>> = BTreeMap::new();
    for row in rows.iter() {
        let per_metric = group_metrics.entry(row.group_key.as_str()).or_default();
        if let Some(d) = row.delta {
            per_metric.insert(row.metric_name, d);
        }
    }
    for row in derived_rows.iter() {
        let per_metric = group_metrics.entry(row.group_key.as_str()).or_default();
        if let Some(d) = row.delta {
            per_metric.insert(row.metric_name, d);
        }
    }

    // Precompute (group_key, sort_tuple) pairs once so the
    // comparator below does only f64 comparisons, no map lookups.
    let mut ranked: Vec<(&str, Vec<f64>)> = group_metrics
        .into_iter()
        .map(|(group, per_metric)| {
            let tuple: Vec<f64> = sort_keys
                .iter()
                .map(|k| {
                    // Missing value: sentinel that sinks to the
                    // bottom for this key's direction.
                    per_metric
                        .get(k.metric)
                        .copied()
                        .unwrap_or(if k.descending {
                            f64::NEG_INFINITY
                        } else {
                            f64::INFINITY
                        })
                })
                .collect();
            (group, tuple)
        })
        .collect();
    ranked.sort_by(|(ga, ta), (gb, tb)| {
        for (key, (va, vb)) in sort_keys.iter().zip(ta.iter().zip(tb.iter())) {
            let ord = if key.descending {
                cmp_delta(*vb, *va)
            } else {
                cmp_delta(*va, *vb)
            };
            if ord != Ordering::Equal {
                return ord;
            }
        }
        // Final tie-break: ascending group_key for determinism.
        ga.cmp(gb)
    });

    // Own the keys from here on: the borrows of `rows` /
    // `derived_rows` must end before the slices are reordered.
    let group_ranks: BTreeMap<String, usize> = ranked
        .into_iter()
        .enumerate()
        .map(|(rank, (group, _))| (group.to_owned(), rank))
        .collect();
    let rank_of = |group: &str| group_ranks.get(group).copied().unwrap_or(usize::MAX);

    // Stable sorts keyed on (group rank, registry index). The key
    // is computed once per row instead of on every comparison.
    rows.sort_by_cached_key(|row| {
        (
            rank_of(row.group_key.as_str()),
            metric_idx
                .get(row.metric_name)
                .copied()
                .unwrap_or(usize::MAX),
        )
    });
    derived_rows.sort_by_cached_key(|row| {
        (
            rank_of(row.group_key.as_str()),
            derived_idx
                .get(row.metric_name)
                .copied()
                .unwrap_or(usize::MAX),
        )
    });
}

/// A pair of cgroup groups fudged together by thread population overlap.
///
/// NOTE(review): the code that fills these fields is outside the
/// reviewed span; the per-field notes below are inferred from the
/// field names and the Jaccard-matching comments in [`compare`],
/// and should be confirmed against the populating code.
#[derive(Debug, Clone, Default)]
pub struct FudgedPair {
    /// Key of the baseline-side group in the pair.
    pub baseline_key: String,
    /// Key of the candidate-side group in the pair.
    pub candidate_key: String,
    /// Presumably the number of thread types shared by both
    /// sides — TODO confirm.
    pub overlap: usize,
    /// Presumably the Jaccard similarity of the two sides'
    /// thread-type sets — TODO confirm.
    pub jaccard: f64,
    /// Presumably entries found only on the baseline side —
    /// TODO confirm.
    pub residual_baseline: Vec<String>,
    /// Presumably entries found only on the candidate side —
    /// TODO confirm.
    pub residual_candidate: Vec<String>,
    /// Meaning not visible from here — TODO document once the
    /// populating code is confirmed.
    pub cascaded_children: usize,
    /// Meaning not visible from here — TODO document once the
    /// populating code is confirmed.
    pub baseline_root: String,
    /// Meaning not visible from here — TODO document once the
    /// populating code is confirmed.
    pub candidate_root: String,
}

/// Full comparison result.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct CtprofDiff {
    /// NOTE(review): presumably the name of the metric the rows
    /// were sorted by, `None` when no sort metric applies —
    /// populated outside the reviewed span; confirm.
    pub sort_metric_name: Option<&'static str>,
    /// One row per `(matched group, metric)` pair. A metric
    /// missing from either side's aggregated map produces no row
    /// for that group.
    pub rows: Vec<DiffRow>,
    /// Group keys that appeared in the baseline snapshot but not
    /// in the candidate.
    pub only_baseline: Vec<String>,
    /// Group keys that appeared in the candidate snapshot but not
    /// in the baseline.
    pub only_candidate: Vec<String>,
    /// Cgroup groups fudged together by thread population overlap.
    pub fudged_pairs: Vec<FudgedPair>,
    /// Baseline-only cgroup-level enrichment rows, keyed by the
    /// cgroup path (after flatten). Populated only for
    /// [`GroupBy::Cgroup`].
    pub cgroup_stats_a: BTreeMap<String, CgroupStats>,
    /// Candidate-only cgroup-level enrichment rows, same shape.
    pub cgroup_stats_b: BTreeMap<String, CgroupStats>,
    /// Baseline host-level Pressure Stall Information snapshot.
    /// Always populated (independent of `GroupBy`) — host-level
    /// PSI surfaces above the per-thread table for any compare,
    /// not just cgroup-grouped ones.
    pub host_psi_a: Psi,
    /// Candidate host-level PSI snapshot.
    pub host_psi_b: Psi,
    /// Baseline per-process smaps_rollup maps. Default
    /// normalization keys by the token-normalized pcomm
    /// (`pattern_key(&t.pcomm)`) — ephemeral PIDs across snapshots
    /// collapse into one bucket per pcomm pattern (e.g.
    /// `worker-{N}`), and the tgid is intentionally NOT part of
    /// the key (every PID for a given pcomm pattern shares a
    /// bucket; per-field byte counts SUM at
    /// [`collect_smaps_rollup`] when multiple PIDs collapse).
    /// Keys match the primary-table Pcomm group keys WHEN ≥2
    /// processes share the same pattern (`firefox`,
    /// `kworker/{N}:{N}`, `worker-{N}`, …). Singleton digit
    /// pcomms diverge intentionally: the primary table reverts
    /// the bucket key to the literal pcomm (e.g. `worker-7`)
    /// when only one process matches the skeleton — see
    /// [`build_groups`]'s singleton-revert gate — while smaps
    /// stays normalized (`worker-{N}`) regardless of bucket
    /// size, so cross-snapshot rows still join when PIDs are
    /// ephemeral. The asymmetry is documented on
    /// [`collect_smaps_rollup`] and is load-bearing for
    /// memory-leak diffing across reboots; correlation between
    /// the smaps row and the primary table happens via the
    /// shared pcomm pattern, not always via byte-identical keys.
    ///
    /// With [`CompareOptions::no_thread_normalize`] set, keys
    /// preserve the literal `pcomm[tgid]` shape so each PID stays
    /// attributable to its specific process instance — the
    /// `[tgid]` is preserved precisely so two distinct PIDs
    /// sharing a pcomm don't collide within a snapshot. Rows
    /// only join across snapshots when the same process instance
    /// ran on both sides, which is the price of literal mode.
    ///
    /// Populated from the per-thread leader rows of the
    /// snapshot (tid == tgid; see [`ThreadState::smaps_rollup_kb`]).
    pub smaps_rollup_a: BTreeMap<String, BTreeMap<String, u64>>,
    /// Candidate per-process smaps_rollup maps, same shape and
    /// normalization rules as [`Self::smaps_rollup_a`].
    pub smaps_rollup_b: BTreeMap<String, BTreeMap<String, u64>>,
    /// Baseline global sched_ext sysfs snapshot. `None` when
    /// the baseline kernel had no `/sys/kernel/sched_ext/`
    /// directory (CONFIG_SCHED_CLASS_EXT=n build).
    pub sched_ext_a: Option<crate::ctprof::SchedExtSysfs>,
    /// Candidate global sched_ext sysfs snapshot, same shape.
    pub sched_ext_b: Option<crate::ctprof::SchedExtSysfs>,
    /// One row per `(matched group, derived metric)` pair. Each
    /// derivation in [`CTPROF_DERIVED_METRICS`] consumes
    /// already-aggregated input metrics from the group's
    /// metrics map (see [`ThreadGroup::metrics`]) and produces a
    /// scalar `f64` with its own unit. `None`-valued sides
    /// signal "not computable" — either the input metric was
    /// missing on that side (capture-time CONFIG gate not set,
    /// jemalloc not linked) or the formula's denominator was
    /// zero. Surfaced by [`write_diff`] in the dedicated
    /// `## Derived metrics` section after the main table.
    pub derived_rows: Vec<DerivedRow>,
}

/// One row in the derived-metrics table: `(matched group,
/// derivation)` with the computed scalar from both sides.
///
/// Mirrors [`DiffRow`] in shape so the renderer can reuse the
/// same `(group | threads | metric | baseline | candidate |
/// delta | %)` column layout. The `%` column is suppressed for
/// rows whose derivation is a ratio
/// ([`DerivedMetricDef::is_ratio`] true) — absolute delta on a
/// `[0, 1]` ratio is already in percentage points so a delta_pct
/// readout would be confusing.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct DerivedRow {
    /// Internal join key of the matched group; same role as
    /// [`DiffRow::group_key`].
    pub group_key: String,
    /// Operator-facing rendering of the group key; same role as
    /// [`DiffRow::display_key`].
    pub display_key: String,
    /// Thread count of the baseline-side group.
    pub thread_count_a: usize,
    /// Thread count of the candidate-side group.
    pub thread_count_b: usize,
    /// Name of the derivation, copied from the
    /// [`DerivedMetricDef`] that produced this row.
    pub metric_name: &'static str,
    /// Auto-scale ladder for the row's value/delta cells. Mirrors
    /// [`DiffRow::metric_ladder`]; sourced from
    /// [`DerivedMetricDef::ladder`] at build time.
    pub metric_ladder: ScaleLadder,
    /// True when the derivation produces a ratio. Renderer
    /// suppresses the `%` column for ratio rows.
    pub is_ratio: bool,
    /// `None` when the input metric was missing on this side or
    /// the formula divides by zero.
    pub baseline: Option<DerivedValue>,
    /// `None` with the same semantics as [`Self::baseline`].
    pub candidate: Option<DerivedValue>,
    /// Signed candidate − baseline; `None` when either side is
    /// `None`.
    pub delta: Option<f64>,
    /// `delta / baseline`; `None` when baseline is zero, either
    /// side is `None`, OR the row is a ratio (suppressed for
    /// ratios so a `0.5 → 0.6` row doesn't render as
    /// `+20%` when the natural read is `+10pp`).
    pub delta_pct: Option<f64>,
}

impl DerivedRow {
    /// Ranking value for the default `|delta_pct|`-descending
    /// order; same three tiers as [`DiffRow::sort_key`].
    ///
    /// A row with a `delta_pct` ranks by its magnitude. A row
    /// with only a `delta` — ratio rows always land here, since
    /// their `delta_pct` is suppressed by design — ranks by
    /// `|delta| * 1e9`. A row with neither gets
    /// `f64::NEG_INFINITY` and sinks to the bottom.
    fn sort_key(&self) -> f64 {
        self.delta_pct
            .map(f64::abs)
            .or_else(|| self.delta.map(|delta| delta.abs() * 1e9))
            .unwrap_or(f64::NEG_INFINITY)
    }
}

/// Build the [`DerivedRow`] for one derivation on one matched
/// group.
///
/// `def.compute` is evaluated against each side's aggregated
/// metrics map. `delta` is `candidate − baseline` and exists only
/// when both sides produced a value. `delta_pct` is
/// `delta / baseline` and is additionally withheld when the
/// derivation is a ratio or when the baseline magnitude is not
/// above `f64::EPSILON`.
///
/// `key` / `display_key` and the two thread counts (`n_a`
/// baseline, `n_b` candidate) are copied onto the row unchanged.
fn build_derived_row(
    key: &str,
    display_key: &str,
    n_a: usize,
    n_b: usize,
    def: &DerivedMetricDef,
    metrics_a: &BTreeMap<String, Aggregated>,
    metrics_b: &BTreeMap<String, Aggregated>,
) -> DerivedRow {
    let baseline = (def.compute)(metrics_a);
    let candidate = (def.compute)(metrics_b);
    // Scalar view of both sides; `None` unless both computed.
    let sides = baseline
        .zip(candidate)
        .map(|(a, b)| (a.as_f64(), b.as_f64()));
    let delta = sides.map(|(va, vb)| vb - va);
    // Percent is meaningless on a `[0, 1]` ratio (`+20%` misleads
    // where `+10pp` is meant) and undefined on a zero baseline.
    let delta_pct = sides.and_then(|(va, vb)| {
        let pct_defined = !def.is_ratio && va.abs() > f64::EPSILON;
        pct_defined.then(|| (vb - va) / va)
    });
    DerivedRow {
        group_key: key.to_string(),
        display_key: display_key.to_string(),
        thread_count_a: n_a,
        thread_count_b: n_b,
        metric_name: def.name,
        metric_ladder: def.ladder,
        is_ratio: def.is_ratio,
        baseline,
        candidate,
        delta,
        delta_pct,
    }
}

/// Compare two snapshots and produce a [`CtprofDiff`].
pub fn compare(
    baseline: &CtprofSnapshot,
    candidate: &CtprofSnapshot,
    opts: &CompareOptions,
) -> CtprofDiff {
    let flatten = compile_flatten_patterns(&opts.cgroup_flatten);
    let group_by = opts.group_by.0;
    // For `GroupBy::Comm` and `GroupBy::Pcomm`, the frequency gate
    // that promotes a pattern_key from per-thread literal to a
    // clustered bucket must be evaluated against the UNION of both
    // snapshots' threads — otherwise a pattern that has 1 thread
    // in baseline + 3 threads in candidate would join under a
    // `worker-{N}` key on the candidate side but a literal
    // `worker-7` key on the baseline side, and the row would
    // surface as only-in-candidate. Computing the count from
    // the union ensures the same key is used on both sides.
    //
    // The Pcomm path is structurally identical: process names that
    // share a normalized skeleton across snapshots (e.g. ephemeral
    // worker pools whose pcomm differs only by a digit suffix)
    // collapse into one bucket, keyed by the skeleton. The accessor
    // selects which `ThreadState` field feeds the count — `t.comm`
    // for Comm, `t.pcomm` for Pcomm — so one helper covers both
    // axes.
    //
    // Skipped when `no_thread_normalize` is set — under literal
    // grouping, the key IS the comm/pcomm and there is no
    // promotion gate to evaluate.
    let pattern_counts: Option<BTreeMap<String, usize>> = match (group_by, opts.no_thread_normalize)
    {
        (GroupBy::Comm, false) => Some(pattern_counts_union(baseline, candidate, |t| {
            t.comm.as_str()
        })),
        (GroupBy::Pcomm | GroupBy::All, false) => {
            Some(pattern_counts_union(baseline, candidate, |t| {
                t.pcomm.as_str()
            }))
        }
        _ => None,
    };
    let cgroup_key_map: Option<BTreeMap<String, String>> =
        if matches!(group_by, GroupBy::Cgroup | GroupBy::All) && !opts.no_cg_normalize {
            Some(build_cgroup_key_map(baseline, candidate, &flatten))
        } else {
            None
        };
    let groups_a = build_groups(
        baseline,
        group_by,
        &flatten,
        pattern_counts.as_ref(),
        cgroup_key_map.as_ref(),
        opts.no_thread_normalize,
    );
    let groups_b = build_groups(
        candidate,
        group_by,
        &flatten,
        pattern_counts.as_ref(),
        cgroup_key_map.as_ref(),
        opts.no_thread_normalize,
    );

    let mut diff = CtprofDiff::default();

    // Compute per-snapshot "now" for lifetime calculation:
    // newest thread's start_time approximates capture time in ticks.
    let now_b = candidate
        .threads
        .iter()
        .map(|t| t.start_time_clock_ticks)
        .max()
        .unwrap_or(0);

    for (key, group_a) in &groups_a {
        let Some(group_b) = groups_b.get(key) else {
            diff.only_baseline.push(key.clone());
            continue;
        };
        // Render label: pattern grouping (Comm or Pcomm under
        // auto-normalize) unions baseline+candidate members and
        // runs grex over the result; every other grouping just
        // echoes the join key. Computed once per matched group,
        // reused across every metric row built off it.
        let pattern_axis_active =
            matches!(group_by, GroupBy::Comm | GroupBy::Pcomm) && !opts.no_thread_normalize;
        let display_key = if pattern_axis_active {
            let mut union: Vec<String> = group_a.members.clone();
            union.extend(group_b.members.iter().cloned());
            union.sort();
            union.dedup();
            pattern_display_label(key, &union)
        } else {
            key.clone()
        };
        // uptime_pct filled in second pass after all groups processed

        for metric in CTPROF_METRICS {
            let Some(a) = group_a.metrics.get(metric.name).cloned() else {
                continue;
            };
            let Some(b) = group_b.metrics.get(metric.name).cloned() else {
                continue;
            };
            diff.rows.push(build_row(
                key,
                &display_key,
                group_a.thread_count,
                group_b.thread_count,
                metric,
                a,
                b,
                None, // uptime_pct filled in second pass
            ));
        }
        // Derived metrics: one row per derivation per matched
        // group. Each row carries `None`-valued sides when the
        // formula's inputs are missing or the denominator is
        // zero — operator sees `-` rather than a synthesized
        // zero or NaN.
        for def in CTPROF_DERIVED_METRICS {
            diff.derived_rows.push(build_derived_row(
                key,
                &display_key,
                group_a.thread_count,
                group_b.thread_count,
                def,
                &group_a.metrics,
                &group_b.metrics,
            ));
        }
    }
    for key in groups_b.keys() {
        if !groups_a.contains_key(key) {
            diff.only_candidate.push(key.clone());
        }
    }
    // Content-based cgroup fudging: match one-sided groups by
    // thread population overlap when cgroup paths differ but
    // the workload is the same (e.g. service re-deployed to a
    // new cgroup path between snapshots).
    let mut fudged_key_pairs: Vec<(String, String)> = Vec::new();
    if group_by == GroupBy::All && !diff.only_baseline.is_empty() && !diff.only_candidate.is_empty()
    {
        // Extract cgroup prefix from compound keys.
        fn cg_prefix(key: &str) -> &str {
            key.split_once('\x00').map_or(key, |(cg, _)| cg)
        }

        // Collect thread types per CGROUP PREFIX (not per compound key).
        type TypeSet = std::collections::BTreeSet<(String, String)>;
        let mut cg_types_a: BTreeMap<String, TypeSet> = BTreeMap::new();
        let mut cg_types_b: BTreeMap<String, TypeSet> = BTreeMap::new();

        // Collect cgroup prefixes that appear in BOTH groups (already matched).
        // These must be excluded from fudging.
        let matched_prefixes: std::collections::BTreeSet<String> = groups_a
            .keys()
            .filter(|k| groups_b.contains_key(*k))
            .map(|k| cg_prefix(k).to_string())
            .collect();

        // Collect unique cgroup prefixes from one-sided keys,
        // skipping any prefix that already has matched keys.
        let mut cg_prefixes_a: std::collections::BTreeSet<String> =
            std::collections::BTreeSet::new();
        let mut cg_prefixes_b: std::collections::BTreeSet<String> =
            std::collections::BTreeSet::new();
        for key in &diff.only_baseline {
            let pfx = cg_prefix(key).to_string();
            if !matched_prefixes.contains(&pfx) {
                cg_prefixes_a.insert(pfx);
            }
        }
        for key in &diff.only_candidate {
            let pfx = cg_prefix(key).to_string();
            if !matched_prefixes.contains(&pfx) {
                cg_prefixes_b.insert(pfx);
            }
        }

        // Populate thread types per cgroup prefix from snapshots.
        for t in &baseline.threads {
            let cg = flatten_cgroup_path(&t.cgroup, &flatten);
            let cg_key = match cgroup_key_map.as_ref().and_then(|m| m.get(&cg)) {
                Some(k) => k.clone(),
                None => cg,
            };
            if cg_prefixes_a.contains(&cg_key) {
                cg_types_a
                    .entry(cg_key)
                    .or_default()
                    .insert((pattern_key(&t.pcomm), pattern_key(&t.comm)));
            }
        }
        for t in &candidate.threads {
            let cg = flatten_cgroup_path(&t.cgroup, &flatten);
            let cg_key = match cgroup_key_map.as_ref().and_then(|m| m.get(&cg)) {
                Some(k) => k.clone(),
                None => cg,
            };
            if cg_prefixes_b.contains(&cg_key) {
                cg_types_b
                    .entry(cg_key)
                    .or_default()
                    .insert((pattern_key(&t.pcomm), pattern_key(&t.comm)));
            }
        }

        // Match cgroup prefixes by Jaccard similarity. Each
        // candidate finds its best baseline match independently —
        // multiple candidates can match the same baseline (N
        // stacked tenants vs 1 unstacked baseline).
        let mut fudged_cg: Vec<(String, String)> = Vec::new(); // (baseline_cg, candidate_cg)

        for ccg in &cg_prefixes_b {
            let Some(set_b) = cg_types_b.get(ccg) else {
                continue;
            };
            if set_b.len() < 10 {
                continue;
            }
            let mut best: Option<(&str, f64, usize)> = None;
            for bcg in &cg_prefixes_a {
                let Some(set_a) = cg_types_a.get(bcg) else {
                    continue;
                };
                let intersection = set_a.intersection(set_b).count();
                if intersection < 10 {
                    continue;
                }
                let union = set_a.union(set_b).count();
                let jaccard = intersection as f64 / union as f64;
                if jaccard >= 0.90 && best.is_none_or(|(_, bj, _)| jaccard > bj) {
                    best = Some((bcg.as_str(), jaccard, intersection));
                }
            }
            if let Some((bcg, _jaccard, _overlap)) = best {
                fudged_cg.push((bcg.to_string(), ccg.clone()));
            }
        }

        // For each fudged cgroup pair, remap ALL compound keys sharing
        // that prefix. Match baseline keys to candidate keys by their
        // pcomm\x00comm suffix.
        let mut remove_baseline: std::collections::BTreeSet<String> =
            std::collections::BTreeSet::new();
        let mut remove_candidate: std::collections::BTreeSet<String> =
            std::collections::BTreeSet::new();

        // Collect all (baseline_key, candidate_key) pairs across
        // all fudge pairs. Multiple candidate keys can map to the
        // same baseline key (N:1).
        let mut fudge_matches: BTreeMap<String, Vec<String>> = BTreeMap::new(); // bkey → [ckeys]
        for (bcg, ccg) in &fudged_cg {
            let b_keys: Vec<&String> = diff
                .only_baseline
                .iter()
                .filter(|k| cg_prefix(k) == bcg.as_str())
                .collect();
            let c_keys: Vec<&String> = diff
                .only_candidate
                .iter()
                .filter(|k| cg_prefix(k) == ccg.as_str())
                .collect();
            let c_suffix_map: BTreeMap<&str, &String> = c_keys
                .iter()
                .map(|k| {
                    let suffix = k.split_once('\x00').map_or("", |(_, s)| s);
                    (suffix, *k)
                })
                .collect();
            for bkey in &b_keys {
                let b_suffix = bkey.split_once('\x00').map_or("", |(_, s)| s);
                if let Some(ckey) = c_suffix_map.get(b_suffix) {
                    remove_baseline.insert((*bkey).clone());
                    remove_candidate.insert((*ckey).clone());
                    fudged_key_pairs.push(((*bkey).clone(), (*ckey).clone()));
                    fudge_matches
                        .entry((*bkey).clone())
                        .or_default()
                        .push((*ckey).clone());
                }
            }
        }
        // Emit one row per baseline key with candidate values
        // aggregated across all N matched candidate groups.
        for (bkey, ckeys) in &fudge_matches {
            let Some(ga) = groups_a.get(bkey) else {
                continue;
            };
            // Merge candidate groups: sum Sum, max Max, union Range.
            let mut merged_metrics: BTreeMap<String, Aggregated> = BTreeMap::new();
            let mut merged_thread_count: usize = 0;
            for ckey in ckeys {
                let Some(gb) = groups_b.get(ckey) else {
                    continue;
                };
                merged_thread_count += gb.thread_count;
                for (name, val) in &gb.metrics {
                    let entry = merged_metrics.entry(name.clone());
                    match entry {
                        std::collections::btree_map::Entry::Vacant(e) => {
                            e.insert(val.clone());
                        }
                        std::collections::btree_map::Entry::Occupied(mut e) => {
                            let existing = e.get_mut();
                            match (existing, val) {
                                (Aggregated::Sum(s), Aggregated::Sum(v)) => {
                                    *s += v;
                                }
                                (Aggregated::Max(m), Aggregated::Max(v)) => {
                                    *m += v;
                                }
                                (
                                    Aggregated::OrdinalRange { min, max },
                                    Aggregated::OrdinalRange {
                                        min: vmin,
                                        max: vmax,
                                    },
                                ) => {
                                    *min = (*min).min(*vmin);
                                    *max = (*max).max(*vmax);
                                }
                                (
                                    Aggregated::Mode {
                                        value,
                                        count,
                                        total,
                                    },
                                    Aggregated::Mode {
                                        value: vv,
                                        count: vc,
                                        total: vt,
                                    },
                                ) => {
                                    *total += vt;
                                    if *vc > *count {
                                        *value = vv.clone();
                                        *count = *vc;
                                    }
                                }
                                _ => {}
                            }
                        }
                    }
                }
            }
            let display_key = "[fudged]".to_string();
            for metric in CTPROF_METRICS {
                let Some(a) = ga.metrics.get(metric.name).cloned() else {
                    continue;
                };
                let Some(b) = merged_metrics.get(metric.name).cloned() else {
                    continue;
                };
                diff.rows.push(build_row(
                    bkey,
                    &display_key,
                    ga.thread_count,
                    merged_thread_count,
                    metric,
                    a,
                    b,
                    None,
                ));
            }
            for def in CTPROF_DERIVED_METRICS {
                diff.derived_rows.push(build_derived_row(
                    bkey,
                    &display_key,
                    ga.thread_count,
                    merged_thread_count,
                    def,
                    &ga.metrics,
                    &merged_metrics,
                ));
            }
        }

        // Cascade: for each fudged cgroup pair, compute cascade
        // roots by stripping the longest common suffix (by /
        // segments) from the pair. Use those shorter roots for
        // starts_with matching, not the full fudged paths.
        let mut cascade_counts: BTreeMap<String, usize> = BTreeMap::new();
        let mut cascade_roots: BTreeMap<(String, String), (String, String)> = BTreeMap::new();
        let mut cascade_matches: BTreeMap<String, Vec<String>> = BTreeMap::new();
        for (bcg, ccg) in &fudged_cg {
            let b_segs: Vec<&str> = bcg.split('/').collect();
            let c_segs: Vec<&str> = ccg.split('/').collect();
            let common_suffix_len = b_segs
                .iter()
                .rev()
                .zip(c_segs.iter().rev())
                .take_while(|(a, b)| a == b)
                .count();
            let b_root: String = b_segs[..b_segs.len().saturating_sub(common_suffix_len)].join("/");
            let c_root: String = c_segs[..c_segs.len().saturating_sub(common_suffix_len)].join("/");
            let b_root = if b_root.is_empty() {
                bcg.clone()
            } else {
                b_root
            };
            let c_root = if c_root.is_empty() {
                ccg.clone()
            } else {
                c_root
            };

            cascade_roots.insert((bcg.clone(), ccg.clone()), (b_root.clone(), c_root.clone()));

            let remaining_b: Vec<String> = diff
                .only_baseline
                .iter()
                .filter(|k| {
                    !remove_baseline.contains(*k) && cg_prefix(k).starts_with(b_root.as_str())
                })
                .cloned()
                .collect();
            let remaining_c: Vec<String> = diff
                .only_candidate
                .iter()
                .filter(|k| {
                    !remove_candidate.contains(*k) && cg_prefix(k).starts_with(c_root.as_str())
                })
                .cloned()
                .collect();
            let c_by_suffix: BTreeMap<String, &String> = remaining_c
                .iter()
                .map(|k| {
                    let child_cg = cg_prefix(k);
                    let tail = &child_cg[c_root.len()..];
                    if !tail.is_empty() && !tail.starts_with('/') {
                        return (String::new(), k);
                    }
                    let rewritten = format!("{b_root}{tail}");
                    let suffix = k.split_once('\x00').map_or("", |(_, s)| s);
                    (format!("{rewritten}\x00{suffix}"), k)
                })
                .collect();
            for bkey in &remaining_b {
                if let Some(ckey) = c_by_suffix.get(bkey) {
                    remove_baseline.insert(bkey.clone());
                    remove_candidate.insert((*ckey).clone());
                    fudged_key_pairs.push((bkey.clone(), (*ckey).clone()));
                    *cascade_counts.entry(bcg.clone()).or_insert(0) += 1;
                    cascade_matches
                        .entry(bkey.clone())
                        .or_default()
                        .push((*ckey).clone());
                }
            }
        }

        // Emit aggregated rows for cascaded children (same N:1 merge).
        for (bkey, ckeys) in &cascade_matches {
            let Some(ga) = groups_a.get(bkey) else {
                continue;
            };
            let mut merged_metrics: BTreeMap<String, Aggregated> = BTreeMap::new();
            let mut merged_thread_count: usize = 0;
            for ckey in ckeys {
                let Some(gb) = groups_b.get(ckey) else {
                    continue;
                };
                merged_thread_count += gb.thread_count;
                for (name, val) in &gb.metrics {
                    let entry = merged_metrics.entry(name.clone());
                    match entry {
                        std::collections::btree_map::Entry::Vacant(e) => {
                            e.insert(val.clone());
                        }
                        std::collections::btree_map::Entry::Occupied(mut e) => {
                            let existing = e.get_mut();
                            match (existing, val) {
                                (Aggregated::Sum(s), Aggregated::Sum(v)) => {
                                    *s += v;
                                }
                                (Aggregated::Max(m), Aggregated::Max(v)) => {
                                    *m += v;
                                }
                                (
                                    Aggregated::OrdinalRange { min, max },
                                    Aggregated::OrdinalRange {
                                        min: vmin,
                                        max: vmax,
                                    },
                                ) => {
                                    *min = (*min).min(*vmin);
                                    *max = (*max).max(*vmax);
                                }
                                (
                                    Aggregated::Mode {
                                        value,
                                        count,
                                        total,
                                    },
                                    Aggregated::Mode {
                                        value: vv,
                                        count: vc,
                                        total: vt,
                                    },
                                ) => {
                                    *total += vt;
                                    if *vc > *count {
                                        *value = vv.clone();
                                        *count = *vc;
                                    }
                                }
                                _ => {}
                            }
                        }
                    }
                }
            }
            let display_key = "[fudged]".to_string();
            for metric in CTPROF_METRICS {
                let Some(a) = ga.metrics.get(metric.name).cloned() else {
                    continue;
                };
                let Some(b) = merged_metrics.get(metric.name).cloned() else {
                    continue;
                };
                diff.rows.push(build_row(
                    bkey,
                    &display_key,
                    ga.thread_count,
                    merged_thread_count,
                    metric,
                    a,
                    b,
                    None,
                ));
            }
            for def in CTPROF_DERIVED_METRICS {
                diff.derived_rows.push(build_derived_row(
                    bkey,
                    &display_key,
                    ga.thread_count,
                    merged_thread_count,
                    def,
                    &ga.metrics,
                    &merged_metrics,
                ));
            }
        }

        diff.only_baseline.retain(|k| !remove_baseline.contains(k));
        diff.only_candidate
            .retain(|k| !remove_candidate.contains(k));

        // Store fudge report per cgroup pair.
        diff.fudged_pairs = fudged_cg
            .iter()
            .map(|(bcg, ccg)| {
                let set_a = cg_types_a.get(bcg).cloned().unwrap_or_default();
                let set_b = cg_types_b.get(ccg).cloned().unwrap_or_default();
                let residual_a: Vec<String> = set_a
                    .difference(&set_b)
                    .map(|(p, c)| format!("{p}:{c}"))
                    .collect();
                let residual_b: Vec<String> = set_b
                    .difference(&set_a)
                    .map(|(p, c)| format!("{p}:{c}"))
                    .collect();
                let intersection = set_a.intersection(&set_b).count();
                let union = set_a.union(&set_b).count();
                FudgedPair {
                    baseline_key: bcg.clone(),
                    candidate_key: ccg.clone(),
                    overlap: intersection,
                    jaccard: if union > 0 {
                        intersection as f64 / union as f64
                    } else {
                        0.0
                    },
                    residual_baseline: residual_a,
                    residual_candidate: residual_b,
                    cascaded_children: cascade_counts.get(bcg).copied().unwrap_or(0),
                    baseline_root: cascade_roots
                        .get(&(bcg.clone(), ccg.clone()))
                        .map(|(b, _)| b.clone())
                        .unwrap_or_else(|| bcg.clone()),
                    candidate_root: cascade_roots
                        .get(&(bcg.clone(), ccg.clone()))
                        .map(|(_, c)| c.clone())
                        .unwrap_or_else(|| ccg.clone()),
                }
            })
            .collect();
    }

    diff.only_baseline.sort();
    diff.only_candidate.sort();

    // Second pass: fill in uptime_pct. Compute each group's
    // average thread lifetime (candidate side), then express as
    // % of the longest-lived group.
    {
        let mut group_lifetime: BTreeMap<String, u64> = BTreeMap::new();
        for (key, group_b) in &groups_b {
            if groups_a.contains_key(key) {
                group_lifetime.insert(key.clone(), now_b.saturating_sub(group_b.avg_start_ticks));
            }
        }
        let mut fudge_lt_sum: BTreeMap<String, (u64, u64)> = BTreeMap::new();
        for (bkey, ckey) in &fudged_key_pairs {
            if let Some(gb) = groups_b.get(ckey) {
                let lt = now_b.saturating_sub(gb.avg_start_ticks);
                let entry = fudge_lt_sum.entry(bkey.clone()).or_insert((0, 0));
                entry.0 += lt;
                entry.1 += 1;
            }
        }
        for (bkey, (sum, count)) in &fudge_lt_sum {
            if *count > 0 {
                group_lifetime.insert(bkey.clone(), sum / count);
            }
        }
        let max_lifetime = group_lifetime.values().copied().max().unwrap_or(1).max(1);
        for row in &mut diff.rows {
            if let Some(&lt) = group_lifetime.get(&row.group_key) {
                row.uptime_pct = Some(lt as f64 / max_lifetime as f64 * 100.0);
            }
        }
    }

    if opts.sort_by.is_empty() {
        // Default: stable-sort by descending |delta_pct|, ties
        // broken by ascending group_key + registry order of
        // metric. Apply the same shape to derived_rows so the
        // `## Derived metrics` section ranks by salient delta
        // rather than registry order — matches the operator's
        // expectation that the most-changed row sits at the
        // top of every section.
        diff.rows.sort_by(|a, b| {
            b.sort_key()
                .partial_cmp(&a.sort_key())
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| a.group_key.cmp(&b.group_key))
        });
        diff.derived_rows.sort_by(|a, b| {
            b.sort_key()
                .partial_cmp(&a.sort_key())
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| a.group_key.cmp(&b.group_key))
        });
    } else {
        // Multi-key sort: rank groups by tuple of named-metric
        // deltas, sort rows by (group_rank, metric_registry_idx).
        sort_diff_rows_by_keys(&mut diff.rows, &mut diff.derived_rows, &opts.sort_by);

        // Fill sort_by_cell: for each group, find the sort metric's
        // row and format its baseline→candidate (delta%).
        let sort_metric = opts.sort_by.first().map(|sk| sk.metric);
        diff.sort_metric_name = sort_metric;
        if let Some(metric_name) = sort_metric {
            let mut group_cells: BTreeMap<String, (String, Option<f64>)> = BTreeMap::new();
            for row in &diff.rows {
                if row.metric_name == metric_name && !group_cells.contains_key(&row.group_key) {
                    let b = format_value_cell(&row.baseline, row.metric_ladder);
                    let c = format_value_cell(&row.candidate, row.metric_ladder);
                    let pct = match row.delta_pct {
                        Some(p) => format!(" ({:+.1}%)", p * 100.0),
                        None => String::new(),
                    };
                    group_cells.insert(
                        row.group_key.clone(),
                        (format!("{b}\u{2192}{c}{pct}"), row.delta),
                    );
                }
            }
            for row in &mut diff.rows {
                if let Some((cell, delta)) = group_cells.get(&row.group_key) {
                    row.sort_by_cell = Some(cell.clone());
                    row.sort_by_delta = *delta;
                }
            }
        }
    }

    if group_by == GroupBy::Cgroup {
        diff.cgroup_stats_a =
            flatten_cgroup_stats(&baseline.cgroup_stats, &flatten, cgroup_key_map.as_ref());
        diff.cgroup_stats_b =
            flatten_cgroup_stats(&candidate.cgroup_stats, &flatten, cgroup_key_map.as_ref());
    }

    diff.host_psi_a = baseline.psi;
    diff.host_psi_b = candidate.psi;

    if group_by == GroupBy::All {
        diff.smaps_rollup_a = collect_smaps_rollup_hierarchical(
            baseline,
            opts.no_thread_normalize,
            &flatten,
            cgroup_key_map.as_ref(),
        );
        diff.smaps_rollup_b = collect_smaps_rollup_hierarchical(
            candidate,
            opts.no_thread_normalize,
            &flatten,
            cgroup_key_map.as_ref(),
        );
    } else {
        diff.smaps_rollup_a = collect_smaps_rollup(baseline, opts.no_thread_normalize);
        diff.smaps_rollup_b = collect_smaps_rollup(candidate, opts.no_thread_normalize);
    }

    // Remap fudged smaps keys so baseline and candidate join.
    // Fudge paired cg_A (baseline) with cg_B (candidate), but
    // smaps keys are cg_key\x00pcomm — different cg = no join.
    // Rename candidate keys from cg_B prefix to cg_A prefix.
    // Smaps fudge remap: for each fudge pair, find candidate smaps
    // keys under the candidate root, split into (cg_path, pcomm),
    // strip the candidate root from cg_path to get the relative
    // child path, rebuild as baseline_root + child + \x00 + pcomm.
    // Sum values when multiple candidates map to the same baseline
    // key (N containers → total candidate footprint).
    {
        // (relative_child_path + \x00 + pcomm) → summed values
        let mut summed_by_rel: BTreeMap<String, BTreeMap<String, u64>> = BTreeMap::new();
        for fp in &diff.fudged_pairs {
            let cr = &fp.candidate_root;
            let cr_slash = format!("{cr}/");
            let cr_nul = format!("{cr}\x00");
            let keys: Vec<String> = diff
                .smaps_rollup_b
                .keys()
                .filter(|k| {
                    k.starts_with(&cr_slash) || k.starts_with(&cr_nul) || k.as_str() == cr.as_str()
                })
                .cloned()
                .collect();
            for k in keys {
                if let Some(val) = diff.smaps_rollup_b.remove(&k) {
                    // Split smaps key: cg_path \x00 pcomm
                    let (cg_path, pcomm) = k.split_once('\x00').unwrap_or((&k, ""));
                    // Strip candidate root to get relative child path.
                    let child = if cg_path == cr.as_str() {
                        ""
                    } else if let Some(rest) = cg_path.strip_prefix(&cr_slash) {
                        rest
                    } else {
                        continue;
                    };
                    let rel_key = format!("{child}\x00{pcomm}");
                    let entry = summed_by_rel.entry(rel_key).or_default();
                    for (field, v) in &val {
                        *entry.entry(field.clone()).or_insert(0) += v;
                    }
                }
            }
        }
        // Rebuild baseline-side keys and insert summed data.
        if let Some(fp0) = diff.fudged_pairs.first() {
            let br = &fp0.baseline_root;
            for (rel_key, summed) in summed_by_rel {
                let (child, pcomm) = rel_key.split_once('\x00').unwrap_or((&rel_key, ""));
                let base_key = if child.is_empty() {
                    format!("{br}\x00{pcomm}")
                } else {
                    format!("{br}/{child}\x00{pcomm}")
                };
                diff.smaps_rollup_b.insert(base_key, summed);
            }
        }
    }
    diff.sched_ext_a = baseline.sched_ext.clone();
    diff.sched_ext_b = candidate.sched_ext.clone();

    diff
}

/// Walk a snapshot's threads and pull non-empty smaps_rollup
/// maps off the leader threads (tid == tgid; non-leader threads
/// land at empty map per the leader-dedup contract).
///
/// Keying:
///
/// - Default normalization (`no_thread_normalize: false`): key is
///   `pattern_key(&t.pcomm)` — pcomm only, the `[tgid]` suffix is
///   DROPPED. The tgid digits would always normalize to `{N}` and
///   add no discriminating signal to the join key, so omitting
///   them makes smaps keys match the primary-table Pcomm group
///   keys exactly (`kworker/{N}:{N}`, `firefox`, `worker-{N}`,
///   etc.).
///
///   No singleton revert. Unlike [`build_groups`], which reverts a
///   pattern_key to the literal name when only one contributor
///   shares the skeleton, `collect_smaps_rollup` always normalizes
///   when normalization is enabled regardless of how many PIDs
///   share the bucket. The reason is structural: smaps keys exist
///   to JOIN baseline vs candidate across snapshots, and PIDs are
///   per-snapshot ephemeral. A singleton-revert path would emit a
///   literal `worker[7]` on baseline and a literal `worker[1234]`
///   on candidate — two never-matching keys — orphaning every
///   cross-snapshot row. The build_groups invariant ("don't
///   advertise a pattern that no peer shares") doesn't apply on
///   the smaps axis because the bucket's role isn't intra-
///   snapshot fleet aggregation; it's cross-snapshot memory
///   diffing.
///
/// - Literal mode (`no_thread_normalize: true`): key is
///   `pcomm[tgid]` so each PID stays attributable to its
///   specific instance. The tradeoff is that rows only join
///   across snapshots when the same process instance ran on
///   both sides — the `[tgid]` is preserved precisely so two
///   distinct PIDs sharing a pcomm don't collide within a
///   snapshot.
///
/// Aggregation: multiple leader threads mapping to the same
/// key (default mode: a fleet of `worker-{N}` parents) SUM
/// their per-field byte counts. `Rss`, `Pss`, `Private_*`,
/// `Shared_*` etc. each accumulate via `saturating_add` —
/// memory quantities are additive across the merged bucket.
/// `saturating_add` mirrors the cumulative-counter merge policy
/// elsewhere in this module (cpu_usage_usec, throttled_usec); a
/// u64 byte-count overflow implies more than 16 EiB of resident
/// memory across the bucket, well past any realistic host.
///
/// Caveat on `Shared_*` aggregation: when multiple PIDs in a
/// merged bucket share physical pages (the COW case for forked
/// children, mmap'd shared libraries, etc.), summing each PID's
/// per-process `Shared_*` reading double-counts the overlapping
/// physical residency. The same double-count exists in the
/// un-aggregated display — the operator already sees `Shared_Clean
/// = 500MiB` listed against two distinct PID rows that happen to
/// share the same library mapping — so the merge introduces no
/// new information loss, just preserves the pre-existing kernel-
/// emission characteristic. `Pss` stays the precise read for a
/// merged bucket's resident footprint because the kernel
/// proportionally divides shared pages across mappers
/// (`fs/proc/task_mmu.c::smap_account`); operators tracking actual
/// memory pressure should prefer `Pss` over `Rss + Shared_*`
/// arithmetic on collapsed buckets.
///
/// Values are converted from kB to bytes via
/// [`ThreadState::smaps_rollup_bytes`] up-front, so the
/// downstream renderer can pass cell values directly into the
/// auto_scale "B" ladder without further unit math.
pub fn collect_smaps_rollup(
    snap: &CtprofSnapshot,
    no_thread_normalize: bool,
) -> BTreeMap<String, BTreeMap<String, u64>> {
    // Flat keying: the process component alone is the key, so no
    // cgroup flatten patterns and no cgroup key map are supplied.
    let compound_cgroup = false;
    collect_smaps_rollup_inner(snap, no_thread_normalize, compound_cgroup, &[], None)
}

/// Cgroup-qualified variant of [`collect_smaps_rollup`].
///
/// Every key has the shape `cg_key\x00pcomm_key`: a cgroup
/// component, a NUL separator, then the same process component
/// [`collect_smaps_rollup`] would produce for the given
/// `no_thread_normalize` setting.
///
/// The cgroup component is the thread's cgroup path run through
/// `flatten_cgroup_path` with `flatten`. When `cgroup_key_map` is
/// supplied and holds an entry for that flattened path, the mapped
/// value is used in its place; otherwise the flattened path is
/// used as-is.
///
/// Threads with an empty smaps_rollup map are skipped, and
/// per-field byte counts landing on the same key are summed with
/// `saturating_add`.
pub fn collect_smaps_rollup_hierarchical(
    snap: &CtprofSnapshot,
    no_thread_normalize: bool,
    flatten: &[glob::Pattern],
    cgroup_key_map: Option<&BTreeMap<String, String>>,
) -> BTreeMap<String, BTreeMap<String, u64>> {
    // Compound keying: prefix each process key with its cgroup key.
    let compound_cgroup = true;
    collect_smaps_rollup_inner(
        snap,
        no_thread_normalize,
        compound_cgroup,
        flatten,
        cgroup_key_map,
    )
}

/// Shared worker behind [`collect_smaps_rollup`] and
/// [`collect_smaps_rollup_hierarchical`].
///
/// Walks `snap.threads`, ignores any thread whose
/// `smaps_rollup_kb` is empty, and folds the remaining threads'
/// `smaps_rollup_bytes()` entries into a `key → field → bytes`
/// map.
///
/// Key construction:
/// - process part: `pcomm[tgid]` when `no_thread_normalize` is
///   set, otherwise `pattern_key(&pcomm)`;
/// - when `compound_cgroup` is set, the process part is prefixed
///   with the cgroup part and a NUL byte. The cgroup part is the
///   flattened cgroup path, swapped for its `cgroup_key_map` entry
///   when one exists.
///
/// Fields that collide on the same key accumulate via
/// `saturating_add`.
fn collect_smaps_rollup_inner(
    snap: &CtprofSnapshot,
    no_thread_normalize: bool,
    compound_cgroup: bool,
    flatten: &[glob::Pattern],
    cgroup_key_map: Option<&BTreeMap<String, String>>,
) -> BTreeMap<String, BTreeMap<String, u64>> {
    let mut rollup: BTreeMap<String, BTreeMap<String, u64>> = BTreeMap::new();
    for thread in &snap.threads {
        if !thread.smaps_rollup_kb.is_empty() {
            // Process component of the key.
            let process_part = match no_thread_normalize {
                true => format!("{}[{}]", thread.pcomm, thread.tgid),
                false => pattern_key(&thread.pcomm),
            };
            // Optionally qualify with the (possibly remapped) cgroup key.
            let bucket_key = if !compound_cgroup {
                process_part
            } else {
                let flattened = flatten_cgroup_path(&thread.cgroup, flatten);
                let cgroup_part = cgroup_key_map
                    .and_then(|map| map.get(&flattened))
                    .cloned()
                    .unwrap_or(flattened);
                format!("{cgroup_part}\x00{process_part}")
            };
            let fields = rollup.entry(bucket_key).or_default();
            for (field, bytes) in thread.smaps_rollup_bytes() {
                // Starting from 0 and saturating-adding yields the
                // same value as insert-or-accumulate.
                let slot = fields.entry(field.clone()).or_insert(0);
                *slot = slot.saturating_add(bytes.0);
            }
        }
    }
    rollup
}

/// Build the post-flatten-path → final-key map used for cgroup
/// grouping under auto-normalization.
///
/// Collects the union of cgroup paths from both snapshots (thread
/// cgroups plus `cgroup_stats` keys), each run through
/// [`flatten_cgroup_path`], so that Layer 3 (tighten) sees every
/// contributor to a given Layer-2 skeleton. Paths sharing a
/// skeleton form one group:
/// - a group with a single member keeps the Layer-2 skeleton as its
///   key;
/// - a group with two or more members gets the key produced by
///   [`tighten_group`].
///
/// The returned map is keyed by post-flatten path; consumers such
/// as [`build_groups`] look up the final key for any path they see.
pub fn build_cgroup_key_map(
    baseline: &CtprofSnapshot,
    candidate: &CtprofSnapshot,
    flatten: &[glob::Pattern],
) -> BTreeMap<String, String> {
    use std::collections::BTreeSet;

    /// One distinct post-flatten path with its normalization results.
    struct Normalized {
        path: String,
        skeleton: String,
        post_l1: String,
        tokens: Vec<String>,
    }

    // Distinct post-flatten paths mentioned by either snapshot.
    let mut flattened: BTreeSet<String> = BTreeSet::new();
    for snap in [baseline, candidate] {
        flattened.extend(
            snap.threads
                .iter()
                .map(|t| flatten_cgroup_path(&t.cgroup, flatten)),
        );
        flattened.extend(
            snap.cgroup_stats
                .keys()
                .map(|k| flatten_cgroup_path(k, flatten)),
        );
    }

    // Layer 1 + Layer 2 for every path, in sorted path order.
    let members: Vec<Normalized> = flattened
        .into_iter()
        .map(|path| {
            let (skeleton, post_l1, tokens) = cgroup_normalize_skeleton(&path);
            Normalized {
                path,
                skeleton,
                post_l1,
                tokens,
            }
        })
        .collect();

    // Group members by Layer-2 skeleton; within a group the members
    // keep their sorted path order.
    let mut by_skeleton: BTreeMap<&str, Vec<&Normalized>> = BTreeMap::new();
    for member in &members {
        by_skeleton
            .entry(member.skeleton.as_str())
            .or_default()
            .push(member);
    }

    // Layer 3: one final key per group, fanned out to every member.
    let mut key_map: BTreeMap<String, String> = BTreeMap::new();
    for (skeleton, group) in by_skeleton {
        let final_key = if group.len() < 2 {
            // Singleton: no member set to compare against, so the
            // Layer-2 skeleton stays as the key.
            skeleton.to_string()
        } else {
            let post_l1_paths: Vec<String> = group.iter().map(|m| m.post_l1.clone()).collect();
            let member_tokens: Vec<Vec<String>> = group.iter().map(|m| m.tokens.clone()).collect();
            tighten_group(&post_l1_paths, &member_tokens)
        };
        for member in group {
            key_map.insert(member.path.clone(), final_key.clone());
        }
    }
    key_map
}

/// Assemble one [`DiffRow`] for a `(group, metric)` pair.
///
/// `delta` is `candidate - baseline` and is only present when both
/// aggregates expose a numeric value. `delta_pct` is that delta
/// divided by the baseline value (a fraction, not multiplied by
/// 100) and is omitted when the baseline magnitude is at or below
/// `f64::EPSILON`, so a zero baseline never yields a division by
/// zero. The sort-cache fields start out as `None`.
// The row needs every one of these inputs; bundling them into a
// struct would only move the argument list one level up.
#[allow(clippy::too_many_arguments)]
fn build_row(
    key: &str,
    display_key: &str,
    n_a: usize,
    n_b: usize,
    metric: &'static CtprofMetricDef,
    a: Aggregated,
    b: Aggregated,
    uptime_pct: Option<f64>,
) -> DiffRow {
    let mut delta = None;
    let mut delta_pct = None;
    if let (Some(base), Some(cand)) = (a.numeric(), b.numeric()) {
        let diff = cand - base;
        delta = Some(diff);
        if base.abs() > f64::EPSILON {
            delta_pct = Some(diff / base);
        }
    }
    DiffRow {
        group_key: key.to_string(),
        display_key: display_key.to_string(),
        thread_count_a: n_a,
        thread_count_b: n_b,
        uptime_pct,
        metric_name: metric.name,
        metric_ladder: metric.rule.ladder(),
        baseline: a,
        candidate: b,
        delta,
        delta_pct,
        sort_by_cell: None,
        sort_by_delta: None,
    }
}

/// Placeholder for a pure-digit token (rule 1 of the token-based
/// normalizer). Replaces a token of all ASCII digits. Also stands
/// in for the digit run in rules 3 and 4 of [`classify_token`]
/// (`prefix{N}` / `{N}suffix`).
const TOKEN_DIGIT_PLACEHOLDER: &str = "{N}";

/// Placeholder for a hex-like token (rule 2 of the token-based
/// normalizer). Replaces a token whose chars are all in `[0-9a-f]`
/// (lowercase only), length ≥ 2, and contain at least one digit.
const TOKEN_HEX_PLACEHOLDER: &str = "{H}";

/// Placeholder for a systemd template instance whose value is an
/// opaque ID (rule applied by [`apply_systemd_template`] in cgroup
/// layer 1). For example, `user@0.service` and `user@1001.service`
/// both normalize to `user@{I}.service` because their instances
/// (`0`, `1001`) carry no `[._-]` separators that would suggest a
/// structured service name.
const TOKEN_INSTANCE_PLACEHOLDER: &str = "{I}";

/// Rule 1 pattern: pure ASCII digits (one or more), anchored at
/// both ends so the whole token must match. Compiled once on first
/// use.
static TOKEN_RULE_PURE_DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[0-9]+$").unwrap());

/// Rule 2 pattern: hex-like (all chars in `[0-9a-f]`, length ≥ 2).
/// Only lowercase hex letters are accepted. The "must contain at
/// least one digit" check is applied separately (in
/// [`classify_token`]) because anchored character-class repetition
/// does not natively express that constraint.
static TOKEN_RULE_HEX_LIKE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^[0-9a-f]{2,}$").unwrap());

/// Rule 3 pattern: alpha prefix (length ≥ 1) followed by
/// trailing digits. Capture group 1 is the alpha prefix; the digit
/// run is not captured because it is replaced wholesale.
static TOKEN_RULE_ALPHA_PREFIX_DIGITS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^([A-Za-z]+)[0-9]+$").unwrap());

/// Rule 4 pattern: leading digits followed by an alpha suffix
/// (length ≥ 1). Capture group 1 is the alpha suffix. Catches
/// kworker high-priority bound workers (`1H`, `0H`, `2H` etc. —
/// the `H` suffix added by `format_worker_id` in
/// `kernel/workqueue.c` when the worker pool's nice value is
/// negative).
static TOKEN_RULE_DIGITS_ALPHA_SUFFIX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^[0-9]+([A-Za-z]+)$").unwrap());

/// Classify a single token and return its normalized form.
///
/// Called once per token by the token-based normalizer
/// ([`pattern_key`]) on the tokens produced by
/// [`split_into_segments`]. Rules are tried in a fixed order and
/// the first match wins:
///
/// 1. pure ASCII digits → `{N}`;
/// 2. hex-like (every char in `[0-9a-f]`, length ≥ 2, at least one
///    digit) → `{H}`;
/// 3. alpha prefix + trailing digits → `prefix{N}`;
/// 4. leading digits + alpha suffix → `{N}suffix`;
/// 5. anything else is returned unchanged.
///
/// An empty token yields an empty string.
fn classify_token(t: &str) -> String {
    // The hex regex only checks character set and length; the
    // "contains a digit" gate keeps pure-alpha tokens such as `abc`
    // literal. Pure-digit tokens never reach it because rule 1
    // claims them first.
    let has_digit = || t.chars().any(|c| c.is_ascii_digit());

    if t.is_empty() {
        String::new()
    } else if TOKEN_RULE_PURE_DIGITS.is_match(t) {
        TOKEN_DIGIT_PLACEHOLDER.to_string()
    } else if TOKEN_RULE_HEX_LIKE.is_match(t) && has_digit() {
        TOKEN_HEX_PLACEHOLDER.to_string()
    } else if let Some(caps) = TOKEN_RULE_ALPHA_PREFIX_DIGITS.captures(t) {
        // Keep the alpha prefix, replace the digit run. A
        // one-letter prefix qualifies (`u8` → `u{N}`).
        format!("{}{}", &caps[1], TOKEN_DIGIT_PLACEHOLDER)
    } else if let Some(caps) = TOKEN_RULE_DIGITS_ALPHA_SUFFIX.captures(t) {
        // Checked after the hex rule so tokens like `1a` / `0f`
        // are treated as hex rather than digits-plus-suffix.
        format!("{}{}", TOKEN_DIGIT_PLACEHOLDER, &caps[1])
    } else {
        t.to_string()
    }
}

/// One segment of a tokenized string: either a non-separator run
/// (a token) or a run of separator characters. Both variants
/// borrow their text from the string that was split, so a segment
/// list can be re-joined into the original input verbatim.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Segment<'a> {
    /// Maximal run of non-separator characters.
    Token(&'a str),
    /// Maximal run of separator characters (see
    /// [`is_token_separator`]).
    Separator(&'a str),
}

/// Returns true for any character the token-based normalizer
/// treats as a token separator.
///
/// The separator class is `[.\-_/:@+\[\]\s]` — the ASCII
/// punctuation `.`, `-`, `_`, `/`, `:`, `@`, `+`, `[`, `]` plus any
/// Unicode whitespace. `+` is included because kworker decorates
/// active workers as `kworker/<cpu>:<id>+<wq>`, and the digit
/// tokens on either side should normalize independently. Brackets
/// are included because they appear both in bracketed process
/// names (e.g. `[ksoftirqd/0]`) and in the literal-mode smaps key
/// shape `pcomm[tgid]` produced by [`collect_smaps_rollup`] under
/// [`CompareOptions::no_thread_normalize`]; treating them as
/// separators lets the digit / hex tokens inside them normalize
/// independently from the surrounding alpha tokens.
fn is_token_separator(c: char) -> bool {
    const PUNCTUATION: &[char] = &['.', '-', '_', '/', ':', '@', '+', '[', ']'];
    c.is_whitespace() || PUNCTUATION.contains(&c)
}

/// Split `s` into alternating token / separator runs.
///
/// Runs are maximal: `a..b` yields
/// `[Token("a"), Separator(".."), Token("b")]`. Every byte of the
/// input lands in exactly one segment, so concatenating the
/// segments reproduces `s`. Empty input yields zero segments.
fn split_into_segments(s: &str) -> Vec<Segment<'_>> {
    let mut segments = Vec::new();
    let mut rest = s;
    while let Some(lead) = rest.chars().next() {
        // The class of the first character defines the run; the run
        // ends at the first character of the other class.
        let in_separator = is_token_separator(lead);
        let run_len = rest
            .find(|c: char| is_token_separator(c) != in_separator)
            .unwrap_or(rest.len());
        let (run, tail) = rest.split_at(run_len);
        segments.push(if in_separator {
            Segment::Separator(run)
        } else {
            Segment::Token(run)
        });
        rest = tail;
    }
    segments
}

/// Compute the token-normalized skeleton for a name string.
///
/// Used for [`GroupBy::Comm`] (thread-name grouping, fed `t.comm`),
/// [`GroupBy::Pcomm`] (process-name grouping, fed `t.pcomm`), and
/// the smaps rollup aggregation (fed `t.pcomm`). Each callsite
/// layers its own policy on top of the skeleton returned here.
///
/// The input is split on the separator class
/// (`[.\-_/:@+\[\]\s]+`), each non-separator token is rewritten by
/// [`classify_token`], and the pieces are rejoined with the
/// original separator runs preserved verbatim. Per token, the first
/// matching rule wins:
///
/// 1. Pure digits → `{N}` (e.g. `42` → `{N}`).
/// 2. Hex-like (all chars `[0-9a-f]`, length ≥ 2, contains at
///    least one digit) → `{H}` (e.g. `abc123def` → `{H}`).
/// 3. Alpha prefix + trailing digits (`^[A-Za-z]+\d+$`) →
///    `prefix{N}` (e.g. `worker7` → `worker{N}`, `u8` → `u{N}`).
/// 4. Leading digits + alpha suffix (`^\d+[A-Za-z]+$`) →
///    `{N}suffix` (e.g. `1H` → `{N}H`, `100Hz` → `{N}Hz`).
/// 5. Otherwise: keep literal.
///
/// Two names with the same skeleton land in the same bucket. This
/// function ALWAYS returns the skeleton; the singleton-revert
/// policy ("only one contributor → fall back to the literal name")
/// is applied by [`build_groups`], while the smaps rollup path uses
/// the skeleton as-is.
///
/// Examples:
/// - `whirly-gig-15` → `whirly-gig-{N}`.
/// - `kworker/0:0-wq_reclaim` → `kworker/{N}:{N}-wq_reclaim`.
/// - `kworker/u8:7` → `kworker/u{N}:{N}`.
/// - `session-a1234` → `session-{H}` (hex-like).
/// - `BPF_CUBIC` → `BPF_CUBIC` (pure alpha, no digits).
/// - `bloop-tangler` → `bloop-tangler` (pure alpha).
fn pattern_key(name: &str) -> String {
    split_into_segments(name)
        .into_iter()
        .fold(String::new(), |mut skeleton, segment| {
            match segment {
                Segment::Token(token) => skeleton.push_str(&classify_token(token)),
                Segment::Separator(run) => skeleton.push_str(run),
            }
            skeleton
        })
}

/// Cgroup layer 1: systemd `template@instance.service`
/// normalization.
///
/// Scans the path for `@`. For each one, the text between the `@`
/// and the next `/` (or end of input) is examined: if it ends in
/// `.service`, the part before that suffix is the instance.
/// - An empty instance, or one containing any of `.`, `_`, `-`, is
///   treated as a structured service name and kept verbatim.
/// - Any other instance is treated as an opaque ID and the segment
///   is rewritten to `@{I}.service`.
///
/// When the text after an `@` does not end in `.service`, the `@`
/// is kept and scanning resumes right after it.
///
/// Examples:
/// - `/user.slice/user-0.slice/user@0.service/boot.scope`
///   → `/user.slice/user-0.slice/user@{I}.service/boot.scope`
///   (`0` has no `[._-]`).
/// - `/critical.slice/launcher@foo.bar.baz.service`
///   → unchanged (instance `foo.bar.baz` has `.`).
fn apply_systemd_template(path: &str) -> String {
    let mut normalized = String::with_capacity(path.len());
    let mut remaining = path;
    while let Some((before, after)) = remaining.split_once('@') {
        normalized.push_str(before);
        normalized.push('@');
        // `unit` is the rest of this path component; `tail` starts
        // at the following `/` (or is empty at end of input).
        let (unit, tail) = match after.find('/') {
            Some(slash) => after.split_at(slash),
            None => (after, ""),
        };
        remaining = match unit.strip_suffix(".service") {
            Some(instance) => {
                let structured = instance.is_empty() || instance.contains(['.', '_', '-']);
                if structured {
                    normalized.push_str(unit);
                } else {
                    normalized.push_str(TOKEN_INSTANCE_PLACEHOLDER);
                    normalized.push_str(".service");
                }
                tail
            }
            // Not a `.service` unit: keep scanning after this `@`.
            None => after,
        };
    }
    normalized.push_str(remaining);
    normalized
}

/// Cgroup layer 2: token-based normalization of a post-Layer-1
/// cgroup path.
///
/// Produces the same skeleton [`pattern_key`] would, and
/// additionally returns the literal (un-normalized) tokens in path
/// order. The token list is what [`tighten_group`] compares across
/// members to decide which positions can revert to literals
/// (Layer 3).
fn cgroup_skeleton_tokens(post_l1: &str) -> (String, Vec<String>) {
    let mut literal_tokens: Vec<String> = Vec::new();
    let mut normalized = String::new();
    for segment in split_into_segments(post_l1) {
        match segment {
            Segment::Separator(run) => normalized.push_str(run),
            Segment::Token(raw) => {
                literal_tokens.push(raw.to_owned());
                normalized.push_str(&classify_token(raw));
            }
        }
    }
    (normalized, literal_tokens)
}

/// Cgroup layer 3 (tighten): for a multi-member group sharing the
/// same Layer-2 skeleton, revert any token position whose value is
/// identical across every member to its literal form. Positions
/// that vary across members keep their Layer-2 placeholder.
///
/// `post_l1_paths` supplies the representative member (the first
/// entry), from which separator runs and the literal tokens are
/// recovered. `member_tokens` holds each member's per-position
/// token list, used for the position-by-position equality check.
/// Members are expected to share token count and separator
/// structure because they share a Layer-2 skeleton; a member that
/// lacks a position is treated as differing at that position.
///
/// Returns the tightened skeleton. If every position varies
/// (nothing to tighten), the result equals the Layer-2 skeleton.
/// Empty `post_l1_paths` yields an empty string; empty
/// `member_tokens` leaves every position at its Layer-2 form.
fn tighten_group(post_l1_paths: &[String], member_tokens: &[Vec<String>]) -> String {
    let Some(representative) = post_l1_paths.first() else {
        return String::new();
    };
    let mut out = String::with_capacity(representative.len());
    let mut token_pos = 0;
    for seg in split_into_segments(representative) {
        match seg {
            Segment::Token(literal) => {
                // A position is constant only when there is at least
                // one member and every member carries this exact
                // token there. Checked access keeps a malformed
                // member from panicking.
                let constant = !member_tokens.is_empty()
                    && member_tokens
                        .iter()
                        .all(|tokens| tokens.get(token_pos).map(String::as_str) == Some(literal));
                if constant {
                    // Constant across the group: the literal carries
                    // more information than the placeholder.
                    out.push_str(literal);
                } else {
                    out.push_str(&classify_token(literal));
                }
                token_pos += 1;
            }
            Segment::Separator(s) => out.push_str(s),
        }
    }
    out
}

/// Run cgroup normalization Layers 1 and 2 on a path for
/// [`GroupBy::Cgroup`] aggregation.
///
/// Layer 1 is [`apply_systemd_template`]; Layer 2 is
/// [`cgroup_skeleton_tokens`]. Layer 3 (tighten) is not applied
/// here — [`build_cgroup_key_map`] runs [`tighten_group`] on every
/// skeleton group with two or more members.
///
/// Returns `(layer2_skeleton, post_l1_path, post_l1_tokens)`. The
/// skeleton is the join key; the post-Layer-1 path and its literal
/// tokens are the inputs [`tighten_group`] needs.
fn cgroup_normalize_skeleton(path: &str) -> (String, String, Vec<String>) {
    let templated = apply_systemd_template(path);
    let (skeleton, literal_tokens) = cgroup_skeleton_tokens(&templated);
    (skeleton, templated, literal_tokens)
}

/// Compute the operator-facing display label for a pattern-aware
/// group from the union of baseline + candidate member names.
///
/// - With fewer than two entries in `members` (including the empty
///   slice), the join key is returned unchanged, so the rendered
///   label equals what literal grouping would have shown.
/// - Otherwise grex builds a regex from `members`. That regex is
///   used as the label only when it is no longer (in bytes) than
///   `key`; a longer regex is discarded and `key` is returned.
///
/// NOTE(review): the length gate means a regex that is longer than
/// the key is never displayed — confirm this is the intended
/// trade-off between precision and label width.
pub fn pattern_display_label(key: &str, members: &[String]) -> String {
    match members {
        [] | [_] => key.to_string(),
        _ => {
            let candidate = grex::RegExpBuilder::from(members).build();
            if candidate.len() > key.len() {
                key.to_string()
            } else {
                candidate
            }
        }
    }
}

/// Count, per [`pattern_key`] skeleton, how many threads across
/// BOTH snapshots map to it.
///
/// Pattern-aware grouping ([`GroupBy::Comm`] / [`GroupBy::Pcomm`])
/// only clusters a skeleton when at least two threads share it.
/// That gate has to be evaluated against the union of baseline and
/// candidate: a pattern with 1 thread in baseline and 3 in
/// candidate would otherwise key as literal `worker-7` on one side
/// and `worker-{N}` on the other, and the row would surface as
/// only-in-candidate. Counting over the union gives both sides the
/// same key.
///
/// `field` picks the [`ThreadState`] string that is counted:
/// `|t| t.comm.as_str()` for `Comm`, `|t| t.pcomm.as_str()` for
/// `Pcomm`.
fn pattern_counts_union(
    baseline: &CtprofSnapshot,
    candidate: &CtprofSnapshot,
    field: fn(&ThreadState) -> &str,
) -> BTreeMap<String, usize> {
    let mut tally: BTreeMap<String, usize> = BTreeMap::new();
    for snap in [baseline, candidate] {
        for thread in &snap.threads {
            *tally.entry(pattern_key(field(thread))).or_default() += 1;
        }
    }
    tally
}

/// Bucket the threads of one snapshot along `group_by` and
/// aggregate every metric in `CTPROF_METRICS` per bucket.
///
/// Key construction per axis:
/// - `All`: `<cgroup key>\0<pcomm key>\0<comm key>`, where the name
///   parts are [`pattern_key`] skeletons (or literal names under
///   `no_thread_normalize`).
/// - `Pcomm` / `Comm`: the skeleton of the chosen name when at
///   least two threads share it according to the frequency gate,
///   otherwise the literal name; always literal under
///   `no_thread_normalize`.
/// - `CommExact`: the literal thread name.
/// - `Cgroup`: the flattened cgroup path, replaced by its entry in
///   `cgroup_key_map` when one exists.
///
/// The frequency gate is `pattern_counts` when supplied, otherwise
/// counts computed from `snap` alone.
///
/// # Panics
///
/// The `expect` on the frequency gate cannot fire in practice:
/// whenever a name accessor is selected, either `pattern_counts`
/// or the locally computed counts are present.
pub fn build_groups(
    snap: &CtprofSnapshot,
    group_by: GroupBy,
    flatten: &[glob::Pattern],
    pattern_counts: Option<&BTreeMap<String, usize>>,
    cgroup_key_map: Option<&BTreeMap<String, String>>,
    no_thread_normalize: bool,
) -> BTreeMap<String, ThreadGroup> {
    // Name accessor for pattern-aware grouping. `None` whenever
    // normalization is off or the axis is not name-based.
    let pattern_field: Option<fn(&ThreadState) -> &str> = match (group_by, no_thread_normalize) {
        (GroupBy::Pcomm, false) => Some(|t: &ThreadState| t.pcomm.as_str()),
        (GroupBy::Comm | GroupBy::All, false) => Some(|t: &ThreadState| t.comm.as_str()),
        _ => None,
    };

    // Without caller-supplied counts, fall back to counting
    // skeletons within this snapshot only.
    let snapshot_counts: Option<BTreeMap<String, usize>> = match pattern_field {
        Some(field) if pattern_counts.is_none() => {
            let mut tally: BTreeMap<String, usize> = BTreeMap::new();
            for t in &snap.threads {
                *tally.entry(pattern_key(field(t))).or_insert(0) += 1;
            }
            Some(tally)
        }
        _ => None,
    };
    let gate: Option<&BTreeMap<String, usize>> = pattern_counts.or(snapshot_counts.as_ref());

    // Flatten a raw cgroup path, then prefer the final key from the
    // map; fall back to the flattened path when there is no entry.
    let resolve_cgroup = |raw: &str| -> String {
        let flattened = flatten_cgroup_path(raw, flatten);
        cgroup_key_map
            .and_then(|map| map.get(&flattened))
            .cloned()
            .unwrap_or(flattened)
    };
    // Name part of the compound `All` key.
    let name_key = |literal: &str| -> String {
        if no_thread_normalize {
            literal.to_string()
        } else {
            pattern_key(literal)
        }
    };

    let mut buckets: BTreeMap<String, Vec<&ThreadState>> = BTreeMap::new();
    for t in &snap.threads {
        let key = match group_by {
            GroupBy::All => {
                let cg_key = resolve_cgroup(&t.cgroup);
                let pcomm_key = name_key(&t.pcomm);
                let comm_key = name_key(&t.comm);
                format!("{cg_key}\x00{pcomm_key}\x00{comm_key}")
            }
            GroupBy::Pcomm | GroupBy::Comm => {
                if let Some(field) = pattern_field {
                    let literal = field(t);
                    let skeleton = pattern_key(literal);
                    let population = gate
                        .expect("pattern_counts seeded for Pcomm/Comm")
                        .get(&skeleton)
                        .copied()
                        .unwrap_or(0);
                    // A skeleton nobody else shares reverts to the
                    // literal name so a lone thread does not
                    // advertise a pattern.
                    if population < 2 {
                        literal.to_string()
                    } else {
                        skeleton
                    }
                } else if group_by == GroupBy::Pcomm {
                    // `no_thread_normalize`: literal grouping.
                    t.pcomm.clone()
                } else {
                    t.comm.clone()
                }
            }
            GroupBy::CommExact => t.comm.clone(),
            GroupBy::Cgroup => resolve_cgroup(&t.cgroup),
        };
        buckets.entry(key).or_default().push(t);
    }

    let mut groups = BTreeMap::new();
    for (key, threads) in buckets {
        let mut metrics = BTreeMap::new();
        for def in CTPROF_METRICS {
            metrics.insert(def.name.to_string(), aggregate(def.rule, &threads));
        }

        // Cgroup enrichment is attached only under cgroup grouping.
        // It is looked up by the first thread's cgroup path exactly
        // as recorded in the snapshot (not the flattened key).
        let cgroup_stats = match (group_by == GroupBy::Cgroup, threads.first()) {
            (true, Some(first)) => snap.cgroup_stats.get(&first.cgroup).cloned(),
            _ => None,
        };

        // Sorted, de-duplicated literal names; collected only when
        // a name accessor is active, otherwise left empty.
        let members: Vec<String> = pattern_field.map_or_else(Vec::new, |field| {
            let mut names: Vec<String> = threads.iter().map(|t| field(t).to_string()).collect();
            names.sort();
            names.dedup();
            names
        });

        // Mean start time over threads that report a non-zero
        // value; 0 when none do.
        let (tick_sum, tick_samples) = threads
            .iter()
            .map(|t| t.start_time_clock_ticks)
            .filter(|&ticks| ticks > 0)
            .fold((0u64, 0u64), |(sum, n), ticks| (sum + ticks, n + 1));
        let avg_start_ticks = if tick_samples == 0 {
            0
        } else {
            tick_sum / tick_samples
        };

        groups.insert(
            key.clone(),
            ThreadGroup {
                key,
                thread_count: threads.len(),
                metrics,
                cgroup_stats,
                members,
                avg_start_ticks,
            },
        );
    }
    groups
}

/// Mode-arm dispatch helper used by `aggregate`.
///
/// Routes a typed iterator of
/// [`crate::metric_types::CategoricalString`] through
/// `mode_across`, then projects the result onto
/// [`Aggregated::Mode`] using the supplied `total` (the number of
/// threads in the bucket) rather than the total reported by
/// `mode_across`. An empty iterator surfaces as
/// `Aggregated::Mode { value: "", count: 0, total }`.
///
/// Lifts the otherwise-identical match arms for [`AggRule::Mode`],
/// [`AggRule::ModeChar`], and [`AggRule::ModeBool`] into one site,
/// so a change to the empty-bucket contract or to the
/// `mode_across` return shape only edits one place.
///
/// # Empty-bucket contract shared with `aggregate`
///
/// The trait-level shapes split empty handling differently from
/// the dispatch-level shape:
/// - [`Summable::sum_across`] returns the additive identity
///   (zero) on an empty input, so the `Sum*` arms feed straight
///   into [`Aggregated::Sum`] without re-checking.
/// - [`Maxable::max_across`] returns `Option<Self>` (`None` for
///   empty) so callers can distinguish "no contributors" from "all
///   contributors had zero"; the dispatch collapses `None` to
///   `Aggregated::Max(0)`.
/// - [`Rangeable::range_across`] returns `Option<Range<Self>>`;
///   the dispatch collapses `None` to
///   `Aggregated::OrdinalRange { min: 0, max: 0 }`.
/// - [`Modeable::mode_across`] returns
///   `Option<(Self, count, total)>`; this helper collapses `None`
///   to `Aggregated::Mode { value: "", count: 0, total }`.
///
/// Downstream delta math therefore sees a well-defined value at
/// every join boundary regardless of which side of a compare
/// carried zero threads under the bucket key.
///
/// [`Summable`]: crate::metric_types::Summable
/// [`Maxable`]: crate::metric_types::Maxable
/// [`Rangeable`]: crate::metric_types::Rangeable
/// [`Modeable`]: crate::metric_types::Modeable
/// [`Summable::sum_across`]: crate::metric_types::Summable::sum_across
/// [`Maxable::max_across`]: crate::metric_types::Maxable::max_across
/// [`Rangeable::range_across`]: crate::metric_types::Rangeable::range_across
/// [`Modeable::mode_across`]: crate::metric_types::Modeable::mode_across
fn mode_aggregate(
    total: usize,
    items: impl IntoIterator<Item = crate::metric_types::CategoricalString>,
) -> Aggregated {
    use crate::metric_types::{CategoricalString, Modeable};
    // The third tuple element (the iterator-level total) is ignored
    // on purpose: the caller's bucket size is authoritative.
    let (value, count) = match CategoricalString::mode_across(items) {
        Some((winner, hits, _iter_total)) => (winner.0, hits),
        None => (String::new(), 0),
    };
    Aggregated::Mode {
        value,
        count,
        total,
    }
}

pub fn aggregate(rule: AggRule, threads: &[&ThreadState]) -> Aggregated {
    // `Modeable` is imported in `mode_aggregate`; the Mode arms
    // route through that helper so the trait doesn't need to be
    // in scope here. `CategoricalString` is still needed because
    // the ModeChar / ModeBool arms construct one for the
    // coercion path before passing the iterator to
    // `mode_aggregate`.
    use crate::metric_types::{CategoricalString, Maxable, Rangeable, Summable};
    match rule {
        AggRule::SumCount(f) => {
            let s = crate::metric_types::MonotonicCount::sum_across(threads.iter().map(|t| f(t)));
            Aggregated::Sum(s.0)
        }
        AggRule::SumNs(f) => {
            let s = crate::metric_types::MonotonicNs::sum_across(threads.iter().map(|t| f(t)));
            Aggregated::Sum(s.0)
        }
        AggRule::SumTicks(f) => {
            let s = crate::metric_types::ClockTicks::sum_across(threads.iter().map(|t| f(t)));
            Aggregated::Sum(s.0)
        }
        AggRule::SumBytes(f) => {
            let s = crate::metric_types::Bytes::sum_across(threads.iter().map(|t| f(t)));
            Aggregated::Sum(s.0)
        }
        AggRule::MaxPeak(f) => {
            // `max_across` returns `Option<Self>` so callers can
            // distinguish "empty thread bucket" from "all
            // contributors had zero." The historical empty-bucket
            // contract on this code path was `Aggregated::Max(0)`;
            // preserve it by collapsing `None` to the additive
            // identity at the call boundary. Non-empty buckets
            // produce a concrete max regardless of value.
            let m = crate::metric_types::PeakNs::max_across(threads.iter().map(|t| f(t)));
            Aggregated::Max(m.map(|v| v.0).unwrap_or(0))
        }
        AggRule::MaxPeakBytes(f) => {
            // Same Option<Self> + None → Aggregated::Max(0)
            // collapse as MaxPeak; the difference is only the
            // typed accessor's unit family — Bytes vs Ns. The
            // ladder() match maps this variant to
            // ScaleLadder::Bytes so the renderer auto-scales
            // with KiB/MiB/GiB/TiB suffixes.
            let m = crate::metric_types::PeakBytes::max_across(threads.iter().map(|t| f(t)));
            Aggregated::Max(m.map(|v| v.0).unwrap_or(0))
        }
        AggRule::MaxGaugeNs(f) => {
            let m = crate::metric_types::GaugeNs::max_across(threads.iter().map(|t| f(t)));
            Aggregated::Max(m.map(|v| v.0).unwrap_or(0))
        }
        AggRule::MaxGaugeCount(f) => {
            let m = crate::metric_types::GaugeCount::max_across(threads.iter().map(|t| f(t)));
            Aggregated::Max(m.map(|v| v.0).unwrap_or(0))
        }
        AggRule::RangeI32(f) => {
            match crate::metric_types::OrdinalI32::range_across(threads.iter().map(|t| f(t))) {
                // `range_across` returns `None` only on an empty
                // iterator — mirror the historical empty-group
                // contract by collapsing to (0, 0) so the
                // downstream midpoint and delta math sees a
                // well-defined value at the join boundary. The
                // `Some` arm carries a typed `Range<OrdinalI32>`
                // wrapper that guarantees min ≤ max as a
                // type-system invariant; `into_tuple()` extracts
                // the pair without re-checking.
                Some(r) => {
                    let (min, max) = r.into_tuple();
                    Aggregated::OrdinalRange {
                        min: i64::from(min.0),
                        max: i64::from(max.0),
                    }
                }
                None => Aggregated::OrdinalRange { min: 0, max: 0 },
            }
        }
        AggRule::RangeU32(f) => {
            match crate::metric_types::OrdinalU32::range_across(threads.iter().map(|t| f(t))) {
                Some(r) => {
                    let (min, max) = r.into_tuple();
                    Aggregated::OrdinalRange {
                        min: i64::from(min.0),
                        max: i64::from(max.0),
                    }
                }
                None => Aggregated::OrdinalRange { min: 0, max: 0 },
            }
        }
        AggRule::Mode(f) => mode_aggregate(threads.len(), threads.iter().map(|t| f(t))),
        AggRule::ModeChar(f) => mode_aggregate(
            threads.len(),
            // `char` is not Modeable directly; coerce to the
            // CategoricalString reduction so the lex-tiebreak
            // contract is identical to other Mode variants.
            threads.iter().map(|t| CategoricalString(f(t).to_string())),
        ),
        AggRule::ModeBool(f) => mode_aggregate(
            threads.len(),
            // Same coercion path as `ModeChar`. `to_string()`
            // produces `"true"`/`"false"` per `bool::Display`.
            threads.iter().map(|t| CategoricalString(f(t).to_string())),
        ),
        AggRule::Affinity(f) => {
            let mut seen: Vec<Vec<u32>> = Vec::new();
            let mut min_cpus = usize::MAX;
            let mut max_cpus = 0usize;
            for t in threads {
                let cpus = f(t).0;
                min_cpus = min_cpus.min(cpus.len());
                max_cpus = max_cpus.max(cpus.len());
                if !seen.iter().any(|s| s == &cpus) {
                    seen.push(cpus);
                }
            }
            if threads.is_empty() {
                min_cpus = 0;
            }
            let uniform = if seen.len() == 1 {
                seen.into_iter().next()
            } else {
                None
            };
            Aggregated::Affinity(AffinitySummary {
                min_cpus,
                max_cpus,
                uniform,
            })
        }
    }
}

/// Map a cgroup path onto its canonical (flattened) grouping key.
///
/// Each entry of `patterns` is a glob. When `path` matches a
/// pattern, the pattern's own source text becomes the key, so the
/// literal portions survive and every wildcard portion is
/// represented by the wildcard token itself. Example: pattern
/// `/kubepods/*/workload` applied to `/kubepods/pod-abc/workload`
/// yields `/kubepods/*/workload`, so two runs with different pod
/// IDs land on the same key.
///
/// Patterns are consulted in slice order and the first match wins;
/// later patterns are never tried. A path that matches no pattern
/// is returned unchanged.
///
/// NOTE(review): matching goes through `glob::Pattern::matches`,
/// i.e. the glob crate's default `MatchOptions`. Under those
/// defaults `*` is not restricted to a single path segment —
/// confirm whether single-segment matching is intended here.
pub fn flatten_cgroup_path(path: &str, patterns: &[glob::Pattern]) -> String {
    patterns
        .iter()
        .find(|candidate| candidate.matches(path))
        // The matching pattern's text is the canonical key; with no
        // match the path stands for itself.
        .map_or_else(|| path.to_string(), |hit| hit.as_str().to_string())
}

/// Compile raw glob strings into [`glob::Pattern`]s for
/// [`flatten_cgroup_path`].
///
/// Input order is preserved for the patterns that compile. A
/// string that `glob::Pattern::new` rejects is skipped silently:
/// no error is reported and the returned vector is simply shorter
/// than `raw`.
pub fn compile_flatten_patterns(raw: &[String]) -> Vec<glob::Pattern> {
    let mut compiled: Vec<glob::Pattern> = Vec::new();
    for source in raw {
        // Best-effort: a pattern that fails to parse is dropped.
        if let Ok(pattern) = glob::Pattern::new(source) {
            compiled.push(pattern);
        }
    }
    compiled
}

/// Turn a `--sort-by` CLI value into an ordered list of
/// [`SortKey`]s.
///
/// Spec grammar: `metric1[:dir1],metric2[:dir2],...`. Every
/// `metric` must be a name from [`CTPROF_METRICS`] or
/// [`CTPROF_DERIVED_METRICS`]; `dir` is `asc` or `desc`, matched
/// case-insensitively (`:DESC`, `:Asc`, `:asc` are all accepted).
/// An omitted direction means `desc` — largest values first.
///
/// Whitespace is trimmed around the whole entry, around the metric
/// name, and around the direction independently, so
/// `wait_sum : desc` and `wait_sum:desc` parse to the same
/// [`SortKey`].
///
/// The `metric` stored in each [`SortKey`] is the registry's own
/// `&'static str` (from [`CtprofMetricDef::name`] or
/// [`DerivedMetricDef::name`]), never a copy of the operator's
/// input. The primary registry is consulted first and the derived
/// registry second; the two name sets are expected to be disjoint,
/// so lookup order does not affect which entry a name resolves to.
///
/// The spec is independent of the `--group-by` axis (pcomm /
/// cgroup / comm / comm-exact): it only names which per-group
/// aggregate (per the metric's [`AggRule`]) or per-group derived
/// value the groups are ranked by.
///
/// Examples:
/// - `"wait_sum"` → one key, descending.
/// - `"wait_sum:asc"` → one key, ascending.
/// - `"wait_sum:desc,run_time_ns:desc"` → two keys, both
///   descending, applied lexicographically.
/// - `"avg_wait_ns:desc"` → one key naming a derived metric.
/// - `""` → empty `Vec` (caller falls back to its default sort).
///
/// Errors:
/// - An empty entry between commas (e.g. `"a,,b"`).
/// - A direction other than `asc` / `desc`.
/// - A categorical metric — one whose [`AggRule`] is
///   [`AggRule::Mode`], [`AggRule::ModeChar`], or
///   [`AggRule::ModeBool`] — because it has no scalar to sort by.
/// - A name found in neither [`CTPROF_METRICS`] nor
///   [`CTPROF_DERIVED_METRICS`]; the message lists every valid
///   name, sorted.
/// - The same metric named twice (e.g. `wait_sum,wait_sum`); the
///   repeat could never influence the ordering, so it is treated
///   as an operator typo.
pub fn parse_sort_by(spec: &str) -> anyhow::Result<Vec<SortKey>> {
    if spec.is_empty() {
        return Ok(Vec::new());
    }
    // name → definition index over the primary registry. One lookup
    // yields both the canonical `&'static str` (stored in the
    // SortKey) and the AggRule (needed for the categorical check).
    let primary: std::collections::BTreeMap<&'static str, &'static CtprofMetricDef> =
        CTPROF_METRICS.iter().map(|def| (def.name, def)).collect();
    let mut keys: Vec<SortKey> = Vec::new();
    let mut used: std::collections::BTreeSet<&'static str> = std::collections::BTreeSet::new();
    for entry in spec.split(',').map(str::trim) {
        if entry.is_empty() {
            anyhow::bail!(
                "empty entry in --sort-by spec {spec:?}; \
                 entries are comma-separated and must be non-empty"
            );
        }
        // Split off the optional `:dir` suffix. The direction is
        // trimmed and ASCII-lowercased before comparison; the error
        // message still shows the operator's original text.
        let (metric, descending) = if let Some((name, dir)) = entry.split_once(':') {
            let descending = match dir.trim().to_ascii_lowercase().as_str() {
                "desc" => true,
                "asc" => false,
                _ => anyhow::bail!(
                    "invalid direction {dir:?} in --sort-by entry \
                     {entry:?}; expected `asc` or `desc`"
                ),
            };
            (name.trim(), descending)
        } else {
            // No direction given: descending is the default.
            (entry, true)
        };
        // Primary registry first (with the categorical reject),
        // derived registry second. Derived metrics carry no AggRule,
        // so the categorical check does not apply to them.
        let resolved: Option<&'static str> = match primary.get(metric).copied() {
            Some(def) => {
                if matches!(
                    def.rule,
                    AggRule::Mode(_) | AggRule::ModeChar(_) | AggRule::ModeBool(_),
                ) {
                    anyhow::bail!(
                        "metric {metric:?} is categorical (no numeric value to sort by); \
                         --sort-by accepts only metrics whose AggRule yields a scalar \
                         (Sum*, Max*, Range*, or Affinity)"
                    );
                }
                Some(def.name)
            }
            None => CTPROF_DERIVED_METRICS
                .iter()
                .map(|derived| derived.name)
                .find(|name| *name == metric),
        };
        let Some(canonical) = resolved else {
            // Emit a sorted, comma-separated list of every accepted
            // name so the diagnostic is copy-pasteable. Rendered
            // table cells carry `[tag]` suffixes that are not part
            // of the metric name, hence the explicit hint.
            let mut names: Vec<&'static str> = primary
                .keys()
                .copied()
                .chain(CTPROF_DERIVED_METRICS.iter().map(|derived| derived.name))
                .collect();
            names.sort();
            let valid = names.join(", ");
            anyhow::bail!(
                "unknown metric {metric:?} in --sort-by spec {spec:?}; \
                 use the bare metric name, not the rendered cell with \
                 [tag] suffixes; must be one of: {valid}",
            );
        };
        // `insert` returns false when the name was already present.
        if !used.insert(canonical) {
            anyhow::bail!(
                "duplicate metric {metric:?} in --sort-by spec {spec:?}; \
                 each metric may appear at most once across all sort keys"
            );
        }
        keys.push(SortKey {
            metric: canonical,
            descending,
        });
    }
    Ok(keys)
}

pub fn flatten_cgroup_stats(
    stats: &BTreeMap<String, CgroupStats>,
    patterns: &[glob::Pattern],
    cgroup_key_map: Option<&BTreeMap<String, String>>,
) -> BTreeMap<String, CgroupStats> {
    // When multiple input paths flatten to the same key, the
    // merge is per-controller and per-field-class:
    //
    // - **Counters** (`usage_usec`, `nr_throttled`,
    //   `throttled_usec`, `pids.current`, `memory.events` map
    //   values, AND counter-shaped `memory.stat` keys
    //   (workingset_*, pgfault, pgmajfault, pgsteal_*, etc.)):
    //   saturating_add. Cumulative across the merged bucket.
    // - **Instantaneous values / gauges** (`memory.current` AND
    //   gauge-shaped `memory.stat` keys per
    //   [`MEMORY_STAT_GAUGE_KEYS`]: anon, file, slab,
    //   active_anon, etc.): max. Summing point-in-time pool
    //   sizes overstates the merged-bucket gauge. Counter vs
    //   gauge dispatch lives in [`merge_memory_stat`].
    // - **Limits** (`memory.max`, `memory.high`, `pids.max`,
    //   `cpu.max` quota, `cpu.weight`, `cpu.weight.nice`):
    //   max-for-limits via [`merge_max_option`]. `None` ("no
    //   limit") propagates when EITHER side is unbounded — the
    //   merged bucket is unbounded if any contributor is, since
    //   no synthesized cap reflects the actual kernel-enforced
    //   reality.
    // - **Floors** (`memory.low`, `memory.min`): min-for-floors
    //   via [`merge_min_option`]. `None` ("no floor")
    //   propagates when EITHER side has no floor — the merged
    //   bucket is only as protected as its weakest contributor,
    //   for the same reason. The literal "max" token (full
    //   protection) parses to `Some(u64::MAX)` per
    //   [`parse_floor_value`] and merges via min-for-floors,
    //   correctly yielding the smaller concrete floor when one
    //   contributor has full protection and another has a
    //   numeric floor.
    // - **PSI**: avg fields max-across, total_usec
    //   saturating_add (per [`merge_psi`]).
    //
    // When `cgroup_key_map` is provided (auto-normalize is on),
    // each post-flatten path is further mapped to its final
    // tightened key — so the enrichment table renders against
    // the same labels as thread groups. When absent, the
    // post-flatten path itself is the key (matches the legacy
    // behavior with glob-only flatten).
    // First-iteration-replace semantics: the first contributor
    // for a key is inserted verbatim (clone). Subsequent
    // contributors are merged in via the per-domain merge fns.
    // Using `or_default()` + merge here would synthesize a
    // CgroupStats whose `Option<u64>` limits are all None and
    // None-poison every `merge_max_option`/`merge_min_option`
    // call against the first real contributor — yielding `None`
    // for limits/floors even when every contributor has a
    // concrete value. The replace-on-first / merge-on-rest split
    // ensures None-poisoning fires only when contributors
    // genuinely disagree (one None, one Some), never when
    // merging the synthetic seed.
    let mut out: BTreeMap<String, CgroupStats> = BTreeMap::new();
    for (path, cs) in stats {
        let post_flatten = flatten_cgroup_path(path, patterns);
        let key = match cgroup_key_map.and_then(|m| m.get(&post_flatten)) {
            Some(k) => k.clone(),
            None => post_flatten,
        };
        match out.get_mut(&key) {
            None => {
                out.insert(key, cs.clone());
            }
            Some(agg) => {
                merge_cgroup_cpu(&mut agg.cpu, &cs.cpu);
                merge_cgroup_memory(&mut agg.memory, &cs.memory);
                merge_cgroup_pids(&mut agg.pids, &cs.pids);
                agg.psi = merge_psi(agg.psi, cs.psi);
            }
        }
    }
    out
}

/// Combine two [`Psi`] bundles for the cgroup-flatten path.
///
/// Each of the four resources (`cpu`, `memory`, `io`, `irq`) is
/// merged independently through [`merge_psi_resource`]. The
/// per-field policy is: the `avg10/60/300` fields are percentages,
/// so they take the max (a summed 200% has no meaning; the max
/// reads as "worst-pressured cgroup in the merged bucket"), while
/// `total_usec` is cumulative stall time and is added with
/// `saturating_add`, matching the `throttled_usec` flatten policy.
fn merge_psi(a: Psi, b: Psi) -> Psi {
    let cpu = merge_psi_resource(a.cpu, b.cpu);
    let memory = merge_psi_resource(a.memory, b.memory);
    let io = merge_psi_resource(a.io, b.io);
    let irq = merge_psi_resource(a.irq, b.irq);
    Psi {
        cpu,
        memory,
        io,
        irq,
    }
}

/// Combine one resource's PSI pair: the `some` and `full` halves
/// are merged independently via [`merge_psi_half`].
fn merge_psi_resource(a: PsiResource, b: PsiResource) -> PsiResource {
    let some = merge_psi_half(a.some, b.some);
    let full = merge_psi_half(a.full, b.full);
    PsiResource { some, full }
}

/// Combine one PSI half: every averaging window (`avg10`, `avg60`,
/// `avg300`) keeps the larger of the two values, and `total_usec`
/// is the saturating sum of both sides.
fn merge_psi_half(a: PsiHalf, b: PsiHalf) -> PsiHalf {
    let avg10 = a.avg10.max(b.avg10);
    let avg60 = a.avg60.max(b.avg60);
    let avg300 = a.avg300.max(b.avg300);
    let total_usec = a.total_usec.saturating_add(b.total_usec);
    PsiHalf {
        avg10,
        avg60,
        avg300,
        total_usec,
    }
}

/// Fold `src` into `agg` for the cpu controller.
///
/// - Counters (`usage_usec`, `nr_throttled`, `throttled_usec`):
///   `saturating_add`.
/// - `max_quota_us`: max-for-limits via [`merge_max_option`]
///   (`None` on either side yields `None`).
/// - `max_period_us`: the larger of the two, as a stable fallback
///   when contributors set different periods.
/// - `weight` / `weight_nice`: both use the same None-propagating
///   max policy.
///
/// There are no floors in this domain.
fn merge_cgroup_cpu(agg: &mut CgroupCpuStats, src: &CgroupCpuStats) {
    // Cumulative counters.
    agg.usage_usec = agg.usage_usec.saturating_add(src.usage_usec);
    agg.nr_throttled = agg.nr_throttled.saturating_add(src.nr_throttled);
    agg.throttled_usec = agg.throttled_usec.saturating_add(src.throttled_usec);

    // Bandwidth limit: quota is optional, period always present.
    agg.max_quota_us = merge_max_option(agg.max_quota_us, src.max_quota_us);
    agg.max_period_us = agg.max_period_us.max(src.max_period_us);

    // `weight` and `weight_nice` are two views of the same kernel
    // knob, so they must be merged under one policy — otherwise a
    // merged bucket could show `weight=10, weight_nice=None` as if
    // its contributors disagreed. Policy: max when both sides are
    // configured, `None` ("unconfigured") as soon as either side
    // is `None`.
    agg.weight = merge_max_option(agg.weight, src.weight);
    // Same policy spelled inline: `zip` is `Some` only when both
    // sides are `Some`. (Presumably inline because `weight_nice`
    // is not a `u64` — confirm against `CgroupCpuStats`.)
    agg.weight_nice = agg
        .weight_nice
        .zip(src.weight_nice)
        .map(|(a, b)| a.max(b));
}

/// Fold `src` into `agg` for the memory controller.
///
/// - `current` is an instantaneous value: keep the max.
/// - `max` / `high` are limits: max-for-limits via
///   [`merge_max_option`].
/// - `low` / `min` are floors: min-for-floors via
///   [`merge_min_option`].
/// - `stat` mixes counters and gauges: per-key policy in
///   [`merge_memory_stat`].
/// - `events` is purely counter-shaped: per-key sum via
///   [`merge_kv_counters`].
fn merge_cgroup_memory(agg: &mut CgroupMemoryStats, src: &CgroupMemoryStats) {
    // Key/value maps.
    merge_kv_counters(&mut agg.events, &src.events);
    merge_memory_stat(&mut agg.stat, &src.stat);

    // Floors: weakest protection wins.
    agg.min = merge_min_option(agg.min, src.min);
    agg.low = merge_min_option(agg.low, src.low);

    // Limits: loosest cap wins.
    agg.high = merge_max_option(agg.high, src.high);
    agg.max = merge_max_option(agg.max, src.max);

    // Point-in-time usage.
    agg.current = agg.current.max(src.current);
}

/// `memory.stat` keys whose values are INSTANTANEOUS GAUGES,
/// not cumulative counters. The kernel emits these as the
/// current (point-in-time) byte count for that pool — summing
/// across cgroups overstates the merged-bucket gauge, so
/// [`merge_memory_stat`] takes the max for these keys instead.
/// Keys NOT in this list are treated as counter-shaped (pgfault,
/// pgmajfault, workingset_*, pgsteal_*, pgscan_*, pgrefill, etc.)
/// and merge via `saturating_add`.
///
/// List sourced from inspecting the v2 `memory.stat` emission
/// path in `mm/memcontrol.c` and the cgroup v2 documentation:
/// these names denote pools (active resident bytes), not
/// occurrences. Conservative — a key that is not listed here
/// (including any gauge the kernel adds later) falls back to
/// sum, the existing kv-counter policy.
///
/// Membership is tested by exact string equality against the
/// `memory.stat` key.
const MEMORY_STAT_GAUGE_KEYS: &[&str] = &[
    "anon",
    "file",
    "kernel",
    "kernel_stack",
    "pagetables",
    "sec_pagetables",
    "percpu",
    "sock",
    "vmalloc",
    "shmem",
    "zswap",
    "zswapped",
    "file_mapped",
    "file_dirty",
    "file_writeback",
    "swapcached",
    "anon_thp",
    "file_thp",
    "shmem_thp",
    "inactive_anon",
    "active_anon",
    "inactive_file",
    "active_file",
    "unevictable",
    "slab_reclaimable",
    "slab_unreclaimable",
    "slab",
    "hugetlb",
];

/// Fold a `memory.stat` map into `agg` with a per-key policy.
///
/// - A key present only in `src` is copied into `agg` unchanged.
/// - A key present on both sides that appears in
///   [`MEMORY_STAT_GAUGE_KEYS`] keeps the larger value: gauges are
///   point-in-time pool sizes (`anon`, `file`, `slab`, ...), and
///   summing them would overstate the merged-bucket pool.
/// - Any other key present on both sides is treated as a
///   cumulative counter (workingset_refault_*, pgfault,
///   pgmajfault, pgsteal_*, ...) and merged with `saturating_add`.
///
/// Keys present only in `agg` are left untouched.
fn merge_memory_stat(agg: &mut BTreeMap<String, u64>, src: &BTreeMap<String, u64>) {
    for (name, &incoming) in src {
        match agg.get_mut(name) {
            None => {
                agg.insert(name.clone(), incoming);
            }
            Some(current) => {
                *current = if MEMORY_STAT_GAUGE_KEYS.contains(&name.as_str()) {
                    (*current).max(incoming)
                } else {
                    current.saturating_add(incoming)
                };
            }
        }
    }
}

/// Fold `src` into `agg` for the pids controller.
///
/// `current` is a point-in-time task count. Contributors'
/// processes are disjoint by construction, so the merged count is
/// the (saturating) sum; when only one side has a value, that
/// value is kept, and `None` results only when both are `None`.
/// `max` is a limit and follows max-for-limits via
/// [`merge_max_option`].
fn merge_cgroup_pids(agg: &mut CgroupPidsStats, src: &CgroupPidsStats) {
    agg.current = match (agg.current, src.current) {
        (Some(ours), Some(theirs)) => Some(ours.saturating_add(theirs)),
        // At most one side is `Some` here: keep whichever exists.
        (ours, theirs) => ours.or(theirs),
    };
    agg.max = merge_max_option(agg.max, src.max);
}

/// Merge policy for `Option<u64>` LIMITS: the largest cap any
/// contributor enforces, or `None` if any contributor is
/// unbounded.
///
/// `None` means "no limit". When either side is `None` the result
/// is `None` — the merged bucket is unbounded if any contributor
/// is, and the concrete value on the other side is dropped rather
/// than presented as a bound that does not hold for the whole
/// bucket. When both sides are `Some`, the result is their max.
///
/// Shape-wise this mirrors [`merge_min_option`]: both propagate
/// `None` from either side. The meaning of `None` differs, though
/// — for limits it is "unbounded", for floors it is "no
/// protection" — so in each direction the weakest contributor
/// decides the merged value.
fn merge_max_option(a: Option<u64>, b: Option<u64>) -> Option<u64> {
    // `zip` is `Some` only when both inputs are `Some`, which is
    // exactly the None-propagation required here.
    a.zip(b).map(|(lhs, rhs)| lhs.max(rhs))
}

/// Merge policy for `Option<u64>` FLOORS (memory.low,
/// memory.min): the smallest floor across contributors, or `None`
/// if any contributor has no floor.
///
/// `None` means "no floor" (no protection). The merged bucket is
/// only as protected as its weakest contributor, so a `None` on
/// either side yields `None`; two concrete floors yield their min.
fn merge_min_option(a: Option<u64>, b: Option<u64>) -> Option<u64> {
    // `?` short-circuits to `None` as soon as either side is absent.
    Some(a?.min(b?))
}

/// Add every counter in `src` into `agg`, key by key.
///
/// A key that exists only in `src` is inserted with its value
/// unchanged; a key that exists on both sides becomes the
/// `saturating_add` of the two values. Keys that exist only in
/// `agg` are left as they are.
fn merge_kv_counters(agg: &mut BTreeMap<String, u64>, src: &BTreeMap<String, u64>) {
    for (name, &delta) in src {
        match agg.get_mut(name) {
            Some(total) => *total = total.saturating_add(delta),
            None => {
                agg.insert(name.clone(), delta);
            }
        }
    }
}

/// Render a CPU id list as a compact range string.
///
/// Consecutive ids (each exactly one greater than the previous)
/// collapse into `a-b`; isolated ids render as the bare number;
/// runs are joined with commas. Examples: `[0, 1, 2, 5]` →
/// `"0-2,5"`, `[3]` → `"3"`, `[]` → `""`.
///
/// The input is expected to be sorted ascending (the capture layer
/// stores sorted cpusets). Unsorted or duplicated ids are not
/// rejected — they simply never extend a run, so each one starts a
/// new comma-separated entry.
fn format_cpu_range(cpus: &[u32]) -> String {
    let Some((&first, rest)) = cpus.split_first() else {
        return String::new();
    };
    // One run → `lo` for a singleton, `lo-hi` otherwise.
    let render = |lo: u32, hi: u32| -> String {
        if lo == hi {
            lo.to_string()
        } else {
            format!("{lo}-{hi}")
        }
    };
    let mut runs: Vec<String> = Vec::new();
    let (mut start, mut prev) = (first, first);
    for &cpu in rest {
        // `checked_add` keeps the contiguity test well-defined at
        // `u32::MAX`: a plain `prev + 1` would panic in debug builds
        // and wrap to 0 in release builds.
        if prev.checked_add(1) == Some(cpu) {
            prev = cpu;
        } else {
            runs.push(render(start, prev));
            start = cpu;
            prev = cpu;
        }
    }
    // Flush the run that was still open when the input ended.
    runs.push(render(start, prev));
    runs.join(",")
}

/// Closed enumeration of auto-scale ladders driven by phase 4
/// format dispatch.
///
/// Lifts the unit family into the type system rather than leaving
/// it as a free-form `&'static str` tag. Each [`AggRule`] variant
/// maps to exactly one ladder via [`AggRule::ladder`]; each
/// [`DerivedMetricDef`] entry carries a ladder via
/// [`DerivedMetricDef::ladder`]; the cgroup-level render path
/// passes a ladder directly. A registry typo or drift between
/// accessor newtype and ladder choice fails to compile at the
/// registry edit site rather than silently routing through an
/// "unknown unit" pass-through arm at render time.
///
/// The six ladder variants and their step-up rules:
/// - [`Ns`](Self::Ns): ns → µs (×1e3) → ms (×1e6) → s (×1e9).
///   Decimal prefixes — SI time, not binary. Used for
///   [`AggRule::SumNs`] (cumulative ns counters),
///   [`AggRule::MaxPeak`] (lifetime ns high-water marks),
///   [`AggRule::MaxGaugeNs`] (instantaneous ns gauges), and
///   ns-valued derived metrics.
/// - [`Us`](Self::Us): µs → ms (×1e3) → s (×1e6). Decimal SI
///   prefixes. The cgroup `cpu_usage_usec` and `throttled_usec`
///   fields are reported by the kernel in microseconds; this
///   ladder scales them up the same way the `Ns` ladder scales
///   nanoseconds.
/// - [`Bytes`](Self::Bytes): B → KiB → MiB → GiB → TiB. IEC binary
///   prefixes (×1024) for byte counts. Used for
///   [`AggRule::SumBytes`], [`AggRule::MaxPeakBytes`], and any
///   byte-typed derived metric.
/// - [`Ticks`](Self::Ticks): ticks → Kticks (×1e3) → Mticks (×1e6).
///   Decimal prefixes for clock-tick counts
///   (`utime_clock_ticks`, `stime_clock_ticks`); the unit
///   itself is opaque (the kernel's `USER_HZ` rate is
///   host-dependent), so an SI prefix is the most we can
///   promise. Used for [`AggRule::SumTicks`].
/// - [`Unitless`](Self::Unitless): "" → K → M → G. Decimal
///   prefixes for non-dimensional counters (wakeups, migrations,
///   csw, syscall counts). Used for [`AggRule::SumCount`] and
///   [`AggRule::MaxGaugeCount`].
/// - [`None`](Self::None): no ladder — values render as the bare
///   integer with no unit suffix and no scaling. Used for
///   [`AggRule::Mode`] / [`AggRule::ModeChar`] /
///   [`AggRule::ModeBool`] (categorical strings),
///   [`AggRule::RangeI32`] / [`AggRule::RangeU32`] (bounded
///   ordinals), and [`AggRule::Affinity`] (cpuset summaries) —
///   the [`Aggregated`] [`fmt::Display`] impl handles render for
///   these directly.
///
/// The threshold for stepping up is `|value| >= next_scale`.
/// Sign is preserved through scaling (negative deltas pass
/// through). Zero stays at base unit.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ScaleLadder {
    /// Nanoseconds; base unit `ns`, steps up to µs / ms / s.
    Ns,
    /// Microseconds; base unit `µs`, steps up to ms / s.
    Us,
    /// Bytes; base unit `B`, steps up by ×1024 to KiB / MiB / GiB / TiB.
    Bytes,
    /// Clock ticks; base unit `ticks`, steps up to Kticks / Mticks.
    Ticks,
    /// Dimensionless counts; empty base unit, steps up to K / M / G.
    Unitless,
    /// No scaling and no unit suffix. Inside this enum's own impls
    /// the name shadows nothing, but call sites must spell it
    /// `ScaleLadder::None` to keep it distinct from `Option::None`.
    None,
}

impl ScaleLadder {
    /// Base unit string for this ladder — what [`auto_scale`]
    /// returns for a value at the bottom of the ladder. Used by
    /// the format helpers to detect whether a value stepped up
    /// (`auto_scale(v).1 != ladder.base_unit()` ⇒ stepped up,
    /// render with the scaled unit; equal ⇒ no step-up, render
    /// the bare integer with the base unit suffix).
    pub fn base_unit(&self) -> &'static str {
        match self {
            ScaleLadder::Ns => "ns",
            ScaleLadder::Us => "µs",
            ScaleLadder::Bytes => "B",
            ScaleLadder::Ticks => "ticks",
            ScaleLadder::Unitless | ScaleLadder::None => "",
        }
    }
}

impl AggRule {
    /// The auto-scale ladder used to render this rule's value cell.
    ///
    /// The match is closed (no wildcard arm), so adding an
    /// [`AggRule`] variant without choosing its ladder here is a
    /// compile error — this is the type-system enforcement phase 4
    /// introduces. The ladder follows the unit family of the typed
    /// accessor: e.g. [`AggRule::SumNs`] → [`ScaleLadder::Ns`],
    /// [`AggRule::SumBytes`] → [`ScaleLadder::Bytes`].
    pub fn ladder(&self) -> ScaleLadder {
        match self {
            // Nanosecond-valued: cumulative sums, lifetime peaks,
            // and instantaneous gauges.
            Self::SumNs(_) | Self::MaxPeak(_) | Self::MaxGaugeNs(_) => ScaleLadder::Ns,
            // Byte-valued: sums and peaks.
            Self::SumBytes(_) | Self::MaxPeakBytes(_) => ScaleLadder::Bytes,
            // Clock-tick sums.
            Self::SumTicks(_) => ScaleLadder::Ticks,
            // Dimensionless counts: sums and gauges.
            Self::SumCount(_) | Self::MaxGaugeCount(_) => ScaleLadder::Unitless,
            // Range / Mode / Affinity carry no ladder; the
            // Aggregated Display impl renders them directly.
            Self::RangeI32(_)
            | Self::RangeU32(_)
            | Self::Mode(_)
            | Self::ModeChar(_)
            | Self::ModeBool(_)
            | Self::Affinity(_) => ScaleLadder::None,
        }
    }
}

/// Scale `value` to the largest unit of `ladder` whose threshold
/// it reaches, returning the scaled value together with that
/// unit's suffix.
///
/// A rung is selected when `|value| >= scale`; rungs are tried
/// from largest to smallest. When no rung is reached (including
/// for zero, and for every value on [`ScaleLadder::None`]) the
/// value is returned unchanged with the ladder's base unit. The
/// sign of `value` is preserved because only the magnitude is
/// compared.
///
/// This is render-only; the numeric values used for sort order and
/// delta math are untouched.
///
/// Phase 4: dispatch is on the closed [`ScaleLadder`] enum rather
/// than a free-form unit string. The mapping from [`AggRule`] /
/// [`DerivedMetricDef`] / cgroup-render call site to
/// [`ScaleLadder`] lives at the type level — see
/// [`AggRule::ladder`] and [`DerivedMetricDef::ladder`] — so a
/// registry typo can no longer fall through an `other =>
/// pass-through` arm and silently render the unscaled value.
fn auto_scale(value: f64, ladder: ScaleLadder) -> (f64, &'static str) {
    const KIB: f64 = 1024.0;
    const MIB: f64 = 1024.0 * KIB;
    const GIB: f64 = 1024.0 * MIB;
    const TIB: f64 = 1024.0 * GIB;

    // Each table lists `(divisor, unit)` rungs from largest to
    // smallest; the base unit is supplied separately below.
    const NS_STEPS: &[(f64, &str)] = &[(1e9, "s"), (1e6, "ms"), (1e3, "µs")];
    const US_STEPS: &[(f64, &str)] = &[(1e6, "s"), (1e3, "ms")];
    const BYTE_STEPS: &[(f64, &str)] =
        &[(TIB, "TiB"), (GIB, "GiB"), (MIB, "MiB"), (KIB, "KiB")];
    const TICK_STEPS: &[(f64, &str)] = &[(1e6, "Mticks"), (1e3, "Kticks")];
    const COUNT_STEPS: &[(f64, &str)] = &[(1e9, "G"), (1e6, "M"), (1e3, "K")];
    const NO_STEPS: &[(f64, &str)] = &[];

    let (steps, base): (&[(f64, &'static str)], &'static str) = match ladder {
        ScaleLadder::Ns => (NS_STEPS, "ns"),
        ScaleLadder::Us => (US_STEPS, "µs"),
        ScaleLadder::Bytes => (BYTE_STEPS, "B"),
        ScaleLadder::Ticks => (TICK_STEPS, "ticks"),
        ScaleLadder::Unitless => (COUNT_STEPS, ""),
        ScaleLadder::None => (NO_STEPS, ""),
    };

    let magnitude = value.abs();
    match steps.iter().find(|(scale, _)| magnitude >= *scale) {
        Some(&(scale, unit)) => (value / scale, unit),
        None => (value, base),
    }
}

/// Render one baseline / candidate cell for [`write_diff`].
///
/// Numeric aggregates ([`Aggregated::Sum`] / [`Aggregated::Max`])
/// are delegated to [`format_scaled_u64`], which auto-scales large
/// values into a readable magnitude (`1.235ms` rather than
/// `1234567ns`) and renders values that do not step up as the
/// plain integer plus base unit (no `.000` noise on small
/// numbers).
///
/// Every other aggregate (`OrdinalRange`, `Mode`, `Affinity`) is
/// rendered through the [`Aggregated`] [`fmt::Display`] impl with
/// the ladder's base unit appended. Those aggregates use
/// [`ScaleLadder::None`], whose base unit is empty, so nothing is
/// visibly appended.
pub fn format_value_cell(agg: &Aggregated, ladder: ScaleLadder) -> String {
    if let Aggregated::Sum(v) | Aggregated::Max(v) = agg {
        return format_scaled_u64(*v, ladder);
    }
    format!("{agg}{}", ladder.base_unit())
}

/// Render a `u64` as a cell, auto-scaled on the given ladder.
///
/// When [`auto_scale`] steps the value up, the scaled value is
/// printed with three decimals followed by the scaled unit. When
/// it does not (the reported unit equals the ladder's base unit),
/// the original integer is printed followed by the base unit —
/// exact, shorter, and free of a `.000` suffix.
///
/// Shared by the Sum and Max arms of [`format_value_cell`], and
/// also used by the `ctprof show` renderer for the cgroup-stats
/// secondary table, where each scalar stands alone (no
/// baseline/candidate pair to fold into a delta cell).
pub fn format_scaled_u64(v: u64, ladder: ScaleLadder) -> String {
    let base = ladder.base_unit();
    match auto_scale(v as f64, ladder) {
        // No step-up: print the untouched integer.
        (_, scaled_unit) if scaled_unit == base => format!("{v}{}", base),
        (scaled, scaled_unit) => format!("{scaled:.3}{scaled_unit}"),
    }
}

/// Render a derived-metric value cell for the `## Derived metrics`
/// table.
///
/// - Ratio rows (`is_ratio == true`) print the raw value with
///   three decimals (`0.873`) and no unit; the ladder is ignored.
/// - Otherwise the value goes through [`auto_scale`]. A stepped-up
///   value prints with three decimals and the scaled unit; a value
///   that stays on the base rung prints with two decimals and the
///   base unit.
///
/// Negative values (e.g. a negative `live_heap_estimate`) keep
/// their minus sign in every branch.
pub fn format_derived_value_cell(v: DerivedValue, ladder: ScaleLadder, is_ratio: bool) -> String {
    let value = v.as_f64();
    if is_ratio {
        format!("{value:.3}")
    } else {
        let base = ladder.base_unit();
        let (scaled, scaled_unit) = auto_scale(value, ladder);
        if scaled_unit != base {
            format!("{scaled:.3}{scaled_unit}")
        } else {
            // Base rung: derived values come from `f64` divisions
            // (e.g. wait_sum=1234 ns / wait_count=10 = 123.40 ns),
            // so two decimals are kept here — unlike
            // format_scaled_u64, whose inputs are integers.
            format!("{value:.2}{}", base)
        }
    }
}

/// Render the signed delta for a derived row.
///
/// Same branching as [`format_derived_value_cell`], except every
/// branch formats with the `+` flag so the sign is always
/// explicit:
///
/// - ratio rows: raw delta, three decimals, no unit (`+0.100`);
/// - base unit retained after [`auto_scale`]: unscaled delta,
///   two decimals, base unit;
/// - larger unit picked: scaled delta, three decimals, scaled
///   unit.
pub fn format_derived_delta_cell(d: f64, ladder: ScaleLadder, is_ratio: bool) -> String {
    if is_ratio {
        return format!("{d:+.3}");
    }
    let base = ladder.base_unit();
    match auto_scale(d, ladder) {
        (_, unit) if unit == base => format!("{d:+.2}{base}"),
        (scaled, unit) => format!("{scaled:+.3}{unit}"),
    }
}

/// Render an optional cgroup limit.
///
/// `None` becomes the literal `max`; `Some(n)` is handed to
/// [`format_scaled_u64`] with the supplied `ladder`. Intended for
/// limit-style knobs (`memory.max`, `memory.high`, `memory.low`,
/// `memory.min`, `pids.max`, the `cpu.max` quota), following the
/// kernel's own convention of printing `max` when no cap is set.
pub fn format_optional_limit(v: Option<u64>, ladder: ScaleLadder) -> String {
    v.map_or_else(
        || "max".to_string(),
        |limit| format_scaled_u64(limit, ladder),
    )
}

/// Render a `cpu.max` pair as `<quota>/<period>` where quota is
/// either `max` (no cap) or the auto-scaled µs value. Period is
/// always present (default 100_000 µs per
/// `default_bw_period_us()` at `kernel/sched/sched.h:441`). The
/// `<quota>/<period>` separator is THIS crate's display
/// convention — the kernel itself emits raw integers in
/// `cat cpu.max` (space-separated, no auto-scale); we
/// auto-scale via [`format_scaled_u64`] for human-friendly
/// output, which also widens the visual delimiter from the
/// kernel's space to a slash.
pub fn format_cpu_max(quota: Option<u64>, period_us: u64) -> String {
    let q = match quota {
        Some(q) => format_scaled_u64(q, ScaleLadder::Us),
        None => "max".to_string(),
    };
    let p = format_scaled_u64(period_us, ScaleLadder::Us);
    format!("{q}/{p}")
}

/// Render the baseline → candidate cell for an `Option<u64>`
/// LIMIT (e.g. `memory.max`, `memory.high`, `pids.max`).
///
/// Each side is rendered by [`format_optional_limit`], so `None`
/// reads as `max`. When the two inputs compare equal the cell
/// holds that single rendering (no `max → max` noise, and the
/// limits column stays scannable when nothing changed);
/// otherwise it is `<baseline> → <candidate>`, e.g.
/// `<value> → max` for a cap that was lifted between snapshots.
pub fn cgroup_optional_limit_cell(
    baseline: Option<u64>,
    candidate: Option<u64>,
    ladder: ScaleLadder,
) -> String {
    let before = format_optional_limit(baseline, ladder);
    if baseline == candidate {
        // Unchanged limit: one rendering is enough.
        return before;
    }
    let after = format_optional_limit(candidate, ladder);
    format!("{before} → {after}")
}

/// Render the baseline → candidate cell for `cpu.max`
/// `(quota, period)` pairs.
///
/// A present pair is rendered by [`format_cpu_max`]; an absent
/// pair renders as `-`. When both sides produce the same text
/// the cell holds it once, otherwise `<a> → <b>`.
///
/// Note that the collapse test compares the RENDERED strings,
/// not the raw pairs: two different pairs that auto-scale to the
/// same text also collapse to a single rendering. (The sibling
/// [`cgroup_optional_limit_cell`] compares the raw values
/// instead.)
pub fn cgroup_limits_cell(
    baseline: Option<(Option<u64>, u64)>,
    candidate: Option<(Option<u64>, u64)>,
) -> String {
    /// Text for one side of the cell.
    fn side(pair: Option<(Option<u64>, u64)>) -> String {
        pair.map_or_else(
            || "-".to_string(),
            |(quota, period)| format_cpu_max(quota, period),
        )
    }

    let before = side(baseline);
    let after = side(candidate);
    if before != after {
        format!("{before} → {after}")
    } else {
        before
    }
}

/// Format a per-row delta cell for [`write_diff`].
///
/// The signed delta is routed through [`auto_scale`] so a large
/// delta renders at a readable magnitude with the matching unit.
/// The sign is always explicit (`+` or `-`).
///
/// - When no step-up happened (the returned unit is still the
///   ladder's base unit) AND the delta is integer-valued, the
///   cell is the bare signed integer plus unit — `+5ns`, not
///   `+5.000ns` — matching [`format_value_cell`]'s short-circuit.
/// - Otherwise the scaled value renders with three decimals.
///
/// The integer form is produced by formatting the `f64` itself
/// with zero decimals rather than casting to `i64`: a
/// float-to-int `as` cast saturates, so an integer-valued delta
/// outside the `i64` range would have been printed as
/// `i64::MAX` / `i64::MIN` instead of its real value.
fn format_delta_cell(delta: f64, ladder: ScaleLadder) -> String {
    let (scaled, scaled_unit) = auto_scale(delta, ladder);
    if scaled_unit == ladder.base_unit() && delta.fract() == 0.0 {
        // `-0.0` would format as `-0`; fold it onto `+0.0` so a
        // zero delta keeps rendering as `+0`.
        let whole = if delta == 0.0 { 0.0 } else { delta };
        format!("{whole:+.0}{scaled_unit}")
    } else {
        format!("{scaled:+.3}{scaled_unit}")
    }
}

/// Per-row display layout for [`write_diff`].
///
/// `Full` (default) emits the seven-column form
/// `(group | threads | metric | baseline | candidate | delta | %)`.
/// The remaining variants are compact shortcuts for common
/// operator workflows; each resolves to a fixed [`Column`] set
/// (see `compare_columns_for`) before the renderer runs. A
/// `--columns` override on the same invocation wins over the
/// format's default column set.
///
/// [`Arrow`] collapses baseline / candidate into a single arrow
/// cell so a narrow display still surfaces directionality. The
/// arrow cell renderer mirrors [`cgroup_cell`]'s shape so the two
/// stay visually consistent across primary and cgroup tables.
///
/// NOTE(review): the per-variant text for `Arrow` below describes
/// a `(<delta>)` parenthetical and says pct is dropped, but
/// `compare_columns_for` currently resolves `Arrow` to the arrow
/// column followed by the delta, pct, and uptime columns, and
/// `format_arrow_cell` renders only `<baseline> → <candidate>`.
/// The variant comments are left untouched here because the
/// `clap::ValueEnum` derive may surface them as CLI help text —
/// confirm and update both together.
///
/// [`Arrow`]: DisplayFormat::Arrow
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, clap::ValueEnum)]
#[non_exhaustive]
pub enum DisplayFormat {
    /// Default — emit baseline, candidate, delta, and pct
    /// columns alongside group / threads / metric.
    #[default]
    Full,
    /// Drop baseline + candidate columns; keep delta + pct.
    DeltaOnly,
    /// Drop pct column; keep baseline + candidate + delta.
    NoPct,
    /// Single-cell `<baseline> -> <candidate> (<delta>)` form;
    /// drop pct.
    Arrow,
    /// Drop baseline / candidate / delta; keep pct only.
    PctOnly,
}

/// One column slot in the rendered diff/show table. The renderer
/// iterates the resolved [`Column`] vec to build both the
/// header row and each data row, dispatching cell construction
/// per variant. Order in the slice is the rendered order — the
/// renderer never re-sorts.
///
/// Column variants are uniform across compare and show even
/// though show's [`Column::Baseline`], [`Column::Candidate`],
/// [`Column::Delta`], [`Column::Pct`], [`Column::Arrow`] are
/// meaningless for a single snapshot. [`parse_columns`] rejects
/// those names on the show side (and [`Column::Value`] on the
/// compare side) so an operator never reaches the renderer with
/// a mismatched column set.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Column {
    /// The group-by axis label (rendered header is "pcomm",
    /// "cgroup", "comm-pattern", or "comm" per `GroupBy`).
    Group,
    /// Thread-count cell (`N` when the count matches across
    /// snapshots, `A→B` arrow form otherwise).
    Threads,
    /// Metric name with bracketed tag suffix.
    Metric,
    /// Baseline value (compare only).
    Baseline,
    /// Candidate value (compare only).
    Candidate,
    /// Signed delta (compare only).
    Delta,
    /// Percentage delta (compare only).
    Pct,
    /// Single-cell `<baseline> → <candidate>` form (compare
    /// only). [`parse_columns`] rejects a `--columns` spec that
    /// pairs it with `Baseline`/`Candidate`/`Delta`/`Pct`; the
    /// built-in [`DisplayFormat::Arrow`] column set nevertheless
    /// places `Delta` and `Pct` next to it.
    Arrow,
    /// Aggregated value cell (show only).
    Value,
    /// Bracketed tag suffix (sched_class + config_gates + dead).
    /// Off by default — opt in with `--columns ...,tags`.
    Tags,
    /// Relative uptime: group age as a percentage of the oldest
    /// thread in the snapshot. 100% = as old as the oldest
    /// thread, 0% = just spawned. Color gradient: green ≥50%,
    /// red <50% (>2x younger than the oldest).
    Uptime,
    /// Sort-by metric summary column. Shows the --sort-by metric's
    /// baseline→candidate (delta%) per group. Only present when
    /// --sort-by is set; it is not among the names
    /// [`parse_columns`] accepts, so it cannot be requested via
    /// `--columns`.
    SortBy,
}

impl Column {
    /// Canonical CLI token for this column — the spelling
    /// [`parse_columns`] matches `--columns` entries against.
    pub fn cli_name(self) -> &'static str {
        match self {
            Column::Group => "group",
            Column::Threads => "threads",
            Column::Metric => "metric",
            Column::Baseline => "baseline",
            Column::Candidate => "candidate",
            Column::Delta => "delta",
            Column::Pct => "%",
            Column::Arrow => "arrow",
            Column::Value => "value",
            Column::Tags => "tags",
            Column::Uptime => "uptime",
            // Placeholder; overridden dynamically in colored_header.
            Column::SortBy => "sort-by",
        }
    }

    /// Header cell text.
    ///
    /// Three columns have a header that differs from their CLI
    /// token: the group axis shows the caller-supplied
    /// per-`GroupBy` label, the arrow column is titled `value`,
    /// and uptime is titled `%uptime`. Every other column echoes
    /// [`Self::cli_name`] (for `SortBy` that placeholder is
    /// overridden dynamically in colored_header).
    pub fn header(self, group_header: &'static str) -> &'static str {
        match self {
            Column::Group => group_header,
            Column::Arrow => "value",
            Column::Uptime => "%uptime",
            Column::Threads
            | Column::Metric
            | Column::Baseline
            | Column::Candidate
            | Column::Delta
            | Column::Pct
            | Column::Value
            | Column::Tags
            | Column::SortBy => self.cli_name(),
        }
    }
}

/// Resolve a [`DisplayFormat`] to its default compare-side
/// column set.
///
/// Every format starts with the `group | threads | metric`
/// prefix and appends a format-specific tail:
///
/// - `Full`: baseline, candidate, delta, pct
/// - `DeltaOnly`: delta, pct
/// - `NoPct`: baseline, candidate, delta
/// - `Arrow`: arrow, delta, pct, uptime
/// - `PctOnly`: pct
fn compare_columns_for(format: DisplayFormat) -> Vec<Column> {
    const PREFIX: [Column; 3] = [Column::Group, Column::Threads, Column::Metric];
    let tail: &[Column] = match format {
        DisplayFormat::Full => &[
            Column::Baseline,
            Column::Candidate,
            Column::Delta,
            Column::Pct,
        ],
        DisplayFormat::DeltaOnly => &[Column::Delta, Column::Pct],
        DisplayFormat::NoPct => &[Column::Baseline, Column::Candidate, Column::Delta],
        DisplayFormat::Arrow => &[Column::Arrow, Column::Delta, Column::Pct, Column::Uptime],
        DisplayFormat::PctOnly => &[Column::Pct],
    };
    PREFIX.iter().chain(tail).copied().collect()
}

/// Default show-side column set: `group | threads | metric |
/// value`. Show renders a single snapshot, so none of the
/// baseline / candidate / delta / pct columns appear.
fn show_columns_default() -> Vec<Column> {
    [
        Column::Group,
        Column::Threads,
        Column::Metric,
        Column::Value,
    ]
    .to_vec()
}

/// Parse a CLI `--columns` spec into a typed [`Column`] vec.
///
/// The spec is a comma-separated list of [`Column::cli_name`]
/// tokens. Each token is trimmed and ASCII-lowercased before
/// matching. A blank spec yields an empty `Vec`, which callers
/// treat as "use the format default".
///
/// `compare_side` selects the accepted vocabulary:
/// - `true`: group, threads, metric, baseline, candidate, delta,
///   `%`, arrow, tags, uptime.
/// - `false`: group, threads, metric, value, tags, uptime.
///
/// [`Column::SortBy`] is accepted on neither side.
///
/// # Errors
///
/// - An empty token between commas.
/// - A name outside the accepted vocabulary for the chosen side
///   — this covers both unknown names and wrong-side names such
///   as `value` on compare or `baseline` on show. The message
///   cites the offending token and lists the valid names.
/// - The same column named twice.
/// - `arrow` combined with any of
///   `baseline`/`candidate`/`delta`/`%`.
pub fn parse_columns(spec: &str, compare_side: bool) -> anyhow::Result<Vec<Column>> {
    const COMPARE_ALLOWED: &[Column] = &[
        Column::Group,
        Column::Threads,
        Column::Metric,
        Column::Baseline,
        Column::Candidate,
        Column::Delta,
        Column::Pct,
        Column::Arrow,
        Column::Tags,
        Column::Uptime,
    ];
    const SHOW_ALLOWED: &[Column] = &[
        Column::Group,
        Column::Threads,
        Column::Metric,
        Column::Value,
        Column::Tags,
        Column::Uptime,
    ];

    if spec.trim().is_empty() {
        return Ok(Vec::new());
    }
    let allowed = if compare_side {
        COMPARE_ALLOWED
    } else {
        SHOW_ALLOWED
    };

    let mut parsed: Vec<Column> = Vec::new();
    for raw in spec.split(',') {
        let token = raw.trim();
        if token.is_empty() {
            anyhow::bail!(
                "empty entry in --columns spec {spec:?}; \
                 entries are comma-separated and must be non-empty"
            );
        }
        let lowered = token.to_ascii_lowercase();
        let matched = allowed.iter().copied().find(|c| c.cli_name() == lowered);
        let Some(col) = matched else {
            // The list of valid names is only needed for this message.
            let valid_names = allowed
                .iter()
                .map(|c| c.cli_name())
                .collect::<Vec<_>>()
                .join(", ");
            anyhow::bail!(
                "unknown column {token:?} in --columns spec {spec:?}; \
                 must be one of: {valid_names}",
            );
        };
        if parsed.contains(&col) {
            anyhow::bail!(
                "duplicate column {token:?} in --columns spec {spec:?}; \
                 each column may appear at most once"
            );
        }
        parsed.push(col);
    }

    // A spec naming `arrow` together with any of the separate
    // baseline / candidate / delta / % columns is rejected here,
    // at parse time.
    let names_fused_column = parsed.iter().any(|c| {
        matches!(
            c,
            Column::Baseline | Column::Candidate | Column::Delta | Column::Pct
        )
    });
    if parsed.contains(&Column::Arrow) && names_fused_column {
        anyhow::bail!(
            "column 'arrow' is mutually exclusive with baseline/candidate/delta/% \
             — the arrow form fuses them into a single cell."
        );
    }
    Ok(parsed)
}

/// One sub-table emitted by [`write_diff`] / `write_show`.
/// `--sections` filters which sub-tables render — every section
/// not in the filter is suppressed before its emission gate
/// (zero-suppression, group-by-cgroup gating, etc.) runs, so a
/// section that would otherwise emit when its data is present
/// stays silent when omitted from the filter.
///
/// [`Section::ALL`] is the list to iterate when rendering order
/// matters. The variant DECLARATION order below is not the same
/// as the `ALL` order: `TaskstatsDelay` is declared last here but
/// sits second in `ALL`, directly after `Primary`. The
/// [`Self::cli_name`] tokens are the spelling accepted by
/// [`parse_sections`] — round-trip through that parser pins the
/// vocabulary against drift.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum Section {
    /// Per-thread metric table — the primary rows produced by
    /// `build_row` / `aggregate`, EXCLUDING the taskstats
    /// genetlink-sourced rows which carry their own
    /// [`Section::TaskstatsDelay`] tag for separate filtering.
    /// Always rendered first.
    Primary,
    /// `## Derived metrics` section emitted from
    /// [`CTPROF_DERIVED_METRICS`].
    Derived,
    /// Cgroup-enrichment table (`cpu_usage_usec`,
    /// `nr_throttled`, `throttled_usec`, `memory_current`).
    /// Compare- and show-side both gate on `GroupBy::Cgroup`
    /// plus a non-empty `cgroup_stats` map; the `--sections`
    /// filter runs ahead of that gate.
    CgroupStats,
    /// `## Cgroup limits / knobs` table — operator-set
    /// configuration (`cpu.max`, `cpu.weight`, `memory.max`,
    /// `memory.high`, `pids.current`, `pids.max`).
    Limits,
    /// `## memory.stat` long-table — kernel-emitted memory
    /// counters per cgroup.
    MemoryStat,
    /// `## memory.events` long-table — pressure-event counters
    /// per cgroup.
    MemoryEvents,
    /// `## Pressure / <resource>` per-cgroup PSI sub-tables
    /// (cpu / memory / io / irq).
    Pressure,
    /// `## Host pressure / <resource>` host-level PSI
    /// sub-tables.
    HostPressure,
    /// `## smaps_rollup` memory-mapping summary. Compare-side
    /// rows are keyed per pcomm pattern under default
    /// normalization (matching the [`GroupBy::Pcomm`] join key)
    /// or per literal `pcomm[tgid]` PID under
    /// [`CompareOptions::no_thread_normalize`]; show-side rows
    /// are emitted per-PID directly off each captured leader
    /// thread.
    Smaps,
    /// `## sched_ext` global sysfs section (`state`,
    /// `switch_all`, `nr_rejected`, `hotplug_seq`,
    /// `enable_seq`).
    SchedExt,
    /// Taskstats genetlink-sourced rows in the primary table —
    /// the 34 fields covering the eight delay-accounting
    /// categories (`cpu_delay_*`, `blkio_delay_*`,
    /// `swapin_delay_*`, `freepages_delay_*`,
    /// `thrashing_delay_*`, `compact_delay_*`, `wpcopy_delay_*`,
    /// `irq_delay_*`) plus the two memory watermarks
    /// (`hiwater_rss_bytes`, `hiwater_vm_bytes`). Renders inside
    /// the primary table alongside [`Section::Primary`] rows;
    /// each [`CtprofMetricDef`] carries a [`Self`] tag in its
    /// [`CtprofMetricDef::section`] field, and the primary
    /// table emitter checks
    /// [`DisplayOptions::is_section_enabled`] per row so
    /// `--sections taskstats-delay` shows only the taskstats
    /// rows, `--sections primary` excludes them, and either
    /// alone keeps the primary table open. Captured via the
    /// kernel's TASKSTATS family in [`crate::taskstats`].
    TaskstatsDelay,
}

impl Section {
    /// Every [`Section`] variant, in the order intended to match
    /// rendering order. `parse_sections` walks this slice to
    /// validate names and to build the list of valid names for
    /// its error messages.
    pub const ALL: &'static [Section] = &[
        Section::Primary,
        Section::TaskstatsDelay,
        Section::Derived,
        Section::CgroupStats,
        Section::Limits,
        Section::MemoryStat,
        Section::MemoryEvents,
        Section::Pressure,
        Section::HostPressure,
        Section::Smaps,
        Section::SchedExt,
    ];

    /// Canonical CLI token for this section — the spelling
    /// [`parse_sections`] matches `--sections` entries against.
    pub fn cli_name(self) -> &'static str {
        match self {
            Section::Primary => "primary",
            Section::TaskstatsDelay => "taskstats-delay",
            Section::Derived => "derived",
            Section::CgroupStats => "cgroup-stats",
            Section::Limits => "cgroup-limits",
            Section::MemoryStat => "memory-stat",
            Section::MemoryEvents => "memory-events",
            Section::Pressure => "pressure",
            Section::HostPressure => "host-pressure",
            Section::Smaps => "smaps-rollup",
            Section::SchedExt => "sched-ext",
        }
    }

    /// Whether this section's data only exists under
    /// [`GroupBy::Cgroup`] grouping.
    ///
    /// True for exactly five sections:
    /// [`CgroupStats`](Section::CgroupStats),
    /// [`Limits`](Section::Limits),
    /// [`MemoryStat`](Section::MemoryStat),
    /// [`MemoryEvents`](Section::MemoryEvents), and
    /// [`Pressure`](Section::Pressure). Naming one of them under
    /// `--sections` together with a non-cgroup `--group-by`
    /// would yield zero rows for it, which is why
    /// [`warn_cgroup_only_sections_under_non_cgroup`] consults
    /// this predicate to warn the operator.
    pub fn requires_cgroup_grouping(self) -> bool {
        match self {
            Section::CgroupStats
            | Section::Limits
            | Section::MemoryStat
            | Section::MemoryEvents
            | Section::Pressure => true,
            Section::Primary
            | Section::TaskstatsDelay
            | Section::Derived
            | Section::HostPressure
            | Section::Smaps
            | Section::SchedExt => false,
        }
    }
}

/// Warn on stderr about cgroup-only sections requested under a
/// non-cgroup `--group-by`.
///
/// Nothing is printed when `group_by` is [`GroupBy::Cgroup`] or
/// when `sections` is empty (the "no explicit filter" case).
/// Otherwise, each entry for which
/// [`Section::requires_cgroup_grouping`] holds produces one
/// stderr line, in the order the entries appear in `sections`;
/// other entries are not mentioned.
///
/// The line text comes from
/// [`format_cgroup_only_section_warning`], so the wording can be
/// pinned by unit tests instead of drifting unnoticed at the
/// operator's terminal. Without this warning the operator could
/// not tell a misconfigured flag from a snapshot that simply
/// lacks the data.
pub fn warn_cgroup_only_sections_under_non_cgroup(sections: &[Section], group_by: GroupBy) {
    if group_by == GroupBy::Cgroup {
        return;
    }
    // An empty filter yields no iterations, hence no output.
    sections
        .iter()
        .copied()
        .filter(|candidate| candidate.requires_cgroup_grouping())
        .for_each(|cgroup_only| {
            eprintln!(
                "{}",
                format_cgroup_only_section_warning(cgroup_only, group_by)
            );
        });
}

/// Build the text of one "requires --group-by cgroup" warning.
///
/// Kept separate from
/// [`warn_cgroup_only_sections_under_non_cgroup`] so the wording
/// is testable without capturing stderr. The section is spelled
/// via [`Section::cli_name`] and the grouping axis via
/// [`group_by_cli_name`], so both labels use CLI spellings.
pub(crate) fn format_cgroup_only_section_warning(section: Section, group_by: GroupBy) -> String {
    let section_name = section.cli_name();
    let axis = group_by_cli_name(group_by);
    format!("section '{section_name}' requires --group-by cgroup; omitted under --group-by {axis}")
}

/// Operator-facing spelling of a [`GroupBy`] variant: `pcomm`,
/// `cgroup`, `comm`, `comm-exact`, or `all`. Intended to match
/// the clap value-enum tokens accepted on the CLI
/// (NOTE(review): the clap definition is outside this view —
/// confirm the tokens stay in sync). Centralized here so the
/// warning surface and any future diagnostic site share one
/// source of truth.
fn group_by_cli_name(group_by: GroupBy) -> &'static str {
    // Exhaustive match: adding a `GroupBy` variant forces an
    // update here at compile time.
    match group_by {
        GroupBy::Pcomm => "pcomm",
        GroupBy::Cgroup => "cgroup",
        GroupBy::Comm => "comm",
        GroupBy::CommExact => "comm-exact",
        GroupBy::All => "all",
    }
}

/// Parse a CLI `--sections` spec into a typed [`Section`] vec.
///
/// The spec is a comma-separated list of [`Section::cli_name`]
/// tokens. Each token is trimmed and ASCII-lowercased before it
/// is looked up in [`Section::ALL`]. A blank spec yields an
/// empty `Vec`, which [`DisplayOptions::is_section_enabled`]
/// treats as "every section renders". The result preserves the
/// order in which the operator listed the sections.
///
/// # Errors
///
/// Same shape as [`parse_columns`], so the two CLI surfaces
/// report problems identically:
/// - An empty token between commas.
/// - An unknown name (the message cites the token and lists the
///   valid names).
/// - The same section named twice.
pub fn parse_sections(spec: &str) -> anyhow::Result<Vec<Section>> {
    if spec.trim().is_empty() {
        return Ok(Vec::new());
    }

    let mut parsed: Vec<Section> = Vec::new();
    for raw in spec.split(',') {
        let token = raw.trim();
        if token.is_empty() {
            anyhow::bail!(
                "empty entry in --sections spec {spec:?}; \
                 entries are comma-separated and must be non-empty"
            );
        }
        let lowered = token.to_ascii_lowercase();
        let matched = Section::ALL
            .iter()
            .copied()
            .find(|s| s.cli_name() == lowered);
        let Some(section) = matched else {
            // The list of valid names is only needed for this message.
            let valid_names = Section::ALL
                .iter()
                .map(|s| s.cli_name())
                .collect::<Vec<_>>()
                .join(", ");
            anyhow::bail!(
                "unknown section {token:?} in --sections spec {spec:?}; \
                 must be one of: {valid_names}",
            );
        };
        if parsed.contains(&section) {
            anyhow::bail!(
                "duplicate section {token:?} in --sections spec {spec:?}; \
                 each section may appear at most once"
            );
        }
        parsed.push(section);
    }
    Ok(parsed)
}

/// Parse a CLI `--metrics` spec into registry metric names.
///
/// The spec is a comma-separated list; each token is trimmed and
/// must equal (case-sensitively — no lowercasing is applied) the
/// `name` of an entry in [`CTPROF_METRICS`] or, failing that,
/// [`CTPROF_DERIVED_METRICS`]. A blank spec yields an empty
/// `Vec`, which [`DisplayOptions::is_metric_enabled`] treats as
/// "every metric renders" — the same empty-input meaning as
/// [`parse_sections`].
///
/// The returned strings are the registries' own `&'static str`
/// names, not slices of `spec`, so the result outlives the input
/// string.
///
/// # Errors
///
/// - An empty token between commas.
/// - A token that matches no registry name (the message cites
///   the token).
/// - The same metric named twice.
pub fn parse_metrics(spec: &str) -> anyhow::Result<Vec<&'static str>> {
    if spec.trim().is_empty() {
        return Ok(Vec::new());
    }

    let mut parsed: Vec<&'static str> = Vec::new();
    for raw in spec.split(',') {
        let token = raw.trim();
        if token.is_empty() {
            anyhow::bail!(
                "empty entry in --metrics spec {spec:?}; \
                 entries are comma-separated and must be non-empty"
            );
        }
        // One linear pass over the primary registry followed by the
        // derived registry; the first byte-equal name wins. Name
        // lookup is not on a hot path, so no index is built.
        let registry_name = CTPROF_METRICS
            .iter()
            .map(|m| m.name)
            .chain(CTPROF_DERIVED_METRICS.iter().map(|d| d.name))
            .find(|known| *known == token);
        let Some(name) = registry_name else {
            anyhow::bail!(
                "unknown metric {token:?} in --metrics spec {spec:?}; \
                 must be one of the names from `ctprof metric-list` \
                 (CTPROF_METRICS or CTPROF_DERIVED_METRICS)",
            );
        };
        if parsed.contains(&name) {
            anyhow::bail!(
                "duplicate metric {token:?} in --metrics spec {spec:?}; \
                 each metric may appear at most once"
            );
        }
        parsed.push(name);
    }
    Ok(parsed)
}

/// Aggregate display options for the renderer. Plumbed as a
/// single struct through [`write_diff`] so a future addition
/// lands in one place without growing every signature. The
/// show-side entry (`write_show` in `src/bin/ktstr.rs`) keeps
/// a flatter signature for historical reasons but mirrors the
/// same field semantics — `--wrap`, `--sections`, `--metrics`
/// reach show via `wrap` / `sections` / `metrics` parameters
/// that share the same
/// helpers (`new_wrapped_table`, [`Section::cli_name`]).
///
/// `Default` is derived, so `DisplayOptions::default()` yields
/// [`DisplayFormat::Full`], empty filter vectors, `wrap = false`,
/// and `section_line_limit = 0`.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct DisplayOptions {
    /// Format shorthand. Default [`DisplayFormat::Full`].
    pub format: DisplayFormat,
    /// User-supplied column override; empty Vec means "use the
    /// format's default column set." Set via [`parse_columns`].
    pub columns: Vec<Column>,
    /// When `true`, tables render with comfy-table's
    /// [`comfy_table::ContentArrangement::Dynamic`] layout
    /// (terminal-width-aware cell wrapping). When `false`,
    /// tables render with `ContentArrangement::Disabled` — the
    /// prior shape, where columns can spill past the terminal
    /// edge. Default `false` keeps existing operator workflows
    /// untouched until the flag is opted into.
    pub wrap: bool,
    /// User-supplied section filter; empty Vec means "render
    /// every section that has data" — the unfiltered default.
    /// Non-empty restricts the rendered output to the listed
    /// sections only. Set via [`parse_sections`].
    pub sections: Vec<Section>,
    /// User-supplied per-metric row filter; empty Vec means
    /// "render every metric in the primary + derived sections"
    /// — the unfiltered default. Non-empty restricts the
    /// rendered rows to the listed metric names (which must
    /// be in [`CTPROF_METRICS`] or
    /// [`CTPROF_DERIVED_METRICS`]). Set via
    /// [`parse_metrics`].
    ///
    /// Distinct from [`Self::sections`]: sections gate whole
    /// sub-tables (primary, derived, cgroup-stats, …);
    /// metrics gate individual ROWS within the primary and
    /// derived sub-tables. The two compose — naming
    /// `--sections primary` and `--metrics run_time_ns` shows
    /// a single primary row.
    pub metrics: Vec<&'static str>,
    /// Maximum rendered lines per section. Sections whose table
    /// output exceeds this limit are truncated with a notice.
    /// `0` means unlimited (no truncation).
    ///
    /// The derived `Default` for this struct sets the field to
    /// `0` (unlimited). NOTE(review): this field was previously
    /// documented as "Default `750`"; that value is presumably
    /// applied by the CLI layer rather than by
    /// `DisplayOptions::default()` — confirm against the flag
    /// definition.
    pub section_line_limit: usize,
}

impl DisplayOptions {
    /// Column set for the compare renderer.
    ///
    /// An explicit `--columns` override (non-empty `columns`)
    /// always wins; with no override the set comes from
    /// [`compare_columns_for`] applied to `format`.
    pub fn resolved_compare_columns(&self) -> Vec<Column> {
        if !self.columns.is_empty() {
            return self.columns.clone();
        }
        compare_columns_for(self.format)
    }

    /// Column set for the show renderer: the explicit override
    /// when one was given, otherwise [`show_columns_default`].
    pub fn resolved_show_columns(&self) -> Vec<Column> {
        if !self.columns.is_empty() {
            return self.columns.clone();
        }
        show_columns_default()
    }

    /// Whether `section` passes the section filter.
    ///
    /// An empty `sections` list means no filter was applied, so
    /// every section passes (matching [`parse_sections`]'s
    /// empty-input meaning). A non-empty list passes only the
    /// sections it names.
    pub fn is_section_enabled(&self, section: Section) -> bool {
        match self.sections.as_slice() {
            [] => true,
            listed => listed.contains(&section),
        }
    }

    /// Whether the metric called `name` passes the row-level
    /// filter.
    ///
    /// An empty `metrics` list passes everything, mirroring
    /// [`Self::is_section_enabled`]. Otherwise the metric passes
    /// when some listed name is string-equal to `name`; the
    /// comparison is by content, so any byte-equal string
    /// matches, not only the registry's own pointer.
    pub fn is_metric_enabled(&self, name: &str) -> bool {
        self.metrics.is_empty() || self.metrics.iter().any(|listed| *listed == name)
    }

    /// Build a comfy-table honouring the [`wrap`](Self::wrap)
    /// flag: [`crate::cli::new_wrapped_table`] when wrapping is
    /// on, [`crate::cli::new_table`] otherwise. Having one
    /// constructor keeps every section in [`write_diff`]
    /// consistent with `--wrap` without an `if` at each call
    /// site.
    pub fn new_table(&self) -> comfy_table::Table {
        match self.wrap {
            true => crate::cli::new_wrapped_table(),
            false => crate::cli::new_table(),
        }
    }

    /// Build a table (via [`Self::new_table`]) whose i-th column
    /// is capped at `max_widths[i]` through an `UpperBoundary`
    /// constraint of fixed width.
    ///
    /// The table is returned with a placeholder header of empty
    /// cells, one per entry in `max_widths`; the caller is
    /// expected to replace it with the real header.
    pub fn new_constrained_table(&self, max_widths: &[u16]) -> comfy_table::Table {
        let mut table = self.new_table();
        // Constraints can only be attached to columns that already
        // exist, and columns come into existence when a header is
        // set — hence the placeholder header of blank cells.
        table.set_header(vec![""; max_widths.len()]);
        for (idx, width) in max_widths.iter().copied().enumerate() {
            let Some(column) = table.column_mut(idx) else {
                continue;
            };
            column.set_constraint(comfy_table::ColumnConstraint::UpperBoundary(
                comfy_table::Width::Fixed(width),
            ));
        }
        table
    }
}

/// Format the arrow cell `<baseline> → <candidate>` for primary
/// diff rows.
///
/// Both sides are rendered through [`format_value_cell`] with the
/// same `ladder`, then joined with a U+2192 arrow surrounded by
/// single spaces. `delta` is accepted for signature compatibility
/// but is not rendered: the cell carries no delta parenthetical.
fn format_arrow_cell(
    baseline: &Aggregated,
    candidate: &Aggregated,
    delta: Option<f64>,
    ladder: ScaleLadder,
) -> String {
    // Deliberately unused; see the doc comment.
    let _ = delta;
    let before = format_value_cell(baseline, ladder);
    let after = format_value_cell(candidate, ladder);
    format!("{before} \u{2192} {after}")
}

/// Format the `<baseline> → <candidate>` arrow cell for a derived
/// row.
///
/// Each present side goes through [`format_derived_value_cell`]
/// with the row's ladder and ratio flag; an absent side renders as
/// `-`. The two halves are joined with a U+2192 arrow surrounded
/// by single spaces, matching [`format_arrow_cell`].
fn format_arrow_cell_derived(row: &DerivedRow) -> String {
    let before = row.baseline.map_or_else(
        || "-".to_string(),
        |v| format_derived_value_cell(v, row.metric_ladder, row.is_ratio),
    );
    let after = row.candidate.map_or_else(
        || "-".to_string(),
        |v| format_derived_value_cell(v, row.metric_ladder, row.is_ratio),
    );
    format!("{before} \u{2192} {after}")
}

/// Render the thread-count cell shared by the primary and derived
/// row renderers.
///
/// Returns the bare count when both snapshots agree (`a == b`),
/// otherwise `a→b` with a U+2192 arrow and no surrounding spaces.
fn render_threads_cell(a: usize, b: usize) -> String {
    if a != b {
        return format!("{a}\u{2192}{b}");
    }
    a.to_string()
}

/// Turn one [`DiffRow`] into its rendered cells, one `String` per
/// entry of `columns` and in the same order, ready to be handed to
/// a comfy_table row.
///
/// # Panics
///
/// Panics if `row.metric_name` has no entry in `CTPROF_METRICS`.
fn render_diff_row_cells(row: &DiffRow, columns: &[Column]) -> Vec<String> {
    let metric_def = CTPROF_METRICS
        .iter()
        .find(|m| m.name == row.metric_name)
        .expect("metric_name comes from CTPROF_METRICS via build_row");
    // Resolved once; the Metric column may appear more than once.
    let metric_cell = metric_display_name(metric_def).to_string();

    columns
        .iter()
        .map(|col| match col {
            Column::Group => row.display_key.clone(),
            Column::Threads => render_threads_cell(row.thread_count_a, row.thread_count_b),
            Column::Metric => metric_cell.clone(),
            Column::Baseline => format_value_cell(&row.baseline, row.metric_ladder),
            Column::Candidate => format_value_cell(&row.candidate, row.metric_ladder),
            Column::Delta => match (row.delta, &row.baseline, &row.candidate) {
                (Some(d), _, _) => format_delta_cell(d, row.metric_ladder),
                // No numeric delta: a pair of Mode aggregates is
                // compared categorically instead.
                (None, Aggregated::Mode { value: lhs, .. }, Aggregated::Mode { value: rhs, .. }) => {
                    let verdict = if lhs == rhs { "same" } else { "differs" };
                    verdict.to_string()
                }
                (None, _, _) => "-".to_string(),
            },
            Column::Pct => row
                .delta_pct
                .map_or_else(|| "-".to_string(), |p| format!("{:+.1}%", p * 100.0)),
            Column::Arrow => {
                format_arrow_cell(&row.baseline, &row.candidate, row.delta, row.metric_ladder)
            }
            // `Value` has no diff-row rendering; emit a `-`
            // placeholder rather than panic if it is requested.
            Column::Value => "-".to_string(),
            Column::Tags => metric_tags(metric_def),
            Column::Uptime => row
                .uptime_pct
                .map_or_else(|| "-".to_string(), |pct| format!("{pct:.0}%")),
            Column::SortBy => row.sort_by_cell.as_deref().unwrap_or("-").to_string(),
        })
        .collect()
}

/// Wrap `text` in a [`comfy_table::Cell`], colouring it according
/// to the column it belongs to.
///
/// * `Pct` / `Delta` — yellow when `delta > 0`, magenta when
///   `delta < 0`, white otherwise. `Pct` is additionally bold when
///   `|delta| > 0.5`.
/// * `Uptime` — green at `>= 75`, yellow at `>= 50`, red below
///   (bold when strictly below 50), white when absent.
/// * `SortBy` — same sign palette as `Delta`, keyed on
///   `sort_by_delta`, with cyan as the neutral colour.
/// * Every other column — unstyled.
pub fn color_diff_cell(
    text: String,
    col: Column,
    delta: Option<f64>,
    uptime_pct: Option<f64>,
    sort_by_delta: Option<f64>,
) -> comfy_table::Cell {
    use comfy_table::{Attribute, Cell, Color};

    // Sign palette shared by the Pct, Delta and SortBy columns; the
    // caller supplies the colour for zero / NaN / missing values.
    let sign_color = |value: Option<f64>, neutral: Color| match value {
        Some(v) if v > 0.0 => Color::Yellow,
        Some(v) if v < 0.0 => Color::Magenta,
        _ => neutral,
    };

    match col {
        Column::Pct => {
            let cell = Cell::new(text).fg(sign_color(delta, Color::White));
            if delta.is_some_and(|d| d.abs() > 0.5) {
                cell.add_attribute(Attribute::Bold)
            } else {
                cell
            }
        }
        Column::Delta => Cell::new(text).fg(sign_color(delta, Color::White)),
        Column::Uptime => {
            let (color, emphasize) = match uptime_pct {
                Some(p) if p >= 75.0 => (Color::Green, false),
                Some(p) if p >= 50.0 => (Color::Yellow, false),
                // Reached for values below 50 and for NaN; only the
                // former is emphasised.
                Some(p) => (Color::Red, p < 50.0),
                None => (Color::White, false),
            };
            let cell = Cell::new(text).fg(color);
            if emphasize {
                cell.add_attribute(Attribute::Bold)
            } else {
                cell
            }
        }
        Column::SortBy => Cell::new(text).fg(sign_color(sort_by_delta, Color::Cyan)),
        _ => Cell::new(text),
    }
}

/// Split a cgroup path into its parent directory and leaf segment.
///
/// * `/system.slice/foo.service` → (`/system.slice`, `foo.service`)
/// * `/foo` → (`/`, `foo`)
/// * `/` → (`/`, `/`) — the root has no leaf segment of its own, so
///   the root path stands in for both halves.
/// * `foo` (no separator) → (``, `foo`)
/// * empty → (``, ``)
///
/// Both returned slices borrow from `path` or are static literals;
/// nothing is allocated.
fn cgroup_parent_leaf(path: &str) -> (&str, &str) {
    match path.rfind('/') {
        // Root cgroup. Without this arm the leaf would be the empty
        // remainder after the slash.
        Some(0) if path.len() == 1 => ("/", "/"),
        // Direct child of the root: the parent is the root itself,
        // not the empty prefix before the slash.
        Some(0) => ("/", &path[1..]),
        // '/' is a single byte, so `i + 1` is a valid char boundary.
        Some(i) => (&path[..i], &path[i + 1..]),
        None => ("", path),
    }
}

/// Build the cyan header row for `columns` with no sort metric
/// selected.
///
/// Convenience form of [`colored_header_with_sort`] with
/// `sort_metric` fixed to `None`.
pub fn colored_header(columns: &[Column], group_header: &'static str) -> Vec<comfy_table::Cell> {
    let no_sort_metric: Option<&str> = None;
    colored_header_with_sort(columns, group_header, no_sort_metric)
}

/// Build the header row for `columns`, one cyan-foreground cell
/// per column and in the same order.
///
/// Every column takes its label from `Column::header`, passing
/// `group_header` through, except [`Column::SortBy`]: that one is
/// labelled with `sort_metric` when provided and with the literal
/// `sort-by` otherwise.
pub fn colored_header_with_sort(
    columns: &[Column],
    group_header: &'static str,
    sort_metric: Option<&str>,
) -> Vec<comfy_table::Cell> {
    let mut header = Vec::with_capacity(columns.len());
    for column in columns {
        let label = match column {
            Column::SortBy => sort_metric.unwrap_or("sort-by"),
            other => other.header(group_header),
        };
        header.push(comfy_table::Cell::new(label).fg(comfy_table::Color::Cyan));
    }
    header
}

/// Convert a row of rendered strings into [`comfy_table::Cell`]s
/// with a blue foreground.
///
/// Used for derived-metric rows so they stand apart from the
/// primary per-thread rows. Blue is chosen to stay clear of the
/// yellow / magenta / green / red palette that
/// [`color_diff_cell`] assigns to delta and uptime cells.
///
/// Cell order and text are preserved; only the foreground colour
/// is attached.
// NOTE(review): whether the colour survives on a non-TTY stdout
// depends on how the table builder is configured — confirm in
// `crate::cli`.
pub fn color_derived_cells(cells: Vec<String>) -> Vec<comfy_table::Cell> {
    let mut styled = Vec::with_capacity(cells.len());
    for text in cells {
        styled.push(comfy_table::Cell::new(text).fg(comfy_table::Color::Blue));
    }
    styled
}

/// Turn one [`DerivedRow`] into its rendered cells, one `String`
/// per entry of `columns` and in the same order.
///
/// Counterpart of [`render_diff_row_cells`] for derived metrics:
/// numeric cells go through the derived formatters, absent values
/// render as `-`, and the `Value` / `Uptime` / `SortBy` columns
/// always render as `-`. The `Tags` column is always empty.
fn render_derived_row_cells(row: &DerivedRow, columns: &[Column]) -> Vec<String> {
    columns
        .iter()
        .map(|col| match col {
            Column::Group => row.display_key.clone(),
            Column::Threads => render_threads_cell(row.thread_count_a, row.thread_count_b),
            Column::Metric => row.metric_name.to_string(),
            Column::Baseline => row.baseline.map_or_else(
                || "-".to_string(),
                |v| format_derived_value_cell(v, row.metric_ladder, row.is_ratio),
            ),
            Column::Candidate => row.candidate.map_or_else(
                || "-".to_string(),
                |v| format_derived_value_cell(v, row.metric_ladder, row.is_ratio),
            ),
            Column::Delta => row.delta.map_or_else(
                || "-".to_string(),
                |d| format_derived_delta_cell(d, row.metric_ladder, row.is_ratio),
            ),
            Column::Pct => row
                .delta_pct
                .map_or_else(|| "-".to_string(), |p| format!("{:+.1}%", p * 100.0)),
            Column::Arrow => format_arrow_cell_derived(row),
            Column::Tags => String::new(),
            // No derived-row rendering for these columns.
            Column::Value => "-".to_string(),
            Column::Uptime => "-".to_string(),
            Column::SortBy => "-".to_string(),
        })
        .collect()
}

/// Arguments for the `ktstr ctprof compare` subcommand.
///
/// The field doc comments double as the `--help` text clap
/// renders, so they must stay in sync with the `#[arg(...)]`
/// defaults declared beside them.
#[derive(Debug, clap::Args)]
pub struct CtprofCompareArgs {
    /// Baseline snapshot (`.ctprof.zst`) from `ktstr ctprof capture -o`.
    pub baseline: std::path::PathBuf,
    /// Candidate snapshot (`.ctprof.zst`) from `ktstr ctprof capture -o`.
    pub candidate: std::path::PathBuf,
    /// Grouping key (default: [`GroupBy::All`]). `pcomm`
    /// aggregates per process name with token-based pattern
    /// normalization (so `worker-{0..N}` parent processes
    /// cluster into one `worker-{N}` bucket); `cgroup` per
    /// cgroup path; `comm` aggregates threads by NAME PATTERN
    /// under the same token-based normalizer (digits, hex,
    /// alpha-prefix-digits collapse into placeholders so
    /// `tokio-worker-{0..N}` and `kworker/u8:7` cluster); use
    /// `--no-thread-normalize` to disable that collapse and group
    /// by literal `comm` / `pcomm` instead. `comm-exact` is a
    /// synonym for `comm --no-thread-normalize`.
    #[arg(long, value_enum, default_value_t = GroupBy::All, help_heading = "Grouping")]
    pub group_by: GroupBy,
    /// Glob patterns that collapse dynamic cgroup path segments
    /// so structurally-equivalent cgroups across runs group
    /// together. Example:
    /// `--cgroup-flatten '/kubepods/*/workload'` treats different
    /// pod IDs as the same group. Repeatable. Independent of
    /// `--no-cg-normalize`: explicit globs apply first, then
    /// auto-normalize runs unless disabled.
    #[arg(long, help_heading = "Grouping")]
    pub cgroup_flatten: Vec<String>,
    /// Disable token-based pattern normalization across every
    /// name-family axis: `--group-by comm`, `--group-by pcomm`,
    /// AND the `## smaps_rollup` per-process keying (which
    /// normalizes by the pcomm pattern by default — see
    /// `collect_smaps_rollup`). With this flag set:
    /// threads / processes group by their literal name; smaps
    /// rows preserve their per-PID identity (`pcomm[tgid]`
    /// instead of the normalized pcomm pattern). The
    /// digit/hex/alpha-prefix placeholders are bypassed on every
    /// axis. Has no effect under `--group-by comm-exact`
    /// (already literal) or `--group-by cgroup`. Mirror of
    /// `--no-cg-normalize` for the cgroup axis.
    #[arg(long, help_heading = "Grouping")]
    pub no_thread_normalize: bool,
    /// Disable token-based pattern normalization for the cgroup
    /// axis (`--group-by cgroup`). Cgroup paths group by literal
    /// post-`--cgroup-flatten` path — Layer 1 (systemd template
    /// `@<id>.service` → `@{I}.service`), Layer 2 (token
    /// normalization), and Layer 3 (tighten) are all bypassed.
    /// Has no effect under any other grouping.
    #[arg(long, help_heading = "Grouping")]
    pub no_cg_normalize: bool,
    /// Multi-key sort spec for the diff rows. Format:
    /// `metric1[:dir1],metric2[:dir2],...` where each `metric` is
    /// one of the primary or derived metric names (run
    /// `ctprof metric-list` for the full vocabulary) and
    /// `dir` is `asc` or `desc` (default `desc`). Groups rank by
    /// the tuple (`metric1_delta`, `metric2_delta`, ...) under
    /// lexicographic order with per-key direction; rows within a
    /// group keep registry order. Empty (the default) keeps the
    /// "biggest |delta_pct|" sort. Examples:
    /// - `--sort-by wait_sum:desc,run_time_ns:desc` — rank by
    ///   the largest scheduler-wait deltas first, breaking ties
    ///   by run-time delta.
    /// - `--sort-by hiwater_rss_bytes:desc` — rank by the
    ///   largest peak-RSS growth across the snapshot. Useful
    ///   for memory-leak investigations.
    /// - `--sort-by avg_wait_ns:asc` — rank by smallest average
    ///   wait time first; surfaces the most-improved processes.
    ///
    /// Affects only the per-thread metric table and the
    /// derived-metrics section. The `## smaps_rollup`
    /// sub-table sorts process rows independently by total Rss
    /// descending (its own built-in default; see
    /// [`write_diff`]); a future flag could expose that knob,
    /// but `--sort-by` does not propagate to it today.
    ///
    /// Parsed by [`parse_sort_by`] into [`CompareOptions::sort_by`].
    #[arg(long, default_value = "", help_heading = "Display")]
    pub sort_by: String,
    /// Per-row column layout. `full` emits the seven-column
    /// form; `delta-only` drops baseline + candidate; `no-pct`
    /// drops the percentage column; `arrow` (the default)
    /// collapses baseline / candidate / delta into a single
    /// cell; `pct-only` keeps just the percentage.
    /// `--columns` (below) overrides the format's default
    /// column set when both are present.
    #[arg(long, value_enum, default_value_t = DisplayFormat::Arrow, help_heading = "Display")]
    pub display_format: DisplayFormat,
    /// Comma-separated column names to render. Empty (the
    /// default) means "use the column set selected by
    /// --display-format." Valid names: `group`, `threads`,
    /// `metric`, `baseline`, `candidate`, `delta`, `%`,
    /// `arrow`. Order in the spec is the rendered order.
    /// Example: `--columns metric,delta,%`. Applies to the
    /// `primary` section's per-metric table only; secondary
    /// tables (cgroup-stats, smaps-rollup, etc.) have fixed
    /// column shapes and ignore this flag.
    #[arg(long, default_value = "", help_heading = "Display")]
    pub columns: String,
    /// Comma-separated section names to render. Empty (the
    /// default) renders every section that has data. When
    /// non-empty, restricts output to the listed sub-tables —
    /// every section not named is suppressed before its
    /// data-availability gate runs. Valid names: `primary`,
    /// `taskstats-delay`, `derived`, `cgroup-stats`,
    /// `cgroup-limits`, `memory-stat`, `memory-events`,
    /// `pressure`, `host-pressure`, `smaps-rollup`,
    /// `sched-ext`. Useful for narrowing a wide compare to one
    /// area of interest. Example:
    /// `--sections primary,host-pressure`.
    #[arg(long, default_value = "", help_heading = "Filter")]
    pub sections: String,
    /// Comma-separated metric names to render. Empty (the
    /// default) renders every metric in the primary and
    /// derived sub-tables. When non-empty, restricts the
    /// rendered ROWS to the listed names — names must come
    /// from the `ctprof metric-list` vocabulary
    /// (CTPROF_METRICS or CTPROF_DERIVED_METRICS).
    /// Useful for zooming on a specific counter family
    /// without computing every metric: `--metrics
    /// run_time_ns,wait_sum,affine_success_ratio`. Composes
    /// with `--sections` — naming `--sections primary
    /// --metrics run_time_ns` shows a single primary row.
    #[arg(long, default_value = "", help_heading = "Filter")]
    pub metrics: String,
    /// Wrap table cells to fit the terminal width. Off by
    /// default — wide tables can spill past the terminal edge,
    /// matching the prior shell-pipeline-friendly layout. When
    /// set, cells too wide for the available width wrap inside
    /// the cell rather than overflowing, at the cost of taller
    /// rows. The wrap kicks in only when stdout is a tty (the
    /// terminal width is unknown otherwise); when piped to a
    /// file or another command, the flag is silently dropped
    /// and output stays unwrapped so awk/grep pipelines see
    /// the same byte sequence as without the flag.
    #[arg(long, help_heading = "Display")]
    pub wrap: bool,
    /// Maximum rendered lines per section. Sections whose table
    /// output exceeds this limit are truncated with a notice
    /// showing the number of hidden lines. Applies independently
    /// to each sub-table (primary, derived, smaps-rollup, etc.).
    /// `0` disables truncation entirely. Default `500`.
    #[arg(long, default_value_t = 500, help_heading = "Display")]
    pub limit: usize,
}

/// Entry point for `ctprof compare`: validate the spec flags, load
/// both snapshots, compute the diff, and print it.
///
/// Returns `Ok(0)` whenever the comparison ran; a non-empty diff is
/// data, not a failure.
///
/// # Errors
///
/// Returns an error, with context naming the offending flag or
/// path, when `--sort-by`, `--columns`, `--sections` or `--metrics`
/// fails to parse, or when either snapshot fails to load.
///
/// # Ordering
///
/// All four spec strings are parsed before any snapshot is opened,
/// so a typo such as `--sort-by not_a_real_metric` fails
/// immediately instead of after two snapshot loads, and a
/// malformed spec combined with a missing snapshot path reports
/// the spec error rather than the load error.
pub fn run_compare(args: &CtprofCompareArgs) -> anyhow::Result<i32> {
    // Phase 1: cheap validation of every user-supplied spec, in the
    // order sort-by, columns, sections, metrics.
    let sort_spec = &args.sort_by;
    let sort_by =
        parse_sort_by(sort_spec).with_context(|| format!("parse --sort-by {:?}", sort_spec))?;
    let column_spec = &args.columns;
    // `true` selects the compare-side column vocabulary.
    let columns = parse_columns(column_spec, true)
        .with_context(|| format!("parse --columns {:?}", column_spec))?;
    let section_spec = &args.sections;
    let sections = parse_sections(section_spec)
        .with_context(|| format!("parse --sections {:?}", section_spec))?;
    let metric_spec = &args.metrics;
    let metrics =
        parse_metrics(metric_spec).with_context(|| format!("parse --metrics {:?}", metric_spec))?;

    // Sections that only render under cgroup grouping would come
    // out empty under another grouping; warn about that before the
    // snapshot I/O so the operator sees it straight away.
    warn_cgroup_only_sections_under_non_cgroup(&sections, args.group_by);

    // Phase 2: snapshot I/O, baseline first.
    let baseline = CtprofSnapshot::load(&args.baseline)
        .with_context(|| format!("load baseline {}", args.baseline.display()))?;
    let candidate = CtprofSnapshot::load(&args.candidate)
        .with_context(|| format!("load candidate {}", args.candidate.display()))?;

    // Phase 3: compare and render.
    let opts = CompareOptions {
        group_by: args.group_by.into(),
        cgroup_flatten: args.cgroup_flatten.clone(),
        no_thread_normalize: args.no_thread_normalize,
        no_cg_normalize: args.no_cg_normalize,
        sort_by,
    };
    let display = DisplayOptions {
        format: args.display_format,
        columns,
        wrap: args.wrap,
        sections,
        metrics,
        section_line_limit: args.limit,
    };
    let diff = compare(&baseline, &candidate, &opts);
    print_diff(
        &diff,
        &args.baseline,
        &args.candidate,
        args.group_by,
        &display,
    );
    Ok(0)
}

/// Render the `metric-list` discovery output into `w`.
///
/// The output has four parts, in order:
///
/// 1. `## Tag legend` — fixed text explaining the `sched_class`
///    tags, the `config_gates` tags, and the `[dead]` status tag.
/// 2. `## Sections` — one row per entry of `Section::ALL` with its
///    CLI name, rendered heading, and description.
/// 3. `## Metrics` — one row per entry of `CTPROF_METRICS`
///    (`metric | tags | description`).
/// 4. `## Derived metrics` — one row per entry of
///    `CTPROF_DERIVED_METRICS` (`metric | unit | inputs |
///    description`).
///
/// Rendering is kept separate from I/O so tests can drive the
/// formatter into a `String`; [`run_metric_list`] is the printing
/// entry point.
///
/// # Errors
///
/// Propagates any [`fmt::Error`] reported by `w`.
pub fn write_metric_list<W: fmt::Write>(w: &mut W) -> fmt::Result {
    // Fixed legend text, one entry per output line; an empty entry
    // is a blank line. Column alignment inside the strings is part
    // of the rendered output.
    const TAG_LEGEND: &[&str] = &[
        "## Tag legend",
        "",
        "sched_class:",
        "  [cfs-only]    metric increments only inside CFS-class call paths (kernel/sched/fair.c);",
        "                zero under sched_ext / RT / DL / IDLE.",
        "  [non-ext]     metric is written by the schedstat sleep/wait family wrappers",
        "                (kernel/sched/stats.c); CFS / RT / DL accumulate, sched_ext bypasses.",
        "  [fair-policy] metric emits only when fair_policy(p->policy) is true:",
        "                SCHED_NORMAL, SCHED_BATCH, AND SCHED_EXT under CONFIG_SCHED_CLASS_EXT.",
        "",
        "config_gates (compact form; full kconfig symbol prefixed with CONFIG_):",
        "  [SCHED_INFO]            requires CONFIG_SCHED_INFO; gates the sched_info_* counters",
        "                          surfaced via /proc/<tid>/schedstat (run_time_ns, wait_time_ns,",
        "                          timeslices).",
        "  [SCHEDSTATS]            requires CONFIG_SCHEDSTATS; gates every __schedstat_* /",
        "                          schedstat_* macro call (kernel/sched/stats.h:75-82).",
        "  [SCHED_CORE]            requires CONFIG_SCHED_CORE; gates the core-scheduling",
        "                          subsystem (core_forceidle_sum).",
        "  [SCHED_CLASS_EXT]       requires CONFIG_SCHED_CLASS_EXT; without it no task can",
        "                          land on the sched_ext class.",
        "  [TASK_DELAY_ACCT]       requires CONFIG_TASK_DELAY_ACCT AND runtime delayacct=on",
        "                          (boot param or kernel.task_delayacct sysctl).",
        "  [TASK_IO_ACCOUNTING]    requires CONFIG_TASK_IO_ACCOUNTING; gates /proc/<tid>/io.",
        "  [TASKSTATS]             requires CONFIG_TASKSTATS; gates the netlink TASKSTATS family",
        "                          (kernel/taskstats.c) used by the taskstats delay-accounting",
        "                          and hiwater_rss/hiwater_vm capture path. Calls also need",
        "                          CAP_NET_ADMIN.",
        "  [TASK_XACCT]            requires CONFIG_TASK_XACCT; gates extended accounting fields",
        "                          (hiwater_rss, hiwater_vm) populated by xacct_add_tsk.",
        "",
        "status:",
        "  [dead]        kernel exposes the counter via /proc but never increments it; always",
        "                reads zero. Surfaced for forward-compat parity with the kernel's",
        "                exposure surface.",
        "",
    ];
    for line in TAG_LEGEND {
        writeln!(w, "{line}")?;
    }

    // Section vocabulary: the discovery companion to `--sections`,
    // so an operator can map each CLI token to the sub-table it
    // selects without reading the source. The match is exhaustive,
    // so a new `Section` variant must be described here to compile.
    writeln!(w, "## Sections")?;
    writeln!(w)?;
    let mut sections_table = crate::cli::new_table();
    sections_table.set_header(vec!["section", "rendered heading", "description"]);
    for section in Section::ALL {
        let (heading, desc) = match section {
            Section::Primary => (
                "(no heading; first table)",
                "Per-thread metric table — the primary aggregated rows EXCLUDING the taskstats genetlink rows (those carry the `taskstats-delay` tag).",
            ),
            Section::TaskstatsDelay => (
                "(rendered inside the primary table)",
                "Taskstats genetlink-sourced rows — eight delay-accounting categories (cpu/blkio/swapin/freepages/thrashing/compact/wpcopy/irq × count/total/max/min) plus hiwater_rss_bytes / hiwater_vm_bytes. Per-row filter inside the primary table.",
            ),
            Section::Derived => (
                "## Derived metrics",
                "Computed metrics derived from the primary registry (ratios, averages, signed differences).",
            ),
            Section::CgroupStats => (
                "(no heading; cgroup-stats table)",
                "Per-cgroup CPU + memory enrichment from cpu.stat / memory.current. Requires --group-by cgroup.",
            ),
            Section::Limits => (
                "## Cgroup limits / knobs",
                "Operator-set cgroup configuration — cpu.max, cpu.weight, memory.max, memory.high, pids.*. Requires --group-by cgroup.",
            ),
            Section::MemoryStat => (
                "## memory.stat",
                "Kernel-emitted memory.stat counters per cgroup. Requires --group-by cgroup.",
            ),
            Section::MemoryEvents => (
                "## memory.events",
                "Pressure-event counters from memory.events per cgroup. Requires --group-by cgroup.",
            ),
            Section::Pressure => (
                "## Pressure / <resource>",
                "Per-cgroup PSI sub-tables — one per resource (cpu / memory / io / irq). Requires --group-by cgroup.",
            ),
            Section::HostPressure => (
                "## Host pressure / <resource>",
                "System-level PSI sub-tables from /proc/pressure/<resource>.",
            ),
            Section::Smaps => (
                "## smaps_rollup",
                "Per-process memory-mapping summary from /proc/<pid>/smaps_rollup (Rss / Pss / private / shared / swap). Compare-side keys default to per-pcomm-pattern aggregates (`worker-{N}`); pass `--no-thread-normalize` to switch back to literal `pcomm[tgid]` per-PID rows. Under default normalization, byte counts per (pcomm-pattern, key) pair are field-summed across all PIDs sharing the same pcomm skeleton.",
            ),
            Section::SchedExt => (
                "## sched_ext",
                "Global sched_ext sysfs state — state, switch_all, nr_rejected, hotplug_seq, enable_seq.",
            ),
        };
        sections_table.add_row(vec![
            section.cli_name().to_string(),
            heading.to_string(),
            desc.to_string(),
        ]);
    }
    writeln!(w, "{sections_table}")?;
    writeln!(w)?;

    // Primary metric registry. The `tags` cell comes from
    // `metric_tags`, kept apart from the name so the name column
    // stays scannable.
    writeln!(w, "## Metrics")?;
    writeln!(w)?;
    let mut metrics_table = crate::cli::new_table();
    metrics_table.set_header(vec!["metric", "tags", "description"]);
    for metric in CTPROF_METRICS {
        metrics_table.add_row(vec![
            metric.name.to_string(),
            metric_tags(metric),
            metric.description.to_string(),
        ]);
    }
    writeln!(w, "{metrics_table}")?;
    writeln!(w)?;

    // Derived metric registry. Ratio rows are labelled `ratio`
    // explicitly; every other row takes its unit from the ladder's
    // base unit.
    writeln!(w, "## Derived metrics")?;
    writeln!(w)?;
    let mut derived_table = crate::cli::new_table();
    derived_table.set_header(vec!["metric", "unit", "inputs", "description"]);
    for derived in CTPROF_DERIVED_METRICS {
        let unit_cell = match derived.is_ratio {
            true => "ratio".to_string(),
            false => derived.ladder.base_unit().to_string(),
        };
        derived_table.add_row(vec![
            derived.name.to_string(),
            unit_cell,
            derived.inputs.join(", "),
            derived.description.to_string(),
        ]);
    }
    writeln!(w, "{derived_table}")?;
    Ok(())
}

/// Write the metric-list discovery output to stdout.
///
/// Renders through [`write_metric_list`] into an in-memory buffer
/// and prints the buffer in one go, so the CLI gets a one-line
/// call while tests can target the writer directly.
pub fn print_metric_list() {
    let mut rendered = String::new();
    // The sink is a `String`, which never reports a write error, so
    // the result is discarded.
    let _ = write_metric_list(&mut rendered);
    print!("{rendered}");
}

/// Entry point for the `ctprof metric-list` subcommand.
///
/// Prints the discovery output via [`print_metric_list`] and
/// always returns `Ok(0)`; the output is informational and this
/// path has no error branch.
pub fn run_metric_list() -> anyhow::Result<i32> {
    print_metric_list();
    let exit_code = 0;
    Ok(exit_code)
}

/// Print a [`CtprofDiff`] to stdout.
///
/// Renders through [`write_diff`] into an in-memory buffer. When
/// `display.section_line_limit` is non-zero the buffer is passed
/// through [`limit_sections`] before printing; a limit of `0`
/// prints the buffer as rendered.
pub fn print_diff(
    diff: &CtprofDiff,
    baseline_path: &Path,
    candidate_path: &Path,
    group_by: GroupBy,
    display: &DisplayOptions,
) {
    let mut rendered = String::new();
    // The sink is a `String`, which never reports a write error, so
    // the result is discarded.
    let _ = write_diff(
        &mut rendered,
        diff,
        baseline_path,
        candidate_path,
        group_by,
        display,
    );
    match display.section_line_limit {
        0 => print!("{rendered}"),
        limit => print!("{}", limit_sections(&rendered, limit)),
    }
}

/// Truncate each `## <heading>` section to at most `limit` lines.
/// Sections are delimited by lines starting with `## `. Content
/// before the first section header passes through untruncated
/// (typically the file-path header row).
pub fn limit_sections(output: &str, limit: usize) -> String {
    let mut result = String::with_capacity(output.len());
    let mut section_lines: Vec<&str> = Vec::new();
    let mut section_header: Option<&str> = None;

    for line in output.lines() {
        if line.starts_with("## ") {
            flush_section(&mut result, section_header, &section_lines, limit);
            section_lines.clear();
            section_header = Some(line);
        } else if section_header.is_some() {
            section_lines.push(line);
        } else {
            result.push_str(line);
            result.push('\n');
        }
    }
    flush_section(&mut result, section_header, &section_lines, limit);
    result
}

/// Append one section to `result`: the heading line followed by
/// at most `limit` of its body `lines`, each terminated with
/// `'\n'`.
///
/// When `header` is `None` nothing is written. When the body
/// holds more than `limit` lines, the surplus is replaced by a
/// single notice line stating how many lines were dropped.
fn flush_section(result: &mut String, header: Option<&str>, lines: &[&str], limit: usize) {
    if let Some(heading) = header {
        result.push_str(heading);
        result.push('\n');

        // `take` stops at whichever is smaller: the cap or the
        // number of body lines available.
        for body_line in lines.iter().take(limit) {
            result.push_str(body_line);
            result.push('\n');
        }

        // Non-zero only when the body exceeded the cap.
        let hidden = lines.len().saturating_sub(limit);
        if hidden > 0 {
            let notice = format!(
                "... {} more lines truncated (use --limit 0 for unlimited)\n",
                hidden,
            );
            result.push_str(&notice);
        }
    }
}

/// Render [`CtprofDiff`] into `w`. The formatter layer lives
/// here so tests can inspect exactly what `print_diff` would
/// emit without shelling through stdout capture. Write errors
/// propagate as [`std::fmt::Error`] — callers that write into an
/// infallible sink (`String`) can unwrap or ignore.
///
/// `display` controls per-row column layout, terminal-width
/// wrapping, and per-section filtering: see [`DisplayFormat`] /
/// [`Column`] / [`Section`] / [`DisplayOptions`] for the
/// resolution rules. Each sub-table emission below is gated on
/// [`DisplayOptions::is_section_enabled`] before its
/// data-availability check, so `--sections` always wins over
/// the per-section zero-suppression heuristic.
pub fn write_diff<W: fmt::Write>(
    w: &mut W,
    diff: &CtprofDiff,
    baseline_path: &Path,
    candidate_path: &Path,
    group_by: GroupBy,
    display: &DisplayOptions,
) -> fmt::Result {
    let group_header = match group_by {
        GroupBy::Pcomm => "pcomm",
        GroupBy::Cgroup => "cgroup",
        GroupBy::Comm => "comm-pattern",
        GroupBy::CommExact => "comm",
        GroupBy::All => "comm",
    };

    let mut columns = display.resolved_compare_columns();
    let has_sort_col = diff.rows.first().is_some_and(|r| r.sort_by_cell.is_some());
    if has_sort_col {
        columns.push(Column::SortBy);
    }

    // Compute column widths from ALL data rows (primary + derived)
    // so every table in every section shares the same widths.
    // Heading rows are constrained to these widths via
    // new_constrained_table so they can't inflate columns.
    let global_max_widths: Vec<u16> = if group_by == GroupBy::All {
        let mut measure = display.new_table();
        measure.set_header(colored_header_with_sort(
            &columns,
            group_header,
            diff.sort_metric_name,
        ));
        for row in &diff.rows {
            let mut cells = render_diff_row_cells(row, &columns);
            if let Some(pos) = columns.iter().position(|c| *c == Column::Group) {
                let comm = row.group_key.splitn(3, '\x00').nth(2).unwrap_or("");
                cells[pos] = comm.to_string();
            }
            measure.add_row(cells);
        }
        for row in &diff.derived_rows {
            let mut cells = render_derived_row_cells(row, &columns);
            if let Some(pos) = columns.iter().position(|c| *c == Column::Group) {
                let comm = row.group_key.splitn(3, '\x00').nth(2).unwrap_or("");
                cells[pos] = comm.to_string();
            }
            measure.add_row(cells);
        }
        measure.column_max_content_widths()
    } else {
        Vec::new()
    };

    // The primary table renders rows whose metric.section is
    // enabled. Two sections share the table:
    //   - Section::Primary: the 52 non-taskstats rows.
    //   - Section::TaskstatsDelay: the 34 taskstats genetlink rows.
    // The outer gate keeps the table open while EITHER section
    // is enabled — `--sections taskstats-delay` alone still emits
    // the table containing only the 34 taskstats rows;
    // `--sections primary` alone emits the table containing only
    // the 52 non-taskstats rows; either combined or the empty
    // default ("all on") emits all rows.
    if display.is_section_enabled(Section::Primary)
        || display.is_section_enabled(Section::TaskstatsDelay)
    {
        // Filter rows first.
        let primary_rows: Vec<&DiffRow> = diff
            .rows
            .iter()
            .filter(|row| {
                if !display.is_metric_enabled(row.metric_name) {
                    return false;
                }
                let metric = CTPROF_METRICS
                    .iter()
                    .find(|m| m.name == row.metric_name)
                    .expect("metric_name comes from CTPROF_METRICS via build_row");
                display.is_section_enabled(metric.section)
            })
            .collect();

        if group_by == GroupBy::All {
            // Sort + truncate BEFORE organizing into the cgroup
            // tree. primary_rows are already delta-sorted from
            // compare(). Apply the line limit here so the tree
            // only contains the top movers.
            let limited_rows: Vec<&DiffRow> = if display.section_line_limit > 0 {
                primary_rows
                    .iter()
                    .copied()
                    .take(display.section_line_limit)
                    .collect()
            } else {
                primary_rows.clone()
            };

            struct HierRow<'a> {
                cgroup: &'a str,
                pcomm: &'a str,
                comm: &'a str,
                row: &'a DiffRow,
            }
            let mut hier: Vec<HierRow<'_>> = limited_rows
                .iter()
                .map(|row| {
                    let mut parts = row.group_key.splitn(3, '\x00');
                    let cgroup = parts.next().unwrap_or("");
                    let pcomm = parts.next().unwrap_or("");
                    let comm = parts.next().unwrap_or(pcomm);
                    HierRow {
                        cgroup,
                        pcomm,
                        comm,
                        row,
                    }
                })
                .collect();
            // Use pre-sorted row index as score so hierarchy honors
            // --sort-by. Rows arrive sorted by the chosen metric.
            // Lower index = higher priority (biggest mover first).
            let row_rank: BTreeMap<*const DiffRow, usize> = hier
                .iter()
                .enumerate()
                .map(|(i, h)| (h.row as *const DiffRow, i))
                .collect();
            let mut leaf_rank: BTreeMap<(&str, &str), usize> = BTreeMap::new();
            let mut cg_rank: BTreeMap<&str, usize> = BTreeMap::new();
            for h in &hier {
                let rank = row_rank[&(h.row as *const DiffRow)];
                let le = leaf_rank.entry((h.cgroup, h.pcomm)).or_insert(usize::MAX);
                if rank < *le {
                    *le = rank;
                }
                let ce = cg_rank.entry(h.cgroup).or_insert(usize::MAX);
                if rank < *ce {
                    *ce = rank;
                }
            }
            hier.sort_by(|a, b| {
                let cga = cg_rank.get(a.cgroup).copied().unwrap_or(usize::MAX);
                let cgb = cg_rank.get(b.cgroup).copied().unwrap_or(usize::MAX);
                cga.cmp(&cgb)
                    .then_with(|| {
                        let sa = leaf_rank
                            .get(&(a.cgroup, a.pcomm))
                            .copied()
                            .unwrap_or(usize::MAX);
                        let sb = leaf_rank
                            .get(&(b.cgroup, b.pcomm))
                            .copied()
                            .unwrap_or(usize::MAX);
                        sa.cmp(&sb)
                    })
                    .then_with(|| {
                        let ra = row_rank[&(a.row as *const DiffRow)];
                        let rb = row_rank[&(b.row as *const DiffRow)];
                        ra.cmp(&rb)
                    })
            });

            // Two-pass: first measure data-only widths, then build
            // the real table with heading rows constrained to those
            // widths so headings can't inflate columns.
            writeln!(w, "## Primary metrics")?;
            let mut last_segments: Vec<&str> = Vec::new();
            let mut last_pcomm = "";
            let mut table = display.new_constrained_table(&global_max_widths);
            table.set_header(colored_header_with_sort(
                &columns,
                "comm",
                diff.sort_metric_name,
            ));

            let depth_color = |depth: usize| -> comfy_table::Color {
                match depth {
                    0 => comfy_table::Color::Green,
                    1 => comfy_table::Color::Cyan,
                    _ => comfy_table::Color::DarkGrey,
                }
            };

            for h in &hier {
                let segments: Vec<&str> = h.cgroup.split('/').filter(|s| !s.is_empty()).collect();

                let common = segments
                    .iter()
                    .zip(last_segments.iter())
                    .take_while(|(a, b)| a == b)
                    .count();

                let cg_changed =
                    common < last_segments.len() || segments.len() > last_segments.len();
                if cg_changed {
                    for (depth, seg) in segments.iter().enumerate().skip(common) {
                        let indent = "  ".repeat(depth);
                        let label = format!("{indent}{seg}");
                        let heading_cells: Vec<comfy_table::Cell> = columns
                            .iter()
                            .map(|c| {
                                if *c == Column::Group {
                                    comfy_table::Cell::new(&label)
                                        .fg(depth_color(depth))
                                        .add_attribute(comfy_table::Attribute::Bold)
                                } else {
                                    comfy_table::Cell::new("")
                                }
                            })
                            .collect();
                        table.add_row(heading_cells);
                    }
                    last_segments = segments;
                    last_pcomm = "";
                }

                if h.pcomm != last_pcomm {
                    let cg_depth = last_segments.len();
                    let indent = "  ".repeat(cg_depth);
                    let label = format!("{indent}{}", h.pcomm);
                    let heading_cells: Vec<comfy_table::Cell> = columns
                        .iter()
                        .map(|c| {
                            if *c == Column::Group {
                                comfy_table::Cell::new(&label)
                                    .fg(comfy_table::Color::White)
                                    .add_attribute(comfy_table::Attribute::Bold)
                            } else {
                                comfy_table::Cell::new("")
                            }
                        })
                        .collect();
                    table.add_row(heading_cells);
                    last_pcomm = h.pcomm;
                }

                let mut string_cells = render_diff_row_cells(h.row, &columns);
                if let Some(pos) = columns.iter().position(|c| *c == Column::Group) {
                    let cg_depth = last_segments.len();
                    string_cells[pos] = format!("{}  {}", "  ".repeat(cg_depth + 1), h.comm);
                }
                let cells: Vec<comfy_table::Cell> = string_cells
                    .into_iter()
                    .zip(columns.iter())
                    .map(|(s, col)| {
                        color_diff_cell(s, *col, h.row.delta, h.row.uptime_pct, h.row.sort_by_delta)
                    })
                    .collect();
                table.add_row(cells);
            }
            writeln!(w, "{table}")?;
        } else if group_by == GroupBy::Cgroup {
            // Hierarchical cgroup rendering: group rows by parent
            // path, emit a sub-heading per parent, show only the
            // leaf segment in the group column.
            let mut by_parent: BTreeMap<&str, Vec<&DiffRow>> = BTreeMap::new();
            for row in &primary_rows {
                let (parent, _) = cgroup_parent_leaf(&row.display_key);
                by_parent.entry(parent).or_default().push(row);
            }
            for (parent, rows) in &by_parent {
                writeln!(w)?;
                writeln!(w, "\x1b[1;32m## {parent}\x1b[0m")?;
                let mut table = display.new_table();
                table.set_header(colored_header_with_sort(
                    &columns,
                    "cgroup",
                    diff.sort_metric_name,
                ));
                let cg_limit = if display.section_line_limit > 0 {
                    &rows[..rows.len().min(display.section_line_limit)]
                } else {
                    &rows[..]
                };
                for row in cg_limit {
                    let (_, leaf) = cgroup_parent_leaf(&row.display_key);
                    let mut string_cells = render_diff_row_cells(row, &columns);
                    // Replace group cell with leaf segment.
                    if let Some(pos) = columns.iter().position(|c| *c == Column::Group) {
                        string_cells[pos] = leaf.to_string();
                    }
                    let cells: Vec<comfy_table::Cell> = string_cells
                        .into_iter()
                        .zip(columns.iter())
                        .map(|(s, col)| {
                            color_diff_cell(s, *col, row.delta, row.uptime_pct, row.sort_by_delta)
                        })
                        .collect();
                    table.add_row(cells);
                }
                writeln!(w, "{table}")?;
            }
        } else {
            writeln!(w, "## Primary metrics")?;
            let mut table = display.new_table();
            table.set_header(colored_header_with_sort(
                &columns,
                group_header,
                diff.sort_metric_name,
            ));
            let limit_iter = if display.section_line_limit > 0 {
                &primary_rows[..primary_rows.len().min(display.section_line_limit)]
            } else {
                &primary_rows[..]
            };
            for row in limit_iter {
                let string_cells = render_diff_row_cells(row, &columns);
                let cells: Vec<comfy_table::Cell> = string_cells
                    .into_iter()
                    .zip(columns.iter())
                    .map(|(s, col)| {
                        color_diff_cell(s, *col, row.delta, row.uptime_pct, row.sort_by_delta)
                    })
                    .collect();
                table.add_row(cells);
            }
            writeln!(w, "{table}")?;
        }
    }

    // Derived-table outer gate mirrors the primary-table pattern:
    // open the table when EITHER `Section::Derived` (the eight
    // pre-existing derivations) OR `Section::TaskstatsDelay` (the
    // nine taskstats-derived rollups) is enabled. Per-row gating
    // below keeps `--sections taskstats-delay` from leaking
    // unrelated derivations into the table.
    if (display.is_section_enabled(Section::Derived)
        || display.is_section_enabled(Section::TaskstatsDelay))
        && !diff.derived_rows.is_empty()
    {
        let derived_rows: Vec<&DerivedRow> = diff
            .derived_rows
            .iter()
            .filter(|row| {
                if !display.is_metric_enabled(row.metric_name) {
                    return false;
                }
                let metric = CTPROF_DERIVED_METRICS
                    .iter()
                    .find(|d| d.name == row.metric_name)
                    .expect("derived metric_name from CTPROF_DERIVED_METRICS");
                display.is_section_enabled(metric.section)
            })
            .collect();

        // Build uptime lookup from primary rows for derived rendering.
        let uptime_map: BTreeMap<&str, Option<f64>> = diff
            .rows
            .iter()
            .map(|r| (r.group_key.as_str(), r.uptime_pct))
            .collect();

        if group_by == GroupBy::All {
            // Hierarchical derived rendering — same tree as primary.
            let limited: Vec<&DerivedRow> = if display.section_line_limit > 0 {
                derived_rows
                    .iter()
                    .copied()
                    .take(display.section_line_limit)
                    .collect()
            } else {
                derived_rows
            };
            struct DHier<'a> {
                cgroup: &'a str,
                pcomm: &'a str,
                comm: &'a str,
                row: &'a DerivedRow,
            }
            let mut hier: Vec<DHier<'_>> = limited
                .iter()
                .map(|row| {
                    let mut parts = row.group_key.splitn(3, '\x00');
                    let cg = parts.next().unwrap_or("");
                    let pc = parts.next().unwrap_or("");
                    let cm = parts.next().unwrap_or(pc);
                    DHier {
                        cgroup: cg,
                        pcomm: pc,
                        comm: cm,
                        row,
                    }
                })
                .collect();
            // Use pre-sorted row index as score so hierarchy honors
            // --sort-by. Rows arrive sorted by the chosen metric.
            // Lower index = higher priority (biggest mover first).
            let row_rank: BTreeMap<*const DerivedRow, usize> = hier
                .iter()
                .enumerate()
                .map(|(i, h)| (h.row as *const DerivedRow, i))
                .collect();
            let mut leaf_rank: BTreeMap<(&str, &str), usize> = BTreeMap::new();
            let mut cg_rank: BTreeMap<&str, usize> = BTreeMap::new();
            for h in &hier {
                let rank = row_rank[&(h.row as *const DerivedRow)];
                let le = leaf_rank.entry((h.cgroup, h.pcomm)).or_insert(usize::MAX);
                if rank < *le {
                    *le = rank;
                }
                let ce = cg_rank.entry(h.cgroup).or_insert(usize::MAX);
                if rank < *ce {
                    *ce = rank;
                }
            }
            hier.sort_by(|a, b| {
                let cga = cg_rank.get(a.cgroup).copied().unwrap_or(usize::MAX);
                let cgb = cg_rank.get(b.cgroup).copied().unwrap_or(usize::MAX);
                cga.cmp(&cgb)
                    .then_with(|| {
                        let sa = leaf_rank
                            .get(&(a.cgroup, a.pcomm))
                            .copied()
                            .unwrap_or(usize::MAX);
                        let sb = leaf_rank
                            .get(&(b.cgroup, b.pcomm))
                            .copied()
                            .unwrap_or(usize::MAX);
                        sa.cmp(&sb)
                    })
                    .then_with(|| {
                        let ra = row_rank[&(a.row as *const DerivedRow)];
                        let rb = row_rank[&(b.row as *const DerivedRow)];
                        ra.cmp(&rb)
                    })
            });

            writeln!(w)?;
            writeln!(w, "## Derived metrics")?;
            let mut dt = display.new_constrained_table(&global_max_widths);
            dt.set_header(colored_header_with_sort(
                &columns,
                "comm",
                diff.sort_metric_name,
            ));
            let mut last_segs: Vec<&str> = Vec::new();
            let mut last_pc = "";
            let depth_color = |d: usize| -> comfy_table::Color {
                match d {
                    0 => comfy_table::Color::Green,
                    1 => comfy_table::Color::Cyan,
                    _ => comfy_table::Color::DarkGrey,
                }
            };
            for h in &hier {
                let segs: Vec<&str> = h.cgroup.split('/').filter(|s| !s.is_empty()).collect();
                let common = segs
                    .iter()
                    .zip(last_segs.iter())
                    .take_while(|(a, b)| a == b)
                    .count();
                if common < last_segs.len() || segs.len() > last_segs.len() {
                    for (depth, seg) in segs.iter().enumerate().skip(common) {
                        let indent = "  ".repeat(depth);
                        let label = format!("{indent}{seg}");
                        let hcells: Vec<comfy_table::Cell> = columns
                            .iter()
                            .map(|c| {
                                if *c == Column::Group {
                                    comfy_table::Cell::new(&label)
                                        .fg(depth_color(depth))
                                        .add_attribute(comfy_table::Attribute::Bold)
                                } else {
                                    comfy_table::Cell::new("")
                                }
                            })
                            .collect();
                        dt.add_row(hcells);
                    }
                    last_segs = segs;
                    last_pc = "";
                }
                if h.pcomm != last_pc {
                    let cg_depth = last_segs.len();
                    let indent = "  ".repeat(cg_depth);
                    let label = format!("{indent}{}", h.pcomm);
                    let hcells: Vec<comfy_table::Cell> = columns
                        .iter()
                        .map(|c| {
                            if *c == Column::Group {
                                comfy_table::Cell::new(&label)
                                    .fg(comfy_table::Color::White)
                                    .add_attribute(comfy_table::Attribute::Bold)
                            } else {
                                comfy_table::Cell::new("")
                            }
                        })
                        .collect();
                    dt.add_row(hcells);
                    last_pc = h.pcomm;
                }
                let mut cells = render_derived_row_cells(h.row, &columns);
                if let Some(pos) = columns.iter().position(|c| *c == Column::Group) {
                    let cg_depth = last_segs.len();
                    cells[pos] = format!("{}  {}", "  ".repeat(cg_depth + 1), h.comm);
                }
                let colored: Vec<comfy_table::Cell> = cells
                    .into_iter()
                    .zip(columns.iter())
                    .map(|(s, col)| {
                        let up = uptime_map.get(h.row.group_key.as_str()).copied().flatten();
                        if *col == Column::Uptime {
                            let text = match up {
                                Some(pct) => format!("{pct:.0}%"),
                                None => "-".to_string(),
                            };
                            color_diff_cell(text, *col, h.row.delta, up, None)
                        } else {
                            color_diff_cell(s, *col, h.row.delta, up, None)
                        }
                    })
                    .collect();
                dt.add_row(colored);
            }
            writeln!(w, "{dt}")?;
        } else {
            writeln!(w)?;
            writeln!(w, "## Derived metrics")?;
            let mut dt = display.new_table();
            dt.set_header(colored_header_with_sort(
                &columns,
                group_header,
                diff.sort_metric_name,
            ));
            let d_limit = if display.section_line_limit > 0 {
                &derived_rows[..derived_rows.len().min(display.section_line_limit)]
            } else {
                &derived_rows[..]
            };
            for row in d_limit {
                let string_cells = render_derived_row_cells(row, &columns);
                let cells: Vec<comfy_table::Cell> = string_cells
                    .into_iter()
                    .zip(columns.iter())
                    .map(|(s, col)| {
                        let up = uptime_map.get(row.group_key.as_str()).copied().flatten();
                        if *col == Column::Uptime {
                            let text = match up {
                                Some(pct) => format!("{pct:.0}%"),
                                None => "-".to_string(),
                            };
                            color_diff_cell(text, *col, row.delta, up, None)
                        } else {
                            color_diff_cell(s, *col, row.delta, up, None)
                        }
                    })
                    .collect();
                dt.add_row(cells);
            }
            writeln!(w, "{dt}")?;
        }
    }

    // within the equal cluster.
    if display.is_section_enabled(Section::Smaps)
        && (!diff.smaps_rollup_a.is_empty() || !diff.smaps_rollup_b.is_empty())
    {
        let mut process_keys: std::collections::BTreeSet<&String> =
            diff.smaps_rollup_a.keys().collect();
        process_keys.extend(diff.smaps_rollup_b.keys());

        let max_field_for = |pkey: &&String, field: &str| -> u64 {
            let a = diff
                .smaps_rollup_a
                .get(*pkey)
                .and_then(|m| m.get(field).copied())
                .unwrap_or(0);
            let b = diff
                .smaps_rollup_b
                .get(*pkey)
                .and_then(|m| m.get(field).copied())
                .unwrap_or(0);
            a.max(b)
        };
        let abs_rss_delta = |pkey: &&String| -> u64 {
            let a = diff
                .smaps_rollup_a
                .get(*pkey)
                .and_then(|m| m.get("Rss").copied())
                .unwrap_or(0);
            let b = diff
                .smaps_rollup_b
                .get(*pkey)
                .and_then(|m| m.get("Rss").copied())
                .unwrap_or(0);
            (b as i128 - a as i128).unsigned_abs() as u64
        };
        let mut sorted_process_keys: Vec<&String> = process_keys.iter().copied().collect();
        sorted_process_keys.sort_by(|a, b| {
            abs_rss_delta(b)
                .cmp(&abs_rss_delta(a))
                .then_with(|| max_field_for(b, "Rss").cmp(&max_field_for(a, "Rss")))
                .then_with(|| a.cmp(b))
        });
        if display.section_line_limit > 0 {
            sorted_process_keys.truncate(display.section_line_limit);
        }

        // Pre-pass: any (process, key) pair with a non-equal
        // delta? Suppresses the section header when nothing
        // moved, even if both maps are populated.
        let any_delta = sorted_process_keys.iter().any(|pkey| {
            let a = diff.smaps_rollup_a.get(*pkey);
            let b = diff.smaps_rollup_b.get(*pkey);
            let mut keys: std::collections::BTreeSet<&String> =
                a.map(|m| m.keys().collect()).unwrap_or_default();
            if let Some(m) = b {
                keys.extend(m.keys());
            }
            keys.iter().any(|k| {
                let av = a.and_then(|m| m.get(*k).copied());
                let bv = b.and_then(|m| m.get(*k).copied());
                av != bv
            })
        });
        if any_delta {
            writeln!(w)?;
            writeln!(w, "## smaps_rollup")?;
            let mut st = if global_max_widths.is_empty() {
                display.new_table()
            } else {
                display.new_constrained_table(&global_max_widths)
            };
            st.set_header(colored_header_with_sort(
                &columns,
                "pcomm",
                diff.sort_metric_name,
            ));

            // For All mode, re-sort by cgroup hierarchy (keys are
            // compound cgroup\x00pcomm). Track segments for tree headings.
            let is_compound = group_by == GroupBy::All;
            let mut sorted_keys = sorted_process_keys.clone();
            if is_compound {
                sorted_keys.sort();
            }

            let mut last_segs: Vec<&str> = Vec::new();
            let depth_color = |d: usize| -> comfy_table::Color {
                match d {
                    0 => comfy_table::Color::Green,
                    1 => comfy_table::Color::Cyan,
                    _ => comfy_table::Color::DarkGrey,
                }
            };

            for pkey in &sorted_keys {
                let (cg_part, display_process) = if is_compound {
                    pkey.split_once('\x00').unwrap_or(("", pkey))
                } else {
                    ("", pkey.as_str())
                };

                if is_compound {
                    let segs: Vec<&str> = cg_part.split('/').filter(|s| !s.is_empty()).collect();
                    let common = segs
                        .iter()
                        .zip(last_segs.iter())
                        .take_while(|(a, b)| a == b)
                        .count();
                    if common < last_segs.len() || segs.len() > last_segs.len() {
                        for (depth, seg) in segs.iter().enumerate().skip(common) {
                            let indent = "  ".repeat(depth);
                            let label = format!("{indent}{seg}");
                            let hcells: Vec<comfy_table::Cell> = columns
                                .iter()
                                .map(|c| {
                                    if *c == Column::Group {
                                        comfy_table::Cell::new(&label)
                                            .fg(depth_color(depth))
                                            .add_attribute(comfy_table::Attribute::Bold)
                                    } else {
                                        comfy_table::Cell::new("")
                                    }
                                })
                                .collect();
                            st.add_row(hcells);
                        }
                        last_segs = segs;
                    }
                }

                let a = diff.smaps_rollup_a.get(*pkey);
                let b = diff.smaps_rollup_b.get(*pkey);
                let mut keys_union: std::collections::BTreeSet<&String> =
                    a.map(|m| m.keys().collect()).unwrap_or_default();
                if let Some(m) = b {
                    keys_union.extend(m.keys());
                }
                for sk in keys_union {
                    let av = a.and_then(|m| m.get(sk).copied());
                    let bv = b.and_then(|m| m.get(sk).copied());
                    if av == bv {
                        continue;
                    }
                    let a_cell = av
                        .map(|v| format_scaled_u64(v, ScaleLadder::Bytes))
                        .unwrap_or_else(|| "-".to_string());
                    let b_cell = bv
                        .map(|v| format_scaled_u64(v, ScaleLadder::Bytes))
                        .unwrap_or_else(|| "-".to_string());
                    let value_cell = format!("{a_cell} \u{2192} {b_cell}");
                    let a_val = av.unwrap_or(0);
                    let b_val = bv.unwrap_or(0);
                    let delta = b_val as i128 - a_val as i128;
                    let delta_cell = if av.is_none() || bv.is_none() {
                        "-".to_string()
                    } else {
                        format_delta_cell(delta as f64, ScaleLadder::Bytes)
                    };
                    let pct_cell = if a_val == 0 || av.is_none() || bv.is_none() {
                        "-".to_string()
                    } else {
                        let pct = (delta as f64 / a_val as f64) * 100.0;
                        format!("{pct:+.1}%")
                    };
                    let cg_depth = last_segs.len();
                    let group_label = format!("{}  {}", "  ".repeat(cg_depth + 1), display_process);
                    let delta_pct_opt: Option<f64> = if a_val > 0 && av.is_some() && bv.is_some() {
                        Some(delta as f64 / a_val as f64)
                    } else {
                        None
                    };
                    let string_cells: Vec<String> = columns
                        .iter()
                        .map(|c| match c {
                            Column::Group => group_label.clone(),
                            Column::Threads => String::new(),
                            Column::Metric => sk.clone(),
                            Column::Arrow => value_cell.clone(),
                            Column::Delta => delta_cell.clone(),
                            Column::Pct => pct_cell.clone(),
                            Column::Uptime => String::new(),
                            _ => String::new(),
                        })
                        .collect();
                    let cells: Vec<comfy_table::Cell> = string_cells
                        .into_iter()
                        .zip(columns.iter())
                        .map(|(s, col)| color_diff_cell(s, *col, delta_pct_opt, None, None))
                        .collect();
                    st.add_row(cells);
                }
            }
            writeln!(w, "{st}")?;
        }
    }

    if group_by == GroupBy::Cgroup
        && (!diff.cgroup_stats_a.is_empty() || !diff.cgroup_stats_b.is_empty())
    {
        // CgroupStats / Limits / MemoryStat / MemoryEvents /
        // Pressure all live behind the GroupBy::Cgroup gate
        // because their data only exists when the diff was
        // computed with cgroup grouping. The `--sections` filter
        // is checked again per sub-table below so a user can
        // request, e.g., `--sections pressure` and get only the
        // PSI rollups even though the cgroup-stats prefix is
        // present in the diff.
        let mut all_keys: std::collections::BTreeSet<&String> =
            diff.cgroup_stats_a.keys().collect();
        all_keys.extend(diff.cgroup_stats_b.keys());

        if display.is_section_enabled(Section::CgroupStats) {
            writeln!(w)?;
            let mut ct = display.new_table();
            ct.set_header(vec![
                "cgroup",
                "cpu_usage_usec",
                "nr_throttled",
                "throttled_usec",
                "memory_current",
            ]);
            for key in &all_keys {
                let a = diff.cgroup_stats_a.get(*key);
                let b = diff.cgroup_stats_b.get(*key);
                ct.add_row(vec![
                    key.to_string(),
                    cgroup_cell(
                        a.map(|s| s.cpu.usage_usec),
                        b.map(|s| s.cpu.usage_usec),
                        ScaleLadder::Us,
                    ),
                    cgroup_cell(
                        a.map(|s| s.cpu.nr_throttled),
                        b.map(|s| s.cpu.nr_throttled),
                        ScaleLadder::Unitless,
                    ),
                    cgroup_cell(
                        a.map(|s| s.cpu.throttled_usec),
                        b.map(|s| s.cpu.throttled_usec),
                        ScaleLadder::Us,
                    ),
                    cgroup_cell(
                        a.map(|s| s.memory.current),
                        b.map(|s| s.memory.current),
                        ScaleLadder::Bytes,
                    ),
                ]);
            }
            writeln!(w, "{ct}")?;
        }

        // Per-cgroup limits / knobs sub-table — operator-set
        // configuration: cpu.max, cpu.weight, memory.max,
        // memory.high, pids.current/max. Cells render as
        // baseline → candidate. `Option<u64>` limits show "max"
        // when None per [`format_optional_limit`]. Suppressed
        // when no cgroup in either snapshot exposes any of these
        // (root cgroup, controllers not enabled, etc.).
        if display.is_section_enabled(Section::Limits) {
            let any_limits = all_keys.iter().any(|key| {
                let has_limits = |s: &CgroupStats| {
                    s.cpu.max_quota_us.is_some()
                        || s.cpu.weight.is_some()
                        || s.memory.max.is_some()
                        || s.memory.high.is_some()
                        || s.pids.current.is_some()
                        || s.pids.max.is_some()
                };
                diff.cgroup_stats_a.get(*key).is_some_and(has_limits)
                    || diff.cgroup_stats_b.get(*key).is_some_and(has_limits)
            });
            if any_limits {
                writeln!(w)?;
                writeln!(w, "## Cgroup limits / knobs")?;
                let mut lt = display.new_table();
                lt.set_header(vec![
                    "cgroup",
                    "cpu.max",
                    "cpu.weight",
                    "memory.max",
                    "memory.high",
                    "pids.current",
                    "pids.max",
                ]);
                for key in &all_keys {
                    let a = diff.cgroup_stats_a.get(*key);
                    let b = diff.cgroup_stats_b.get(*key);
                    // Per-row gate: skip rows where every column is
                    // unset on BOTH sides (the cgroup has no caps,
                    // no weight, no pids accounting on either
                    // baseline or candidate).
                    let row_has_data = |s: &CgroupStats| {
                        s.cpu.max_quota_us.is_some()
                            || s.cpu.weight.is_some()
                            || s.memory.max.is_some()
                            || s.memory.high.is_some()
                            || s.pids.current.is_some()
                            || s.pids.max.is_some()
                    };
                    if !a.is_some_and(row_has_data) && !b.is_some_and(row_has_data) {
                        continue;
                    }
                    lt.add_row(vec![
                        key.to_string(),
                        cgroup_limits_cell(
                            a.map(|s| (s.cpu.max_quota_us, s.cpu.max_period_us)),
                            b.map(|s| (s.cpu.max_quota_us, s.cpu.max_period_us)),
                        ),
                        cgroup_cell(
                            a.and_then(|s| s.cpu.weight),
                            b.and_then(|s| s.cpu.weight),
                            ScaleLadder::Unitless,
                        ),
                        cgroup_optional_limit_cell(
                            a.and_then(|s| s.memory.max),
                            b.and_then(|s| s.memory.max),
                            ScaleLadder::Bytes,
                        ),
                        cgroup_optional_limit_cell(
                            a.and_then(|s| s.memory.high),
                            b.and_then(|s| s.memory.high),
                            ScaleLadder::Bytes,
                        ),
                        cgroup_cell(
                            a.and_then(|s| s.pids.current),
                            b.and_then(|s| s.pids.current),
                            ScaleLadder::Unitless,
                        ),
                        cgroup_optional_limit_cell(
                            a.and_then(|s| s.pids.max),
                            b.and_then(|s| s.pids.max),
                            ScaleLadder::Unitless,
                        ),
                    ]);
                }
                writeln!(w, "{lt}")?;
            }
        }

        // Per-cgroup memory.stat sub-table — kernel-emitted
        // memory counters per cgroup. Up to 71 keys per cgroup.
        // Long-table layout: one row per (cgroup, key) pair
        // with baseline → candidate cells.
        if display.is_section_enabled(Section::MemoryStat)
            && all_keys.iter().any(|key| {
                let has_stat = |s: &CgroupStats| !s.memory.stat.is_empty();
                diff.cgroup_stats_a.get(*key).is_some_and(has_stat)
                    || diff.cgroup_stats_b.get(*key).is_some_and(has_stat)
            })
        {
            writeln!(w)?;
            writeln!(w, "## memory.stat")?;
            let mut mt = display.new_table();
            mt.set_header(vec!["cgroup", "key", "value"]);
            for key in &all_keys {
                let a = diff.cgroup_stats_a.get(*key);
                let b = diff.cgroup_stats_b.get(*key);
                let mut keys_union: std::collections::BTreeSet<&String> = a
                    .map(|s| s.memory.stat.keys().collect())
                    .unwrap_or_default();
                if let Some(s) = b {
                    keys_union.extend(s.memory.stat.keys());
                }
                for stat_key in keys_union {
                    let av = a.and_then(|s| s.memory.stat.get(stat_key).copied());
                    let bv = b.and_then(|s| s.memory.stat.get(stat_key).copied());
                    // Compare-side zero-row suppression: skip
                    // rows where baseline equals candidate. With
                    // 71 keys × N cgroups the table is dominated
                    // by unchanged values; surfacing only the
                    // movers cuts output ~10x for typical runs.
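                    // NOTE(review): `av` / `bv` are `Option`s, so
                    // the equality test below treats absent
                    // (`None`) and explicit 0 (`Some(0)`) as
                    // different — a key that is 0 on one side and
                    // missing on the other still renders (as
                    // "0" vs "-"). Confirm that is intended.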
                    if av == bv {
                        continue;
                    }
                    mt.add_row(vec![
                        key.to_string(),
                        stat_key.clone(),
                        cgroup_cell(av, bv, ScaleLadder::Unitless),
                    ]);
                }
            }
            writeln!(w, "{mt}")?;
        }

        // Per-cgroup memory.events sub-table — pressure-event
        // counters. Same long-table layout as memory.stat with
        // the same baseline-vs-candidate zero-row suppression.
        if display.is_section_enabled(Section::MemoryEvents)
            && all_keys.iter().any(|key| {
                let has_events = |s: &CgroupStats| !s.memory.events.is_empty();
                diff.cgroup_stats_a.get(*key).is_some_and(has_events)
                    || diff.cgroup_stats_b.get(*key).is_some_and(has_events)
            })
        {
            writeln!(w)?;
            writeln!(w, "## memory.events")?;
            let mut et = display.new_table();
            et.set_header(vec!["cgroup", "event", "count"]);
            for key in &all_keys {
                let a = diff.cgroup_stats_a.get(*key);
                let b = diff.cgroup_stats_b.get(*key);
                let mut keys_union: std::collections::BTreeSet<&String> = a
                    .map(|s| s.memory.events.keys().collect())
                    .unwrap_or_default();
                if let Some(s) = b {
                    keys_union.extend(s.memory.events.keys());
                }
                for event_key in keys_union {
                    let av = a.and_then(|s| s.memory.events.get(event_key).copied());
                    let bv = b.and_then(|s| s.memory.events.get(event_key).copied());
                    if av == bv {
                        continue;
                    }
                    et.add_row(vec![
                        key.to_string(),
                        event_key.clone(),
                        cgroup_cell(av, bv, ScaleLadder::Unitless),
                    ]);
                }
            }
            writeln!(w, "{et}")?;
        }

        // Per-cgroup PSI sub-tables — one per resource, each
        // with `some`+`full` rows × `avg10/60/300/total` columns.
        // Mirrors the show-side layout but with
        // baseline→candidate→delta cells rather than single
        // values. Suppressed when every cell on both sides is
        // zero — synthetic fixtures and PSI-disabled hosts both
        // hit this case and there's nothing useful to render.
        if display.is_section_enabled(Section::Pressure) {
            for (resource_name, accessor) in psi_resource_accessors() {
                let any_data = all_keys.iter().any(|key| {
                    let a = diff.cgroup_stats_a.get(*key).map(|s| accessor(&s.psi));
                    let b = diff.cgroup_stats_b.get(*key).map(|s| accessor(&s.psi));
                    a.as_ref().is_some_and(psi_resource_has_data)
                        || b.as_ref().is_some_and(psi_resource_has_data)
                });
                if !any_data {
                    continue;
                }
                writeln!(w)?;
                writeln!(w, "## Pressure / {resource_name}")?;
                let mut pt = display.new_table();
                pt.set_header(vec!["cgroup", "row", "avg10", "avg60", "avg300", "total"]);
                for key in &all_keys {
                    let a = diff.cgroup_stats_a.get(*key).map(|s| accessor(&s.psi));
                    let b = diff.cgroup_stats_b.get(*key).map(|s| accessor(&s.psi));
                    pt.add_row(vec![
                        key.to_string(),
                        "some".into(),
                        format_psi_avg_cell(a.map(|r| r.some.avg10), b.map(|r| r.some.avg10)),
                        format_psi_avg_cell(a.map(|r| r.some.avg60), b.map(|r| r.some.avg60)),
                        format_psi_avg_cell(a.map(|r| r.some.avg300), b.map(|r| r.some.avg300)),
                        cgroup_cell(
                            a.map(|r| r.some.total_usec),
                            b.map(|r| r.some.total_usec),
                            ScaleLadder::Us,
                        ),
                    ]);
                    pt.add_row(vec![
                        key.to_string(),
                        "full".into(),
                        format_psi_avg_cell(a.map(|r| r.full.avg10), b.map(|r| r.full.avg10)),
                        format_psi_avg_cell(a.map(|r| r.full.avg60), b.map(|r| r.full.avg60)),
                        format_psi_avg_cell(a.map(|r| r.full.avg300), b.map(|r| r.full.avg300)),
                        cgroup_cell(
                            a.map(|r| r.full.total_usec),
                            b.map(|r| r.full.total_usec),
                            ScaleLadder::Us,
                        ),
                    ]);
                }
                writeln!(w, "{pt}")?;
            }
        }
    }

    // Host-level PSI compare — one sub-table per resource. Runs
    // independent of `GroupBy` because host pressure is the
    // primary scheduler-health signal regardless of which axis
    // the user grouped per-thread metrics by. Suppressed when
    // both snapshots' host PSI is all-zero.
    if display.is_section_enabled(Section::HostPressure)
        && psi_pair_has_data(&diff.host_psi_a, &diff.host_psi_b)
    {
        for (resource_name, accessor) in psi_resource_accessors() {
            let a = accessor(&diff.host_psi_a);
            let b = accessor(&diff.host_psi_b);
            if !psi_resource_has_data(&a) && !psi_resource_has_data(&b) {
                continue;
            }
            writeln!(w)?;
            writeln!(w, "## Host pressure / {resource_name}")?;
            let mut pt = display.new_table();
            pt.set_header(vec!["row", "avg10", "avg60", "avg300", "total"]);
            pt.add_row(vec![
                "some".into(),
                format_psi_avg_cell(Some(a.some.avg10), Some(b.some.avg10)),
                format_psi_avg_cell(Some(a.some.avg60), Some(b.some.avg60)),
                format_psi_avg_cell(Some(a.some.avg300), Some(b.some.avg300)),
                cgroup_cell(
                    Some(a.some.total_usec),
                    Some(b.some.total_usec),
                    ScaleLadder::Us,
                ),
            ]);
            pt.add_row(vec![
                "full".into(),
                format_psi_avg_cell(Some(a.full.avg10), Some(b.full.avg10)),
                format_psi_avg_cell(Some(a.full.avg60), Some(b.full.avg60)),
                format_psi_avg_cell(Some(a.full.avg300), Some(b.full.avg300)),
                cgroup_cell(
                    Some(a.full.total_usec),
                    Some(b.full.total_usec),
                    ScaleLadder::Us,
                ),
            ]);
            writeln!(w, "{pt}")?;
        }
    }

    // Per-process smaps_rollup compare. Iterates the union of
    // process keys across both snapshots; one row per
    // (process, key) pair carrying baseline → candidate kB
    // values rendered through the existing "B" auto-scale
    // ladder after kB → bytes conversion. Suppressed when
    // neither side has any smaps_rollup data; per-row gate
    // skips rows where baseline equals candidate (treats
    // absent and 0 as equal). Mirrors the memory.stat compare
    // layout.
    //
    // Process iteration order: descending by max(Rss baseline,
    // Rss candidate), tiebreak by descending max-Pss, final
    // tiebreak alphabetical. Memory-heavy processes that moved
    // between snapshots surface first; the renderer then walks
    // each process's per-field smaps keys in BTreeSet
    // (alphabetical) order so within-process row ordering stays
    // deterministic. Rss is the primary "how much memory does
    // this process hold" signal; Pss carries proportional set
    // size (per `fs/proc/task_mmu.c::smap_account`) and breaks
    // ties when two processes report equal Rss but differ in
    // shared-page accounting. Processes missing both keys sort
    // last under `unwrap_or(0)` and preserve alphabetical order.
    //
    // Global sched_ext sysfs compare. Suppressed when both
    // sides are None (CONFIG_SCHED_CLASS_EXT=n on both kernels)
    // OR when both sides are Some and every field is identical
    // across baseline and candidate (no signal to surface). When
    // exactly one side is Some, surface the configuration delta
    // — that's a load-bearing signal that the host kernel
    // changed between snapshots.
    let scx_emit = match (&diff.sched_ext_a, &diff.sched_ext_b) {
        (None, None) => false,
        (Some(_), None) | (None, Some(_)) => true,
        (Some(a), Some(b)) => {
            a.state != b.state
                || a.switch_all != b.switch_all
                || a.nr_rejected != b.nr_rejected
                || a.hotplug_seq != b.hotplug_seq
                || a.enable_seq != b.enable_seq
        }
    };
    if display.is_section_enabled(Section::SchedExt) && scx_emit {
        writeln!(w)?;
        writeln!(w, "## sched_ext")?;
        let mut at = display.new_table();
        at.set_header(vec!["attr", "value"]);
        // state cell: render "-" for both absent (Option=None)
        // AND for the empty-string-but-Some case (file
        // unreadable but directory present). The "-" placeholder
        // makes "no observation" visually distinct from a real
        // sched_ext_state_str[] value.
        fn state_cell_for(s: Option<&crate::ctprof::SchedExtSysfs>) -> String {
            match s {
                None => "-".to_string(),
                Some(scx) if scx.state.is_empty() => "-".to_string(),
                Some(scx) => scx.state.clone(),
            }
        }
        let state_a = state_cell_for(diff.sched_ext_a.as_ref());
        let state_b = state_cell_for(diff.sched_ext_b.as_ref());
        let state_cell = if state_a == state_b {
            state_a
        } else {
            format!("{state_a} → {state_b}")
        };
        at.add_row(vec!["state".into(), state_cell]);
        at.add_row(vec![
            "switch_all".into(),
            cgroup_cell(
                diff.sched_ext_a.as_ref().map(|s| s.switch_all),
                diff.sched_ext_b.as_ref().map(|s| s.switch_all),
                ScaleLadder::Unitless,
            ),
        ]);
        at.add_row(vec![
            "nr_rejected".into(),
            cgroup_cell(
                diff.sched_ext_a.as_ref().map(|s| s.nr_rejected),
                diff.sched_ext_b.as_ref().map(|s| s.nr_rejected),
                ScaleLadder::Unitless,
            ),
        ]);
        at.add_row(vec![
            "hotplug_seq".into(),
            cgroup_cell(
                diff.sched_ext_a.as_ref().map(|s| s.hotplug_seq),
                diff.sched_ext_b.as_ref().map(|s| s.hotplug_seq),
                ScaleLadder::Unitless,
            ),
        ]);
        at.add_row(vec![
            "enable_seq".into(),
            cgroup_cell(
                diff.sched_ext_a.as_ref().map(|s| s.enable_seq),
                diff.sched_ext_b.as_ref().map(|s| s.enable_seq),
                ScaleLadder::Unitless,
            ),
        ]);
        writeln!(w, "{at}")?;
    }

    let write_only_list = |w: &mut W, label: &str, path: &Path, keys: &[String]| -> fmt::Result {
        if keys.is_empty() {
            return Ok(());
        }
        writeln!(
            w,
            "\n{} group(s) only in {label} ({}):",
            keys.len(),
            path.display()
        )?;
        if group_by == GroupBy::All {
            let mut sorted: Vec<&str> = keys.iter().map(|s| s.as_str()).collect();
            sorted.sort();
            let mut last_segs: Vec<&str> = Vec::new();
            for k in &sorted {
                let (cg, pc) = k.split_once('\x00').unwrap_or(("", k));
                let segs: Vec<&str> = cg.split('/').filter(|s| !s.is_empty()).collect();
                let common = segs
                    .iter()
                    .zip(last_segs.iter())
                    .take_while(|(a, b)| a == b)
                    .count();
                if common < last_segs.len() || segs.len() > last_segs.len() {
                    for (depth, seg) in segs.iter().enumerate().skip(common) {
                        let indent = "  ".repeat(depth + 1);
                        writeln!(w, "{indent}{seg}")?;
                    }
                    last_segs = segs;
                }
                let indent = "  ".repeat(last_segs.len() + 1);
                writeln!(w, "{indent}{pc}")?;
            }
        } else {
            for k in keys {
                writeln!(w, "  {k}")?;
            }
        }
        Ok(())
    };
    write_only_list(w, "baseline", baseline_path, &diff.only_baseline)?;
    write_only_list(w, "candidate", candidate_path, &diff.only_candidate)?;

    if !diff.fudged_pairs.is_empty() {
        writeln!(
            w,
            "\n\x1b[1;33m## Fudged cgroup matches ({} pair(s))\x1b[0m",
            diff.fudged_pairs.len()
        )?;
        for fp in &diff.fudged_pairs {
            writeln!(w, "\n  \x1b[36mbaseline:\x1b[0m {}", fp.baseline_key)?;
            writeln!(w, "  \x1b[36mcandidate:\x1b[0m {}", fp.candidate_key)?;
            writeln!(
                w,
                "  overlap: {} thread types, Jaccard: {:.1}%, cascaded children: {}",
                fp.overlap,
                fp.jaccard * 100.0,
                fp.cascaded_children
            )?;
            if !fp.residual_baseline.is_empty() {
                writeln!(
                    w,
                    "  residual (baseline only): {}",
                    fp.residual_baseline.join(", ")
                )?;
            }
            if !fp.residual_candidate.is_empty() {
                writeln!(
                    w,
                    "  residual (candidate only): {}",
                    fp.residual_candidate.join(", ")
                )?;
            }
        }
    }

    Ok(())
}

/// Render a `(baseline, candidate, delta)` cell for the
/// cgroup-enrichment secondary table emitted under
/// [`GroupBy::Cgroup`]. The `ladder` parameter routes each
/// scalar through `auto_scale` (private to this module) so a
/// 7.5 GiB `memory_current` row reads
/// `7.500GiB → 8.250GiB (+768.000MiB)` instead of
/// `8053063680 → 8858370048 (+805306368)`. Each cell scales
/// independently — baseline, candidate, and delta may pick
/// different prefixes when their magnitudes cross thresholds.
///
/// See [`ScaleLadder`] for the closed enumeration of supported
/// ladder families and per-variant step-up rules. The variants
/// most relevant to cgroup-render call sites:
/// - [`ScaleLadder::Us`]: cgroup `cpu_usage_usec` /
///   `throttled_usec` / PSI `total_usec`.
/// - [`ScaleLadder::Bytes`]: `memory_current` / `memory.max` /
///   `memory.high` (IEC binary, B → KiB → MiB → GiB → TiB).
/// - [`ScaleLadder::Unitless`]: `nr_throttled` / `cpu.weight` /
///   `pids.current` / sched_ext attribute counters (decimal
///   SI, "" → K → M → G).
pub fn cgroup_cell(baseline: Option<u64>, candidate: Option<u64>, ladder: ScaleLadder) -> String {
    match (baseline, candidate) {
        (Some(baseline), Some(candidate)) => {
            let baseline_cell = format_scaled_u64(baseline, ladder);
            let candidate_cell = format_scaled_u64(candidate, ladder);
            let d = candidate as i128 - baseline as i128;
            // Delta is signed; route via format_delta_cell so the
            // sign is rendered explicitly and the auto-scale step
            // applies. i128 → f64 cast is lossy at extreme
            // magnitudes (>2^53) but cgroup counters on typical
            // hosts stay well under that ceiling.
            let delta_cell = format_delta_cell(d as f64, ladder);
            format!("{baseline_cell} → {candidate_cell} ({delta_cell})")
        }
        (Some(baseline), None) => format!("{} → -", format_scaled_u64(baseline, ladder)),
        (None, Some(candidate)) => format!("- → {}", format_scaled_u64(candidate, ladder)),
        (None, None) => "-".to_string(),
    }
}

/// Render a baseline→candidate→delta cell for a PSI average
/// field. `baseline` and `candidate` are centi-percent (0..=10000
/// covering 0.00..=100.00 %); the cell renders each as `N.NN%`
/// and computes a signed delta `(+|-D.DD%)`. Mirrors
/// [`cgroup_cell`]'s structure but does NOT route through the
/// auto-scale ladder — a pressure percentage is dimensionless
/// and topping out at 100 means there's nothing to scale.
pub fn format_psi_avg_cell(baseline: Option<u16>, candidate: Option<u16>) -> String {
    match (baseline, candidate) {
        (Some(b), Some(c)) => {
            let baseline_cell = format_psi_avg_centi_percent(b);
            let candidate_cell = format_psi_avg_centi_percent(c);
            let d = c as i32 - b as i32;
            let sign = if d >= 0 { "+" } else { "-" };
            let abs = d.unsigned_abs();
            let delta_int = abs / 100;
            let delta_frac = abs % 100;
            format!("{baseline_cell} → {candidate_cell} ({sign}{delta_int}.{delta_frac:02}%)")
        }
        (Some(b), None) => format!("{} → -", format_psi_avg_centi_percent(b)),
        (None, Some(c)) => format!("- → {}", format_psi_avg_centi_percent(c)),
        (None, None) => "-".to_string(),
    }
}

/// Convert a centi-percent value (0..=10000) to its display
/// form `N.NN%`. The centi-percent representation is 1:1 with
/// the kernel's `LOAD_INT.LOAD_FRAC` 2-decimal-digit emission at
/// `kernel/sched/psi.c:1284` — preserve that precision on
/// display.
pub fn format_psi_avg_centi_percent(v: u16) -> String {
    let int = v / 100;
    let frac = v % 100;
    format!("{int}.{frac:02}%")
}

/// One entry in the [`psi_resource_accessors`] table — a
/// display name paired with the accessor that pulls one
/// [`PsiResource`] out of a [`Psi`] bundle.
type PsiAccessor = (&'static str, fn(&Psi) -> PsiResource);

/// Returns the four PSI resource accessors paired with their
/// display names. Single source of truth for compare-side
/// rendering — adding a fifth resource means one edit here.
fn psi_resource_accessors() -> [PsiAccessor; 4] {
    [
        ("cpu", |p| p.cpu),
        ("memory", |p| p.memory),
        ("io", |p| p.io),
        ("irq", |p| p.irq),
    ]
}

/// Returns true when either side of a [`Psi`] pair has any
/// non-zero data. Used to suppress a host-pressure or
/// per-cgroup-pressure section when both sides are flat zero.
fn psi_pair_has_data(a: &Psi, b: &Psi) -> bool {
    psi_has_data(a) || psi_has_data(b)
}

fn psi_has_data(p: &Psi) -> bool {
    [p.cpu, p.memory, p.io, p.irq]
        .iter()
        .any(psi_resource_has_data)
}

/// True when either the `some` or the `full` half of `r` has a
/// non-zero average (`avg10` / `avg60` / `avg300`) or a non-zero
/// `total_usec`.
fn psi_resource_has_data(r: &PsiResource) -> bool {
    fn half_is_nonzero(half: &PsiHalf) -> bool {
        !(half.avg10 == 0 && half.avg60 == 0 && half.avg300 == 0 && half.total_usec == 0)
    }
    half_is_nonzero(&r.some) || half_is_nonzero(&r.full)
}

#[cfg(test)]
#[allow(clippy::field_reassign_with_default)]
mod tests {
    use super::*;
    use crate::metric_types::{
        Bytes, CategoricalString, CpuSet, MonotonicCount, MonotonicNs, OrdinalI32, PeakNs,
    };

    /// Fixture: a `ThreadState` for thread `comm` inside process
    /// `pcomm`.
    ///
    /// tid/tgid are 1, the cgroup is `/`, the policy is
    /// `SCHED_OTHER`, nice is 0 and the affinity covers CPUs 0-3.
    /// Every remaining field keeps its `ThreadState::default()`
    /// value, so a test only sets the metrics it cares about.
    fn make_thread(pcomm: &str, comm: &str) -> ThreadState {
        let mut thread = ThreadState::default();
        thread.tid = 1;
        thread.tgid = 1;
        thread.pcomm = pcomm.into();
        thread.comm = comm.into();
        thread.cgroup = "/".into();
        thread.start_time_clock_ticks = 0;
        thread.policy = CategoricalString("SCHED_OTHER".into());
        thread.nice = OrdinalI32(0);
        thread.cpu_affinity = CpuSet(vec![0, 1, 2, 3]);
        thread
    }

    /// Fixture: wrap `threads` in a `CtprofSnapshot` whose other
    /// fields are all empty — timestamp 0, no host / summaries /
    /// sched_ext, an empty cgroup-stats map and default PSI.
    fn snap_with(threads: Vec<ThreadState>) -> CtprofSnapshot {
        let psi = crate::ctprof::Psi::default();
        let cgroup_stats = BTreeMap::new();
        CtprofSnapshot {
            threads,
            cgroup_stats,
            psi,
            captured_at_unix_ns: 0,
            host: None,
            sched_ext: None,
            probe_summary: None,
            parse_summary: None,
            taskstats_summary: None,
        }
    }

    /// Fixture: a default `CgroupStats` with only the four
    /// counters most compare tests assert on filled in — three
    /// cpu fields (`usage_usec`, `nr_throttled`,
    /// `throttled_usec`) and `memory.current`. Exists so call
    /// sites do not repeat the nested field assignments.
    fn simple_cgroup_stats(
        cpu_usage_usec: u64,
        nr_throttled: u64,
        throttled_usec: u64,
        memory_current: u64,
    ) -> CgroupStats {
        let mut stats = CgroupStats::default();
        stats.memory.current = memory_current;
        let cpu = &mut stats.cpu;
        cpu.usage_usec = cpu_usage_usec;
        cpu.nr_throttled = nr_throttled;
        cpu.throttled_usec = throttled_usec;
        stats
    }

    /// `AggRule::SumNs` adds the per-thread values of a group:
    /// 1_000 + 3_000 must come back as `Sum(4_000)`.
    #[test]
    fn sum_aggregation_totals_across_group() {
        let mut first = make_thread("app", "w1");
        let mut second = make_thread("app", "w2");
        first.run_time_ns = MonotonicNs(1_000);
        second.run_time_ns = MonotonicNs(3_000);
        let agg = aggregate(AggRule::SumNs(|t| t.run_time_ns), &[&first, &second]);
        let Aggregated::Sum(total) = &agg else {
            panic!("expected Sum, got {agg:?}");
        };
        assert_eq!(*total, 4_000);
    }

    /// Summing `u64::MAX` with another positive value must pin
    /// the aggregate at `u64::MAX` rather than wrap.
    #[test]
    fn sum_saturates_on_overflow() {
        let mut maxed = make_thread("app", "w1");
        let mut extra = make_thread("app", "w2");
        maxed.run_time_ns = MonotonicNs(u64::MAX);
        extra.run_time_ns = MonotonicNs(5);
        let agg = aggregate(AggRule::SumNs(|t| t.run_time_ns), &[&maxed, &extra]);
        match &agg {
            Aggregated::Sum(total) => assert_eq!(*total, u64::MAX),
            _ => panic!("expected Sum, got {agg:?}"),
        }
    }

    /// `AggRule::RangeI32` reports the lowest and highest value
    /// seen in the group: nice -5 and 10 yield min=-5, max=10.
    #[test]
    fn ordinal_range_picks_extremes() {
        let mut low = make_thread("app", "w1");
        let mut high = make_thread("app", "w2");
        low.nice = OrdinalI32(-5);
        high.nice = OrdinalI32(10);
        let agg = aggregate(AggRule::RangeI32(|t| t.nice), &[&low, &high]);
        let Aggregated::OrdinalRange { min, max } = &agg else {
            panic!("expected OrdinalRange, got {agg:?}");
        };
        assert_eq!(*min, -5);
        assert_eq!(*max, 10);
    }

    /// `AggRule::Mode` reports the most common value plus how
    /// many of the group's threads carry it: two `SCHED_OTHER`
    /// against one `SCHED_FIFO` yields `SCHED_OTHER`, 2 of 3.
    #[test]
    fn mode_aggregation_picks_most_frequent() {
        let mut other_one = make_thread("app", "w1");
        let mut other_two = make_thread("app", "w2");
        let mut fifo = make_thread("app", "w3");
        other_one.policy = "SCHED_OTHER".into();
        other_two.policy = "SCHED_OTHER".into();
        fifo.policy = "SCHED_FIFO".into();
        let agg = aggregate(
            AggRule::Mode(|t| t.policy.clone()),
            &[&other_one, &other_two, &fifo],
        );
        let Aggregated::Mode {
            value,
            count,
            total,
        } = &agg
        else {
            panic!("expected Mode, got {agg:?}");
        };
        assert_eq!(value, "SCHED_OTHER");
        assert_eq!(*count, 2);
        assert_eq!(*total, 3);
    }

    /// When every thread shares the same affinity (CPUs 0-3 from
    /// the fixture), the summary keeps that set in `uniform` and
    /// min/max CPU counts are both 4.
    #[test]
    fn affinity_uniform_preserves_cpuset() {
        let threads = [make_thread("app", "w1"), make_thread("app", "w2")];
        let agg = aggregate(
            AggRule::Affinity(|t| t.cpu_affinity.clone()),
            &[&threads[0], &threads[1]],
        );
        let Aggregated::Affinity(summary) = &agg else {
            panic!("expected Affinity, got {agg:?}");
        };
        assert_eq!(summary.min_cpus, 4);
        assert_eq!(summary.max_cpus, 4);
        assert_eq!(summary.uniform, Some(vec![0, 1, 2, 3]));
    }

    /// Threads with different affinities (4 CPUs vs 2 CPUs) get
    /// min/max counts of 2 and 4 and no `uniform` set.
    #[test]
    fn affinity_heterogeneous_drops_uniform() {
        let wide = make_thread("app", "w1");
        let mut narrow = make_thread("app", "w2");
        narrow.cpu_affinity = CpuSet(vec![4, 5]);
        let agg = aggregate(
            AggRule::Affinity(|t| t.cpu_affinity.clone()),
            &[&wide, &narrow],
        );
        match &agg {
            Aggregated::Affinity(summary) => {
                assert_eq!(summary.min_cpus, 2);
                assert_eq!(summary.max_cpus, 4);
                assert!(summary.uniform.is_none());
            }
            _ => panic!("expected Affinity, got {agg:?}"),
        }
    }

    /// `format_cpu_range` renders a CPU list in range notation:
    /// a contiguous run becomes `first-last`, separate runs are
    /// comma-joined, a lone CPU prints as itself, and an empty
    /// list yields an empty string.
    #[test]
    fn format_cpu_range_collapses_contiguous_runs() {
        assert_eq!(format_cpu_range(&[0, 1, 2, 3]), "0-3");
        assert_eq!(format_cpu_range(&[0, 1, 4, 5, 7]), "0-1,4-5,7");
        assert_eq!(format_cpu_range(&[3]), "3");
        assert_eq!(format_cpu_range(&[]), "");
    }

    /// A path matching a flatten glob is replaced by the pattern
    /// string itself.
    #[test]
    fn flatten_cgroup_path_collapses_via_pattern() {
        let patterns = compile_flatten_patterns(&["/kubepods/*/workload".into()]);
        assert_eq!(
            flatten_cgroup_path("/kubepods/pod-abc-123/workload", &patterns),
            "/kubepods/*/workload",
        );
    }

    /// A path that matches no flatten glob comes back unchanged.
    #[test]
    fn flatten_cgroup_path_falls_through_unmatched() {
        let patterns = compile_flatten_patterns(&["/kubepods/*/workload".into()]);
        let flattened = flatten_cgroup_path("/system.slice/sshd.service", &patterns);
        assert_eq!(flattened, "/system.slice/sshd.service");
    }

    /// A group present on both sides yields a row per metric; for
    /// `run_time_ns` going 1_000 → 2_000 the row is keyed `app`
    /// with delta 1_000 and delta_pct 1.0.
    #[test]
    fn compare_emits_rows_for_matched_groups() {
        let mut base_thread = make_thread("app", "w1");
        let mut cand_thread = make_thread("app", "w1");
        base_thread.run_time_ns = MonotonicNs(1_000);
        cand_thread.run_time_ns = MonotonicNs(2_000);
        let diff = compare(
            &snap_with(vec![base_thread]),
            &snap_with(vec![cand_thread]),
            &CompareOptions::default(),
        );
        let row = diff
            .rows
            .iter()
            .find(|row| row.metric_name == "run_time_ns")
            .expect("run_time_ns row");
        assert_eq!(row.group_key, "app");
        assert_eq!(row.delta, Some(1_000.0));
        let pct = row.delta_pct.unwrap();
        assert!((pct - 1.0).abs() < 1e-9);
    }

    /// Groups that exist on only one side are listed in
    /// `only_baseline` / `only_candidate` respectively.
    #[test]
    fn compare_reports_unmatched_groups() {
        let baseline = snap_with(vec![make_thread("only_a", "w1")]);
        let candidate = snap_with(vec![make_thread("only_b", "w1")]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        assert_eq!(diff.only_baseline, vec![String::from("only_a")]);
        assert_eq!(diff.only_candidate, vec![String::from("only_b")]);
    }

    /// Default ordering puts the larger relative swing first:
    /// "big" moves 10x (100 → 1_000) while "small" moves 1.1x
    /// (1_000 → 1_100), so the "big" `run_time_ns` row must
    /// precede the "small" one.
    #[test]
    fn compare_sorts_by_abs_delta_pct_descending() {
        let thread = |pcomm: &str, run_ns: u64| {
            let mut t = make_thread(pcomm, "w");
            t.run_time_ns = MonotonicNs(run_ns);
            t
        };
        let baseline = snap_with(vec![thread("big", 100), thread("small", 1_000)]);
        let candidate = snap_with(vec![thread("big", 1_000), thread("small", 1_100)]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        let ordered: Vec<&DiffRow> = diff
            .rows
            .iter()
            .filter(|row| row.metric_name == "run_time_ns")
            .collect();
        assert_eq!(ordered[0].group_key, "big");
        assert_eq!(ordered[1].group_key, "small");
    }

    /// With `GroupBy::Cgroup` and a flatten glob, two differently
    /// named pod cgroups land in one group keyed by the pattern,
    /// so neither side reports an unmatched group.
    #[test]
    fn group_by_cgroup_applies_flatten_patterns() {
        let mut base_thread = make_thread("app", "w1");
        base_thread.run_time_ns = MonotonicNs(1_000);
        base_thread.cgroup = "/kubepods/pod-xxx/workload".into();
        let mut cand_thread = make_thread("app", "w1");
        cand_thread.run_time_ns = MonotonicNs(2_000);
        cand_thread.cgroup = "/kubepods/pod-yyy/workload".into();
        let baseline = snap_with(vec![base_thread]);
        let candidate = snap_with(vec![cand_thread]);
        let options = CompareOptions {
            cgroup_flatten: vec!["/kubepods/*/workload".into()],
            group_by: GroupBy::Cgroup.into(),
            sort_by: Vec::new(),
            no_cg_normalize: false,
            no_thread_normalize: false,
        };
        let diff = compare(&baseline, &candidate, &options);
        assert!(diff.only_baseline.is_empty(), "{:?}", diff.only_baseline);
        assert!(diff.only_candidate.is_empty(), "{:?}", diff.only_candidate,);
        let has_flattened_key = diff
            .rows
            .iter()
            .any(|row| row.group_key == "/kubepods/*/workload");
        assert!(
            has_flattened_key,
            "rows={:?}",
            diff.rows.iter().map(|r| &r.group_key).collect::<Vec<_>>(),
        );
    }

    /// Cgroup enrichment attached to each snapshot is carried
    /// onto the diff: `cgroup_stats_a` / `cgroup_stats_b` expose
    /// the baseline and candidate `cpu.usage_usec` for `/app`.
    #[test]
    fn group_by_cgroup_surfaces_enrichment_on_diff() {
        let snapshot_with_stats = |stats: CgroupStats| {
            let mut thread = make_thread("app", "w1");
            thread.cgroup = "/app".into();
            let mut snap = snap_with(vec![thread]);
            snap.cgroup_stats.insert("/app".into(), stats);
            snap
        };
        let baseline = snapshot_with_stats(simple_cgroup_stats(100, 1, 50, 1 << 20));
        let candidate = snapshot_with_stats(simple_cgroup_stats(500, 3, 250, 2 << 20));
        let options = CompareOptions {
            cgroup_flatten: vec![],
            group_by: GroupBy::Cgroup.into(),
            sort_by: Vec::new(),
            no_cg_normalize: false,
            no_thread_normalize: false,
        };
        let diff = compare(&baseline, &candidate, &options);
        assert_eq!(diff.cgroup_stats_a["/app"].cpu.usage_usec, 100);
        assert_eq!(diff.cgroup_stats_b["/app"].cpu.usage_usec, 500);
    }

    /// A categorical metric (`policy`) has no numeric delta; the
    /// row instead carries the two `Mode` aggregates, here
    /// `SCHED_OTHER` on the baseline and `SCHED_FIFO` on the
    /// candidate.
    #[test]
    fn categorical_row_labels_same_or_differs() {
        let mut base_thread = make_thread("app", "w1");
        let mut cand_thread = make_thread("app", "w1");
        base_thread.policy = "SCHED_OTHER".into();
        cand_thread.policy = "SCHED_FIFO".into();
        let baseline = snap_with(vec![base_thread]);
        let candidate = snap_with(vec![cand_thread]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        let row = diff
            .rows
            .iter()
            .find(|row| row.metric_name == "policy")
            .expect("policy row");
        assert!(row.delta.is_none());
        let (Aggregated::Mode { value: before, .. }, Aggregated::Mode { value: after, .. }) =
            (&row.baseline, &row.candidate)
        else {
            panic!("expected two Mode aggregates");
        };
        assert_eq!(before, "SCHED_OTHER");
        assert_eq!(after, "SCHED_FIFO");
    }

    /// Baseline 0 → candidate 100: the absolute delta is 100 but
    /// the percentage is undefined (it would divide by zero), so
    /// `delta_pct` is `None`. The row must still be emitted (the
    /// absolute-delta inflation in sort_key keeps it visible).
    #[test]
    fn delta_pct_absent_when_baseline_zero() {
        let mut idle = make_thread("app", "w1");
        let mut busy = make_thread("app", "w1");
        idle.run_time_ns = MonotonicNs(0);
        busy.run_time_ns = MonotonicNs(100);
        let baseline = snap_with(vec![idle]);
        let candidate = snap_with(vec![busy]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        let run_time = diff
            .rows
            .iter()
            .find(|row| row.metric_name == "run_time_ns")
            .expect("row");
        assert_eq!(run_time.delta, Some(100.0));
        assert!(run_time.delta_pct.is_none());
    }

    // -- Additional coverage --

    /// Comparing two snapshots with no threads and no cgroup
    /// enrichment gives a diff with no rows and no unmatched
    /// groups on either side. Guards against a panic or spurious
    /// "only in baseline" entries caused by keys being inserted
    /// into the group map from empty inputs.
    #[test]
    fn empty_snapshots_produce_empty_diff() {
        let baseline = snap_with(Vec::new());
        let candidate = snap_with(Vec::new());
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        assert!(diff.rows.is_empty());
        assert!(diff.only_baseline.is_empty());
        assert!(diff.only_candidate.is_empty());
    }

    /// Empty baseline against a populated candidate: the
    /// candidate's group is reported in `only_candidate`, and
    /// `rows` stays empty since no group is matched on both
    /// sides.
    #[test]
    fn baseline_empty_surfaces_only_candidate_groups() {
        let baseline = snap_with(Vec::new());
        let candidate = snap_with(vec![make_thread("new_proc", "t1")]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        assert!(diff.rows.is_empty());
        assert!(diff.only_baseline.is_empty());
        assert_eq!(diff.only_candidate, vec![String::from("new_proc")]);
    }

    /// Comparing a snapshot against itself: every row's `delta`
    /// must be `Some(0.0)`, except rows for Mode-aggregated
    /// (categorical) metrics, which are the only ones allowed to
    /// carry `None`. Only `delta` is asserted here; `delta_pct`
    /// is not checked by this test.
    #[test]
    fn identical_snapshots_produce_zero_deltas() {
        let mut t = make_thread("app", "w1");
        t.run_time_ns = MonotonicNs(1_000);
        t.voluntary_csw = MonotonicCount(50);
        let snap = snap_with(vec![t]);
        let diff = compare(&snap, &snap, &CompareOptions::default());
        // The `Aggregated::Mode { .. } => None` arm of the delta
        // computation gates the delta — every metric registered
        // with any `AggRule::Mode*` variant (`Mode` for policy,
        // `ModeChar` for state, `ModeBool` for ext_enabled — see
        // CTPROF_METRICS) surfaces as None-delta even when both
        // sides are identical, because Mode-family rules are
        // categorical and have no numeric delta concept. Build
        // the closed set from the registry so a future
        // Mode*-rule addition lands in this assertion
        // automatically.
        let mode_metrics: std::collections::BTreeSet<&str> = CTPROF_METRICS
            .iter()
            .filter(|m| {
                matches!(
                    m.rule,
                    AggRule::Mode(_) | AggRule::ModeChar(_) | AggRule::ModeBool(_),
                )
            })
            .map(|m| m.name)
            .collect();
        // Numeric rows must be exactly zero; a None delta is only
        // acceptable for a metric in the Mode set built above.
        for row in &diff.rows {
            match row.delta {
                Some(d) => assert_eq!(d, 0.0, "metric {} had nonzero delta", row.metric_name),
                None => assert!(
                    mode_metrics.contains(row.metric_name),
                    "non-Mode metric {} produced a None-delta — \
                     identical snapshots should yield Some(0.0) for \
                     numeric metrics; only Mode-aggregated metrics \
                     ({:?}) are allowed to surface None",
                    row.metric_name,
                    mode_metrics,
                ),
            }
        }
    }

    /// A group with a single thread on each side still gets one
    /// row for every entry in `CTPROF_METRICS`. Guards against a
    /// future "skip if only one thread" short-circuit creeping
    /// into `aggregate`.
    #[test]
    fn single_thread_group_yields_one_row_per_metric() {
        let baseline = snap_with(vec![make_thread("solo", "t")]);
        let candidate = {
            let mut thread = make_thread("solo", "t");
            thread.run_time_ns = MonotonicNs(1);
            snap_with(vec![thread])
        };
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        let solo_row_count = diff
            .rows
            .iter()
            .filter(|row| row.group_key == "solo")
            .count();
        assert_eq!(solo_row_count, CTPROF_METRICS.len());
    }

    /// Counters that are zero on both sides still get a row:
    /// `run_time_ns` shows delta 0 and no delta_pct (baseline is
    /// 0). Guards against a "skip zero" filter hiding metrics the
    /// workload never exercises.
    #[test]
    fn all_zero_metrics_emit_zero_delta_rows() {
        let baseline = snap_with(vec![make_thread("quiet", "t")]);
        let candidate = snap_with(vec![make_thread("quiet", "t")]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        let row = diff
            .rows
            .iter()
            .find(|row| row.metric_name == "run_time_ns")
            .expect("row");
        assert_eq!(row.delta, Some(0.0));
        assert!(row.delta_pct.is_none());
    }

    /// Under `GroupBy::Comm`, threads sharing a thread name are
    /// pooled even when they belong to different processes.
    #[test]
    fn group_by_comm_aggregates_across_processes() {
        let worker = |pcomm: &str, run_ns: u64| {
            let mut t = make_thread(pcomm, "worker");
            t.run_time_ns = MonotonicNs(run_ns);
            t
        };
        let baseline = snap_with(vec![worker("procA", 100), worker("procB", 200)]);
        let candidate = snap_with(vec![worker("procA", 500), worker("procB", 500)]);
        let options = CompareOptions {
            cgroup_flatten: vec![],
            group_by: GroupBy::Comm.into(),
            sort_by: Vec::new(),
            no_cg_normalize: false,
            no_thread_normalize: false,
        };
        let diff = compare(&baseline, &candidate, &options);
        let row = diff
            .rows
            .iter()
            .find(|row| row.group_key == "worker" && row.metric_name == "run_time_ns")
            .expect("worker row");
        // Both processes pooled: baseline 100+200=300, candidate
        // 500+500=1000, so the delta is 700.
        assert_eq!(row.thread_count_a, 2);
        assert_eq!(row.thread_count_b, 2);
        assert_eq!(row.delta, Some(700.0));
    }

    /// When a group has a different number of threads on each
    /// side, the row records both counts and the rendered diff
    /// shows them as "a\u{2192}b". Guards against the counts
    /// silently collapsing to one value when a group grows or
    /// shrinks.
    #[test]
    fn thread_count_diff_surfaces_when_group_grows() {
        let baseline = snap_with(vec![make_thread("pool", "t")]);
        let candidate = snap_with(vec![make_thread("pool", "t"), make_thread("pool", "t")]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        let run_time = diff
            .rows
            .iter()
            .find(|row| row.metric_name == "run_time_ns")
            .expect("row");
        assert_eq!(run_time.thread_count_a, 1);
        assert_eq!(run_time.thread_count_b, 2);
        let mut rendered = String::new();
        let display = DisplayOptions::default();
        write_diff(
            &mut rendered,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();
        let shows_growth = rendered.contains("1\u{2192}2");
        assert!(
            shows_growth,
            "expected thread-count diff rendering, got:\n{rendered}",
        );
    }

    /// When several flatten patterns match one path, the one
    /// listed first decides the key. Guards against a later,
    /// broader pattern taking over the collapse when an operator
    /// layers narrow and broad patterns.
    #[test]
    fn flatten_first_match_wins_over_later_pattern() {
        let patterns =
            compile_flatten_patterns(&["/kubepods/*/workload".into(), "/kubepods/**".into()]);
        let flattened = flatten_cgroup_path("/kubepods/pod-abc/workload", &patterns);
        assert_eq!(flattened, "/kubepods/*/workload");
    }

    /// Several distinct cgroup paths that flatten to the same key
    /// have their enrichment merged into one entry: counters are
    /// summed, `memory.current` takes the max.
    #[test]
    fn flatten_cgroup_stats_collapses_overlapping_paths() {
        let mut per_path = BTreeMap::new();
        for (path, stats) in [
            ("/kubepods/pod-a/workload", simple_cgroup_stats(100, 1, 10, 500)),
            ("/kubepods/pod-b/workload", simple_cgroup_stats(200, 2, 20, 800)),
        ] {
            per_path.insert(path.into(), stats);
        }
        let patterns = compile_flatten_patterns(&["/kubepods/*/workload".into()]);
        let flattened = flatten_cgroup_stats(&per_path, &patterns, None);
        let merged = &flattened["/kubepods/*/workload"];
        assert_eq!(merged.cpu.usage_usec, 300);
        assert_eq!(merged.cpu.nr_throttled, 3);
        assert_eq!(merged.cpu.throttled_usec, 30);
        // memory.current is instantaneous, so the larger value
        // wins instead of the sum.
        assert_eq!(merged.memory.current, 800);
    }

    /// `merge_max_option`: two concrete caps merge to the larger
    /// one; if either side is `None` (unbounded) the result is
    /// `None`, matching kernel "no limit" semantics — the merged
    /// bucket is unbounded as soon as any contributor is.
    #[test]
    fn merge_max_option_propagates_no_limit() {
        // Each case is (left, right, expected merge result).
        let cases = [
            (Some(100), Some(200), Some(200)),
            (Some(200), Some(100), Some(200)),
            (Some(50), Some(50), Some(50)),
            // An unbounded contributor on either side (or both)
            // makes the merged bucket unbounded.
            (None, Some(100), None),
            (Some(100), None, None),
            (None, None, None),
        ];
        for (left, right, expected) in cases {
            assert_eq!(merge_max_option(left, right), expected);
        }
    }

    /// `merge_min_option`: two concrete floors merge to the
    /// smaller one; if either side is `None` (no floor) the
    /// result is `None` — the merged bucket is only as protected
    /// as its weakest contributor.
    #[test]
    fn merge_min_option_propagates_no_floor() {
        // Each case is (left, right, expected merge result).
        let cases = [
            (Some(100), Some(200), Some(100)),
            (Some(200), Some(100), Some(100)),
            (None, Some(100), None),
            (Some(100), None, None),
            (None, None, None),
        ];
        for (left, right, expected) in cases {
            assert_eq!(merge_min_option(left, right), expected);
        }
    }

    /// `merge_kv_counters` sums keys present on both sides and
    /// copies keys present on only one side as they are. This is
    /// the pure counter policy, used for `memory.events` where
    /// every key is a counter.
    #[test]
    fn merge_kv_counters_per_key_sum() {
        let mut agg: BTreeMap<String, u64> =
            BTreeMap::from([(String::from("oom_kill"), 10), (String::from("high"), 20)]);
        let src: BTreeMap<String, u64> =
            BTreeMap::from([(String::from("oom_kill"), 5), (String::from("low"), 7)]);
        merge_kv_counters(&mut agg, &src);
        assert_eq!(agg.get("oom_kill"), Some(&15), "common key sums");
        assert_eq!(agg.get("high"), Some(&20), "agg-only key preserved");
        assert_eq!(agg.get("low"), Some(&7), "src-only key copied");
    }

    /// `merge_memory_stat` picks its policy per key: gauge keys
    /// (per [`MEMORY_STAT_GAUGE_KEYS`]) keep the max, counter
    /// keys are added with saturating_add. Instantaneous pool
    /// sizes (anon, file, slab) would be overstated by a sum, so
    /// those take the max.
    #[test]
    fn merge_memory_stat_dispatches_gauge_vs_counter() {
        let to_map = |pairs: &[(&str, u64)]| -> BTreeMap<String, u64> {
            pairs
                .iter()
                .map(|&(key, value)| (key.to_string(), value))
                .collect()
        };
        let mut agg = to_map(&[
            ("anon", 1_000_000),
            ("file", 500_000),
            ("slab", 800_000),
            ("pgfault", 100),
            ("workingset_refault_anon", 50),
        ]);
        let src = to_map(&[
            ("anon", 2_000_000),
            ("file", 100_000),
            ("slab", 300_000),
            ("pgfault", 25),
            ("workingset_refault_anon", 10),
        ]);
        merge_memory_stat(&mut agg, &src);
        // Gauge keys keep the larger side, not the sum.
        assert_eq!(agg.get("anon"), Some(&2_000_000), "anon is gauge → max");
        assert_eq!(agg.get("file"), Some(&500_000), "file is gauge → max");
        assert_eq!(agg.get("slab"), Some(&800_000), "slab is gauge → max");
        // Counter keys are added together.
        assert_eq!(agg.get("pgfault"), Some(&125), "pgfault is counter → sum");
        assert_eq!(
            agg.get("workingset_refault_anon"),
            Some(&60),
            "workingset_refault_anon is counter → sum"
        );
    }

    /// End-to-end merge of two cgroups with different caps and
    /// counters into one flattened bucket, checking that each
    /// domain's merge policy applies through the nested structs.
    #[test]
    fn flatten_cgroup_stats_merges_limits_and_kv_maps() {
        let mut first = CgroupStats::default();
        first.cpu.usage_usec = 100;
        first.cpu.max_quota_us = Some(50_000);
        first.cpu.max_period_us = 100_000;
        first.cpu.weight = Some(100);
        first.pids.current = Some(10);
        first.pids.max = Some(1024);
        first.memory.max = Some(1_000_000);
        first.memory.high = Some(800_000);
        first.memory.low = Some(400_000);
        first.memory.stat.insert("anon".into(), 1000);
        first.memory.events.insert("oom_kill".into(), 0);

        let mut second = CgroupStats::default();
        second.cpu.usage_usec = 200;
        second.cpu.max_quota_us = Some(80_000);
        second.cpu.max_period_us = 100_000;
        second.cpu.weight = Some(300);
        second.pids.current = Some(5);
        second.pids.max = Some(2048);
        second.memory.max = Some(2_000_000);
        second.memory.high = Some(1_500_000);
        second.memory.low = Some(200_000);
        second.memory.stat.insert("anon".into(), 500);
        second.memory.stat.insert("file".into(), 200);
        second.memory.events.insert("oom_kill".into(), 1);

        let mut per_path = BTreeMap::new();
        per_path.insert("/a".into(), first);
        per_path.insert("/b".into(), second);
        // The glob crate (0.3.x) understands `*`, `?`, `[...]`
        // and `**` but NOT brace expansion `{a,b}`, so the
        // `[ab]` character class is what folds `/a` and `/b`
        // together; `flatten_cgroup_path` uses the pattern string
        // itself as the canonical key.
        let patterns = compile_flatten_patterns(&["/[ab]".into()]);
        let flattened = flatten_cgroup_stats(&per_path, &patterns, None);
        let merged = &flattened["/[ab]"];

        // CPU: the counter is summed, the limits take the max.
        assert_eq!(merged.cpu.usage_usec, 300);
        assert_eq!(merged.cpu.max_quota_us, Some(80_000));
        assert_eq!(merged.cpu.weight, Some(300));

        // Memory: limits → max, floors → min, stat counters →
        // sum, stat gauges → max (MEMORY_STAT_GAUGE_KEYS
        // dispatch), events → sum.
        assert_eq!(merged.memory.max, Some(2_000_000));
        assert_eq!(merged.memory.high, Some(1_500_000));
        assert_eq!(merged.memory.low, Some(200_000));
        // `anon` and `file` are gauges, so the max wins.
        assert_eq!(merged.memory.stat.get("anon"), Some(&1000));
        assert_eq!(merged.memory.stat.get("file"), Some(&200));
        assert_eq!(merged.memory.events.get("oom_kill"), Some(&1));

        // Pids: `current` is summed, `max` takes the max.
        assert_eq!(merged.pids.current, Some(15));
        assert_eq!(merged.pids.max, Some(2048));
    }

    /// A single cgroup with concrete `Some` limits must come out
    /// of `flatten_cgroup_stats` unchanged. Regression pin for
    /// the first-iteration-replace fix: with the earlier
    /// `or_default()` + `merge_max_option` flow, the synthetic
    /// `CgroupStats::default()` seeded every `Option<u64>` limit
    /// with None, and `merge_max_option(None, Some(N))` then
    /// None-poisoned the lone real contributor, wiping every
    /// concrete cap.
    #[test]
    fn flatten_cgroup_stats_single_contributor_preserves_concrete_limits() {
        let mut lone = CgroupStats::default();
        lone.cpu.usage_usec = 12_345;
        lone.cpu.max_quota_us = Some(50_000);
        lone.cpu.max_period_us = 100_000;
        lone.cpu.weight = Some(150);
        lone.cpu.weight_nice = Some(0);
        lone.pids.current = Some(42);
        lone.pids.max = Some(2048);
        lone.memory.current = 1_500_000;
        lone.memory.max = Some(2 << 30);
        lone.memory.high = Some(1 << 30);
        lone.memory.low = Some(1 << 28);
        lone.memory.min = Some(1 << 27);
        let mut per_path = BTreeMap::new();
        per_path.insert("/lone".into(), lone);
        // With no flatten patterns and no key map the path is
        // used verbatim, so /lone is the sole contributor to its
        // key.
        let flattened = flatten_cgroup_stats(&per_path, &[], None);
        let merged = &flattened["/lone"];
        // Every concrete value must survive verbatim; the buggy
        // flow would fail these with `Some(_) != None`.
        assert_eq!(merged.cpu.usage_usec, 12_345);
        assert_eq!(merged.cpu.max_quota_us, Some(50_000));
        assert_eq!(merged.cpu.max_period_us, 100_000);
        assert_eq!(merged.cpu.weight, Some(150));
        assert_eq!(merged.cpu.weight_nice, Some(0));
        assert_eq!(merged.memory.current, 1_500_000);
        assert_eq!(merged.memory.max, Some(2 << 30));
        assert_eq!(merged.memory.high, Some(1 << 30));
        assert_eq!(merged.memory.low, Some(1 << 28));
        assert_eq!(merged.memory.min, Some(1 << 27));
        assert_eq!(merged.pids.current, Some(42));
        assert_eq!(merged.pids.max, Some(2048));
    }

    /// "No limit" / "no floor" carries through the flatten layer:
    /// when one cgroup has `memory.max` / `memory.low` set to
    /// None and the other has concrete values, the merged bucket
    /// ends up with None for both.
    #[test]
    fn flatten_cgroup_stats_propagates_no_limit() {
        let mut unbounded = CgroupStats::default();
        unbounded.memory.max = None;
        unbounded.memory.low = None;
        let mut bounded = CgroupStats::default();
        bounded.memory.max = Some(1_000_000);
        bounded.memory.low = Some(500_000);
        let mut per_path = BTreeMap::new();
        per_path.insert("/a".into(), unbounded);
        per_path.insert("/b".into(), bounded);
        // The glob crate (0.3.x) understands `*`, `?`, `[...]`
        // and `**` but NOT brace expansion `{a,b}`, so the
        // `[ab]` character class is what folds `/a` and `/b`
        // together; `flatten_cgroup_path` uses the pattern string
        // itself as the canonical key.
        let patterns = compile_flatten_patterns(&["/[ab]".into()]);
        let flattened = flatten_cgroup_stats(&per_path, &patterns, None);
        let merged = &flattened["/[ab]"];
        assert_eq!(merged.memory.max, None, "any unbounded → bucket unbounded");
        assert_eq!(merged.memory.low, None, "any no-floor → bucket unprotected");
    }

    /// Per-row gate of the "## Cgroup limits / knobs" sub-table:
    /// a cgroup that only has counter data (no caps, weight or
    /// pids accounting) must not get a row there, while a cgroup
    /// that exposes such knobs must. The cgroup-stats primary
    /// table may still mention the counters-only cgroup.
    #[test]
    fn write_diff_limits_table_skips_cgroups_without_caps() {
        const LIMITS_HEADER: &str = "## Cgroup limits / knobs";
        // /capped carries a memory.max and a cpu.weight, so it
        // belongs in the limits table.
        let capped = || {
            let mut stats = CgroupStats::default();
            stats.cpu.weight = Some(150);
            stats.memory.max = Some(1 << 30);
            stats
        };

        let mut diff = CtprofDiff::default();
        diff.cgroup_stats_a.insert("/capped".into(), capped());
        diff.cgroup_stats_b.insert("/capped".into(), capped());
        // /counters-only is pure counter data: no cpu.max/weight,
        // no memory.max/high, no pids.
        diff.cgroup_stats_a.insert(
            "/counters-only".into(),
            simple_cgroup_stats(100, 0, 0, 1024),
        );
        diff.cgroup_stats_b.insert(
            "/counters-only".into(),
            simple_cgroup_stats(200, 0, 0, 2048),
        );

        let mut rendered = String::new();
        let display = DisplayOptions::default();
        write_diff(
            &mut rendered,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Cgroup,
            &display,
        )
        .unwrap();
        let out = rendered;

        // The header must be present since one cgroup has limits
        // data.
        assert!(out.contains(LIMITS_HEADER), "limits header missing:\n{out}",);
        // Cut out the section: from the limits header up to the
        // next `##` header, or to EOF when it is the last one.
        let section_start = out.find(LIMITS_HEADER).unwrap();
        let tail = &out[section_start..];
        let section_len = tail.find("\n## ").map_or(tail.len(), |pos| pos + 1);
        let limits_section = &tail[..section_len];
        // /capped is listed; /counters-only is not.
        assert!(
            limits_section.contains("/capped"),
            "capped cgroup should appear in limits table:\n{limits_section}",
        );
        assert!(
            !limits_section.contains("/counters-only"),
            "counters-only cgroup should NOT appear (no caps/weight/pids):\n{limits_section}",
        );
    }

    /// Unchanged-row suppression in the memory.stat sub-table:
    /// a key whose value is equal on both sides must be absent
    /// from the rendered section, and a key whose value differs
    /// must be present. Pins the baseline-vs-candidate equality
    /// gate that keeps unchanged rows out of the output.
    #[test]
    fn write_diff_memory_stat_skips_unchanged_rows() {
        const STAT_HEADER: &str = "## memory.stat";

        // `pgfault` moves 100 → 250; `anon` stays at 1_000_000.
        let mut baseline = CgroupStats::default();
        let mut candidate = CgroupStats::default();
        baseline.memory.stat.insert("pgfault".into(), 100);
        candidate.memory.stat.insert("pgfault".into(), 250);
        baseline.memory.stat.insert("anon".into(), 1_000_000);
        candidate.memory.stat.insert("anon".into(), 1_000_000);

        let mut diff = CtprofDiff::default();
        diff.cgroup_stats_a.insert("/app".into(), baseline);
        diff.cgroup_stats_b.insert("/app".into(), candidate);

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Cgroup,
            &DisplayOptions::default(),
        )
        .unwrap();

        // Restrict the checks to the memory.stat section: from
        // its header to the next `## ` header, or to EOF.
        let start = out.find(STAT_HEADER).expect("memory.stat header missing");
        let tail = &out[start..];
        let stat_section = match tail.find("\n## ") {
            Some(newline) => &tail[..=newline],
            None => tail,
        };

        assert!(
            stat_section.contains("pgfault"),
            "changed key (pgfault: 100 → 250) must appear:\n{stat_section}",
        );
        assert!(
            !stat_section.contains("anon"),
            "unchanged gauge key (anon: 1M = 1M) must be suppressed:\n{stat_section}",
        );
    }

    /// Unchanged-row suppression in the memory.events sub-table,
    /// mirroring the memory.stat case: an event whose count
    /// differs between the two sides is rendered, an event whose
    /// count is equal on both sides is not.
    #[test]
    fn write_diff_memory_events_skips_unchanged_rows() {
        const EVENTS_HEADER: &str = "## memory.events";

        // `low` moves 5 → 12; `oom_kill` stays at 0.
        let mut baseline = CgroupStats::default();
        let mut candidate = CgroupStats::default();
        baseline.memory.events.insert("low".into(), 5);
        candidate.memory.events.insert("low".into(), 12);
        baseline.memory.events.insert("oom_kill".into(), 0);
        candidate.memory.events.insert("oom_kill".into(), 0);

        let mut diff = CtprofDiff::default();
        diff.cgroup_stats_a.insert("/app".into(), baseline);
        diff.cgroup_stats_b.insert("/app".into(), candidate);

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Cgroup,
            &DisplayOptions::default(),
        )
        .unwrap();

        // Restrict the checks to the memory.events section: from
        // its header to the next `## ` header, or to EOF.
        let start = out
            .find(EVENTS_HEADER)
            .expect("memory.events header missing");
        let tail = &out[start..];
        let events_section = match tail.find("\n## ") {
            Some(newline) => &tail[..=newline],
            None => tail,
        };

        assert!(
            events_section.contains("low"),
            "changed event (low: 5 → 12) must appear:\n{events_section}",
        );
        // `oom_kill` stayed at 0 on both sides and must be
        // suppressed. The key shares no substring with `low`, so
        // a plain substring-absence check cannot be confused by
        // the row that is legitimately present.
        assert!(
            !events_section.contains("oom_kill"),
            "unchanged event (oom_kill: 0 = 0) must be suppressed:\n{events_section}",
        );
    }

    /// A malformed glob (`[invalid`, unterminated character
    /// class) is dropped by `compile_flatten_patterns` without
    /// an error, while the well-formed pattern next to it
    /// survives unchanged. Also guards against a future change
    /// that starts rejecting valid-looking patterns.
    #[test]
    fn compile_flatten_patterns_skips_malformed() {
        let compiled = compile_flatten_patterns(&["[invalid".into(), "/ok/*".into()]);
        // Exactly one pattern survives ...
        assert_eq!(compiled.len(), 1);
        // ... and it is the well-formed one.
        let survivor = &compiled[0];
        assert_eq!(survivor.as_str(), "/ok/*");
    }

    /// For each listed `ThreadState` field: set exactly that
    /// field to one unit on an otherwise fresh thread, aggregate
    /// the same-named registry metric over that single thread,
    /// and require the result to be `Aggregated::Sum(1)`.
    /// Defends against an `AggRule::Sum*` accessor
    /// ([`AggRule::SumCount`] / [`AggRule::SumNs`] /
    /// [`AggRule::SumTicks`] / [`AggRule::SumBytes`]) that
    /// points at the wrong field.
    ///
    /// Each case resolves its metric through `CTPROF_METRICS` by
    /// name, so a case naming a metric that is not registered
    /// panics instead of passing silently.
    #[test]
    fn sum_metric_accessors_read_expected_field() {
        use crate::metric_types::{Bytes, ClockTicks, MonotonicCount, MonotonicNs};
        type MetricSetter = fn(&mut ThreadState);
        // (registry metric name, setter that writes one unit
        // into the field of the same name).
        let cases: &[(&str, MetricSetter)] = &[
            ("run_time_ns", |t| t.run_time_ns = MonotonicNs(1)),
            ("wait_time_ns", |t| t.wait_time_ns = MonotonicNs(1)),
            ("timeslices", |t| t.timeslices = MonotonicCount(1)),
            ("voluntary_csw", |t| t.voluntary_csw = MonotonicCount(1)),
            ("nonvoluntary_csw", |t| {
                t.nonvoluntary_csw = MonotonicCount(1)
            }),
            ("nr_wakeups", |t| t.nr_wakeups = MonotonicCount(1)),
            ("nr_wakeups_local", |t| {
                t.nr_wakeups_local = MonotonicCount(1)
            }),
            ("nr_wakeups_remote", |t| {
                t.nr_wakeups_remote = MonotonicCount(1)
            }),
            ("nr_wakeups_sync", |t| t.nr_wakeups_sync = MonotonicCount(1)),
            ("nr_wakeups_migrate", |t| {
                t.nr_wakeups_migrate = MonotonicCount(1)
            }),
            ("nr_wakeups_affine", |t| {
                t.nr_wakeups_affine = MonotonicCount(1)
            }),
            ("nr_wakeups_affine_attempts", |t| {
                t.nr_wakeups_affine_attempts = MonotonicCount(1)
            }),
            ("nr_migrations", |t| t.nr_migrations = MonotonicCount(1)),
            ("nr_forced_migrations", |t| {
                t.nr_forced_migrations = MonotonicCount(1)
            }),
            ("nr_failed_migrations_affine", |t| {
                t.nr_failed_migrations_affine = MonotonicCount(1)
            }),
            ("nr_failed_migrations_running", |t| {
                t.nr_failed_migrations_running = MonotonicCount(1)
            }),
            ("nr_failed_migrations_hot", |t| {
                t.nr_failed_migrations_hot = MonotonicCount(1)
            }),
            ("wait_sum", |t| t.wait_sum = MonotonicNs(1)),
            ("wait_count", |t| t.wait_count = MonotonicCount(1)),
            ("voluntary_sleep_ns", |t| {
                t.voluntary_sleep_ns = MonotonicNs(1)
            }),
            ("block_sum", |t| t.block_sum = MonotonicNs(1)),
            ("iowait_sum", |t| t.iowait_sum = MonotonicNs(1)),
            ("iowait_count", |t| t.iowait_count = MonotonicCount(1)),
            ("allocated_bytes", |t| t.allocated_bytes = Bytes(1)),
            ("deallocated_bytes", |t| t.deallocated_bytes = Bytes(1)),
            ("minflt", |t| t.minflt = MonotonicCount(1)),
            ("majflt", |t| t.majflt = MonotonicCount(1)),
            ("utime_clock_ticks", |t| t.utime_clock_ticks = ClockTicks(1)),
            ("stime_clock_ticks", |t| t.stime_clock_ticks = ClockTicks(1)),
            ("rchar", |t| t.rchar = Bytes(1)),
            ("wchar", |t| t.wchar = Bytes(1)),
            ("syscr", |t| t.syscr = MonotonicCount(1)),
            ("syscw", |t| t.syscw = MonotonicCount(1)),
            ("read_bytes", |t| t.read_bytes = Bytes(1)),
            ("write_bytes", |t| t.write_bytes = Bytes(1)),
            ("cancelled_write_bytes", |t| {
                t.cancelled_write_bytes = Bytes(1)
            }),
        ];
        for &(name, set) in cases {
            // Resolve the registry entry first so an unknown
            // name fails before any fixture is built.
            let def = CTPROF_METRICS
                .iter()
                .find(|m| m.name == name)
                .unwrap_or_else(|| panic!("metric {name} not in registry"));

            // Fresh thread with only the field under test set.
            let mut thread = make_thread("p", "w");
            set(&mut thread);

            match aggregate(def.rule, &[&thread]) {
                Aggregated::Sum(v) => {
                    assert_eq!(v, 1, "accessor for {name} did not read the {name} field")
                }
                other => panic!("expected Sum for {name}, got {other:?}"),
            }
        }
    }

    /// No two entries in `CTPROF_METRICS` may share a name. A
    /// name-based lookup that stops at the first match (as the
    /// test helpers in this module do) would silently shadow the
    /// later duplicate — a slow-burn correctness bug.
    #[test]
    fn ctprof_metric_names_are_unique() {
        let mut seen = std::collections::BTreeSet::new();
        for def in CTPROF_METRICS.iter() {
            // `insert` reports `false` when the name was
            // already present, i.e. on the second sighting.
            let first_sighting = seen.insert(def.name);
            assert!(
                first_sighting,
                "duplicate metric name in registry: {}",
                def.name,
            );
        }
    }

    /// Test-only helper: look up a registry entry by name and
    /// return a static reference to it. Reduces fixture
    /// duplication across the `metric_display_name` /
    /// `metric_tags` tests below.
    ///
    /// # Panics
    ///
    /// Panics when no entry in `CTPROF_METRICS` carries `name`.
    /// The message states the failure ("not in registry") so a
    /// typo in a test fixture is not misread as a success
    /// report.
    fn lookup_metric(name: &str) -> &'static CtprofMetricDef {
        CTPROF_METRICS
            .iter()
            .find(|m| m.name == name)
            .unwrap_or_else(|| panic!("metric {name} not in registry"))
    }

    /// For a metric with no class tag and no config gate,
    /// `metric_display_name` yields the bare registry name and
    /// `metric_tags` yields an empty string. Pins the
    /// no-decoration case for two representative metrics.
    ///
    /// NOTE(review): the assertions compare values only; they do
    /// not inspect whether the returned string is borrowed or
    /// owned.
    #[test]
    fn metric_display_name_no_gates_returns_bare_name() {
        for name in ["policy", "cpu_affinity"] {
            let def = lookup_metric(name);
            assert_eq!(metric_display_name(def), name);
            assert!(metric_tags(def).is_empty());
        }
    }

    /// A CFS-only metric gated on CONFIG_SCHEDSTATS renders BOTH
    /// tags via `metric_tags`, in stable order: the sched_class
    /// tag first, then each config gate. `nr_wakeups_affine` is
    /// the load-bearing example —
    /// `kernel/sched/fair.c::wake_affine` is the only call site
    /// for the underlying `__schedstat_inc`. The config gate is
    /// rendered compact (`[SCHEDSTATS]`, not
    /// `[CONFIG_SCHEDSTATS]`), and the display name itself stays
    /// undecorated. Pins both decoration paths against drift.
    #[test]
    fn metric_tags_renders_class_and_config_tags() {
        let def = lookup_metric("nr_wakeups_affine");
        let shown = metric_display_name(def);
        assert_eq!(shown, "nr_wakeups_affine");
        let tags = metric_tags(def);
        assert_eq!(tags, "[cfs-only] [SCHEDSTATS]");
    }

    /// A metric with several config gates
    /// (`core_forceidle_sum`: CONFIG_SCHED_CORE and
    /// CONFIG_SCHEDSTATS) renders one tag per gate, in the order
    /// the registry declares them, each with the `CONFIG_`
    /// prefix stripped. The expected string contains no class
    /// tag, only the two config tags.
    #[test]
    fn metric_tags_emits_each_config_gate_in_order() {
        let def = lookup_metric("core_forceidle_sum");
        let shown = metric_display_name(def);
        assert_eq!(shown, "core_forceidle_sum");
        let tags = metric_tags(def);
        assert_eq!(tags, "[SCHED_CORE] [SCHEDSTATS]");
    }

    /// `fair_slice_ns` carries a class tag (`fair-policy`) and
    /// no config gate. Pins that the class tag is emitted on its
    /// own: the exact-match assertion rejects a trailing `[]`
    /// or trailing whitespace when `config_gates` is empty.
    #[test]
    fn metric_tags_class_only_no_config_gate() {
        let def = lookup_metric("fair_slice_ns");
        let shown = metric_display_name(def);
        assert_eq!(shown, "fair_slice_ns");
        let tags = metric_tags(def);
        assert_eq!(tags, "[fair-policy]");
    }

    /// Compact rendering: `metric_display_name` strips the
    /// `CONFIG_` prefix from each `config_gate` before emission.
    /// The data field stays full so an operator can grep their
    /// kconfig directly. Pin the rule explicitly so a refactor
    /// of `metric_display_name` does not silently regress the
    /// strip behavior.
    #[test]
    fn metric_tags_strips_config_prefix() {
        for m in CTPROF_METRICS {
            for gate in m.config_gates {
                assert!(
                    gate.starts_with("CONFIG_"),
                    "registry config_gate {gate:?} on metric {} \
                     must spell the literal CONFIG_X kconfig symbol",
                    m.name,
                );
                let tags = metric_tags(m);
                let expected_short = gate.strip_prefix("CONFIG_").unwrap();
                assert!(
                    tags.contains(&format!("[{expected_short}]")),
                    "metric {} tags {tags:?} must contain [{expected_short}]",
                    m.name,
                );
                assert!(
                    !tags.contains(&format!("[{gate}]")),
                    "metric {} tags {tags:?} must not contain full [{gate}]",
                    m.name,
                );
            }
        }
    }

    /// `[dead]` tag rendering is pinned on a synthetic
    /// `CtprofMetricDef`, because the live registry is expected
    /// to hold no `is_dead: true` entries (the second half of
    /// this test enforces that). A regression that drops the
    /// `[dead]` tag from `metric_tags` therefore surfaces here
    /// rather than waiting for a real dead counter to be
    /// registered again.
    #[test]
    fn metric_tags_marks_synthetic_dead_counter() {
        let synthetic = CtprofMetricDef {
            name: "synthetic_dead",
            rule: AggRule::SumCount(|_| crate::metric_types::MonotonicCount(0)),
            sched_class: None,
            config_gates: &["CONFIG_SCHEDSTATS"],
            is_dead: true,
            description: "synthetic dead-counter test fixture.",
            section: Section::Primary,
        };
        // The display name stays bare; the `[dead]` marker
        // precedes the config-gate tag.
        assert_eq!(metric_display_name(&synthetic), "synthetic_dead");
        assert_eq!(metric_tags(&synthetic), "[dead] [SCHEDSTATS]");

        // The live registry must NOT carry any `is_dead: true`
        // entry until a kernel resurrects a dead counter or a
        // new always-zero counter is captured. Detects
        // accidental re-introduction.
        for registered in CTPROF_METRICS.iter() {
            assert!(
                !registered.is_dead,
                "{} unexpectedly carries is_dead: true — the \
                 registry is currently empty of dead counters; \
                 add the entry to the matrix-pin test below if \
                 a new dead counter is intentional",
                registered.name,
            );
        }
    }

    /// `non-ext` rendering: the schedstat sleep/wait family is
    /// tagged `non-ext` because it accumulates under CFS / RT /
    /// DL but not sched_ext. Pins a representative example —
    /// `wait_sum` renders the tags `[non-ext] [SCHEDSTATS]` —
    /// guarding against the matrix regression that previously
    /// left these metrics with a `None` class.
    #[test]
    fn metric_tags_renders_non_ext_class() {
        let def = lookup_metric("wait_sum");
        let shown = metric_display_name(def);
        assert_eq!(shown, "wait_sum");
        let tags = metric_tags(def);
        assert_eq!(tags, "[non-ext] [SCHEDSTATS]");
    }

    /// Exhaustive tag pin: every metric in `CTPROF_METRICS` has
    /// its `(sched_class, config_gates, is_dead)` triple
    /// asserted against the locked matrix below. The key sets
    /// are compared first (every registry name must appear in
    /// the matrix and vice versa), so drift on either side
    /// fails before any per-entry comparison runs.
    #[test]
    fn registry_tag_matrix_is_pinned() {
        // One matrix row: (name, sched_class, config_gates, is_dead).
        type Row = (
            &'static str,
            Option<&'static str>,
            &'static [&'static str],
            bool,
        );

        // Gate sets and class tags that recur across many rows.
        const UNGATED: &[&str] = &[];
        const SCHED_INFO: &[&str] = &["CONFIG_SCHED_INFO"];
        const SCHEDSTATS: &[&str] = &["CONFIG_SCHEDSTATS"];
        const TASK_IO: &[&str] = &["CONFIG_TASK_IO_ACCOUNTING"];
        const DELAY_ACCT: &[&str] = &["CONFIG_TASKSTATS", "CONFIG_TASK_DELAY_ACCT"];
        const XACCT: &[&str] = &["CONFIG_TASKSTATS", "CONFIG_TASK_XACCT"];
        const CFS_ONLY: Option<&str> = Some("cfs-only");
        const NON_EXT: Option<&str> = Some("non-ext");

        // Locked matrix. Order matches CTPROF_METRICS for ease
        // of audit.
        let matrix: &[Row] = &[
            // identity / structural
            ("policy", None, UNGATED, false),
            ("nice", None, UNGATED, false),
            ("priority", None, UNGATED, false),
            ("rt_priority", None, UNGATED, false),
            ("cpu_affinity", None, UNGATED, false),
            ("processor", None, UNGATED, false),
            ("state", None, UNGATED, false),
            ("ext_enabled", None, &["CONFIG_SCHED_CLASS_EXT"], false),
            ("nr_threads", None, UNGATED, false),
            // scheduling / schedstat
            ("run_time_ns", None, SCHED_INFO, false),
            ("wait_time_ns", None, SCHED_INFO, false),
            ("timeslices", None, SCHED_INFO, false),
            ("voluntary_csw", None, UNGATED, false),
            ("nonvoluntary_csw", None, UNGATED, false),
            ("nr_wakeups", None, SCHEDSTATS, false),
            ("nr_wakeups_local", None, SCHEDSTATS, false),
            ("nr_wakeups_remote", None, SCHEDSTATS, false),
            ("nr_wakeups_sync", None, SCHEDSTATS, false),
            ("nr_wakeups_migrate", None, SCHEDSTATS, false),
            ("nr_wakeups_affine", CFS_ONLY, SCHEDSTATS, false),
            ("nr_wakeups_affine_attempts", CFS_ONLY, SCHEDSTATS, false),
            ("nr_migrations", None, UNGATED, false),
            ("nr_forced_migrations", CFS_ONLY, SCHEDSTATS, false),
            ("nr_failed_migrations_affine", CFS_ONLY, SCHEDSTATS, false),
            ("nr_failed_migrations_running", CFS_ONLY, SCHEDSTATS, false),
            ("nr_failed_migrations_hot", CFS_ONLY, SCHEDSTATS, false),
            ("wait_sum", NON_EXT, SCHEDSTATS, false),
            ("wait_count", NON_EXT, SCHEDSTATS, false),
            ("wait_max", NON_EXT, SCHEDSTATS, false),
            ("voluntary_sleep_ns", NON_EXT, SCHEDSTATS, false),
            ("sleep_max", NON_EXT, SCHEDSTATS, false),
            ("block_sum", NON_EXT, SCHEDSTATS, false),
            ("block_max", NON_EXT, SCHEDSTATS, false),
            ("iowait_sum", NON_EXT, SCHEDSTATS, false),
            ("iowait_count", NON_EXT, SCHEDSTATS, false),
            ("exec_max", None, SCHEDSTATS, false),
            ("slice_max", CFS_ONLY, SCHEDSTATS, false),
            (
                "core_forceidle_sum",
                None,
                &["CONFIG_SCHED_CORE", "CONFIG_SCHEDSTATS"],
                false,
            ),
            ("fair_slice_ns", Some("fair-policy"), UNGATED, false),
            // memory
            ("allocated_bytes", None, UNGATED, false),
            ("deallocated_bytes", None, UNGATED, false),
            ("minflt", None, UNGATED, false),
            ("majflt", None, UNGATED, false),
            ("utime_clock_ticks", None, UNGATED, false),
            ("stime_clock_ticks", None, UNGATED, false),
            // I/O — all 7 fields share CONFIG_TASK_IO_ACCOUNTING
            // (the kernel emits /proc/<tid>/io as a single block
            // under that gate; CONFIG_TASK_IO_ACCOUNTING `depends
            // on` CONFIG_TASK_XACCT in init/Kconfig).
            ("rchar", None, TASK_IO, false),
            ("wchar", None, TASK_IO, false),
            ("syscr", None, TASK_IO, false),
            ("syscw", None, TASK_IO, false),
            ("read_bytes", None, TASK_IO, false),
            ("write_bytes", None, TASK_IO, false),
            ("cancelled_write_bytes", None, TASK_IO, false),
            // taskstats delay accounting — every entry is
            // double-gated on CONFIG_TASKSTATS (the netlink family
            // registration in `kernel/taskstats.c`) and
            // CONFIG_TASK_DELAY_ACCT (the per-task counters in
            // `kernel/delayacct.c`). Operator-visible behavior:
            // missing either gate collapses every field to zero.
            ("cpu_delay_count", None, DELAY_ACCT, false),
            ("cpu_delay_total_ns", None, DELAY_ACCT, false),
            ("cpu_delay_max_ns", None, DELAY_ACCT, false),
            ("cpu_delay_min_ns", None, DELAY_ACCT, false),
            ("blkio_delay_count", None, DELAY_ACCT, false),
            ("blkio_delay_total_ns", None, DELAY_ACCT, false),
            ("blkio_delay_max_ns", None, DELAY_ACCT, false),
            ("blkio_delay_min_ns", None, DELAY_ACCT, false),
            ("swapin_delay_count", None, DELAY_ACCT, false),
            ("swapin_delay_total_ns", None, DELAY_ACCT, false),
            ("swapin_delay_max_ns", None, DELAY_ACCT, false),
            ("swapin_delay_min_ns", None, DELAY_ACCT, false),
            ("freepages_delay_count", None, DELAY_ACCT, false),
            ("freepages_delay_total_ns", None, DELAY_ACCT, false),
            ("freepages_delay_max_ns", None, DELAY_ACCT, false),
            ("freepages_delay_min_ns", None, DELAY_ACCT, false),
            ("thrashing_delay_count", None, DELAY_ACCT, false),
            ("thrashing_delay_total_ns", None, DELAY_ACCT, false),
            ("thrashing_delay_max_ns", None, DELAY_ACCT, false),
            ("thrashing_delay_min_ns", None, DELAY_ACCT, false),
            ("compact_delay_count", None, DELAY_ACCT, false),
            ("compact_delay_total_ns", None, DELAY_ACCT, false),
            ("compact_delay_max_ns", None, DELAY_ACCT, false),
            ("compact_delay_min_ns", None, DELAY_ACCT, false),
            ("wpcopy_delay_count", None, DELAY_ACCT, false),
            ("wpcopy_delay_total_ns", None, DELAY_ACCT, false),
            ("wpcopy_delay_max_ns", None, DELAY_ACCT, false),
            ("wpcopy_delay_min_ns", None, DELAY_ACCT, false),
            ("irq_delay_count", None, DELAY_ACCT, false),
            ("irq_delay_total_ns", None, DELAY_ACCT, false),
            ("irq_delay_max_ns", None, DELAY_ACCT, false),
            ("irq_delay_min_ns", None, DELAY_ACCT, false),
            ("hiwater_rss_bytes", None, XACCT, false),
            ("hiwater_vm_bytes", None, XACCT, false),
        ];

        // Set-equality: registry keys vs matrix keys.
        let registry_names: std::collections::BTreeSet<&str> =
            CTPROF_METRICS.iter().map(|def| def.name).collect();
        let matrix_names: std::collections::BTreeSet<&str> =
            matrix.iter().map(|row| row.0).collect();
        assert_eq!(
            registry_names, matrix_names,
            "registry vs matrix key mismatch — every metric must be \
             pinned in the locked matrix and the matrix must not name \
             metrics that aren't registered",
        );

        // Per-entry pin: each row matches the registry exactly.
        for &(name, class, gates, dead) in matrix {
            let def = lookup_metric(name);
            assert_eq!(def.sched_class, class, "{name}: sched_class drift");
            assert_eq!(def.config_gates, gates, "{name}: config_gates drift");
            assert_eq!(def.is_dead, dead, "{name}: is_dead drift");
        }
    }

    /// Closed-set vocabulary: every tag value in the registry
    /// must come from a fixed list. `sched_class` is one of
    /// {None, "non-ext", "cfs-only", "fair-policy"}; every
    /// `config_gates` entry starts with `CONFIG_` and belongs to
    /// the kconfig set enumerated below. Defends against a
    /// future entry that tags a metric with a freshly-invented
    /// label the doc / display layers don't yet handle.
    #[test]
    fn registry_tag_vocabulary_is_closed() {
        use std::collections::BTreeSet;

        let allowed_classes: BTreeSet<&str> =
            BTreeSet::from(["non-ext", "cfs-only", "fair-policy"]);
        let allowed_gates: BTreeSet<&str> = BTreeSet::from([
            "CONFIG_SCHED_INFO",
            "CONFIG_SCHEDSTATS",
            "CONFIG_SCHED_CORE",
            "CONFIG_TASK_DELAY_ACCT",
            "CONFIG_TASK_IO_ACCOUNTING",
            "CONFIG_TASK_XACCT",
            "CONFIG_SCHED_CLASS_EXT",
            "CONFIG_TASKSTATS",
        ]);

        for def in CTPROF_METRICS.iter() {
            // A `None` class is always allowed; only a concrete
            // label needs checking against the vocabulary.
            if let Some(class) = def.sched_class {
                assert!(
                    allowed_classes.contains(class),
                    "{}: sched_class {class:?} outside the closed set \
                     {{None, \"non-ext\", \"cfs-only\", \"fair-policy\"}}",
                    def.name,
                );
            }
            for gate in def.config_gates {
                // Spelling first, membership second, so a typo
                // yields the more specific message.
                assert!(
                    gate.starts_with("CONFIG_"),
                    "{}: config_gate {gate:?} must start with CONFIG_",
                    def.name,
                );
                assert!(
                    allowed_gates.contains(gate),
                    "{}: config_gate {gate:?} outside the closed set \
                     {allowed_gates:?}",
                    def.name,
                );
            }
        }
    }

    /// Integration test for `write_diff`: a tagged metric
    /// (`nr_wakeups_affine`, changing 5 → 9 between the two
    /// snapshots) must render with its bracketed tags right
    /// after the metric name in the produced table. Pins the
    /// registry tag → cell rendering plumbing end-to-end; a
    /// refactor that stops routing the tags into the cell would
    /// silently fall back to bare metric names.
    #[test]
    fn write_diff_renders_tagged_metric_cell() {
        // Same process / thread name on both sides, differing
        // only in the counter under test.
        let thread_with = |wakeups| {
            let mut thread = make_thread("p", "w");
            thread.nr_wakeups_affine = MonotonicCount(wakeups);
            thread
        };
        let baseline = snap_with(vec![thread_with(5)]);
        let candidate = snap_with(vec![thread_with(9)]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        assert!(
            out.contains("nr_wakeups_affine [cfs-only] [SCHEDSTATS]"),
            "tagged metric cell missing from rendered table:\n{out}",
        );
    }

    /// End-to-end check that a `non-ext` metric (`wait_sum`)
    /// renders with its tag in the table `write_diff` produces.
    /// The two threads differ only in `wait_sum` (100 vs 200)
    /// and the output must contain
    /// `wait_sum [non-ext] [SCHEDSTATS]`, so rolling the class
    /// back to `None` fails here as well as in the unit test.
    #[test]
    fn write_diff_renders_non_ext_metric_cell() {
        let mut baseline_thread = make_thread("p", "w");
        baseline_thread.wait_sum = MonotonicNs(100);
        let mut candidate_thread = make_thread("p", "w");
        candidate_thread.wait_sum = MonotonicNs(200);

        let baseline = snap_with(vec![baseline_thread]);
        let candidate = snap_with(vec![candidate_thread]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());

        let display = DisplayOptions::default();
        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();

        let tagged_cell = "wait_sum [non-ext] [SCHEDSTATS]";
        assert!(
            out.contains(tagged_cell),
            "non-ext metric cell missing from rendered table:\n{out}",
        );
    }

    // The previous integration test
    // `write_diff_renders_is_dead_metric_cell` was removed when
    // the registry's dead counters (nr_wakeups_idle,
    // nr_migrations_cold, nr_wakeups_passive) were dropped. The
    // [dead] rendering path is still covered by
    // `metric_display_name_marks_synthetic_dead_counter` (which
    // drives the path from a synthetic registry entry without
    // depending on a live registered metric).

    // ------------------------------------------------------------
    // DisplayFormat / Column / parse_columns
    // ------------------------------------------------------------

    /// Fixture for the display-format / column-set tests below:
    /// a (baseline, candidate) pair of one-thread snapshots.
    /// Baseline carries `run_time_ns` = 100 and `wait_sum` =
    /// 1000; candidate doubles both to 200 and 2000.
    /// `wait_count` is 4 on both sides.
    fn snap_pair_for_display() -> (CtprofSnapshot, CtprofSnapshot) {
        // Only the two nanosecond counters vary between sides.
        let thread_with = |run_ns, wait_ns| {
            let mut thread = make_thread("p", "w");
            thread.run_time_ns = MonotonicNs(run_ns);
            thread.wait_count = MonotonicCount(4);
            thread.wait_sum = MonotonicNs(wait_ns);
            thread
        };
        let baseline = thread_with(100, 1000);
        let candidate = thread_with(200, 2000);
        (snap_with(vec![baseline]), snap_with(vec![candidate]))
    }

    /// `DisplayFormat::default()` must be `DisplayFormat::Full`.
    /// Pinned here so reordering the enum cannot silently shift
    /// the default.
    #[test]
    fn display_format_default_is_full() {
        let default_format = DisplayFormat::default();
        assert_eq!(default_format, DisplayFormat::Full);
    }

    /// Pins the column set `compare_columns_for` returns for
    /// each [`DisplayFormat`] variant exercised below (`Full`,
    /// `DeltaOnly`, `NoPct`, `Arrow`, `PctOnly`). Each expected
    /// set starts with group / threads / metric; only the
    /// trailing columns differ, so a change to any variant's
    /// tail surfaces here with a precise diff.
    #[test]
    fn compare_columns_for_resolves_per_variant() {
        let full = vec![
            Column::Group,
            Column::Threads,
            Column::Metric,
            Column::Baseline,
            Column::Candidate,
            Column::Delta,
            Column::Pct,
        ];
        assert_eq!(compare_columns_for(DisplayFormat::Full), full);

        let delta_only = vec![
            Column::Group,
            Column::Threads,
            Column::Metric,
            Column::Delta,
            Column::Pct,
        ];
        assert_eq!(compare_columns_for(DisplayFormat::DeltaOnly), delta_only);

        let no_pct = vec![
            Column::Group,
            Column::Threads,
            Column::Metric,
            Column::Baseline,
            Column::Candidate,
            Column::Delta,
        ];
        assert_eq!(compare_columns_for(DisplayFormat::NoPct), no_pct);

        let arrow = vec![
            Column::Group,
            Column::Threads,
            Column::Metric,
            Column::Arrow,
        ];
        assert_eq!(compare_columns_for(DisplayFormat::Arrow), arrow);

        let pct_only = vec![Column::Group, Column::Threads, Column::Metric, Column::Pct];
        assert_eq!(compare_columns_for(DisplayFormat::PctOnly), pct_only);
    }

    /// The comma-separated spelling of every compare-side column
    /// except `arrow` parses, in order, to the matching
    /// [`Column`] variants. `arrow` is mutually exclusive with
    /// the baseline/candidate/delta/% set it fuses, so it is
    /// covered by the dedicated arrow-only round-trip test.
    #[test]
    fn parse_columns_round_trips_compare_names() {
        let expected = vec![
            Column::Group,
            Column::Threads,
            Column::Metric,
            Column::Baseline,
            Column::Candidate,
            Column::Delta,
            Column::Pct,
        ];
        let cols = parse_columns("group,threads,metric,baseline,candidate,delta,%", true)
            .expect("valid compare spec");
        assert_eq!(cols, expected);
    }

    /// The `arrow` column parses when it stands alone next to
    /// group / threads / metric. The fused cell carries
    /// baseline/candidate/delta in one column, so it must not be
    /// paired with any of those names.
    #[test]
    fn parse_columns_round_trips_arrow_form() {
        let expected = vec![
            Column::Group,
            Column::Threads,
            Column::Metric,
            Column::Arrow,
        ];
        let cols =
            parse_columns("group,threads,metric,arrow", true).expect("valid arrow-form spec");
        assert_eq!(cols, expected);
    }

    /// On the show side (second argument `false`),
    /// `parse_columns` must refuse the compare-only name
    /// `baseline`. The diagnostic has to cite the offending
    /// name and list the show-side vocabulary
    /// (`group, threads, metric, value`) so the operator can
    /// recover from the message alone.
    #[test]
    fn parse_columns_rejects_compare_only_on_show_side() {
        let offending = "baseline";
        let result = parse_columns(offending, false);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");

        let cites_name = msg.contains(offending);
        assert!(cites_name, "error must cite the offending name: {msg}");

        let lists_allowed = msg.contains("group, threads, metric, value");
        assert!(
            lists_allowed,
            "error must list the show-side allowed names: {msg}"
        );
    }

    /// On the compare side (second argument `true`),
    /// `parse_columns` must refuse the show-only name `value`,
    /// and the diagnostic must cite it.
    #[test]
    fn parse_columns_rejects_show_only_on_compare_side() {
        let offending = "value";
        let result = parse_columns(offending, true);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains(offending), "error must cite name: {msg}");
    }

    /// A name outside the column vocabulary is rejected, and
    /// the diagnostic cites the unknown token.
    #[test]
    fn parse_columns_rejects_unknown_name() {
        let unknown = "not_a_column";
        let result = parse_columns(unknown, true);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains(unknown), "error must cite name: {msg}",);
    }

    /// A spec that names the same column twice (`metric` here)
    /// is rejected, and the diagnostic says "duplicate".
    #[test]
    fn parse_columns_rejects_duplicate() {
        let spec = "metric,delta,metric";
        let result = parse_columns(spec, true);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");
        let mentions_duplicate = msg.contains("duplicate");
        assert!(
            mentions_duplicate,
            "error must mention duplicates: {msg}"
        );
    }

    /// Two adjacent commas leave an empty entry in the spec;
    /// `parse_columns` rejects it and the diagnostic says
    /// "empty".
    #[test]
    fn parse_columns_rejects_empty_entry() {
        let spec = "metric,,delta";
        let result = parse_columns(spec, true);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("empty"), "error must mention empty: {msg}");
    }

    /// An empty or whitespace-only `--columns` value parses
    /// successfully to an empty Vec; the caller then falls back
    /// to the format default.
    #[test]
    fn parse_columns_empty_returns_empty_vec() {
        assert!(parse_columns("", true).expect("empty parses").is_empty());
        assert!(
            parse_columns("   ", true)
                .expect("whitespace-only parses as empty")
                .is_empty()
        );
    }

    /// On the show side, `metric,value` parses to
    /// `[Column::Metric, Column::Value]`. Pins that the
    /// show-side path really accepts `value` rather than
    /// rejecting it as if it were compare-only.
    #[test]
    fn parse_columns_accepts_show_side_metric_value() {
        let expected = vec![Column::Metric, Column::Value];
        let cols = parse_columns("metric,value", false).expect("metric,value is show-side valid");
        assert_eq!(cols, expected);
    }

    /// On the compare side, `arrow` may not be combined with
    /// `baseline`, `candidate`, `delta`, or `%`: arrow fuses
    /// those into a single cell, so pairing them would render
    /// the same data twice. Each pairing must be rejected with
    /// a diagnostic that names `arrow` and says "mutually
    /// exclusive".
    #[test]
    fn parse_columns_rejects_arrow_with_fused_columns() {
        let fused_names = ["baseline", "candidate", "delta", "%"];
        for fused in &fused_names {
            let spec = format!("arrow,{fused}");
            let err = match parse_columns(&spec, true) {
                Err(err) => err,
                Ok(_) => panic!("arrow+{fused} must be rejected"),
            };
            let msg = format!("{err:#}");
            let names_constraint = msg.contains("arrow") && msg.contains("mutually exclusive");
            assert!(
                names_constraint,
                "error must name arrow's mutual exclusivity for spec {spec:?}: {msg}"
            );
        }
    }

    // ------------------------------------------------------------
    // parse_sections / Section / DisplayOptions::is_section_enabled
    // ------------------------------------------------------------

    /// An empty or whitespace-only `--sections` value parses
    /// successfully to an empty `Vec`. The caller reads that as
    /// "all sections render" through
    /// [`DisplayOptions::is_section_enabled`]'s empty-input
    /// short-circuit. Mirror of
    /// [`parse_columns_empty_returns_empty_vec`].
    #[test]
    fn parse_sections_empty_returns_empty_vec() {
        assert!(parse_sections("").expect("empty parses").is_empty());
        assert!(
            parse_sections("   ")
                .expect("whitespace-only parses as empty")
                .is_empty()
        );
    }

    /// Joins the [`Section::cli_name`] of every [`Section::ALL`]
    /// entry with commas and feeds the result to
    /// [`parse_sections`]; the parse must return `ALL` again, in
    /// the same order. A `cli_name` the parser does not
    /// recognise shows up here as a parse failure.
    #[test]
    fn parse_sections_round_trips_every_name() {
        // Comma-joined spec, built entry by entry.
        let mut spec = String::new();
        for (idx, section) in Section::ALL.iter().enumerate() {
            if idx > 0 {
                spec.push(',');
            }
            spec.push_str(section.cli_name());
        }

        let parsed = parse_sections(&spec).expect("every cli_name must round-trip");
        let expected = Section::ALL.to_vec();
        assert_eq!(
            parsed, expected,
            "round-trip must preserve order and identity"
        );
    }

    /// An unknown section name is rejected with a diagnostic
    /// that cites the offending token and lists valid names, so
    /// the operator can recover without reading the source. The
    /// listing is sampled via `primary` and `host-pressure`
    /// rather than checked exhaustively.
    #[test]
    fn parse_sections_rejects_unknown_name() {
        let unknown = "not_a_section";
        let result = parse_sections(unknown);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");

        let cites_name = msg.contains(unknown);
        assert!(cites_name, "error must cite the offending name: {msg}");

        // Two sampled valid names: renaming either cli_name
        // surfaces here too.
        let lists_primary = msg.contains("primary");
        assert!(lists_primary, "error must list valid names: {msg}");
        let lists_host_pressure = msg.contains("host-pressure");
        assert!(lists_host_pressure, "error must list valid names: {msg}");
    }

    /// A spec that lists the same section twice (`primary`
    /// here) is rejected with a diagnostic that says
    /// "duplicate". A repeated section carries no extra
    /// information and signals a typo.
    #[test]
    fn parse_sections_rejects_duplicate() {
        let spec = "primary,derived,primary";
        let result = parse_sections(spec);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");
        let mentions_duplicate = msg.contains("duplicate");
        assert!(
            mentions_duplicate,
            "error must mention duplicates: {msg}"
        );
    }

    /// `primary,,derived` has an empty slot between the commas
    /// and must be rejected with a diagnostic that says
    /// "empty". Mirrors `parse_columns_rejects_empty_entry`:
    /// flagging the typo at parse time beats silently dropping
    /// the slot.
    #[test]
    fn parse_sections_rejects_empty_entry() {
        let spec = "primary,,derived";
        let result = parse_sections(spec);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("empty"), "error must mention empty: {msg}");
    }

    /// Several distinct names parse in the order the operator
    /// typed them: `derived,primary,host-pressure` must come
    /// back as Derived, Primary, HostPressure, not re-sorted
    /// into [`Section::ALL`] order.
    #[test]
    fn parse_sections_accepts_multiple_in_input_order() {
        let expected = vec![Section::Derived, Section::Primary, Section::HostPressure];
        let secs =
            parse_sections("derived,primary,host-pressure").expect("multi-section spec parses");
        assert_eq!(secs, expected, "input order must be preserved",);
    }

    /// Leading and trailing whitespace around each entry is
    /// ignored: `"  primary , derived  "` must parse to the
    /// same sections as `primary,derived`. Pins the trim step
    /// in the parser.
    #[test]
    fn parse_sections_trims_whitespace_around_entries() {
        let expected = vec![Section::Primary, Section::Derived];
        let secs =
            parse_sections("  primary , derived  ").expect("whitespace-tolerant spec parses");
        assert_eq!(secs, expected);
    }

    /// Consistency checks over [`Section::ALL`]. Every entry's
    /// `cli_name` must be unique (enforced by the `BTreeSet`
    /// insert below) and must parse on its own back to exactly
    /// that entry via `parse_sections`. This catches a
    /// duplicate or colliding name in `ALL` and a name the
    /// parser does not recognise. The loop only visits what
    /// `ALL` lists, so a variant missing from `ALL` is not
    /// caught by this test.
    #[test]
    fn section_all_is_exhaustive_and_unique() {
        let mut names = std::collections::BTreeSet::<&'static str>::new();
        for s in Section::ALL.iter() {
            let name = s.cli_name();

            // `insert` returns false when the name is already
            // present, i.e. two entries share a `cli_name`.
            let newly_inserted = names.insert(name);
            assert!(
                newly_inserted,
                "duplicate cli_name in Section::ALL: {}",
                name
            );

            // Each name must also parse individually, back to
            // this exact entry; a name the parser does not
            // recognise surfaces as a parse failure.
            match parse_sections(name) {
                Ok(parsed) => assert_eq!(parsed, vec![*s]),
                Err(e) => panic!("cli_name {} failed parse: {e:#}", s.cli_name()),
            }
        }

        let unique_count = names.len();
        assert_eq!(
            unique_count,
            Section::ALL.len(),
            "ALL count must match the unique-names count",
        );
    }

    /// With default [`DisplayOptions`] (no section filter),
    /// `is_section_enabled` must return true for every entry in
    /// [`Section::ALL`]. Guards the empty-filter short-circuit
    /// against a regression that flips "empty" to mean "nothing
    /// enabled".
    #[test]
    fn is_section_enabled_empty_treats_all_as_on() {
        let opts: DisplayOptions = Default::default();
        for s in Section::ALL {
            let enabled = opts.is_section_enabled(*s);
            assert!(
                enabled,
                "empty filter must enable {} (default = all-on)",
                s.cli_name()
            );
        }
    }

    /// With `sections` set to `[Primary, HostPressure]`,
    /// `is_section_enabled` must return true for exactly those
    /// two and false for every other entry in [`Section::ALL`].
    /// Pins the membership check behind a populated filter.
    #[test]
    fn is_section_enabled_non_empty_restricts_to_listed() {
        let opts = {
            let mut filtered = DisplayOptions::default();
            filtered.sections = vec![Section::Primary, Section::HostPressure];
            filtered
        };
        for s in Section::ALL {
            let in_filter = matches!(s, Section::Primary | Section::HostPressure);
            let enabled = opts.is_section_enabled(*s);
            assert_eq!(
                enabled,
                in_filter,
                "is_section_enabled({}) under {{Primary, HostPressure}} \
                 must be {in_filter}",
                s.cli_name(),
            );
        }
    }

    /// [`Section::requires_cgroup_grouping`] must be true for
    /// exactly five sections (`CgroupStats`, `Limits`,
    /// `MemoryStat`, `MemoryEvents`, `Pressure`), the ones
    /// behind the `GroupBy::Cgroup` outer gate, and false for
    /// every other entry in [`Section::ALL`]. A new
    /// cgroup-gated section has to be added to the pattern
    /// below.
    #[test]
    fn section_requires_cgroup_grouping_classifies_correctly() {
        for s in Section::ALL {
            let expected = matches!(
                s,
                Section::CgroupStats
                    | Section::Limits
                    | Section::MemoryStat
                    | Section::MemoryEvents
                    | Section::Pressure
            );
            let actual = s.requires_cgroup_grouping();
            assert_eq!(
                actual, expected,
                "Section::{s:?}.requires_cgroup_grouping() must be {expected}",
            );
        }
    }

    // ------------------------------------------------------------
    // parse_metrics + is_metric_enabled tests
    //
    // Mirror the parse_sections / is_section_enabled coverage —
    // the row-level filter is structurally analogous to the
    // section filter, so the test shapes match.
    // ------------------------------------------------------------

    /// An empty or whitespace-only `--metrics` value parses
    /// successfully to an empty Vec. The caller reads that as
    /// "every metric renders" through
    /// [`DisplayOptions::is_metric_enabled`]'s empty-input
    /// short-circuit.
    #[test]
    fn parse_metrics_empty_returns_empty_vec() {
        let from_empty = parse_metrics("").expect("empty parses");
        assert!(from_empty.is_empty());
        let from_blank = parse_metrics("   ").expect("whitespace-only parses as empty");
        assert!(from_blank.is_empty());
    }

    /// Each `name` in `CTPROF_METRICS`, passed to
    /// `parse_metrics` on its own, must parse to a one-element
    /// Vec holding that same name. Per the original note, the
    /// parser's linear scan accepts any registry `name`, so this
    /// is a sanity rail rather than a drift detector.
    #[test]
    fn parse_metrics_round_trips_every_primary_registry_name() {
        for m in CTPROF_METRICS {
            let parsed = match parse_metrics(m.name) {
                Ok(names) => names,
                Err(e) => panic!("metric name {} failed parse: {e:#}", m.name),
            };
            let expected = vec![m.name];
            assert_eq!(parsed, expected);
        }
    }

    /// Each `name` in `CTPROF_DERIVED_METRICS`, passed to
    /// `parse_metrics` on its own, must parse to a one-element
    /// Vec holding that same name. Together with the primary
    /// round-trip test this pins that the parser looks names up
    /// in both registries.
    #[test]
    fn parse_metrics_round_trips_every_derived_registry_name() {
        for d in CTPROF_DERIVED_METRICS {
            let parsed = match parse_metrics(d.name) {
                Ok(names) => names,
                Err(e) => panic!("derived name {} failed parse: {e:#}", d.name),
            };
            let expected = vec![d.name];
            assert_eq!(parsed, expected);
        }
    }

    /// A spec that mixes a derived metric (`cpu_efficiency`)
    /// with a primary one (`run_time_ns`) parses to both names
    /// in the order given. Pins that the parser neither
    /// segregates by registry nor reorders, matching the
    /// input-order contract of `parse_sections`.
    #[test]
    fn parse_metrics_accepts_primary_and_derived_in_input_order() {
        // Derived name first, primary second: an ordering that
        // grouping primaries ahead of derived would not keep.
        let expected = ["cpu_efficiency", "run_time_ns"];
        let parsed = parse_metrics("cpu_efficiency,run_time_ns")
            .expect("mixed primary+derived spec must parse");
        assert_eq!(parsed.len(), expected.len());
        assert_eq!(parsed[0], expected[0]);
        assert_eq!(parsed[1], expected[1]);
    }

    /// An unknown metric name is rejected with a diagnostic
    /// that cites the offending token and mentions
    /// `metric-list`, the discovery command the operator can
    /// run to see valid names.
    #[test]
    fn parse_metrics_rejects_unknown_name() {
        let unknown = "not_a_real_metric";
        let result = parse_metrics(unknown);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");

        let cites_name = msg.contains(unknown);
        assert!(cites_name, "error must cite the offending name: {msg}");

        let points_at_listing = msg.contains("metric-list");
        assert!(
            points_at_listing,
            "error must point operator at the discovery command: {msg}"
        );
    }

    /// A spec that names the same metric twice (`run_time_ns`
    /// here) is rejected, and the diagnostic says "duplicate".
    #[test]
    fn parse_metrics_rejects_duplicate() {
        let spec = "run_time_ns,wait_sum,run_time_ns";
        let result = parse_metrics(spec);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");
        let mentions_duplicate = msg.contains("duplicate");
        assert!(
            mentions_duplicate,
            "error must mention duplicates: {msg}"
        );
    }

    /// `run_time_ns,,wait_sum` has an empty slot between the
    /// commas and must be rejected with a diagnostic that says
    /// "empty".
    #[test]
    fn parse_metrics_rejects_empty_entry() {
        let spec = "run_time_ns,,wait_sum";
        let result = parse_metrics(spec);
        let err = result.unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("empty"), "error must mention empty: {msg}");
    }

    /// Leading and trailing whitespace around each entry is
    /// ignored: `"  run_time_ns , wait_sum  "` parses to the two
    /// bare metric names.
    #[test]
    fn parse_metrics_trims_whitespace_around_entries() {
        let expected = vec!["run_time_ns", "wait_sum"];
        let parsed =
            parse_metrics("  run_time_ns , wait_sum  ").expect("whitespace-tolerant spec parses");
        assert_eq!(parsed, expected);
    }

    /// With default [`DisplayOptions`] (no metric filter),
    /// `is_metric_enabled` must return true for any name.
    /// Guards the empty-filter short-circuit against a
    /// regression that flips "empty" to mean "nothing enabled".
    #[test]
    fn is_metric_enabled_empty_treats_all_as_on() {
        let opts: DisplayOptions = Default::default();

        // One primary and one derived metric: both are on when
        // no filter is set.
        assert!(opts.is_metric_enabled("run_time_ns"));
        assert!(opts.is_metric_enabled("cpu_efficiency"));

        // A name in no registry is on as well. Validity is
        // enforced by parse_metrics at CLI parse time;
        // is_metric_enabled is only the render-time gate, so
        // the two compose to "filter restricts only when
        // populated."
        assert!(opts.is_metric_enabled("anything_under_empty_filter"));
    }

    /// With `metrics` set to `["run_time_ns", "wait_sum"]`,
    /// `is_metric_enabled` must return true for those two names
    /// and false for names outside the list (`nr_wakeups`,
    /// `cpu_efficiency`). Pins the membership check behind a
    /// populated filter.
    #[test]
    fn is_metric_enabled_non_empty_restricts_to_listed() {
        let opts = {
            let mut filtered = DisplayOptions::default();
            filtered.metrics = vec!["run_time_ns", "wait_sum"];
            filtered
        };

        // Listed names stay on.
        assert!(opts.is_metric_enabled("run_time_ns"));
        assert!(opts.is_metric_enabled("wait_sum"));

        // Everything else is filtered out.
        assert!(!opts.is_metric_enabled("nr_wakeups"));
        assert!(!opts.is_metric_enabled("cpu_efficiency"));
    }

    /// The warning from [`format_cgroup_only_section_warning`]
    /// for `Section::Pressure` under `GroupBy::Pcomm` must
    /// carry three load-bearing pieces: the quoted section name
    /// (`'pressure'`), the `--group-by cgroup` requirement, and
    /// the operator's chosen axis (`pcomm`).
    #[test]
    fn format_cgroup_only_section_warning_names_all_three_elements() {
        let msg = format_cgroup_only_section_warning(Section::Pressure, GroupBy::Pcomm);

        let quotes_section = msg.contains("'pressure'");
        assert!(
            quotes_section,
            "warning must quote the section cli_name: {msg}",
        );

        let names_requirement = msg.contains("--group-by cgroup");
        assert!(
            names_requirement,
            "warning must name the cgroup requirement: {msg}",
        );

        let echoes_axis = msg.contains("pcomm");
        assert!(
            echoes_axis,
            "warning must echo the operator's --group-by axis: {msg}",
        );
    }

    /// For `GroupBy::CommExact` the warning must spell the axis
    /// `comm-exact`, the form the operator typed at the CLI, and
    /// must not contain the Rust variant name `CommExact`. A
    /// regression that stringified the variant through `Debug`
    /// instead of [`group_by_cli_name`]'s hyphenated mapping
    /// would surface `CommExact` and fail here.
    #[test]
    fn format_cgroup_only_section_warning_uses_comm_exact_spelling() {
        let msg = format_cgroup_only_section_warning(Section::CgroupStats, GroupBy::CommExact);

        let has_cli_spelling = msg.contains("comm-exact");
        assert!(
            has_cli_spelling,
            "warning must use the clap value-enum spelling: {msg}",
        );

        let leaks_variant_name = msg.contains("CommExact");
        assert!(
            !leaks_variant_name,
            "warning must not surface the rust variant name: {msg}",
        );
    }

    /// A non-empty `columns` list takes precedence over the
    /// column set implied by `format`: with `format` = `Full`
    /// and `columns` = `[Metric, Delta]`,
    /// `resolved_compare_columns` returns just those two.
    #[test]
    fn columns_override_wins_over_display_format() {
        let explicit = vec![Column::Metric, Column::Delta];
        let opts = {
            let mut configured = DisplayOptions::default();
            configured.format = DisplayFormat::Full;
            configured.columns = vec![Column::Metric, Column::Delta];
            configured
        };
        assert_eq!(opts.resolved_compare_columns(), explicit);
    }

    /// End-to-end `DisplayFormat::DeltaOnly`: the first line of
    /// the rendered diff (the table header) must not mention
    /// `baseline` or `candidate`, and must still carry `delta`.
    #[test]
    fn write_diff_delta_only_omits_baseline_candidate_columns() {
        let (baseline, candidate) = snap_pair_for_display();
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        let display = {
            let mut opts = DisplayOptions::default();
            opts.format = DisplayFormat::DeltaOnly;
            opts
        };

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();

        // Only the header row is inspected; empty output
        // degrades to an empty header line.
        let header_line = out.lines().next().unwrap_or_default();
        let has_baseline = header_line.contains("baseline");
        assert!(
            !has_baseline,
            "delta-only header must drop baseline column:\n{header_line}"
        );
        let has_candidate = header_line.contains("candidate");
        assert!(
            !has_candidate,
            "delta-only header must drop candidate column:\n{header_line}"
        );
        let has_delta = header_line.contains("delta");
        assert!(has_delta, "delta column must remain:\n{header_line}");
    }

    /// `DisplayFormat::NoPct` drops the `%` column from the
    /// rendered header while keeping the `delta` column.
    ///
    /// The header is inspected token-by-token rather than by
    /// searching for the substring `" % "`. `%` is the last
    /// column of the full layout, so a renderer that trims
    /// trailing padding would leave it at end-of-line with no
    /// trailing space and the substring check would pass even
    /// though the column was still present. The `delta` check is
    /// a positive control: without it an empty header line would
    /// satisfy the negative check vacuously.
    #[test]
    fn write_diff_no_pct_omits_pct_column() {
        let (a, b) = snap_pair_for_display();
        let diff = compare(&a, &b, &CompareOptions::default());
        let mut display = DisplayOptions::default();
        display.format = DisplayFormat::NoPct;
        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();
        let header_line = out.lines().next().unwrap_or("");
        // Positive control: the no-pct layout still includes the
        // delta column, so the header must be non-empty and
        // carry it.
        assert!(
            header_line.contains("delta"),
            "no-pct header must keep delta column:\n{header_line}"
        );
        // `%` is the literal column name; it must not appear as
        // a stand-alone whitespace-delimited token anywhere in
        // the header, including at the start or end of the line.
        assert!(
            !header_line.split_whitespace().any(|token| token == "%"),
            "no-pct header must drop percent column:\n{header_line}"
        );
    }

    /// `DisplayFormat::Arrow` fuses baseline → candidate (delta)
    /// into one cell. With the display fixture
    /// (`run_time_ns` 100 → 200) the output must contain the
    /// arrow glyph U+2192, both endpoint values, and a
    /// parenthesized positive delta.
    #[test]
    fn write_diff_arrow_renders_combined_cell() {
        let (baseline, candidate) = snap_pair_for_display();
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        let display = {
            let mut opts = DisplayOptions::default();
            opts.format = DisplayFormat::Arrow;
            opts
        };

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();

        // Per the original note, values below 1000 stay in ns on
        // the auto-scale ladder, so the run_time_ns row should
        // read `100ns -> 200ns (+100ns)` with U+2192 as the
        // arrow.
        let has_arrow = out.contains("\u{2192}");
        assert!(has_arrow, "arrow glyph must appear in output:\n{out}");

        let has_endpoints = out.contains("100ns") && out.contains("200ns");
        assert!(
            has_endpoints,
            "baseline and candidate values must surface in arrow cell:\n{out}"
        );

        // The second alternative is a prefix match, so the exact
        // unit suffix after `+100` is not pinned.
        let has_delta = out.contains("(+100ns)") || out.contains("(+100");
        assert!(has_delta, "delta must appear in parens:\n{out}");
    }

    /// `DisplayFormat::Arrow` for derived rows: the derived
    /// `avg_wait_ns` row must appear in the arrow rendering
    /// with both of its endpoint values. With the display
    /// fixture, `wait_sum` / `wait_count` is 1000/4 = 250 on the
    /// baseline side and 2000/4 = 500 on the candidate side;
    /// each value is accepted with or without the two-decimal
    /// rendering.
    #[test]
    fn write_diff_arrow_renders_derived_arrow_cell() {
        let (a, b) = snap_pair_for_display();
        let diff = compare(&a, &b, &CompareOptions::default());
        let mut display = DisplayOptions::default();
        display.format = DisplayFormat::Arrow;
        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();
        // The avg_wait_ns derived row shows up. wait_sum 1000/4
        // = 250.00ns baseline; 2000/4 = 500.00ns candidate.
        assert!(
            out.contains("avg_wait_ns"),
            "derived metric must appear in arrow rendering:\n{out}"
        );
        // Both endpoints of the arrow must be rendered, not
        // just the baseline.
        assert!(
            out.contains("250.00ns") || out.contains("250ns"),
            "baseline derived value must appear in arrow cell:\n{out}"
        );
        assert!(
            out.contains("500.00ns") || out.contains("500ns"),
            "candidate derived value must appear in arrow cell:\n{out}"
        );
    }

    /// `DisplayFormat::PctOnly` leaves `%` as the only data
    /// column: the header must not mention `baseline`,
    /// `candidate`, or `delta`, and the percent cell for the
    /// fixture's `run_time_ns` (100 → 200) must render as
    /// `+100.0%`.
    #[test]
    fn write_diff_pct_only_keeps_only_pct() {
        let (baseline, candidate) = snap_pair_for_display();
        let diff = compare(&baseline, &candidate, &CompareOptions::default());
        let display = {
            let mut opts = DisplayOptions::default();
            opts.format = DisplayFormat::PctOnly;
            opts
        };

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();

        let header_line = out.lines().next().unwrap_or_default();
        let has_baseline = header_line.contains("baseline");
        assert!(
            !has_baseline,
            "pct-only header must drop baseline:\n{header_line}"
        );
        let has_candidate = header_line.contains("candidate");
        assert!(
            !has_candidate,
            "pct-only header must drop candidate:\n{header_line}"
        );
        let has_delta = header_line.contains("delta");
        assert!(
            !has_delta,
            "pct-only header must drop delta:\n{header_line}"
        );

        // The `%` header is a bare glyph and hard to match
        // unambiguously in a wide table, so the data cell is
        // pinned instead: 100 → 200 is +100%, rendered
        // `+100.0%`.
        let has_pct_cell = out.contains("+100.0%");
        assert!(has_pct_cell, "pct-only must render percent cell:\n{out}",);
    }

    /// An explicit `--columns metric,delta` selection wins over
    /// `--display-format full`: the header carries the two
    /// selected column labels and omits the unselected
    /// baseline / candidate labels.
    #[test]
    fn write_diff_columns_override_emits_only_selected_columns() {
        let (before, after) = snap_pair_for_display();
        let diff = compare(&before, &after, &CompareOptions::default());
        let display = DisplayOptions {
            // Full would normally emit 7 columns.
            format: DisplayFormat::Full,
            columns: vec![Column::Metric, Column::Delta],
            ..DisplayOptions::default()
        };
        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();
        let header_line = out.lines().next().unwrap_or_default();
        for selected in ["metric", "delta"] {
            assert!(
                header_line.contains(selected),
                "{selected} column must appear:\n{header_line}"
            );
        }
        for excluded in ["baseline", "candidate"] {
            assert!(
                !header_line.contains(excluded),
                "{excluded} must NOT appear when --columns excludes it:\n{header_line}"
            );
        }
    }

    // ------------------------------------------------------------
    // Derived metrics
    // ------------------------------------------------------------

    /// `affine_success_ratio` = nr_wakeups_affine /
    /// nr_wakeups_affine_attempts. A deterministic 7/10 input
    /// pins the quotient at 0.7 on both sides and confirms the
    /// row is flagged as a ratio.
    #[test]
    fn derived_affine_success_ratio_formula() {
        let mut thread = make_thread("p", "w");
        thread.nr_wakeups_affine = MonotonicCount(7);
        thread.nr_wakeups_affine_attempts = MonotonicCount(10);
        // Same thread on both sides: baseline and candidate
        // must agree.
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let Some(row) = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "affine_success_ratio")
        else {
            panic!("affine_success_ratio row present");
        };
        let expected = Some(DerivedValue::Scalar(0.7));
        assert_eq!(row.baseline, expected);
        assert_eq!(row.candidate, expected);
        assert!(row.is_ratio, "affine_success_ratio is a ratio");
    }

    /// `avg_wait_ns` = wait_sum / wait_count. Pinned on
    /// 1000ns across 4 events, which must yield 250ns.
    #[test]
    fn derived_avg_wait_ns_formula() {
        let mut thread = make_thread("p", "w");
        thread.wait_sum = MonotonicNs(1000);
        thread.wait_count = MonotonicCount(4);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let Some(row) = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "avg_wait_ns")
        else {
            panic!("avg_wait_ns row present");
        };
        assert_eq!(row.baseline, Some(DerivedValue::Scalar(250.0)));
    }

    /// `voluntary_sleep_ns` is a first-class capture field:
    /// per the field's history, the normalization
    /// (`sum_sleep_runtime - sum_block_runtime`) is done at
    /// capture time inside `capture_thread_at_with_tally`, and
    /// the derived metric of the same shape was removed. On the
    /// compare/show side the field is aggregated like any other
    /// Sum metric, so a single thread carrying 1000ns must
    /// surface as 1000 in the primary `rows` of the diff.
    #[test]
    fn voluntary_sleep_ns_sums_through_registry() {
        let mut thread = make_thread("p", "w");
        thread.voluntary_sleep_ns = MonotonicNs(1000);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        // Looked up in the primary rows, not the derived rows.
        let Some(row) = diff
            .rows
            .iter()
            .find(|r| r.metric_name == "voluntary_sleep_ns")
        else {
            panic!("voluntary_sleep_ns row in diff");
        };
        let observed = row.baseline.numeric();
        assert_eq!(
            observed,
            Some(1000.0),
            "voluntary_sleep_ns flows through SumNs aggregation \
             carrying the capture-side normalized value verbatim",
        );
    }

    /// The derived registry must not contain
    /// `voluntary_sleep_sum` — the capture-side
    /// `voluntary_sleep_ns` field replaced it. Pinning the
    /// absence makes an accidental re-introduction fail here.
    #[test]
    fn voluntary_sleep_sum_derived_metric_is_removed() {
        // Collected into an ordered set so the failure message
        // lists the registered names deterministically.
        let mut names: std::collections::BTreeSet<&'static str> =
            std::collections::BTreeSet::new();
        for metric in CTPROF_DERIVED_METRICS.iter() {
            names.insert(metric.name);
        }
        let resurrected = names.contains("voluntary_sleep_sum");
        assert!(
            !resurrected,
            "voluntary_sleep_sum derived metric must not exist — \
             the normalization moved to capture-side \
             `voluntary_sleep_ns` (see ThreadState field doc). \
             Got derived metrics: {names:?}",
        );
    }

    /// `cpu_efficiency` = run / (run + wait). Equal run and
    /// wait times (100 each) pin the ratio at 0.5, and the row
    /// must be flagged as a ratio.
    #[test]
    fn derived_cpu_efficiency_formula() {
        let mut thread = make_thread("p", "w");
        thread.run_time_ns = MonotonicNs(100);
        thread.wait_time_ns = MonotonicNs(100);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let row = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "cpu_efficiency")
            .unwrap_or_else(|| panic!("cpu_efficiency row present"));
        assert_eq!(row.baseline, Some(DerivedValue::Scalar(0.5)));
        assert!(row.is_ratio);
    }

    /// `avg_slice_ns` = run_time_ns / timeslices. Pinned on
    /// 4000ns over 8 timeslices = 500ns.
    #[test]
    fn derived_avg_slice_ns_formula() {
        let mut thread = make_thread("p", "w");
        thread.run_time_ns = MonotonicNs(4000);
        thread.timeslices = MonotonicCount(8);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let Some(row) = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "avg_slice_ns")
        else {
            panic!("avg_slice_ns row present");
        };
        assert_eq!(row.baseline, Some(DerivedValue::Scalar(500.0)));
    }

    /// `involuntary_csw_ratio` = nvcsw / (vcsw + nvcsw).
    /// Pinned on 25 involuntary out of 100 total switches =
    /// 0.25, flagged as a ratio.
    #[test]
    fn derived_involuntary_csw_ratio_formula() {
        let mut thread = make_thread("p", "w");
        thread.voluntary_csw = MonotonicCount(75);
        thread.nonvoluntary_csw = MonotonicCount(25);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let row = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "involuntary_csw_ratio")
            .unwrap_or_else(|| panic!("involuntary_csw_ratio row present"));
        assert_eq!(row.baseline, Some(DerivedValue::Scalar(0.25)));
        assert!(row.is_ratio);
    }

    /// `disk_io_fraction` = read_bytes / rchar. Pinned on
    /// 2500 / 10000 = 0.25, flagged as a ratio.
    #[test]
    fn derived_disk_io_fraction_formula() {
        let mut thread = make_thread("p", "w");
        thread.rchar = Bytes(10_000);
        thread.read_bytes = Bytes(2_500);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let Some(row) = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "disk_io_fraction")
        else {
            panic!("disk_io_fraction row present");
        };
        assert_eq!(row.baseline, Some(DerivedValue::Scalar(0.25)));
        assert!(row.is_ratio);
    }

    /// `live_heap_estimate` = allocated - deallocated, and the
    /// result is signed: 1000 allocated against 1500
    /// deallocated must come out as -500 (drained), on a row
    /// that is not flagged as a ratio.
    #[test]
    fn derived_live_heap_estimate_signed() {
        let mut thread = make_thread("p", "w");
        thread.allocated_bytes = Bytes(1000);
        thread.deallocated_bytes = Bytes(1500);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let row = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "live_heap_estimate")
            .unwrap_or_else(|| panic!("live_heap_estimate row present"));
        assert_eq!(row.baseline, Some(DerivedValue::Scalar(-500.0)));
        assert!(!row.is_ratio, "live_heap_estimate is a B-unit, not ratio");
    }

    /// `avg_iowait_ns` = iowait_sum / iowait_count. Pinned on
    /// 9000ns over 3 events = 3000ns.
    #[test]
    fn derived_avg_iowait_ns_formula() {
        let mut thread = make_thread("p", "w");
        thread.iowait_sum = MonotonicNs(9000);
        thread.iowait_count = MonotonicCount(3);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let Some(row) = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "avg_iowait_ns")
        else {
            panic!("avg_iowait_ns row present");
        };
        assert_eq!(row.baseline, Some(DerivedValue::Scalar(3000.0)));
    }

    /// Each per-category `avg_<bucket>_delay_ns` row must equal
    /// that bucket's `total / count`. A single thread carries a
    /// different (count, total) pair in every bucket, so
    /// swapping numerator and denominator, or reading another
    /// bucket's count, produces a quotient that fails the
    /// equality check instead of coinciding by accident.
    #[test]
    fn derived_avg_delay_ns_formulas_match_manual_division() {
        let mut thread = make_thread("p", "w");
        // Counts run 3..=10 and the expected averages run
        // 3000..=17000 in steps of 2000, so no two buckets share
        // a count, a total, or a quotient.
        thread.cpu_delay_count = MonotonicCount(3);
        thread.cpu_delay_total_ns = MonotonicNs(9_000);
        thread.blkio_delay_count = MonotonicCount(4);
        thread.blkio_delay_total_ns = MonotonicNs(20_000);
        thread.swapin_delay_count = MonotonicCount(5);
        thread.swapin_delay_total_ns = MonotonicNs(35_000);
        thread.freepages_delay_count = MonotonicCount(6);
        thread.freepages_delay_total_ns = MonotonicNs(54_000);
        thread.thrashing_delay_count = MonotonicCount(7);
        thread.thrashing_delay_total_ns = MonotonicNs(77_000);
        thread.compact_delay_count = MonotonicCount(8);
        thread.compact_delay_total_ns = MonotonicNs(104_000);
        thread.wpcopy_delay_count = MonotonicCount(9);
        thread.wpcopy_delay_total_ns = MonotonicNs(135_000);
        thread.irq_delay_count = MonotonicCount(10);
        thread.irq_delay_total_ns = MonotonicNs(170_000);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        // (derived metric name, expected total / count)
        let expectations = [
            ("avg_cpu_delay_ns", 3_000.0),
            ("avg_blkio_delay_ns", 5_000.0),
            ("avg_swapin_delay_ns", 7_000.0),
            ("avg_freepages_delay_ns", 9_000.0),
            ("avg_thrashing_delay_ns", 11_000.0),
            ("avg_compact_delay_ns", 13_000.0),
            ("avg_wpcopy_delay_ns", 15_000.0),
            ("avg_irq_delay_ns", 17_000.0),
        ];
        for (name, expected) in expectations {
            let Some(row) = diff.derived_rows.iter().find(|r| r.metric_name == name) else {
                panic!("{name} row present");
            };
            assert_eq!(
                row.baseline,
                Some(DerivedValue::Scalar(expected)),
                "{name} formula mismatch — expected {expected}",
            );
        }
    }

    /// `total_offcpu_delay_ns` adds up every bucket but folds
    /// the (swapin, thrashing) pair through `.max()` rather
    /// than adding both. The two table rows pin the `.max()`
    /// in each direction:
    ///
    /// (a) swapin > thrashing → swapin contributes.
    /// (b) thrashing > swapin → thrashing contributes.
    ///
    /// A regression that summed swapin + thrashing would
    /// double-count the overlap and overshoot the expected
    /// rollup by `min(swapin, thrashing)` in both cases.
    #[test]
    fn derived_total_offcpu_delay_ns_sums_with_max_overlap() {
        // The six non-overlapping buckets are fixed at
        // 10 + 20 + 30 + 40 + 60 + 70 = 230 in both cases.
        // (case tag, relation label, swapin, thrashing, expected)
        //   a: 230 + max(200, 50)  = 430
        //   b: 230 + max(75, 300)  = 530
        let cases = [
            ("a", "swapin>thrashing", 200, 50, 430.0),
            ("b", "thrashing>swapin", 75, 300, 530.0),
        ];
        for (tag, relation, swapin, thrashing, expected) in cases {
            let mut thread = make_thread("p", "w");
            thread.cpu_delay_total_ns = MonotonicNs(10);
            thread.blkio_delay_total_ns = MonotonicNs(20);
            thread.swapin_delay_total_ns = MonotonicNs(swapin);
            thread.freepages_delay_total_ns = MonotonicNs(30);
            thread.thrashing_delay_total_ns = MonotonicNs(thrashing);
            thread.compact_delay_total_ns = MonotonicNs(40);
            thread.wpcopy_delay_total_ns = MonotonicNs(60);
            thread.irq_delay_total_ns = MonotonicNs(70);
            let before = snap_with(vec![thread.clone()]);
            let after = snap_with(vec![thread]);
            let diff = compare(&before, &after, &CompareOptions::default());
            let row = diff
                .derived_rows
                .iter()
                .find(|r| r.metric_name == "total_offcpu_delay_ns")
                .unwrap_or_else(|| panic!("total_offcpu_delay_ns row present (case {tag})"));
            assert_eq!(
                row.baseline,
                Some(DerivedValue::Scalar(expected)),
                "case ({tag}) {relation}: expected {expected}, got {:?}",
                row.baseline,
            );
        }
    }

    /// The `compute` entry of every `avg_<bucket>_delay_ns`
    /// definition must yield `None` when one of its inputs is
    /// absent from the metrics map. The definitions are taken
    /// straight from `CTPROF_DERIVED_METRICS` and invoked on a
    /// partial `BTreeMap` holding only the numerator, with no
    /// denominator entry. The expected outcome is `None` —
    /// not a panic and not `Some(NaN)`.
    ///
    /// `total_offcpu_delay_ns` is held to the same rule: every
    /// input must be present, and dropping any one yields
    /// `None`. The all-inputs-present-but-zero case is covered
    /// by `derived_division_by_zero_returns_none`.
    #[test]
    fn derived_avg_delay_ns_returns_none_on_missing_input() {
        // (derived metric, the only input placed in the map)
        let numerator_only = [
            ("avg_cpu_delay_ns", "cpu_delay_total_ns"),
            ("avg_blkio_delay_ns", "blkio_delay_total_ns"),
            ("avg_swapin_delay_ns", "swapin_delay_total_ns"),
            ("avg_freepages_delay_ns", "freepages_delay_total_ns"),
            ("avg_thrashing_delay_ns", "thrashing_delay_total_ns"),
            ("avg_compact_delay_ns", "compact_delay_total_ns"),
            ("avg_wpcopy_delay_ns", "wpcopy_delay_total_ns"),
            ("avg_irq_delay_ns", "irq_delay_total_ns"),
        ];
        for (name, numerator) in numerator_only {
            let mut metrics: BTreeMap<String, Aggregated> = BTreeMap::new();
            metrics.insert(numerator.to_string(), Aggregated::Sum(123));
            let def = CTPROF_DERIVED_METRICS
                .iter()
                .find(|d| d.name == name)
                .unwrap_or_else(|| panic!("{name} present in registry"));
            let computed = (def.compute)(&metrics);
            assert!(
                computed.is_none(),
                "{name}: compute must return None when denominator is \
                 missing from metrics map (only {numerator} present)",
            );
        }

        // total_offcpu_delay_ns: every input except
        // `compact_delay_total_ns`, which is INTENTIONALLY
        // OMITTED from the list below.
        let partial: BTreeMap<String, Aggregated> = [
            "cpu_delay_total_ns",
            "blkio_delay_total_ns",
            "swapin_delay_total_ns",
            "freepages_delay_total_ns",
            "thrashing_delay_total_ns",
            "wpcopy_delay_total_ns",
            "irq_delay_total_ns",
        ]
        .iter()
        .map(|input| (input.to_string(), Aggregated::Sum(100)))
        .collect();
        let name = "total_offcpu_delay_ns";
        let total_def = CTPROF_DERIVED_METRICS
            .iter()
            .find(|d| d.name == name)
            .unwrap_or_else(|| panic!("{name} present in registry"));
        let total = (total_def.compute)(&partial);
        assert!(
            total.is_none(),
            "total_offcpu_delay_ns: compute must return None when ANY \
             input is missing — exercised here with compact_delay_total_ns \
             omitted from the metrics map",
        );
    }

    /// A zero denominator in any quotient-style derivation
    /// must surface as `None` — never NaN and never zero — so
    /// the rendered cell can show `-` to the operator. The
    /// sum-style `total_offcpu_delay_ns` is the deliberate
    /// exception: all-zero inputs give `Some(0.0)`.
    #[test]
    fn derived_division_by_zero_returns_none() {
        let mut thread = make_thread("p", "w");
        // affine_attempts == 0 → ratio is None
        thread.nr_wakeups_affine = MonotonicCount(0);
        thread.nr_wakeups_affine_attempts = MonotonicCount(0);
        // wait_count == 0 → avg_wait_ns is None
        thread.wait_sum = MonotonicNs(0);
        thread.wait_count = MonotonicCount(0);
        // run + wait == 0 → cpu_efficiency is None
        thread.run_time_ns = MonotonicNs(0);
        thread.wait_time_ns = MonotonicNs(0);
        // timeslices == 0 → avg_slice_ns is None
        thread.timeslices = MonotonicCount(0);
        // vcsw + nvcsw == 0 → involuntary_csw_ratio is None
        thread.voluntary_csw = MonotonicCount(0);
        thread.nonvoluntary_csw = MonotonicCount(0);
        // rchar == 0 → disk_io_fraction is None
        thread.rchar = Bytes(0);
        thread.read_bytes = Bytes(0);
        // iowait_count == 0 → avg_iowait_ns is None
        thread.iowait_sum = MonotonicNs(0);
        thread.iowait_count = MonotonicCount(0);
        // The eight taskstats avg_*_delay_ns buckets are not
        // assigned here: they default to MonotonicCount(0) /
        // MonotonicNs(0) via `..ThreadState::default()`, so the
        // assertion loop below is the load-bearing check. They
        // follow the same ratio_compute pattern as avg_wait_ns /
        // avg_iowait_ns, hence the same division-by-zero
        // contract.
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let quotient_metrics = [
            "affine_success_ratio",
            "avg_wait_ns",
            "cpu_efficiency",
            "avg_slice_ns",
            "involuntary_csw_ratio",
            "disk_io_fraction",
            "avg_iowait_ns",
            "avg_cpu_delay_ns",
            "avg_blkio_delay_ns",
            "avg_swapin_delay_ns",
            "avg_freepages_delay_ns",
            "avg_thrashing_delay_ns",
            "avg_compact_delay_ns",
            "avg_wpcopy_delay_ns",
            "avg_irq_delay_ns",
        ];
        for name in quotient_metrics {
            let Some(row) = diff.derived_rows.iter().find(|r| r.metric_name == name) else {
                panic!("{name} row present");
            };
            assert!(
                row.baseline.is_none(),
                "{name} divides by zero — baseline must be None, got {:?}",
                row.baseline
            );
            assert!(
                row.delta.is_none(),
                "{name} delta must be None when inputs are zero"
            );
        }

        // total_offcpu_delay_ns is a SUM, not a quotient. With
        // every input present and zero it evaluates cleanly to
        // `Some(Scalar(0.0))`. A genuine zero is a real signal
        // (the task accumulated no off-CPU delay in any bucket,
        // e.g. an idle-since-fork bookkeeping thread); folding
        // it into None would conflate "no delay observed" with
        // "missing input". The missing-input case lives in
        // `derived_avg_delay_ns_returns_none_on_missing_input`.
        let Some(total_row) = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "total_offcpu_delay_ns")
        else {
            panic!("total_offcpu_delay_ns row present");
        };
        assert_eq!(
            total_row.baseline,
            Some(DerivedValue::Scalar(0.0)),
            "total_offcpu_delay_ns with all-zero inputs must be \
             Some(0.0), not None — genuine zero is meaningful for a sum",
        );
    }

    /// Ratio rows carry an absolute delta and no `delta_pct`.
    /// The design call recorded here: the % column shows `-`
    /// for ratios, because 0.5 → 0.6 reads as +0.100 absolute
    /// = +10pp, whereas the fraction +0.2 = +20% of baseline
    /// is misleading. This test checks the row fields that
    /// feed that rendering; it does not render.
    #[test]
    fn write_diff_derived_ratio_suppresses_pct() {
        let mut before_thread = make_thread("p", "w");
        before_thread.nr_wakeups_affine = MonotonicCount(50);
        before_thread.nr_wakeups_affine_attempts = MonotonicCount(100); // ratio = 0.5
        let mut after_thread = make_thread("p", "w");
        after_thread.nr_wakeups_affine = MonotonicCount(60);
        after_thread.nr_wakeups_affine_attempts = MonotonicCount(100); // ratio = 0.6
        let before = snap_with(vec![before_thread]);
        let after = snap_with(vec![after_thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let Some(row) = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "affine_success_ratio")
        else {
            panic!("affine_success_ratio present");
        };
        let Some(delta) = row.delta else {
            panic!("delta present when both sides defined");
        };
        // 0.6 - 0.5 is not exactly 0.1 in f64, hence the
        // tolerance instead of equality.
        assert!(
            (delta - 0.1).abs() < 1e-10,
            "expected delta ~0.1 (0.6 - 0.5 in f64), got {delta}",
        );
        assert!(
            row.delta_pct.is_none(),
            "ratio row must suppress delta_pct, got {:?}",
            row.delta_pct
        );
    }

    /// Non-ratio (ns/B) derivations keep `delta_pct` populated.
    /// avg_wait_ns moving 100ns → 150ns pins delta = 50 and
    /// delta_pct = 0.5 (delta as a fraction of baseline).
    #[test]
    fn write_diff_derived_ns_keeps_pct() {
        let mut a = make_thread("p", "w");
        a.wait_sum = MonotonicNs(1000);
        a.wait_count = MonotonicCount(10); // avg = 100ns
        let mut b = make_thread("p", "w");
        b.wait_sum = MonotonicNs(1500);
        b.wait_count = MonotonicCount(10); // avg = 150ns
        let diff = compare(
            &snap_with(vec![a]),
            &snap_with(vec![b]),
            &CompareOptions::default(),
        );
        let row = diff
            .derived_rows
            .iter()
            .find(|r| r.metric_name == "avg_wait_ns")
            .expect("avg_wait_ns present");
        assert_eq!(row.baseline, Some(DerivedValue::Scalar(100.0)));
        assert_eq!(row.candidate, Some(DerivedValue::Scalar(150.0)));
        assert_eq!(row.delta, Some(50.0));
        // delta_pct = 50/100 = 0.5. A single `expect` replaces
        // the former `is_some()` + `unwrap()` pair so a missing
        // value fails with a message that says what went wrong.
        let pct = row
            .delta_pct
            .expect("non-ratio derived row must keep delta_pct populated");
        assert!(
            (pct - 0.5).abs() < 1e-9,
            "expected delta_pct ~0.5, got {pct}"
        );
    }

    /// Render integration: the output of `write_diff` must
    /// contain the `## Derived metrics` section header and a
    /// representative derived row (`avg_slice_ns`).
    #[test]
    fn write_diff_emits_derived_section() {
        let mut thread = make_thread("p", "w");
        thread.run_time_ns = MonotonicNs(1000);
        thread.timeslices = MonotonicCount(4);
        let before = snap_with(vec![thread.clone()]);
        let after = snap_with(vec![thread]);
        let diff = compare(&before, &after, &CompareOptions::default());
        let display = DisplayOptions::default();
        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();
        // (substring that must be rendered, failure headline)
        let required = [
            ("## Derived metrics", "missing derived section header"),
            ("avg_slice_ns", "missing avg_slice_ns row in derived section"),
        ];
        for (needle, why) in required {
            assert!(out.contains(needle), "{why}:\n{out}");
        }
    }

    /// `--sort-by` accepts derived metric names. Parsing the
    /// bare name `cpu_efficiency` must succeed and produce
    /// exactly one sort key for that metric, marked
    /// descending. Only the parse result is checked here; no
    /// groups are actually sorted.
    #[test]
    fn parse_sort_by_accepts_derived_metric_name() {
        let keys = parse_sort_by("cpu_efficiency").expect("derived name parses");
        assert_eq!(keys.len(), 1);
        assert_eq!(keys[0].metric, "cpu_efficiency");
        assert!(keys[0].descending);
    }

    /// An unknown `--sort-by` name is rejected, and the error
    /// text is expected to hint at valid names including
    /// derived ones. The check passes if any one of three
    /// known derived names shows up in the message.
    #[test]
    fn parse_sort_by_unknown_lists_derived_names() {
        let err = parse_sort_by("not_a_real_metric").unwrap_err();
        // `{:#}` formatting is used for the error text.
        let msg = format!("{err:#}");
        let mentions_derived = ["affine_success_ratio", "cpu_efficiency", "avg_wait_ns"]
            .iter()
            .any(|name| msg.contains(*name));
        assert!(
            mentions_derived,
            "error must list derived metric names alongside primary; got: {msg}",
        );
    }

    /// The primary and derived metric namespaces must not
    /// overlap: no derived metric may reuse a primary registry
    /// name. A future addition that collides fails here, with
    /// the offending name in the message.
    #[test]
    fn registry_and_derived_names_disjoint() {
        for d in CTPROF_DERIVED_METRICS.iter() {
            let shadows_primary = CTPROF_METRICS.iter().any(|m| m.name == d.name);
            assert!(
                !shadows_primary,
                "derived metric {} shadows primary registry name",
                d.name,
            );
        }
    }

    /// Every derived metric must have a non-empty description,
    /// a non-empty inputs list, and inputs that all name real
    /// primary registry metrics. Guards against a future
    /// addition that leaves a field blank or cites a metric
    /// that does not exist.
    #[test]
    fn registry_derived_metrics_well_formed() {
        // Built once up front: the set of primary names does
        // not depend on which derived metric is being checked.
        let primary: std::collections::BTreeSet<&str> =
            CTPROF_METRICS.iter().map(|m| m.name).collect();
        for d in CTPROF_DERIVED_METRICS {
            assert!(
                !d.description.is_empty(),
                "derived metric {} has empty description",
                d.name,
            );
            assert!(
                !d.inputs.is_empty(),
                "derived metric {} has empty inputs list",
                d.name,
            );
            // Every input must be a real registered metric name.
            for input in d.inputs {
                assert!(
                    primary.contains(input),
                    "derived metric {} cites unknown input {input}",
                    d.name,
                );
            }
        }
    }

    /// `metric-list` emits the `## Derived metrics` section and
    /// mentions every registered derivation. The check iterates
    /// the registry, so a newly added derived metric is covered
    /// automatically. This is a containment check on the
    /// rendered text (each name must appear somewhere in the
    /// output), not a set-equality check: extra names in the
    /// output would not be detected.
    #[test]
    fn write_metric_list_emits_derived_section() {
        let mut out = String::new();
        write_metric_list(&mut out).unwrap();
        assert!(
            out.contains("## Derived metrics"),
            "metric-list must emit a Derived metrics header:\n{out}",
        );
        for d in CTPROF_DERIVED_METRICS {
            assert!(
                out.contains(d.name),
                "derived metric {} missing from metric-list:\n{out}",
                d.name,
            );
        }
    }

    /// `metric-list` emits a `## Sections` heading and mentions
    /// the CLI name of every `Section` variant. This is the
    /// discovery companion to the `--sections` flag: an
    /// operator reading the rendered output should find the
    /// full `--sections` vocabulary without opening the
    /// source. Every `cli_name` from `Section::ALL` is pinned.
    #[test]
    fn write_metric_list_emits_sections_vocabulary() {
        let mut out = String::new();
        write_metric_list(&mut out).unwrap();
        let has_heading = out.contains("## Sections");
        assert!(
            has_heading,
            "metric-list must emit the Sections vocabulary heading:\n{out}",
        );
        for section in Section::ALL {
            let cli_name = section.cli_name();
            assert!(
                out.contains(cli_name),
                "section cli_name {cli_name} missing from Sections \
                 vocabulary table:\n{out}",
            );
        }
    }

    /// Layout order of the rendered metric list: the
    /// `## Sections` heading must come before the `## Metrics`
    /// heading. A refactor that moves Sections after Metrics,
    /// or drops either heading, fails here.
    #[test]
    fn write_metric_list_sections_precedes_metrics() {
        let mut out = String::new();
        write_metric_list(&mut out).unwrap();
        // Byte offsets of the first occurrence of each heading.
        let Some(sections_at) = out.find("## Sections") else {
            panic!("Sections heading must be present");
        };
        let Some(metrics_at) = out.find("## Metrics") else {
            panic!("Metrics heading must be present");
        };
        assert!(
            sections_at < metrics_at,
            "Sections heading must precede Metrics heading; \
             got Sections@{sections_at} Metrics@{metrics_at}\n{out}",
        );
    }

    /// `format_derived_value_cell` renders a ratio with three
    /// decimals: 0.8735 is pinned to the cell text `0.874`.
    #[test]
    fn format_derived_value_cell_ratio_three_decimals() {
        let rendered =
            format_derived_value_cell(DerivedValue::Scalar(0.873_5), ScaleLadder::None, true);
        assert_eq!(rendered, "0.874");
    }

    /// `format_derived_value_cell` on the ns ladder scales a
    /// large value up to ms: 2.5e6 ns is pinned to `2.500ms`.
    #[test]
    fn format_derived_value_cell_ns_auto_scales() {
        let rendered =
            format_derived_value_cell(DerivedValue::Scalar(2_500_000.0), ScaleLadder::Ns, false);
        assert_eq!(rendered, "2.500ms");
    }

    /// Sub-threshold derived averages keep their fractional
    /// digits. Example: avg_wait_ns = 1234 ns / 10 events =
    /// 123.4 ns, which must render as `123.40ns` (two decimals)
    /// rather than being rounded to a bare `123ns` that hides
    /// the post-decimal signal from the operator.
    #[test]
    fn format_derived_value_cell_ns_preserves_fractional_precision() {
        let avg_wait = DerivedValue::Scalar(123.4);
        let rendered = format_derived_value_cell(avg_wait, ScaleLadder::Ns, false);
        assert_eq!(rendered, "123.40ns");
    }

    /// A negative byte value (e.g. a live_heap_estimate that
    /// went below zero) keeps its explicit minus sign when
    /// `format_derived_value_cell` scales it to KiB.
    #[test]
    fn format_derived_value_cell_negative_bytes_signed() {
        // Two KiB expressed in bytes, then negated.
        let magnitude = 2.0 * 1024.0;
        let rendered = format_derived_value_cell(
            DerivedValue::Scalar(-magnitude),
            ScaleLadder::Bytes,
            false,
        );
        assert_eq!(rendered, "-2.000KiB");
    }

    /// Positive deltas from `format_derived_delta_cell` carry an
    /// explicit leading `+` (mirroring format_delta_cell). The
    /// pinned case is a ratio delta of +0.100, i.e. +10pp.
    #[test]
    fn format_derived_delta_cell_ratio_carries_sign() {
        let ten_points = 0.1;
        let rendered = format_derived_delta_cell(ten_points, ScaleLadder::None, true);
        assert_eq!(rendered, "+0.100");
    }

    /// `live_heap_estimate` can be negative when deallocations
    /// dominate, so the sign has to survive every rung of the
    /// byte ladder. The KiB rung is pinned by a sibling test;
    /// this one exercises the MiB rung so a regression that
    /// loses the sign only at a higher step still fails.
    #[test]
    fn format_derived_value_cell_negative_bytes_at_mib_step() {
        // |-2_000_000| lies in [1 MiB = 1_048_576, 1 GiB =
        // 1_073_741_824), so the MiB step applies:
        // -2_000_000 / 1_048_576 ≈ -1.907.
        let negative_heap = DerivedValue::Scalar(-2_000_000.0);
        let rendered = format_derived_value_cell(negative_heap, ScaleLadder::Bytes, false);
        assert_eq!(rendered, "-1.907MiB");
    }

    /// Ratio-shaped metrics are not capped at 1.0.
    /// `disk_io_fraction` is `is_ratio: true` for rendering
    /// purposes (three decimals, no `%` column, no auto-scale),
    /// yet readahead can pull more block-device bytes than the
    /// syscall asked for, so `read_bytes / rchar` may exceed 1.
    /// The renderer must print such a value as-is with three
    /// decimals: no clamp, no truncation, no exponent.
    #[test]
    fn format_derived_value_cell_ratio_above_one_renders_verbatim() {
        let over_unity = DerivedValue::Scalar(1.5);
        let rendered = format_derived_value_cell(over_unity, ScaleLadder::None, true);
        assert_eq!(rendered, "1.500");
    }

    /// `--sort-by avg_wait_ns` ranks groups by the derived
    /// metric's delta. End-to-end pin: two pcomm buckets with
    /// distinct avg_wait_ns deltas; a descending sort must place
    /// the bucket with the larger delta first in
    /// `diff.derived_rows`.
    #[test]
    fn write_diff_sort_by_derived_metric_ranks_groups() {
        // Builds one thread in bucket `pcomm` that recorded a
        // single wait event totalling `wait_sum`.
        let waiter = |pcomm: &'static str, wait_sum: MonotonicNs| {
            let mut t = make_thread("p", "w");
            t.pcomm = pcomm.to_string();
            t.wait_sum = wait_sum;
            t.wait_count = MonotonicCount(1);
            t
        };
        // bucket "high": avg_wait grew from 100ns to 300ns (+200ns)
        // bucket "low": avg_wait grew from 100ns to 150ns (+50ns)
        // Descending sort puts "high" first.
        let high_a = waiter("high", MonotonicNs(100));
        let high_b = waiter("high", MonotonicNs(300));
        let low_a = waiter("low", MonotonicNs(100));
        let low_b = waiter("low", MonotonicNs(150));

        let opts = CompareOptions {
            sort_by: vec![SortKey {
                metric: "avg_wait_ns",
                descending: true,
            }],
            ..CompareOptions::default()
        };
        let diff = compare(
            &snap_with(vec![high_a, low_a]),
            &snap_with(vec![high_b, low_b]),
            &opts,
        );

        // The first derived row (post-sort) belongs to the group
        // with the largest avg_wait_ns delta. `first()` turns an
        // empty result into a descriptive failure instead of an
        // index-out-of-bounds panic.
        let first = diff
            .derived_rows
            .first()
            .expect("compare must emit at least one derived row");
        assert_eq!(
            first.group_key, "high",
            "descending sort by avg_wait_ns must put `high` first; \
             got {:?}",
            first.group_key,
        );
    }

    /// `write_metric_list` emits the tag legend section with
    /// every closed-set tag value documented. Ties the legend
    /// content to the closed-set vocabulary the registry pin
    /// guards (`registry_tag_vocabulary_is_closed`); a future
    /// allowed-class or allowed-gate addition that doesn't
    /// extend the legend (and this table) fails this test.
    ///
    /// The checks are table-driven, matching the header loop in
    /// `write_diff_emits_expected_column_headers`; the failure
    /// message for each tag is unchanged.
    #[test]
    fn write_metric_list_emits_full_tag_legend() {
        let mut out = String::new();
        write_metric_list(&mut out).unwrap();

        let legend_tags = [
            // sched_class vocabulary
            "[cfs-only]",
            "[non-ext]",
            "[fair-policy]",
            // config_gates vocabulary (compact form)
            "[SCHED_INFO]",
            "[SCHEDSTATS]",
            "[SCHED_CORE]",
            "[SCHED_CLASS_EXT]",
            "[TASK_DELAY_ACCT]",
            "[TASK_IO_ACCOUNTING]",
            "[TASKSTATS]",
            "[TASK_XACCT]",
            // status vocabulary
            "[dead]",
        ];
        for tag in legend_tags {
            assert!(out.contains(tag), "missing {tag} in legend:\n{out}");
        }

        // Section headers: (literal heading, label used in the
        // failure message).
        let section_headers = [("## Tag legend", "Tag legend"), ("## Metrics", "Metrics")];
        for (heading, label) in section_headers {
            assert!(
                out.contains(heading),
                "missing {label} section header:\n{out}",
            );
        }
    }

    /// `write_metric_list` covers every metric in the registry:
    /// for each entry of `CTPROF_METRICS`, both its name and its
    /// description must appear as a substring of the rendered
    /// output. This is a containment check, not set-equality, so
    /// extra text in the output does not fail it.
    #[test]
    fn write_metric_list_covers_every_registered_metric() {
        let mut out = String::new();
        write_metric_list(&mut out).unwrap();
        for m in CTPROF_METRICS {
            let metric = m.name;

            let name_listed = out.contains(metric);
            assert!(
                name_listed,
                "metric {} missing from metric-list output:\n{out}",
                metric,
            );

            let description_listed = out.contains(m.description);
            assert!(
                description_listed,
                "description for {} missing from metric-list output:\n{out}",
                metric,
            );
        }
    }

    /// Tags live in their own column of the metric list; the
    /// metric name must not be glued in front of them. The
    /// representative case is `nr_wakeups_affine`, expected to
    /// carry `[cfs-only] [SCHEDSTATS]`: the bare tag pair has to
    /// be present in the output, while the display form
    /// `nr_wakeups_affine [cfs-only] ...` must be absent (its
    /// presence would mean the name leaked into the tags cell).
    #[test]
    fn write_metric_list_tags_column_excludes_metric_name() {
        let mut out = String::new();
        write_metric_list(&mut out).unwrap();

        let has_bare_pair = out.contains("[cfs-only] [SCHEDSTATS]");
        assert!(
            has_bare_pair,
            "expected bare tag pair `[cfs-only] [SCHEDSTATS]` in tags column:\n{out}",
        );

        let name_leaked = out.contains("nr_wakeups_affine [cfs-only]");
        assert!(
            !name_leaked,
            "metric name must not leak into tags column:\n{out}",
        );
    }

    /// Registry hygiene: each metric's description must be
    /// non-empty and free of leading/trailing whitespace. An
    /// empty description would leave a blank cell in the
    /// `metric-list` discovery output, defeating its purpose.
    #[test]
    fn registry_descriptions_are_non_empty() {
        for m in CTPROF_METRICS {
            let has_text = !m.description.is_empty();
            assert!(has_text, "metric {} has empty description", m.name);

            // The table cell carries the description verbatim,
            // so surrounding whitespace would show up in the
            // rendered output.
            let trimmed = m.description.trim();
            assert_eq!(
                trimmed, m.description,
                "metric {} description has leading/trailing whitespace",
                m.name,
            );
        }
    }

    /// The Mode rule breaks ties deterministically: when two
    /// values share the top count, the lexicographically smaller
    /// one is reported. Pinned so rendered output is
    /// reproducible across runs.
    #[test]
    fn mode_rule_tie_break_is_lexicographic() {
        // One thread per policy, so both policies tie at count 1.
        let with_policy = |comm: &'static str, policy: &'static str| {
            let mut t = make_thread("app", comm);
            t.policy = policy.into();
            t
        };
        let a = with_policy("w1", "SCHED_FIFO");
        let b = with_policy("w2", "SCHED_OTHER");

        let v = aggregate(AggRule::Mode(|t| t.policy.clone()), &[&a, &b]);
        let (value, count) = match v {
            Aggregated::Mode { value, count, .. } => (value, count),
            other => panic!("expected Mode, got {other:?}"),
        };
        assert_eq!(value, "SCHED_FIFO");
        assert_eq!(count, 1);
    }

    /// Aggregating affinity over zero threads yields
    /// `min_cpus == max_cpus == 0` with no uniform cpuset. The
    /// compare engine cannot produce an empty group today; this
    /// guards against an upstream refactor that starts allowing
    /// one.
    #[test]
    fn affinity_aggregate_on_empty_threads_is_zero() {
        let empty = Vec::<&ThreadState>::new();
        let v = aggregate(AggRule::Affinity(|t| t.cpu_affinity.clone()), &empty);
        let s = match v {
            Aggregated::Affinity(s) => s,
            other => panic!("expected Affinity, got {other:?}"),
        };
        assert_eq!(s.min_cpus, 0);
        assert_eq!(s.max_cpus, 0);
        assert!(s.uniform.is_none());
    }

    /// Display of an ordinal range prints a single number when
    /// `min == max` and `min..max` otherwise. Guards against
    /// `nice=0` single-thread groups rendering as `0..0`.
    #[test]
    fn ordinal_display_collapses_degenerate_range() {
        // (min, max, expected display text)
        let cases = [(0, 0, "0"), (-5, 10, "-5..10")];
        for (min, max, expected) in cases {
            let range = Aggregated::OrdinalRange { min, max };
            assert_eq!(range.to_string(), expected);
        }
    }

    /// Display of a Mode aggregate appends a `(count/total)`
    /// suffix only when the mode is not unanimous; with
    /// `count == total` the bare value is printed, keeping the
    /// table compact for homogeneous groups.
    #[test]
    fn mode_display_hides_ratio_when_unanimous() {
        // (count, total, expected display text)
        let cases = [(4, 4, "SCHED_OTHER"), (3, 5, "SCHED_OTHER (3/5)")];
        for (count, total, expected) in cases {
            let m = Aggregated::Mode {
                value: "SCHED_OTHER".into(),
                count,
                total,
            };
            assert_eq!(m.to_string(), expected);
        }
    }

    // -- write_diff: output rendering --

    /// Rendering the diff of two identical single-thread
    /// snapshots under `GroupBy::Pcomm` must still print every
    /// column header of the main table.
    #[test]
    fn write_diff_emits_expected_column_headers() {
        let baseline = snap_with(vec![make_thread("p", "w")]);
        let candidate = snap_with(vec![make_thread("p", "w")]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());

        let display = DisplayOptions::default();
        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();

        let expected_headers = [
            "pcomm",
            "threads",
            "metric",
            "baseline",
            "candidate",
            "delta",
            "%",
        ];
        for h in expected_headers {
            assert!(out.contains(h), "missing header {h}:\n{out}");
        }
    }

    /// The group column header follows the `GroupBy` axis: an
    /// empty diff rendered under `Cgroup` mentions `cgroup`, and
    /// under `Comm` it mentions `comm` without any `pcomm`.
    #[test]
    fn write_diff_header_switches_on_group_by() {
        let empty = CtprofDiff::default();
        // Renders the empty diff under the given grouping axis
        // and hands back the text.
        let render = |group_by: GroupBy| {
            let mut text = String::new();
            write_diff(
                &mut text,
                &empty,
                Path::new("a"),
                Path::new("b"),
                group_by,
                &DisplayOptions::default(),
            )
            .unwrap();
            text
        };

        let out = render(GroupBy::Cgroup);
        assert!(out.contains("cgroup"));

        let out = render(GroupBy::Comm);
        assert!(out.contains("comm"));
        // "comm" must render as the column header, not as a
        // substring of "pcomm" left over from the Pcomm variant.
        assert!(!out.contains("pcomm"));
    }

    /// A diff whose `only_baseline` list is non-empty renders an
    /// "only in baseline" section that names the group and the
    /// baseline snapshot path.
    #[test]
    fn write_diff_prints_only_baseline_section() {
        let only_baseline = vec!["missing_proc".into()];
        let diff = CtprofDiff {
            only_baseline,
            ..CtprofDiff::default()
        };
        let baseline_path = Path::new("/tmp/a.ctprof.zst");
        let candidate_path = Path::new("/tmp/b.ctprof.zst");

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            baseline_path,
            candidate_path,
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        assert!(out.contains("only in baseline"));
        assert!(out.contains("missing_proc"));
        assert!(out.contains("/tmp/a.ctprof.zst"));
    }

    /// A diff whose `only_candidate` list is non-empty renders
    /// an "only in candidate" section that names the group and
    /// the candidate snapshot path.
    #[test]
    fn write_diff_prints_only_candidate_section() {
        let only_candidate = vec!["new_proc".into()];
        let diff = CtprofDiff {
            only_candidate,
            ..CtprofDiff::default()
        };
        let baseline_path = Path::new("/tmp/a.ctprof.zst");
        let candidate_path = Path::new("/tmp/b.ctprof.zst");

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            baseline_path,
            candidate_path,
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        assert!(out.contains("only in candidate"));
        assert!(out.contains("new_proc"));
        assert!(out.contains("/tmp/b.ctprof.zst"));
    }

    /// Under `GroupBy::Cgroup` the renderer emits the cgroup
    /// enrichment table, with each cell formatted as a
    /// contiguous `baseline → candidate (delta)` triple.
    #[test]
    fn write_diff_cgroup_enrichment_section_for_cgroup_mode() {
        // Same cgroup key on both sides. Judging by the
        // assertions below, the first helper argument feeds
        // cpu_usage_usec and the last feeds memory_current.
        let mut diff = CtprofDiff::default();
        let baseline_stats = simple_cgroup_stats(10, 0, 0, 100);
        diff.cgroup_stats_a.insert("/app".into(), baseline_stats);
        let candidate_stats = simple_cgroup_stats(50, 0, 0, 200);
        diff.cgroup_stats_b.insert("/app".into(), candidate_stats);

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Cgroup,
            &DisplayOptions::default(),
        )
        .unwrap();

        let has_header = out.contains("cpu_usage_usec");
        assert!(has_header, "missing enrichment header:\n{out}");

        // 10 µs and 50 µs sit below the 1000-µs step to ms, so
        // they keep the base unit, and so does the +40 delta.
        // Matching the whole contiguous triple (instead of three
        // bare integers) catches a drifting cell: a bare
        // `out.contains("10")` would pass even if the µs cell
        // were dropped, since "10" occurs elsewhere in the
        // surrounding output.
        let has_cpu_triple = out.contains("10µs → 50µs (+40µs)");
        assert!(
            has_cpu_triple,
            "missing contiguous scaled triple `10µs → 50µs (+40µs)`:\n{out}",
        );

        // memory_current went 100 → 200 bytes, both below the
        // 1024-byte step to KiB, so they render as bare bytes
        // with the `B` unit. This covers the byte family's
        // no-step-up path.
        let has_mem_triple = out.contains("100B → 200B (+100B)");
        assert!(
            has_mem_triple,
            "missing contiguous scaled triple `100B → 200B (+100B)`:\n{out}",
        );
    }

    /// The cgroup enrichment table is specific to cgroup
    /// grouping: under `GroupBy::Pcomm` it must not be rendered
    /// even when the diff carries cgroup stats.
    #[test]
    fn write_diff_enrichment_section_absent_when_group_by_pcomm() {
        // Enrichment data is present on the baseline side; the
        // renderer is expected to ignore it for this axis.
        let mut diff = CtprofDiff::default();
        let stats = simple_cgroup_stats(10, 0, 0, 0);
        diff.cgroup_stats_a.insert("/app".into(), stats);

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        let leaked = out.contains("cpu_usage_usec");
        assert!(!leaked, "enrichment leaked:\n{out}");
    }

    /// A shrinking metric renders with an explicit minus sign in
    /// both the delta cell and the percent cell.
    #[test]
    fn write_diff_delta_cell_has_plus_minus_sign() {
        // run_time_ns drops from 100 ns to 50 ns.
        let with_run_time = |run_time_ns: MonotonicNs| {
            let mut t = make_thread("app", "w");
            t.run_time_ns = run_time_ns;
            t
        };
        let ta = with_run_time(MonotonicNs(100));
        let tb = with_run_time(MonotonicNs(50));
        let baseline = snap_with(vec![ta]);
        let candidate = snap_with(vec![tb]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        // 50 - 100 = -50 ns: an integer delta below the µs
        // threshold, rendered as a bare signed integer via
        // `format_delta_cell`'s short-circuit (no `.000` noise).
        let has_delta = out.contains("-50ns");
        assert!(has_delta, "missing signed delta with unit:\n{out}");
        let has_pct = out.contains("-50.0%");
        assert!(has_pct, "missing signed pct:\n{out}");
    }

    /// A categorical metric whose value changes between the two
    /// snapshots (here: scheduling policy) is labelled `differs`
    /// in the rendered output.
    #[test]
    fn write_diff_categorical_delta_labels_same_or_differs() {
        let with_policy = |policy: &'static str| {
            let mut t = make_thread("app", "w");
            t.policy = policy.into();
            t
        };
        let ta = with_policy("SCHED_OTHER");
        let tb = with_policy("SCHED_FIFO");
        let baseline = snap_with(vec![ta]);
        let candidate = snap_with(vec![tb]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        let labelled = out.contains("differs");
        assert!(labelled, "missing 'differs' label:\n{out}");
    }

    /// End-to-end pass through the public loader: two snapshots
    /// are written with `CtprofSnapshot::write`, read back with
    /// `CtprofSnapshot::load`, compared, and rendered. This
    /// exercises serialization, the comparison engine, and the
    /// formatter together — the pieces `run_compare` composes in
    /// production.
    #[test]
    fn load_compare_render_pipeline_end_to_end() {
        // The pcomm is pure-alpha so [`pattern_key`] leaves it
        // unchanged: this test pins the basic round-trip, not
        // pcomm normalization. A pcomm with hex-eligible tokens
        // such as `e2e` would normalize to `{H}_proc` and hide
        // the round-trip assertion behind a separate
        // normalization codepath.
        let single_thread_snapshot =
            |run_time_ns: MonotonicNs, voluntary_csw: MonotonicCount, policy: &'static str| {
                let mut t = make_thread("etoe_proc", "thread_a");
                t.run_time_ns = run_time_ns;
                t.voluntary_csw = voluntary_csw;
                t.policy = policy.into();
                snap_with(vec![t])
            };
        let snap_a = single_thread_snapshot(MonotonicNs(1_000_000), MonotonicCount(10), "SCHED_OTHER");
        let snap_b = single_thread_snapshot(MonotonicNs(3_000_000), MonotonicCount(30), "SCHED_FIFO");

        // Persist both snapshots, then read them back through
        // the public loader.
        let file_a = tempfile::NamedTempFile::new().unwrap();
        let file_b = tempfile::NamedTempFile::new().unwrap();
        snap_a.write(file_a.path()).unwrap();
        snap_b.write(file_b.path()).unwrap();
        let loaded_a = CtprofSnapshot::load(file_a.path()).unwrap();
        let loaded_b = CtprofSnapshot::load(file_b.path()).unwrap();

        let diff = compare(&loaded_a, &loaded_b, &CompareOptions::default());
        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            file_a.path(),
            file_b.path(),
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        // Column headers present.
        assert!(out.contains("pcomm"));
        assert!(out.contains("metric"));
        // Group key made it through.
        assert!(out.contains("etoe_proc"));
        // run_time_ns delta is +2_000_000 ns, which `auto_scale`
        // renders as `+2.000ms` (the ns ladder steps up to ms at
        // 1e6).
        let has_run_time_delta = out.contains("+2.000ms");
        assert!(has_run_time_delta, "run_time delta missing in:\n{out}");
        // The policy row renders "differs" (SCHED_FIFO vs
        // SCHED_OTHER), exercising the non-numeric delta path.
        assert!(out.contains("differs"));
    }

    // -- comparison coverage expansion --

    /// Pins every presence combination of `cgroup_cell` using
    /// `ScaleLadder::Unitless` and values small enough that they
    /// render verbatim. Per-unit auto-scaling is covered
    /// separately by the `cgroup_cell_scales_*` tests.
    /// Higher-level tests reach only the (Some, Some) path, via
    /// `write_diff_cgroup_enrichment_section_for_cgroup_mode`;
    /// the baseline-only, candidate-only, and both-missing
    /// branches matter for one-sided enrichment rows (the
    /// `all_keys` union at the enrichment table site) and are
    /// pinned directly here.
    #[test]
    fn cgroup_cell_renders_all_four_branches() {
        let cases: [(Option<u64>, Option<u64>, &str); 5] = [
            // (Some, Some) → "a → b (+d)" where d = b - a (signed).
            (Some(10), Some(42), "10 → 42 (+32)"),
            // A negative delta keeps its sign explicit.
            (Some(50), Some(5), "50 → 5 (-45)"),
            // (Some, None) → baseline value, then a `-` placeholder.
            (Some(7), None, "7 → -"),
            // (None, Some) → leading `-` placeholder.
            (None, Some(99), "- → 99"),
            // (None, None) → a single `-` (both sides absent).
            (None, None, "-"),
        ];
        for (baseline, candidate, expected) in cases {
            assert_eq!(
                cgroup_cell(baseline, candidate, ScaleLadder::Unitless),
                expected
            );
        }
    }

    /// Pins every presence combination of `format_psi_avg_cell`,
    /// following the same discipline as
    /// [`cgroup_cell_renders_all_four_branches`] but for the
    /// centi-percent display path.
    #[test]
    fn format_psi_avg_cell_renders_all_four_branches() {
        let cases = [
            // (Some, Some): both halves render N.NN% with a
            // signed (+|-D.DD%) delta. 1859 centi-percent =
            // 18.59%, 2431 = 24.31%, delta = 5.72%.
            (Some(1859), Some(2431), "18.59% → 24.31% (+5.72%)"),
            // A negative delta carries an explicit minus sign.
            (Some(2431), Some(1859), "24.31% → 18.59% (-5.72%)"),
            // (Some, None) → baseline value, then a `-` placeholder.
            (Some(750), None, "7.50% → -"),
            // (None, Some) → leading `-` placeholder.
            (None, Some(50), "- → 0.50%"),
            // (None, None) → a single `-` (both sides absent).
            (None, None, "-"),
        ];
        for (baseline, candidate, expected) in cases {
            assert_eq!(format_psi_avg_cell(baseline, candidate), expected);
        }
    }

    /// `format_psi_avg_centi_percent` prints a centi-percent
    /// integer as a percentage with exactly two fractional
    /// digits. The zero-padding boundary is pinned explicitly:
    /// `5` centi-percent must render as `0.05%`, not `0.5%`.
    #[test]
    fn format_psi_avg_centi_percent_zero_pads_fraction() {
        // (centi-percent input, expected text)
        let cases = [
            (0, "0.00%"),
            (5, "0.05%"),
            (50, "0.50%"),
            (100, "1.00%"),
            (101, "1.01%"),
            (10000, "100.00%"),
            // Kernel EWMA rounding ceiling
            // (include/linux/sched/loadavg.h:35).
            (10099, "100.99%"),
        ];
        for (centi_percent, expected) in cases {
            assert_eq!(format_psi_avg_centi_percent(centi_percent), expected);
        }
    }

    /// With a default (all-zero) `Psi` on both sides,
    /// `psi_pair_has_data` reports no data. This is the gate
    /// `write_diff` uses to suppress the host-pressure block.
    #[test]
    fn psi_pair_has_data_returns_false_when_both_sides_zero() {
        let zero: Psi = Default::default();
        assert!(!psi_pair_has_data(&zero, &zero));
    }

    /// A single non-zero avg field on either side of the pair is
    /// enough for `psi_pair_has_data` to report data.
    #[test]
    fn psi_pair_has_data_returns_true_when_one_side_nonzero() {
        let zero = Psi::default();
        let nonzero = {
            let mut psi = Psi::default();
            psi.cpu.some.avg10 = 1;
            psi
        };
        // Either order: the helper checks both sides.
        assert!(psi_pair_has_data(&zero, &nonzero));
        assert!(psi_pair_has_data(&nonzero, &zero));
    }

    /// Boundary case: a non-zero `total_usec` with every avg
    /// field still zero counts as "has data". The avg fields can
    /// lag on a low-pressure system that nevertheless
    /// accumulated cumulative stall time, so a check limited to
    /// avg10/60/300 would wrongly treat this pair as empty.
    #[test]
    fn psi_pair_has_data_detects_total_usec_only_data() {
        let zero = Psi::default();
        let total_only = {
            let mut psi = Psi::default();
            psi.io.full.total_usec = 1;
            psi
        };
        assert!(psi_pair_has_data(&zero, &total_only));
        assert!(psi_pair_has_data(&total_only, &zero));
    }

    /// Auto-scale on the cgroup_cell µs family: a cpu_usage_usec
    /// style row with raw values in the millions of microseconds
    /// renders with the `s` unit rather than 7-digit µs counts,
    /// and each scalar in the cell scales independently. The two
    /// cases pinned here are the seconds step and the unscaled
    /// sub-millisecond range.
    #[test]
    fn cgroup_cell_scales_microseconds_to_ms_or_s() {
        // 1_500_000 µs = 1.5 s; 3_000_000 µs = 3.0 s; delta 1.5 s.
        let seconds_cell = cgroup_cell(Some(1_500_000), Some(3_000_000), ScaleLadder::Us);
        assert_eq!(seconds_cell, "1.500s → 3.000s (+1.500s)");

        // Below the ms threshold nothing steps up, and the
        // integer delta renders as a bare integer.
        let micros_cell = cgroup_cell(Some(500), Some(900), ScaleLadder::Us);
        assert_eq!(micros_cell, "500µs → 900µs (+400µs)");
    }

    /// Auto-scale on the cgroup_cell byte family: a
    /// memory_current style row in the GiB range renders every
    /// scalar with the `GiB` unit, using the IEC binary divisor
    /// (1024) — the same one used for the per-thread
    /// allocated_bytes / read_bytes columns.
    #[test]
    fn cgroup_cell_scales_bytes_to_iec_prefix() {
        const GIB: u64 = 1024 * 1024 * 1024;
        let rendered = cgroup_cell(Some(GIB), Some(2 * GIB), ScaleLadder::Bytes);
        assert_eq!(rendered, "1.000GiB → 2.000GiB (+1.000GiB)");
    }

    /// Auto-scale on the dimensionless cgroup_cell column
    /// (`nr_throttled`): large counts pick up `K` / `M` / `G` SI
    /// prefixes. One case per ladder step, so flipping any of
    /// the thresholds (1e3 / 1e6 / 1e9) is caught here.
    #[test]
    fn cgroup_cell_scales_unitless_count_to_k_m_g() {
        let cases: [(u64, u64, &str); 3] = [
            // K step: 1e3..1e6 divides by 1e3 and gains `K`.
            (1_500, 2_500, "1.500K → 2.500K (+1.000K)"),
            // M step: 1e6..1e9 divides by 1e6 and gains `M`.
            (1_500_000, 2_500_000, "1.500M → 2.500M (+1.000M)"),
            // G step: >= 1e9 divides by 1e9 and gains `G`.
            (
                1_500_000_000,
                2_500_000_000,
                "1.500G → 2.500G (+1.000G)",
            ),
        ];
        for (baseline, candidate, expected) in cases {
            assert_eq!(
                cgroup_cell(Some(baseline), Some(candidate), ScaleLadder::Unitless),
                expected,
            );
        }
    }

    /// The enrichment renderer must take the union of
    /// `cgroup_stats_a` and `cgroup_stats_b` keys, so a cgroup
    /// seen in only one run still gets a row. This drives the
    /// one-sided paths of `cgroup_cell` through `write_diff` and
    /// checks for the `"X → -"` / `"- → Y"` cell strings.
    #[test]
    fn write_diff_enrichment_handles_one_sided_cgroup_keys() {
        let mut diff = CtprofDiff::default();
        let baseline_only = simple_cgroup_stats(111, 0, 0, 0);
        diff.cgroup_stats_a
            .insert("/only-baseline".into(), baseline_only);
        let candidate_only = simple_cgroup_stats(222, 0, 0, 0);
        diff.cgroup_stats_b
            .insert("/only-candidate".into(), candidate_only);

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Cgroup,
            &DisplayOptions::default(),
        )
        .unwrap();

        // Both keys must surface as rows.
        let has_baseline_key = out.contains("/only-baseline");
        assert!(has_baseline_key, "baseline-only key missing:\n{out}");
        let has_candidate_key = out.contains("/only-candidate");
        assert!(has_candidate_key, "candidate-only key missing:\n{out}");

        // A one-sided row shows the `-` placeholder for the
        // absent side (the Some/None branches of `cgroup_cell`).
        // cpu_usage_usec carries the "µs" unit; 111 µs is below
        // the ms threshold (1000), so it renders verbatim with
        // the base unit suffix.
        let has_baseline_cell = out.contains("111µs → -");
        assert!(
            has_baseline_cell,
            "baseline-only row missing '111µs → -' cell:\n{out}",
        );
        let has_candidate_cell = out.contains("- → 222µs");
        assert!(
            has_candidate_cell,
            "candidate-only row missing '- → 222µs' cell:\n{out}",
        );
    }

    /// Rows whose `sort_key()` is equal are ordered by ascending
    /// `group_key`. Two groups move the same metric by the same
    /// percentage (identical sort keys), and the resulting row
    /// order must be alphabetical.
    #[test]
    fn write_diff_stable_sort_tie_breaks_by_group_key_ascending() {
        let runner = |pcomm: &'static str, run_time_ns: MonotonicNs| {
            let mut t = make_thread(pcomm, "w");
            t.run_time_ns = run_time_ns;
            t
        };
        // Same percentage swing for distinct group keys "alpha"
        // and "bravo": both rise 1_000 → 2_000 (+100%).
        let baseline = snap_with(vec![
            runner("alpha", MonotonicNs(1_000)),
            runner("bravo", MonotonicNs(1_000)),
        ]);
        let candidate = snap_with(vec![
            runner("alpha", MonotonicNs(2_000)),
            runner("bravo", MonotonicNs(2_000)),
        ]);
        let diff = compare(&baseline, &candidate, &CompareOptions::default());

        // Keep only the run_time_ns rows of the two groups; the
        // tie-break must put "alpha" before "bravo".
        let mut run_rows: Vec<&DiffRow> = Vec::new();
        for row in &diff.rows {
            if row.metric_name == "run_time_ns" {
                run_rows.push(row);
            }
        }
        assert_eq!(run_rows.len(), 2);

        let both_doubled = run_rows
            .iter()
            .all(|r| (r.delta_pct.unwrap() - 1.0).abs() < 1e-9);
        assert!(
            both_doubled,
            "test fixture must produce identical delta_pct for both groups",
        );

        assert_eq!(
            run_rows[0].group_key, "alpha",
            "ascending group_key tie-break expected alpha first",
        );
        assert_eq!(run_rows[1].group_key, "bravo");
    }

    /// `sort_key` gives the zero-baseline / nonzero-candidate shape
    /// (`delta = Some`, `delta_pct = None`) a different key from a
    /// pure zero-delta row (`delta_pct = Some(0.0)`), so the
    /// zero-baseline row sorts FIRST. This test pins only the
    /// relative order of those two branches; how the zero-baseline
    /// key ranks against ordinary percentage rows is defined by
    /// `sort_key` itself and is not exercised here.
    #[test]
    fn sort_key_zero_delta_rows_sink_below_nonzero() {
        // Group "calm": identical values → delta 0, pct 0.0.
        let mut a1 = make_thread("calm", "w");
        a1.run_time_ns = MonotonicNs(500);
        let mut b1 = make_thread("calm", "w");
        b1.run_time_ns = MonotonicNs(500);
        // Group "birth": baseline left at the `make_thread` default,
        // candidate 100 → delta 100, pct undefined (None); the
        // assertions at the bottom pin that shape.
        let a2 = make_thread("birth", "w");
        let mut b2 = make_thread("birth", "w");
        b2.run_time_ns = MonotonicNs(100);
        let diff = compare(
            &snap_with(vec![a1, a2]),
            &snap_with(vec![b1, b2]),
            &CompareOptions::default(),
        );
        let run_rows: Vec<&DiffRow> = diff
            .rows
            .iter()
            .filter(|r| r.metric_name == "run_time_ns")
            .collect();
        // Guard the indexing below, matching the sibling tie-break
        // test: a missing or extra row should fail with a clear
        // message instead of an out-of-bounds panic.
        assert_eq!(
            run_rows.len(),
            2,
            "expected exactly one run_time_ns row per group (birth, calm)",
        );
        // "birth" row (zero-baseline branch of sort_key) sorts
        // ahead of "calm" (zero-delta branch).
        assert_eq!(run_rows[0].group_key, "birth");
        assert_eq!(run_rows[1].group_key, "calm");
        // Pin the exact shape each branch is meant to carry, so a
        // regression that swapped the two arms surfaces here with
        // a precise diagnostic rather than just "wrong order".
        assert_eq!(run_rows[0].delta, Some(100.0));
        assert!(run_rows[0].delta_pct.is_none());
        assert_eq!(run_rows[1].delta, Some(0.0));
        assert_eq!(run_rows[1].delta_pct, Some(0.0));
    }

    /// Categorical (Mode) rows carry no numeric delta and are
    /// expected to sink to the bottom of the sort (the
    /// `f64::NEG_INFINITY` arm of `sort_key`). Checks that a
    /// nonzero numeric row precedes a Mode row whose inputs
    /// differ, and that the Mode row is still present — it sinks,
    /// it is not dropped.
    #[test]
    fn sort_key_none_delta_rows_sink_to_bottom() {
        let mut baseline = make_thread("app", "w");
        baseline.run_time_ns = MonotonicNs(100);
        baseline.policy = "SCHED_OTHER".into();
        let mut candidate = make_thread("app", "w");
        candidate.run_time_ns = MonotonicNs(200);
        candidate.policy = "SCHED_FIFO".into();

        let diff = compare(
            &snap_with(vec![baseline]),
            &snap_with(vec![candidate]),
            &CompareOptions::default(),
        );

        // Position of the first row for a given metric in the
        // sorted output.
        let index_of = |metric: &str| diff.rows.iter().position(|row| row.metric_name == metric);
        let numeric_pos = index_of("run_time_ns").expect("run_time_ns row");
        let mode_pos = index_of("policy").expect("policy row");

        assert!(
            numeric_pos < mode_pos,
            "numeric row at {numeric_pos} must sort above Mode row at {mode_pos}",
        );
        // The ordering only proves the no-delta branch if the Mode
        // row really has `delta == None`.
        assert!(diff.rows[mode_pos].delta.is_none());
    }

    /// `aggregate(AggRule::RangeI32(..), &[])` yields
    /// `Aggregated::OrdinalRange { min: 0, max: 0 }` — an empty
    /// thread slice collapses both bounds to zero. Sibling to the
    /// empty-affinity test.
    #[test]
    fn aggregate_ordinal_range_on_empty_threads_is_zero() {
        let no_threads: &[&ThreadState] = &[];
        let agg = aggregate(AggRule::RangeI32(|t| t.nice), no_threads);
        let Aggregated::OrdinalRange { min, max } = agg else {
            panic!("expected OrdinalRange, got {agg:?}");
        };
        assert_eq!(min, 0);
        assert_eq!(max, 0);
    }

    /// Empty-input contract for the `Mode` rule: aggregating zero
    /// threads yields `Aggregated::Mode` with an empty `value` and
    /// both `count` and `total` at 0.
    #[test]
    fn aggregate_mode_on_empty_threads_is_empty() {
        let no_threads: &[&ThreadState] = &[];
        let agg = aggregate(AggRule::Mode(|t| t.policy.clone()), no_threads);
        if let Aggregated::Mode {
            value,
            count,
            total,
        } = agg
        {
            assert!(value.is_empty());
            assert_eq!(count, 0);
            assert_eq!(total, 0);
        } else {
            panic!("expected Mode, got {agg:?}");
        }
    }

    /// The three Mode-family rules — `Mode`, `ModeChar`,
    /// `ModeBool` — must all produce
    /// `Aggregated::Mode { value, count, total }`. A deterministic
    /// three-thread fixture gives every field a clear 2-of-3
    /// majority (no ties), so each arm is expected to report the
    /// majority value with `count == 2` and `total == 3`.
    #[test]
    fn mode_aggregate_helper_dispatches_all_three_arms() {
        use crate::metric_types::CategoricalString;

        // Build one fixture thread with the three categorical
        // fields under test: policy (string), state (char) and
        // ext_enabled (bool).
        let build = |policy: &'static str, state: char, ext_enabled: bool| {
            let mut t = make_thread("p", "w");
            t.policy = CategoricalString::from(policy);
            t.state = state;
            t.ext_enabled = ext_enabled;
            t
        };
        // Two threads agree on every field; the third dissents on
        // every field.
        let t1 = build("SCHED_OTHER", 'R', true);
        let t2 = build("SCHED_OTHER", 'R', true);
        let t3 = build("SCHED_FIFO", 'S', false);
        let threads: Vec<&ThreadState> = vec![&t1, &t2, &t3];

        // Shared shape check: the aggregate must be the Mode
        // variant naming `winner` with a 2/3 split.
        let expect_majority = |agg: Aggregated, arm: &str, winner: &str| match agg {
            Aggregated::Mode {
                value,
                count,
                total,
            } => {
                assert_eq!(value, winner);
                assert_eq!(count, 2);
                assert_eq!(total, 3);
            }
            other => panic!("expected Mode for AggRule::{arm}, got {other:?}"),
        };

        // Mode arm: SCHED_OTHER wins 2/3.
        expect_majority(
            aggregate(AggRule::Mode(|t| t.policy.clone()), &threads),
            "Mode",
            "SCHED_OTHER",
        );
        // ModeChar arm: 'R' wins 2/3 and is reported as the string "R".
        expect_majority(
            aggregate(AggRule::ModeChar(|t| t.state), &threads),
            "ModeChar",
            "R",
        );
        // ModeBool arm: `true` wins 2/3 and is reported as "true".
        expect_majority(
            aggregate(AggRule::ModeBool(|t| t.ext_enabled), &threads),
            "ModeBool",
            "true",
        );
    }

    /// Empty-input contract for the Sum family: `SumNs` over zero
    /// threads yields `Aggregated::Sum(0)`. Rounds out the
    /// empty-slice coverage alongside the Max / Range / Mode
    /// siblings.
    #[test]
    fn aggregate_sum_on_empty_threads_is_zero() {
        let no_threads: &[&ThreadState] = &[];
        let agg = aggregate(AggRule::SumNs(|t| t.run_time_ns), no_threads);
        let Aggregated::Sum(total) = agg else {
            panic!("expected Sum, got {agg:?}");
        };
        assert_eq!(total, 0);
    }

    /// Core semantic of `AggRule::MaxPeak`: three threads with
    /// different `wait_max` readings aggregate to the largest
    /// single reading, never to their sum. The group-level value
    /// should reflect the worst thread's worst window rather than
    /// blending one large spike with many small ones.
    #[test]
    fn aggregate_max_picks_group_maximum_not_sum() {
        // The middle reading is the clear group-wide tail.
        let peaks = [100, 999_999_999, 50];
        let owned: Vec<ThreadState> = peaks
            .iter()
            .map(|&ns| {
                let mut t = make_thread("p", "w");
                t.wait_max = PeakNs(ns);
                t
            })
            .collect();
        let threads: Vec<&ThreadState> = owned.iter().collect();

        match aggregate(AggRule::MaxPeak(|t| t.wait_max), &threads) {
            Aggregated::Max(m) => {
                assert_eq!(
                    m, 999_999_999,
                    "Max must pick the largest value, not sum (sum \
                     would be 1_000_000_149)"
                );
            }
            other => panic!("expected Max, got {other:?}"),
        }
    }

    /// Empty-input contract for `MaxPeak`: aggregating zero
    /// threads yields `Aggregated::Max(0)`. Per the original note,
    /// `Maxable::max_across` returns `None` on empty input and the
    /// `MaxPeak` arm of `aggregate()` collapses that to `Max(0)`,
    /// mirroring the empty-Sum contract so downstream delta math
    /// treats both rules alike when one side has no threads under
    /// the join key.
    #[test]
    fn aggregate_max_on_empty_threads_is_zero() {
        let no_threads: &[&ThreadState] = &[];
        let agg = aggregate(AggRule::MaxPeak(|t| t.wait_max), no_threads);
        if let Aggregated::Max(peak) = agg {
            assert_eq!(peak, 0);
        } else {
            panic!("expected Max, got {agg:?}");
        }
    }

    /// A one-thread group under `MaxPeak` reports that thread's
    /// reading unchanged — the empty-input fallback to `Max(0)`
    /// must not override a real value.
    #[test]
    fn aggregate_max_single_thread_returns_thread_value() {
        let mut only = make_thread("p", "w");
        only.sleep_max = PeakNs(12_345_678_901);

        let agg = aggregate(AggRule::MaxPeak(|t| t.sleep_max), &[&only]);
        let Aggregated::Max(peak) = agg else {
            panic!("expected Max, got {agg:?}");
        };
        assert_eq!(peak, 12_345_678_901);
    }

    /// Every `*_max` entry in `CTPROF_METRICS` must read its own
    /// per-thread field. For each metric name, set only the
    /// matching field to 1 and confirm the registry rule
    /// aggregates to `Max(1)` — catching a copy-paste slip such as
    /// the `block_max` slot reading `wait_max`. Max-family
    /// counterpart of `sum_metric_accessors_read_expected_field`.
    #[test]
    fn max_metric_accessors_read_expected_field() {
        type MetricSetter = fn(&mut ThreadState);
        // (registry name, setter that marks exactly that field).
        let cases: [(&str, MetricSetter); 5] = [
            ("wait_max", |t| t.wait_max = PeakNs(1)),
            ("sleep_max", |t| t.sleep_max = PeakNs(1)),
            ("block_max", |t| t.block_max = PeakNs(1)),
            ("exec_max", |t| t.exec_max = PeakNs(1)),
            ("slice_max", |t| t.slice_max = PeakNs(1)),
        ];
        for (name, set) in cases {
            let mut thread = make_thread("p", "w");
            set(&mut thread);

            let Some(def) = CTPROF_METRICS.iter().find(|m| m.name == name) else {
                panic!("metric {name} not in registry");
            };

            match aggregate(def.rule, &[&thread]) {
                Aggregated::Max(v) => {
                    assert_eq!(v, 1, "accessor for {name} did not read the {name} field")
                }
                other => panic!("expected Max for {name}, got {other:?}"),
            }
        }
    }

    /// `Aggregated::Max` must expose both projections: `numeric()`
    /// returns the value as `f64` (so delta math can treat Max
    /// rows like Sum rows) and `Display` prints the bare integer.
    /// Both are pinned so dropping either one is caught.
    #[test]
    fn aggregated_max_numeric_and_display() {
        let peak = Aggregated::Max(7_500_000);
        let projected = peak.numeric();
        assert_eq!(projected, Some(7_500_000.0));
        assert_eq!(peak.to_string(), "7500000");
    }

    /// A `Mode` aggregate (e.g. a policy name) has no scalar
    /// projection, so `numeric()` must return `None`. Pinned
    /// directly on the value rather than through the diff
    /// pipeline: a regression returning `Some(0.0)` could
    /// otherwise go unnoticed.
    #[test]
    fn numeric_returns_none_for_mode() {
        let categorical = Aggregated::Mode {
            value: "SCHED_OTHER".into(),
            count: 4,
            total: 4,
        };
        let projected = categorical.numeric();
        assert!(projected.is_none());
    }

    /// For a heterogeneous `Affinity` summary (`uniform: None`),
    /// `numeric()` is the midpoint `(min_cpus + max_cpus) / 2.0`.
    /// Covers a spread case (2..8 → 5.0) and the degenerate case
    /// where both bounds coincide (4..4 → 4.0).
    #[test]
    fn numeric_returns_midpoint_for_affinity_heterogeneous() {
        // (min_cpus, max_cpus, expected midpoint)
        let cases = [(2, 8, 5.0), (4, 4, 4.0)];
        for (min_cpus, max_cpus, midpoint) in cases {
            let summary = Aggregated::Affinity(AffinitySummary {
                min_cpus,
                max_cpus,
                uniform: None,
            });
            assert_eq!(summary.numeric(), Some(midpoint));
        }
    }

    /// A uniform but non-contiguous cpuset `[0, 2]` must display
    /// as `"2 cpus (0,2)"`, i.e. with a comma-separated CPU list
    /// rather than a single range token.
    #[test]
    fn affinity_display_uniform_noncontiguous_renders_comma_separated() {
        let summary = AffinitySummary {
            min_cpus: 2,
            max_cpus: 2,
            uniform: Some(vec![0, 2]),
        };
        let rendered = format!("{}", Aggregated::Affinity(summary));
        assert_eq!(rendered, "2 cpus (0,2)");
    }

    /// Heterogeneous affinity whose cpusets all have the same
    /// SIZE but are not the same SET (`min_cpus == max_cpus`,
    /// `uniform: None`) must display as `"N cpus (mixed)"`.
    #[test]
    fn affinity_display_heterogeneous_same_count_renders_mixed() {
        let summary = AffinitySummary {
            min_cpus: 3,
            max_cpus: 3,
            uniform: None,
        };
        let rendered = format!("{}", Aggregated::Affinity(summary));
        assert_eq!(rendered, "3 cpus (mixed)");
    }

    /// With an empty pattern list, `flatten_cgroup_stats` must
    /// return the input map unchanged: same keys, same values, no
    /// merging. Guards against the aggregation step running on
    /// the no-pattern path and collapsing distinct cgroup paths.
    #[test]
    fn flatten_cgroup_stats_with_no_patterns_preserves_keys() {
        let mut stats = BTreeMap::new();
        for (path, a, b, c, d) in [("/alpha", 10, 1, 5, 100), ("/beta", 20, 2, 15, 200)] {
            stats.insert(path.into(), simple_cgroup_stats(a, b, c, d));
        }

        let out = flatten_cgroup_stats(&stats, &[], None);

        assert_eq!(out.len(), 2);
        // Each key keeps its own cpu usage and memory reading.
        for (path, usage_usec, current) in [("/alpha", 10, 100), ("/beta", 20, 200)] {
            assert_eq!(out[path].cpu.usage_usec, usage_usec);
            assert_eq!(out[path].memory.current, current);
        }
    }

    // ------------------------------------------------------------
    // Pattern-aggregation tests: GroupBy::Comm
    // ------------------------------------------------------------

    /// Trailing-number happy path under the token-based
    /// normalizer, across several separator characters (`-`, `_`,
    /// `.`, space): every separator-delimited digit run becomes
    /// `{N}` and the separators are preserved verbatim. Digit
    /// tokens embedded between separators normalize as well, so
    /// `pool-2-thread-7` becomes `pool-{N}-thread-{N}` — per the
    /// original note this is the new spec behavior, whereas the
    /// legacy algorithm stripped only the trailing run.
    #[test]
    fn pattern_key_strips_trailing_digits() {
        // (input comm, expected pattern key)
        let cases = [
            ("tokio-worker-12", "tokio-worker-{N}"),
            ("worker_5", "worker_{N}"),
            ("rayon.pool.7", "rayon.pool.{N}"),
            // Whitespace counts as a separator and is kept as-is.
            ("Chrome thread 4", "Chrome thread {N}"),
            // Every embedded digit token normalizes, not just the
            // last one.
            ("pool-2-thread-7", "pool-{N}-thread-{N}"),
        ];
        for (comm, expected) in cases {
            assert_eq!(pattern_key(comm), expected);
        }
    }

    /// Two edge inputs:
    /// - `"0"` is one pure-digit token and becomes `{N}`.
    /// - `"worker-"` ends in a separator; the separator is kept
    ///   and the digit-free token `worker` stays literal, so the
    ///   result is `worker-`.
    #[test]
    fn pattern_key_bare_numeric_and_dangling_separator() {
        for (comm, expected) in [("0", "{N}"), ("worker-", "worker-")] {
            assert_eq!(pattern_key(comm), expected);
        }
    }

    /// Alpha-prefix names — letters immediately followed by a
    /// digit run, with no separator — normalize to `prefix{N}`.
    /// This is what lets CamelCase per-CPU names such as
    /// `CamelCaseWord0`, `CamelCaseWord1`, ... collapse together.
    /// A name with no digits stays literal.
    ///
    /// NOTE(review): earlier wording described a minimum prefix
    /// length of 3 (with `cpu0` as the boundary case). The sibling
    /// test `pattern_key_single_letter_alpha_prefix_normalizes`
    /// shows `t1` → `t{N}`, so confirm whether any such gate still
    /// exists before relying on it.
    #[test]
    fn pattern_key_alpha_prefix_groups_without_separator() {
        let cases = [
            ("CamelCaseWord0", "CamelCaseWord{N}"),
            ("CamelCaseWord175", "CamelCaseWord{N}"),
            ("worker7", "worker{N}"),
            // Short three-letter prefix.
            ("cpu0", "cpu{N}"),
            // No digits at all → stays literal.
            ("init", "init"),
        ];
        for (comm, expected) in cases {
            assert_eq!(pattern_key(comm), expected);
        }
    }

    /// A single-letter alpha prefix followed by digits normalizes
    /// to `letter{N}` (rule 3) — provided the letter lies OUTSIDE
    /// `[0-9a-f]`. When every char of the token is inside that
    /// range, the hex rule (rule 2) takes precedence and the
    /// token becomes `{H}`:
    /// - `v`, `r`, `t` are outside → `v{N}`, `r{N}`, `t{N}`.
    /// - `a` is inside → `a0` → `{H}` (the same would apply to
    ///   `c0` and similar).
    #[test]
    fn pattern_key_single_letter_alpha_prefix_normalizes() {
        let cases = [
            ("gadget-v2", "gadget-v{N}"),
            ("thingo-r2", "thingo-r{N}"),
            ("t1", "t{N}"),
            // Both chars are in `[0-9a-f]` and one is a digit, so
            // the hex rule wins.
            ("a0", "{H}"),
            // Tokens `t`, `-`, `1`: `t` has no digits and stays
            // literal; `1` is a pure-digit token → `{N}`.
            ("t-1", "t-{N}"),
            // Tokens `ab`, `_`, `5`: `ab` is hex-eligible but has
            // no digit, so neither the hex rule nor the
            // alpha-prefix rule applies and it stays literal;
            // `5` → `{N}`.
            ("ab_5", "ab_{N}"),
        ];
        for (comm, expected) in cases {
            assert_eq!(pattern_key(comm), expected);
        }
    }

    /// kworker comms reduce to the same skeleton regardless of
    /// CPU or worker id under the token-based normalizer:
    /// - bound, bare: `kworker/0:0` → `kworker/{N}:{N}`;
    /// - unbound: `kworker/u8:3` → `kworker/u{N}:{N}` (the
    ///   one-letter prefix `u` normalizes per rule 3);
    /// - with a workqueue suffix: `kworker/0:0-wq_reclaim` →
    ///   `kworker/{N}:{N}-wq_reclaim` (the suffix tokens have no
    ///   digits and stay literal);
    /// - high-priority: the `1H` token matches rule 4
    ///   (`^\d+[A-Za-z]+$`) and becomes `{N}H`.
    #[test]
    fn pattern_key_kworker_shapes_under_token_normalizer() {
        let cases = [
            // Bare bound and unbound.
            ("kworker/0:0", "kworker/{N}:{N}"),
            ("kworker/3:2", "kworker/{N}:{N}"),
            ("kworker/u8:3", "kworker/u{N}:{N}"),
            ("kworker/u8:7", "kworker/u{N}:{N}"),
            ("kworker/u16:0", "kworker/u{N}:{N}"),
            // Workqueue-bearing.
            ("kworker/0:0-wq_reclaim", "kworker/{N}:{N}-wq_reclaim"),
            ("kworker/47:2-wq_reclaim", "kworker/{N}:{N}-wq_reclaim"),
            // High-priority bound worker — `1H` hits rule 4.
            ("kworker/0:1H", "kworker/{N}:{N}H"),
            // High-priority bound worker with workqueue suffix.
            ("kworker/0:1H-wq_prio", "kworker/{N}:{N}H-wq_prio"),
        ];
        for (comm, expected) in cases {
            assert_eq!(pattern_key(comm), expected);
        }
    }

    /// Rule 4 rewrites a digits-then-letters token to
    /// `{N}suffix`, which covers the `<id>H` shape emitted by
    /// high-priority kworker pools. It runs AFTER the hex rule
    /// (rule 2): a token made only of `[0-9a-f]` chars still
    /// becomes `{H}`, and only tokens whose letters include
    /// something outside that range (uppercase, `g..z`) reach
    /// rule 4.
    #[test]
    fn classify_token_digits_alpha_suffix_rule_4() {
        let cases = [
            // Digits followed by non-hex letters: rule 4.
            ("1H", "{N}H"),
            ("0H", "{N}H"),
            // `H` is uppercase, outside `[0-9a-f]`, so the hex
            // rule fails and rule 4 applies.
            ("100Hz", "{N}Hz"),
            // `z` is outside `[0-9a-f]`: rule 4.
            ("3z", "{N}z"),
            // Hex precedence: all chars in `[0-9a-f]`, at least
            // one digit, length ≥ 2 → `{H}` before rule 4 runs.
            ("1a", "{H}"),
            ("0f", "{H}"),
            // `4,2,a,b,c` are all hex chars and include digits →
            // `{H}`, NOT rule 4.
            ("42abc", "{H}"),
            // `Z` breaks the hex rule; there is no alpha prefix
            // for rule 3; rule 4 splits digits `1` from `aZ`.
            ("1aZ", "{N}aZ"),
            // `x` breaks the hex rule, so rule 4 applies.
            ("42xyz", "{N}xyz"),
            // Pure digits are taken by rule 1 first.
            ("42", "{N}"),
        ];
        for (token, expected) in cases {
            assert_eq!(classify_token(token), expected);
        }
    }

    /// An empty comm normalizes to an empty key without panicking.
    #[test]
    fn pattern_key_empty_input_returns_empty() {
        let normalized = pattern_key("");
        assert_eq!(normalized, "");
    }

    /// Happy path for `GroupBy::Comm`: 8 `worker-N` threads, 4
    /// `rayon-pool-N` threads and one `main` thread yield exactly
    /// three groups — two pattern buckets keyed by the
    /// `prefix-{N}` placeholder form and one singleton keyed by
    /// its literal comm.
    #[test]
    fn build_groups_comm_produces_pattern_buckets_and_singleton() {
        // Threads are created in the order workers, rayon pool,
        // main; `once_with` keeps the `main` thread's construction
        // lazy so that order is preserved.
        let threads: Vec<_> = (0..8)
            .map(|i| make_thread("app", &format!("worker-{i}")))
            .chain((0..4).map(|i| make_thread("app", &format!("rayon-pool-{i}"))))
            .chain(std::iter::once_with(|| make_thread("app", "main")))
            .collect();

        let snap = snap_with(threads);
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);

        // (group key, expected member count, complaint if missing)
        let expected = [
            ("worker-{N}", 8, "worker-{N} pattern bucket"),
            ("rayon-pool-{N}", 4, "rayon-pool-{N} pattern bucket"),
            ("main", 1, "singleton main reverts to literal comm"),
        ];
        for (key, members, complaint) in expected {
            assert!(groups.contains_key(key), "{complaint}");
            assert_eq!(groups[key].thread_count, members);
        }
        assert_eq!(groups.len(), 3);
    }

    /// A pattern that would match only one thread is not shown as
    /// a pattern: a lone `worker-0` keeps its literal comm as the
    /// group key, and no `worker-{N}` key appears.
    #[test]
    fn build_groups_comm_singleton_reverts_to_literal() {
        let snap = snap_with(vec![make_thread("app", "worker-0")]);
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);

        let (literal, pattern) = ("worker-0", "worker-{N}");
        assert!(groups.contains_key(literal), "lone worker-0 stays literal");
        assert!(
            !groups.contains_key(pattern),
            "no `worker-{{N}}` pattern key for a singleton",
        );
        assert_eq!(groups.len(), 1);
    }

    /// Buckets are keyed by the full normalized skeleton, so
    /// `worker-{0,1}` and `worker-large-{0,1}` land in two
    /// separate buckets (`worker-{N}` and `worker-large-{N}`)
    /// rather than merging.
    #[test]
    fn build_groups_comm_distinct_prefixes_do_not_merge() {
        let comms = ["worker-0", "worker-1", "worker-large-0", "worker-large-1"];
        let snap = snap_with(
            comms
                .iter()
                .map(|&comm| make_thread("app", comm))
                .collect(),
        );
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);

        for (key, members) in [("worker-{N}", 2), ("worker-large-{N}", 2)] {
            assert_eq!(groups[key].thread_count, members);
        }
        assert_eq!(groups.len(), 2);
    }

    /// Alpha-prefix grouping (no separator before the trailing
    /// digits) clusters CamelCase names sharing a prefix. Six
    /// `CamelCaseWord{0..5}` threads — standing in for a
    /// one-thread-per-CPU pool — must collapse into a single
    /// `CamelCaseWord{N}` bucket. Both the bucket count and the
    /// member count are pinned to catch a regression that
    /// reintroduces a separator requirement.
    #[test]
    fn build_groups_comm_alpha_prefix_clusters_camelcase() {
        let threads: Vec<_> = (0..6)
            .map(|i| make_thread("app", &format!("CamelCaseWord{i}")))
            .collect();
        let snap = snap_with(threads);
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);

        let bucket = "CamelCaseWord{N}";
        assert!(groups.contains_key(bucket), "CamelCaseWord{{N}} bucket");
        assert_eq!(groups[bucket].thread_count, 6);
        assert_eq!(groups.len(), 1);
    }

    /// Workqueue-bearing kworkers collapse across CPUs into one
    /// `kworker/{N}:{N}-<wq>` bucket per workqueue, and distinct
    /// workqueues stay apart: `mm_percpu_wq` and `wq_reclaim`
    /// each get their own bucket. The suffix survives because its
    /// tokens contain no digits (e.g. `wq_reclaim` → `wq`, `_`,
    /// `reclaim`, all literal).
    #[test]
    fn build_groups_comm_kworker_workqueue_collapses_per_cpu() {
        let comms = [
            "kworker/42:7-mm_percpu_wq",
            "kworker/43:8-mm_percpu_wq",
            "kworker/44:9-mm_percpu_wq",
            "kworker/0:0-wq_reclaim",
            "kworker/1:0-wq_reclaim",
        ];
        let snap = snap_with(
            comms
                .iter()
                .map(|&comm| make_thread("kworker", comm))
                .collect(),
        );
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);

        let expected = [
            ("kworker/{N}:{N}-mm_percpu_wq", 3),
            ("kworker/{N}:{N}-wq_reclaim", 2),
        ];
        for (key, members) in expected {
            assert_eq!(groups[key].thread_count, members);
        }
        assert_eq!(groups.len(), 2);
    }

    /// Bare kworkers (no `-<wq>` suffix) collapse across CPUs:
    /// `kworker/0:0`, `kworker/0:1`, `kworker/1:0` and
    /// `kworker/3:2` all normalize to `kworker/{N}:{N}` and share
    /// one bucket, since both the `<cpu>` and `<id>` tokens
    /// become `{N}` (the new spec behavior per the original
    /// note).
    #[test]
    fn build_groups_comm_kworker_bare_collapses_across_cpus() {
        let comms = ["kworker/0:0", "kworker/0:1", "kworker/1:0", "kworker/3:2"];
        let snap = snap_with(
            comms
                .iter()
                .map(|&comm| make_thread("kworker", comm))
                .collect(),
        );
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);

        assert_eq!(groups["kworker/{N}:{N}"].thread_count, 4);
        assert_eq!(groups.len(), 1);
    }

    /// Unbound kworkers (`u<pool_id>:<id>`) and bound kworkers
    /// (`<cpu>:<id>`) have different skeletons — only the unbound
    /// form carries the `u` prefix — so they fall into separate
    /// buckets: `kworker/u{N}:{N}` and `kworker/{N}:{N}`.
    #[test]
    fn build_groups_comm_kworker_unbound_separate_from_bound() {
        let comms = [
            "kworker/0:0",
            "kworker/3:2",
            "kworker/u8:3",
            "kworker/u8:7",
            "kworker/u16:0",
        ];
        let snap = snap_with(
            comms
                .iter()
                .map(|&comm| make_thread("kworker", comm))
                .collect(),
        );
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);

        for (key, members) in [("kworker/{N}:{N}", 2), ("kworker/u{N}:{N}", 3)] {
            assert_eq!(groups[key].thread_count, members);
        }
        assert_eq!(groups.len(), 2);
    }

    /// Two threads whose comm is the empty string are grouped
    /// under the empty-string key; building the groups must not
    /// panic on the degenerate input.
    #[test]
    fn build_groups_comm_empty_comm_does_not_panic() {
        let threads: Vec<_> = (0..2).map(|_| make_thread("app", "")).collect();
        let snap = snap_with(threads);
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);
        let empty_bucket = &groups[""];
        assert_eq!(empty_bucket.thread_count, 2);
    }

    /// TASK_COMM_LEN truncation: when Linux clips a long thread
    /// name to 15 chars, two threads can end up with the same
    /// truncated literal. Pin that such threads share one bucket
    /// keyed by that literal.
    #[test]
    fn build_groups_comm_truncated_comms_group_via_exact_match() {
        // A 15-char comm, as it would appear after truncation.
        let truncated = "tokio-runtime-w";
        let snap = snap_with(vec![
            make_thread("app", truncated),
            make_thread("app", truncated),
        ]);
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);
        // The name carries no digits, so the bucket key asserted
        // here is the comm itself, unchanged.
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[truncated].thread_count, 2);
    }

    /// Conservation: summing `run_time_ns` over every bucket that
    /// `build_groups` emits must reproduce the sum over the input
    /// threads. Pattern aggregation only regroups threads; it must
    /// be bookkeeping-neutral.
    ///
    /// Every input thread carries a non-zero `run_time_ns`, so
    /// each bucket is required to expose a `Sum` for that metric.
    /// A bucket that does not is reported by key instead of being
    /// silently counted as zero.
    #[test]
    fn build_groups_comm_sum_conservation_across_buckets() {
        let mut threads = Vec::new();
        // Five `worker-<i>` threads: 100, 200, ..., 500 ns.
        for i in 0..5u64 {
            let mut t = make_thread("app", &format!("worker-{i}"));
            t.run_time_ns = MonotonicNs(100 * (i + 1));
            threads.push(t);
        }
        // Three `redis-bg-<i>` threads: 50, 100, 150 ns.
        for i in 0..3u64 {
            let mut t = make_thread("app", &format!("redis-bg-{i}"));
            t.run_time_ns = MonotonicNs(50 * (i + 1));
            threads.push(t);
        }
        // One thread with no numeric token in its name.
        let mut single = make_thread("app", "main");
        single.run_time_ns = MonotonicNs(999);
        threads.push(single);

        let input_total: u64 = threads.iter().map(|t| t.run_time_ns.0).sum();
        let snap = snap_with(threads);
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);

        let aggregated_total: u64 = groups
            .iter()
            .map(|(key, g)| match g.metrics.get("run_time_ns") {
                Some(Aggregated::Sum(n)) => *n,
                // Name the offending bucket rather than folding it
                // in as zero and failing on the total alone.
                _ => panic!("bucket {key:?} has no Sum-aggregated run_time_ns"),
            })
            .sum();
        assert_eq!(
            aggregated_total, input_total,
            "pattern-aggregated sum must equal input sum",
        );
    }

    /// Under `GroupBy::CommExact` the comm is used as-is for the
    /// bucket key: `worker-0` and `worker-1` land in distinct
    /// buckets, and only identical names are counted together.
    #[test]
    fn build_groups_comm_exact_preserves_literal_semantics() {
        let comms = ["worker-0", "worker-1", "worker-1"];
        let threads: Vec<_> = comms
            .iter()
            .map(|comm| make_thread("app", comm))
            .collect();
        let snap = snap_with(threads);
        let groups = build_groups(&snap, GroupBy::CommExact, &[], None, None, false);
        assert_eq!(groups.len(), 2);
        assert_eq!(groups["worker-0"].thread_count, 1);
        assert_eq!(groups["worker-1"].thread_count, 2);
    }

    /// `pattern_display_label` returns the join key untouched for
    /// a member set of size 0 or 1, and for ≥ 2 members returns a
    /// label derived from the members (a grex regex). Only the
    /// presence of the shared prefix is asserted for the
    /// multi-member case, since the exact regex shape is a grex
    /// internal that may vary.
    #[test]
    fn pattern_display_label_grex_for_multi_member_else_join_key() {
        // Zero members: fall through to the join key.
        let empty = Vec::<String>::new();
        assert_eq!(pattern_display_label("worker", &empty), "worker");

        // One member: still the join key.
        let single = vec![String::from("worker-0")];
        assert_eq!(pattern_display_label("worker-0", &single), "worker-0");

        // Two members: label is generated from the member set.
        let multi = vec![String::from("worker-0"), String::from("worker-1")];
        let label = pattern_display_label("worker", &multi);
        assert!(
            label.contains("worker"),
            "grex label must mention the shared prefix; got {label:?}",
        );
    }

    /// End-to-end pin for `compare` under `GroupBy::Comm`: the
    /// emitted row's `group_key` is the `worker-{N}` placeholder
    /// pattern (stable across snapshots), while its `display_key`
    /// is a distinct, grex-derived label that still mentions the
    /// shared `worker` prefix. Baseline and candidate contribute
    /// disjoint member names (`worker-0/1` vs `worker-2/3`).
    #[test]
    fn compare_comm_pattern_emits_prefix_join_key_and_grex_display() {
        let baseline = snap_with(vec![
            make_thread("app", "worker-0"),
            make_thread("app", "worker-1"),
        ]);
        let candidate = snap_with(vec![
            make_thread("app", "worker-2"),
            make_thread("app", "worker-3"),
        ]);
        let options = CompareOptions {
            group_by: GroupBy::Comm.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: false,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&baseline, &candidate, &options);

        // Narrow to the run_time_ns rows, then pick the worker
        // family by its join key.
        let row = diff
            .rows
            .iter()
            .filter(|r| r.metric_name == "run_time_ns")
            .find(|r| r.group_key == "worker-{N}")
            .expect("worker-{N} row");
        assert_eq!(
            row.group_key, "worker-{N}",
            "join key is the placeholder pattern"
        );
        assert!(
            row.display_key.contains("worker"),
            "display key reflects grex over union; got {:?}",
            row.display_key,
        );
        // With four distinct member names in the union, the
        // display label must not be the placeholder pattern.
        assert_ne!(
            row.display_key, "worker-{N}",
            "≥2 members → grex regex, not the placeholder pattern",
        );
    }

    /// Regression pin for the cross-snapshot frequency union. The
    /// baseline has a single `worker-7`; the candidate has three
    /// `worker-<i>` threads. Both sides must key the family as
    /// `worker-{N}` so the row joins.
    ///
    /// With per-snapshot counts the baseline (count 1 < 2) would
    /// keep `worker-7` literal while the candidate (count 3 ≥ 2)
    /// would promote to `worker-{N}`, leaving the family orphaned
    /// on both sides. Using the union frequency (1 + 3 = 4 ≥ 2)
    /// promotes the pattern on both sides.
    #[test]
    fn compare_comm_pattern_joins_across_asymmetric_resize() {
        /// Collect the keys that start with `worker`.
        fn worker_prefixed<'a>(keys: impl Iterator<Item = &'a String>) -> Vec<&'a String> {
            keys.filter(|k| k.starts_with("worker")).collect()
        }

        let baseline = snap_with(vec![make_thread("app", "worker-7")]);
        let candidate_threads: Vec<_> = ["worker-0", "worker-1", "worker-2"]
            .iter()
            .map(|comm| make_thread("app", comm))
            .collect();
        let candidate = snap_with(candidate_threads);
        let options = CompareOptions {
            group_by: GroupBy::Comm.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: false,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&baseline, &candidate, &options);

        let row = diff
            .rows
            .iter()
            .filter(|r| r.metric_name == "run_time_ns")
            .find(|r| r.group_key == "worker-{N}")
            .expect("worker-{N} row joined across asymmetric snapshots");
        assert_eq!(row.thread_count_a, 1, "baseline carries 1 worker");
        assert_eq!(row.thread_count_b, 3, "candidate carries 3 workers");

        // Neither unmatched list may mention the worker family:
        // both sides must have used the same `worker-{N}` key.
        let baseline_orphans = worker_prefixed(diff.only_baseline.iter());
        assert!(
            baseline_orphans.is_empty(),
            "no worker-prefixed orphans in only_baseline; got {baseline_orphans:?}",
        );
        let candidate_orphans = worker_prefixed(diff.only_candidate.iter());
        assert!(
            candidate_orphans.is_empty(),
            "no worker-prefixed orphans in only_candidate; got {candidate_orphans:?}",
        );
    }

    // ------------------------------------------------------------
    // Token-based normalizer tests for thread-name and cgroup-path
    // grouping. Each test pins one expected (input, normalized
    // bucket key) pair so a regression in `pattern_key` /
    // `flatten_cgroup_path` surfaces as a localized failure.
    // ------------------------------------------------------------

    /// Token classifier, rule 1: a token made only of digits
    /// becomes `{N}`, whatever its length.
    #[test]
    fn classify_token_pure_digits() {
        for digits in &["0", "42", "999"] {
            assert_eq!(classify_token(digits), "{N}");
        }
    }

    /// Token classifier, rule 2: a hex-like token — every char in
    /// `[0-9a-f]`, length ≥ 2, at least one digit — becomes `{H}`.
    /// Pure-digit tokens are claimed by rule 1 before this rule is
    /// considered. The table also pins the near misses that must
    /// NOT classify as hex.
    #[test]
    fn classify_token_hex_like() {
        let cases: &[(&str, &str)] = &[
            // Hex-like: lowercase hex chars with at least one digit.
            ("a1234", "{H}"),
            ("abc123def456", "{H}"),
            ("7890ab", "{H}"),
            ("1a2", "{H}"),
            ("650ab12cd34ef", "{H}"),
            // All alpha, no digit — fails the digit requirement.
            ("abc", "abc"),
            // Single char without a digit — fails the length
            // requirement (a single digit would be rule 1).
            ("a", "a"),
            // Shortest hex-like shape: one alpha plus one digit.
            ("a1", "{H}"),
            // `u` lies outside `[0-9a-f]`, so this is not hex-like
            // and is handled by rule 3 instead.
            ("u8", "u{N}"),
        ];
        for (token, expected) in cases {
            assert_eq!(classify_token(token), *expected);
        }
    }

    /// Token classifier, rule 3: an alpha prefix (length ≥ 1)
    /// followed by a trailing digit run (`^[A-Za-z]+\d+$`) becomes
    /// `prefix{N}`. One-letter prefixes such as `u8`, `v2`, `r2`
    /// qualify. The table also covers the neighbouring rule 4 and
    /// a shape that matches neither.
    #[test]
    fn classify_token_alpha_prefix_digits() {
        let cases: &[(&str, &str)] = &[
            ("worker7", "worker{N}"),
            ("CamelCaseWord175", "CamelCaseWord{N}"),
            ("u8", "u{N}"),
            ("u16", "u{N}"),
            ("v2", "v{N}"),
            ("r2", "r{N}"),
            // Digits followed by alpha is rule 4
            // (`^\d+[A-Za-z]+$`) → `{N}suffix`. Rule 2 (hex) would
            // win if every char were in `[0-9a-f]` (e.g. `3a` →
            // `{H}`); `H` and `z` are outside that set, so rule 4
            // applies here.
            ("1H", "{N}H"),
            ("3z", "{N}z"),
            // Alpha, digits, then alpha again: rule 3 requires the
            // digit run to end the token (`$` anchor), so this
            // stays literal.
            ("proto303handler", "proto303handler"),
        ];
        for (token, expected) in cases {
            assert_eq!(classify_token(token), *expected);
        }
    }

    /// Token classifier fallback: a token that no rule claims is
    /// returned exactly as it was given.
    #[test]
    fn classify_token_literal_fallback() {
        let literals = ["BPF", "CUBIC", "AUTO", "FLOWLABEL", "hamster", "zilch"];
        for token in &literals {
            // Output must equal input for every literal token.
            assert_eq!(classify_token(token), *token);
        }
    }

    /// The empty token classifies to the empty string and does
    /// not panic.
    #[test]
    fn classify_token_empty_returns_empty() {
        let classified = classify_token("");
        assert_eq!(classified, "");
    }

    /// Tokenizer: `split_into_segments` yields alternating token
    /// and separator runs, with every separator run preserved
    /// verbatim as a single segment. The empty string produces no
    /// segments at all.
    #[test]
    fn split_into_segments_alternates_token_and_separator_runs() {
        assert!(split_into_segments("").is_empty());

        let cases = [
            // Pure alpha: a single token.
            ("hamster", vec![Segment::Token("hamster")]),
            // token / separator / token.
            (
                "worker-7",
                vec![
                    Segment::Token("worker"),
                    Segment::Separator("-"),
                    Segment::Token("7"),
                ],
            ),
            // A run of several separator chars stays one segment.
            (
                "a..b",
                vec![
                    Segment::Token("a"),
                    Segment::Separator(".."),
                    Segment::Token("b"),
                ],
            ),
            // Input may begin with a separator run.
            (
                "/abc",
                vec![Segment::Separator("/"), Segment::Token("abc")],
            ),
            // Different separator chars can share one run.
            (
                "yy._650",
                vec![
                    Segment::Token("yy"),
                    Segment::Separator("._"),
                    Segment::Token("650"),
                ],
            ),
            // `+` is a separator (per spec): the active-kworker
            // decoration splits the same way the idle (`-`) form
            // does, leaving the tokens on each side independent.
            (
                "kworker/0:1+events",
                vec![
                    Segment::Token("kworker"),
                    Segment::Separator("/"),
                    Segment::Token("0"),
                    Segment::Separator(":"),
                    Segment::Token("1"),
                    Segment::Separator("+"),
                    Segment::Token("events"),
                ],
            ),
        ];
        for (input, expected) in &cases {
            assert_eq!(split_into_segments(input), *expected);
        }
    }

    /// `+` is a separator (per spec), so an active-kworker comm
    /// (`<cpu>:<id>+<wq>`) has the same token shape as the idle
    /// form (`<cpu>:<id>-<wq>`) and its digit tokens normalize on
    /// their own. Active workers on different CPUs therefore share
    /// a key per workqueue. Active and idle workers do NOT share a
    /// key: the separator char (`+` vs `-`) is kept verbatim in
    /// the rejoined skeleton.
    #[test]
    fn pattern_key_kworker_active_decoration_separator() {
        // Active decoration collapses across CPUs.
        let active_skeleton = "kworker/{N}:{N}+events";
        for comm in &["kworker/0:1+events", "kworker/1:0+events"] {
            assert_eq!(pattern_key(comm), active_skeleton);
        }

        // Idle decoration has its own skeleton, distinct from the
        // active one for the same cpu/id/workqueue.
        let idle_key = pattern_key("kworker/0:1-events");
        assert_ne!(pattern_key("kworker/0:1+events"), idle_key);
        assert_eq!(idle_key, "kworker/{N}:{N}-events");
    }

    /// Reference data set: 14 multi-member buckets plus 13
    /// singletons. The test pins two things:
    ///
    /// 1. `pattern_key` maps every input name to its exact
    ///    expected skeleton.
    /// 2. `build_groups` under `GroupBy::Comm` produces exactly
    ///    the expected buckets with the expected member counts —
    ///    multi-member buckets keyed by skeleton, singletons keyed
    ///    by their literal input.
    #[test]
    fn spec_thread_grouping_verbatim() {
        // (input thread name, expected `pattern_key` skeleton).
        // The order of this table is also the order in which the
        // threads are fed to `build_groups` below.
        let expected_keys: &[(&str, &str)] = &[
            // Bucket 1: whirly-gig-{N} (4 members)
            ("whirly-gig-0", "whirly-gig-{N}"),
            ("whirly-gig-1", "whirly-gig-{N}"),
            ("whirly-gig-2", "whirly-gig-{N}"),
            ("whirly-gig-15", "whirly-gig-{N}"),
            // Bucket 2: plonk_zap_{N} (3)
            ("plonk_zap_0", "plonk_zap_{N}"),
            ("plonk_zap_1", "plonk_zap_{N}"),
            ("plonk_zap_7", "plonk_zap_{N}"),
            // Bucket 3: ksoftirqd/{N} (4)
            ("ksoftirqd/0", "ksoftirqd/{N}"),
            ("ksoftirqd/1", "ksoftirqd/{N}"),
            ("ksoftirqd/2", "ksoftirqd/{N}"),
            ("ksoftirqd/99", "ksoftirqd/{N}"),
            // Bucket 4: kworker/{N}:{N} (4) — bare bound
            ("kworker/0:0", "kworker/{N}:{N}"),
            ("kworker/0:1", "kworker/{N}:{N}"),
            ("kworker/1:0", "kworker/{N}:{N}"),
            ("kworker/3:2", "kworker/{N}:{N}"),
            // Bucket 5: kworker/{N}:{N}-wq_reclaim (3)
            ("kworker/0:0-wq_reclaim", "kworker/{N}:{N}-wq_reclaim"),
            ("kworker/1:0-wq_reclaim", "kworker/{N}:{N}-wq_reclaim"),
            ("kworker/47:2-wq_reclaim", "kworker/{N}:{N}-wq_reclaim"),
            // Bucket 6: kworker/u{N}:{N} (3) — bare unbound
            ("kworker/u8:3", "kworker/u{N}:{N}"),
            ("kworker/u8:7", "kworker/u{N}:{N}"),
            ("kworker/u16:0", "kworker/u{N}:{N}"),
            // Bucket 6b: kworker/{N}:{N}H-wq_prio (3) —
            // high-priority bound workers; rule 4 normalizes
            // `<id>H` tokens.
            ("kworker/0:1H-wq_prio", "kworker/{N}:{N}H-wq_prio"),
            ("kworker/1:0H-wq_prio", "kworker/{N}:{N}H-wq_prio"),
            ("kworker/2:1H-wq_prio", "kworker/{N}:{N}H-wq_prio"),
            // Bucket 7: FooBar{N} (4)
            ("FooBar0", "FooBar{N}"),
            ("FooBar1", "FooBar{N}"),
            ("FooBar2", "FooBar{N}"),
            ("FooBar175", "FooBar{N}"),
            // Bucket 8: BazQux{N} (3)
            ("BazQux0", "BazQux{N}"),
            ("BazQux1", "BazQux{N}"),
            ("BazQux42", "BazQux{N}"),
            // Bucket 9: wonk{N} (3)
            ("wonk0", "wonk{N}"),
            ("wonk1", "wonk{N}"),
            ("wonk9", "wonk{N}"),
            // Bucket 10: Grommet.Z{N} (3)
            ("Grommet.Z0", "Grommet.Z{N}"),
            ("Grommet.Z1", "Grommet.Z{N}"),
            ("Grommet.Z999", "Grommet.Z{N}"),
            // Bucket 11: fizz-buzz-wham{N} (3)
            ("fizz-buzz-wham0", "fizz-buzz-wham{N}"),
            ("fizz-buzz-wham1", "fizz-buzz-wham{N}"),
            ("fizz-buzz-wham7", "fizz-buzz-wham{N}"),
            // Bucket 12: rcu_exp_par_gp_kthread_worker/{N} (2)
            (
                "rcu_exp_par_gp_kthread_worker/0",
                "rcu_exp_par_gp_kthread_worker/{N}",
            ),
            (
                "rcu_exp_par_gp_kthread_worker/1",
                "rcu_exp_par_gp_kthread_worker/{N}",
            ),
            // Bucket 13: migration/{N} (2)
            ("migration/0", "migration/{N}"),
            ("migration/1", "migration/{N}"),
            // Singletons (skeleton form per algorithm).
            ("bloop-tangler", "bloop-tangler"),
            ("narf-bonker", "narf-bonker"),
            ("spork-wrangler", "spork-wrangler"),
            ("hamster", "hamster"),
            ("zilch", "zilch"),
            ("gadget-v2", "gadget-v{N}"),
            ("thingo-r2", "thingo-r{N}"),
            ("cpu0", "cpu{N}"),
            ("blip0", "blip{N}"),
            ("snorf0", "snorf{N}"),
            ("ptp0", "ptp{N}"),
            ("BPF_CUBIC", "BPF_CUBIC"),
            ("AUTO_FLOWLABEL", "AUTO_FLOWLABEL"),
        ];

        // Part 1: every input yields its expected skeleton.
        for &(input, expected) in expected_keys {
            assert_eq!(
                pattern_key(input),
                expected,
                "pattern_key({input:?}) skeleton mismatch",
            );
        }

        // Part 2: feed one thread per input through
        // `build_groups` and check bucket membership.
        let threads: Vec<_> = expected_keys
            .iter()
            .map(|(input, _)| make_thread("p", input))
            .collect();
        let snap = snap_with(threads);
        let groups = build_groups(&snap, GroupBy::Comm, &[], None, None, false);

        // Multi-member buckets are keyed by skeleton.
        let expected_buckets: &[(&str, usize)] = &[
            ("whirly-gig-{N}", 4),
            ("plonk_zap_{N}", 3),
            ("ksoftirqd/{N}", 4),
            ("kworker/{N}:{N}", 4),
            ("kworker/{N}:{N}-wq_reclaim", 3),
            ("kworker/u{N}:{N}", 3),
            ("kworker/{N}:{N}H-wq_prio", 3),
            ("FooBar{N}", 4),
            ("BazQux{N}", 3),
            ("wonk{N}", 3),
            ("Grommet.Z{N}", 3),
            ("fizz-buzz-wham{N}", 3),
            ("rcu_exp_par_gp_kthread_worker/{N}", 2),
            ("migration/{N}", 2),
        ];
        for (key, count) in expected_buckets {
            let Some(g) = groups.get(*key) else {
                panic!("missing bucket {key:?}");
            };
            assert_eq!(
                g.thread_count, *count,
                "bucket {key:?} expected {count} members, got {}",
                g.thread_count,
            );
        }

        // Singletons are keyed by their literal input, not by the
        // skeleton (e.g. `cpu0`, not `cpu{N}`): the gate in
        // `build_groups` reverts a one-member pattern to the
        // input string.
        let singletons: &[&str] = &[
            "bloop-tangler",
            "narf-bonker",
            "spork-wrangler",
            "hamster",
            "zilch",
            "gadget-v2",
            "thingo-r2",
            "cpu0",
            "blip0",
            "snorf0",
            "ptp0",
            "BPF_CUBIC",
            "AUTO_FLOWLABEL",
        ];
        for singleton in singletons {
            let Some(g) = groups.get(*singleton) else {
                panic!("missing singleton bucket {singleton:?}");
            };
            assert_eq!(
                g.thread_count, 1,
                "singleton {singleton:?} should have 1 member",
            );
        }

        // Nothing beyond the 14 multi-member buckets and the 13
        // singletons may exist.
        assert_eq!(
            groups.len(),
            expected_buckets.len() + singletons.len(),
            "expected 27 buckets total"
        );
    }

    /// `--no-thread-normalize` (exposed at the API level as
    /// `CompareOptions::no_thread_normalize = true`) switches the
    /// token normalizer off for thread-name grouping. Names that
    /// would share a skeleton but differ literally (`worker-0`,
    /// `worker-1`) stay in SEPARATE buckets — the same outcome as
    /// `GroupBy::CommExact`.
    #[test]
    fn no_thread_normalize_uses_literal_comm() {
        use std::collections::BTreeSet;

        // Both snapshots carry the same two workers.
        let two_workers = || {
            snap_with(vec![
                make_thread("p", "worker-0"),
                make_thread("p", "worker-1"),
            ])
        };
        let snap_a = two_workers();
        let snap_b = two_workers();
        let options = CompareOptions {
            group_by: GroupBy::Comm.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: true,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&snap_a, &snap_b, &options);

        // Both literal names must appear as group keys, and the
        // collapsed `worker-{N}` key must not.
        let mut group_keys: BTreeSet<&str> = BTreeSet::new();
        for row in diff.rows.iter() {
            group_keys.insert(row.group_key.as_str());
        }
        assert!(
            group_keys.contains("worker-0"),
            "literal worker-0 missing: {group_keys:?}",
        );
        assert!(
            group_keys.contains("worker-1"),
            "literal worker-1 missing: {group_keys:?}",
        );
        assert!(
            !group_keys.contains("worker-{N}"),
            "no normalized bucket under no_thread_normalize: {group_keys:?}",
        );
    }

    // ------------------------------------------------------------
    // Cgroup normalization tests.
    // ------------------------------------------------------------

    /// Layer 1, systemd template normalization: a template
    /// instance containing none of `[._-]` is replaced by `{I}`;
    /// an instance containing any of them is left literal. Paths
    /// without an `@<x>.service` component pass through unchanged.
    #[test]
    fn apply_systemd_template_opaque_id_to_placeholder() {
        let cases: &[(&str, &str)] = &[
            // Opaque numeric instances are normalized.
            (
                "/user.slice/user-0.slice/user@0.service/boot.scope",
                "/user.slice/user-0.slice/user@{I}.service/boot.scope",
            ),
            (
                "/user.slice/user-1001.slice/user@1001.service/boot.scope",
                "/user.slice/user-1001.slice/user@{I}.service/boot.scope",
            ),
            // A structured instance (contains `.`) stays literal.
            (
                "/critical.slice/launcher@foo.bar.baz.service",
                "/critical.slice/launcher@foo.bar.baz.service",
            ),
            // No `@<x>.service` component: unchanged.
            (
                "/system.slice/crond.service",
                "/system.slice/crond.service",
            ),
            // No `@` anywhere in the path.
            ("/", "/"),
        ];
        for (path, expected) in cases {
            assert_eq!(apply_systemd_template(path), *expected);
        }
    }

    /// Cgroup grouping over the spec data set, compared against
    /// itself under `GroupBy::Cgroup`. Asserted here:
    ///
    /// - the two `session-<hex>.scope` paths share one
    ///   `session-{H}.scope` key;
    /// - the two `user@<id>.service` paths share one
    ///   `user@{I}.service` key;
    /// - a fixed list of paths (including the structured
    ///   `launcher@<structured>.service` ones) appear as group
    ///   keys in their literal form.
    ///
    /// The sprocket (run_17 / run_22) and gizmo leaves are part of
    /// the input per the spec — sprocket variants are meant to
    /// share a skeleton and gizmo leaves to be singletons — but
    /// their resulting keys are not asserted by this test.
    #[test]
    fn spec_cgroup_grouping_verbatim() {
        use std::collections::BTreeSet;

        let cgroups: &[&str] = &[
            "/",
            "/boot.scope",
            "/critical.slice/emitd.service",
            "/critical.slice/remoted.service",
            "/critical.slice/launcher@foo.bar.baz.service",
            "/critical.slice/launcher@foo.qux.quux.service",
            "/critical.slice/launcher@foo.waldo.grault.service",
            "/system.slice/crond.service",
            "/system.slice/ntpd.service",
            "/system.slice/tpl.slice/launcher@foo.garply.plugh.service",
            "/system.slice/tpl.slice/launcher@foo.corge.xyzzy.service",
            "/system.slice/tpl.slice/launcher@foo.thud.fred.service",
            "/user.slice/user-0.slice/session-a1234.scope",
            "/user.slice/user-0.slice/user@0.service/boot.scope",
            "/user.slice/user-1001.slice/session-b5678.scope",
            "/user.slice/user-1001.slice/user@1001.service/boot.scope",
            // Sprocket app, two variants (run_17, run_22), four
            // leaves each.
            "/apps.slice/wl-foo.slice/wl-foo-abc123def456.7890ab.alloc.slice/v2_acme.prod_widget_sprocket_run_17.400_fluxcap9000.01.zz3_650ab12cd34ef_1a2.run.yy._650ab34ef56cd_1b3.run.exec.service/helper-logs",
            "/apps.slice/wl-foo.slice/wl-foo-abc123def456.7890ab.alloc.slice/v2_acme.prod_widget_sprocket_run_17.400_fluxcap9000.01.zz3_650ab12cd34ef_1a2.run.yy._650ab34ef56cd_1b3.run.exec.service/nested/boot.scope",
            "/apps.slice/wl-foo.slice/wl-foo-abc123def456.7890ab.alloc.slice/v2_acme.prod_widget_sprocket_run_17.400_fluxcap9000.01.zz3_650ab12cd34ef_1a2.run.yy._650ab34ef56cd_1b3.run.exec.service/nested/system.slice/remoted.service",
            "/apps.slice/wl-foo.slice/wl-foo-abc123def456.7890ab.alloc.slice/v2_acme.prod_widget_sprocket_run_17.400_fluxcap9000.01.zz3_650ab12cd34ef_1a2.run.yy._650ab34ef56cd_1b3.run.exec.service/nested/system.slice/emitd.service",
            "/apps.slice/wl-foo.slice/wl-foo-def789abc012.3456cd.alloc.slice/v2_acme.prod_widget_sprocket_run_22.401_fluxcap9000.01.zz3_650ab12cd78ef_1a3.run.yy._650ab34ef90cd_1b4.run.exec.service/helper-logs",
            "/apps.slice/wl-foo.slice/wl-foo-def789abc012.3456cd.alloc.slice/v2_acme.prod_widget_sprocket_run_22.401_fluxcap9000.01.zz3_650ab12cd78ef_1a3.run.yy._650ab34ef90cd_1b4.run.exec.service/nested/boot.scope",
            "/apps.slice/wl-foo.slice/wl-foo-def789abc012.3456cd.alloc.slice/v2_acme.prod_widget_sprocket_run_22.401_fluxcap9000.01.zz3_650ab12cd78ef_1a3.run.yy._650ab34ef90cd_1b4.run.exec.service/nested/system.slice/remoted.service",
            "/apps.slice/wl-foo.slice/wl-foo-def789abc012.3456cd.alloc.slice/v2_acme.prod_widget_sprocket_run_22.401_fluxcap9000.01.zz3_650ab12cd78ef_1a3.run.yy._650ab34ef90cd_1b4.run.exec.service/nested/system.slice/emitd.service",
            // Gizmo app variant: its words differ from sprocket's
            // (gizmo, fluxcap2000, zz7).
            "/apps.slice/wl-foo.slice/wl-foo-fedcba987654.abcdef.alloc.slice/v2_acme.prod_widget_gizmo_run_5.399_fluxcap2000.03.zz7_650ab12cdaaef_2c1.run.yy._650ab34efbbcd_2c2.run.exec.service/helper-logs",
            "/apps.slice/wl-foo.slice/wl-foo-fedcba987654.abcdef.alloc.slice/v2_acme.prod_widget_gizmo_run_5.399_fluxcap2000.03.zz7_650ab12cdaaef_2c1.run.yy._650ab34efbbcd_2c2.run.exec.service/nested/boot.scope",
            "/apps.slice/wl-foo.slice/wl-foo-fedcba987654.abcdef.alloc.slice/v2_acme.prod_widget_gizmo_run_5.399_fluxcap2000.03.zz7_650ab12cdaaef_2c1.run.yy._650ab34efbbcd_2c2.run.exec.service/nested/system.slice/remoted.service",
            "/apps.slice/wl-foo.slice/wl-foo-fedcba987654.abcdef.alloc.slice/v2_acme.prod_widget_gizmo_run_5.399_fluxcap2000.03.zz7_650ab12cdaaef_2c1.run.yy._650ab34efbbcd_2c2.run.exec.service/nested/system.slice/emitd.service",
            "/apps.slice/wl-bar.slice/relay.service",
            "/apps.slice/wl-bar.slice/cache.service",
        ];

        // One thread per cgroup path, named `t<index>`.
        let mut threads = Vec::with_capacity(cgroups.len());
        for (i, cg) in cgroups.iter().enumerate() {
            let mut t = make_thread("p", &format!("t{i}"));
            t.cgroup = (*cg).into();
            threads.push(t);
        }

        // Compare the data set against an identical copy.
        let snap_a = snap_with(threads.clone());
        let snap_b = snap_with(threads);
        let options = CompareOptions {
            group_by: GroupBy::Cgroup.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: false,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&snap_a, &snap_b, &options);

        let mut group_keys: BTreeSet<String> = BTreeSet::new();
        for row in diff.rows.iter() {
            group_keys.insert(row.group_key.clone());
        }

        // User-session bucket: `a1234` and `b5678` are both hex
        // tokens, so the two `session-*.scope` paths share a key.
        let user_session_skel = "/user.slice/user-{N}.slice/session-{H}.scope";
        assert!(
            group_keys.contains(user_session_skel),
            "missing user-session bucket; got {group_keys:?}",
        );

        // `user@{I}.service` bucket: the two paths share a key
        // through the Layer 1 template normalization.
        let user_service_skel = "/user.slice/user-{N}.slice/user@{I}.service/boot.scope";
        assert!(
            group_keys.contains(user_service_skel),
            "missing user@.service bucket; got {group_keys:?}",
        );

        // These paths must show up as group keys in literal form.
        let literal_singletons: &[&str] = &[
            "/",
            "/boot.scope",
            "/critical.slice/emitd.service",
            "/critical.slice/remoted.service",
            "/critical.slice/launcher@foo.bar.baz.service",
            "/critical.slice/launcher@foo.qux.quux.service",
            "/critical.slice/launcher@foo.waldo.grault.service",
            "/system.slice/crond.service",
            "/system.slice/ntpd.service",
            "/system.slice/tpl.slice/launcher@foo.garply.plugh.service",
            "/system.slice/tpl.slice/launcher@foo.corge.xyzzy.service",
            "/system.slice/tpl.slice/launcher@foo.thud.fred.service",
            "/apps.slice/wl-bar.slice/relay.service",
            "/apps.slice/wl-bar.slice/cache.service",
        ];
        for singleton in literal_singletons {
            assert!(
                group_keys.contains(*singleton),
                "missing singleton bucket {singleton}; got {group_keys:?}",
            );
        }
    }

    /// Layer 3 (tighten) restores literal tokens that hold the
    /// same value in every member of a multi-member group. The
    /// fixture is a simplified pair of paths that agree on
    /// `fluxcap9000`, `01` and `zz3` and differ only in the digit
    /// run after `run-` (`17` vs `22`): the shared positions
    /// revert to their literals, while the varying position keeps
    /// its Layer-2 `{N}` placeholder.
    #[test]
    fn cgroup_tighten_recovers_constant_tokens() {
        // Layer 1 has nothing to substitute in these paths.
        // Expected Layer-2 skeleton tokens:
        //   apps, slice, run, {N}, fluxcap{N}, {N}, zz{N}
        // Expected after Layer 3 (tighten):
        //   `fluxcap{N}` (always 9000) → `fluxcap9000`
        //   `{N}` (the `01`)           → `01`
        //   `zz{N}` (always 3)         → `zz3`
        //   the first `{N}` (17 vs 22) varies → stays `{N}`.
        let members = [
            ("ta", "/apps.slice/run-17.fluxcap9000_01.zz3"),
            ("tb", "/apps.slice/run-22.fluxcap9000_01.zz3"),
        ];
        let threads: Vec<_> = members
            .iter()
            .map(|&(comm, cgroup)| {
                let mut thread = make_thread("p", comm);
                thread.cgroup = cgroup.into();
                thread
            })
            .collect();
        let snap = snap_with(threads);

        // Same snapshot on both sides: only the grouping key matters.
        let options = CompareOptions {
            group_by: GroupBy::Cgroup.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: false,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&snap, &snap, &options);

        let mut group_keys = std::collections::BTreeSet::<String>::new();
        for row in &diff.rows {
            group_keys.insert(row.group_key.clone());
        }

        // Tightened key recovers fluxcap9000, 01, zz3.
        let expected = "/apps.slice/run-{N}.fluxcap9000_01.zz3";
        assert!(
            group_keys.contains(expected),
            "tightened key {expected:?} missing; got {group_keys:?}",
        );
    }

    /// `CompareOptions::no_cg_normalize = true` (the API-level
    /// mirror of `--no-cg-normalize`) skips Layer 1 / 2 / 3
    /// entirely: two cgroup paths that collapse under
    /// auto-normalize remain separate literal buckets. Only the
    /// auto-normalizer is switched off; explicit `cgroup_flatten`
    /// glob patterns still apply.
    #[test]
    fn no_cg_normalize_uses_literal_post_flatten_path() {
        let baseline_path = "/user.slice/user-0.slice/user@0.service/boot.scope";
        let candidate_path = "/user.slice/user-1001.slice/user@1001.service/boot.scope";

        let mut ta = make_thread("p", "ta");
        ta.cgroup = baseline_path.into();
        let mut tb = make_thread("p", "tb");
        tb.cgroup = candidate_path.into();
        let snap_a = snap_with(vec![ta]);
        let snap_b = snap_with(vec![tb]);

        // The two runs differ only in the `no_cg_normalize` switch.
        let options = |no_cg_normalize: bool| CompareOptions {
            group_by: GroupBy::Cgroup.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: false,
            no_cg_normalize,
            sort_by: Vec::new(),
        };

        // Auto-normalize ON (default): both paths collapse into
        // one bucket via Layer 1 + Layer 2.
        let diff_on = compare(&snap_a, &snap_b, &options(false));
        let normalized_key = "/user.slice/user-{N}.slice/user@{I}.service/boot.scope";
        let has_normalized_row = diff_on.rows.iter().any(|r| r.group_key == normalized_key);
        assert!(
            has_normalized_row,
            "expected normalized key {normalized_key:?} when no_cg_normalize=false",
        );

        // no_cg_normalize ON: the paths stay separate singletons
        // and surface as only-baseline / only-candidate.
        let diff_off = compare(&snap_a, &snap_b, &options(true));
        assert!(
            diff_off.only_baseline.contains(&baseline_path.to_string()),
            "literal baseline path missing under no_cg_normalize: only_baseline={:?}",
            diff_off.only_baseline,
        );
        assert!(
            diff_off.only_candidate.contains(&candidate_path.to_string()),
            "literal candidate path missing under no_cg_normalize: only_candidate={:?}",
            diff_off.only_candidate,
        );
    }

    /// Brackets in cgroup paths split tokens like every other
    /// separator, so two paths carrying bracketed hex IDs (session
    /// scopes, container instance IDs) land on the same skeleton:
    /// `[a1b2c3d4]` and `[dead1234]` both tokenize to `[{H}]`.
    /// The Layer-2 token normalizer treats brackets as run
    /// boundaries (per [`is_token_separator`]), so the hex payload
    /// inside them flows through rule 2 the same way dotted hex
    /// would.
    ///
    /// Pins the bracket case end-to-end through `compare`: two
    /// distinct sessions with hex-ID brackets must share one
    /// normalized bucket under [`GroupBy::Cgroup`].
    #[test]
    fn cgroup_normalize_collapses_bracketed_hex_session_ids() {
        let path_a = "/user.slice/session-[a1b2c3d4]/scope";
        let path_b = "/user.slice/session-[dead1234]/scope";
        let expected = "/user.slice/session-[{H}]/scope";

        let mut ta = make_thread("p", "ta");
        ta.cgroup = path_a.into();
        let mut tb = make_thread("p", "tb");
        tb.cgroup = path_b.into();
        let snap_a = snap_with(vec![ta]);
        let snap_b = snap_with(vec![tb]);

        // Check the lower-level pieces first so a failure names the
        // layer that drifted instead of only surfacing in the
        // compare-driven assertion further down:
        // (a) `cgroup_normalize_skeleton` yields the `[{H}]`
        //     skeleton for both paths;
        // (b) `build_cgroup_key_map` resolves both literal paths to
        //     the tightened skeleton key.
        let (skel_a, post_a, _) = cgroup_normalize_skeleton(path_a);
        let (skel_b, post_b, _) = cgroup_normalize_skeleton(path_b);
        assert_eq!(
            skel_a, expected,
            "Layer-2 skeleton for path1 mismatch; got {skel_a:?}",
        );
        assert_eq!(
            skel_b, expected,
            "Layer-2 skeleton for path2 mismatch; got {skel_b:?}",
        );
        // Layer 1 is a no-op for these paths (no @<x>.service).
        assert_eq!(post_a, path_a);
        assert_eq!(post_b, path_b);

        let key_map = build_cgroup_key_map(&snap_a, &snap_b, &[]);
        let tightened = expected.to_string();
        assert_eq!(
            key_map.get(path_a),
            Some(&tightened),
            "key_map must resolve path1 to the tightened skeleton",
        );
        assert_eq!(
            key_map.get(path_b),
            Some(&tightened),
            "key_map must resolve path2 to the tightened skeleton",
        );

        let options = CompareOptions {
            group_by: GroupBy::Cgroup.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: false,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&snap_a, &snap_b, &options);

        // Both bracketed-hex IDs collapse to `[{H}]`, so the two
        // paths share one normalized cgroup key after Layer 2.
        let mut group_keys = std::collections::BTreeSet::<String>::new();
        for row in &diff.rows {
            group_keys.insert(row.group_key.clone());
        }
        assert!(
            group_keys.contains(expected),
            "missing bracketed-hex cgroup bucket {expected:?}; got {group_keys:?}; \
             diff.only_baseline={:?}; diff.only_candidate={:?}",
            diff.only_baseline,
            diff.only_candidate,
        );

        // No only-side orphans: the union frequency promoted both
        // paths under the same key.
        assert!(
            diff.only_baseline.is_empty(),
            "no orphans under bracketed-hex collapse, got {:?}",
            diff.only_baseline,
        );
        assert!(
            diff.only_candidate.is_empty(),
            "no orphans under bracketed-hex collapse, got {:?}",
            diff.only_candidate,
        );
    }

    // ------------------------------------------------------------
    // auto_scale + render-cell tests: unit-aware magnitude
    // scaling for ns / B / ticks / unitless cells.
    // ------------------------------------------------------------

    /// Threshold pin for the ns ladder: 0 and 999 ns remain at the
    /// base unit, and exactly 1000 ns is the first value that
    /// steps up to µs.
    #[test]
    fn auto_scale_ns_boundary_stays_at_base_below_threshold() {
        // (input, expected scaled value, expected unit)
        let cases = [
            (0.0, 0.0, "ns"),
            (999.0, 999.0, "ns"),
            (1000.0, 1.0, "µs"),
        ];
        for (input, scaled, unit) in cases {
            assert_eq!(auto_scale(input, ScaleLadder::Ns), (scaled, unit));
        }
    }

    /// Walks the ns ladder one rung at a time: ns → µs (1e3) →
    /// ms (1e6) → s (1e9). The prefixes are decimal SI (NOT IEC
    /// binary), so 1.5 × each power of ten scales back to 1.5.
    #[test]
    fn auto_scale_ns_ladder_steps_up_at_powers_of_ten() {
        let rungs = [
            (1_500.0, "µs"),
            (1_500_000.0, "ms"),
            (1_500_000_000.0, "s"),
        ];
        for (input, unit) in rungs {
            let (v, u) = auto_scale(input, ScaleLadder::Ns);
            assert_eq!(u, unit);
            assert!((v - 1.5).abs() < 1e-9);
        }
    }

    /// The byte ladder steps by IEC binary prefixes (×1024):
    /// 1023 B stays at `B`, 1024 B is 1 KiB, 1024² is 1 MiB and
    /// 1024³ is 1 GiB. Pins both the threshold and the divisor.
    #[test]
    fn auto_scale_byte_iec_ladder_uses_1024() {
        assert_eq!(auto_scale(1023.0, ScaleLadder::Bytes), (1023.0, "B"));

        // Each exact power of 1024 must scale to 1.0 of its unit.
        let kib = 1024.0;
        let rungs = [
            (kib, "KiB"),
            (kib * kib, "MiB"),
            (kib * kib * kib, "GiB"),
        ];
        for (input, unit) in rungs {
            let (v, u) = auto_scale(input, ScaleLadder::Bytes);
            assert_eq!(u, unit);
            assert!((v - 1.0).abs() < 1e-9);
        }
    }

    /// The ticks ladder runs ticks → Kticks (×1e3) → Mticks
    /// (×1e6) with decimal prefixes — clock-tick rate is
    /// host-dependent.
    #[test]
    fn auto_scale_ticks_ladder_uses_decimal_prefixes() {
        assert_eq!(auto_scale(999.0, ScaleLadder::Ticks), (999.0, "ticks"));

        // (input, expected scaled value, expected unit)
        let rungs = [(1_500.0, 1.5, "Kticks"), (2_000_000.0, 2.0, "Mticks")];
        for (input, scaled, unit) in rungs {
            let (v, u) = auto_scale(input, ScaleLadder::Ticks);
            assert_eq!(u, unit);
            assert!((v - scaled).abs() < 1e-9);
        }
    }

    /// The unitless ladder, used for non-dimensional counts
    /// (wakeups, migrations, etc.), runs "" → K → M → G with
    /// decimal SI prefixes.
    #[test]
    fn auto_scale_unitless_ladder_uses_si_prefixes() {
        assert_eq!(auto_scale(999.0, ScaleLadder::Unitless), (999.0, ""));

        // (input, expected scaled value, expected unit)
        let rungs = [
            (1_500.0, 1.5, "K"),
            (2_500_000.0, 2.5, "M"),
            (3_000_000_000.0, 3.0, "G"),
        ];
        for (input, scaled, unit) in rungs {
            let (v, u) = auto_scale(input, ScaleLadder::Unitless);
            assert_eq!(u, unit);
            assert!((v - scaled).abs() < 1e-9);
        }
    }

    /// Negative inputs are scaled with their sign intact. A delta
    /// of `-2,000,000 ns` must come back as `-2` on the `ms` rung
    /// (NOT `+2 ms` or `2 ms`).
    #[test]
    fn auto_scale_preserves_sign_on_negative_input() {
        // (input, ladder, expected scaled value, expected unit)
        let cases = [
            (-2_000_000.0, ScaleLadder::Ns, -2.0, "ms"),
            // The step-up decision looks at magnitude, not the
            // signed value: |-5000| = 5000 ≥ 1024, so this steps
            // to KiB and keeps the minus sign.
            (-5_000.0, ScaleLadder::Bytes, -5000.0 / 1024.0, "KiB"),
        ];
        for (input, ladder, scaled, unit) in cases {
            let (v, u) = auto_scale(input, ladder);
            assert_eq!(u, unit);
            assert!((v - scaled).abs() < 1e-9);
        }
    }

    /// `format_value_cell` on a Sum aggregate with the Ns ladder:
    /// values under the µs threshold render as plain integers
    /// with the base unit; values at or above it render as a
    /// scaled f64 with 3 decimals.
    ///
    /// Historical note (Phase 4): the unknown-unit pass-through
    /// behavior was removed when `auto_scale` migrated from a
    /// free-form `&'static str` unit tag to the closed
    /// [`ScaleLadder`] enum. A registry typo can no longer slip
    /// through an `other => pass-through` arm at render time —
    /// every ladder is named at the type level. The corresponding
    /// `auto_scale_unknown_unit_passes_through` test disappeared
    /// with that change.
    #[test]
    fn format_value_cell_renders_sum_at_appropriate_scale() {
        let cases = [
            // Below threshold → integer + base unit, no decimals.
            (50, "50ns"),
            (999, "999ns"),
            // At/above threshold → scaled f64 with 3 decimals.
            (1_500, "1.500µs"),
            (2_000_000, "2.000ms"),
        ];
        for (raw, rendered) in cases {
            assert_eq!(
                format_value_cell(&Aggregated::Sum(raw), ScaleLadder::Ns),
                rendered,
            );
        }
    }

    /// `format_value_cell` on a Max aggregate scales exactly like
    /// Sum (the *_max kernel fields use ns just like the *_sum
    /// fields): integer below the threshold, 3-decimal scaled
    /// value above it.
    #[test]
    fn format_value_cell_renders_max_at_appropriate_scale() {
        for (raw, rendered) in [(100, "100ns"), (7_500_000, "7.500ms")] {
            assert_eq!(
                format_value_cell(&Aggregated::Max(raw), ScaleLadder::Ns),
                rendered,
            );
        }
    }

    /// Non-numeric aggregates (Mode, OrdinalRange, Affinity) are
    /// rendered by the [`Aggregated`] [`fmt::Display`] impl
    /// unchanged; nothing is scaled because the values are not
    /// scalar counts. This test exercises the Mode and
    /// OrdinalRange cases.
    #[test]
    fn format_value_cell_passes_non_numeric_aggregates_through() {
        let mode_cell = format_value_cell(
            &Aggregated::Mode {
                value: "SCHED_OTHER".into(),
                count: 4,
                total: 4,
            },
            ScaleLadder::None,
        );
        assert_eq!(mode_cell, "SCHED_OTHER");

        let range_cell = format_value_cell(
            &Aggregated::OrdinalRange { min: -5, max: 10 },
            ScaleLadder::None,
        );
        assert_eq!(range_cell, "-5..10");
    }

    /// `format_delta_cell` renders a signed delta with its scaled
    /// unit. The sign is always explicit (`+` for positive). If
    /// no step-up happened AND the delta is integer-valued, the
    /// cell is a bare signed integer (no `.000` noise), matching
    /// the short-circuit in [`format_value_cell`]; in every other
    /// case 3-decimal precision applies.
    #[test]
    fn format_delta_cell_renders_signed_scaled_value() {
        let cases = [
            // Below threshold, integer delta — short-circuit to a
            // bare signed integer.
            (-50.0, "-50ns"),
            (50.0, "+50ns"),
            (0.0, "+0ns"),
            // Below threshold, non-integer delta — keep 3 decimals
            // so sub-unit precision survives (rare in practice —
            // counters are u64-sourced — but possible after delta
            // math on ordinal-range midpoints).
            (50.5, "+50.500ns"),
            // Above threshold — step up. Always 3 decimals because
            // the scale-up path can produce fractional values
            // (`2_000_001 / 1e6 = 2.000001`).
            (2_000_000.0, "+2.000ms"),
            (-2_000_000.0, "-2.000ms"),
        ];
        for (delta, rendered) in cases {
            assert_eq!(format_delta_cell(delta, ScaleLadder::Ns), rendered);
        }
    }

    /// `compare`'s sort order is unaffected by render-time
    /// scaling: the underlying `delta_pct` and `delta` fields
    /// hold the raw numeric values regardless of how cells are
    /// rendered. Pin two rows whose deltas differ in scale (one
    /// in ns range, one in ms-equivalent range) and verify sort
    /// is by raw |delta_pct|, not by rendered string.
    #[test]
    fn auto_scale_does_not_affect_sort_order() {
        let mut a_small = make_thread("small", "w");
        a_small.run_time_ns = MonotonicNs(100);
        let mut a_big = make_thread("big", "w");
        a_big.run_time_ns = MonotonicNs(1_000_000);
        let mut b_small = make_thread("small", "w");
        b_small.run_time_ns = MonotonicNs(110);
        let mut b_big = make_thread("big", "w");
        b_big.run_time_ns = MonotonicNs(2_000_000);
        let diff = compare(
            &snap_with(vec![a_small, a_big]),
            &snap_with(vec![b_small, b_big]),
            &CompareOptions::default(),
        );
        // big: +100% (1M → 2M) vs small: +10% (100 → 110). Big
        // should sort first regardless of which scale the cells
        // render at.
        let run_rows: Vec<&DiffRow> = diff
            .rows
            .iter()
            .filter(|r| r.metric_name == "run_time_ns")
            .collect();
        // Guard the positional checks below. Without this, a
        // missing row fails as a bare index-out-of-bounds panic
        // that says nothing about which rows were emitted.
        let order: Vec<&str> = run_rows.iter().map(|r| r.group_key.as_str()).collect();
        assert!(
            order.len() >= 2,
            "expected run_time_ns rows for both `big` and `small`, got {order:?}",
        );
        assert_eq!(
            run_rows[0].group_key, "big",
            "largest |delta_pct| must sort first; run_time_ns row order was {order:?}",
        );
        assert_eq!(
            run_rows[1].group_key, "small",
            "smaller |delta_pct| must sort second; run_time_ns row order was {order:?}",
        );
    }

    /// Integration test: when both snapshots carry run_time_ns
    /// sums in the ms range, [`write_diff`] emits `*ms` cells.
    /// Pins that the auto-scale call sites for the baseline,
    /// candidate and delta cells take effect end-to-end.
    #[test]
    fn write_diff_renders_auto_scaled_cells_for_ns_metric() {
        // One thread per snapshot; only the run time differs.
        let thread_with_run_time = |ns| {
            let mut thread = make_thread("p", "w");
            thread.run_time_ns = MonotonicNs(ns);
            thread
        };
        let baseline = snap_with(vec![thread_with_run_time(5_000_000)]); // 5 ms
        let candidate = snap_with(vec![thread_with_run_time(8_000_000)]); // 8 ms
        let diff = compare(&baseline, &candidate, &CompareOptions::default());

        let mut out = String::new();
        let display = DisplayOptions::default();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &display,
        )
        .unwrap();

        // Baseline cell: 5 ms with the ms unit.
        let has_baseline = out.contains("5.000ms");
        assert!(has_baseline, "missing baseline ms:\n{out}");
        // Candidate cell.
        let has_candidate = out.contains("8.000ms");
        assert!(has_candidate, "missing candidate ms:\n{out}");
        // Delta cell: +3 ms.
        let has_delta = out.contains("+3.000ms");
        assert!(has_delta, "missing delta ms:\n{out}");
    }

    /// Registry pin: the `utime_clock_ticks` and
    /// `stime_clock_ticks` entries must resolve to
    /// [`ScaleLadder::Ticks`] through `rule.ladder()`, so they
    /// scale on the ticks ladder. Defends against a regression
    /// that routes either entry through the unitless ladder,
    /// which would produce `K` / `M` / `G`-prefix cells.
    #[test]
    fn registry_utime_stime_carry_ticks_unit() {
        // (metric name, message if the registry lacks the entry)
        let entries = [
            ("utime_clock_ticks", "utime_clock_ticks in registry"),
            ("stime_clock_ticks", "stime_clock_ticks in registry"),
        ];
        for (name, missing) in entries {
            let def = CTPROF_METRICS
                .iter()
                .find(|m| m.name == name)
                .expect(missing);
            assert_eq!(def.rule.ladder(), ScaleLadder::Ticks);
        }
    }

    // ------------------------------------------------------------
    // parse_sort_by + multi-key sort tests
    // ------------------------------------------------------------

    /// An empty `--sort-by` value parses successfully to an empty
    /// Vec; the caller then falls back to the default delta_pct
    /// sort.
    #[test]
    fn parse_sort_by_empty_returns_empty_vec() {
        assert!(parse_sort_by("").expect("empty parses").is_empty());
    }

    /// A lone metric name without a direction yields one key that
    /// sorts descending (largest delta first, matching operator
    /// default).
    #[test]
    fn parse_sort_by_single_field_defaults_to_desc() {
        let parsed = parse_sort_by("wait_sum").expect("parse");
        assert_eq!(parsed.len(), 1);
        let key = &parsed[0];
        assert_eq!(key.metric, "wait_sum");
        assert!(key.descending);
    }

    /// A bare metric name padded with whitespace (no colon, no
    /// direction) parses as a single descending key. Pins the
    /// metric-side trim on the `None` arm of the
    /// `split_once(':')` match: `entry.trim()` first strips the
    /// entry-level whitespace, then the `None` arm passes the
    /// trimmed string straight through. Dropping either trim
    /// layer would show up here as a failed registry lookup on
    /// the literal `"  wait_sum  "`.
    #[test]
    fn parse_sort_by_bare_metric_with_whitespace_no_colon() {
        let parsed = parse_sort_by("  wait_sum  ").expect("bare-metric whitespace must parse");
        assert_eq!(parsed.len(), 1);
        let key = &parsed[0];
        assert_eq!(key.metric, "wait_sum");
        assert!(key.descending);
    }

    /// Explicit `:asc` and `:desc` suffixes are honored, and a
    /// multi-key spec mixing both directions keeps each key's own
    /// direction and position.
    #[test]
    fn parse_sort_by_explicit_directions() {
        let parsed = parse_sort_by("wait_sum:asc,run_time_ns:desc").expect("parse");
        // (metric, descending) in spec order.
        let expected = [("wait_sum", false), ("run_time_ns", true)];
        assert_eq!(parsed.len(), expected.len());
        for (key, (metric, descending)) in parsed.iter().zip(expected) {
            assert_eq!(key.metric, metric);
            assert_eq!(key.descending, descending);
        }
    }

    /// Entry-level whitespace (the padding between commas and at
    /// either end of the spec) is trimmed before each entry is
    /// split into metric and direction, so `  wait_sum:desc  `
    /// parses like `wait_sum:desc`. Whitespace around the `:`
    /// itself is pinned separately by
    /// `parse_sort_by_trims_whitespace_around_colon`.
    #[test]
    fn parse_sort_by_trims_whitespace_between_entries() {
        let parsed = parse_sort_by("  wait_sum:desc  ,  run_time_ns:asc  ").expect("parse");
        // (metric, descending) in spec order.
        let expected = [("wait_sum", true), ("run_time_ns", false)];
        assert_eq!(parsed.len(), expected.len());
        for (key, (metric, descending)) in parsed.iter().zip(expected) {
            assert_eq!(key.metric, metric);
            assert_eq!(key.descending, descending);
        }
    }

    /// Spaces on either side of the `:` separator are tolerated:
    /// `wait_sum : desc` parses as if they were absent. Pins both
    /// the metric-side and the direction-side trim; losing the
    /// direction-side trim would surface as an
    /// "invalid direction \" desc\"" error.
    #[test]
    fn parse_sort_by_trims_whitespace_around_colon() {
        let symmetric = parse_sort_by("wait_sum : desc").expect("trimmed colon parse");
        assert_eq!(symmetric.len(), 1);
        let key = &symmetric[0];
        assert_eq!(key.metric, "wait_sum");
        assert!(key.descending);

        // Asymmetric whitespace is also fine.
        let asymmetric = parse_sort_by("run_time_ns:  asc  ").expect("trimmed asc-side parse");
        assert_eq!(asymmetric.len(), 1);
        let key = &asymmetric[0];
        assert_eq!(key.metric, "run_time_ns");
        assert!(!key.descending);
    }

    /// Direction matching ignores case: upper-, title- and
    /// mixed-case spellings of `desc` / `asc` all map to the
    /// canonical semantics. Pins the lowercase normalization so
    /// an operator who typed in caps doesn't get an
    /// "invalid direction" error.
    #[test]
    fn parse_sort_by_direction_is_case_insensitive() {
        // (spec, expected `descending` flag)
        let cases = [
            ("wait_sum:DESC", true),
            ("wait_sum:Desc", true),
            ("wait_sum:dEsC", true),
            ("wait_sum:ASC", false),
            ("wait_sum:Asc", false),
            ("wait_sum:aSc", false),
        ];
        for (spec, descending) in cases {
            let keys = parse_sort_by(spec).unwrap_or_else(|e| panic!("{spec} must parse: {e}"));
            assert_eq!(keys.len(), 1, "{spec}");
            assert_eq!(keys[0].descending, descending, "{spec}");
        }
    }

    /// An unknown metric name fails to parse, and the error text
    /// cites the offending name, introduces the list of valid
    /// names, and hints at bare-metric-name usage.
    #[test]
    fn parse_sort_by_rejects_unknown_metric() {
        let msg = format!("{:#}", parse_sort_by("not_a_real_metric").unwrap_err());

        let cites_name = msg.contains("not_a_real_metric");
        assert!(
            cites_name,
            "error must cite offending metric name, got: {msg}"
        );

        // Also pin the "must be one of" preamble plus at least one
        // canonical valid name, so an operator who hits a typo can
        // recover from the diagnostic alone (without reading the
        // source). `parse_sort_by_unknown_metric_lists_valid_names_sorted`
        // pins the alphabetical order; this lighter test just
        // pins that the list rendering itself fired.
        let has_preamble = msg.contains("must be one of");
        assert!(
            has_preamble,
            "error must include the 'must be one of' preamble that introduces the valid-name list, got: {msg}"
        );
        let lists_known_metric = msg.contains("run_time_ns");
        assert!(
            lists_known_metric,
            "error must list at least one canonical metric name from the registry, got: {msg}"
        );

        // Pin the bare-metric-name hint: rendered cells now carry
        // `[tag]` suffixes (e.g. `wait_sum [non-ext] [SCHEDSTATS]`),
        // and an operator pasting the rendered cell verbatim into
        // `--sort-by` would land here. The error must redirect
        // them to the bare name.
        let has_hint = msg.contains("bare metric name");
        assert!(
            has_hint,
            "error must hint at bare-metric-name usage, got: {msg}"
        );
    }

    /// A tagged cell pasted verbatim into --sort-by is rejected
    /// with an error carrying the bare-metric-name hint. Pins the
    /// hint as actionable for the most likely operator failure
    /// mode after the tag-suffix change.
    #[test]
    fn parse_sort_by_unknown_with_tag_suffix_carries_hint() {
        let msg = format!(
            "{:#}",
            parse_sort_by("wait_sum [non-ext] [SCHEDSTATS]").unwrap_err(),
        );
        let has_hint = msg.contains("bare metric name");
        assert!(
            has_hint,
            "tagged-cell paste must produce the bare-name hint, got: {msg}",
        );
    }

    /// A direction other than `asc` / `desc` is rejected, and the
    /// error names the offending direction so the operator can
    /// act on it.
    #[test]
    fn parse_sort_by_rejects_invalid_direction() {
        let msg = format!("{:#}", parse_sort_by("wait_sum:sideways").unwrap_err());
        let cites_direction = msg.contains("sideways");
        assert!(
            cites_direction,
            "error must cite offending direction, got: {msg}"
        );
    }

    /// Two adjacent commas (`a,,b`) leave an empty entry between
    /// them, which is rejected.
    #[test]
    fn parse_sort_by_rejects_empty_entry() {
        let msg = format!("{:#}", parse_sort_by("wait_sum,,run_time_ns").unwrap_err());
        let mentions_empty = msg.contains("empty entry");
        assert!(
            mentions_empty,
            "error must mention empty entry, got: {msg}"
        );
    }

    /// A trailing comma (`"wait_sum,"`) leaves an empty token at
    /// the tail and is rejected with the same diagnostic as
    /// `"a,,b"`. Pins that `split(',')` semantics produce an
    /// empty trailing entry rather than silently dropping it.
    #[test]
    fn parse_sort_by_rejects_trailing_comma() {
        let msg = format!("{:#}", parse_sort_by("wait_sum,").unwrap_err());
        let mentions_empty = msg.contains("empty entry");
        assert!(
            mentions_empty,
            "trailing comma must surface as empty-entry error, got: {msg}"
        );
    }

    /// A leading comma (`",wait_sum"`) leaves an empty token at
    /// the head — the mirror image of the trailing-comma case.
    /// Pins the symmetric behavior so a stray `,` at either end
    /// of the spec produces a consistent error.
    #[test]
    fn parse_sort_by_rejects_leading_comma() {
        let msg = format!("{:#}", parse_sort_by(",wait_sum").unwrap_err());
        let mentions_empty = msg.contains("empty entry");
        assert!(
            mentions_empty,
            "leading comma must surface as empty-entry error, got: {msg}"
        );
    }

    /// A bare colon (`":"`) splits into an empty metric and an
    /// empty direction. The empty direction is neither `desc` nor
    /// `asc`, so the bad-direction arm fires and cites the empty
    /// token. Pins this branch over the alternative reading
    /// ("metric is empty") so the diagnostic stays
    /// operator-actionable.
    #[test]
    fn parse_sort_by_rejects_bare_colon() {
        let msg = format!("{:#}", parse_sort_by(":").unwrap_err());
        let flags_direction = msg.contains("invalid direction");
        assert!(
            flags_direction,
            "bare colon must surface as invalid-direction error, got: {msg}"
        );
    }

    /// A metric followed by a colon but no direction
    /// (`"wait_sum:"`) splits to (`"wait_sum"`, `""`). The empty
    /// direction is neither `asc` nor `desc`, so the
    /// bad-direction arm fires. A regression that treated the
    /// empty direction as the default `desc` would silently
    /// accept the typo.
    #[test]
    fn parse_sort_by_rejects_metric_colon_no_direction() {
        let msg = format!("{:#}", parse_sort_by("wait_sum:").unwrap_err());
        let flags_direction = msg.contains("invalid direction");
        assert!(
            flags_direction,
            "metric-colon-no-direction must surface as invalid-direction error, got: {msg}"
        );
    }

    /// A categorical metric has no scalar to sort by. That covers
    /// every metric whose [`AggRule`] is a `Mode*` variant:
    /// [`AggRule::Mode`] (`policy`, string), [`AggRule::ModeChar`]
    /// (`state`, char) or [`AggRule::ModeBool`] (`ext_enabled`,
    /// bool). `parse_sort_by` rejects such a metric at the CLI
    /// boundary so the operator gets an actionable error instead
    /// of a silent fall-through to alphabetical group order. The
    /// canonical `policy` registry entry is the pinned example.
    #[test]
    fn parse_sort_by_rejects_categorical_metric() {
        // Sanity: policy is currently registered with AggRule::Mode
        // (the CategoricalString variant — distinct from
        // ModeChar/ModeBool).
        let policy_def = CTPROF_METRICS
            .iter()
            .find(|def| def.name == "policy")
            .expect("policy must be in CTPROF_METRICS");
        let policy_is_mode = matches!(policy_def.rule, AggRule::Mode(_));
        assert!(
            policy_is_mode,
            "test premise drift: policy is no longer Mode-aggregated; \
             pick a different categorical metric for this test",
        );

        let msg = format!("{:#}", parse_sort_by("policy").unwrap_err());
        let labels_failure = msg.contains("categorical");
        assert!(
            labels_failure,
            "categorical metric error must label the failure mode, got: {msg}"
        );
        let names_metric = msg.contains("policy");
        assert!(
            names_metric,
            "categorical metric error must name the offending metric, got: {msg}"
        );
    }

    /// Naming the same metric twice in one spec
    /// (`wait_sum,wait_sum`, or `wait_sum:asc,wait_sum:desc`) is
    /// an error. A repeated key can never influence the
    /// lexicographic ordering — whatever the first occurrence
    /// left tied, the second ties on too — so the repetition is
    /// treated as an operator typo.
    #[test]
    fn parse_sort_by_rejects_duplicate_metric() {
        // Case 1: identical entries.
        let msg = format!("{:#}", parse_sort_by("wait_sum,wait_sum").unwrap_err());
        let labels_failure = msg.contains("duplicate");
        assert!(
            labels_failure,
            "duplicate-metric error must label the failure mode, got: {msg}"
        );
        let names_metric = msg.contains("wait_sum");
        assert!(
            names_metric,
            "duplicate-metric error must name the offending metric, got: {msg}"
        );

        // Case 2: same metric, opposite directions. Still a
        // duplicate: the later entry cannot alter the ordering.
        let msg2 = format!(
            "{:#}",
            parse_sort_by("wait_sum:asc,wait_sum:desc").unwrap_err()
        );
        let still_duplicate = msg2.contains("duplicate");
        assert!(
            still_duplicate,
            "duplicate metric across different directions must still reject, got: {msg2}"
        );
    }

    /// The unknown-metric diagnostic enumerates the valid
    /// registry names as a sorted, comma-separated list rather
    /// than a `BTreeSet` debug dump. This pins the
    /// operator-facing shape: names come out alphabetically so
    /// they can be scanned, and the text is copy-pasteable.
    #[test]
    fn parse_sort_by_unknown_metric_lists_valid_names_sorted() {
        let msg = format!("{:#}", parse_sort_by("not_a_real_metric").unwrap_err());

        // Byte offset of the first occurrence of a name in the
        // rendered message.
        let offset_of = |name: &str| msg.find(name);

        // Three registry names whose alphabetical order is
        // unambiguous: "nice" < "policy" < "run_time_ns". Their
        // offsets in the message must increase in that order.
        let nice_at = offset_of("nice").expect("error must list 'nice' from the registry");
        let policy_at = offset_of("policy").expect("error must list 'policy' from the registry");
        let run_time_at =
            offset_of("run_time_ns").expect("error must list 'run_time_ns' from the registry");

        assert!(
            nice_at < policy_at,
            "names must appear in alphabetical order: \
             nice@{nice_at} < policy@{policy_at}\nmsg: {msg}",
        );
        assert!(
            policy_at < run_time_at,
            "names must appear in alphabetical order: \
             policy@{policy_at} < run_time_ns@{run_time_at}\nmsg: {msg}",
        );

        // A debug-formatted set of strings would open with `{"`;
        // its absence shows the list was joined, not dumped.
        let looks_like_debug_set = msg.contains("{\"");
        assert!(
            !looks_like_debug_set,
            "error must use comma-separated list, not BTreeSet debug dump:\n{msg}"
        );
    }

    /// A multi-key spec comes back as a Vec whose entries are in
    /// the same left-to-right order as the input. This is the
    /// "lexicographic in input order" contract: if parsing
    /// reordered the keys, ranking would silently start from the
    /// wrong metric.
    #[test]
    fn parse_sort_by_multi_key_preserves_order() {
        // Three distinct metric names with alternating
        // directions, so any permutation or direction mix-up
        // changes the observed sequence.
        let keys =
            parse_sort_by("run_time_ns:desc,nr_wakeups:asc,wait_time_ns:desc").expect("parse");
        let observed: Vec<(&str, bool)> = keys.iter().map(|k| (k.metric, k.descending)).collect();
        assert_eq!(
            observed,
            vec![
                ("run_time_ns", true),
                ("nr_wakeups", false),
                ("wait_time_ns", true),
            ]
        );
    }

    /// Multi-key sort: groups rank by the requested metrics'
    /// deltas in tuple order. A big delta on the FIRST key
    /// dominates regardless of the second key.
    ///
    /// Both keys are passed (`run_time_ns` then `wait_sum`, each
    /// descending) and the fixture makes them disagree: A leads
    /// on `run_time_ns` while B leads on `wait_sum`. A regression
    /// that consulted the second key first would put B ahead of
    /// A; with only a single key in the spec the test could not
    /// observe that.
    ///
    /// Exercises `sort_diff_rows_by_keys` directly on synthetic
    /// `DiffRow` values rather than driving through `compare()`
    /// — the function under test is the sort, not the diff
    /// pipeline; building the diff via `compare(empty, full)`
    /// would route every group into `only_baseline` /
    /// `only_candidate` rather than producing the matched-group
    /// rows the sort consumes.
    #[test]
    fn sort_diff_rows_by_keys_ranks_by_first_key_first() {
        // Build synthetic rows: 3 groups × 2 metrics = 6 rows.
        let mk_row = |group: &str, metric: &'static str, delta: f64| DiffRow {
            group_key: group.into(),
            thread_count_a: 1,
            thread_count_b: 1,
            metric_name: metric,
            metric_ladder: ScaleLadder::None,
            baseline: Aggregated::Sum(0),
            candidate: Aggregated::Sum(0),
            delta: Some(delta),
            delta_pct: None,
            display_key: group.into(),
            uptime_pct: None,
            sort_by_cell: None,
            sort_by_delta: None,
        };
        let mut rows = vec![
            mk_row("A", "run_time_ns", 1000.0),
            mk_row("A", "wait_sum", 100.0),
            mk_row("B", "run_time_ns", 100.0),
            mk_row("B", "wait_sum", 1000.0),
            mk_row("C", "run_time_ns", 50.0),
            mk_row("C", "wait_sum", 50.0),
        ];
        sort_diff_rows_by_keys(
            &mut rows,
            &mut Vec::new(),
            &[
                SortKey {
                    metric: "run_time_ns",
                    descending: true,
                },
                // Second key deliberately favours B (1000 vs A's
                // 100): it must NOT override the first key.
                SortKey {
                    metric: "wait_sum",
                    descending: true,
                },
            ],
        );
        let groups_in_order: Vec<&str> = rows.iter().map(|r| r.group_key.as_str()).collect();
        // A has run_time_ns 1000 → first. B has 100 → second. C has 50 → third.
        // Each group's two rows cluster together in registry
        // order (run_time_ns before wait_sum).
        assert_eq!(
            groups_in_order,
            vec!["A", "A", "B", "B", "C", "C"],
            "groups should rank by run_time_ns delta desc",
        );
        // Within each group: run_time_ns row comes first
        // (registry index lower than wait_sum).
        let metrics_first_two: Vec<&str> = rows.iter().take(2).map(|r| r.metric_name).collect();
        assert_eq!(metrics_first_two, vec!["run_time_ns", "wait_sum"]);
    }

    /// Second-key tie-break: when two groups have the same
    /// delta on the first sort key, the second key decides.
    /// Here A and B tie on `run_time_ns`; B has the larger
    /// `wait_sum` delta, so under (desc, desc) B must come
    /// first.
    #[test]
    fn sort_diff_rows_by_keys_breaks_ties_with_second_key() {
        /// Synthetic matched-group row carrying only the fields
        /// the sort reads; everything else is a neutral filler.
        fn mk_row(group: &str, metric: &'static str, delta: f64) -> DiffRow {
            DiffRow {
                group_key: group.into(),
                thread_count_a: 1,
                thread_count_b: 1,
                metric_name: metric,
                metric_ladder: ScaleLadder::None,
                baseline: Aggregated::Sum(0),
                candidate: Aggregated::Sum(0),
                delta: Some(delta),
                delta_pct: None,
                display_key: group.into(),
                uptime_pct: None,
                sort_by_cell: None,
                sort_by_delta: None,
            }
        }

        // run_time_ns: A = B = 500 (tie).
        // wait_sum:    A = 100, B = 200 → B wins the tie-break.
        let mut rows = Vec::new();
        for (group, metric, delta) in [
            ("A", "run_time_ns", 500.0),
            ("A", "wait_sum", 100.0),
            ("B", "run_time_ns", 500.0),
            ("B", "wait_sum", 200.0),
        ] {
            rows.push(mk_row(group, metric, delta));
        }

        let keys = [
            SortKey {
                metric: "run_time_ns",
                descending: true,
            },
            SortKey {
                metric: "wait_sum",
                descending: true,
            },
        ];
        sort_diff_rows_by_keys(&mut rows, &mut Vec::new(), &keys);

        let groups_in_order: Vec<&str> = rows.iter().map(|r| r.group_key.as_str()).collect();
        assert_eq!(groups_in_order, vec!["B", "B", "A", "A"]);
    }

    /// `descending: false` (the `:asc` direction) flips the
    /// ranking: the group with the SMALLEST delta comes first.
    #[test]
    fn sort_diff_rows_by_keys_respects_ascending_direction() {
        // Every row in this fixture is a `run_time_ns` row, so
        // the builder only varies the group and the delta.
        let run_time_row = |group: &str, delta: f64| DiffRow {
            group_key: group.into(),
            thread_count_a: 1,
            thread_count_b: 1,
            metric_name: "run_time_ns",
            metric_ladder: ScaleLadder::None,
            baseline: Aggregated::Sum(0),
            candidate: Aggregated::Sum(0),
            delta: Some(delta),
            delta_pct: None,
            display_key: group.into(),
            uptime_pct: None,
            sort_by_cell: None,
            sort_by_delta: None,
        };
        let mut rows = vec![
            run_time_row("A", 1000.0),
            run_time_row("B", 100.0),
            run_time_row("C", 500.0),
        ];

        let ascending_run_time = [SortKey {
            metric: "run_time_ns",
            descending: false,
        }];
        sort_diff_rows_by_keys(&mut rows, &mut Vec::new(), &ascending_run_time);

        // Ascending: B (100), then C (500), then A (1000).
        let groups_in_order: Vec<&str> = rows.iter().map(|r| r.group_key.as_str()).collect();
        assert_eq!(groups_in_order, vec!["B", "C", "A"]);
    }

    /// End-to-end check that `compare()` honours a non-empty
    /// `sort_by`. The fixture has one group with a large
    /// absolute but tiny relative change and another with a
    /// small absolute but large relative change; with
    /// `run_time_ns:desc` requested, the absolute-delta winner
    /// must come first even though the other group has the
    /// larger |delta_pct| (which is what the default sort ranks
    /// on).
    #[test]
    fn compare_uses_sort_by_when_set() {
        // alpha: 1_000_000_000 → 1_000_000_500 (+500 abs, ~+5e-5 %).
        let mut alpha_before = make_thread("alpha", "w");
        alpha_before.run_time_ns = MonotonicNs(1_000_000_000);
        let mut alpha_after = make_thread("alpha", "w");
        alpha_after.run_time_ns = MonotonicNs(1_000_000_500);
        // bravo: 100 → 200 (+100 abs, +100 %).
        let mut bravo_before = make_thread("bravo", "w");
        bravo_before.run_time_ns = MonotonicNs(100);
        let mut bravo_after = make_thread("bravo", "w");
        bravo_after.run_time_ns = MonotonicNs(200);

        let baseline = snap_with(vec![alpha_before, bravo_before]);
        let candidate = snap_with(vec![alpha_after, bravo_after]);
        let opts = CompareOptions {
            group_by: GroupBy::Pcomm.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: false,
            no_cg_normalize: false,
            sort_by: vec![SortKey {
                metric: "run_time_ns",
                descending: true,
            }],
        };
        let diff = compare(&baseline, &candidate, &opts);

        // Group keys of the run_time_ns rows, in output order.
        let ranked: Vec<&str> = diff
            .rows
            .iter()
            .filter(|r| r.metric_name == "run_time_ns")
            .map(|r| r.group_key.as_str())
            .collect();
        // +500 beats +100 on absolute delta → alpha leads.
        assert_eq!(ranked[0], "alpha", "sort_by abs delta picks alpha");
        assert_eq!(ranked[1], "bravo");
    }

    /// Last-resort tie-break: if every sort-key value is equal
    /// across groups, `sort_diff_rows_by_keys` must order the
    /// groups by ascending `group_key` so output is
    /// deterministic. Without that final comparison the
    /// relative order of fully-tied groups would be left to
    /// whatever order the implementation happens to visit them
    /// in.
    #[test]
    fn sort_diff_rows_by_keys_falls_back_to_ascending_group_key_on_full_tie() {
        // All rows share the same metric and the same delta; only
        // the group name differs.
        let tied_row = |group: &str| DiffRow {
            group_key: group.into(),
            thread_count_a: 1,
            thread_count_b: 1,
            metric_name: "run_time_ns",
            metric_ladder: ScaleLadder::None,
            baseline: Aggregated::Sum(0),
            candidate: Aggregated::Sum(0),
            delta: Some(100.0),
            delta_pct: None,
            display_key: group.into(),
            uptime_pct: None,
            sort_by_cell: None,
            sort_by_delta: None,
        };
        // Feed the groups in reverse-alphabetical order so the
        // expected ascending result differs from the input order.
        let mut rows = vec![tied_row("charlie"), tied_row("bravo"), tied_row("alpha")];

        let by_run_time_desc = [SortKey {
            metric: "run_time_ns",
            descending: true,
        }];
        sort_diff_rows_by_keys(&mut rows, &mut Vec::new(), &by_run_time_desc);

        let order: Vec<&str> = rows.iter().map(|r| r.group_key.as_str()).collect();
        assert_eq!(
            order,
            vec!["alpha", "bravo", "charlie"],
            "full sort-key tie must fall back to ascending group_key",
        );
    }

    /// Missing-metric handling under descending direction:
    /// when a group has no row for the named metric (or its
    /// row's `delta` is `None`), `sort_diff_rows_by_keys`
    /// substitutes `f64::NEG_INFINITY` so the group sinks to
    /// the bottom under desc. Pin the documented contract — a
    /// regression that used 0.0 (or panicked) would surface
    /// here.
    ///
    /// The group that HAS the metric is given a NEGATIVE delta
    /// on purpose: a 0.0 fallback would rank the missing group
    /// above it (0.0 > -100.0), whereas with a positive delta a
    /// 0.0 fallback would still leave the order unchanged and
    /// the regression would go unnoticed.
    #[test]
    fn sort_diff_rows_by_keys_missing_metric_sinks_under_desc() {
        let mk_row = |group: &str, metric: &'static str, delta: Option<f64>| DiffRow {
            group_key: group.into(),
            thread_count_a: 1,
            thread_count_b: 1,
            metric_name: metric,
            metric_ladder: ScaleLadder::None,
            baseline: Aggregated::Sum(0),
            candidate: Aggregated::Sum(0),
            delta,
            delta_pct: None,
            display_key: group.into(),
            uptime_pct: None,
            sort_by_cell: None,
            sort_by_delta: None,
        };
        let mut rows = vec![
            // alpha has a real, negative run_time_ns delta. It
            // must still outrank a missing value under desc.
            mk_row("alpha", "run_time_ns", Some(-100.0)),
            // bravo has only a wait_time_ns row — its run_time_ns
            // tuple value is missing → NEG_INFINITY under desc.
            mk_row("bravo", "wait_time_ns", Some(999_999.0)),
        ];
        sort_diff_rows_by_keys(
            &mut rows,
            &mut Vec::new(),
            &[SortKey {
                metric: "run_time_ns",
                descending: true,
            }],
        );
        // Recover unique group ordering (first occurrence wins).
        let mut order: Vec<&str> = Vec::new();
        for r in &rows {
            if !order.contains(&r.group_key.as_str()) {
                order.push(r.group_key.as_str());
            }
        }
        assert_eq!(
            order,
            vec!["alpha", "bravo"],
            "missing metric under desc must sink the group (NEG_INFINITY)",
        );
    }

    /// Ascending counterpart of the missing-metric test: a
    /// group with no value for the sort metric is expected to be
    /// treated as `f64::INFINITY`, which places it last under
    /// asc. Same fixture shape as the descending case with the
    /// polarity inverted; together the two cover both sides of
    /// the direction-dependent fallback.
    #[test]
    fn sort_diff_rows_by_keys_missing_metric_sinks_under_asc() {
        let row_with_delta = |group: &str, metric: &'static str, delta: f64| DiffRow {
            group_key: group.into(),
            thread_count_a: 1,
            thread_count_b: 1,
            metric_name: metric,
            metric_ladder: ScaleLadder::None,
            baseline: Aggregated::Sum(0),
            candidate: Aggregated::Sum(0),
            delta: Some(delta),
            delta_pct: None,
            display_key: group.into(),
            uptime_pct: None,
            sort_by_cell: None,
            sort_by_delta: None,
        };
        let mut rows = vec![
            // alpha: has the sort metric, with a positive delta.
            row_with_delta("alpha", "run_time_ns", 100.0),
            // bravo: only a wait_time_ns row, so its run_time_ns
            // value is absent and must take the asc fallback.
            row_with_delta("bravo", "wait_time_ns", 50.0),
        ];

        let ascending_run_time = [SortKey {
            metric: "run_time_ns",
            descending: false,
        }];
        sort_diff_rows_by_keys(&mut rows, &mut Vec::new(), &ascending_run_time);

        // Distinct group keys in first-appearance order.
        let order = rows
            .iter()
            .map(|r| r.group_key.as_str())
            .fold(Vec::<&str>::new(), |mut seen, key| {
                if !seen.contains(&key) {
                    seen.push(key);
                }
                seen
            });
        assert_eq!(
            order,
            vec!["alpha", "bravo"],
            "missing metric under asc must sink the group (INFINITY)",
        );
    }

    /// Categorical-only group: every row's `delta` is `None`
    /// (the group's metric is Mode and delta math doesn't
    /// apply), but the group still appears in `rows`.
    /// `sort_diff_rows_by_keys` must surface the group with
    /// the missing-metric fallback applied — no panic, no row
    /// dropped. This guards the second loop in the function
    /// that adds groups present in `rows` but absent from
    /// `group_metrics`.
    ///
    /// The rows are fed in DESCENDING group order so that the
    /// ascending-`group_key` expectation can only be met if the
    /// sort actually reorders them; an input that was already
    /// ascending would pass even if the sort did nothing.
    #[test]
    fn sort_diff_rows_by_keys_categorical_only_group_does_not_panic() {
        let mk_row = |group: &str, metric: &'static str| DiffRow {
            group_key: group.into(),
            thread_count_a: 1,
            thread_count_b: 1,
            metric_name: metric,
            metric_ladder: ScaleLadder::None,
            baseline: Aggregated::Mode {
                value: "SCHED_OTHER".into(),
                count: 1,
                total: 1,
            },
            candidate: Aggregated::Mode {
                value: "SCHED_OTHER".into(),
                count: 1,
                total: 1,
            },
            // `Mode` rows carry `delta: None` because mode
            // metrics have no scalar projection — see
            // `Aggregated::numeric()`.
            delta: None,
            delta_pct: None,
            display_key: group.into(),
            uptime_pct: None,
            sort_by_cell: None,
            sort_by_delta: None,
        };
        // Reverse-alphabetical input: bravo before alpha.
        let mut rows = vec![mk_row("bravo", "policy"), mk_row("alpha", "policy")];
        // Sort by run_time_ns — neither group has it, both fall
        // through to the missing-metric fallback. Final tie-break
        // (`a.cmp(b)`) breaks the tie ascending.
        sort_diff_rows_by_keys(
            &mut rows,
            &mut Vec::new(),
            &[SortKey {
                metric: "run_time_ns",
                descending: true,
            }],
        );
        let order: Vec<&str> = rows.iter().map(|r| r.group_key.as_str()).collect();
        assert_eq!(
            order,
            vec!["alpha", "bravo"],
            "categorical-only groups must survive the sort and fall to ascending group_key",
        );
    }

    /// Rows belonging to one group must come out in
    /// `CTPROF_METRICS` registry order, whatever order they went
    /// in and whatever metric the sort spec names. A regression
    /// that ordered a group's rows by metric name, by input
    /// position, or by sort-key position would change the
    /// per-group layout and fail here.
    #[test]
    fn sort_diff_rows_by_keys_within_group_uses_registry_order() {
        // Single group ("alpha"); the builder varies only the
        // metric and its delta.
        let alpha_row = |metric: &'static str, delta: f64| DiffRow {
            group_key: "alpha".into(),
            thread_count_a: 1,
            thread_count_b: 1,
            metric_name: metric,
            metric_ladder: ScaleLadder::None,
            baseline: Aggregated::Sum(0),
            candidate: Aggregated::Sum(0),
            delta: Some(delta),
            delta_pct: None,
            display_key: "alpha".into(),
            uptime_pct: None,
            sort_by_cell: None,
            sort_by_delta: None,
        };
        // Expected registry sequence is run_time_ns,
        // wait_time_ns, timeslices, nr_wakeups. The rows are
        // inserted in exactly the opposite order, which is
        // neither alphabetical nor the expected one, so any
        // wrong ordering rule shows up in the assertion.
        let mut rows = vec![
            alpha_row("nr_wakeups", 4.0),
            alpha_row("timeslices", 3.0),
            alpha_row("wait_time_ns", 999.0),
            alpha_row("run_time_ns", 1.0),
        ];

        // Sorting on wait_time_ns must not hoist the
        // wait_time_ns row to the front of its group.
        let by_wait_time_desc = [SortKey {
            metric: "wait_time_ns",
            descending: true,
        }];
        sort_diff_rows_by_keys(&mut rows, &mut Vec::new(), &by_wait_time_desc);

        let metric_order: Vec<&str> = rows.iter().map(|r| r.metric_name).collect();
        assert_eq!(
            metric_order,
            vec!["run_time_ns", "wait_time_ns", "timeslices", "nr_wakeups"],
            "within-group order must be registry, not sort-spec, order",
        );
    }

    /// NaN-safe partial_cmp: a `delta` that's NaN must not
    /// panic the sort. `partial_cmp` returns `None` for NaN,
    /// which the comparator maps to `Ordering::Equal` so the
    /// remaining keys (or the group_key tie-break) decide. Pin
    /// that the function survives the NaN input and hands back
    /// exactly the rows it was given: one row per group, none
    /// dropped and none duplicated.
    #[test]
    fn sort_diff_rows_by_keys_nan_delta_does_not_panic() {
        let mk_row = |group: &str, metric: &'static str, delta: f64| DiffRow {
            group_key: group.into(),
            thread_count_a: 1,
            thread_count_b: 1,
            metric_name: metric,
            metric_ladder: ScaleLadder::None,
            baseline: Aggregated::Sum(0),
            candidate: Aggregated::Sum(0),
            delta: Some(delta),
            delta_pct: None,
            display_key: group.into(),
            uptime_pct: None,
            sort_by_cell: None,
            sort_by_delta: None,
        };
        let mut rows = vec![
            mk_row("alpha", "run_time_ns", f64::NAN),
            mk_row("bravo", "run_time_ns", 100.0),
            mk_row("charlie", "run_time_ns", f64::NAN),
        ];
        // The function call must not panic; output ordering is
        // unspecified for NaN-vs-NaN beyond the group_key
        // tie-break, so we only assert on the multiset of groups
        // that survive the sort.
        sort_diff_rows_by_keys(
            &mut rows,
            &mut Vec::new(),
            &[SortKey {
                metric: "run_time_ns",
                descending: true,
            }],
        );
        let mut groups: Vec<&str> = rows.iter().map(|r| r.group_key.as_str()).collect();
        // Sort for an order-independent comparison, but do NOT
        // dedup: the input has exactly one row per group, so a
        // duplicated row must show up as an extra element here.
        groups.sort();
        assert_eq!(
            groups,
            vec!["alpha", "bravo", "charlie"],
            "NaN delta must not drop or duplicate any group",
        );
    }

    /// With an empty `sort_by`, `compare()` must fall back to
    /// the default `delta_pct desc` ordering instead of going
    /// through `sort_diff_rows_by_keys`. The same fixture is run
    /// once with no sort keys and once with `run_time_ns:desc`,
    /// and the two runs must disagree on the order. Paired with
    /// `compare_uses_sort_by_when_set`, this covers both
    /// outcomes of the empty-vs-non-empty `sort_by` routing.
    #[test]
    fn compare_uses_default_sort_when_sort_by_empty() {
        // alpha: 1_000_000_000 → 1_000_000_500. Large absolute
        //        change (+500), negligible percentage.
        // bravo: 100 → 200. Small absolute change (+100), +100 %.
        let mut a_pre = make_thread("alpha", "w");
        a_pre.run_time_ns = MonotonicNs(1_000_000_000);
        let mut a_post = make_thread("alpha", "w");
        a_post.run_time_ns = MonotonicNs(1_000_000_500);
        let mut b_pre = make_thread("bravo", "w");
        b_pre.run_time_ns = MonotonicNs(100);
        let mut b_post = make_thread("bravo", "w");
        b_post.run_time_ns = MonotonicNs(200);

        // Runs `compare()` over the fixture with the given sort
        // spec and returns the group keys of the `run_time_ns`
        // rows in output order.
        let run_time_order = |sort_by: Vec<SortKey>| -> Vec<String> {
            let diff = compare(
                &snap_with(vec![a_pre.clone(), b_pre.clone()]),
                &snap_with(vec![a_post.clone(), b_post.clone()]),
                &CompareOptions {
                    group_by: GroupBy::Pcomm.into(),
                    cgroup_flatten: vec![],
                    no_thread_normalize: false,
                    no_cg_normalize: false,
                    sort_by,
                },
            );
            diff.rows
                .iter()
                .filter(|r| r.metric_name == "run_time_ns")
                .map(|r| r.group_key.as_str().to_owned())
                .collect()
        };

        // No sort keys → default ordering, ranked on |delta_pct|.
        let default_order = run_time_order(Vec::new());
        assert_eq!(
            default_order,
            vec!["bravo", "alpha"],
            "empty sort_by must use default delta_pct desc sort \
             (bravo's +100% beats alpha's +5e-5 %)",
        );

        // Explicit key → multi-key path, ranked on absolute delta.
        let sort_order = run_time_order(vec![SortKey {
            metric: "run_time_ns",
            descending: true,
        }]);
        assert_eq!(
            sort_order,
            vec!["alpha", "bravo"],
            "non-empty sort_by must use multi-key path (alpha's +500 abs beats bravo's +100)",
        );

        // Guard against the fixture degenerating into one where
        // both paths happen to agree.
        assert_ne!(
            default_order, sort_order,
            "empty vs non-empty sort_by must produce different orderings on this fixture",
        );
    }

    /// Auto-scale edge case: a value of zero renders as a bare
    /// `0<unit>` at the base unit of each of the five ladders
    /// exercised here — `0ns`, `0µs`, `0B`, `0ticks`, and a
    /// plain `0` for the unitless ladder. A regression in the
    /// step-up threshold comparison that let zero qualify for a
    /// larger unit would surface as a different suffix.
    #[test]
    fn format_scaled_u64_zero_renders_at_base_unit_for_all_families() {
        let zero_renders = [
            (ScaleLadder::Ns, "0ns"),
            (ScaleLadder::Us, "0µs"),
            (ScaleLadder::Bytes, "0B"),
            (ScaleLadder::Ticks, "0ticks"),
            // Unitless: just the integer, no suffix at all.
            (ScaleLadder::Unitless, "0"),
        ];
        for (ladder, expected) in zero_renders {
            assert_eq!(format_scaled_u64(0, ladder), expected);
        }
    }

    /// A negative microsecond delta is auto-scaled and keeps its
    /// sign: -1_500_000 µs must render as `-1.500s`, i.e. the
    /// magnitude is stepped up to seconds (not left as
    /// `-1500000µs`) and the leading `-` is preserved through
    /// the scaling.
    #[test]
    fn format_delta_cell_negative_microseconds_scales_to_seconds() {
        let delta_us = -1_500_000.0;
        assert_eq!(format_delta_cell(delta_us, ScaleLadder::Us), "-1.500s");
    }

    /// A negative byte delta is auto-scaled on the binary (IEC)
    /// ladder and keeps its sign: -2 GiB worth of bytes must
    /// render as `-2.000GiB`. Covers the negative side of the
    /// byte path specifically.
    #[test]
    fn format_delta_cell_negative_bytes_scales_to_gib() {
        // One GiB in bytes, as a float: 1024³.
        let gib = 1024.0 * 1024.0 * 1024.0;
        let rendered = format_delta_cell(-(2.0 * gib), ScaleLadder::Bytes);
        assert_eq!(rendered, "-2.000GiB");
    }

    /// The three parts of a `cgroup_cell` rendering — baseline,
    /// candidate, and delta — each choose their own unit. With a
    /// baseline just under the µs→ms step and a candidate above
    /// it, the baseline stays in µs while the candidate and the
    /// delta are shown in ms; no part inherits another's scale.
    #[test]
    fn cgroup_cell_each_cell_scales_independently() {
        // 999 µs  → below the step, rendered `999µs`.
        // 2000 µs → above the step, rendered `2.000ms`.
        // Delta = +1001 µs → above the step, rendered `+1.001ms`.
        let (before, after) = (Some(999), Some(2000));
        let rendered = cgroup_cell(before, after, ScaleLadder::Us);
        assert_eq!(
            rendered, "999µs → 2.000ms (+1.001ms)",
            "asymmetric scaling: each cell must pick its own prefix",
        );
    }

    // ------------------------------------------------------------
    // Token separator extension: `[` and `]`
    //
    // The bracket-as-separator change feeds the smaps_rollup
    // normalization path: `pcomm[tgid]` keys tokenize the tgid
    // digits independently of the surrounding pcomm so ephemeral
    // PIDs collapse into one bucket per pcomm pattern across
    // snapshots. The pin tests below exercise the new behavior at
    // every layer that consumes `is_token_separator`.
    // ------------------------------------------------------------

    /// `[` and `]` behave like every other separator, so the
    /// token between them is normalized on its own:
    /// `worker[42]` becomes `worker[{N}]`. If the brackets were
    /// dropped from the separator class, the bracketed digits
    /// would stay glued to their neighbours and these inputs
    /// would come back unchanged.
    #[test]
    fn pattern_key_normalizes_bracketed_digits() {
        let cases = [
            ("worker[42]", "worker[{N}]"),
            ("systemd-network[105904]", "systemd-network[{N}]"),
            // `bash` is pure alpha and stays literal; only the
            // all-digit tgid inside the brackets is replaced.
            ("bash[4242]", "bash[{N}]"),
            // A hex-looking token that contains a digit maps to
            // `{H}` rather than `{N}`.
            ("dev[1ab]", "dev[{H}]"),
        ];
        for (input, expected) in cases {
            assert_eq!(pattern_key(input), expected);
        }
    }

    /// `split_into_segments` reports brackets as separator
    /// segments, verbatim. Covers a bracket standing alone and a
    /// bracket that fuses with a neighbouring separator character
    /// into one multi-character run.
    #[test]
    fn split_into_segments_treats_brackets_as_separators() {
        // Standalone brackets around a digit token.
        let expected_simple = vec![
            Segment::Token("worker"),
            Segment::Separator("["),
            Segment::Token("42"),
            Segment::Separator("]"),
        ];
        assert_eq!(split_into_segments("worker[42]"), expected_simple);

        // `-` immediately followed by `[` forms a single
        // separator run `-[`.
        let expected_merged = vec![
            Segment::Token("a"),
            Segment::Separator("-["),
            Segment::Token("1"),
            Segment::Separator("]"),
        ];
        assert_eq!(split_into_segments("a-[1]"), expected_merged);
    }

    /// Direct check of the predicate: both bracket characters are
    /// token separators. Keeps a dropped bracket visible at the
    /// unit level instead of only through the end-to-end
    /// pattern-rejoin tests.
    #[test]
    fn is_token_separator_includes_brackets() {
        let (open, close) = ('[', ']');
        assert!(is_token_separator(open));
        assert!(is_token_separator(close));
    }

    // ------------------------------------------------------------
    // GroupBy::Pcomm normalization
    //
    // Pcomm now flows through the same token-based normalizer as
    // Comm — ephemeral worker pools whose pcomm differs only by
    // a digit suffix collapse across snapshots. The pin tests
    // below mirror the Comm-axis tests on the Pcomm axis.
    // ------------------------------------------------------------

    /// Under default Pcomm normalization, kworker-style process
    /// names fold into a single bucket: `kworker/0:0`,
    /// `kworker/1:0` and `kworker/3:2` all share the skeleton
    /// `kworker/{N}:{N}`, so the three-process fleet lands in one
    /// group with a thread count of 3. Pcomm-axis counterpart of
    /// [`build_groups_comm_kworker_bare_collapses_across_cpus`].
    #[test]
    fn build_groups_pcomm_kworker_collapses_across_cpus() {
        let fleet = vec![
            make_thread("kworker/0:0", "t0"),
            make_thread("kworker/1:0", "t1"),
            make_thread("kworker/3:2", "t2"),
        ];
        let snapshot = snap_with(fleet);
        let buckets = build_groups(&snapshot, GroupBy::Pcomm, &[], None, None, false);
        assert_eq!(buckets["kworker/{N}:{N}"].thread_count, 3);
        assert_eq!(buckets.len(), 1);
    }

    /// A pcomm pattern with only one member falls back to the
    /// literal name: a lone `worker-7` must be keyed as
    /// `worker-7`, never as a `worker-{N}` pattern nobody else
    /// shares. Pcomm-axis counterpart of
    /// [`build_groups_comm_singleton_reverts_to_literal`].
    #[test]
    fn build_groups_pcomm_singleton_reverts_to_literal() {
        let lone = make_thread("worker-7", "t0");
        let snapshot = snap_with(vec![lone]);
        let buckets = build_groups(&snapshot, GroupBy::Pcomm, &[], None, None, false);
        assert!(
            buckets.contains_key("worker-7"),
            "lone worker-7 stays literal under Pcomm normalization",
        );
        assert!(
            !buckets.contains_key("worker-{N}"),
            "no `worker-{{N}}` pattern key for a singleton pcomm",
        );
        assert_eq!(buckets.len(), 1);
    }

    /// With `no_thread_normalize: true`, a [`GroupBy::Pcomm`]
    /// comparison keys rows by the literal pcomm: `worker-7` and
    /// `worker-15` each keep their own bucket and no `worker-{N}`
    /// pattern bucket is emitted. Pcomm-axis counterpart of
    /// [`no_thread_normalize_uses_literal_comm`].
    #[test]
    fn no_thread_normalize_uses_literal_pcomm() {
        // The same two-process fleet on both sides of the diff.
        let fleet = || {
            snap_with(vec![
                make_thread("worker-7", "t0"),
                make_thread("worker-15", "t1"),
            ])
        };
        let (snap_a, snap_b) = (fleet(), fleet());
        let opts = CompareOptions {
            group_by: GroupBy::Pcomm.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: true,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&snap_a, &snap_b, &opts);

        let group_keys: std::collections::BTreeSet<&str> =
            diff.rows.iter().map(|row| row.group_key.as_str()).collect();
        assert!(
            group_keys.contains("worker-7"),
            "literal worker-7 missing under no_thread_normalize: {group_keys:?}",
        );
        assert!(
            group_keys.contains("worker-15"),
            "literal worker-15 missing under no_thread_normalize: {group_keys:?}",
        );
        assert!(
            !group_keys.contains("worker-{N}"),
            "no normalized bucket under no_thread_normalize on Pcomm: {group_keys:?}",
        );
    }

    /// The pattern-promotion decision is made over the union of
    /// both snapshots. Baseline holds one `worker-*` process and
    /// candidate holds two; judged alone the baseline member
    /// would stay literal, but the combined count of 3 promotes
    /// the pattern so both sides join on `worker-{N}` and neither
    /// side is left with an orphaned only-in-one-snapshot entry.
    /// Pcomm-axis counterpart of
    /// [`compare_comm_pattern_joins_across_asymmetric_resize`].
    #[test]
    fn compare_pcomm_pattern_joins_across_asymmetric_resize() {
        let baseline = snap_with(vec![make_thread("worker-7", "t0")]);
        let candidate = snap_with(vec![
            make_thread("worker-0", "t0"),
            make_thread("worker-1", "t1"),
        ]);
        let opts = CompareOptions {
            group_by: GroupBy::Pcomm.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: false,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&baseline, &candidate, &opts);

        // The joined row carries the per-side member counts.
        let joined = diff
            .rows
            .iter()
            .find(|r| r.metric_name == "run_time_ns" && r.group_key == "worker-{N}")
            .expect("worker-{N} pcomm row joined across asymmetric snapshots");
        assert_eq!(joined.thread_count_a, 1, "baseline carries 1 worker process");
        assert_eq!(
            joined.thread_count_b, 2,
            "candidate carries 2 worker processes"
        );

        // Neither unmatched list may mention the worker family.
        let baseline_orphans: Vec<&String> = diff
            .only_baseline
            .iter()
            .filter(|key| key.starts_with("worker"))
            .collect();
        assert!(
            baseline_orphans.is_empty(),
            "no worker-prefixed pcomm orphans in only_baseline; got {baseline_orphans:?}",
        );
        let candidate_orphans: Vec<&String> = diff
            .only_candidate
            .iter()
            .filter(|key| key.starts_with("worker"))
            .collect();
        assert!(
            candidate_orphans.is_empty(),
            "no worker-prefixed pcomm orphans in only_candidate; got {candidate_orphans:?}",
        );
    }

    /// End-to-end check of the two keys on a Pcomm `DiffRow`:
    /// `group_key` is the placeholder skeleton `worker-{N}` used
    /// for joining, while `display_key` is a separate label that
    /// still mentions `worker` but differs from the bare
    /// placeholder once several distinct members contribute.
    /// Pcomm-axis counterpart of
    /// [`compare_comm_pattern_emits_prefix_join_key_and_grex_display`].
    #[test]
    fn compare_pcomm_pattern_emits_prefix_join_key_and_grex_display() {
        // Four distinct member names, two per side.
        let baseline = snap_with(vec![
            make_thread("worker-0", "t0"),
            make_thread("worker-1", "t1"),
        ]);
        let candidate = snap_with(vec![
            make_thread("worker-2", "t0"),
            make_thread("worker-3", "t1"),
        ]);
        let opts = CompareOptions {
            group_by: GroupBy::Pcomm.into(),
            cgroup_flatten: vec![],
            no_thread_normalize: false,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&baseline, &candidate, &opts);

        let worker_row = diff
            .rows
            .iter()
            .find(|r| r.metric_name == "run_time_ns" && r.group_key == "worker-{N}")
            .expect("worker-{N} pcomm row");
        // NOTE(review): already implied by the `find` predicate
        // above; retained as an explicit statement of intent.
        assert_eq!(
            worker_row.group_key, "worker-{N}",
            "join key is the placeholder pattern under Pcomm normalization",
        );
        assert!(
            worker_row.display_key.contains("worker"),
            "display key reflects grex over union; got {:?}",
            worker_row.display_key,
        );
        // With more than one member the label must be something
        // other than the placeholder skeleton itself.
        assert_ne!(
            worker_row.display_key, "worker-{N}",
            "≥2 members → grex regex, not the placeholder pattern",
        );
    }

    /// Bookkeeping check: grouping by Pcomm pattern must neither
    /// lose nor invent `run_time_ns`. The per-bucket sums added
    /// together have to equal the total over the raw input
    /// threads. Pcomm-axis counterpart of
    /// [`build_groups_comm_sum_conservation_across_buckets`].
    #[test]
    fn build_groups_pcomm_sum_conservation_across_buckets() {
        let mut threads = Vec::new();
        // Five `worker-*` processes: 100, 200, ..., 500 ns.
        threads.extend((0..5u64).map(|i| {
            let mut t = make_thread(&format!("worker-{i}"), "t");
            t.run_time_ns = MonotonicNs(100 * (i + 1));
            t
        }));
        // Three `redis-bg-*` processes: 50, 100, 150 ns.
        threads.extend((0..3u64).map(|i| {
            let mut t = make_thread(&format!("redis-bg-{i}"), "t");
            t.run_time_ns = MonotonicNs(50 * (i + 1));
            t
        }));
        // One process whose name has nothing to normalize.
        let mut lone = make_thread("init", "t");
        lone.run_time_ns = MonotonicNs(999);
        threads.push(lone);

        let input_total: u64 = threads.iter().map(|t| t.run_time_ns.0).sum();
        let snapshot = snap_with(threads);
        let buckets = build_groups(&snapshot, GroupBy::Pcomm, &[], None, None, false);

        // A bucket without a `Sum` aggregate for the metric
        // contributes nothing to the total.
        let aggregated_total: u64 = buckets
            .values()
            .map(|bucket| {
                if let Some(Aggregated::Sum(n)) = bucket.metrics.get("run_time_ns") {
                    *n
                } else {
                    0
                }
            })
            .sum();
        assert_eq!(
            aggregated_total, input_total,
            "Pcomm pattern-aggregated sum must equal input sum",
        );
    }

    // ------------------------------------------------------------
    // collect_smaps_rollup normalization
    //
    // smaps_rollup keys default to `pattern_key(&t.pcomm)` (the
    // tgid is dropped) so ephemeral PIDs collapse into one bucket
    // per pcomm pattern; multiple PIDs mapping to the same key SUM
    // their per-field byte counts. Under `no_thread_normalize:
    // true`, the literal `pcomm[tgid]` shape is preserved instead
    // so each PID stays attributable.
    // ------------------------------------------------------------

    /// Test fixture: a thread-group leader (`tid == tgid`) whose
    /// `smaps_rollup_kb` map holds `Rss` and `Pss` entries, in
    /// kB. `comm` mirrors `pcomm`, the cgroup is `/`, and all
    /// remaining fields take their `Default` values.
    fn smaps_thread(pcomm: &str, tgid: u32, rss_kb: u64, pss_kb: u64) -> ThreadState {
        let mut leader = ThreadState {
            pcomm: pcomm.into(),
            comm: pcomm.into(),
            cgroup: "/".into(),
            tgid,
            tid: tgid,
            ..ThreadState::default()
        };
        for (field, kb) in [("Rss", rss_kb), ("Pss", pss_kb)] {
            leader.smaps_rollup_kb.insert(field.into(), kb);
        }
        leader
    }

    /// With normalization on, three leaders named
    /// `worker-{0,1,2}` (each under its own tgid) share the single
    /// key `worker-{N}` — there is no `[tgid]` suffix — and every
    /// smaps field is the sum of the three contributors,
    /// expressed in bytes.
    #[test]
    fn collect_smaps_rollup_normalizes_and_sums_across_pids() {
        let snapshot = snap_with(vec![
            smaps_thread("worker-0", 100, 1024, 512),
            smaps_thread("worker-1", 200, 2048, 1024),
            smaps_thread("worker-2", 300, 4096, 2048),
        ]);
        let out = collect_smaps_rollup(&snapshot, false);
        assert_eq!(out.len(), 1, "three PIDs collapse into one bucket: {out:?}");
        let merged = out
            .get("worker-{N}")
            .expect("bucket key is pattern_key(pcomm) — no `[tgid]` suffix");

        // Fixture values are kB; the rollup reports bytes.
        let expected_rss_bytes = (1024 + 2048 + 4096) * 1024;
        let expected_pss_bytes = (512 + 1024 + 2048) * 1024;
        assert_eq!(
            merged.get("Rss").copied(),
            Some(expected_rss_bytes),
            "Rss SUMs across the three collapsed PIDs",
        );
        assert_eq!(
            merged.get("Pss").copied(),
            Some(expected_pss_bytes),
            "Pss SUMs across the three collapsed PIDs",
        );
    }

    /// The smaps axis never reverts a singleton to its literal
    /// name. A lone `worker-7` leader is still keyed as
    /// `worker-{N}`, unlike the primary Pcomm table, which keeps
    /// a single-member pattern literal. The stable key is what
    /// lets a process whose PID changed between snapshots still
    /// join with itself.
    #[test]
    fn collect_smaps_rollup_no_singleton_revert_when_normalizing() {
        // One leader only: the case where primary Pcomm grouping
        // would fall back to the literal `worker-7`.
        let snapshot = snap_with(vec![smaps_thread("worker-7", 99, 1024, 512)]);
        let rollup = collect_smaps_rollup(&snapshot, false);
        assert_eq!(rollup.len(), 1);

        let keys = rollup.keys().collect::<Vec<_>>();
        assert!(
            rollup.contains_key("worker-{N}"),
            "lone worker-7 must STILL normalize to worker-{{N}} for smaps; \
             singleton-revert is intentionally skipped on the smaps axis: \
             got {:?}",
            keys,
        );
        assert!(
            !rollup.contains_key("worker-7"),
            "literal singleton key must NOT appear under default smaps \
             normalization: got {:?}",
            keys,
        );
    }

    /// Passing `true` for the no-normalize flag keeps the literal
    /// `pcomm[tgid]` key, so every PID remains individually
    /// attributable: three workers yield three buckets, each
    /// holding only its own Rss (converted to bytes), with no
    /// cross-PID summation.
    #[test]
    fn collect_smaps_rollup_no_normalize_preserves_literal_pid_keys() {
        let snapshot = snap_with(vec![
            smaps_thread("worker-0", 100, 1024, 512),
            smaps_thread("worker-1", 200, 2048, 1024),
            smaps_thread("worker-2", 300, 4096, 2048),
        ]);
        let out = collect_smaps_rollup(&snapshot, true);
        assert_eq!(
            out.len(),
            3,
            "no_normalize keeps three distinct PID buckets"
        );
        // (literal key, Rss in kB as fed to the fixture)
        let per_pid = [
            ("worker-0[100]", 1024),
            ("worker-1[200]", 2048),
            ("worker-2[300]", 4096),
        ];
        for (key, rss_kb) in per_pid {
            assert_eq!(out[key]["Rss"], rss_kb * 1024);
        }
    }

    /// Boundary case: a snapshot with no threads yields an empty
    /// rollup in both the normalizing and the literal mode —
    /// nothing is synthesized and nothing panics.
    #[test]
    fn collect_smaps_rollup_empty_snapshot_returns_empty_map() {
        let empty = snap_with(vec![]);
        let normalized = collect_smaps_rollup(&empty, false);
        assert!(normalized.is_empty());
        let literal = collect_smaps_rollup(&empty, true);
        assert!(literal.is_empty());
    }

    /// A thread with an empty `smaps_rollup_kb` map — the shape
    /// a non-leader thread (`tid != tgid`) has here — must not
    /// create a bucket of its own. Checked under both
    /// normalization modes: only the leader's bucket exists.
    #[test]
    fn collect_smaps_rollup_skips_non_leader_threads() {
        let leader = smaps_thread("worker-0", 100, 1024, 512);
        // Same process (tgid 100) but a different tid.
        let mut sibling = ThreadState {
            pcomm: "worker-0".into(),
            comm: "worker-0".into(),
            cgroup: "/".into(),
            tgid: 100,
            tid: 101,
            ..ThreadState::default()
        };
        // The default map is already empty; the explicit `clear`
        // is a deliberate no-op that spells out the assumption.
        assert!(sibling.smaps_rollup_kb.is_empty());
        sibling.smaps_rollup_kb.clear();
        let snapshot = snap_with(vec![leader, sibling]);

        // Normalizing mode: just the leader, under the pattern key.
        let normalized = collect_smaps_rollup(&snapshot, false);
        assert_eq!(normalized.len(), 1);
        assert!(normalized.contains_key("worker-{N}"));

        // Literal mode: just the leader, under `pcomm[tgid]`.
        let literal = collect_smaps_rollup(&snapshot, true);
        assert_eq!(literal.len(), 1);
        assert!(literal.contains_key("worker-0[100]"));
    }

    /// Two PIDs that share a pcomm pattern but report different,
    /// partially overlapping smaps field sets merge into one
    /// bucket holding the union of fields. A field reported by
    /// both is summed; a field reported by only one keeps that
    /// contributor's value. A field missing on one side must not
    /// mask a value present on the other.
    #[test]
    fn collect_smaps_rollup_merge_carries_every_field_seen() {
        // First PID: Rss + Pss (from the helper), no Private_Clean.
        let first = smaps_thread("worker-0", 100, 1024, 512);
        assert!(!first.smaps_rollup_kb.contains_key("Private_Clean"));

        // Second PID: Rss + Private_Clean, no Pss.
        let mut second = ThreadState {
            pcomm: "worker-1".into(),
            comm: "worker-1".into(),
            cgroup: "/".into(),
            tgid: 200,
            tid: 200,
            ..ThreadState::default()
        };
        second.smaps_rollup_kb.insert("Rss".into(), 2048);
        second.smaps_rollup_kb.insert("Private_Clean".into(), 256);

        let snapshot = snap_with(vec![first, second]);
        let rollup = collect_smaps_rollup(&snapshot, false);
        let merged = rollup.get("worker-{N}").expect("merged bucket");
        // Shared field: both contributions added.
        assert_eq!(merged.get("Rss").copied(), Some((1024 + 2048) * 1024));
        // Present on the first PID only.
        assert_eq!(merged.get("Pss").copied(), Some(512 * 1024));
        // Present on the second PID only.
        assert_eq!(merged.get("Private_Clean").copied(), Some(256 * 1024));
    }

    /// The smaps section lists processes heaviest-Rss first. The
    /// fixture names the light process `a_light[1]` and the heavy
    /// one `b_heavy[2]`, so plain alphabetical key order would
    /// print the light one first; the test requires `b_heavy[2]`
    /// to appear earlier in the rendered section.
    #[test]
    fn write_diff_smaps_orders_processes_by_rss_desc() {
        let mib = 1024 * 1024;
        // Heavy process: 100 MiB → 200 MiB Rss.
        let heavy_a = BTreeMap::from([
            ("Rss".to_string(), 100 * mib),
            ("Pss".to_string(), 50 * mib),
        ]);
        let heavy_b = BTreeMap::from([
            ("Rss".to_string(), 200 * mib),
            ("Pss".to_string(), 100 * mib),
        ]);
        // Light process: 1 KiB → 2 KiB Rss.
        let light_a = BTreeMap::from([
            ("Rss".to_string(), 1024),
            ("Pss".to_string(), 512),
        ]);
        let light_b = BTreeMap::from([
            ("Rss".to_string(), 2048),
            ("Pss".to_string(), 1024),
        ]);

        let mut diff = CtprofDiff::default();
        diff.smaps_rollup_a
            .insert("a_light[1]".to_string(), light_a);
        diff.smaps_rollup_b
            .insert("a_light[1]".to_string(), light_b);
        diff.smaps_rollup_a
            .insert("b_heavy[2]".to_string(), heavy_a);
        diff.smaps_rollup_b
            .insert("b_heavy[2]".to_string(), heavy_b);

        let mut rendered = String::new();
        write_diff(
            &mut rendered,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        // Only positions inside the smaps section are compared.
        let section_start = rendered
            .find("## smaps_rollup")
            .expect("smaps section must render");
        let section = &rendered[section_start..];
        let heavy_pos = section.find("b_heavy[2]").expect("b_heavy must appear");
        let light_pos = section.find("a_light[1]").expect("a_light must appear");
        assert!(
            heavy_pos < light_pos,
            "process with larger Rss must render first; \
             b_heavy@{heavy_pos} must precede a_light@{light_pos}",
        );
    }

    /// Defensive overflow pin: two leaders each claim an Rss of
    /// `u64::MAX` kB (not a realistic value). Merging them must
    /// not panic, and the merged Rss must come out as exactly
    /// `u64::MAX` — the result clamps rather than wrapping or
    /// aborting.
    #[test]
    fn collect_smaps_rollup_saturating_add_does_not_panic_on_overflow() {
        let huge_kb = u64::MAX;
        let snapshot = snap_with(vec![
            smaps_thread("worker-0", 100, huge_kb, 1),
            smaps_thread("worker-1", 200, huge_kb, 1),
        ]);
        let rollup = collect_smaps_rollup(&snapshot, false);
        let merged = rollup.get("worker-{N}").expect("merged bucket");
        // Both the kB → byte conversion and the cross-PID sum would
        // exceed u64 here. Whichever step clamps first, the
        // observable value is pinned below.
        let merged_rss = merged
            .get("Rss")
            .copied()
            .expect("Rss key present after overflow");
        assert_eq!(
            merged_rss,
            u64::MAX,
            "saturating_add must clamp to u64::MAX, not panic",
        );
    }

    /// Bracketed tokens without any digit are left alone:
    /// `pattern_key` hands the input back unchanged, brackets
    /// included. This holds even for `abc`, which consists only
    /// of hex letters but contains no digit.
    #[test]
    fn pattern_key_bracket_alpha_token_stays_literal() {
        let unchanged = ["foo[bar]", "a[b]", "dev[abc]"];
        for literal in unchanged {
            assert_eq!(pattern_key(literal), literal);
        }
    }

    /// Families with different literal prefixes stay apart under
    /// Pcomm grouping: `worker-0/1` forms `worker-{N}` and
    /// `worker-large-0/1` forms `worker-large-{N}` — two buckets
    /// of two members each, never one merged bucket. Pcomm-axis
    /// counterpart of
    /// [`build_groups_comm_distinct_prefixes_do_not_merge`].
    #[test]
    fn build_groups_pcomm_distinct_prefixes_do_not_merge() {
        let fleet = vec![
            make_thread("worker-0", "t"),
            make_thread("worker-1", "t"),
            make_thread("worker-large-0", "t"),
            make_thread("worker-large-1", "t"),
        ];
        let snapshot = snap_with(fleet);
        let buckets = build_groups(&snapshot, GroupBy::Pcomm, &[], None, None, false);
        assert_eq!(buckets["worker-{N}"].thread_count, 2);
        assert_eq!(buckets["worker-large-{N}"].thread_count, 2);
        assert_eq!(buckets.len(), 2);
    }

    /// A single leader whose pcomm is the pure-alpha `bash`
    /// produces exactly one smaps bucket keyed `bash`. No
    /// normalization rule rewrites the name and the tgid suffix
    /// is absent even though only one PID exists, so `bash[42]`
    /// must not appear as a key.
    #[test]
    fn collect_smaps_rollup_singleton_drops_tgid_suffix() {
        let snapshot = snap_with(vec![smaps_thread("bash", 42, 4096, 1024)]);
        let rollup = collect_smaps_rollup(&snapshot, false);
        assert_eq!(rollup.len(), 1);

        let keys = rollup.keys().collect::<Vec<_>>();
        assert!(
            rollup.contains_key("bash"),
            "singleton bash key must equal pattern_key(\"bash\") = \"bash\"; \
             got {:?}",
            keys,
        );
        assert!(
            !rollup.contains_key("bash[42]"),
            "singleton must NOT carry the tgid suffix under \
             default normalization: got {:?}",
            keys,
        );
    }

    /// Every process key present in the smaps rollup must show
    /// up in the rendered smaps section. With `firefox` and
    /// `bash` on both sides, the text from the `## smaps_rollup`
    /// heading onward has to mention both names — a row loop
    /// that stopped after its first key would drop one.
    #[test]
    fn write_diff_smaps_emits_row_for_each_process_key() {
        let mib = 1024 * 1024;
        let mut diff = CtprofDiff::default();
        diff.smaps_rollup_a.insert(
            "firefox".into(),
            BTreeMap::from([("Rss".to_string(), 100 * mib)]),
        );
        diff.smaps_rollup_b.insert(
            "firefox".into(),
            BTreeMap::from([("Rss".to_string(), 200 * mib)]),
        );
        diff.smaps_rollup_a
            .insert("bash".into(), BTreeMap::from([("Rss".to_string(), 1024)]));
        diff.smaps_rollup_b
            .insert("bash".into(), BTreeMap::from([("Rss".to_string(), 2048)]));

        let mut rendered = String::new();
        write_diff(
            &mut rendered,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        let section_start = rendered
            .find("## smaps_rollup")
            .expect("smaps section must render");
        let smaps_section = &rendered[section_start..];
        assert!(
            smaps_section.contains("firefox"),
            "process key `firefox` must appear in smaps section body:\n{smaps_section}",
        );
        assert!(
            smaps_section.contains("bash"),
            "process key `bash` must appear in smaps section body:\n{smaps_section}",
        );
    }

    /// Member names containing `[` and `]` — regex
    /// metacharacters — must survive `pattern_display_label`.
    /// For `worker[0..=2]` the label has to keep the shared
    /// `worker` prefix and has to compile as a regular
    /// expression; unescaped brackets would either crash label
    /// generation or produce a pattern `Regex::new` rejects.
    #[test]
    fn pattern_display_label_handles_bracket_member_names() {
        let members: Vec<String> = (0..3).map(|i| format!("worker[{i}]")).collect();
        let label = pattern_display_label("worker[{N}]", &members);
        assert!(
            label.contains("worker"),
            "grex must produce a label that contains the shared `worker` prefix; got {label:?}",
        );
        // Compiling the label is the well-formedness check; the
        // compiled regex itself is not needed afterwards.
        if let Err(e) = Regex::new(&label) {
            panic!("grex output {label:?} is not a valid regex: {e}");
        }
    }

    /// Defensive pin for the empty name: two threads whose pcomm
    /// is `""` both key as the empty string and share a single
    /// bucket. Exercises the empty-input path of the normalizer
    /// and the merge on an empty key without panicking.
    #[test]
    fn build_groups_pcomm_empty_pcomm_collapses_under_normalization() {
        let nameless = vec![make_thread("", "t0"), make_thread("", "t1")];
        let snapshot = snap_with(nameless);
        let buckets = build_groups(&snapshot, GroupBy::Pcomm, &[], None, None, false);
        assert_eq!(buckets[""].thread_count, 2);
        assert_eq!(buckets.len(), 1);
    }

    /// Low-level check of [`cgroup_skeleton_tokens`] on a cgroup
    /// path with a bracketed segment. For `/runner-[xyz]/scope`
    /// the token list is exactly `runner`, `xyz`, `scope` (the
    /// brackets, `/` and `-` never appear as tokens), and the
    /// skeleton equals the input since no token is rewritten.
    /// Lower-level sibling of
    /// [`cgroup_normalize_collapses_bracketed_hex_session_ids`].
    #[test]
    fn cgroup_skeleton_tokens_handles_bracketed_segments() {
        let path = "/runner-[xyz]/scope";
        let (skeleton, tokens) = cgroup_skeleton_tokens(path);
        // Only the non-separator runs are reported as tokens.
        let expected_tokens = ["runner", "xyz", "scope"].map(String::from).to_vec();
        assert_eq!(
            tokens, expected_tokens,
            "bracket separators must split tokens cleanly; got {tokens:?}",
        );
        // All three tokens are pure alpha, so the skeleton is the
        // input with every separator kept in place.
        assert_eq!(
            skeleton, "/runner-[xyz]/scope",
            "skeleton must preserve separators including brackets; got {skeleton:?}",
        );
    }

    /// Bracketed pcomms collapse under Pcomm normalization. Three
    /// processes with pcomms `[stress-ng-0]`, `[stress-ng-1]`,
    /// `[stress-ng-2]` all key as `[stress-ng-{N}]` after
    /// `pattern_key` runs (brackets are separators, the digit
    /// suffix normalizes to `{N}`). Pins the integration of
    /// bracket-as-separator with the build_groups Pcomm path.
    #[test]
    fn build_groups_pcomm_bracketed_pcomms_collapse() {
        let snap = snap_with(vec![
            make_thread("[stress-ng-0]", "t0"),
            make_thread("[stress-ng-1]", "t1"),
            make_thread("[stress-ng-2]", "t2"),
        ]);
        let groups = build_groups(&snap, GroupBy::Pcomm, &[], None, None, false);
        // `get` + `map` instead of `groups[key]`: indexing an
        // absent key panics while the assertion operands are
        // being evaluated, so the `got {keys}` diagnostic below
        // would never print in exactly the case it exists for
        // (normalization produced a different key).
        assert_eq!(
            groups.get("[stress-ng-{N}]").map(|g| g.thread_count),
            Some(3),
            "all three bracketed pcomms must collapse into one bucket; got {:?}",
            groups.keys().collect::<Vec<_>>(),
        );
        assert_eq!(groups.len(), 1);
    }

    /// TASK_COMM_LEN truncation under Pcomm: identical truncated
    /// pcomms group together via the literal-pcomm branch (no
    /// normalization fires when the tail tokens are pure alpha).
    /// Mirror of `build_groups_comm_truncated_comms_group_via_exact_match`
    /// for the Pcomm axis. Two processes share the same 15-char
    /// truncated pcomm and merge into one bucket.
    #[test]
    fn build_groups_pcomm_truncated_pcomms_group_via_exact_match() {
        // Both processes share the same truncated 15-char pcomm.
        // `tokio-runtime-w` has tokens `tokio`, `runtime`, `w` —
        // all pure alpha → literal → bucket key matches input.
        let snap = snap_with(vec![
            make_thread("tokio-runtime-w", "t0"),
            make_thread("tokio-runtime-w", "t1"),
        ]);
        let groups = build_groups(&snap, GroupBy::Pcomm, &[], None, None, false);
        // Use `get` rather than indexing so that an unexpected
        // bucket key fails the assertion with the produced keys
        // listed, instead of panicking inside the index
        // expression before the message can be formatted.
        assert_eq!(
            groups.get("tokio-runtime-w").map(|g| g.thread_count),
            Some(2),
            "identical truncated pcomms collapse via literal-pcomm branch; got {:?}",
            groups.keys().collect::<Vec<_>>(),
        );
        assert_eq!(groups.len(), 1);
    }

    /// Smaps keying does not depend on the primary `--group-by`
    /// axis. The snapshot holds three worker leaders, each in a
    /// different cgroup; `collect_smaps_rollup` still keys them
    /// through `pattern_key(&t.pcomm)` and merges them across
    /// PIDs, even for a caller that groups primary metrics by
    /// cgroup. The property pinned here is that the smaps
    /// section reads pcomm straight off each leader thread
    /// rather than using the post-grouping bucket key.
    #[test]
    fn collect_smaps_rollup_independent_of_group_by_axis() {
        // Three pattern-sibling workers, one cgroup each.
        let mut worker_a = smaps_thread("worker-0", 100, 1024, 512);
        let mut worker_b = smaps_thread("worker-1", 200, 2048, 1024);
        let mut worker_c = smaps_thread("worker-2", 300, 4096, 2048);
        worker_a.cgroup = "/cg-a".into();
        worker_b.cgroup = "/cg-b".into();
        worker_c.cgroup = "/cg-c".into();
        let snapshot = snap_with(vec![worker_a, worker_b, worker_c]);

        // Call collect_smaps_rollup directly on the normalize-on
        // path. Its arguments are `(snap, no_thread_normalize)`
        // and nothing else: group_by is absent from the
        // signature, and that absence is what this test relies
        // on. Three cgroup paths go in, one normalized pcomm
        // bucket comes out.
        let rollup = collect_smaps_rollup(&snapshot, false);
        assert_eq!(
            rollup.len(),
            1,
            "smaps keying must collapse pcomm-pattern siblings \
             regardless of cgroup distribution: got {:?}",
            rollup.keys().collect::<Vec<_>>(),
        );

        let merged = rollup.get("worker-{N}").expect("merged worker bucket");
        let expected_rss = (1024 + 2048 + 4096) * 1024;
        assert_eq!(merged.get("Rss").copied(), Some(expected_rss));
    }

    /// Smaps render Pss tiebreaker: when two processes report
    /// equal max-Rss, the secondary sort key is descending
    /// max-Pss, so the larger-Pss process must render ahead of
    /// the smaller-Pss one. To tell the Pss tiebreak apart from
    /// an alphabetical fallback, the larger-Pss process is given
    /// the alphabetically LATER name (`zoomed`) and the
    /// smaller-Pss process the earlier one (`alpha_proc`): pure
    /// alphabetical ordering would render them the other way
    /// round and fail the test.
    #[test]
    fn write_diff_smaps_pss_breaks_tie_when_rss_equal() {
        // `alpha_proc`: the lower-Pss process on both sides.
        let alpha_baseline = BTreeMap::from([
            ("Rss".to_string(), 100 * 1024 * 1024),
            ("Pss".to_string(), 30 * 1024 * 1024),
        ]);
        let alpha_candidate = BTreeMap::from([
            ("Rss".to_string(), 120 * 1024 * 1024),
            ("Pss".to_string(), 35 * 1024 * 1024),
        ]);
        // `zoomed`: Rss matches `alpha_proc` side for side, Pss
        // is higher on both sides.
        let zoomed_baseline = BTreeMap::from([
            ("Rss".to_string(), 100 * 1024 * 1024),
            ("Pss".to_string(), 80 * 1024 * 1024),
        ]);
        let zoomed_candidate = BTreeMap::from([
            ("Rss".to_string(), 120 * 1024 * 1024),
            ("Pss".to_string(), 90 * 1024 * 1024),
        ]);

        // The maps are BTreeMaps, so insertion order is
        // irrelevant. What matters is "alpha_proc" < "zoomed":
        // an alphabetical sort would put the LOWER-Pss process
        // first, so only a working Pss tiebreak renders "zoomed"
        // ahead of it.
        let mut diff = CtprofDiff::default();
        diff.smaps_rollup_a
            .insert("alpha_proc".into(), alpha_baseline);
        diff.smaps_rollup_a.insert("zoomed".into(), zoomed_baseline);
        diff.smaps_rollup_b
            .insert("alpha_proc".into(), alpha_candidate);
        diff.smaps_rollup_b
            .insert("zoomed".into(), zoomed_candidate);

        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();

        // Compare positions only inside the smaps section.
        let section_start = out
            .find("## smaps_rollup")
            .expect("smaps section must render");
        let section = &out[section_start..];
        let alpha_pos = section
            .find("alpha_proc")
            .expect("alpha_proc key must appear");
        let zoomed_pos = section.find("zoomed").expect("zoomed key must appear");
        assert!(
            zoomed_pos < alpha_pos,
            "Pss tiebreaker must place higher-Pss process (zoomed) ahead of \
             lower-Pss process (alpha_proc) when Rss ties; got zoomed@{zoomed_pos} \
             alpha_proc@{alpha_pos}",
        );
    }

    /// Smaps-axis counterpart of
    /// `build_groups_pcomm_empty_pcomm_collapses_under_normalization`.
    /// A leader thread with an empty pcomm (kernel-thread
    /// capture race, unreadable comm fallback) keys at
    /// `pattern_key("") = ""` under default smaps normalization,
    /// so two such leaders fold into a single bucket keyed by
    /// the empty string. Defensive pin: the empty key has to
    /// pass through `or_default()` and the per-field
    /// saturating_add merge without panicking.
    #[test]
    fn collect_smaps_rollup_empty_pcomm_collapses_under_normalization() {
        let leaders = vec![
            smaps_thread("", 100, 1024, 512),
            smaps_thread("", 200, 2048, 1024),
        ];
        let snapshot = snap_with(leaders);
        let rollup = collect_smaps_rollup(&snapshot, false);

        assert_eq!(
            rollup.len(),
            1,
            "two empty-pcomm leaders must merge into one bucket; got {:?}",
            rollup.keys().collect::<Vec<_>>(),
        );

        // Both fields must hold the sum over the two leaders.
        let merged = rollup.get("").expect("empty-key bucket");
        let rss = merged.get("Rss").copied();
        let pss = merged.get("Pss").copied();
        assert_eq!(rss, Some((1024 + 2048) * 1024));
        assert_eq!(pss, Some((512 + 1024) * 1024));
    }

    /// Literal-mode smaps rendering, end to end. Runs
    /// `compare(..., no_thread_normalize: true)` over snapshots
    /// carrying smaps_rollup data, feeds the result to
    /// `write_diff`, and checks that the rendered section uses
    /// the literal `pcomm[tgid]` keys rather than
    /// `pattern_key(pcomm)`. In literal mode a row only joins
    /// when both snapshots reference the same PID, so the same
    /// process instance is placed on both sides. Covers the
    /// whole path from `collect_smaps_rollup` literal-key
    /// construction to the smaps section emitted by
    /// `write_diff`.
    #[test]
    fn write_diff_smaps_literal_mode_renders_pcomm_tgid_keys() {
        // One leader per snapshot: identical pcomm, tid, tgid and
        // Rss on both sides, with only the Pss value varying.
        let worker_leader = |pss_kb| {
            let mut leader = make_thread("worker", "worker");
            leader.tid = 4242;
            leader.tgid = 4242;
            leader.smaps_rollup_kb.insert("Rss".into(), 4096);
            leader.smaps_rollup_kb.insert("Pss".into(), pss_kb);
            leader
        };
        let snap_a = snap_with(vec![worker_leader(1024)]);
        let snap_b = snap_with(vec![worker_leader(2048)]);

        let opts = CompareOptions {
            group_by: GroupBy::Pcomm.into(),
            cgroup_flatten: Vec::new(),
            no_thread_normalize: true,
            no_cg_normalize: false,
            sort_by: Vec::new(),
        };
        let diff = compare(&snap_a, &snap_b, &opts);

        // The diff struct itself has to be keyed by the literal
        // `worker[4242]` form on both sides.
        let literal_key = "worker[4242]";
        assert!(
            diff.smaps_rollup_a.contains_key(literal_key),
            "literal-mode baseline key must be `worker[4242]`; got {:?}",
            diff.smaps_rollup_a.keys().collect::<Vec<_>>(),
        );
        assert!(
            diff.smaps_rollup_b.contains_key(literal_key),
            "literal-mode candidate key must be `worker[4242]`; got {:?}",
            diff.smaps_rollup_b.keys().collect::<Vec<_>>(),
        );
        // The normalized `worker` key must be absent in literal
        // mode.
        assert!(
            !diff.smaps_rollup_a.contains_key("worker"),
            "literal-mode must NOT carry the normalized `worker` key",
        );

        // The rendered smaps section has to show the same
        // literal key.
        let mut out = String::new();
        write_diff(
            &mut out,
            &diff,
            Path::new("a"),
            Path::new("b"),
            GroupBy::Pcomm,
            &DisplayOptions::default(),
        )
        .unwrap();
        let section_start = out
            .find("## smaps_rollup")
            .expect("smaps section must render");
        let after = &out[section_start..];
        assert!(
            after.contains(literal_key),
            "literal-mode rendered table must show `worker[4242]` key:\n{after}",
        );
    }
}