tokitai-operator 0.1.0

//! End-to-end training runner (gated on `rocm-hip`).
//!
//! Phase 2.11 of the 0.7B MoE training project. Wires together
//! the existing building blocks:
//!
//! - `synth_data` for synthetic quality-decision samples
//! - `dataset_bridge` for real SQLite ledger ingestion
//! - `moe_model` for the MoE forward + backward passes
//! - `model::Parameter::adamw_step` (pure-CPU AdamW) for the
//!   optimizer step; the CPU path is the default so the
//!   runner works in CI / dev environments without a working
//!   HIP stack
//! - `checkpoint` (TKP1 binary) and `model_arch` (JSON +
//!   SHA-256) for the run output
//! - `metrics` (JSONL) for per-step loss / grad-norm / lr

use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::time::Instant;

use rand::SeedableRng;
use rand::seq::SliceRandom;
use serde::Serialize;

use crate::checkpoint::{Checkpoint, ParameterSnapshot, save_checkpoint, snapshot_parameter};
use crate::dataset_bridge::{LocalDataset, LocalSample};
use crate::domain::DomainId;
use crate::metrics::grad_norm;
use crate::model::layer::RouterCache;
use crate::model::{Layer, Model, Parameter};
use crate::model_arch::{ModelArch, ModelKind, RouterKind, arch_fingerprint};
use crate::moe_model::diagnose_ref::{
    build_from_model as build_fp32_ref, forward_backward as fp32_forward_backward, grad_norm_of,
    param_names as fp32_param_names, simulated_fp16_forward_backward as fp16_sim_forward_backward,
};
use crate::moe_model::{MoEModel, MoESize, N_EXPERTS};
use crate::object::{Dim, Shape, Tensor};
use crate::synth_data::regression::RegressionSample;
use crate::synth_data::{QualitySample, make_quality_decision_dataset, make_regression_dataset};
use crate::{Error, Result};

/// One training sample that the runner can consume after
/// `build_dataset` normalises the upstream sample types (synth or
/// ledger) into a single shape.
#[derive(Debug, Clone)]
struct TrainingSample {
    /// Flat feature vector. `stack_batch` concatenates these row by
    /// row into the `[batch, in_dim]` input tensor, so every sample
    /// in a batch must have the same length.
    input: Vec<f32>,
    /// Flat target vector. `stack_batch` concatenates these row by
    /// row into the `[batch, out_dim]` target tensor, so every
    /// sample in a batch must have the same length.
    target: Vec<f32>,
}

impl From<QualitySample> for TrainingSample {
    fn from(s: QualitySample) -> Self {
        Self {
            input: s.input,
            target: s.target,
        }
    }
}

impl From<RegressionSample> for TrainingSample {
    fn from(s: RegressionSample) -> Self {
        Self {
            input: s.0,
            target: s.1,
        }
    }
}

impl From<LocalSample> for TrainingSample {
    fn from(s: LocalSample) -> Self {
        Self {
            input: s.features,
            target: s.labels,
        }
    }
}

/// Which optimizer the runner should apply on each step. The
/// default is [`Optimizer::Adamw`] for backward compatibility with
/// the pre-SGD-momentum behaviour, but the 0.7B Tiny run on the
/// fp16 HIP path is unstable under AdamW (loss diverges after a
/// few steps because bias correction amplifies the second-moment
/// noise), so callers can opt into [`Optimizer::Sgd`] via
/// `--optimizer sgd` — the same shape the 10K MLP gate test
/// already uses.
///
/// `Default` is derived (with `#[default]` on `Adamw`) rather than
/// hand-written, so the default variant is visible right at the
/// enum definition.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize)]
pub enum Optimizer {
    /// AdamW. This is the default variant.
    #[default]
    Adamw,
    /// SGD with momentum; the coefficient comes from
    /// `TrainConfig::momentum`.
    Sgd,
}

/// Static, fully-resolved training configuration. The binary builds
/// this from CLI args; the tests build it directly.
///
/// The 3-way ablation study (MoE+sheaf+padic vs MoE+softmax-only
/// vs Dense) is selected by the `model_kind` + `router_kind`
/// pair on this struct. The factory methods
/// ([`TrainConfig::tiny_moe_sheaf_padic`], [`TrainConfig::tiny_softmax_moe`],
/// [`TrainConfig::tiny_dense`]) pin the canonical combinations
/// so the ablation CLI / tests don't have to assemble the
/// fields by hand. The historical
/// `tiny_default` is preserved as an alias for
/// `tiny_moe_sheaf_padic` — see that method's docstring for the
/// exact same LR / batch / weight-decay / warmup values that
/// the pre-ablation runner used.
#[derive(Debug, Clone)]
pub struct TrainConfig {
    /// Model family selector, intended to choose between a
    /// `MoEModel` and a `DenseModel`. Defaults to `ModelKind::MoE`
    /// for backward compatibility with the pre-ablation config
    /// struct.
    ///
    /// NOTE(review): the `run_training` body in this file always
    /// constructs a `MoEModel` and never reads this field — confirm
    /// where (or whether) the Dense dispatch actually happens.
    pub model_kind: ModelKind,
    /// Router kernel selector. Only meaningful when
    /// `model_kind == ModelKind::MoE`; the `Dense` branch
    /// records but ignores this field. Defaults to
    /// `RouterKind::SheafPadic` for backward compatibility.
    pub router_kind: RouterKind,
    /// `MoESize` variant. Used to size the `MoEModel` when
    /// `model_kind == ModelKind::MoE`. For the Dense branch
    /// this field is recorded for reproducibility but the
    /// actual model dims come from the canonical
    /// `tiny_dense` factory values.
    pub size: MoESize,
    /// Number of training steps to run. `run_training` rejects 0.
    pub steps: u32,
    /// Number of samples drawn per step. `run_training` rejects 0.
    pub batch_size: usize,
    /// Base learning rate; the per-step value is derived from it
    /// by [`Self::lr_at`] (warmup + cosine decay).
    pub lr: f32,
    /// Weight-decay coefficient. Consumed by the optimizer step,
    /// which is outside `run_training` itself — presumably the
    /// AdamW decoupled decay; confirm against `train_step_cpu`.
    pub weight_decay: f32,
    /// Gradient-clipping threshold. The factory comments describe
    /// it as a global L2 clip; the clipping itself happens in the
    /// training step, not in `run_training`.
    pub grad_clip: f32,
    /// Seed used for model construction, dataset generation, and
    /// the shuffle RNG in `run_training`.
    pub seed: u32,
    /// Output directory. `run_training` creates it if needed and
    /// writes `checkpoint.tkp1` inside it.
    pub checkpoint_dir: PathBuf,
    /// Path of the per-step metrics file (one JSON object per
    /// line). Created/truncated at the start of a run.
    pub metrics_path: PathBuf,
    /// Path the architecture description (with fingerprint and
    /// parameter count) is written to.
    pub arch_path: PathBuf,
    /// When `true`, suppress the per-step progress lines that
    /// `run_training` otherwise prints to stderr.
    pub quiet: bool,
    /// Dataset selector understood by `build_dataset`:
    /// `synth:quality:N`, `synth:regression:N`, the literal
    /// `empty`, or a directory containing `quality_decisions.db`
    /// and `quality_outcomes.db`.
    pub dataset_spec: String,
    /// Requests the HIP path instead of the CPU path.
    ///
    /// NOTE(review): `run_training` in this file never reads this
    /// field and always calls `train_step_cpu` — confirm where it
    /// is consumed.
    pub use_hip: bool,
    /// When `true`, skip the HIP forward/backward/optimizer path
    /// and write synthetic metrics rows instead. The orchestrator
    /// (file IO, arch writing, checkpoint writing, summary
    /// computation) still runs, so callers can verify the wiring
    /// end-to-end without paying the cost — or risking the
    /// unreliability — of the HIP kernel pipeline. The
    /// `MoEModel::new` constructor is also skipped: in dry-run
    /// mode, `total_params` and the arch are derived from the
    /// `MoESize` directly, so this mode is also safe for the
    /// `Full` size (which would otherwise allocate ~10GB of AdamW
    /// state). Dry-run checkpoints have an empty `params` vector
    /// — the TKP1 header is still written so consumers can detect
    /// "dry run" vs "no run" by looking at the param count.
    pub dry_run: bool,
    /// Optimizer applied on each step; see [`Optimizer`].
    pub optimizer: Optimizer,
    /// Momentum coefficient for [`Optimizer::Sgd`]. Ignored under
    /// [`Optimizer::Adamw`]. The 10K MLP test uses 0.9; the runner
    /// defaults to 0.9 to match.
    pub momentum: f32,
    /// When `true`, skip the training loop entirely and run a
    /// single fp16-vs-fp32 forward+backward comparison. Used to
    /// diagnose residual grad_norm growth suspected to come from
    /// fp16 accumulation noise in the HIP kernels. The diagnose
    /// path prints a per-parameter grad_norm table to stderr and
    /// exits; no metrics.jsonl or checkpoint is written.
    ///
    /// NOTE(review): `run_training` in this file does not check
    /// this flag; the diagnose entry point must live elsewhere.
    pub diagnose: bool,
    /// Per-module learning rate scale for the router parameters
    /// (router weight + router bias, the first 2 entries in
    /// `all_parameters_mut` order). The router's per-element gradient
    /// is ~7.7x larger than each expert's per-element gradient
    /// (see `tests/moe_router_diagnose.rs`). AdamW normalises
    /// per-parameter, so without this scale the router's update
    /// magnitude is 7.7x too large per step and the softmax
    /// saturates to a single top-K pair after 1-2 steps (the
    /// classic "router collapse" pattern in MoE training). The
    /// fix is applied at the optimizer step (multiplying the
    /// effective LR), NOT in the gradient buffer, so the global
    /// grad-norm clipping and the gradient values returned to
    /// callers are unchanged. Default 0.1 (1/10 of base LR);
    /// MoE literature uses 0.01-0.3 depending on the model.
    pub router_lr_scale: f32,
    /// Number of warmup steps at the start of training during
    /// which the learning rate is linearly ramped from 0 to the
    /// base [`Self::lr`]. Warmup is the canonical fix for the
    /// "first step blows up the model" failure mode in MoE and
    /// transformer training — without it, the first ~5 updates
    /// see the un-scaled random-projection gradients, which can
    /// be 10-100x the steady-state magnitude and silently push
    /// the router to a degenerate point from which SGD can't
    /// recover. The 0.7B MoE runner defaults to 5 (~10% of
    /// `tiny`'s default 50 steps), which is conservative for
    /// the current noise profile; bump to 50-100 for the
    /// medium/full sizes. 0 disables warmup (the schedule starts
    /// directly in the cosine-decay phase).
    /// See GPT-3 §2.2 (Brown et al., 2020) and Switch
    /// Transformer §4 (Fedus et al., 2022) for the original
    /// references.
    pub warmup_steps: u32,
    /// Floor for the cosine decay, as a fraction of the base
    /// [`Self::lr`]. The cosine schedule decays from
    /// `warmup_steps` onward and reaches
    /// `base_lr * min_lr_ratio` exactly at `step == steps`
    /// (holding there afterwards); the last step the runner
    /// actually executes, `steps - 1`, is therefore slightly
    /// above the floor. 0.1 matches the convention in
    /// "Attention Is All You Need" §5.3 (Vaswani et al., 2017)
    /// and modern transformer recipes. 0.0 lets the LR decay
    /// all the way to zero (sometimes useful for fine-tuning
    /// the last 1-2% of loss).
    pub min_lr_ratio: f32,
}

impl TrainConfig {
    /// Learning rate at a given step, applying the warmup +
    /// cosine decay schedule.
    ///
    /// Regimes, checked in this order:
    /// 1. **Warmup** (`step < warmup_steps`): linear ramp
    ///    `lr * step / warmup_steps`. Step 0 therefore gets an LR
    ///    of exactly 0 and step 1 gets `lr / warmup_steps`. With
    ///    `warmup_steps == 0` this regime is never entered.
    /// 2. **No decay window** (`steps <= warmup_steps`): once
    ///    warmup is over the LR is held at the base `lr`.
    /// 3. **Cosine decay** (`warmup_steps <= step < steps`):
    ///    smooth decay from `lr` towards `lr * min_lr_ratio`. The
    ///    half-cosine factor is `0.5 * (1 + cos(π * t))`, where
    ///    `t = (step - warmup_steps) / (steps - warmup_steps)`.
    /// 4. **Hold** (`step >= steps`): `t` is clamped to 1, so the
    ///    result is `lr * min_lr_ratio`. The runner stops at
    ///    `steps - 1`, so this is not hit in normal operation.
    ///
    /// # Parameters
    /// - `step`: zero-based step index.
    ///
    /// # Returns
    /// The learning rate to use for `step`.
    pub fn lr_at(&self, step: u32) -> f32 {
        let base = self.lr;
        if step < self.warmup_steps {
            // `step < warmup_steps` with unsigned values implies
            // `warmup_steps >= 1`, so the division is well-defined
            // and no separate zero check is needed.
            return base * (step as f32 / self.warmup_steps as f32);
        }
        // No room for a decay window: hold at the base LR.
        if self.steps <= self.warmup_steps {
            return base;
        }
        let min_lr = base * self.min_lr_ratio;
        let decay_span = (self.steps - self.warmup_steps) as f32;
        // Fractional progress through the decay window; clamping
        // to 1 makes every step past `steps` hold at `min_lr`.
        let progress = ((step - self.warmup_steps) as f32 / decay_span).clamp(0.0, 1.0);
        let cosine = 0.5 * (1.0 + (std::f32::consts::PI * progress).cos());
        // Interpolate between `min_lr` (cosine = 0) and `base`
        // (cosine = 1).
        min_lr + (base - min_lr) * cosine
    }
}

impl TrainConfig {
    /// Reasonable defaults for a `Tiny` smoke run with the
    /// historical MoE + sheaf+padic router. This is an alias
    /// for [`TrainConfig::tiny_moe_sheaf_padic`]; the explicit
    /// name is preferred for new code so the ablation study
    /// reads cleanly. The historical field values (LR=0.01,
    /// batch_size=128, grad_clip=0.5, warmup=5,
    /// min_lr_ratio=0.1, etc.) are unchanged from the
    /// pre-ablation codebase — see
    /// [`TrainConfig::tiny_moe_sheaf_padic`] for the
    /// per-field rationale.
    pub fn tiny_default() -> Self {
        Self::tiny_moe_sheaf_padic()
    }

    /// Canonical ablation arm #1: MoE + sheaf-overlap-check +
    /// p-adic encode/decode. This is the historical
    /// `tiny_default` (the MoE family with the sheaf+padic
    /// router that has been the production variant since the
    /// patent disclosure). Field values are unchanged from the
    /// pre-ablation `tiny_default` so the historical smoke
    /// tests and HIP gates still pass after this refactor.
    pub fn tiny_moe_sheaf_padic() -> Self {
        let timestamp = unix_timestamp();
        let checkpoint_dir = PathBuf::from(format!("./var/training/{timestamp}"));
        Self {
            model_kind: ModelKind::MoE,
            router_kind: RouterKind::SheafPadic,
            size: MoESize::Tiny,
            steps: 50,
            // 128 is a multiple of 16 (Linear fp16 GEMM constraint)
            // and is 4x the historical 32. The per-step CPU
            // orchestration cost (fp16 marshaling, subprocess
            // I/O, fp32<->fp16 conversion) is amortized over 4x
            // more samples, so per-step wall time drops by ~3-4x
            // even though total compute scales linearly. The
            // 0.7B MoE training in `tests/hip_5step_nano.rs` and
            // the patent's gradient-norm stability story are
            // both at 32 today, so any future change to that
            // gate must come with a re-bench.
            batch_size: 128,
            // 0.01 is the base LR for the warmup+cosine schedule
            // (min_lr_ratio=0.1 -> 0.001 at the end of 50 steps).
            // Lower than the historical 0.05 (SGD) and 0.001
            // (AdamW) because the fp16 routing-divergence
            // investigation (see `project_diagnose_findings.md`)
            // showed that even SGD+momentum at 0.05 lets the
            // global grad_norm drift to 40+ on Tiny. The new
            // tighter schedule + 0.5 grad clip is the principled
            // response; a future run can re-tune upward once
            // fp16 routing noise is fixed in the kernels.
            lr: 0.01,
            weight_decay: 0.01,
            // 0.5 (down from 1.0). For MoE with fp16 routing
            // noise, a tighter global L2 clip is the cheapest
            // stability win; Switch Transformer uses 1.0 but
            // has auxiliary router z-loss balancing the
            // scale, which we don't (yet). Re-tune upward if
            // the model under-trains.
            grad_clip: 0.5,
            seed: 42,
            metrics_path: checkpoint_dir.join("metrics.jsonl"),
            arch_path: checkpoint_dir.join("arch.json"),
            checkpoint_dir,
            quiet: false,
            dataset_spec: "synth:quality:512".to_string(),
            use_hip: false,
            dry_run: false,
            optimizer: Optimizer::Adamw,
            momentum: 0.9,
            router_lr_scale: 0.1,
            diagnose: false,
            // 5 steps linear warmup (~10% of 50) is the
            // "first step blows up the router" fix.
            warmup_steps: 5,
            // Cosine to 10% of base LR (matches Vaswani
            // et al. 2017 §5.3 / standard transformer
            // recipes).
            min_lr_ratio: 0.1,
        }
    }

    /// Canonical ablation arm #2: MoE with the **softmax-only**
    /// router. Same MoE topology (1 router + N_EXPERTS experts)
    /// as `tiny_moe_sheaf_padic`, but the router is the plain
    /// post-mask softmax with no sheaf overlap-check and no
    /// p-adic encode/decode. This is the "no structural
    /// prior" baseline against which the sheaf+padic
    /// contribution is measured.
    ///
    /// The training-loop schedule (LR, batch, weight decay,
    /// warmup, cosine) is kept identical to
    /// `tiny_moe_sheaf_padic` so the ablation isolates the
    /// router kernel as the only variable. The `size` field is
    /// set to `MoESize::Tiny` because the MoE topology is
    /// unchanged; only the router's forward logic differs.
    pub fn tiny_softmax_moe() -> Self {
        let mut cfg = Self::tiny_moe_sheaf_padic();
        // The ONLY difference from the sheaf+padic arm is
        // the router kernel. Everything else — the MoE
        // topology, the experts, the schedule — is held
        // constant so the ablation isolates the router.
        cfg.router_kind = RouterKind::SoftmaxOnly;
        // For the softmax-only arm, the router_lr_scale
        // rationale still applies (the per-element gradient
        // imbalance between router and experts is a property
        // of the MoE, not the router kernel), so we leave it
        // at the historical 0.1.
        cfg
    }

    /// Canonical ablation arm #3: a Dense MLP stack with no
    /// MoE dispatch. This is the "matched parameter budget"
    /// baseline — same approximate total param count as the
    /// MoE arms (~700M), no router, no per-expert gating, no
    /// sheaf/p-adic. If the MoE arms don't beat this baseline
    /// on the ablation metric, the sheaf+padic router kernel
    /// is not pulling its weight and the patent claim
    /// weakens.
    ///
    /// The Dense architecture is a stack of FFN sub-blocks
    /// (`Linear(hidden, intermediate) + LN + GELU +
    /// Linear(intermediate, hidden) + LN + GELU`) with
    /// `n_blocks=84`, `hidden=1024`, `intermediate=4096`,
    /// which lands the scalar param count at ~706M (in the
    /// [650M, 750M] target range; see the constraint in the
    /// ablation brief).
    ///
    /// The training-loop schedule mirrors
    /// `tiny_moe_sheaf_padic` as closely as possible — the
    /// ablation isolates the model family as the only
    /// variable. The two fields that do NOT carry over are
    /// `router_lr_scale` (Dense has no router, so the field
    /// is set to 1.0 as a "no-op" sentinel) and `size` (set
    /// to `MoESize::Tiny` as a placeholder; the actual dense
    /// dims are pinned by the factory, not by `size`).
    pub fn tiny_dense() -> Self {
        let mut cfg = Self::tiny_moe_sheaf_padic();
        // Switch the model family to Dense. The `size` field
        // is now a placeholder (the Dense dims come from
        // the `tiny_dense` factory, not from `MoESize`).
        cfg.model_kind = ModelKind::Dense;
        // Dense has no router, so router_lr_scale is a
        // no-op. Set it to 1.0 so any code that still
        // applies the scale gets the "historical default"
        // behaviour (no scaling) rather than 0.0 (which
        // would zero out the first 2 params — harmless for
        // Dense, since those params don't exist, but the
        // 1.0 sentinel is cleaner).
        cfg.router_lr_scale = 1.0;
        // The router_kind is recorded for reproducibility
        // even though Dense doesn't use it. The default
        // SheafPadic is the historical fallback; the Dense
        // build path ignores it.
        cfg
    }
}

/// One row of the per-step metrics file. `run_training` serialises
/// each value as a single JSON line.
#[derive(Debug, Clone, Serialize)]
pub struct StepMetrics {
    /// Zero-based step index.
    pub step: u32,
    /// Loss reported for this step.
    pub loss: f32,
    /// `run_training` writes the same value here as in `loss` on
    /// every row.
    pub final_loss: f32,
    /// Gradient norm reported for this step.
    pub grad_norm: f32,
    /// Learning rate for this step, as given by
    /// `TrainConfig::lr_at`.
    pub lr: f32,
    /// Wall-clock milliseconds since the start of the run (not the
    /// duration of this step alone).
    pub elapsed_ms: f64,
    /// Name of the `MoESize` variant the run was configured with.
    pub model_size: String,
    /// Mean Shannon entropy of the router's per-sample probability
    /// distribution (post top-K mask). High entropy (e.g. ln(2) for
    /// the current TOP_K=2 design) means the router is splitting
    /// mass roughly evenly between its top-K picks; entropy near 0
    /// means the top-1 is dominating the mass inside the top-K, and
    /// entropy collapsed to 0 across the whole batch means the
    /// router has degenerated to a single expert pair (the classic
    /// "router collapse" failure mode this metric exists to detect).
    /// Dry-run rows carry the placeholder `ln(N_EXPERTS)`.
    pub router_entropy_mean: f32,
    /// Fraction of the batch whose top-1 (argmax) expert was each
    /// of the [`N_EXPERTS`] experts. Sum across the slice is 1.0
    /// (within f32 rounding). Each entry is in [0, 1]; a healthy
    /// router has all entries close to 1/N_EXPERTS (= 0.25), and a
    /// collapsed router has one entry at 1.0 and the rest at 0.0.
    /// Dry-run rows carry a uniform `1 / N_EXPERTS` in every slot.
    pub per_expert_share: [f32; N_EXPERTS],
}

/// End-of-run summary returned by `run_training`.
#[derive(Debug, Clone)]
pub struct TrainSummary {
    /// Name of the `MoESize` variant the run was configured with.
    pub model_size: String,
    /// Scalar parameter count: taken from the live model in a real
    /// run, or from `MoESize::param_count` in dry-run mode.
    pub total_params: usize,
    /// Loss of the last executed step.
    pub final_loss: f32,
    /// Number of steps run (equal to the configured `steps`).
    pub steps_run: u32,
    /// Wall-clock seconds spent in the step loop plus the
    /// checkpoint write.
    pub time_elapsed_sec: f64,
    /// `steps_run / time_elapsed_sec`, or 0.0 when the elapsed time
    /// measured as zero.
    pub throughput_steps_per_sec: f64,
    /// Location of the written `checkpoint.tkp1` file.
    pub checkpoint_path: PathBuf,
    /// Location of the written architecture file.
    pub arch_path: PathBuf,
    /// Fingerprint of the architecture, as returned by
    /// `arch_fingerprint`.
    pub arch_fingerprint: String,
    /// Router entropy on the last training step. Carried into the
    /// summary so the CLI can print it on the final `SUMMARY:` line
    /// (it is also written per-step into the JSONL, but the final
    /// step's value is the one the caller usually wants to
    /// eyeball). `f32::NAN` in dry-run mode, where no real router
    /// exists.
    pub last_router_entropy_mean: f32,
}

/// Step count used when the caller does not specify one: 20 for
/// `Nano`, 50 for `Tiny`, and 100 for every other size
/// (Medium/Full). `Nano` is the hyperparameter-iteration size — 20
/// steps are enough to confirm the topology (router, gating,
/// gradients) is wired correctly, and finish in a couple of seconds
/// even on a CPU.
pub fn default_steps(size: MoESize) -> u32 {
    if matches!(size, MoESize::Nano) {
        20
    } else if matches!(size, MoESize::Tiny) {
        50
    } else {
        100
    }
}

/// Default value for the `use_hip` flag. The runner's default
/// config stays on the CPU path (more reliable in CI), so this is
/// always `false`; the binary can override it via env or a future
/// CLI flag.
pub fn default_use_hip() -> bool {
    const USE_HIP_BY_DEFAULT: bool = false;
    USE_HIP_BY_DEFAULT
}

/// End-to-end training run. Returns a [`TrainSummary`] on success
/// or an `Err` on the first IO / forward / backward / save failure.
///
/// When `cfg.dry_run` is `true`, the HIP forward / backward /
/// optimizer path is skipped entirely: no `MoEModel` is constructed
/// (so this is also safe for the `Full` size), no HIP kernels are
/// invoked, and per-step metrics are written as a deterministic
/// synthetic loss curve. The orchestrator (arch.json + fingerprint
/// + param_count, metrics.jsonl, TKP1 checkpoint, summary
/// computation) still runs in full, so callers can verify the
/// end-to-end wiring without paying the cost — or risking the
/// unreliability — of the HIP kernel pipeline.

/// end-to-end wiring without paying the cost — or risking the
/// unreliability — of the HIP kernel pipeline.
pub fn run_training(cfg: &TrainConfig) -> Result<TrainSummary> {
    if cfg.steps == 0 {
        return Err(Error::backend("run_training: steps must be >= 1"));
    }
    if cfg.batch_size == 0 {
        return Err(Error::backend("run_training: batch_size must be >= 1"));
    }
    fs::create_dir_all(&cfg.checkpoint_dir).map_err(|e| {
        Error::backend(format!(
            "create_dir_all {}: {e}",
            cfg.checkpoint_dir.display()
        ))
    })?;

    // Resolve total_params + arch + (optionally) the live model.
    // In dry-run mode we use the cheap `from_size` constructor (no
    // allocations), which is what makes dry-run mode safe for
    // `MoESize::Full` (~10GB of AdamW state if we actually built
    // the model). In real-training mode, we build the model once
    // here and reuse it for the training loop below so we don't
    // double the per-expert allocation.
    let (total_params, arch, model_opt) = if cfg.dry_run {
        let total = cfg.size.param_count();
        let arch = ModelArch::from_size(cfg.size, cfg.seed as u64);
        (total, arch, None)
    } else {
        let model = MoEModel::new(cfg.size, cfg.seed as u64);
        let total = model.scalar_param_count();
        let arch = ModelArch::from_moe_model(&model);
        (total, arch, Some(model))
    };

    let mut dataset = build_dataset(&cfg.dataset_spec, cfg.seed)?;
    if dataset.is_empty() {
        return Err(Error::backend("run_training: dataset is empty"));
    }

    let fingerprint = arch_fingerprint(&arch);
    let param_count_value = total_params as u64;
    write_arch_with_fingerprint(&cfg.arch_path, &arch, &fingerprint, param_count_value)?;

    let mut metrics_file = fs::File::create(&cfg.metrics_path).map_err(|e| {
        Error::backend(format!(
            "create metrics file {}: {e}",
            cfg.metrics_path.display()
        ))
    })?;

    let mut rng = rand::rngs::StdRng::seed_from_u64(cfg.seed as u64);
    let model_size_label = cfg.size.name().to_string();
    let started = Instant::now();
    let mut last_loss = 0.0f32;
    let mut last_router_entropy = f32::NAN;
    let checkpoint_path = cfg.checkpoint_dir.join("checkpoint.tkp1");

    if cfg.dry_run {
        // Synthetic step loop: deterministic loss curve, no model,
        // no HIP. The shape of the curve is deliberately simple
        // (loss = 1.0 / (1 + step), grad_norm = 1 / (1 + step)) so
        // the JSONL rows are easy to eyeball in a test failure.
        // Router stats are written as a perfectly balanced fake
        // (entropy = ln(N_EXPERTS) / N_EXPERTS? no — for the
        // current top-K=2 design the post-mask entropy maxes at
        // ln(2) and the share per expert averages to 1/N_EXPERTS).
        // We use 1/N_EXPERTS for the per-expert share and
        // ln(N_EXPERTS) as the synthetic entropy so the JSONL
        // row shape stays uniform between dry-run and real runs.
        let dry_share = [1.0 / N_EXPERTS as f32; N_EXPERTS];
        let dry_entropy = (N_EXPERTS as f32).ln();
        for step in 0..cfg.steps {
            let sp = step as f32;
            let loss = 1.0f32 / (1.0f32 + sp);
            let grad_n = 1.0f32 / (1.0f32 + sp);
            let elapsed_ms = started.elapsed().as_secs_f64() * 1000.0;
            last_loss = loss;

            let row = StepMetrics {
                step,
                loss,
                final_loss: loss,
                grad_norm: grad_n,
                lr: cfg.lr_at(step),
                elapsed_ms,
                model_size: model_size_label.clone(),
                router_entropy_mean: dry_entropy,
                per_expert_share: dry_share,
            };
            let line = serde_json::to_string(&row)
                .map_err(|e| Error::backend(format!("serialize metrics row: {e}")))?;
            writeln!(metrics_file, "{line}")
                .map_err(|e| Error::backend(format!("write metrics row: {e}")))?;
            metrics_file
                .flush()
                .map_err(|e| Error::backend(format!("flush metrics row: {e}")))?;

            if !cfg.quiet {
                eprintln!(
                    "[train_quality_moe] step {:4} loss={:.6} grad_norm={:.4} router_entropy={:.4} elapsed_ms={:.1}",
                    step, loss, grad_n, dry_entropy, elapsed_ms
                );
            }
        }

        // Dry-run checkpoint: TKP1 header + zero params. Consumers
        // can detect a dry run by looking at the count field
        // (header bytes [12..16] LE u32 == 0).
        let optimizer_name = match cfg.optimizer {
            Optimizer::Adamw => "adamw",
            Optimizer::Sgd => "sgd",
        };
        let ckpt = Checkpoint {
            step: cfg.steps,
            params: Vec::new(),
            config: format!(
                "{{\"size\":\"{}\",\"seed\":{},\"steps\":{},\"dry_run\":true,\"optimizer\":\"{}\",\"momentum\":{}}}",
                cfg.size.name(),
                cfg.seed,
                cfg.steps,
                optimizer_name,
                cfg.momentum,
            ),
        };
        save_checkpoint(&checkpoint_path, &ckpt)?;
    } else {
        // Real-training path: reuse the model we built for the
        // arch above (no double-allocation).
        let mut model = model_opt
            .expect("run_training: model_opt must be Some when dry_run is false (checked above)");

        for step in 0..cfg.steps {
            if step as usize % dataset.len().max(1) == 0 && step > 0 {
                dataset.shuffle(&mut rng);
            }
            let batch = dataset.batch(cfg.batch_size, step);
            if batch.is_empty() {
                return Err(Error::backend(format!(
                    "run_training: empty batch at step {step}"
                )));
            }
            let (xb, yb) = stack_batch(&batch);

            let step_started = Instant::now();
            let (loss, grad_n, router_entropy, per_expert_share) =
                train_step_cpu(&mut model, &xb, &yb, cfg, cfg.lr_at(step))?;
            let step_ms = step_started.elapsed().as_secs_f64() * 1000.0;
            let elapsed_ms = started.elapsed().as_secs_f64() * 1000.0;
            last_loss = loss;
            last_router_entropy = router_entropy;

            let row = StepMetrics {
                step,
                loss,
                final_loss: loss,
                grad_norm: grad_n,
                lr: cfg.lr_at(step),
                elapsed_ms,
                model_size: model_size_label.clone(),
                router_entropy_mean: router_entropy,
                per_expert_share,
            };
            let line = serde_json::to_string(&row)
                .map_err(|e| Error::backend(format!("serialize metrics row: {e}")))?;
            writeln!(metrics_file, "{line}")
                .map_err(|e| Error::backend(format!("write metrics row: {e}")))?;
            metrics_file
                .flush()
                .map_err(|e| Error::backend(format!("flush metrics row: {e}")))?;

            if !cfg.quiet {
                eprintln!(
                    "[train_quality_moe] step {:4} loss={:.6} grad_norm={:.4} router_entropy={:.4} step_ms={:.2} elapsed_ms={:.1}",
                    step, loss, grad_n, router_entropy, step_ms, elapsed_ms
                );
            }
        }

        let ckpt = build_checkpoint(&model, cfg.steps, cfg.seed as u64);
        save_checkpoint(&checkpoint_path, &ckpt)?;
    }

    let total_elapsed = started.elapsed().as_secs_f64();
    let throughput = if total_elapsed > 0.0 {
        cfg.steps as f64 / total_elapsed
    } else {
        0.0
    };

    Ok(TrainSummary {
        model_size: model_size_label,
        total_params,
        final_loss: last_loss,
        steps_run: cfg.steps,
        time_elapsed_sec: total_elapsed,
        throughput_steps_per_sec: throughput,
        checkpoint_path: cfg.checkpoint_dir.join("checkpoint.tkp1"),
        arch_path: cfg.arch_path.clone(),
        arch_fingerprint: fingerprint,
        last_router_entropy_mean: last_router_entropy,
    })
}

/// Resolves a dataset spec string into an in-memory [`Dataset`].
///
/// Accepted forms:
/// - `synth:quality:N` — `N` synthetic quality-decision samples;
/// - `synth:regression:N` — `N` synthetic regression samples with
///   the quality input/output dimensions;
/// - `empty` — a dataset with no samples;
/// - anything else — treated as a directory that must contain
///   `quality_decisions.db` and `quality_outcomes.db`.
///
/// `seed` feeds the synthetic generators. Returns a backend error
/// for an unknown `synth:` kind, an unparsable `N`, or a SQLite
/// open failure.
fn build_dataset(spec: &str, seed: u32) -> Result<Dataset> {
    let samples: Vec<TrainingSample> = if let Some(synth_kind) = spec.strip_prefix("synth:") {
        if let Some(count_text) = synth_kind.strip_prefix("quality:") {
            let count = count_text
                .parse::<usize>()
                .map_err(|e| Error::backend(format!("parse synth:quality:N N: {e}")))?;
            let generated = make_quality_decision_dataset(count, seed as u64);
            generated.into_iter().map(TrainingSample::from).collect()
        } else if let Some(count_text) = synth_kind.strip_prefix("regression:") {
            let count = count_text
                .parse::<usize>()
                .map_err(|e| Error::backend(format!("parse synth:regression:N N: {e}")))?;
            let generated = make_regression_dataset(
                count,
                crate::synth_data::QUALITY_INPUT_DIM,
                crate::synth_data::QUALITY_OUTPUT_DIM,
                seed as u64,
            );
            generated.into_iter().map(TrainingSample::from).collect()
        } else {
            return Err(Error::backend(format!(
                "unknown synth dataset spec: {spec} (expected synth:quality:N or synth:regression:N)"
            )));
        }
    } else if spec == "empty" {
        Vec::new()
    } else {
        // Any other spec is a directory holding the two ledger
        // databases.
        let root = Path::new(spec);
        let decisions_db = root.join("quality_decisions.db");
        let outcomes_db = root.join("quality_outcomes.db");
        let ledger = LocalDataset::open_sqlite(&decisions_db, &outcomes_db).map_err(|e| {
            Error::backend(format!(
                "open sqlite dataset at {spec}: {e} (looking for quality_decisions.db and quality_outcomes.db)"
            ))
        })?;
        ledger
            .as_slice()
            .iter()
            .map(|row| TrainingSample::from(row.clone()))
            .collect()
    };
    Ok(Dataset { samples })
}

/// In-memory collection of training samples that the runner draws
/// batches from.
#[derive(Debug, Clone)]
struct Dataset {
    /// Samples in their current order; `shuffle` permutes this
    /// vector in place and `batch` reads windows out of it.
    samples: Vec<TrainingSample>,
}

impl Dataset {
    fn len(&self) -> usize {
        self.samples.len()
    }

    fn is_empty(&self) -> bool {
        self.samples.is_empty()
    }

    fn shuffle(&mut self, rng: &mut rand::rngs::StdRng) {
        self.samples.shuffle(rng);
    }

    fn batch(&self, batch_size: usize, step: u32) -> Vec<TrainingSample> {
        if self.samples.is_empty() {
            return vec![];
        }
        let n = self.samples.len();
        let start = (step as usize * batch_size) % n;
        let mut out: Vec<TrainingSample> = Vec::with_capacity(batch_size);
        for i in 0..batch_size {
            let idx = (start + i) % n;
            out.push(self.samples[idx].clone());
        }
        out
    }
}

/// Stack a batch of samples into two row-major CPU tensors:
/// inputs shaped `[B, in_dim]` and targets shaped `[B, out_dim]`,
/// both in the `"f32"` domain.
///
/// The per-sample dimensions are taken from the first sample.
///
/// # Panics
///
/// Panics if `batch` is empty, or if any sample's input or target
/// length differs from the first sample's.
fn stack_batch(batch: &[TrainingSample]) -> (Tensor<f32>, Tensor<f32>) {
    let rows = batch.len();
    let head = &batch[0];
    let (in_dim, out_dim) = (head.input.len(), head.target.len());

    let mut inputs = Vec::<f32>::with_capacity(rows * in_dim);
    let mut targets = Vec::<f32>::with_capacity(rows * out_dim);
    for sample in batch.iter() {
        assert_eq!(
            sample.input.len(),
            in_dim,
            "stack_batch: input dim mismatch"
        );
        assert_eq!(
            sample.target.len(),
            out_dim,
            "stack_batch: target dim mismatch"
        );
        inputs.extend_from_slice(&sample.input);
        targets.extend_from_slice(&sample.target);
    }

    // Both tensors share the batch dimension and the f32 domain; only
    // the column count and payload differ.
    let as_matrix = |cols: usize, values: Vec<f32>| {
        Tensor::dense_cpu(
            DomainId::new("f32"),
            Shape::new(vec![Dim::Static(rows), Dim::Static(cols)]),
            values,
        )
    };
    let input_tensor = as_matrix(in_dim, inputs);
    let target_tensor = as_matrix(out_dim, targets);
    (input_tensor, target_tensor)
}

/// One CPU-side training step on a MoE model: forward, MSE
/// backward, gradient clipping, and the optimizer step (AdamW or
/// SGD-with-momentum) per `cfg.optimizer`, with the router's
/// effective LR scaled by `cfg.router_lr_scale`. Exposed for
/// regression tests in `tests/sgd_router_regression.rs`; the
/// production entry point is [`run_training`].
///
/// The base `cfg.lr` is the **schedule peak**; the caller is
/// responsible for passing the step-scheduled LR as `base_lr`
/// (typically `cfg.lr_at(step)`). This split lets the regression
/// tests pin a constant LR while the production loop honors
/// warmup+cosine.
///
/// # Returns
///
/// `(loss, grad_norm, router_entropy_mean, per_expert_share)` where
/// `loss` is the mean squared error over all `B * out_dim` outputs,
/// `grad_norm` is the global gradient norm measured **before**
/// clipping, and the last two values come from
/// `compute_router_stats` on the forward pass's router weights.
///
/// # Errors
///
/// Returns a shape error when the logits are not a rank-2 tensor
/// with static dims, or when the logits / targets buffers do not
/// hold `B * out_dim` elements. Returns a backend error when the
/// parameter and gradient counts disagree, when a step counter
/// would overflow on the HIP path, and propagates any error from
/// the model's forward/backward passes or the optimizer step.
pub fn train_step_cpu(
    model: &mut MoEModel,
    inputs: &Tensor<f32>,
    targets: &Tensor<f32>,
    cfg: &TrainConfig,
    base_lr: f32,
) -> Result<(f32, f32, f32, [f32; N_EXPERTS])> {
    let output = model.forward(inputs)?;
    // Snapshot the router stats BEFORE we move `output` apart.
    // `output.router_weights` is shape `[B, N_EXPERTS]` (post-mask
    // top-K), and we use it to compute per-batch entropy + per-expert
    // selection share. This is the cheap end of the MoE-diagnostics
    // story (one extra O(B*N_EXPERTS) pass per step); a richer
    // per-expert loss term would need re-running forward with one
    // expert zeroed out at a time, which is 4x the cost.
    let (router_entropy_mean, per_expert_share) = compute_router_stats(&output.router_weights);
    let logits = &output.logits;
    // Use checked access so a logits tensor with fewer than two dims
    // surfaces as a shape error instead of an index panic.
    let b = match logits.meta.shape.dims.first() {
        Some(Dim::Static(v)) => *v,
        _ => {
            return Err(Error::shape(
                "train_step_cpu: logits batch dim must be static",
            ));
        }
    };
    let out_dim = match logits.meta.shape.dims.get(1) {
        Some(Dim::Static(v)) => *v,
        _ => {
            return Err(Error::shape(
                "train_step_cpu: logits out dim must be static",
            ));
        }
    };
    if logits.data.len() != b * out_dim {
        return Err(Error::shape(format!(
            "train_step_cpu: logits data {} != batch*out_dim = {}*{}",
            logits.data.len(),
            b,
            out_dim
        )));
    }
    if targets.data.len() != b * out_dim {
        return Err(Error::shape(format!(
            "train_step_cpu: target data {} != batch*out_dim = {}*{}",
            targets.data.len(),
            b,
            out_dim
        )));
    }

    // Mean squared error and its gradient w.r.t. the logits:
    // d(loss)/d(logit_i) = 2 * (logit_i - target_i) / n.
    let n = b * out_dim;
    let nf = n as f32;
    let mut loss = 0.0f32;
    let mut grad_logits = vec![0.0f32; n];
    for (grad, (&predicted, &expected)) in grad_logits
        .iter_mut()
        .zip(logits.data.iter().zip(targets.data.iter()))
    {
        let diff = predicted - expected;
        loss += diff * diff;
        *grad = 2.0 * diff / nf;
    }
    loss /= nf;

    let grad_output = Tensor::dense_cpu(
        logits.meta.domain.clone(),
        logits.meta.shape.clone(),
        grad_logits,
    );
    let (_grad_input, mut param_grads) = model.backward(&grad_output)?;

    // Global-norm clipping: `total_norm` is reported pre-clip; the
    // gradients are rescaled only when clipping is enabled
    // (`grad_clip > 0`) and the norm exceeds the threshold.
    let flat_grads = flatten_param_grads(&param_grads);
    let total_norm = grad_norm(&flat_grads);
    let clip_scale = if cfg.grad_clip > 0.0 && total_norm > cfg.grad_clip {
        cfg.grad_clip / total_norm
    } else {
        1.0
    };
    apply_clip_scale(&mut param_grads, clip_scale);

    let mut params = all_parameters_mut(model);
    if params.len() != param_grads.len() {
        return Err(Error::backend(format!(
            "train_step_cpu: param count {} != param_grads count {}",
            params.len(),
            param_grads.len()
        )));
    }
    // Batched HIP AdamW path: when the run is on the AdamW
    // optimizer, `use_hip` is on, and the ROCm/HIP stack is
    // actually available at runtime, replace the per-parameter
    // optimizer calls with two batched calls per training step:
    // one for the router group and one for the expert group. The
    // groups are kept separate because the batched helper takes a
    // single LR value and the router LR may be scaled. Falls
    // through to the per-param CPU/SGD path when HIP is not in
    // play.
    #[cfg(feature = "rocm-hip")]
    {
        if cfg.optimizer == Optimizer::Adamw
            && cfg.use_hip
            && hip_adamw_available()
            && !params.is_empty()
        {
            // The batched helper takes the step index as an `i32`,
            // so every counter must still fit after the bump.
            // Validate all parameters before mutating any of them
            // so a failure leaves the counters untouched.
            if params.iter().any(|p| p.step >= i32::MAX as u32) {
                return Err(Error::backend(
                    "train_step_cpu: parameter step counter overflow",
                ));
            }
            // Bump per-param step counters once, up front. We
            // also collect a single global `t` for the bias
            // correction (AdamW's `1 - beta^t` only depends on
            // the step index, not on per-param state, so sharing
            // `t` across all params in a step is exactly the
            // math the kernel implements). The per-param
            // `step` fields are kept in sync with the global
            // step so the existing checkpoint and param-level
            // observability don't see a behavior change.
            for param in params.iter_mut() {
                param.step += 1;
            }
            // Lossless: the check above guarantees `step <= i32::MAX`.
            let t_global = params[0].step as i32;

            // Split into router (first 2) and experts (rest),
            // matching the per-param layout used by the CPU
            // path: see the layout comment in
            // `tests/moe_router_diagnose.rs`. The router gets
            // `base_lr * router_lr_scale`; the experts get
            // `base_lr`.
            let router_n = 2.min(params.len());
            let (router_params, expert_params) = params.split_at_mut(router_n);
            let (router_grads, expert_grads) = param_grads.split_at(router_n);
            let router_grad_refs: Vec<&Tensor<f32>> = router_grads.iter().collect();
            let expert_grad_refs: Vec<&Tensor<f32>> = expert_grads.iter().collect();

            if !router_params.is_empty() {
                adamw_step_batched_binary_for_group(
                    router_params,
                    &router_grad_refs,
                    base_lr * cfg.router_lr_scale,
                    0.9,
                    0.999,
                    1e-8,
                    cfg.weight_decay,
                    t_global,
                )?;
            }
            if !expert_params.is_empty() {
                adamw_step_batched_binary_for_group(
                    expert_params,
                    &expert_grad_refs,
                    base_lr,
                    0.9,
                    0.999,
                    1e-8,
                    cfg.weight_decay,
                    t_global,
                )?;
            }

            return Ok((loss, total_norm, router_entropy_mean, per_expert_share));
        }
    }

    for (idx, (param, grad_t)) in params.iter_mut().zip(param_grads.iter()).enumerate() {
        // The router (weight + bias) is always the first 2 entries
        // in `all_parameters_mut` order: see the layout comment in
        // `tests/moe_router_diagnose.rs`. Scale the LR for those
        // two parameters only; the experts use the base LR.
        // `router_lr_scale == 1.0` (the historical default) is a
        // no-op shortcut, so existing callers that don't set the
        // field are unaffected.
        let is_router_param = idx < 2;
        let eff_lr = if is_router_param {
            base_lr * cfg.router_lr_scale
        } else {
            base_lr
        };
        match cfg.optimizer {
            Optimizer::Sgd => {
                param.sgd_momentum_step(grad_t, eff_lr, cfg.momentum)?;
            }
            Optimizer::Adamw => {
                // CPU AdamW reference. The HIP-enabled batched
                // path above short-circuits this branch when
                // HIP is requested and available, so this arm
                // only fires when HIP is off, unavailable, or
                // the runner explicitly asked for the CPU path
                // (e.g. the `tests/sgd_router_regression.rs`
                // dispatch tests with `use_hip=false`).
                param.adamw_step(grad_t, eff_lr, 0.9, 0.999, 1e-8, cfg.weight_decay)?;
            }
        }
    }

    Ok((loss, total_norm, router_entropy_mean, per_expert_share))
}

/// Run the batched binary AdamW step on a *group* of
/// `(param, grad)` pairs with a single call to
/// `run_rocm_hip_adamw_step_all_binary`.
///
/// Each parameter's fp32 weights and fp32 gradient are converted
/// to fp16 bit patterns (`u16`) for the call; the AdamW `m`/`v`
/// state is passed as fp32. After the call the fp16 weights are
/// converted back to fp32 and written, together with the updated
/// `m`/`v`, into the parameters in place. NOTE(review): this means
/// the stored fp32 weights are rounded to fp16 precision on every
/// step — confirm that is intended.
///
/// The caller is responsible for bumping `param.step` BEFORE
/// this call and for routing any per-parameter LR scale (the
/// router scale in `TrainConfig::router_lr_scale`) by passing
/// the already-scaled `lr`. The bias-correction step counter
/// `t` is the same for every parameter in the group.
///
/// # Errors
///
/// Returns `Err(_)` if the parameter and gradient counts differ,
/// if any parameter's data/m/v/grad lengths disagree (including
/// zero-length parameters), if the kernel call fails, or if the
/// kernel hands back buffers whose lengths no longer match the
/// parameters. An empty group is a no-op and returns `Ok(())`.
#[cfg(feature = "rocm-hip")]
#[allow(clippy::too_many_arguments)]
fn adamw_step_batched_binary_for_group(
    params: &mut [&mut crate::model::parameter::Parameter],
    grads: &[&Tensor<f32>],
    lr: f32,
    beta1: f32,
    beta2: f32,
    eps: f32,
    weight_decay: f32,
    t: i32,
) -> Result<()> {
    use crate::backend::f16_convert::{f16_to_f32, f32_to_f16};
    // The `hip_adamw.rs` file is normally `include!`d from tests
    // and the gpu_smoke binary (it uses absolute
    // `tokitai_operator::` paths). Pull it in here with the same
    // bridge alias the test bridge uses, then reach the batched
    // function through the local module path.
    #[allow(unused_imports)]
    mod _hip_adamw {
        pub use crate as tokitai_operator;
        include!("backend/hip_adamw.rs");
    }
    use _hip_adamw::run_rocm_hip_adamw_step_all_binary;

    let n_params = params.len();
    if n_params == 0 {
        return Ok(());
    }
    if n_params != grads.len() {
        return Err(Error::backend(format!(
            "adamw_step_batched_binary_for_group: params/grads count mismatch \
             (params={n_params}, grads={})",
            grads.len()
        )));
    }

    // Gather: build one buffer per parameter for each of theta, m,
    // v and grad (four Vec allocations per parameter per step).
    let mut theta_slices: Vec<Vec<u16>> = Vec::with_capacity(n_params);
    let mut m_slices: Vec<Vec<f32>> = Vec::with_capacity(n_params);
    let mut v_slices: Vec<Vec<f32>> = Vec::with_capacity(n_params);
    let mut grad_slices: Vec<Vec<u16>> = Vec::with_capacity(n_params);

    for (param, grad) in params.iter().zip(grads.iter()) {
        let n = param.data.data.len();
        // Enforce the length invariant for every parameter, empty
        // ones included, so a zero-length parameter paired with a
        // non-empty gradient or state is reported rather than
        // silently skipped.
        if n != grad.data.len() || n != param.m.data.len() || n != param.v.data.len() {
            return Err(Error::backend(format!(
                "adamw_step_batched_binary_for_group: length mismatch \
                 (data={n}, m={}, v={}, grad={})",
                param.m.data.len(),
                param.v.data.len(),
                grad.data.len()
            )));
        }
        // A zero-length parameter simply contributes empty buffers;
        // it does not block the rest of the group.
        theta_slices.push(param.data.data.iter().map(|&x| f32_to_f16(x)).collect());
        m_slices.push(param.m.data.to_vec());
        v_slices.push(param.v.data.to_vec());
        grad_slices.push(grad.data.iter().map(|&x| f32_to_f16(x)).collect());
    }

    // One call for the entire group.
    run_rocm_hip_adamw_step_all_binary(
        &mut theta_slices,
        &mut m_slices,
        &mut v_slices,
        &grad_slices,
        lr,
        beta1,
        beta2,
        eps,
        weight_decay,
        t,
    )?;

    // Validate everything the kernel handed back BEFORE writing
    // anything, so a malformed result becomes an error with no
    // partial update instead of an index panic mid-scatter.
    let shape_preserved = theta_slices.len() == n_params
        && m_slices.len() == n_params
        && v_slices.len() == n_params
        && params
            .iter()
            .zip(theta_slices.iter().zip(m_slices.iter()).zip(v_slices.iter()))
            .all(|(param, ((theta, m), v))| {
                let n = param.data.data.len();
                theta.len() == n && m.len() == n && v.len() == n
            });
    if !shape_preserved {
        return Err(Error::backend(
            "adamw_step_batched_binary_for_group: kernel returned buffers whose \
             lengths do not match the parameters",
        ));
    }

    // Scatter: convert fp16 theta back to fp32 and write each
    // param's data/m/v back in place. m and v MUST be updated
    // because they carry the AdamW state across steps.
    for (param, ((theta, m), v)) in params
        .iter_mut()
        .zip(theta_slices.iter().zip(m_slices.iter()).zip(v_slices.iter()))
    {
        for (dst, &half) in param.data.data.iter_mut().zip(theta.iter()) {
            *dst = f16_to_f32(half);
        }
        param.m.data.copy_from_slice(m);
        param.v.data.copy_from_slice(v);
    }
    Ok(())
}

/// Report whether a local ROCm/HIP stack was detected.
///
/// Returns the `available` flag of the report produced by
/// `crate::backend::rocm::detect_local_rocm_hip()`. This function
/// does not look at `cfg.use_hip`; callers combine the two checks
/// themselves before taking the HIP AdamW path.
#[cfg(feature = "rocm-hip")]
fn hip_adamw_available() -> bool {
    let report = crate::backend::rocm::detect_local_rocm_hip();
    report.available
}

/// Compute per-batch router statistics: mean Shannon entropy of the
/// post-mask router distribution and the per-expert top-1 selection
/// share. Operates on a `[B, N_EXPERTS]` row-major tensor; runs in
/// O(B*N_EXPERTS) time. Entropy is in nats (natural log).
///
/// For reference: a row with exactly two entries of `0.5` has
/// entropy `ln(2) ≈ 0.693`, and a row with all mass on a single
/// expert has entropy `0`.
///
/// # Returns
///
/// `(entropy_mean, per_expert_share)`. `per_expert_share[e]` is the
/// fraction of rows whose largest weight is at expert `e` (ties go
/// to the lowest index). For `B == 0` both the entropy and every
/// share are `0`.
///
/// If the leading dim is missing or not static, `B` is taken as 1.
/// If the data buffer is shorter than `B * N_EXPERTS`, a uniform
/// placeholder (`ln(N_EXPERTS)`, equal shares) is returned instead
/// of panicking.
fn compute_router_stats(router_weights: &Tensor<f32>) -> (f32, [f32; N_EXPERTS]) {
    // Validate the shape: must be `[B, N_EXPERTS]` with a static
    // batch dim. We fall back gracefully if a caller hands us a
    // dynamic dim or a too-small buffer.
    let b = match router_weights.meta.shape.dims.first() {
        Some(Dim::Static(v)) => *v,
        _ => 1,
    };
    if router_weights.data.len() < b * N_EXPERTS {
        // Defensive: shape contract broken. Return a balanced fake
        // rather than panicking mid-training.
        return ((N_EXPERTS as f32).ln(), [1.0 / N_EXPERTS as f32; N_EXPERTS]);
    }

    let mut counts = [0usize; N_EXPERTS];
    let mut entropy_sum = 0.0f32;
    for bi in 0..b {
        let row = &router_weights.data[bi * N_EXPERTS..(bi + 1) * N_EXPERTS];

        // Argmax for top-1 expert selection. The strict `>` keeps the
        // first maximum on ties.
        let mut top_e = 0usize;
        let mut top_v = row[0];
        for (ei, &v) in row.iter().enumerate().skip(1) {
            if v > top_v {
                top_v = v;
                top_e = ei;
            }
        }
        counts[top_e] += 1;

        // Shannon entropy of the row. Only strictly positive entries
        // contribute: `0 * ln(0)` would evaluate to NaN, so zeros
        // (the masked-out experts) are skipped.
        let row_entropy = row.iter().fold(0.0f32, |acc, &p| {
            if p > 0.0 {
                acc - p * p.ln()
            } else {
                acc
            }
        });
        entropy_sum += row_entropy;
    }

    // `max(1)` avoids dividing by zero for an empty batch.
    let bf = b.max(1) as f32;
    let entropy_mean = entropy_sum / bf;
    // Derive the shares from `counts` directly so the result always
    // has one entry per expert, whatever `N_EXPERTS` is.
    let per_expert_share = counts.map(|count| count as f32 / bf);
    (entropy_mean, per_expert_share)
}

/// Concatenate the data of every gradient tensor, in slice order,
/// into one flat `Vec<f32>`.
fn flatten_param_grads(grads: &[Tensor<f32>]) -> Vec<f32> {
    grads
        .iter()
        .flat_map(|tensor| tensor.data.iter().copied())
        .collect()
}

/// Cosine similarity between two equally-sized float slices.
///
/// The dot product and both squared norms are accumulated in `f64`
/// and the result is narrowed to `f32` at the end.
///
/// Special cases:
/// - both vectors all-zero (including two empty slices): returns
///   `1.0`. This avoids NaN and matches the "agree" intuition:
///   zero-grad in both means the optimizer will not move that
///   parameter on either path.
/// - exactly one side has zero norm: returns `0.0` (no alignment
///   at all).
///
/// # Panics
///
/// Panics if the slices have different lengths. A length mismatch
/// is a programmer error; the caller should always pass matching
/// slices.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(
        a.len(),
        b.len(),
        "cosine_similarity: length mismatch ({} vs {})",
        a.len(),
        b.len()
    );
    // Single pass over the paired elements; zipping removes the
    // per-element bounds checks of an index loop while keeping the
    // same left-to-right accumulation order.
    let (dot, na, nb) = a.iter().zip(b.iter()).fold(
        (0.0f64, 0.0f64, 0.0f64),
        |(dot, na, nb), (&x, &y)| {
            let (x, y) = (f64::from(x), f64::from(y));
            (dot + x * y, na + x * x, nb + y * y)
        },
    );
    if na == 0.0 && nb == 0.0 {
        return 1.0;
    }
    let denom = (na * nb).sqrt();
    if denom == 0.0 {
        // One side is zero, the other isn't: orthogonal in
        // direction (no alignment at all).
        return 0.0;
    }
    (dot / denom) as f32
}

/// Multiply every element of every gradient tensor by `scale`, in
/// place. A `scale` of exactly `1.0` leaves the gradients untouched
/// without visiting them.
fn apply_clip_scale(grads: &mut [Tensor<f32>], scale: f32) {
    if scale != 1.0 {
        grads
            .iter_mut()
            .flat_map(|tensor| tensor.data.iter_mut())
            .for_each(|value| *value *= scale);
    }
}

/// Collect mutable references to every trainable parameter of the
/// MoE model in a stable order: the router's parameters first, then
/// each expert's parameters in `model.experts` order.
///
/// `train_step_cpu` treats the **first two entries** as the router
/// weight and bias when applying `router_lr_scale` (its `idx < 2`
/// check). That only holds while the router's `parameters_mut`
/// yields exactly those two tensors first; if the ordering is ever
/// changed, the router scale will silently land on the wrong
/// tensors.
///
/// The `debug_assert!` here guards only the minimum count (at least
/// two parameters) in debug builds; it cannot verify which tensors
/// occupy the first two slots.
fn all_parameters_mut(model: &mut MoEModel) -> Vec<&mut Parameter> {
    let router_params = model.router.parameters_mut();
    let expert_params = model
        .experts
        .iter_mut()
        .flat_map(|expert| expert.parameters_mut());
    let collected: Vec<&mut Parameter> = router_params.into_iter().chain(expert_params).collect();
    debug_assert!(
        collected.len() >= 2,
        "all_parameters_mut: expected at least router.weight + router.bias, got {}",
        collected.len()
    );
    collected
}

/// Snapshot every model parameter into a [`Checkpoint`] for `step`.
///
/// Router parameters are named `router.param_{i}` and expert
/// parameters `expert_{ei}.param_{i}`, where `i` is the position in
/// the respective `parameters()` list and `ei` the expert index.
/// The `config` field is a small hand-formatted JSON object holding
/// the model size name, `seed`, and `step` (under the key `steps`).
fn build_checkpoint(model: &MoEModel, step: u32, seed: u64) -> Checkpoint {
    let router_params = model.router.parameters();
    let mut params: Vec<ParameterSnapshot> = router_params
        .iter()
        .enumerate()
        .map(|(i, p)| snapshot_parameter(p, &format!("router.param_{i}")))
        .collect();

    for (ei, expert) in model.experts.iter().enumerate() {
        let expert_params = expert.parameters();
        params.extend(
            expert_params
                .iter()
                .enumerate()
                .map(|(i, p)| snapshot_parameter(p, &format!("expert_{ei}.param_{i}"))),
        );
    }

    let size_name = model.size.name();
    let config = format!("{{\"size\":\"{size_name}\",\"seed\":{seed},\"steps\":{step}}}");
    Checkpoint {
        step,
        params,
        config,
    }
}

/// Whole seconds elapsed since the Unix epoch according to the
/// system clock, or `0` if the clock reports a time before the
/// epoch.
fn unix_timestamp() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}

fn write_arch_with_fingerprint(
    path: &Path,
    arch: &ModelArch,
    fingerprint: &str,
    param_count: u64,
) -> Result<()> {
    let mut v = serde_json::to_value(arch)
        .map_err(|e| Error::backend(format!("arch: serialize to value: {e}")))?;
    if let Some(obj) = v.as_object_mut() {
        obj.insert(
            "fingerprint".to_string(),
            serde_json::Value::String(fingerprint.to_string()),
        );
        obj.insert(
            "param_count".to_string(),
            serde_json::Value::Number(param_count.into()),
        );
    }
    let text = serde_json::to_string_pretty(&v)
        .map_err(|e| Error::backend(format!("arch: re-serialize: {e}")))?;
    std::fs::write(path, text)
        .map_err(|e| Error::backend(format!("arch: write {}: {e}", path.display())))?;
    Ok(())
}

// ---------------------------------------------------------------------------
// Diagnose mode (Phase 2.11)
//
// Single forward+backward step with two parallel paths on the same model
// state and the same input:
//   - the existing fp16-via-HIP path (model.forward + model.backward)
//   - a pure-f32 reference forward+backward (no fp16 conversion at all)
//
// The diagnose mode exists to test the hypothesis that the residual
// grad_norm growth observed in the 0.7B MoE training is fp16 accumulation
// noise in the Linear GEMM and softmax kernels. If the per-parameter
// grad_norm ratio fp16 / fp32 is much greater than 1, the noise
// hypothesis is confirmed; if the ratios are close to 1, the noise
// hypothesis is wrong and the residual growth is coming from somewhere
// else (e.g. the optimizer / bias-correction / learning-rate path).
//
// The diagnose path is intentionally short: one synth sample (or one
// batch slice of the user-supplied dataset), one forward+backward in
// each precision, a per-parameter table on stderr, and a one-line
// verdict. It writes no metrics.jsonl and no checkpoint — it is a
// diagnostic only.

/// Per-parameter row in the diagnose table.
///
/// Each row compares one parameter's gradient across precision
/// paths. The `Option` fields describe the HIP fp16 path and are
/// `None` when HIP was unavailable.
pub struct DiagnoseRow {
    /// Parameter name shown in the table.
    pub name: String,
    /// HIP fp16 path (None if HIP was unavailable).
    pub fp16: Option<f32>,
    /// Simulated fp16 (f32 + quantization between ops).
    pub fp16_sim: f32,
    /// Pure f32 reference.
    pub fp32: f32,
    /// fp16_sim / fp32 ratio (always present, since the simulated
    /// path is always run).
    pub ratio: f32,
    /// HIP fp16 / fp32 ratio (None if HIP was unavailable).
    /// Surfaced alongside `ratio` so the verdict can use the worse
    /// of the two.
    pub hip_ratio: Option<f32>,
    /// |fp16 - fp32| in grad_norm units. The HIP path's
    /// absolute error; surfaced so the noise magnitude is
    /// readable without doing mental arithmetic on the two
    /// columns.
    pub abs_diff: Option<f32>,
    /// |fp16 - fp32| / max(|fp32|, eps). Scale-free view of the
    /// same error; less dominated by parameters that have huge
    /// gradients but tiny relative error (e.g. the output
    /// projection's bias).
    pub rel_diff: Option<f32>,
    /// Cosine similarity between the fp16 and fp32 gradients,
    /// computed on the flattened gradient vectors. The strongest
    /// per-parameter divergence indicator — a value below ~0.99
    /// means the two paths are pointing the optimizer in
    /// materially different directions even when their grad_norms
    /// look similar.
    pub cosine_sim: Option<f32>,
}

/// Ratio of a grad_norm to its fp32 reference, as shown in the
/// diagnose table.
///
/// Returns `value / reference` when `reference > 0`. When the
/// reference is not positive the ratio is `1.0` if `value` is exactly
/// zero (the two paths agree) and `+inf` otherwise.
fn diagnose_grad_ratio(value: f32, reference: f32) -> f32 {
    if reference > 0.0 {
        value / reference
    } else if value == 0.0 {
        1.0
    } else {
        f32::INFINITY
    }
}

/// Folds `candidate` into the running worst-case ratio.
///
/// A non-finite value (+inf / NaN) is treated as the worst possible
/// outcome and is sticky: once recorded it is never replaced by a
/// later finite ratio. Among finite values the larger one wins.
fn diagnose_worst_ratio(current: Option<f32>, candidate: f32) -> f32 {
    match current {
        None => candidate,
        Some(cur) if !cur.is_finite() => cur,
        Some(cur) if !candidate.is_finite() || candidate > cur => candidate,
        Some(cur) => cur,
    }
}

/// Counts the batch rows whose top-K expert set differs between
/// `candidate` and `reference`.
///
/// Both slices are read as consecutive groups of `top_k` expert
/// indices. Rows are compared as sorted lists, so only "which K
/// experts were selected" matters, not their order. Returns
/// `Some((differing_rows, total_rows))`, or `None` when the two
/// buffers cannot be compared (`top_k == 0`, empty reference, or a
/// length mismatch).
fn diagnose_divergent_rows(
    candidate: &[usize],
    reference: &[usize],
    top_k: usize,
) -> Option<(usize, usize)> {
    if top_k == 0 || reference.is_empty() || candidate.len() != reference.len() {
        return None;
    }
    let differing = candidate
        .chunks_exact(top_k)
        .zip(reference.chunks_exact(top_k))
        .filter(|(cand_row, ref_row)| {
            let mut lhs = cand_row.to_vec();
            let mut rhs = ref_row.to_vec();
            lhs.sort_unstable();
            rhs.sort_unstable();
            lhs != rhs
        })
        .count();
    Some((differing, reference.len() / top_k))
}

/// Formats the `top_k` expert indices of batch row `row` as
/// `[expert_a, expert_b, ...]`, or `None` if that row lies outside
/// `indices`.
fn diagnose_expert_row(indices: &[usize], row: usize, top_k: usize) -> Option<String> {
    let start = row.checked_mul(top_k)?;
    let end = start.checked_add(top_k)?;
    let experts: Vec<String> = indices
        .get(start..end)?
        .iter()
        .map(|e| format!("expert_{e}"))
        .collect();
    Some(format!("[{}]", experts.join(", ")))
}

/// Single forward+backward, fp16 vs fp32, with a per-parameter
/// grad_norm comparison. No training loop, no checkpoint, no
/// metrics file. The first `batch_size` samples of the configured
/// dataset are used.
///
/// Everything is reported on stderr: a forward summary line, the
/// per-parameter table, the router top-K assignments with routing
/// divergence counts, and a one-line verdict.
///
/// # Errors
///
/// Returns an error when `cfg.batch_size` is zero, the dataset cannot
/// be built or is empty, the stacked batch does not have the expected
/// shape, or one of the compared paths reports a different number of
/// parameters (or too few logits) than expected. A failure of the
/// HIP fp16 forward or backward call itself is NOT an error: it is
/// logged and the run continues with the simulated-fp16 and fp32
/// paths only.
pub fn run_diagnose(cfg: &TrainConfig) -> Result<()> {
    if cfg.batch_size == 0 {
        return Err(Error::backend("run_diagnose: batch_size must be >= 1"));
    }
    // The fp16-via-HIP path goes through the Linear layer's 16x16
    // GEMM kernel, which requires the batch dim to be a multiple
    // of 16. If the caller passed a non-multiple of 16, fall back
    // to 16 (the smallest valid size). The fp32 reference path
    // has no such constraint, but we keep the two paths at the
    // same batch size so the per-param grad_norms are
    // apples-to-apples.
    let batch_size = if cfg.batch_size % 16 == 0 {
        cfg.batch_size
    } else {
        eprintln!(
            "[--diagnose] note: bumping batch {} -> 16 (Linear fp16 GEMM requires multiple of 16)",
            cfg.batch_size
        );
        16
    };
    let dataset = build_dataset(&cfg.dataset_spec, cfg.seed)?;
    if dataset.is_empty() {
        return Err(Error::backend("run_diagnose: dataset is empty"));
    }
    let batch = dataset.batch(batch_size, 0);
    let (xb, yb) = stack_batch(&batch);
    // The effective batch size is whatever the stacked tensor says,
    // not the requested `batch_size`.
    let b = match &xb.meta.shape.dims[0] {
        Dim::Static(v) => *v,
        _ => return Err(Error::shape("run_diagnose: xb batch dim must be static")),
    };
    let in_dim = crate::moe_model::topology::IN_DIM;
    let out_dim = crate::moe_model::topology::OUT_DIM;
    if xb.data.len() != b * in_dim {
        return Err(Error::shape(format!(
            "run_diagnose: xb.len() {} != B*IN_DIM={}*{}",
            xb.data.len(),
            b,
            in_dim
        )));
    }
    if yb.data.len() != b * out_dim {
        return Err(Error::shape(format!(
            "run_diagnose: yb.len() {} != B*OUT_DIM={}*{}",
            yb.data.len(),
            b,
            out_dim
        )));
    }

    eprintln!(
        "[--diagnose] model_size={} batch={} total_params={}",
        cfg.size.name(),
        b,
        cfg.size.param_count()
    );

    // ---- fp16 path (existing forward + backward) ----
    // We call the model's real forward+backward so the comparison is
    // against the code path the model itself runs. The model is a
    // transient one built here; the diagnose path applies no
    // optimizer step.
    //
    // If the HIP stack is unavailable (kernel compile failed, hipcc
    // missing, etc.) the fp16 path will fail. That is caught and
    // logged, and the run continues with the simulated-fp16 + fp32
    // reference so diagnose mode still prints what it can.
    let model = MoEModel::new(cfg.size, cfg.seed as u64);
    let names = fp32_param_names(&model);
    let mut fp16_per_param: Option<Vec<f32>> = None;
    let mut fp16_grads_per_param: Option<Vec<Vec<f32>>> = None;
    let mut fp16_top_k_indices: Option<Vec<usize>> = None;
    let mut fp16_total_opt: Option<f32> = None;
    let mut fp16_loss: Option<f32> = None;
    let mut logits_fp16: Option<Vec<f32>> = None;
    let mut grad_output_data: Option<Vec<f32>> = None;

    match model.forward(&xb) {
        Ok(output) => {
            let logits = output.logits.data.clone();
            if logits.len() < b * out_dim {
                return Err(Error::shape(format!(
                    "run_diagnose: fp16 logits.len() {} < B*OUT_DIM={}*{}",
                    logits.len(),
                    b,
                    out_dim
                )));
            }
            // Capture the fp16 (HIP) router's top-K indices BEFORE
            // backward consumes the cache. This is the fp16 routing
            // decision, compared further down against the fp32
            // reference to detect fp16-vs-fp32 routing divergence.
            if let Some(cache) = model.last_cache.borrow().as_ref() {
                if let Some(router_cache) = cache.router_cache.downcast_ref::<RouterCache>() {
                    fp16_top_k_indices = Some(router_cache.top_k_indices.clone());
                }
            }
            // MSE loss over the first B*OUT_DIM logits and its
            // gradient w.r.t. the logits. The same grad_output is
            // later fed to the fp32 and simulated-fp16 paths so all
            // three backward passes see identical upstream gradients.
            let nf = (b * out_dim) as f32;
            let mut grad_logits = vec![0.0f32; b * out_dim];
            let mut loss = 0.0f32;
            for ((logit, target), grad) in logits
                .iter()
                .zip(yb.data.iter())
                .zip(grad_logits.iter_mut())
            {
                let diff = logit - target;
                loss += diff * diff;
                *grad = 2.0 * diff / nf;
            }
            loss /= nf;
            let go = Tensor::dense_cpu(
                output.logits.meta.domain.clone(),
                output.logits.meta.shape.clone(),
                grad_logits,
            );
            match model.backward(&go) {
                Ok((_gi, param_grads_fp16)) => {
                    if param_grads_fp16.len() != names.len() {
                        return Err(Error::backend(format!(
                            "run_diagnose: fp16 param_grads.len() {} != names.len() {} (model.layer layout changed?)",
                            param_grads_fp16.len(),
                            names.len()
                        )));
                    }
                    fp16_per_param = Some(
                        param_grads_fp16
                            .iter()
                            .map(|g| grad_norm(&g.data))
                            .collect(),
                    );
                    // Keep the raw per-param grads around for the
                    // cosine-similarity column in the table below.
                    fp16_grads_per_param =
                        Some(param_grads_fp16.iter().map(|g| g.data.clone()).collect());
                    fp16_total_opt = Some(grad_norm(&flatten_param_grads(&param_grads_fp16)));
                    fp16_loss = Some(loss);
                    logits_fp16 = Some(logits);
                    grad_output_data = Some(go.data.clone());
                }
                Err(e) => {
                    eprintln!(
                        "[--diagnose] fp16-via-HIP backward failed: {e}; \
                         continuing with fp32 reference only"
                    );
                }
            }
        }
        Err(e) => {
            eprintln!(
                "[--diagnose] fp16-via-HIP forward failed: {e}; \
                 continuing with fp32 reference only"
            );
        }
    }

    // If the fp16 path did not produce a grad_output, substitute an
    // all-zero one so the reference and simulated paths can still
    // run. The absolute gradient numbers are not meaningful in that
    // mode (presumably all zero — the upstream gradient is zero);
    // the routing comparison and the "HIP n/a" markers are what
    // remain useful.
    let grad_for_fp32: Vec<f32> =
        grad_output_data.unwrap_or_else(|| vec![0.0f32; b * out_dim]);

    // ---- fp32 reference path (always runs) ----
    let ref_model = build_fp32_ref(&model);
    let ref_result = fp32_forward_backward(&ref_model, &xb.data, &grad_for_fp32);
    let fp32_per_param: Vec<f32> = ref_result
        .param_grads
        .iter()
        .map(|g| grad_norm_of(g))
        .collect();
    let fp32_total = grad_norm_of(
        &ref_result
            .param_grads
            .iter()
            .flat_map(|g| g.iter().copied())
            .collect::<Vec<_>>(),
    );

    // ---- simulated fp16 path (always runs; doesn't need HIP) ----
    // Pure-f32 math with fp16 round-trip on every activation and
    // weight buffer. Captures the noise hypothesis even when the
    // HIP stack is unavailable.
    let sim_result = fp16_sim_forward_backward(&ref_model, &xb.data, &grad_for_fp32);
    let fp16_sim_per_param: Vec<f32> = sim_result
        .param_grads
        .iter()
        .map(|g| grad_norm_of(g))
        .collect();
    let fp16_sim_total = grad_norm_of(
        &sim_result
            .param_grads
            .iter()
            .flat_map(|g| g.iter().copied())
            .collect::<Vec<_>>(),
    );

    // The table below indexes all per-param vectors by the position
    // in `names`; fail with a readable error instead of panicking if
    // the reference paths disagree on the parameter count.
    if fp32_per_param.len() != names.len() || fp16_sim_per_param.len() != names.len() {
        return Err(Error::backend(format!(
            "run_diagnose: reference param_grads.len() (fp32={}, fp16_sim={}) != names.len() {}",
            fp32_per_param.len(),
            fp16_sim_per_param.len(),
            names.len()
        )));
    }

    // ---- Forward sanity: max abs diff between fp16 and fp32 logits. ----
    // NOTE: when the HIP path was unavailable, max_abs_diff, fp16_loss
    // and fp16_total_grad_norm are all printed as 0.
    let mut max_abs = 0.0f32;
    if let Some(logits_h) = &logits_fp16 {
        for (a, bv) in logits_h.iter().zip(ref_result.logits.iter()) {
            let d = (a - bv).abs();
            if d > max_abs {
                max_abs = d;
            }
        }
    }
    let fp16_total = fp16_total_opt.unwrap_or(0.0);
    let loss = fp16_loss.unwrap_or(0.0);
    eprintln!(
        "[--diagnose] forward: fp16 vs fp32 logits max_abs_diff={:.6}  fp16_loss={:.6}  \
         fp16_total_grad_norm={:.6}  fp16_sim_total_grad_norm={:.6}  fp32_total_grad_norm={:.6}",
        max_abs, loss, fp16_total, fp16_sim_total, fp32_total
    );

    // ---- Per-parameter table ----
    // The ratio column has two flavors: sim/fp32 (always shown) and
    // HIP/fp32 (when HIP was reachable). The verdict below uses
    // whichever is the worse of the two so we don't mislead the
    // user into thinking the noise is mild when in fact the real
    // HIP path blows up.
    const REL_EPS: f32 = 1.0e-12;
    let mut rows: Vec<DiagnoseRow> = Vec::with_capacity(names.len());
    let mut ratio_max_sim = 0.0f32;
    let mut ratio_max_hip: Option<f32> = None;
    for (i, name) in names.iter().enumerate() {
        let fp16_opt = fp16_per_param.as_ref().map(|v| v[i]);
        let fp16_sim = fp16_sim_per_param[i];
        let fp32 = fp32_per_param[i];
        let sim_ratio = diagnose_grad_ratio(fp16_sim, fp32);
        let hip_ratio_opt = fp16_opt.map(|h| diagnose_grad_ratio(h, fp32));
        // Only finite simulated ratios feed the sim maximum.
        if sim_ratio.is_finite() && sim_ratio > ratio_max_sim {
            ratio_max_sim = sim_ratio;
        }
        if let Some(hip_ratio) = hip_ratio_opt {
            // Track the worst HIP ratio. A non-finite ratio (true
            // division by an exactly-zero fp32 norm) is the smoking
            // gun for fp16 noise: it must dominate the verdict and
            // must not be overwritten by a later finite ratio.
            ratio_max_hip = Some(diagnose_worst_ratio(ratio_max_hip, hip_ratio));
        }
        // Per-parameter divergence details: |fp16 - fp32|, relative
        // difference, and cosine similarity of the raw grad
        // vectors. Two grads can have nearly identical grad_norms
        // but point in different directions, which the grad_norm
        // ratio alone cannot show. Only available when the HIP fp16
        // path succeeded (otherwise there are no fp16 grads).
        let (abs_diff, rel_diff, cosine_sim) = match fp16_opt {
            Some(h) => {
                let abs_d = (h - fp32).abs();
                let rel_d = abs_d / fp32.abs().max(REL_EPS);
                let cos = fp16_grads_per_param.as_ref().and_then(|grads| {
                    let fp32_grad = &ref_result.param_grads[i];
                    if grads[i].len() == fp32_grad.len() && !grads[i].is_empty() {
                        Some(cosine_similarity(&grads[i], fp32_grad))
                    } else {
                        None
                    }
                });
                (Some(abs_d), Some(rel_d), cos)
            }
            None => (None, None, None),
        };
        rows.push(DiagnoseRow {
            name: name.clone(),
            fp16: fp16_opt,
            fp16_sim,
            fp32,
            ratio: sim_ratio,
            hip_ratio: hip_ratio_opt,
            abs_diff,
            rel_diff,
            cosine_sim,
        });
    }
    let hip_available = fp16_per_param.is_some();
    let name_w = rows.iter().map(|r| r.name.len()).max().unwrap_or(0);
    eprintln!("[--diagnose] per-param grad_norm:");
    if hip_available {
        eprintln!(
            "  {:<name_w$}  {:>12}  {:>12}  {:>12}  {:>10}  {:>10}  {:>11}  {:>10}  {:>10}",
            "param", "fp16(HIP)", "fp16(sim)", "fp32", "HIP/fp32", "sim/fp32", "|Δ|", "rel", "cos"
        );
    } else {
        eprintln!(
            "  {:<name_w$}  {:>12}  {:>12}  {:>10}",
            "param", "fp16(sim)", "fp32", "sim/fp32"
        );
    }
    // Compact fp16/fp32 ratio formatter: cap at ">1e6" so an
    // absurdly large number (e.g. from a near-zero fp32
    // denominator) doesn't dominate the column width. A value of
    // ">1e6" is already past the verdict threshold.
    let compact_ratio = |v: f32| -> String {
        if !v.is_finite() {
            "inf".to_string()
        } else if v >= 1.0e6 {
            ">1e6".to_string()
        } else {
            format!("{:.2}", v)
        }
    };
    // Compact formatter shared by the |Δ| and rel columns: "-" when
    // absent, "inf" for non-finite values, capped at ">1e6".
    let compact_diff = |v: Option<f32>| -> String {
        match v {
            None => "-".to_string(),
            Some(x) if !x.is_finite() => "inf".to_string(),
            Some(x) if x >= 1.0e6 => ">1e6".to_string(),
            Some(x) => format!("{:.3e}", x),
        }
    };
    // Cosine column: 4 decimal places so the difference between
    // 0.9999 and 0.9990 is visible at a glance.
    let compact_cos = |v: Option<f32>| -> String {
        match v {
            None => "-".to_string(),
            Some(x) if !x.is_finite() => "nan".to_string(),
            Some(x) => format!("{:.4}", x),
        }
    };
    for r in &rows {
        // Norm columns use scientific notation with 6 fractional
        // digits so very small and very large grads share a width.
        let fp16_sim_str = format!("{:>12.6e}", r.fp16_sim);
        let fp32_str = format!("{:>12.6e}", r.fp32);
        let sim_str = format!("{:>10}", compact_ratio(r.ratio));
        if hip_available {
            let fp16_str = match r.fp16 {
                Some(v) => format!("{:>12.6e}", v),
                None => format!("{:>12}", "(HIP n/a)"),
            };
            let hip_str = match r.hip_ratio {
                Some(v) => format!("{:>10}", compact_ratio(v)),
                None => format!("{:>10}", "-"),
            };
            eprintln!(
                "  {:<name_w$}  {}  {}  {}  {}  {}  {:>11}  {:>10}  {:>10}",
                r.name,
                fp16_str,
                fp16_sim_str,
                fp32_str,
                hip_str,
                sim_str,
                compact_diff(r.abs_diff),
                compact_diff(r.rel_diff),
                compact_cos(r.cosine_sim),
            );
        } else {
            // HIP unavailable: 3-column layout (fp16(sim), fp32,
            // sim/fp32); the abs/rel/cos columns need fp16 grads.
            eprintln!(
                "  {:<name_w$}  {}  {}  {}",
                r.name, fp16_sim_str, fp32_str, sim_str,
            );
        }
    }

    // ---- Router routing divergence ----
    // Did the fp16 router pick the same top-K experts as the fp32
    // one? If not, the fp16 path sends inputs through different
    // experts than the fp32 path. This block compares:
    //   - fp16 (HIP) top-K vs fp32 reference top-K
    //   - fp16-sim top-K vs fp32 reference top-K
    // A "[ROUTING DIVERGENT]" marker line is printed for
    // grep-ability if either comparison shows a divergence.
    let top_k = crate::moe_model::topology::TOP_K;
    let fp32_top_k = &ref_result.top_k_indices;
    let sim_top_k = &sim_result.top_k_indices;
    let fp16_top_k = fp16_top_k_indices.as_deref();
    let hip_diverge = fp16_top_k.and_then(|h| diagnose_divergent_rows(h, fp32_top_k, top_k));
    let sim_diverge = diagnose_divergent_rows(sim_top_k, fp32_top_k, top_k);
    let hip_diverge_n = hip_diverge.map_or(0, |(n, _)| n);
    let sim_diverge_n = sim_diverge.map_or(0, |(n, _)| n);
    eprintln!("[--diagnose] router top-K assignments (B={}):", b);
    eprintln!(
        "  {:<5}  {:<24}  {:<24}  {:<24}",
        "row", "fp16(HIP)", "fp16(sim)", "fp32"
    );
    for bi in 0..b {
        // A row that falls outside an index buffer is shown as
        // "(n/a)" rather than panicking on an out-of-range slice.
        let fp16_str = match fp16_top_k {
            Some(h) => {
                diagnose_expert_row(h, bi, top_k).unwrap_or_else(|| "(n/a)".to_string())
            }
            None => "(HIP n/a)".to_string(),
        };
        let sim_str =
            diagnose_expert_row(sim_top_k, bi, top_k).unwrap_or_else(|| "(n/a)".to_string());
        let fp32_str =
            diagnose_expert_row(fp32_top_k, bi, top_k).unwrap_or_else(|| "(n/a)".to_string());
        eprintln!(
            "  {:<5}  {:<24}  {:<24}  {:<24}",
            bi, fp16_str, sim_str, fp32_str
        );
    }
    match hip_diverge {
        Some((n, rows)) => {
            eprintln!(
                "[--diagnose] routing divergence: HIP fp16 vs fp32 = {n}/{rows} rows differ in top-K"
            );
            if n > 0 {
                eprintln!(
                    "[--diagnose] [ROUTING DIVERGENT] HIP fp16 softmax picked different experts than fp32 on {n} of {rows} batch rows"
                );
            }
        }
        None => {
            eprintln!(
                "[--diagnose] routing divergence: HIP fp16 vs fp32 = n/a (HIP forward failed)"
            );
        }
    }
    match sim_diverge {
        Some((n, rows)) => {
            eprintln!(
                "[--diagnose] routing divergence: sim fp16 vs fp32 = {n}/{rows} rows differ in top-K"
            );
            if n > 0 {
                eprintln!(
                    "[--diagnose] [ROUTING DIVERGENT] simulated fp16 softmax picked different experts than fp32 on {n} of {rows} batch rows"
                );
            }
        }
        None => {
            eprintln!("[--diagnose] routing divergence: sim fp16 vs fp32 = n/a");
        }
    }

    // ---- Verdict ----
    // Use the WORSE of (sim/fp32 max, HIP/fp32 max) so we don't
    // declare "noise is not a major issue" just because the
    // simulated-fp16 path looks tame. 2x and 5x are the two
    // thresholds — informative without being alarmist. The value
    // and its source label are chosen together so they can never
    // disagree (a non-finite HIP ratio is always attributed to HIP).
    let (ratio_max, source) = match ratio_max_hip {
        Some(hip) if !hip.is_finite() || hip > ratio_max_sim => (hip, "HIP"),
        _ => (ratio_max_sim, "sim"),
    };
    let verdict = if !ratio_max.is_finite() {
        // A non-finite ratio (fp16 produced a non-zero grad where
        // the fp32 gradient is exactly zero) is the smoking gun.
        "fp16/fp32 ratio is +inf (fp16 noise produced a non-zero grad where fp32 produced zero): noise IS a major issue"
    } else if ratio_max <= 2.0 {
        "fp16/fp32 ratio within 2x: noise is NOT a major issue"
    } else if ratio_max <= 5.0 {
        "fp16/fp32 ratio within 5x: noise is a moderate issue"
    } else {
        "fp16/fp32 ratio > 5x: noise IS a major issue"
    };
    let fmt_ratio = |r: f32| -> String {
        if r.is_finite() {
            format!("{:.2}", r)
        } else {
            "inf".to_string()
        }
    };
    eprintln!(
        "[--diagnose] verdict: {verdict}  (worst-case ratio = {} from {}; \
         sim={}, hip={}, routing_divergent_rows=HIP:{}/{} sim:{}/{})",
        fmt_ratio(ratio_max),
        source,
        fmt_ratio(ratio_max_sim),
        match ratio_max_hip {
            Some(h) => fmt_ratio(h),
            None => "n/a".to_string(),
        },
        hip_diverge_n,
        hip_diverge.map_or(0, |(_, rows)| rows),
        sim_diverge_n,
        sim_diverge.map_or(0, |(_, rows)| rows),
    );

    Ok(())
}

#[cfg(test)]
mod lr_schedule_tests {
    //! Regression pins for `TrainConfig::lr_at`.
    //!
    //! Each test fixes the schedule at hand-computed points for
    //! `lr=1.0, warmup=4, min=0.1, steps=20`, so that any change to
    //! the warmup / cosine / clamp logic shows up as a test failure
    //! instead of a silent change in the per-step learning rate.
    //! The configuration is picked so that the warmup ramp, the
    //! cosine interior and the cosine endpoint (progress = 1.0) all
    //! land on integer steps.
    use super::*;

    /// Asserts that `lr_at(step)` is within `1e-6` of the expected
    /// value.
    macro_rules! assert_lr {
        ($cfg:expr, $step:expr => $want:expr) => {
            assert!(($cfg.lr_at($step) - $want).abs() < 1e-6)
        };
    }

    /// Baseline configuration shared by every test; individual
    /// tests override only the schedule fields they exercise.
    fn cfg() -> TrainConfig {
        TrainConfig {
            // Schedule under test.
            lr: 1.0,
            steps: 20,
            warmup_steps: 4,
            min_lr_ratio: 0.1,
            // Remaining fields: fixed filler values.
            model_kind: ModelKind::MoE,
            router_kind: RouterKind::SheafPadic,
            size: MoESize::Nano,
            batch_size: 4,
            weight_decay: 0.0,
            grad_clip: 1.0,
            seed: 1,
            checkpoint_dir: "/tmp/lr-schedule-test".into(),
            metrics_path: "/tmp/lr-schedule-test/m.jsonl".into(),
            arch_path: "/tmp/lr-schedule-test/a.json".into(),
            quiet: true,
            dataset_spec: "empty".into(),
            use_hip: false,
            dry_run: true,
            optimizer: Optimizer::Sgd,
            momentum: 0.9,
            router_lr_scale: 1.0,
            diagnose: false,
        }
    }

    #[test]
    fn warmup_is_linear_from_zero() {
        let sched = cfg();
        // The ramp starts at exactly zero, not at `lr`.
        assert_eq!(sched.lr_at(0), 0.0);
        // Linear ramp: lr * step / warmup.
        assert_lr!(sched, 1 => 0.25);
        assert_lr!(sched, 2 => 0.50);
        assert_lr!(sched, 3 => 0.75);
        // Step 4 opens the cosine window at progress 0, where
        // 0.5 * (1 + cos(0)) = 1: the schedule is continuous across
        // the warmup -> cosine boundary and equals the base LR.
        assert_lr!(sched, 4 => 1.0);
    }

    #[test]
    fn cosine_decay_endpoint_is_min_lr() {
        let sched = cfg();
        // Cosine window is [4, 20] (span 16); step 20 is progress
        // 1.0, where the LR bottoms out at min_lr = 0.1.
        assert_lr!(sched, 20 => 0.1);
        // The last step the runner actually takes is 19, at
        // progress 15/16. The cosine factor there is
        // 0.5 * (1 + cos(15π/16)) ≈ 0.0096, giving
        // 0.1 + 0.9 * 0.0096 ≈ 0.1087.
        let lr_19 = sched.lr_at(19);
        assert!(
            (lr_19 - 0.1087).abs() < 1e-3,
            "lr_at(19) should be ~0.1087 (cosine interior near the end), got {lr_19}"
        );
    }

    #[test]
    fn cosine_decay_midpoint_is_average_of_base_and_min() {
        // Progress 0.5 sits at step 4 + 16/2 = 12. There the cosine
        // factor is 0.5 * (1 + cos(π/2)) = 0.5, i.e. the LR is the
        // mean of base (1.0) and min (0.1): 0.55.
        let sched = cfg();
        assert_lr!(sched, 12 => 0.55);
    }

    #[test]
    fn warmup_zero_skips_warmup_but_still_cosines() {
        // With no warmup the cosine window starts at step 0, so the
        // schedule is "decay only" — not a constant LR (that would
        // be min_lr_ratio = 1.0).
        let sched = TrainConfig {
            warmup_steps: 0,
            ..cfg()
        };
        // Also guards against a divide-by-zero in the warmup branch.
        assert_eq!(sched.lr_at(0), 1.0);
        // Step 1 of 20: progress 0.05, cos(0.05π) ≈ 0.9877, cosine
        // factor ≈ 0.9939, LR ≈ 0.1 + 0.9 * 0.9939 ≈ 0.9945.
        let lr_1 = sched.lr_at(1);
        assert!(
            (lr_1 - 0.9945).abs() < 1e-3,
            "lr_at(1) with warmup=0 should be ~0.9945 (cosine interior near start), got {lr_1}"
        );
    }

    #[test]
    fn steps_at_or_below_warmup_hold_at_base_lr() {
        // steps <= warmup_steps leaves a zero-length cosine window;
        // the schedule must hold at the base LR instead of dividing
        // by a zero decay span.
        let sched = TrainConfig {
            steps: 4,
            warmup_steps: 4,
            ..cfg()
        };
        // Steps 0..=3 are still on the linear ramp.
        assert_lr!(sched, 0 => 0.0);
        assert_lr!(sched, 1 => 0.25);
        assert_lr!(sched, 2 => 0.50);
        assert_lr!(sched, 3 => 0.75);
        // Step 4 is past the ramp and hits the hold-at-base branch.
        assert_lr!(sched, 4 => 1.0);
    }

    #[test]
    fn step_past_end_clamps_to_min_lr() {
        // A step beyond `steps` is not expected in normal operation,
        // but the schedule must neither panic nor produce NaN: it
        // stays pinned at min_lr.
        let sched = cfg();
        assert_lr!(sched, 100 => 0.1);
    }
}