use crate::estimate::EstimationError;
use crate::solver::estimate::reml::unified::BarrierConfig;
use ::opt::{
Arc as ArcOptimizer, ArcError, Bfgs, BfgsError, Bounds, FirstOrderObjective, FirstOrderSample,
FixedPoint, FixedPointError, FixedPointObjective, FixedPointSample, FixedPointStatus,
MaxIterations, ObjectiveEvalError, SecondOrderObjective, SecondOrderSample, Solution,
Tolerance, ZerothOrderObjective,
};
use ndarray::{Array1, Array2, ArrayView2};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering};
/// Shared atomic channel between the outer optimizer and the inner solver:
/// the inner solver publishes its most recent progress here, and the outer
/// optimizer writes an iteration cap into `cap` for the next inner solve.
///
/// Float-valued fields (`ift_residual`, `accept_rho`) are stored as raw
/// `f64` bit patterns inside an `AtomicU64` and decoded in `snapshot`.
#[derive(Clone, Debug)]
pub struct InnerProgressFeedback {
    // Iteration cap imposed on the inner solver (0 is treated as "uncapped"
    // by the cap schedule in this module).
    pub cap: Arc<AtomicUsize>,
    // Iterations consumed by the most recent inner solve; 0 means "no
    // history yet" and suppresses `snapshot`.
    pub last_iters: Arc<AtomicUsize>,
    // Whether the most recent inner solve converged.
    pub last_converged: Arc<AtomicBool>,
    // Bit pattern of the last IFT residual; decoded values that are
    // non-finite or negative are reported as "unknown".
    pub ift_residual: Arc<AtomicU64>,
    // Bit pattern of the last step-acceptance ratio; same encoding rules.
    pub accept_rho: Arc<AtomicU64>,
}
impl InnerProgressFeedback {
    /// Reads the published inner-solver history into an immutable snapshot.
    ///
    /// Returns `None` until at least one inner solve has run (signalled by
    /// `last_iters == 0`). The float fields are decoded from their bit
    /// patterns and dropped unless finite and non-negative. Loads are
    /// `Relaxed`: the fields are not read as one atomic unit, so a snapshot
    /// may mix values from adjacent inner solves — acceptable for a
    /// heuristic cap schedule.
    fn snapshot(&self) -> Option<InnerProgressSnapshot> {
        let iters = self.last_iters.load(Ordering::Relaxed);
        if iters == 0 {
            None
        } else {
            let residual_bits = self.ift_residual.load(Ordering::Relaxed);
            let r = f64::from_bits(residual_bits);
            let last_ift_residual = if r.is_finite() && r >= 0.0 {
                Some(r)
            } else {
                None
            };
            let accept_rho_bits = self.accept_rho.load(Ordering::Relaxed);
            let ar = f64::from_bits(accept_rho_bits);
            let last_accept_rho = if ar.is_finite() && ar >= 0.0 {
                Some(ar)
            } else {
                None
            };
            Some(InnerProgressSnapshot {
                last_iters: iters,
                last_converged: self.last_converged.load(Ordering::Relaxed),
                last_ift_residual,
                last_accept_rho,
            })
        }
    }
}
/// Point-in-time copy of `InnerProgressFeedback`, decoded for the cap
/// schedule in `first_order_inner_cap_schedule`.
#[derive(Clone, Copy, Debug)]
struct InnerProgressSnapshot {
    // Iterations used by the most recent inner solve (always > 0 here).
    last_iters: usize,
    // Whether that solve converged.
    last_converged: bool,
    // Last IFT residual, if a finite non-negative value was published.
    last_ift_residual: Option<f64>,
    // Last acceptance ratio, if a finite non-negative value was published.
    last_accept_rho: Option<f64>,
}
/// How (if at all) an outer Hessian operator can be turned into a dense
/// matrix.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OuterHessianMaterialization {
    Unavailable,
    RepeatedHvp,
    BatchedHvp,
    Explicit,
}
impl OuterHessianMaterialization {
    /// True for every strategy that can produce a dense matrix; only
    /// `Unavailable` cannot.
    fn is_available(self) -> bool {
        match self {
            Self::Unavailable => false,
            Self::RepeatedHvp | Self::BatchedHvp | Self::Explicit => true,
        }
    }
}
/// Matrix-free view of the outer-objective Hessian: implementors supply
/// `dim` and a Hessian-vector product; block products and dense
/// materialization have default implementations built on `matvec`.
pub trait OuterHessianOperator: Send + Sync {
    /// Number of rows/columns of the (square) operator.
    fn dim(&self) -> usize;
    /// Applies the operator to `v`; errors are reported as strings.
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String>;
    /// Hint that `dim` repeated matvecs are affordable; defaults to false.
    fn is_cheap_to_materialize(&self) -> bool {
        false
    }
    /// Declared materialization strategy; the default derives it from
    /// `is_cheap_to_materialize` (RepeatedHvp when cheap, else Unavailable).
    fn materialization_capability(&self) -> OuterHessianMaterialization {
        if self.is_cheap_to_materialize() {
            OuterHessianMaterialization::RepeatedHvp
        } else {
            OuterHessianMaterialization::Unavailable
        }
    }
    /// Computes `H * factor` column by column. The default runs the columns
    /// through rayon in parallel and validates both the input row count and
    /// each matvec's output length against `dim`.
    fn mul_mat(&self, factor: ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
        use rayon::iter::{IntoParallelIterator, ParallelIterator};
        let dim = self.dim();
        if factor.nrows() != dim {
            return Err(format!(
                "outer Hessian operator factor row count mismatch: got {}, expected {}",
                factor.nrows(),
                dim
            ));
        }
        let m = factor.ncols();
        // Each column is an independent matvec; collect short-circuits on
        // the first error.
        let cols: Result<Vec<Array1<f64>>, String> = (0..m)
            .into_par_iter()
            .map(|j| {
                let col = factor.column(j).to_owned();
                let hv = self.matvec(&col)?;
                if hv.len() != dim {
                    return Err(format!(
                        "outer Hessian operator matvec length mismatch: got {}, expected {}",
                        hv.len(),
                        dim
                    ));
                }
                Ok(hv)
            })
            .collect();
        let cols = cols?;
        let mut out = Array2::<f64>::zeros((dim, m));
        for (j, hv) in cols.into_iter().enumerate() {
            out.column_mut(j).assign(&hv);
        }
        Ok(out)
    }
    /// Materializes the operator as a dense matrix via `mul_mat(I)`,
    /// symmetrizes the result (average of mirror entries), and rejects
    /// non-finite output. Callers should consult
    /// `materialization_capability` before paying for this.
    fn materialize_dense(&self) -> Result<Array2<f64>, String> {
        let dim = self.dim();
        let identity = Array2::<f64>::eye(dim);
        let mut dense = self.mul_mat(identity.view())?;
        if dense.nrows() != dim || dense.ncols() != dim {
            return Err(format!(
                "outer Hessian operator mul_mat returned {}x{}, expected {}x{}",
                dense.nrows(),
                dense.ncols(),
                dim,
                dim
            ));
        }
        // Enforce exact symmetry: a Hessian should be symmetric, but
        // matvec round-off can leave tiny asymmetries.
        for row in 0..dim {
            for col in (row + 1)..dim {
                let sym = 0.5 * (dense[[row, col]] + dense[[col, row]]);
                dense[[row, col]] = sym;
                dense[[col, row]] = sym;
            }
        }
        if !dense.iter().all(|value| value.is_finite()) {
            return Err(
                "outer Hessian dense materialization produced non-finite entries".to_string(),
            );
        }
        Ok(dense)
    }
}
/// Decorator over a base [`OuterHessianOperator`] that adds a dense
/// `rho_block` contribution to the leading `k x k` coordinates
/// (`k = rho_block.nrows()`), without materializing the base operator.
struct RhoBlockAdditiveOuterHessian {
    // Operator supplying the full `dim x dim` action.
    base: Arc<dyn OuterHessianOperator>,
    // Square dense block added to the leading coordinates.
    rho_block: Array2<f64>,
    // Cached operator dimension (taken from `base.dim()` at construction).
    dim: usize,
}
impl OuterHessianOperator for RhoBlockAdditiveOuterHessian {
    fn dim(&self) -> usize {
        self.dim
    }
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        if v.len() != self.dim {
            return Err(format!(
                "outer Hessian operator input length mismatch: got {}, expected {}",
                v.len(),
                self.dim
            ));
        }
        let mut out = self.base.matvec(v)?;
        let k = self.rho_block.nrows();
        if k > 0 {
            // Bug fix: mirror the shape guard used in `mul_mat`. Previously
            // an oversized rho_block made the slices below panic instead of
            // surfacing a recoverable Err like the rest of this impl.
            if k > self.dim || k > out.len() {
                return Err(format!(
                    "rho-block Hessian update shape mismatch: rho_block is {}x{}, matvec output has {} rows",
                    self.rho_block.nrows(),
                    self.rho_block.ncols(),
                    out.len()
                ));
            }
            let rho_v = v.slice(ndarray::s![..k]).to_owned();
            let rho_out = self.rho_block.dot(&rho_v);
            out.slice_mut(ndarray::s![..k]).scaled_add(1.0, &rho_out);
        }
        Ok(out)
    }
    fn mul_mat(&self, factor: ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
        let mut out = self.base.mul_mat(factor)?;
        let k = self.rho_block.nrows();
        if k > 0 {
            if k > out.nrows() {
                return Err(format!(
                    "rho-block Hessian update shape mismatch: rho_block is {}x{}, mul_mat output has {} rows",
                    self.rho_block.nrows(),
                    self.rho_block.ncols(),
                    out.nrows()
                ));
            }
            let factor_top = factor.slice(ndarray::s![..k, ..]);
            let rho_contrib = self.rho_block.dot(&factor_top);
            out.slice_mut(ndarray::s![..k, ..])
                .scaled_add(1.0, &rho_contrib);
        }
        Ok(out)
    }
    fn is_cheap_to_materialize(&self) -> bool {
        // The dense rho-block add is trivial; cost is dominated by `base`.
        self.base.is_cheap_to_materialize()
    }
    fn materialization_capability(&self) -> OuterHessianMaterialization {
        self.base.materialization_capability()
    }
}
pub(crate) const OUTER_HVP_MATERIALIZE_MAX_DIM: usize = 64;
struct HvApplyCounter<'a> {
inner: &'a dyn OuterHessianOperator,
count: std::sync::atomic::AtomicU64,
}
impl<'a> HvApplyCounter<'a> {
fn new(inner: &'a dyn OuterHessianOperator) -> Self {
Self {
inner,
count: std::sync::atomic::AtomicU64::new(0),
}
}
fn count(&self) -> u64 {
self.count.load(std::sync::atomic::Ordering::Relaxed)
}
}
impl<'a> OuterHessianOperator for HvApplyCounter<'a> {
fn dim(&self) -> usize {
self.inner.dim()
}
fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
self.count
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
self.inner.matvec(v)
}
fn mul_mat(&self, factor: ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
self.count
.fetch_add(factor.ncols() as u64, std::sync::atomic::Ordering::Relaxed);
self.inner.mul_mat(factor)
}
fn materialization_capability(&self) -> OuterHessianMaterialization {
self.inner.materialization_capability()
}
}
/// Whether an analytic derivative (gradient or Hessian) is available for
/// the outer objective; drives solver planning in `plan`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Derivative {
    Analytic,
    Unavailable,
}
// Below this parameter count, BFGS is preferred over the EFS fixed-point
// variants regardless of availability (see `efs_plan_eligible`).
const SMALL_OUTER_BFGS_MAX_PARAMS: usize = 8;
// Upper parameter count for the dense analytic-Hessian probe in
// `validate_second_order_seed_hessian`.
const SECOND_ORDER_GEOMETRY_PROBE_MAX_PARAMS: usize = 64;
/// Layout of the outer parameter vector θ: `n_params` total coordinates, of
/// which the trailing `psi_dim` are ψ coordinates and the remaining
/// `rho_dim()` are ρ/penalty-like coordinates.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct OuterThetaLayout {
    // Total number of outer parameters.
    pub n_params: usize,
    // Number of ψ coordinates (0 when everything is penalty-like).
    pub psi_dim: usize,
}
impl OuterThetaLayout {
    pub fn new(n_params: usize, psi_dim: usize) -> Self {
        Self { n_params, psi_dim }
    }
    /// Number of ρ coordinates; saturating so an invalid layout
    /// (psi_dim > n_params) reports 0 instead of underflowing.
    pub fn rho_dim(&self) -> usize {
        self.n_params.saturating_sub(self.psi_dim)
    }
    /// Rejects layouts where psi_dim exceeds n_params; used when a
    /// capability is first accepted (hence the `EstimationError` type).
    fn validate_capability(&self, context: &str) -> Result<(), EstimationError> {
        if self.psi_dim > self.n_params {
            return Err(EstimationError::RemlOptimizationFailed(format!(
                "{context}: invalid outer theta layout (psi_dim={} exceeds n_params={})",
                self.psi_dim, self.n_params
            )));
        }
        Ok(())
    }
    /// Checks that a θ point has exactly `n_params` entries (recoverable
    /// error, so the optimizer can back off rather than abort).
    fn validate_point_len(
        &self,
        theta: &Array1<f64>,
        context: &str,
    ) -> Result<(), ObjectiveEvalError> {
        if theta.len() != self.n_params {
            return Err(ObjectiveEvalError::recoverable(format!(
                "{context}: outer theta length mismatch: got {}, expected {} (rho_dim={}, psi_dim={})",
                theta.len(),
                self.n_params,
                self.rho_dim(),
                self.psi_dim
            )));
        }
        Ok(())
    }
    /// Checks that a gradient vector matches the layout length.
    fn validate_gradient_len(
        &self,
        gradient: &Array1<f64>,
        context: &str,
    ) -> Result<(), ObjectiveEvalError> {
        if gradient.len() != self.n_params {
            return Err(ObjectiveEvalError::recoverable(format!(
                "{context}: outer gradient length mismatch: got {}, expected {} (rho_dim={}, psi_dim={})",
                gradient.len(),
                self.n_params,
                self.rho_dim(),
                self.psi_dim
            )));
        }
        Ok(())
    }
    /// Checks that a dense Hessian is `n_params x n_params`.
    fn validate_hessian_shape(
        &self,
        hessian: &Array2<f64>,
        context: &str,
    ) -> Result<(), ObjectiveEvalError> {
        if hessian.nrows() != self.n_params || hessian.ncols() != self.n_params {
            return Err(ObjectiveEvalError::recoverable(format!(
                "{context}: outer Hessian shape mismatch: got {}x{}, expected {}x{} (rho_dim={}, psi_dim={})",
                hessian.nrows(),
                hessian.ncols(),
                self.n_params,
                self.n_params,
                self.rho_dim(),
                self.psi_dim
            )));
        }
        Ok(())
    }
    /// Shape validation for an EFS evaluation: the step vector must cover
    /// all parameters, and the optional ψ-gradient / ψ-index payloads must
    /// match `psi_dim` with all indices in range.
    fn validate_efs_eval(&self, eval: &EfsEval, context: &str) -> Result<(), ObjectiveEvalError> {
        if eval.steps.len() != self.n_params {
            return Err(ObjectiveEvalError::recoverable(format!(
                "{context}: outer EFS step length mismatch: got {}, expected {} (rho_dim={}, psi_dim={})",
                eval.steps.len(),
                self.n_params,
                self.rho_dim(),
                self.psi_dim
            )));
        }
        if let Some(ref psi_gradient) = eval.psi_gradient {
            if psi_gradient.len() != self.psi_dim {
                return Err(ObjectiveEvalError::recoverable(format!(
                    "{context}: outer EFS psi-gradient length mismatch: got {}, expected {}",
                    psi_gradient.len(),
                    self.psi_dim
                )));
            }
        }
        if let Some(ref psi_indices) = eval.psi_indices {
            if psi_indices.len() != self.psi_dim {
                return Err(ObjectiveEvalError::recoverable(format!(
                    "{context}: outer EFS psi-index count mismatch: got {}, expected {}",
                    psi_indices.len(),
                    self.psi_dim
                )));
            }
            if psi_indices.iter().any(|&idx| idx >= self.n_params) {
                return Err(ObjectiveEvalError::recoverable(format!(
                    "{context}: outer EFS psi index out of range for n_params={}",
                    self.n_params
                )));
            }
        }
        Ok(())
    }
}
/// Everything the planner needs to know about an outer objective: which
/// derivatives it offers, its parameter layout, and routing switches.
#[derive(Clone, Debug)]
pub struct OuterCapability {
    // Availability of an analytic outer gradient.
    pub gradient: Derivative,
    // Availability of an analytic outer Hessian.
    pub hessian: Derivative,
    // Total outer parameter count (ρ + ψ).
    pub n_params: usize,
    // Number of ψ coordinates (0 = purely penalty-like problem).
    pub psi_dim: usize,
    // Whether an EFS-style fixed-point update is implemented.
    pub fixed_point_available: bool,
    // Barrier configuration, if the objective runs with a barrier term.
    pub barrier_config: Option<BarrierConfig>,
    // Routing hint: prefer first-order solvers even when more is available.
    pub prefer_gradient_only: bool,
    // Hard switch that removes EFS/HybridEFS from planning.
    pub disable_fixed_point: bool,
}
impl OuterCapability {
    /// Layout view (n_params, psi_dim) of this capability.
    pub fn theta_layout(&self) -> OuterThetaLayout {
        OuterThetaLayout::new(self.n_params, self.psi_dim)
    }
    /// Validates psi_dim <= n_params; call before planning.
    pub fn validate_layout(&self, context: &str) -> Result<(), EstimationError> {
        self.theta_layout().validate_capability(context)
    }
    /// True when every coordinate is a ρ/penalty-like parameter.
    pub fn all_penalty_like(&self) -> bool {
        self.psi_dim == 0
    }
    /// True when at least one ψ coordinate exists.
    pub fn has_psi_coords(&self) -> bool {
        self.psi_dim > 0
    }
    /// Pure-EFS eligibility: fixed point implemented and enabled, no ψ
    /// coordinates, and the problem is large enough that BFGS is not
    /// automatically preferred.
    fn efs_plan_eligible(&self) -> bool {
        self.fixed_point_available
            && !self.disable_fixed_point
            && self.all_penalty_like()
            && self.n_params > SMALL_OUTER_BFGS_MAX_PARAMS
    }
    /// Hybrid-EFS eligibility: same as above but *with* ψ coordinates
    /// (EFS handles ρ, a gradient method handles ψ).
    fn hybrid_efs_plan_eligible(&self) -> bool {
        self.fixed_point_available
            && !self.disable_fixed_point
            && self.has_psi_coords()
            && self.n_params > SMALL_OUTER_BFGS_MAX_PARAMS
    }
    /// Hessian availability as seen by the planner. Currently an identity
    /// mapping; the exhaustive match keeps this a deliberate decision point
    /// (a new `Derivative` variant forces a compile error here).
    fn declared_hessian_for_planning(&self) -> Derivative {
        match self.hessian {
            Derivative::Analytic => Derivative::Analytic,
            Derivative::Unavailable => Derivative::Unavailable,
        }
    }
}
/// Outer solver chosen by the planner.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Solver {
    // Second-order trust-region style solver (analytic Hessian required).
    Arc,
    // Quasi-Newton with analytic or unavailable gradient.
    Bfgs,
    // Fixed-point EFS update over a purely penalty-like θ.
    Efs,
    // EFS over ρ combined with a gradient method over ψ.
    HybridEfs,
    // Derivative-free pattern search; auxiliary use only.
    CompassSearch,
}
/// Which role the solver plays; auxiliary gradient-free runs can route to
/// `CompassSearch` when nothing better exists (see `plan_with_class`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub enum SolverClass {
    #[default]
    Primary,
    AuxiliaryGradientFree,
}
#[inline]
fn effective_seed_budget(
requested_budget: usize,
solver: Solver,
risk_profile: crate::seeding::SeedRiskProfile,
screening_enabled: bool,
) -> usize {
let requested_budget = requested_budget.max(1);
let capped = match (solver, risk_profile) {
(Solver::Efs | Solver::HybridEfs, _) => 1,
(Solver::Arc, crate::seeding::SeedRiskProfile::Survival) => 1,
(Solver::Arc, crate::seeding::SeedRiskProfile::GeneralizedLinear) if screening_enabled => 1,
(Solver::Arc, crate::seeding::SeedRiskProfile::GeneralizedLinear) => 2,
(Solver::CompassSearch, _) => 1,
_ => requested_budget,
};
requested_budget.min(capped)
}
/// Decides whether to run the seed-screening cascade: requires a screening
/// cap handle in the config, more generated seeds than the budget allows,
/// and a solver expensive enough that screening pays off.
#[inline]
fn should_screen_seeds(
    config: &OuterConfig,
    solver: Solver,
    generated_seed_count: usize,
    seed_budget: usize,
) -> bool {
    let solver_benefits = matches!(solver, Solver::Arc | Solver::Efs | Solver::HybridEfs);
    let surplus_seeds = generated_seed_count > seed_budget;
    config.screening_cap.is_some() && surplus_seeds && solver_benefits
}
#[inline]
fn expensive_unsuccessful_seed_limit(
solver: Solver,
risk_profile: crate::seeding::SeedRiskProfile,
) -> Option<usize> {
match (solver, risk_profile) {
(Solver::Efs | Solver::HybridEfs, _) => Some(1),
(Solver::Arc, crate::seeding::SeedRiskProfile::Survival) => Some(1),
(Solver::Arc, crate::seeding::SeedRiskProfile::GeneralizedLinear) => Some(2),
(Solver::CompassSearch, _) => Some(1),
_ => None,
}
}
// Escalation factors applied to the initial screening cap across cascade
// stages (a final uncapped stage follows; see `rank_seeds_with_screening`).
const SEED_SCREENING_CASCADE_MULTIPLIERS: [usize; 3] = [1, 4, 16];
// Sentinel cap value meaning "no inner-iteration limit".
const SEED_SCREENING_UNCAPPED: usize = 0;
/// Orders candidate seeds cheapest-first by screening-proxy cost.
///
/// Runs a cascade of inner-iteration caps (initial, x4, x16, uncapped):
/// each stage evaluates every seed under the current cap and stops at the
/// first stage that yields at least one finite cost. Seeds with finite
/// costs come first (ascending cost, ties by original index); seeds that
/// never produced a finite cost keep their heuristic order at the end, so
/// the returned vector always has the same length as `seeds`. If no stage
/// produces a finite cost, the original order is returned unchanged.
/// The screening cap is restored to its previous value before returning.
fn rank_seeds_with_screening(
    obj: &mut dyn OuterObjective,
    config: &OuterConfig,
    context: &str,
    seeds: &[Array1<f64>],
) -> Vec<Array1<f64>> {
    // No screening-cap handle means screening is disabled entirely.
    let Some(screening_cap) = config.screening_cap.as_ref() else {
        return seeds.to_vec();
    };
    let initial_cap = config.seed_config.screen_max_inner_iterations.max(1);
    // Remember the cap in force before screening so it can be restored.
    let previous_cap = screening_cap.swap(initial_cap, Ordering::Relaxed);
    let cascade_caps = [
        initial_cap.saturating_mul(SEED_SCREENING_CASCADE_MULTIPLIERS[0]),
        initial_cap.saturating_mul(SEED_SCREENING_CASCADE_MULTIPLIERS[1]),
        initial_cap.saturating_mul(SEED_SCREENING_CASCADE_MULTIPLIERS[2]),
        SEED_SCREENING_UNCAPPED,
    ];
    let mut ranked: Vec<(usize, f64)> = Vec::with_capacity(seeds.len());
    let mut rejected = 0usize;
    let mut final_cap_used = initial_cap;
    let mut stages_consumed = 0usize;
    let cascade_start = std::time::Instant::now();
    for (stage, &cap) in cascade_caps.iter().enumerate() {
        screening_cap.store(cap, Ordering::Relaxed);
        // Each stage re-evaluates every seed from scratch.
        ranked.clear();
        rejected = 0;
        for (idx, seed) in seeds.iter().enumerate() {
            // Reset between seeds so state from one proxy solve cannot
            // leak into the next.
            obj.reset();
            match obj.eval_screening_proxy(seed) {
                Ok(cost) if cost.is_finite() => ranked.push((idx, cost)),
                Ok(_) | Err(_) => rejected += 1,
            }
        }
        final_cap_used = cap;
        stages_consumed = stage + 1;
        if !ranked.is_empty() {
            if stage > 0 {
                log::info!(
                    "[OUTER] {context}: seed screening cap escalated from {} to {} \
                     (initial cap was too shallow for this problem; {}/{} seeds ranked)",
                    initial_cap,
                    if cap == 0 {
                        "uncapped".to_string()
                    } else {
                        cap.to_string()
                    },
                    ranked.len(),
                    seeds.len(),
                );
            }
            break;
        }
    }
    screening_cap.store(previous_cap, Ordering::Relaxed);
    obj.reset();
    log::info!(
        "[OUTER] {context}: seed screening cascade complete elapsed={:.3}s stages_used={} final_cap={} ranked={}/{}",
        cascade_start.elapsed().as_secs_f64(),
        stages_consumed,
        if final_cap_used == 0 {
            "uncapped".to_string()
        } else {
            final_cap_used.to_string()
        },
        ranked.len(),
        seeds.len(),
    );
    if ranked.is_empty() {
        log::info!(
            "[OUTER] {context}: no finite seed cost even with full inner budget \
             ({} seeds, {} rejected, {} cascade stages tried); keeping heuristic order",
            seeds.len(),
            rejected,
            stages_consumed,
        );
        return seeds.to_vec();
    }
    // Ascending cost; ties broken by the original (heuristic) index so the
    // sort is fully deterministic.
    ranked.sort_by(|(idx_a, cost_a), (idx_b, cost_b)| {
        cost_a.total_cmp(cost_b).then_with(|| idx_a.cmp(idx_b))
    });
    let mut ordered = Vec::with_capacity(seeds.len());
    let mut seen = vec![false; seeds.len()];
    for (idx, _) in ranked {
        seen[idx] = true;
        ordered.push(seeds[idx].clone());
    }
    // Unranked (rejected) seeds are appended in their original order.
    for (idx, seed) in seeds.iter().enumerate() {
        if !seen[idx] {
            ordered.push(seed.clone());
        }
    }
    log::debug!(
        "[OUTER] {context}: seed screening ranked {}/{} candidates at cap={} \
         (initial cap={}, stages used={}); rejected={}",
        ordered.len() - rejected,
        seeds.len(),
        if final_cap_used == 0 {
            "uncapped".to_string()
        } else {
            final_cap_used.to_string()
        },
        initial_cap,
        stages_consumed,
        rejected,
    );
    ordered
}
/// True when `candidate` should replace the incumbent best result:
/// convergence status dominates, then lower final objective value wins.
#[inline]
fn candidate_improves_best(candidate: &OuterResult, best: Option<&OuterResult>) -> bool {
    let Some(best) = best else {
        // No incumbent yet — anything is an improvement.
        return true;
    };
    if candidate.converged != best.converged {
        candidate.converged
    } else {
        candidate.final_value < best.final_value
    }
}
/// Where the Hessian information used by the chosen solver comes from.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum HessianSource {
    Analytic,
    BfgsApprox,
    EfsFixedPoint,
    HybridEfsFixedPoint,
}
/// How much of the objective an evaluation call is expected to compute.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OuterEvalOrder {
    ValueAndGradient,
    ValueGradientHessian,
}
/// The planner's verdict: which solver to run and where its curvature
/// information comes from.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct OuterPlan {
    pub solver: Solver,
    pub hessian_source: HessianSource,
}
// Sentinel substring embedded in error messages that request an immediate
// switch to the first-order fallback (matched in
// `requests_immediate_first_order_fallback`).
pub(crate) const EFS_FIRST_ORDER_FALLBACK_MARKER: &str = "[outer-efs-first-order-fallback]";
/// Whether automatic solver degradation is allowed after a failed attempt.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FallbackPolicy {
    Automatic,
    Disabled,
}
impl std::fmt::Display for OuterPlan {
    /// Human-readable "solver=…, hessian_source=…" rendering used in logs.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "solver={:?}, hessian_source={:?}",
            self.solver, self.hessian_source
        )
    }
}
impl OuterPlan {
    /// Compact, machine-parsable routing summary for the planning log.
    pub fn routing_log_line(&self) -> String {
        // NOTE(review): `matrix-free` is reported as true exactly when the
        // Hessian source is `Analytic` — confirm this matches the intended
        // meaning of "matrix-free" downstream.
        let matrix_free = self.hessian_source == HessianSource::Analytic;
        format!(
            "solver={:?};hessian={:?};matrix-free={}",
            self.solver, self.hessian_source, matrix_free
        )
    }
}
/// Chooses the outer solver from the declared derivative availability.
///
/// Arm order is load-bearing: full second-order goes to Arc; otherwise the
/// EFS fixed point is preferred when eligible (pure-ρ first, then hybrid),
/// and BFGS is the final catch-all for anything without an analytic
/// Hessian.
pub fn plan(cap: &OuterCapability) -> OuterPlan {
    use Derivative::*;
    use HessianSource as H;
    use Solver as S;
    match (cap.gradient, cap.declared_hessian_for_planning()) {
        // Gradient + Hessian: full second-order solver.
        (Analytic, Analytic) => OuterPlan {
            solver: S::Arc,
            hessian_source: H::Analytic,
        },
        // No Hessian, pure penalty-like problem: EFS fixed point,
        // regardless of gradient availability.
        (Analytic, Unavailable) if cap.efs_plan_eligible() => OuterPlan {
            solver: S::Efs,
            hessian_source: H::EfsFixedPoint,
        },
        (Unavailable, Unavailable) if cap.efs_plan_eligible() => OuterPlan {
            solver: S::Efs,
            hessian_source: H::EfsFixedPoint,
        },
        // ψ coordinates present: hybrid EFS(ρ) + gradient(ψ).
        (Analytic, Unavailable) if cap.hybrid_efs_plan_eligible() => OuterPlan {
            solver: S::HybridEfs,
            hessian_source: H::HybridEfsFixedPoint,
        },
        (Unavailable, Unavailable) if cap.hybrid_efs_plan_eligible() => OuterPlan {
            solver: S::HybridEfs,
            hessian_source: H::HybridEfsFixedPoint,
        },
        // Fallbacks: quasi-Newton with approximate curvature.
        (Analytic, Unavailable) => OuterPlan {
            solver: S::Bfgs,
            hessian_source: H::BfgsApprox,
        },
        (Unavailable, _) => OuterPlan {
            solver: S::Bfgs,
            hessian_source: H::BfgsApprox,
        },
    }
}
pub fn plan_with_class(cap: &OuterCapability, class: SolverClass) -> OuterPlan {
use Derivative::*;
if class == SolverClass::AuxiliaryGradientFree
&& cap.gradient == Unavailable
&& cap.declared_hessian_for_planning() == Unavailable
&& !cap.efs_plan_eligible()
&& !cap.hybrid_efs_plan_eligible()
{
return OuterPlan {
solver: Solver::CompassSearch,
hessian_source: HessianSource::BfgsApprox,
};
}
plan(cap)
}
/// Logs the chosen plan with context-specific annotations (BFGS-fallback
/// warning, barrier guard, hybrid-EFS note).
pub fn log_plan(context: &str, cap: &OuterCapability, the_plan: &OuterPlan) {
    let hess_warning = match the_plan.hessian_source {
        HessianSource::BfgsApprox if cap.n_params > 0 => {
            " [no Hessian: BFGS approximation]".to_string()
        }
        _ => String::new(),
    };
    let barrier_note = if cap.barrier_config.is_some() && cap.efs_plan_eligible() {
        " [EFS with runtime barrier-curvature guard]"
    } else {
        ""
    };
    let hybrid_note = if the_plan.solver == Solver::HybridEfs {
        " [hybrid EFS(ρ) + preconditioned-gradient(ψ)]"
    } else {
        ""
    };
    log::info!(
        "[OUTER] {context}: n_params={}, gradient={:?}, hessian={:?} -> {} [{}]{hess_warning}{barrier_note}{hybrid_note}",
        cap.n_params,
        cap.gradient,
        cap.hessian,
        the_plan,
        the_plan.routing_log_line(),
    );
}
/// True when an error message explicitly requests the immediate
/// first-order (non-EFS) fallback via the embedded marker string.
fn requests_immediate_first_order_fallback(message: &str) -> bool {
    message.contains(EFS_FIRST_ORDER_FALLBACK_MARKER)
}
/// Returns a copy of `cap` with the fixed-point path disabled, or `None`
/// when disabling it would change nothing (already disabled, or no
/// EFS/HybridEFS plan was eligible anyway).
fn disable_fixed_point(cap: &OuterCapability) -> Option<OuterCapability> {
    let fixed_point_in_play =
        !cap.disable_fixed_point && (cap.efs_plan_eligible() || cap.hybrid_efs_plan_eligible());
    if fixed_point_in_play {
        let mut degraded = cap.clone();
        degraded.disable_fixed_point = true;
        Some(degraded)
    } else {
        None
    }
}
/// Builds the ordered list of degraded capabilities to try after a failed
/// primary attempt under `FallbackPolicy::Automatic`.
///
/// The only automatic degradation currently offered is disabling the
/// fixed-point (EFS/HybridEFS) path when an analytic gradient is still
/// available, which reroutes planning to gradient-based BFGS. Anything
/// else (including Arc plans) gets an empty list.
fn automatic_fallback_attempts(cap: &OuterCapability) -> Vec<OuterCapability> {
    let mut attempts = Vec::new();
    if cap.gradient == Derivative::Analytic
        && matches!(plan(cap).solver, Solver::Efs | Solver::HybridEfs)
    {
        if let Some(no_fp_cap) = disable_fixed_point(cap) {
            // Move instead of clone — the original cloned and then dropped
            // the source immediately.
            attempts.push(no_fp_cap);
        }
    }
    // Cleanup: the original ended with `if plan(cap).solver == Arc
    // { return attempts; } attempts`, a redundant no-op branch — both
    // paths returned the same (possibly empty) list.
    attempts
}
/// Detects the configuration where a HybridEFS plan cannot rely on its
/// automatic ψ-coordinate escape path: a primary-class run with fallback
/// disabled, an analytic gradient, and planning that would pick HybridEFS.
fn disabled_fallback_hybrid_efs_has_standalone_bfgs_primary(
    cap: &OuterCapability,
    config: &OuterConfig,
) -> bool {
    let policy_blocks_fallback = config.solver_class == SolverClass::Primary
        && config.fallback_policy == FallbackPolicy::Disabled;
    policy_blocks_fallback
        && cap.gradient == Derivative::Analytic
        && plan(cap).solver == Solver::HybridEfs
}
/// Adjusts the capability used for the primary attempt: when HybridEFS
/// would be planned but its required fallback path is disabled by config,
/// force the fixed point off so planning routes straight to BFGS instead.
fn primary_capability_for_config(
    mut cap: OuterCapability,
    config: &OuterConfig,
    context: &str,
) -> OuterCapability {
    if disabled_fallback_hybrid_efs_has_standalone_bfgs_primary(&cap, config) {
        log::info!(
            "[OUTER] {context}: HybridEFS requires the automatic first-order \
             escape path for ψ coordinates; fallback is disabled, so routing the \
             primary attempt to analytic-gradient BFGS"
        );
        cap.disable_fixed_point = true;
    }
    cap
}
/// Result of one outer-objective evaluation: cost, gradient, and whatever
/// Hessian information the objective could provide.
pub struct OuterEval {
    pub cost: f64,
    pub gradient: Array1<f64>,
    pub hessian: HessianResult,
}
impl OuterEval {
    /// Sentinel evaluation for an infeasible point: infinite cost, zero
    /// gradient of the right length, no Hessian.
    pub fn infeasible(n_params: usize) -> Self {
        Self {
            cost: f64::INFINITY,
            gradient: Array1::zeros(n_params),
            hessian: HessianResult::Unavailable,
        }
    }
}
/// Hessian payload of an evaluation: a dense matrix, a matrix-free
/// operator, or nothing.
pub enum HessianResult {
    Analytic(Array2<f64>),
    Operator(Arc<dyn OuterHessianOperator>),
    Unavailable,
}
// Manual Clone/Debug impls: HessianResult holds an `Arc<dyn ...>` trait
// object, for which the custom Debug rendering below (shape/dim summaries
// instead of full matrix dumps) is the desired behavior; OuterEval follows
// suit because it embeds HessianResult.
impl Clone for OuterEval {
    fn clone(&self) -> Self {
        Self {
            cost: self.cost,
            gradient: self.gradient.clone(),
            hessian: self.hessian.clone(),
        }
    }
}
impl std::fmt::Debug for OuterEval {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("OuterEval")
            .field("cost", &self.cost)
            .field("gradient", &self.gradient)
            .field("hessian", &self.hessian)
            .finish()
    }
}
impl Clone for HessianResult {
    fn clone(&self) -> Self {
        match self {
            Self::Analytic(h) => Self::Analytic(h.clone()),
            // Cheap: clones the Arc handle, not the operator.
            Self::Operator(op) => Self::Operator(Arc::clone(op)),
            Self::Unavailable => Self::Unavailable,
        }
    }
}
impl std::fmt::Debug for HessianResult {
    /// Compact rendering: matrix shape or operator dim, never the entries.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Analytic(h) => f
                .debug_tuple("Analytic")
                .field(&format!("{}x{}", h.nrows(), h.ncols()))
                .finish(),
            Self::Operator(op) => f
                .debug_tuple("Operator")
                .field(&format!("dim={}", op.dim()))
                .finish(),
            Self::Unavailable => f.write_str("Unavailable"),
        }
    }
}
impl HessianResult {
    /// Returns the dense matrix; panics on the other variants. For use
    /// where the caller has already established the variant is `Analytic`.
    pub fn unwrap_analytic(self) -> Array2<f64> {
        match self {
            HessianResult::Analytic(h) => h,
            HessianResult::Operator(_) => {
                panic!("expected dense analytic Hessian but got HessianResult::Operator")
            }
            HessianResult::Unavailable => {
                panic!("expected analytic Hessian but got HessianResult::Unavailable")
            }
        }
    }
    /// True when *any* Hessian information exists (dense OR operator).
    /// NOTE(review): despite the name this is "has Hessian", not
    /// "is dense analytic" — callers rely on the broader meaning.
    pub fn is_analytic(&self) -> bool {
        matches!(
            self,
            HessianResult::Analytic(_) | HessianResult::Operator(_)
        )
    }
    /// Dense matrix if already materialized; `None` for operator-backed or
    /// missing Hessians (use `materialize_dense` to force an operator).
    pub fn into_option(self) -> Option<Array2<f64>> {
        match self {
            HessianResult::Analytic(h) => Some(h),
            HessianResult::Operator(_) => None,
            HessianResult::Unavailable => None,
        }
    }
    /// Dimension of the Hessian, when one exists.
    pub fn dim(&self) -> Option<usize> {
        match self {
            HessianResult::Analytic(h) => Some(h.nrows()),
            HessianResult::Operator(op) => Some(op.dim()),
            HessianResult::Unavailable => None,
        }
    }
    /// Produces a dense matrix if possible: clones the analytic matrix,
    /// materializes an operator (possibly expensive), or returns Ok(None)
    /// when no Hessian exists.
    pub fn materialize_dense(&self) -> Result<Option<Array2<f64>>, String> {
        match self {
            HessianResult::Analytic(h) => Ok(Some(h.clone())),
            HessianResult::Operator(op) => op.materialize_dense().map(Some),
            HessianResult::Unavailable => Ok(None),
        }
    }
    /// Adds a square `rho_block` to the leading k x k sub-block in place.
    /// Dense Hessians are mutated directly; operators are wrapped in a
    /// `RhoBlockAdditiveOuterHessian` decorator; `Unavailable` is a no-op.
    pub fn add_rho_block_dense(&mut self, rho_block: &Array2<f64>) -> Result<(), String> {
        if rho_block.nrows() != rho_block.ncols() {
            return Err(format!(
                "rho-block Hessian update must be square, got {}x{}",
                rho_block.nrows(),
                rho_block.ncols()
            ));
        }
        match self {
            HessianResult::Analytic(h) => {
                if rho_block.nrows() > h.nrows() || rho_block.ncols() > h.ncols() {
                    return Err(format!(
                        "rho-block Hessian update shape mismatch: got {}x{}, outer Hessian is {}x{}",
                        rho_block.nrows(),
                        rho_block.ncols(),
                        h.nrows(),
                        h.ncols()
                    ));
                }
                let k = rho_block.nrows();
                let mut sl = h.slice_mut(ndarray::s![..k, ..k]);
                sl += rho_block;
                Ok(())
            }
            HessianResult::Operator(op) => {
                let base = Arc::clone(op);
                let dim = base.dim();
                if rho_block.nrows() > dim {
                    return Err(format!(
                        "rho-block Hessian update dimension mismatch: got {}x{}, operator dim is {}",
                        rho_block.nrows(),
                        rho_block.ncols(),
                        dim
                    ));
                }
                // Replace self with a decorator that applies the block lazily.
                *self = HessianResult::Operator(Arc::new(RhoBlockAdditiveOuterHessian {
                    base,
                    rho_block: rho_block.clone(),
                    dim,
                }));
                Ok(())
            }
            HessianResult::Unavailable => Ok(()),
        }
    }
}
/// Result of one EFS fixed-point evaluation: objective cost, per-parameter
/// update steps, and optional ψ-coordinate payloads for the hybrid solver.
#[derive(Clone, Debug)]
pub struct EfsEval {
    pub cost: f64,
    // One fixed-point step per outer parameter (length must equal n_params).
    pub steps: Vec<f64>,
    // Inner-coefficient vector at this point, when the objective exposes it.
    pub beta: Option<Array1<f64>>,
    // Gradient restricted to the ψ coordinates (length psi_dim).
    pub psi_gradient: Option<Array1<f64>>,
    // Positions of the ψ coordinates inside θ (length psi_dim).
    pub psi_indices: Option<Vec<usize>>,
}
/// Mutable outer objective interface used by every solver in this module.
pub trait OuterObjective {
    /// Static description of what this objective can provide.
    fn capability(&self) -> OuterCapability;
    /// Cost only (cheapest evaluation).
    fn eval_cost(&mut self, rho: &Array1<f64>) -> Result<f64, EstimationError>;
    /// Cheap cost proxy for seed screening; defaults to the full cost.
    fn eval_screening_proxy(&mut self, rho: &Array1<f64>) -> Result<f64, EstimationError> {
        self.eval_cost(rho)
    }
    /// Full evaluation: cost, gradient, and Hessian payload.
    fn eval(&mut self, rho: &Array1<f64>) -> Result<OuterEval, EstimationError>;
    /// Evaluation honoring a requested order; the default ignores the hint
    /// and always performs the full `eval`.
    fn eval_with_order(
        &mut self,
        rho: &Array1<f64>,
        order: OuterEvalOrder,
    ) -> Result<OuterEval, EstimationError> {
        match order {
            OuterEvalOrder::ValueAndGradient | OuterEvalOrder::ValueGradientHessian => {
                self.eval(rho)
            }
        }
    }
    /// EFS fixed-point evaluation; errors by default for objectives that
    /// do not implement the fixed point.
    fn eval_efs(&mut self, _: &Array1<f64>) -> Result<EfsEval, EstimationError> {
        Err(EstimationError::RemlOptimizationFailed(
            "EFS evaluation not implemented for this objective".to_string(),
        ))
    }
    /// Clears any cached state so the next evaluation starts fresh.
    fn reset(&mut self);
}
/// Closure-based [`OuterObjective`] adapter: mandatory cost/eval closures
/// plus optional hooks; the defaulted type parameters let callers omit the
/// optional closures (the `fn(..)` defaults are never called — the
/// corresponding `Option` fields are simply `None`).
pub struct ClosureObjective<
    S,
    Fc,
    Fe,
    Fr = fn(&mut S),
    Fefs = fn(&mut S, &Array1<f64>) -> Result<EfsEval, EstimationError>,
    Feo = fn(&mut S, &Array1<f64>, OuterEvalOrder) -> Result<OuterEval, EstimationError>,
    Fsp = fn(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
> {
    // Mutable state threaded through every closure call.
    pub(crate) state: S,
    // Capability reported verbatim by `capability()`.
    pub(crate) cap: OuterCapability,
    // Cost-only evaluation (also the screening-proxy fallback).
    pub(crate) cost_fn: Fc,
    // Full evaluation (also the order-aware fallback).
    pub(crate) eval_fn: Fe,
    // Optional order-aware evaluation.
    pub(crate) eval_order_fn: Option<Feo>,
    // Optional state reset hook.
    pub(crate) reset_fn: Option<Fr>,
    // Optional EFS fixed-point evaluation.
    pub(crate) efs_fn: Option<Fefs>,
    // Optional cheap screening proxy.
    pub(crate) screening_proxy_fn: Option<Fsp>,
}
// Delegation impl: each trait method calls the matching closure, falling
// back to the mandatory ones (or the trait's default error) when an
// optional closure was not supplied.
impl<S, Fc, Fe, Fr, Fefs, Feo, Fsp> OuterObjective
    for ClosureObjective<S, Fc, Fe, Fr, Fefs, Feo, Fsp>
where
    Fc: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
    Fe: FnMut(&mut S, &Array1<f64>) -> Result<OuterEval, EstimationError>,
    Fr: FnMut(&mut S),
    Fefs: FnMut(&mut S, &Array1<f64>) -> Result<EfsEval, EstimationError>,
    Feo: FnMut(&mut S, &Array1<f64>, OuterEvalOrder) -> Result<OuterEval, EstimationError>,
    Fsp: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
{
    fn capability(&self) -> OuterCapability {
        self.cap.clone()
    }
    fn eval_cost(&mut self, rho: &Array1<f64>) -> Result<f64, EstimationError> {
        (self.cost_fn)(&mut self.state, rho)
    }
    fn eval_screening_proxy(&mut self, rho: &Array1<f64>) -> Result<f64, EstimationError> {
        // Fall back to the full cost when no cheap proxy was provided.
        match self.screening_proxy_fn.as_mut() {
            Some(f) => f(&mut self.state, rho),
            None => (self.cost_fn)(&mut self.state, rho),
        }
    }
    fn eval(&mut self, rho: &Array1<f64>) -> Result<OuterEval, EstimationError> {
        (self.eval_fn)(&mut self.state, rho)
    }
    fn eval_with_order(
        &mut self,
        rho: &Array1<f64>,
        order: OuterEvalOrder,
    ) -> Result<OuterEval, EstimationError> {
        // Without an order-aware closure the hint is ignored entirely.
        match self.eval_order_fn.as_mut() {
            Some(f) => f(&mut self.state, rho, order),
            None => (self.eval_fn)(&mut self.state, rho),
        }
    }
    fn eval_efs(&mut self, rho: &Array1<f64>) -> Result<EfsEval, EstimationError> {
        match self.efs_fn.as_mut() {
            Some(f) => f(&mut self.state, rho),
            None => Err(EstimationError::RemlOptimizationFailed(
                "EFS evaluation not implemented for this objective".to_string(),
            )),
        }
    }
    fn reset(&mut self) {
        if let Some(f) = self.reset_fn.as_mut() {
            f(&mut self.state);
        }
    }
}
/// Wraps an `EstimationError` as a recoverable objective-evaluation error
/// with a context prefix.
fn into_objective_error(context: &str, err: EstimationError) -> ObjectiveEvalError {
    ObjectiveEvalError::recoverable(format!("{context}: {err}"))
}
/// Passes a finite cost through unchanged; rejects NaN and ±infinity with
/// a recoverable error.
fn finite_cost_or_error(context: &str, cost: f64) -> Result<f64, ObjectiveEvalError> {
    if !cost.is_finite() {
        return Err(ObjectiveEvalError::recoverable(format!(
            "{context}: objective returned a non-finite cost"
        )));
    }
    Ok(cost)
}
/// Full validation of an outer evaluation: gradient length, finite cost,
/// finite gradient, and (per variant) Hessian shape/finiteness or operator
/// dimension. Operator entries are NOT probed here — that would cost
/// matvecs; see `validate_second_order_seed_hessian` for the probe.
fn finite_outer_eval_or_error(
    context: &str,
    layout: OuterThetaLayout,
    eval: OuterEval,
) -> Result<OuterEval, ObjectiveEvalError> {
    layout.validate_gradient_len(&eval.gradient, context)?;
    if !eval.cost.is_finite() {
        return Err(ObjectiveEvalError::recoverable(format!(
            "{context}: objective returned a non-finite cost"
        )));
    }
    if !eval.gradient.iter().all(|v| v.is_finite()) {
        return Err(ObjectiveEvalError::recoverable(format!(
            "{context}: objective returned a non-finite gradient"
        )));
    }
    match &eval.hessian {
        HessianResult::Analytic(hessian) => {
            layout.validate_hessian_shape(hessian, context)?;
            if !hessian.iter().all(|v| v.is_finite()) {
                return Err(ObjectiveEvalError::recoverable(format!(
                    "{context}: objective returned a non-finite Hessian"
                )));
            }
        }
        HessianResult::Operator(op) => {
            // Only the dimension is checkable without paying for matvecs.
            if op.dim() != layout.n_params {
                return Err(ObjectiveEvalError::recoverable(format!(
                    "{context}: outer Hessian operator dimension mismatch: got {}, expected {} (rho_dim={}, psi_dim={})",
                    op.dim(),
                    layout.n_params,
                    layout.rho_dim(),
                    layout.psi_dim
                )));
            }
        }
        HessianResult::Unavailable => {}
    }
    Ok(eval)
}
/// First-order variant of `finite_outer_eval_or_error`: validates only the
/// gradient length, finite cost, and finite gradient — any Hessian payload
/// passes through unchecked.
fn finite_outer_first_order_eval_or_error(
    context: &str,
    layout: OuterThetaLayout,
    eval: OuterEval,
) -> Result<OuterEval, ObjectiveEvalError> {
    layout.validate_gradient_len(&eval.gradient, context)?;
    if !eval.cost.is_finite() {
        return Err(ObjectiveEvalError::recoverable(format!(
            "{context}: objective returned a non-finite cost"
        )));
    }
    if eval.gradient.iter().any(|v| !v.is_finite()) {
        return Err(ObjectiveEvalError::recoverable(format!(
            "{context}: objective returned a non-finite gradient"
        )));
    }
    Ok(eval)
}
/// Optional geometry probe run at a second-order seed: materializes the
/// Hessian and checks shape and finiteness. Skipped (Ok) for large
/// problems, missing Hessians, and operators that declare materialization
/// unavailable, since the probe would be too expensive or impossible.
fn validate_second_order_seed_hessian(
    context: &str,
    layout: OuterThetaLayout,
    eval: &OuterEval,
) -> Result<(), ObjectiveEvalError> {
    // `is_analytic` here means "has any Hessian" (dense or operator).
    if layout.n_params > SECOND_ORDER_GEOMETRY_PROBE_MAX_PARAMS || !eval.hessian.is_analytic() {
        return Ok(());
    }
    if matches!(
        &eval.hessian,
        HessianResult::Operator(op) if !op.materialization_capability().is_available()
    ) {
        return Ok(());
    }
    let Some(hessian) = eval.hessian.materialize_dense().map_err(|message| {
        ObjectiveEvalError::recoverable(format!(
            "{context}: analytic outer Hessian materialization failed during second-order seed validation: {message}"
        ))
    })?
    else {
        // Unavailable Hessian: nothing to probe.
        return Ok(());
    };
    layout.validate_hessian_shape(&hessian, context)?;
    if !hessian.iter().all(|value| value.is_finite()) {
        return Err(ObjectiveEvalError::recoverable(format!(
            "{context}: analytic outer Hessian probe encountered non-finite entries"
        )));
    }
    Ok(())
}
/// Validates an EFS evaluation: layout shapes, finite cost, and finite
/// steps. A non-finite step is reported with its coordinate kind (ψ vs
/// ρ/τ), resolved through the optional ψ-index list.
fn finite_efs_eval_or_error(
    context: &str,
    layout: OuterThetaLayout,
    eval: EfsEval,
) -> Result<EfsEval, ObjectiveEvalError> {
    layout.validate_efs_eval(&eval, context)?;
    finite_cost_or_error(context, eval.cost)?;
    if let Some((idx, value)) = eval.steps.iter().enumerate().find(|(_, v)| !v.is_finite()) {
        // Classify the offending coordinate for the error message: with an
        // index list we can tell ψ from the rest; without one, everything
        // is assumed to be ρ.
        let coord_kind = match eval.psi_indices.as_deref() {
            Some(indices) if indices.contains(&idx) => "ψ",
            Some(_) => "ρ/τ",
            None => "ρ",
        };
        return Err(ObjectiveEvalError::recoverable(format!(
            "{context}: objective returned a non-finite {coord_kind} EFS step at \
             coord {idx} (step[{idx}]={value}, rho_dim={}, psi_dim={}, n_params={})",
            layout.rho_dim(),
            layout.psi_dim,
            layout.n_params,
        )));
    }
    Ok(eval)
}
/// Adapter exposing an [`OuterObjective`] to the first-order optimizer,
/// while tracking gradient-norm progress to drive the inner-solver cap
/// schedule.
struct OuterFirstOrderBridge<'a> {
    obj: &'a mut dyn OuterObjective,
    layout: OuterThetaLayout,
    // Feedback channel for capping inner iterations; None disables the
    // schedule entirely.
    outer_inner_cap: Option<InnerProgressFeedback>,
    // Number of gradient evaluations performed so far.
    iter_count: usize,
    // First positive finite gradient norm seen (reference for g_ratio).
    g_norm_initial: Option<f64>,
    // Most recent finite gradient norm.
    last_g_norm: Option<f64>,
}
impl ZerothOrderObjective for OuterFirstOrderBridge<'_> {
    /// Cost-only evaluation with length and finiteness validation.
    fn eval_cost(&mut self, x: &Array1<f64>) -> Result<f64, ObjectiveEvalError> {
        self.layout
            .validate_point_len(x, "outer eval_cost failed")?;
        let cost = self
            .obj
            .eval_cost(x)
            .map_err(|err| into_objective_error("outer eval_cost failed", err))?;
        finite_cost_or_error("outer eval_cost failed", cost)
    }
}
impl FirstOrderObjective for OuterFirstOrderBridge<'_> {
    /// Value-and-gradient evaluation. Before delegating, pushes a freshly
    /// computed inner-iteration cap through the feedback channel (based on
    /// outer progress and the last inner-solve snapshot); afterwards,
    /// updates the gradient-norm bookkeeping used by that schedule.
    fn eval_grad(&mut self, x: &Array1<f64>) -> Result<FirstOrderSample, ObjectiveEvalError> {
        self.layout.validate_point_len(x, "outer eval failed")?;
        if let Some(feedback) = self.outer_inner_cap.as_ref() {
            // Relative gradient progress: ratio of last norm to the first
            // recorded norm, when both are available.
            let g_ratio = match (self.last_g_norm, self.g_norm_initial) {
                (Some(g), Some(g0)) if g0 > 0.0 => Some(g / g0),
                _ => None,
            };
            let snapshot = feedback.snapshot();
            let cap = first_order_inner_cap_schedule(self.iter_count, g_ratio, snapshot);
            let prev = feedback.cap.swap(cap, Ordering::Relaxed);
            // Only log when the cap actually changes, to keep logs readable.
            if prev != cap {
                let ratio_str = match g_ratio {
                    Some(r) => format!("{:.3e}", r),
                    None => "n/a".to_string(),
                };
                let snap_str = match snapshot {
                    Some(s) => format!(
                        "last_iters={} converged={} ift_residual={} accept_rho={}",
                        s.last_iters,
                        s.last_converged,
                        match s.last_ift_residual {
                            Some(r) => format!("{:.3e}", r),
                            None => "n/a".to_string(),
                        },
                        match s.last_accept_rho {
                            Some(r) => format!("{:.3}", r),
                            None => "n/a".to_string(),
                        },
                    ),
                    None => "no-history".to_string(),
                };
                log::info!(
                    "[OUTER schedule] inner-PIRLS cap transition iter={} g_ratio={} {} prev={} new={} ({})",
                    self.iter_count,
                    ratio_str,
                    snap_str,
                    prev,
                    cap,
                    if cap == 0 { "uncapped" } else { "capped" }
                );
            }
        }
        let stage_start = std::time::Instant::now();
        log::info!(
            "[STAGE] outer eval start order=ValueAndGradient dim={} (first-order bridge, iter={})",
            x.len(),
            self.iter_count
        );
        let eval = self
            .obj
            .eval_with_order(x, OuterEvalOrder::ValueAndGradient)
            .map_err(|err| into_objective_error("outer eval failed", err))?;
        let eval = finite_outer_first_order_eval_or_error("outer eval failed", self.layout, eval)?;
        let g_norm = eval.gradient.iter().map(|v| v * v).sum::<f64>().sqrt();
        // The first positive finite norm becomes the fixed reference point.
        if self.g_norm_initial.is_none() && g_norm.is_finite() && g_norm > 0.0 {
            self.g_norm_initial = Some(g_norm);
        }
        if g_norm.is_finite() {
            self.last_g_norm = Some(g_norm);
        }
        log::info!(
            "[STAGE] outer eval end order=ValueAndGradient elapsed={:.3}s cost={:.6e} |g|={:.3e} (first-order bridge, iter={})",
            stage_start.elapsed().as_secs_f64(),
            eval.cost,
            g_norm,
            self.iter_count,
        );
        self.iter_count = self.iter_count.saturating_add(1);
        Ok(FirstOrderSample {
            value: eval.cost,
            gradient: eval.gradient,
        })
    }
}
/// Adaptive inner-PIRLS iteration cap for the outer first-order loop.
///
/// Returns `0` (uncapped) once the outer gradient has decayed by more than
/// two orders of magnitude. With a history snapshot, reuses the observed
/// inner iteration count plus a margin (when the last solve converged) or
/// backs off geometrically (when it hit the cap), clamped to `[3, 64]`.
/// Without history, a conservative tier keyed on the outer iteration applies.
fn first_order_inner_cap_schedule(
    iter_count: usize,
    g_ratio: Option<f64>,
    last: Option<InnerProgressSnapshot>,
) -> usize {
    // Outer gradient collapsed by >= 100x: lift the cap entirely.
    if g_ratio.is_some_and(|r| r < 0.01) {
        return 0;
    }
    let Some(snap) = last else {
        // No inner history yet: conservative iteration-keyed tiers.
        return if iter_count == 0 {
            3
        } else if iter_count == 1 {
            5
        } else {
            10
        };
    };
    let proposed = if snap.last_converged {
        // Converged last time: reuse the observed count plus a residual-driven
        // margin, widened when the LM acceptance ratio was poor.
        let residual_margin = match snap.last_ift_residual {
            Some(r) if r < 0.01 => 1usize,
            Some(r) if r >= 0.10 => 4usize,
            _ => 2usize,
        };
        let rho_bump = if snap.last_accept_rho.is_some_and(|r| r < 0.5) {
            2usize
        } else {
            0usize
        };
        snap.last_iters
            .saturating_add(residual_margin)
            .saturating_add(rho_bump)
    } else {
        // Hit the previous cap: geometric backoff, steeper on very poor rho,
        // with an additive floor so tiny counts still grow meaningfully.
        let factor = if snap.last_accept_rho.is_some_and(|r| r < 0.3) {
            3
        } else {
            2
        };
        snap.last_iters
            .saturating_mul(factor)
            .max(snap.last_iters.saturating_add(4))
    };
    proposed.clamp(3, 64)
}
// Unit tests for the adaptive inner-PIRLS cap schedule and for the atomic
// snapshot decode (sentinel vs. real zero residual).
#[cfg(test)]
mod outer_inner_cap_schedule_tests {
    use super::{InnerProgressSnapshot, first_order_inner_cap_schedule};
    // Snapshot with neither residual nor accept-rho signal.
    fn snap(last_iters: usize, last_converged: bool) -> Option<InnerProgressSnapshot> {
        Some(InnerProgressSnapshot {
            last_iters,
            last_converged,
            last_ift_residual: None,
            last_accept_rho: None,
        })
    }
    // Snapshot carrying only the LM acceptance-ratio signal.
    fn snap_with_accept_rho(
        last_iters: usize,
        last_converged: bool,
        accept_rho: f64,
    ) -> Option<InnerProgressSnapshot> {
        Some(InnerProgressSnapshot {
            last_iters,
            last_converged,
            last_ift_residual: None,
            last_accept_rho: Some(accept_rho),
        })
    }
    // Snapshot carrying only the IFT residual signal.
    fn snap_with_residual(
        last_iters: usize,
        last_converged: bool,
        residual: f64,
    ) -> Option<InnerProgressSnapshot> {
        Some(InnerProgressSnapshot {
            last_iters,
            last_converged,
            last_ift_residual: Some(residual),
            last_accept_rho: None,
        })
    }
    #[test]
    fn snapshot_distinguishes_zero_residual_from_no_signal() {
        use super::InnerProgressFeedback;
        use crate::solver::estimate::reml::runtime::IFT_RESIDUAL_NO_SIGNAL_BITS;
        use std::sync::Arc;
        use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize};
        let make_feedback =
            |iters: usize, converged: bool, residual_bits: u64| InnerProgressFeedback {
                cap: Arc::new(AtomicUsize::new(0)),
                last_iters: Arc::new(AtomicUsize::new(iters)),
                last_converged: Arc::new(AtomicBool::new(converged)),
                ift_residual: Arc::new(AtomicU64::new(residual_bits)),
                accept_rho: Arc::new(AtomicU64::new(
                    crate::solver::estimate::reml::runtime::IFT_RESIDUAL_NO_SIGNAL_BITS,
                )),
            };
        // Sentinel bit pattern must decode to "no signal", not a number.
        let fb = make_feedback(5, true, IFT_RESIDUAL_NO_SIGNAL_BITS);
        let snap = fb.snapshot().expect("iters > 0, snapshot present");
        assert!(
            snap.last_ift_residual.is_none(),
            "sentinel must decode to None"
        );
        // A genuine 0.0 residual must survive the bit round-trip.
        let fb = make_feedback(5, true, 0.0_f64.to_bits());
        let snap = fb.snapshot().expect("iters > 0, snapshot present");
        assert_eq!(
            snap.last_ift_residual,
            Some(0.0),
            "residual of exactly 0.0 must round-trip as a real signal, \
             not be confused with the no-signal sentinel",
        );
        let fb = make_feedback(5, true, 0.05_f64.to_bits());
        let snap = fb.snapshot().expect("snapshot present");
        assert_eq!(snap.last_ift_residual, Some(0.05));
        // Zero iterations means no snapshot at all.
        let fb = make_feedback(0, false, IFT_RESIDUAL_NO_SIGNAL_BITS);
        assert!(fb.snapshot().is_none());
    }
    #[test]
    fn schedule_falls_back_to_iter_tier_without_feedback() {
        assert_eq!(first_order_inner_cap_schedule(0, None, None), 3);
        assert_eq!(first_order_inner_cap_schedule(1, None, None), 5);
        assert_eq!(first_order_inner_cap_schedule(2, None, None), 10);
        assert_eq!(first_order_inner_cap_schedule(20, None, None), 10);
    }
    #[test]
    fn schedule_uses_last_iters_plus_margin_when_converged() {
        assert_eq!(first_order_inner_cap_schedule(2, None, snap(4, true)), 6);
        assert_eq!(first_order_inner_cap_schedule(5, None, snap(12, true)), 14);
    }
    #[test]
    fn schedule_geometric_backoff_when_last_hit_cap() {
        assert_eq!(first_order_inner_cap_schedule(2, None, snap(5, false)), 10);
        assert_eq!(first_order_inner_cap_schedule(2, None, snap(1, false)), 5);
        assert_eq!(first_order_inner_cap_schedule(2, None, snap(30, false)), 60);
    }
    #[test]
    fn schedule_clamps_floor_and_ceiling() {
        assert_eq!(first_order_inner_cap_schedule(2, None, snap(0, true)), 3);
        assert_eq!(first_order_inner_cap_schedule(2, None, snap(100, true)), 64);
    }
    #[test]
    fn schedule_uncaps_when_outer_converged() {
        // g_ratio < 0.01 overrides every other signal.
        assert_eq!(first_order_inner_cap_schedule(0, Some(0.0001), None), 0);
        assert_eq!(
            first_order_inner_cap_schedule(0, Some(0.005), snap(4, true)),
            0
        );
        assert_eq!(
            first_order_inner_cap_schedule(20, Some(0.001), snap(50, false)),
            0
        );
    }
    #[test]
    fn schedule_ignores_modest_g_ratio_decay() {
        assert_eq!(
            first_order_inner_cap_schedule(2, Some(0.30), snap(4, true)),
            6
        );
        assert_eq!(
            first_order_inner_cap_schedule(2, Some(0.05), snap(4, true)),
            6
        );
    }
    #[test]
    fn schedule_uses_ift_residual_to_pick_margin() {
        // Margin tiers: <0.01 → +1, [0.01, 0.10) → +2, >=0.10 → +4.
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_residual(4, true, 0.005)),
            5
        );
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_residual(4, true, 0.0001)),
            5
        );
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_residual(4, true, 0.05)),
            6
        );
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_residual(4, true, 0.20)),
            8
        );
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_residual(4, true, 0.80)),
            8
        );
        // Larger residual must never yield a smaller cap.
        let residuals = [0.001, 0.05, 0.30];
        let caps: Vec<usize> = residuals
            .iter()
            .map(|&r| first_order_inner_cap_schedule(2, None, snap_with_residual(4, true, r)))
            .collect();
        for w in caps.windows(2) {
            assert!(
                w[0] <= w[1],
                "ift-residual margin policy regressed monotonicity: {caps:?}"
            );
        }
    }
    #[test]
    fn schedule_bumps_margin_on_poor_lm_accept_rho() {
        // accept_rho < 0.5 adds +2 to the converged-path margin.
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_accept_rho(4, true, 0.95)),
            6
        );
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_accept_rho(4, true, 0.5)),
            6
        );
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_accept_rho(4, true, 0.4)),
            8
        );
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_accept_rho(4, true, 0.1)),
            8
        );
        assert_eq!(
            first_order_inner_cap_schedule(2, None, snap_with_accept_rho(4, true, 0.49)),
            8
        );
    }
    #[test]
    fn schedule_escalates_geometric_backoff_on_very_poor_accept_rho() {
        // accept_rho < 0.3 escalates the backoff multiplier from 2x to 3x.
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: false,
            last_ift_residual: None,
            last_accept_rho: Some(0.15),
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 12);
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: false,
            last_ift_residual: None,
            last_accept_rho: Some(0.4),
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 8);
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: false,
            last_ift_residual: None,
            last_accept_rho: Some(0.9),
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 8);
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: false,
            last_ift_residual: None,
            last_accept_rho: None,
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 8);
        // Exactly 0.3 does not trigger the escalation (strict <).
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: false,
            last_ift_residual: None,
            last_accept_rho: Some(0.3),
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 8);
    }
    #[test]
    fn schedule_skips_lm_accept_rho_bump_when_signal_absent() {
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: true,
            last_ift_residual: None,
            last_accept_rho: None,
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 6);
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: true,
            last_ift_residual: None,
            last_accept_rho: Some(0.5),
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 6);
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: true,
            last_ift_residual: None,
            last_accept_rho: Some(1.0),
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 6);
    }
    #[test]
    fn schedule_combines_ift_residual_and_lm_accept_rho() {
        // Margins stack: residual tier + rho bump.
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: true,
            last_ift_residual: Some(0.30),
            last_accept_rho: Some(0.20),
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 10);
        let snap = Some(InnerProgressSnapshot {
            last_iters: 4,
            last_converged: true,
            last_ift_residual: Some(0.005),
            last_accept_rho: Some(0.30),
        });
        assert_eq!(first_order_inner_cap_schedule(2, None, snap), 7);
    }
}
// Adapter exposing an `OuterObjective` to the second-order (ARC) solver.
struct OuterSecondOrderBridge<'a> {
    // Wrapped user objective evaluated at each outer point.
    obj: &'a mut dyn OuterObjective,
    // Dimension bookkeeping used to validate incoming points and evals.
    layout: OuterThetaLayout,
    // Selects whether an explicit Hessian is produced (analytic) or not.
    hessian_source: HessianSource,
    // Largest operator dimension we are willing to materialize densely.
    materialize_operator_max_dim: usize,
    // Total gradient/Hessian evaluations; arc_iter is derived as eval_count / 2.
    eval_count: usize,
    // Shared atomics publishing the adaptive inner-PIRLS iteration cap.
    outer_inner_cap: Option<InnerProgressFeedback>,
    // First positive finite gradient norm seen (baseline for g_ratio).
    g_norm_initial: Option<f64>,
    // Most recent finite gradient norm.
    last_g_norm: Option<f64>,
}
impl ZerothOrderObjective for OuterSecondOrderBridge<'_> {
    /// Cost-only evaluation: validate, delegate, and enforce finiteness.
    fn eval_cost(&mut self, x: &Array1<f64>) -> Result<f64, ObjectiveEvalError> {
        self.layout.validate_point_len(x, "outer eval_cost failed")?;
        self.obj
            .eval_cost(x)
            .map_err(|err| into_objective_error("outer eval_cost failed", err))
            .and_then(|cost| finite_cost_or_error("outer eval_cost failed", cost))
    }
}
impl FirstOrderObjective for OuterSecondOrderBridge<'_> {
    /// Gradient evaluation on the ARC bridge.
    ///
    /// Mirrors the first-order bridge: refreshes the shared inner-PIRLS cap
    /// before evaluating, then records gradient-norm history. `arc_iter` is
    /// derived as eval_count / 2 — presumably because ARC issues gradient and
    /// Hessian evaluations in pairs per outer step; TODO confirm.
    fn eval_grad(&mut self, x: &Array1<f64>) -> Result<FirstOrderSample, ObjectiveEvalError> {
        self.layout.validate_point_len(x, "outer eval failed")?;
        if let Some(feedback) = self.outer_inner_cap.as_ref() {
            let arc_iter = self.eval_count / 2;
            // Relative gradient decay |g|/|g0|; None until a baseline exists.
            let g_ratio = match (self.last_g_norm, self.g_norm_initial) {
                (Some(g), Some(g0)) if g0 > 0.0 => Some(g / g0),
                _ => None,
            };
            let snapshot = feedback.snapshot();
            let cap = first_order_inner_cap_schedule(arc_iter, g_ratio, snapshot);
            // Publish the new cap; log only on transitions.
            let prev = feedback.cap.swap(cap, Ordering::Relaxed);
            if prev != cap {
                let ratio_str = match g_ratio {
                    Some(r) => format!("{:.3e}", r),
                    None => "n/a".to_string(),
                };
                let snap_str = match snapshot {
                    Some(s) => format!(
                        "last_iters={} converged={} ift_residual={} accept_rho={}",
                        s.last_iters,
                        s.last_converged,
                        match s.last_ift_residual {
                            Some(r) => format!("{:.3e}", r),
                            None => "n/a".to_string(),
                        },
                        match s.last_accept_rho {
                            Some(r) => format!("{:.3}", r),
                            None => "n/a".to_string(),
                        },
                    ),
                    None => "no-history".to_string(),
                };
                log::info!(
                    "[OUTER schedule] inner-PIRLS cap transition (ARC bridge) arc_iter={} g_ratio={} {} prev={} new={} ({})",
                    arc_iter,
                    ratio_str,
                    snap_str,
                    prev,
                    cap,
                    if cap == 0 { "uncapped" } else { "capped" }
                );
            }
        }
        let stage_start = std::time::Instant::now();
        log::info!(
            "[STAGE] outer eval start order=ValueAndGradient dim={}",
            x.len()
        );
        let eval = self
            .obj
            .eval_with_order(x, OuterEvalOrder::ValueAndGradient)
            .map_err(|err| into_objective_error("outer eval failed", err))?;
        let eval = finite_outer_first_order_eval_or_error("outer eval failed", self.layout, eval)?;
        self.eval_count += 1;
        let g_norm = eval.gradient.iter().map(|v| v * v).sum::<f64>().sqrt();
        // First positive finite norm becomes the fixed baseline for g_ratio.
        if self.g_norm_initial.is_none() && g_norm.is_finite() && g_norm > 0.0 {
            self.g_norm_initial = Some(g_norm);
        }
        if g_norm.is_finite() {
            self.last_g_norm = Some(g_norm);
        }
        log::info!(
            "[STAGE] outer eval end order=ValueAndGradient elapsed={:.3}s cost={:.6e} |g|={:.3e}",
            stage_start.elapsed().as_secs_f64(),
            eval.cost,
            g_norm,
        );
        log::info!(
            "[OUTER] eval#{n} (grad) cost={cost:.6e} |g|={gnorm:.3e}",
            n = self.eval_count,
            cost = eval.cost,
            gnorm = g_norm,
        );
        Ok(FirstOrderSample {
            value: eval.cost,
            gradient: eval.gradient,
        })
    }
}
impl SecondOrderObjective for OuterSecondOrderBridge<'_> {
    /// Value, gradient, and (optionally) dense Hessian for the ARC solver.
    ///
    /// Repeats the same cap-schedule refresh as `eval_grad` so the inner cap
    /// stays current regardless of which evaluation ARC requests next. A
    /// dense Hessian is returned only for `HessianSource::Analytic`: either
    /// directly, or by materializing an operator-backed Hessian when its
    /// capability allows and its dimension fits the configured limit.
    fn eval_hessian(&mut self, x: &Array1<f64>) -> Result<SecondOrderSample, ObjectiveEvalError> {
        self.layout.validate_point_len(x, "outer eval failed")?;
        if let Some(feedback) = self.outer_inner_cap.as_ref() {
            let arc_iter = self.eval_count / 2;
            // Relative gradient decay |g|/|g0|; None until a baseline exists.
            let g_ratio = match (self.last_g_norm, self.g_norm_initial) {
                (Some(g), Some(g0)) if g0 > 0.0 => Some(g / g0),
                _ => None,
            };
            let snapshot = feedback.snapshot();
            let cap = first_order_inner_cap_schedule(arc_iter, g_ratio, snapshot);
            // Publish the new cap; log only on transitions.
            let prev = feedback.cap.swap(cap, Ordering::Relaxed);
            if prev != cap {
                let ratio_str = match g_ratio {
                    Some(r) => format!("{:.3e}", r),
                    None => "n/a".to_string(),
                };
                let snap_str = match snapshot {
                    Some(s) => format!(
                        "last_iters={} converged={} ift_residual={} accept_rho={}",
                        s.last_iters,
                        s.last_converged,
                        match s.last_ift_residual {
                            Some(r) => format!("{:.3e}", r),
                            None => "n/a".to_string(),
                        },
                        match s.last_accept_rho {
                            Some(r) => format!("{:.3}", r),
                            None => "n/a".to_string(),
                        },
                    ),
                    None => "no-history".to_string(),
                };
                log::info!(
                    "[OUTER schedule] inner-PIRLS cap transition (ARC bridge) arc_iter={} g_ratio={} {} prev={} new={} ({})",
                    arc_iter,
                    ratio_str,
                    snap_str,
                    prev,
                    cap,
                    if cap == 0 { "uncapped" } else { "capped" }
                );
            }
        }
        let stage_start = std::time::Instant::now();
        log::info!(
            "[STAGE] outer eval start order=ValueGradientHessian dim={}",
            x.len()
        );
        let eval = self
            .obj
            .eval_with_order(x, OuterEvalOrder::ValueGradientHessian)
            .map_err(|err| into_objective_error("outer eval failed", err))?;
        let eval = finite_outer_eval_or_error("outer eval failed", self.layout, eval)?;
        self.eval_count += 1;
        let g_norm = eval.gradient.iter().map(|v| v * v).sum::<f64>().sqrt();
        if self.g_norm_initial.is_none() && g_norm.is_finite() && g_norm > 0.0 {
            self.g_norm_initial = Some(g_norm);
        }
        if g_norm.is_finite() {
            self.last_g_norm = Some(g_norm);
        }
        log::info!(
            "[STAGE] outer eval end order=ValueGradientHessian elapsed={:.3}s cost={:.6e} |g|={:.3e}",
            stage_start.elapsed().as_secs_f64(),
            eval.cost,
            g_norm,
        );
        log::info!(
            "[OUTER] eval#{n} (hess) cost={cost:.6e} |g|={gnorm:.3e}",
            n = self.eval_count,
            cost = eval.cost,
            gnorm = g_norm,
        );
        // Dense Hessian extraction: only the Analytic source yields one.
        let hessian = match self.hessian_source {
            HessianSource::Analytic => match eval.hessian {
                HessianResult::Analytic(h) => Some(h),
                // Operator-backed: densify only when the operator can be
                // materialized and is small enough.
                HessianResult::Operator(ref op)
                    if op.materialization_capability().is_available()
                        && op.dim() <= self.materialize_operator_max_dim =>
                {
                    Some(
                        op.materialize_dense()
                            .map_err(|message| ObjectiveEvalError::Fatal {
                                message: format!(
                                    "outer Hessian operator materialization failed: {message}"
                                ),
                            })?,
                    )
                }
                HessianResult::Operator(_) | HessianResult::Unavailable => None,
            },
            // Approximation-based sources never surface a dense Hessian here.
            HessianSource::BfgsApprox
            | HessianSource::EfsFixedPoint
            | HessianSource::HybridEfsFixedPoint => None,
        };
        Ok(SecondOrderSample {
            value: eval.cost,
            gradient: eval.gradient,
            hessian,
        })
    }
}
// Adapter exposing an `OuterObjective` to the EFS fixed-point solver.
struct OuterFixedPointBridge<'a> {
    // Wrapped user objective providing `eval_efs` fixed-point steps.
    obj: &'a mut dyn OuterObjective,
    // Dimension bookkeeping used to validate points and EFS evals.
    layout: OuterThetaLayout,
    // When present, enables the barrier-curvature stop test on β.
    barrier_config: Option<BarrierConfig>,
    // Consecutive iterations where the ψ block had to be zeroed after
    // backtracking; reaching MAX_CONSECUTIVE_PSI_STAGNATION triggers the
    // joint-gradient fallback.
    consecutive_psi_zero_iters: usize,
}
// Maximum number of step-halvings attempted by the EFS backtracking search.
const MAX_EFS_BACKTRACK: usize = 8;
// Steps with max |component| below this are treated as effectively zero.
const EFS_NEGLIGIBLE_STEP: f64 = 1e-12;
// Steps smaller than this (max-abs) are accepted without any line search.
const EFS_LINESEARCH_THRESHOLD: f64 = 0.5;
// Relative descent tolerance used to accept a backtracked trial cost.
const EFS_COST_DESCENT_TOL: f64 = 1e-12;
// ψ-stagnation iterations tolerated before falling back to joint gradient.
const MAX_CONSECUTIVE_PSI_STAGNATION: usize = 1;
impl FixedPointObjective for OuterFixedPointBridge<'_> {
    /// One safeguarded EFS fixed-point update.
    ///
    /// Pipeline: validate the point → evaluate the EFS step → reject
    /// non-finite costs/steps → optional barrier-curvature early stop →
    /// accept negligible or modest (< EFS_LINESEARCH_THRESHOLD) steps as-is →
    /// otherwise backtrack on the full step; for hybrid ψ problems, retry
    /// with a ρ/τ-only step before signalling the first-order fallback via a
    /// recoverable error tagged with EFS_FIRST_ORDER_FALLBACK_MARKER.
    fn eval_step(&mut self, x: &Array1<f64>) -> Result<FixedPointSample, ObjectiveEvalError> {
        self.layout.validate_point_len(x, "outer EFS eval failed")?;
        let eval = self
            .obj
            .eval_efs(x)
            .map_err(|err| into_objective_error("outer EFS eval failed", err))?;
        self.layout
            .validate_efs_eval(&eval, "outer EFS eval failed")?;
        if !eval.cost.is_finite() {
            return Err(ObjectiveEvalError::recoverable(
                "outer EFS eval failed: objective returned a non-finite cost".to_string(),
            ));
        }
        // Reject any non-finite step coordinate, reporting which block (ψ vs
        // ρ/τ) it belongs to for diagnosis.
        if let Some((idx, value)) = eval.steps.iter().enumerate().find(|(_, v)| !v.is_finite()) {
            let psi_indices = eval.psi_indices.as_deref();
            let coord_kind = match psi_indices {
                Some(indices) if indices.contains(&idx) => "ψ",
                Some(_) => "ρ/τ",
                None => "ρ",
            };
            return Err(ObjectiveEvalError::recoverable(format!(
                "outer EFS eval failed: non-finite {coord_kind} step at coord {idx} \
                 (step[{idx}]={value}, rho_dim={}, psi_dim={}, n_params={}, cost={:.6e})",
                self.layout.rho_dim(),
                self.layout.psi_dim,
                self.layout.n_params,
                eval.cost,
            )));
        }
        // Barrier curvature check: when significant, request a clean stop.
        let status = if let Some(ref barrier_cfg) = self.barrier_config {
            if let Some(ref beta) = eval.beta {
                let ref_diag = 1.0;
                let threshold = 0.01;
                if barrier_cfg.barrier_curvature_is_significant(beta, ref_diag, threshold) {
                    FixedPointStatus::Stop
                } else {
                    FixedPointStatus::Continue
                }
            } else {
                FixedPointStatus::Continue
            }
        } else {
            FixedPointStatus::Continue
        };
        if matches!(status, FixedPointStatus::Stop) {
            // Zero step so the solver halts in place.
            return Ok(FixedPointSample {
                value: eval.cost,
                step: Array1::zeros(x.len()),
                status,
            });
        }
        let raw_step = Array1::from_vec(eval.steps);
        let psi_indices = eval.psi_indices.clone();
        let max_step_abs = raw_step.iter().map(|s| s.abs()).fold(0.0_f64, f64::max);
        let current_cost = eval.cost;
        // Negligible step: pass through unchanged (counts as ψ progress).
        if max_step_abs < EFS_NEGLIGIBLE_STEP {
            if psi_indices.is_some() {
                self.consecutive_psi_zero_iters = 0;
            }
            return Ok(FixedPointSample {
                value: current_cost,
                step: raw_step,
                status,
            });
        }
        // Modest step: accept without a line search.
        if max_step_abs < EFS_LINESEARCH_THRESHOLD {
            if psi_indices.is_some() {
                self.consecutive_psi_zero_iters = 0;
            }
            return Ok(FixedPointSample {
                value: current_cost,
                step: raw_step,
                status,
            });
        }
        // Large step: backtracking line search on the full vector.
        if let Some(scaled) = self.efs_backtrack(x, &raw_step, current_cost, MAX_EFS_BACKTRACK)? {
            if psi_indices.is_some() {
                self.consecutive_psi_zero_iters = 0;
            }
            return Ok(FixedPointSample {
                value: current_cost,
                step: scaled,
                status,
            });
        }
        // Hybrid case: full-vector backtracking failed; try again with the ψ
        // components zeroed (ρ/τ-only step).
        if let Some(psi_idx) = psi_indices.as_ref() {
            let mut rho_only = raw_step.clone();
            for &i in psi_idx {
                rho_only[i] = 0.0;
            }
            let max_rho_abs = rho_only.iter().map(|s| s.abs()).fold(0.0_f64, f64::max);
            if max_rho_abs >= EFS_NEGLIGIBLE_STEP {
                if let Some(scaled) =
                    self.efs_backtrack(x, &rho_only, current_cost, MAX_EFS_BACKTRACK)?
                {
                    self.consecutive_psi_zero_iters =
                        self.consecutive_psi_zero_iters.saturating_add(1);
                    log::info!(
                        "[HYBRID-EFS] full-vector backtrack exhausted; ρ/τ-only step \
                         accepted. Consecutive ψ-zero iters = {}",
                        self.consecutive_psi_zero_iters,
                    );
                    // Too many ψ-zero iterations in a row: hand over to the
                    // joint gradient solver via a recoverable marker error.
                    if self.consecutive_psi_zero_iters >= MAX_CONSECUTIVE_PSI_STAGNATION {
                        log::info!(
                            "[STAGE] HybridEFS -> joint gradient (BFGS/L-BFGS) fallback: \
                             {} consecutive ψ-zero iterations after exhausted backtracking \
                             (rho_dim={}, psi_dim={}, n_params={}, cost={:.6e})",
                            self.consecutive_psi_zero_iters,
                            self.layout.rho_dim(),
                            self.layout.psi_dim,
                            self.layout.n_params,
                            current_cost,
                        );
                        return Err(ObjectiveEvalError::recoverable(format!(
                            "{} HybridEFS ψ stagnation: {} consecutive iterations \
                             exhausted backtracking and zeroed ψ step \
                             (rho_dim={}, psi_dim={}, n_params={}, cost={:.6e})",
                            EFS_FIRST_ORDER_FALLBACK_MARKER,
                            self.consecutive_psi_zero_iters,
                            self.layout.rho_dim(),
                            self.layout.psi_dim,
                            self.layout.n_params,
                            current_cost,
                        )));
                    }
                    return Ok(FixedPointSample {
                        value: current_cost,
                        step: scaled,
                        status,
                    });
                }
            }
            // Even the ρ/τ-only step failed every halving: fall back.
            log::info!(
                "[STAGE] HybridEFS -> joint gradient fallback: ρ/τ-only step also \
                 failed all {} halvings (rho_dim={}, psi_dim={}, n_params={}, \
                 cost={:.6e})",
                MAX_EFS_BACKTRACK,
                self.layout.rho_dim(),
                self.layout.psi_dim,
                self.layout.n_params,
                current_cost,
            );
            return Err(ObjectiveEvalError::recoverable(format!(
                "{} HybridEFS step rejected after {} halvings on full vector \
                 and {} halvings on ρ/τ-only fallback \
                 (rho_dim={}, psi_dim={}, n_params={}, cost={:.6e})",
                EFS_FIRST_ORDER_FALLBACK_MARKER,
                MAX_EFS_BACKTRACK,
                MAX_EFS_BACKTRACK,
                self.layout.rho_dim(),
                self.layout.psi_dim,
                self.layout.n_params,
                current_cost,
            )));
        }
        // Pure-ρ case: no halving produced a descent; fall back to gradient.
        log::info!(
            "[STAGE] EFS -> gradient fallback: no α ∈ {{1, …, 2^-{}}} decreased the \
             cost (rho_dim={}, n_params={}, cost={:.6e})",
            MAX_EFS_BACKTRACK,
            self.layout.rho_dim(),
            self.layout.n_params,
            current_cost,
        );
        Err(ObjectiveEvalError::recoverable(format!(
            "{} EFS step rejected after {} halvings on pure-ρ vector \
             (rho_dim={}, n_params={}, cost={:.6e})",
            EFS_FIRST_ORDER_FALLBACK_MARKER,
            MAX_EFS_BACKTRACK,
            self.layout.rho_dim(),
            self.layout.n_params,
            current_cost,
        )))
    }
}
impl OuterFixedPointBridge<'_> {
    /// Backtracking line search along `raw_step`.
    ///
    /// Tries α = 1, 1/2, …, 2^-max_halvings and returns the first scaled step
    /// whose trial cost is finite and at or below the relative-tolerance
    /// floor. `Ok(None)` means every α was rejected; trial-eval errors are
    /// treated the same as a non-improving cost (logged, then halved).
    fn efs_backtrack(
        &mut self,
        x: &Array1<f64>,
        raw_step: &Array1<f64>,
        current_cost: f64,
        max_halvings: usize,
    ) -> Result<Option<Array1<f64>>, ObjectiveEvalError> {
        // Relative descent floor: tiny improvements still count as descent.
        let cost_floor = current_cost + EFS_COST_DESCENT_TOL * current_cost.abs().max(1.0);
        for bt in 0..=max_halvings {
            // Powers of two are exact in binary floating point.
            let alpha = 0.5_f64.powi(bt as i32);
            let trial_step = raw_step * alpha;
            let trial = x + &trial_step;
            match self.obj.eval_cost(&trial) {
                Ok(c) if c.is_finite() && c <= cost_floor => {
                    if bt > 0 {
                        log::debug!(
                            "[EFS] backtrack accepted at α=2^-{bt}={alpha:.4e} \
                             after {bt} halvings (cost: {current_cost:.6e} → {c:.6e})"
                        );
                    }
                    return Ok(Some(trial_step));
                }
                Ok(c) => log::trace!(
                    "[EFS] backtrack α=2^-{bt}={alpha:.4e}: trial cost {c:.6e} \
                     not below current {current_cost:.6e}, halving"
                ),
                Err(err) => log::trace!(
                    "[EFS] backtrack α=2^-{bt}={alpha:.4e}: trial eval failed \
                     ({err}), halving"
                ),
            }
        }
        Ok(None)
    }
}
/// Result of the derivative-free compass (coordinate pattern) search.
enum CompassSearchOutcome {
    /// Step size shrank to or below tolerance: pattern-search convergence.
    Converged {
        point: Array1<f64>,
        cost: f64,
        polls: usize,
    },
    /// Poll budget ran out before the step size converged.
    BudgetExhausted {
        point: Array1<f64>,
        cost: f64,
        polls: usize,
    },
}
/// Bound-constrained compass search over the outer objective.
///
/// Polls ±step along each coordinate (first-improvement: the sweep restarts
/// from coordinate 0 as soon as any probe improves the best cost); the step
/// is halved after a full sweep without improvement. Stops when
/// `step <= step_tol` or after `max_polls` probes. Probe failures and
/// non-finite costs are silently skipped (best-effort search).
/// NOTE(review): a poll is charged even when the clamped candidate coincides
/// with the current point and no objective evaluation runs — confirm this
/// budget accounting is intended.
fn compass_search_outer(
    obj: &mut dyn OuterObjective,
    mut x: Array1<f64>,
    mut best_cost: f64,
    lower: ndarray::ArrayView1<'_, f64>,
    upper: ndarray::ArrayView1<'_, f64>,
    init_step: f64,
    step_tol: f64,
    max_polls: usize,
) -> CompassSearchOutcome {
    // Start from a feasible point.
    for i in 0..x.len() {
        x[i] = x[i].clamp(lower[i], upper[i]);
    }
    let mut step = init_step;
    let mut polls: usize = 0;
    while step > step_tol && polls < max_polls {
        let mut improved = false;
        'sweep: for i in 0..x.len() {
            for &sign in &[1.0, -1.0] {
                if polls >= max_polls {
                    break 'sweep;
                }
                polls += 1;
                let candidate_i = (x[i] + sign * step).clamp(lower[i], upper[i]);
                // Clamping collapsed the move: nothing to evaluate.
                if (candidate_i - x[i]).abs() < step_tol {
                    continue;
                }
                let mut candidate = x.clone();
                candidate[i] = candidate_i;
                let probe = obj.eval_cost(&candidate).ok().filter(|v| v.is_finite());
                if let Some(c) = probe
                    && c < best_cost
                {
                    x = candidate;
                    best_cost = c;
                    improved = true;
                    break 'sweep;
                }
            }
        }
        if !improved {
            step *= 0.5;
        }
    }
    // Distinguish convergence (step shrank) from budget exhaustion.
    if step <= step_tol {
        CompassSearchOutcome::Converged {
            point: x,
            cost: best_cost,
            polls,
        }
    } else {
        CompassSearchOutcome::BudgetExhausted {
            point: x,
            cost: best_cost,
            polls,
        }
    }
}
/// Converts a solver `Solution` into the outer-loop result record.
///
/// The reported gradient norm prefers the true final gradient norm, falls
/// back to the final step norm, and defaults to 0.0 when neither is present.
fn solution_into_outer_result(
    solution: Solution,
    converged: bool,
    plan_used: OuterPlan,
) -> OuterResult {
    let final_grad_norm = match (solution.final_gradient_norm, solution.final_step_norm) {
        (Some(g_norm), _) => g_norm,
        (None, Some(step_norm)) => step_norm,
        (None, None) => 0.0,
    };
    OuterResult {
        rho: solution.final_point,
        final_value: solution.final_value,
        iterations: solution.iterations,
        final_grad_norm,
        final_gradient: solution.final_gradient,
        final_hessian: solution.final_hessian,
        converged,
        plan_used,
        // Operator trust-region diagnostics are populated elsewhere.
        operator_trust_radius: None,
        operator_stop_reason: None,
    }
}
// Snapshot of all outer-solver settings consumed by `run_outer`.
#[derive(Clone, Debug)]
struct OuterConfig {
    // Outer convergence tolerance.
    tolerance: f64,
    // Maximum outer iterations.
    max_iter: usize,
    // Optional per-coordinate (lower, upper) box constraints.
    bounds: Option<(Array1<f64>, Array1<f64>)>,
    // Seeding strategy for the initial point(s).
    seed_config: crate::seeding::SeedConfig,
    // Symmetric bound applied to ρ coordinates.
    rho_bound: f64,
    // Optional heuristic λ values used for seeding.
    heuristic_lambdas: Option<Vec<f64>>,
    // Optional caller-supplied starting point.
    initial_rho: Option<Array1<f64>>,
    // Policy governing solver fallbacks on failure.
    fallback_policy: FallbackPolicy,
    // Shared cap applied during seed screening, if any.
    screening_cap: Option<Arc<AtomicUsize>>,
    // Shared atomics for the adaptive inner-PIRLS cap feedback loop.
    outer_inner_cap: Option<InnerProgressFeedback>,
    // Which solver family to run.
    solver_class: SolverClass,
    // Optional initial trust radius for the operator trust-region path.
    operator_initial_trust_radius: Option<f64>,
}
impl Default for OuterConfig {
    /// Defaults mirror `OuterProblem::new`: tol 1e-5, 200 iterations,
    /// ρ bound 30, automatic fallback, primary solver class, no bounds,
    /// seeds, caps, or trust-radius override.
    fn default() -> Self {
        Self {
            tolerance: 1e-5,
            max_iter: 200,
            bounds: None,
            seed_config: crate::seeding::SeedConfig::default(),
            rho_bound: 30.0,
            heuristic_lambdas: None,
            initial_rho: None,
            fallback_policy: FallbackPolicy::Automatic,
            screening_cap: None,
            outer_inner_cap: None,
            solver_class: SolverClass::Primary,
            operator_initial_trust_radius: None,
        }
    }
}
// Builder for an outer optimization problem; mirrors `OuterConfig` plus the
// capability flags that determine which solver plan is eligible.
pub struct OuterProblem {
    // Dimension of the outer parameter vector.
    n_params: usize,
    // Availability of an analytic/numeric gradient.
    gradient: Derivative,
    // Availability of an analytic/numeric Hessian.
    hessian: Derivative,
    // Prefer the first-order path even when a Hessian is available.
    prefer_gradient_only: bool,
    // Disable the EFS fixed-point path entirely.
    disable_fixed_point: bool,
    // Number of ψ coordinates appended to the ρ block.
    psi_dim: usize,
    // Optional barrier configuration forwarded to the fixed-point bridge.
    barrier_config: Option<BarrierConfig>,
    // Remaining fields are the solver settings copied into `OuterConfig`.
    tolerance: f64,
    max_iter: usize,
    bounds: Option<(Array1<f64>, Array1<f64>)>,
    rho_bound: f64,
    seed_config: crate::seeding::SeedConfig,
    heuristic_lambdas: Option<Vec<f64>>,
    initial_rho: Option<Array1<f64>>,
    fallback_policy: FallbackPolicy,
    screening_cap: Option<Arc<AtomicUsize>>,
    outer_inner_cap: Option<InnerProgressFeedback>,
    solver_class: SolverClass,
    operator_initial_trust_radius: Option<f64>,
}
impl OuterProblem {
    /// New problem with the defaults of `OuterConfig::default()` and no
    /// derivative capabilities declared.
    pub fn new(n_params: usize) -> Self {
        Self {
            n_params,
            gradient: Derivative::Unavailable,
            hessian: Derivative::Unavailable,
            prefer_gradient_only: false,
            disable_fixed_point: false,
            psi_dim: 0,
            barrier_config: None,
            tolerance: 1e-5,
            max_iter: 200,
            bounds: None,
            rho_bound: 30.0,
            seed_config: crate::seeding::SeedConfig::default(),
            heuristic_lambdas: None,
            initial_rho: None,
            fallback_policy: FallbackPolicy::Automatic,
            screening_cap: None,
            outer_inner_cap: None,
            solver_class: SolverClass::Primary,
            operator_initial_trust_radius: None,
        }
    }
    /// Declares gradient availability.
    pub fn with_gradient(mut self, d: Derivative) -> Self {
        self.gradient = d;
        self
    }
    /// Declares Hessian availability.
    pub fn with_hessian(mut self, d: Derivative) -> Self {
        self.hessian = d;
        self
    }
    /// Forces the first-order path even when a Hessian is declared.
    pub fn with_prefer_gradient_only(mut self, prefer_gradient_only: bool) -> Self {
        self.prefer_gradient_only = prefer_gradient_only;
        self
    }
    /// Disables the EFS fixed-point path.
    pub fn with_disable_fixed_point(mut self, disable: bool) -> Self {
        self.disable_fixed_point = disable;
        self
    }
    /// Sets the number of ψ coordinates.
    pub fn with_psi_dim(mut self, dim: usize) -> Self {
        self.psi_dim = dim;
        self
    }
    /// Sets (or clears) the barrier configuration.
    pub fn with_barrier(mut self, cfg: Option<BarrierConfig>) -> Self {
        self.barrier_config = cfg;
        self
    }
    /// Sets the outer convergence tolerance.
    pub fn with_tolerance(mut self, tol: f64) -> Self {
        self.tolerance = tol;
        self
    }
    /// Sets the outer iteration budget.
    pub fn with_max_iter(mut self, n: usize) -> Self {
        self.max_iter = n;
        self
    }
    /// Sets per-coordinate box constraints.
    pub fn with_bounds(mut self, lo: Array1<f64>, hi: Array1<f64>) -> Self {
        self.bounds = Some((lo, hi));
        self
    }
    /// Sets the symmetric ρ bound.
    pub fn with_rho_bound(mut self, b: f64) -> Self {
        self.rho_bound = b;
        self
    }
    /// Sets the seeding strategy.
    pub fn with_seed_config(mut self, sc: crate::seeding::SeedConfig) -> Self {
        self.seed_config = sc;
        self
    }
    /// Supplies heuristic λ values for seeding.
    pub fn with_heuristic_lambdas(mut self, h: Vec<f64>) -> Self {
        self.heuristic_lambdas = Some(h);
        self
    }
    /// Supplies an explicit starting point.
    pub fn with_initial_rho(mut self, rho: Array1<f64>) -> Self {
        self.initial_rho = Some(rho);
        self
    }
    /// Wires the shared screening cap.
    pub fn with_screening_cap(mut self, screening_cap: Arc<AtomicUsize>) -> Self {
        self.screening_cap = Some(screening_cap);
        self
    }
    /// Wires the inner-PIRLS cap feedback atomics.
    pub fn with_outer_inner_cap(mut self, feedback: InnerProgressFeedback) -> Self {
        self.outer_inner_cap = Some(feedback);
        self
    }
    /// Selects the solver class.
    pub fn with_solver_class(mut self, class: SolverClass) -> Self {
        self.solver_class = class;
        self
    }
    /// Overrides the operator trust-region initial radius.
    pub fn with_operator_initial_trust_radius(mut self, radius: Option<f64>) -> Self {
        self.operator_initial_trust_radius = radius;
        self
    }
    /// Sets the fallback policy.
    pub fn with_fallback_policy(mut self, policy: FallbackPolicy) -> Self {
        self.fallback_policy = policy;
        self
    }
    // Capability snapshot; `fixed_point_available` is filled in by the
    // build_objective* methods based on whether an EFS closure was supplied.
    fn capability(&self) -> OuterCapability {
        OuterCapability {
            gradient: self.gradient,
            hessian: self.hessian,
            prefer_gradient_only: self.prefer_gradient_only,
            disable_fixed_point: self.disable_fixed_point,
            n_params: self.n_params,
            psi_dim: self.psi_dim,
            fixed_point_available: false,
            barrier_config: self.barrier_config.clone(),
        }
    }
    // Settings snapshot consumed by `run_outer`.
    fn config(&self) -> OuterConfig {
        OuterConfig {
            tolerance: self.tolerance,
            max_iter: self.max_iter,
            bounds: self.bounds.clone(),
            seed_config: self.seed_config.clone(),
            rho_bound: self.rho_bound,
            heuristic_lambdas: self.heuristic_lambdas.clone(),
            initial_rho: self.initial_rho.clone(),
            fallback_policy: self.fallback_policy,
            screening_cap: self.screening_cap.clone(),
            outer_inner_cap: self.outer_inner_cap.clone(),
            solver_class: self.solver_class,
            operator_initial_trust_radius: self.operator_initial_trust_radius,
        }
    }
    /// Builds a closure-backed objective without an eval-order hook.
    pub fn build_objective<S, Fc, Fe, Fr, Fefs>(
        &self,
        state: S,
        cost_fn: Fc,
        eval_fn: Fe,
        reset_fn: Option<Fr>,
        efs_fn: Option<Fefs>,
    ) -> ClosureObjective<S, Fc, Fe, Fr, Fefs>
    where
        Fc: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
        Fe: FnMut(&mut S, &Array1<f64>) -> Result<OuterEval, EstimationError>,
        Fr: FnMut(&mut S),
        Fefs: FnMut(&mut S, &Array1<f64>) -> Result<EfsEval, EstimationError>,
    {
        let mut cap = self.capability();
        cap.fixed_point_available = efs_fn.is_some();
        ClosureObjective {
            state,
            cap,
            cost_fn,
            eval_fn,
            eval_order_fn: None,
            reset_fn,
            efs_fn,
            screening_proxy_fn: None::<fn(&mut S, &Array1<f64>) -> Result<f64, EstimationError>>,
        }
    }
    /// Builds a closure-backed objective with an order-aware eval hook.
    pub fn build_objective_with_eval_order<S, Fc, Fe, Feo, Fr, Fefs>(
        &self,
        state: S,
        cost_fn: Fc,
        eval_fn: Fe,
        eval_order_fn: Feo,
        reset_fn: Option<Fr>,
        efs_fn: Option<Fefs>,
    ) -> ClosureObjective<S, Fc, Fe, Fr, Fefs, Feo>
    where
        Fc: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
        Fe: FnMut(&mut S, &Array1<f64>) -> Result<OuterEval, EstimationError>,
        Feo: FnMut(&mut S, &Array1<f64>, OuterEvalOrder) -> Result<OuterEval, EstimationError>,
        Fr: FnMut(&mut S),
        Fefs: FnMut(&mut S, &Array1<f64>) -> Result<EfsEval, EstimationError>,
    {
        let mut cap = self.capability();
        cap.fixed_point_available = efs_fn.is_some();
        ClosureObjective {
            state,
            cap,
            cost_fn,
            eval_fn,
            eval_order_fn: Some(eval_order_fn),
            reset_fn,
            efs_fn,
            screening_proxy_fn: None::<fn(&mut S, &Array1<f64>) -> Result<f64, EstimationError>>,
        }
    }
    /// Builds a closure-backed objective with eval-order hook and a cheap
    /// screening proxy for seed selection.
    pub fn build_objective_with_screening_proxy<S, Fc, Fe, Feo, Fr, Fefs, Fsp>(
        &self,
        state: S,
        cost_fn: Fc,
        eval_fn: Fe,
        eval_order_fn: Feo,
        reset_fn: Option<Fr>,
        efs_fn: Option<Fefs>,
        screening_proxy_fn: Fsp,
    ) -> ClosureObjective<S, Fc, Fe, Fr, Fefs, Feo, Fsp>
    where
        Fc: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
        Fe: FnMut(&mut S, &Array1<f64>) -> Result<OuterEval, EstimationError>,
        Feo: FnMut(&mut S, &Array1<f64>, OuterEvalOrder) -> Result<OuterEval, EstimationError>,
        Fr: FnMut(&mut S),
        Fefs: FnMut(&mut S, &Array1<f64>) -> Result<EfsEval, EstimationError>,
        Fsp: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
    {
        let mut cap = self.capability();
        cap.fixed_point_available = efs_fn.is_some();
        ClosureObjective {
            state,
            cap,
            cost_fn,
            eval_fn,
            eval_order_fn: Some(eval_order_fn),
            reset_fn,
            efs_fn,
            screening_proxy_fn: Some(screening_proxy_fn),
        }
    }
    /// Runs the configured outer solve against `obj`.
    pub fn run(
        &self,
        obj: &mut dyn OuterObjective,
        context: &str,
    ) -> Result<OuterResult, EstimationError> {
        run_outer(obj, &self.config(), context)
    }
}
// Outcome of an outer solve, including optional final derivatives and
// operator trust-region diagnostics.
#[derive(Clone, Debug)]
pub struct OuterResult {
    // Final outer parameter vector.
    pub rho: Array1<f64>,
    // Objective value at `rho`.
    pub final_value: f64,
    // Outer iterations performed.
    pub iterations: usize,
    // Gradient norm (or step-norm fallback) at the final point.
    pub final_grad_norm: f64,
    // Final gradient, when the solver produced one.
    pub final_gradient: Option<Array1<f64>>,
    // Final dense Hessian, when the solver produced one.
    pub final_hessian: Option<Array2<f64>>,
    // Whether the solver reported convergence.
    pub converged: bool,
    // Which solver plan ultimately produced this result.
    pub plan_used: OuterPlan,
    // Trust-region diagnostics (operator path only).
    pub operator_trust_radius: Option<f64>,
    pub operator_stop_reason: Option<OperatorTrustRegionStopReason>,
}
// Default initial trust radius for the operator trust-region solver.
const OPERATOR_TRUST_RADIUS_INIT: f64 = 1.0;
// Upper bound on trust-radius growth.
const OPERATOR_TRUST_RADIUS_MAX: f64 = 1.0e6;
// Minimum actual/predicted decrease ratio for accepting a step.
const OPERATOR_ETA_ACCEPT: f64 = 1.0e-4;
// Radius below which repeated rejection terminates the solve.
const OPERATOR_TRUST_RADIUS_REJECT_FLOOR: f64 = 1.0e-9;
// Largest dimension at which a dense Hessian is reused/materialized.
const OPERATOR_DENSE_REUSE_MAX_DIM: usize = 64;
// Why the operator trust-region loop terminated.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OperatorTrustRegionStopReason {
    // Gradient/step criteria satisfied.
    Converged,
    // Trust radius shrank below the rejection floor.
    RejectFloor,
    // Iteration budget exhausted.
    IterationBudget,
    // Solver routing disagreed with the requested plan.
    RoutingMismatch,
}
// Zero-copy `OuterHessianOperator` view over an existing dense Hessian.
struct BorrowedDenseOuterHessian<'a> {
    matrix: &'a Array2<f64>,
}
impl OuterHessianOperator for BorrowedDenseOuterHessian<'_> {
    /// Operator dimension = row count of the borrowed matrix.
    fn dim(&self) -> usize {
        self.matrix.nrows()
    }
    /// Dense matrix-vector product with an explicit length check.
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        let expected = self.matrix.ncols();
        if v.len() == expected {
            Ok(self.matrix.dot(v))
        } else {
            Err(format!(
                "dense outer Hessian matvec length mismatch: got {}, expected {}",
                v.len(),
                expected
            ))
        }
    }
    /// Dense matrix-matrix product with a row-count check on the factor.
    fn mul_mat(&self, factor: ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
        let expected = self.matrix.ncols();
        if factor.nrows() == expected {
            Ok(self.matrix.dot(&factor))
        } else {
            Err(format!(
                "dense outer Hessian factor row count mismatch: got {}, expected {}",
                factor.nrows(),
                expected
            ))
        }
    }
    /// A borrowed dense matrix is already materialized.
    fn is_cheap_to_materialize(&self) -> bool {
        true
    }
}
#[inline]
/// Chooses the recovery trust radius after a rejected step.
///
/// Keeps `default_radius` unless the predicted decrease is wildly out of
/// scale with the cost (ratio > 1e6) AND the derived radius is meaningfully
/// tighter (< half the default), in which case the derived radius is used.
fn snap_recovery_radius_on_rejection(
    cost: f64,
    pred_dec: f64,
    derived_radius: f64,
    default_radius: f64,
) -> f64 {
    // Normalize the predicted decrease by the cost magnitude, floored at 1.
    let mismatch_ratio = match pred_dec {
        p if p.is_finite() && p > 0.0 => p / cost.abs().max(1.0),
        _ => 0.0,
    };
    let prefer_derived = mismatch_ratio > 1e6 && derived_radius < default_radius * 0.5;
    if prefer_derived {
        derived_radius
    } else {
        default_radius
    }
}
#[inline]
/// Derives an initial trust radius from the seed cost and gradient norm.
///
/// Uses cost-scale / |g| clamped between ten times the rejection floor and
/// the default initial radius; falls back to the default when the gradient
/// norm is zero, negative, or non-finite.
fn derive_initial_trust_radius_from_seed(cost: f64, g_norm: f64) -> f64 {
    if !(g_norm.is_finite() && g_norm > 0.0) {
        return OPERATOR_TRUST_RADIUS_INIT;
    }
    let cost_scale = cost.abs().max(1.0);
    (cost_scale / g_norm).clamp(
        OPERATOR_TRUST_RADIUS_REJECT_FLOOR * 10.0,
        OPERATOR_TRUST_RADIUS_INIT,
    )
}
/// Returns a copy of `x` clamped coordinate-wise into `bounds` (identity
/// clone when no bounds are supplied).
fn project_to_bounds(x: &Array1<f64>, bounds: Option<&(Array1<f64>, Array1<f64>)>) -> Array1<f64> {
    let Some((lower, upper)) = bounds else {
        return x.clone();
    };
    x.iter()
        .zip(lower.iter().zip(upper.iter()))
        .map(|(&value, (&lo, &hi))| value.clamp(lo, hi))
        .collect()
}
/// Gradient with components zeroed where they push outward across an active
/// (ε-tolerant, ε = 1e-10) bound; the plain gradient when unbounded.
fn projected_gradient(
    x: &Array1<f64>,
    gradient: &Array1<f64>,
    bounds: Option<&(Array1<f64>, Array1<f64>)>,
) -> Array1<f64> {
    let mut projected = gradient.clone();
    let Some((lower, upper)) = bounds else {
        return projected;
    };
    for idx in 0..projected.len() {
        let pushes_below = x[idx] <= lower[idx] + 1e-10 && gradient[idx] > 0.0;
        let pushes_above = x[idx] >= upper[idx] - 1e-10 && gradient[idx] < 0.0;
        if pushes_below || pushes_above {
            projected[idx] = 0.0;
        }
    }
    projected
}
/// Boolean mask of coordinates whose bound is active: the point sits at
/// (within 1e-10 of) a bound and the gradient pushes it outward. All-false
/// when there are no bounds.
fn active_mask(
    x: &Array1<f64>,
    gradient: &Array1<f64>,
    bounds: Option<&(Array1<f64>, Array1<f64>)>,
) -> Vec<bool> {
    let Some((lower, upper)) = bounds else {
        return vec![false; x.len()];
    };
    (0..x.len())
        .map(|idx| {
            let pushes_below = x[idx] <= lower[idx] + 1e-10 && gradient[idx] > 0.0;
            let pushes_above = x[idx] >= upper[idx] - 1e-10 && gradient[idx] < 0.0;
            pushes_below || pushes_above
        })
        .collect()
}
/// Apply the operator restricted to the free (non-active) subspace:
/// zero the active coordinates of the input, apply the operator, then zero
/// the active coordinates of the result. Errors if the operator returns a
/// vector of the wrong length.
fn masked_matvec(
    op: &dyn OuterHessianOperator,
    vector: &Array1<f64>,
    active: &[bool],
) -> Result<Array1<f64>, String> {
    let mut masked = vector.clone();
    for (value, &is_active) in masked.iter_mut().zip(active) {
        if is_active {
            *value = 0.0;
        }
    }
    let mut product = op.matvec(&masked)?;
    if product.len() != masked.len() {
        return Err(format!(
            "outer Hessian operator matvec length mismatch: got {}, expected {}",
            product.len(),
            masked.len()
        ));
    }
    for (value, &is_active) in product.iter_mut().zip(active) {
        if is_active {
            *value = 0.0;
        }
    }
    Ok(product)
}
/// Quadratic-model decrease for a step: -(g·s + ½ s·Hs), with H restricted
/// to the free subspace when any axis is marked active.
fn predicted_decrease_from_operator(
    op: &dyn OuterHessianOperator,
    gradient: &Array1<f64>,
    step: &Array1<f64>,
    active: &[bool],
) -> Result<f64, String> {
    let hs = if active.contains(&true) {
        masked_matvec(op, step, active)?
    } else {
        op.matvec(step)?
    };
    let linear = gradient.dot(step);
    let quadratic = 0.5 * step.dot(&hs);
    Ok(-(linear + quadratic))
}
/// Steihaug–Toint truncated CG for the trust-region subproblem
/// min g·p + ½ p·Bp subject to ‖p‖ ≤ `trust_radius`, restricted to the
/// free (non-`active`) coordinates; B is applied only through `op`.
///
/// Returns `Ok(Some((step, predicted_decrease)))` when a step with a finite,
/// strictly positive model decrease is found; `Ok(None)` when no useful step
/// exists (zero/non-finite gradient, degenerate geometry, non-positive model
/// decrease); `Err` if an operator application fails.
///
/// Fix vs. previous revision: the boundary-intersection computation was
/// duplicated verbatim at the negative-curvature and radius-exceeded exits;
/// it is now a single helper with identical arithmetic.
fn steihaug_toint_step_operator(
    op: &dyn OuterHessianOperator,
    gradient: &Array1<f64>,
    trust_radius: f64,
    active: &[bool],
) -> Result<Option<(Array1<f64>, f64)>, String> {
    // Move from `p` along `d` to the trust-region boundary: the positive root
    // tau of ‖p + tau·d‖ = trust_radius. `None` when `d` is degenerate or the
    // quadratic has no real root.
    fn boundary_point(
        p: &Array1<f64>,
        d: &Array1<f64>,
        trust_radius: f64,
    ) -> Option<Array1<f64>> {
        let d_norm_sq = d.dot(d);
        if d_norm_sq <= 0.0 {
            return None;
        }
        let p_dot_d = p.dot(d);
        let p_norm_sq = p.dot(p);
        let disc = p_dot_d * p_dot_d - d_norm_sq * (p_norm_sq - trust_radius * trust_radius);
        if disc < 0.0 {
            return None;
        }
        let tau = (-p_dot_d + disc.sqrt()) / d_norm_sq;
        let mut boundary = p.clone();
        boundary.scaled_add(tau, d);
        Some(boundary)
    }
    let n = gradient.len();
    let g_norm = gradient.dot(gradient).sqrt();
    if !g_norm.is_finite() || g_norm <= 0.0 {
        return Ok(None);
    }
    // CG state: iterate p, residual r (masked to the free subspace), search
    // direction d = -r initially.
    let mut p = Array1::<f64>::zeros(n);
    let mut r = gradient.clone();
    for idx in 0..n {
        if active[idx] {
            r[idx] = 0.0;
        }
    }
    let mut d = r.mapv(|value| -value);
    let mut rtr = r.dot(&r);
    if !rtr.is_finite() || rtr <= 0.0 {
        return Ok(None);
    }
    let cg_tol = (1e-6 * g_norm).max(1e-12);
    let max_iter = (2 * n).max(10);
    for _ in 0..max_iter {
        let bd = if active.iter().copied().any(|flag| flag) {
            masked_matvec(op, &d, active)?
        } else {
            op.matvec(&d)?
        };
        let d_bd = d.dot(&bd);
        // Non-positive (or non-finite) curvature along d: the model is
        // unbounded in this direction, so jump to the trust-region boundary.
        if !d_bd.is_finite() || d_bd <= 1e-14 * d.dot(&d).max(1.0) {
            let Some(boundary) = boundary_point(&p, &d, trust_radius) else {
                return Ok(None);
            };
            let pred = predicted_decrease_from_operator(op, gradient, &boundary, active)?;
            return Ok((pred.is_finite() && pred > 0.0).then_some((boundary, pred)));
        }
        let alpha = rtr / d_bd;
        if !alpha.is_finite() || alpha <= 0.0 {
            return Ok(None);
        }
        let mut p_next = p.clone();
        p_next.scaled_add(alpha, &d);
        let p_next_norm = p_next.dot(&p_next).sqrt();
        // Full CG step would leave the region: clip to the boundary instead.
        if p_next_norm >= trust_radius {
            let Some(boundary) = boundary_point(&p, &d, trust_radius) else {
                return Ok(None);
            };
            let pred = predicted_decrease_from_operator(op, gradient, &boundary, active)?;
            return Ok((pred.is_finite() && pred > 0.0).then_some((boundary, pred)));
        }
        // Residual update, re-masked so active axes stay frozen.
        r.scaled_add(alpha, &bd);
        for idx in 0..n {
            if active[idx] {
                r[idx] = 0.0;
            }
        }
        let rtr_next = r.dot(&r);
        if !rtr_next.is_finite() {
            return Ok(None);
        }
        p = p_next;
        // Converged interior solution.
        if rtr_next.sqrt() <= cg_tol {
            let pred = predicted_decrease_from_operator(op, gradient, &p, active)?;
            return Ok((pred.is_finite() && pred > 0.0).then_some((p, pred)));
        }
        let beta = rtr_next / rtr.max(1e-32);
        if !beta.is_finite() || beta < 0.0 {
            return Ok(None);
        }
        d *= beta;
        d -= &r;
        for idx in 0..n {
            if active[idx] {
                d[idx] = 0.0;
            }
        }
        rtr = rtr_next;
    }
    // Iteration cap reached: return the best interior iterate if it still
    // yields a positive model decrease.
    let pred = predicted_decrease_from_operator(op, gradient, &p, active)?;
    Ok((pred.is_finite() && pred > 0.0).then_some((p, pred)))
}
/// Mask of coordinates sitting exactly on (within a relative 1e-12 tolerance
/// of) a box bound, independent of the gradient direction. All-false when
/// unbounded.
fn pinned_axes_mask(x: &Array1<f64>, bounds: Option<&(Array1<f64>, Array1<f64>)>) -> Vec<bool> {
    let Some((lower, upper)) = bounds else {
        return vec![false; x.len()];
    };
    x.iter()
        .enumerate()
        .map(|(idx, &value)| {
            // Tolerance scales with the bound span, floored at 1.
            let span = (upper[idx] - lower[idx]).abs().max(1.0);
            let tol = 1e-12 * span;
            (value - lower[idx]).abs() <= tol || (upper[idx] - value).abs() <= tol
        })
        .collect()
}
/// Mask of coordinates that the box projection actually moved.
/// `abs() > 0.0` (rather than `!=`) deliberately makes NaN residuals count
/// as "not clipped".
fn clipped_axes_mask(x_raw: &Array1<f64>, x_projected: &Array1<f64>) -> Vec<bool> {
    debug_assert_eq!(x_raw.len(), x_projected.len());
    x_raw
        .iter()
        .zip(x_projected.iter())
        .map(|(raw, proj)| (raw - proj).abs() > 0.0)
        .collect()
}
/// Elementwise OR of two equal-length boolean masks.
fn union_active_with_pinned(base_active: &[bool], pinned: &[bool]) -> Vec<bool> {
    debug_assert_eq!(base_active.len(), pinned.len());
    (0..base_active.len())
        .map(|idx| base_active[idx] || pinned[idx])
        .collect()
}
/// Try a reduced-subspace trust-region step with the active set extended by
/// the pinned/clipped axes.
///
/// Returns `Ok(None)` when the extension freezes every axis, adds no new
/// frozen axes relative to `base_active`, CG finds no step, or the step is
/// numerically zero; otherwise returns the step, its predicted decrease, and
/// the extended mask it was computed under.
fn compute_active_set_step(
    op: &dyn OuterHessianOperator,
    g_proj: &Array1<f64>,
    base_active: &[bool],
    pinned_or_clipped: &[bool],
    trust_radius: f64,
) -> Result<Option<(Array1<f64>, f64, Vec<bool>)>, String> {
    let extended = union_active_with_pinned(base_active, pinned_or_clipped);
    // Everything frozen: no subspace left to move in.
    if extended.iter().all(|flag| *flag) {
        return Ok(None);
    }
    // Extension changed nothing: the reduced problem equals the one that
    // already failed, so trying again is pointless.
    if extended == base_active {
        return Ok(None);
    }
    let Some((step, pred)) = steihaug_toint_step_operator(op, g_proj, trust_radius, &extended)?
    else {
        return Ok(None);
    };
    let step_norm = step.dot(&step).sqrt();
    if step_norm.is_finite() && step_norm > 1e-16 {
        Ok(Some((step, pred, extended)))
    } else {
        Ok(None)
    }
}
/// Loosen the absolute convergence tolerance in proportion to the seed cost
/// magnitude: `base_tol * (1 + |seed_cost|)`.
#[inline]
fn outer_scaled_tolerance(base_tol: f64, seed_cost: f64) -> f64 {
    let scale = 1.0 + seed_cost.abs();
    base_tol * scale
}
/// Bound-constrained trust-region loop driven entirely through the
/// `OuterHessianOperator` (Hessian-vector products), used when the family
/// exposes its analytic Hessian as an operator rather than a dense matrix.
///
/// Per iteration: project the gradient onto the feasible box, check
/// convergence against a cost-scaled tolerance, solve the trust-region
/// subproblem with Steihaug–Toint CG, evaluate the trial point, and
/// accept/reject via the predicted-vs-actual decrease ratio. Layered on top
/// of the textbook loop:
///   * optional dense materialization of the operator after a rejection, so
///     repeated CG solves reuse a cached matrix (small dims only);
///   * "snap recovery" of the radius when the model decrease is wildly out
///     of scale with the cost;
///   * detection of "corner clamp" stalls (projection pins the trial point
///     to a constant box corner) followed by an active-set escape step.
///
/// Returns a best-effort `OuterResult` in all non-fatal situations; `Err`
/// only on fatal evaluation failures or CG operator errors.
fn run_operator_trust_region(
    obj: &mut dyn OuterObjective,
    seed: &Array1<f64>,
    layout: OuterThetaLayout,
    bounds: Option<&(Array1<f64>, Array1<f64>)>,
    tolerance: f64,
    max_iter: usize,
    initial_eval: OuterEval,
    plan: OuterPlan,
    initial_trust_radius: Option<f64>,
) -> Result<OuterResult, EstimationError> {
    // Start from the projected seed and the caller-provided evaluation there.
    let mut x_k = project_to_bounds(seed, bounds);
    let mut eval_k = initial_eval;
    let g_seed_for_init = projected_gradient(&x_k, &eval_k.gradient, bounds);
    let g_norm_for_init = g_seed_for_init.dot(&g_seed_for_init).sqrt();
    let derived_initial_radius =
        derive_initial_trust_radius_from_seed(eval_k.cost, g_norm_for_init);
    // Prefer an explicit warm-start radius (finite, positive); otherwise use
    // the radius derived from the seed. Either way clamp into the legal band.
    let mut trust_radius = initial_trust_radius
        .filter(|radius| radius.is_finite() && *radius > 0.0)
        .unwrap_or(derived_initial_radius)
        .clamp(
            OPERATOR_TRUST_RADIUS_REJECT_FLOOR,
            OPERATOR_TRUST_RADIUS_MAX,
        );
    if initial_trust_radius.is_none() && derived_initial_radius < OPERATOR_TRUST_RADIUS_INIT {
        log::info!(
            "[ARC-timing] iter=0 derived initial trust_radius={:.3e} from \
             |cost|={:.3e} / ‖g‖={:.3e} (default {:.3e} would have wasted halving iters)",
            trust_radius,
            eval_k.cost.abs(),
            eval_k.gradient.dot(&eval_k.gradient).sqrt(),
            OPERATOR_TRUST_RADIUS_INIT,
        );
    }
    // Dense-reuse and stall-detection bookkeeping, reset on every acceptance.
    let mut dense_model: Option<Array2<f64>> = None;
    let mut materialize_dense_after_rejection = false;
    let mut last_rejection_act_dec: Option<f64> = None;
    let mut consecutive_corner_clamps: usize = 0;
    const CORNER_CLAMP_REPORT_THRESHOLD: usize = 4;
    let mut corner_clamp_reported_for_run: bool = false;
    for iter in 0..max_iter {
        let iter_start = std::time::Instant::now();
        let g_proj = projected_gradient(&x_k, &eval_k.gradient, bounds);
        let g_norm = g_proj.dot(&g_proj).sqrt();
        let scaled_tol = outer_scaled_tolerance(tolerance, eval_k.cost);
        // Converged: projected gradient below the cost-scaled tolerance.
        if g_norm.is_finite() && g_norm <= scaled_tol {
            log::info!(
                "[ARC-timing] iter={iter} converged cost={:.6e} grad_norm={:.3e} \
                 trust_radius={:.3e} tol={:.3e}",
                eval_k.cost,
                g_norm,
                trust_radius,
                scaled_tol,
            );
            return Ok(OuterResult {
                rho: x_k,
                final_value: eval_k.cost,
                iterations: iter,
                final_grad_norm: g_norm,
                final_gradient: Some(eval_k.gradient),
                final_hessian: None,
                converged: true,
                plan_used: plan,
                operator_trust_radius: Some(trust_radius),
                operator_stop_reason: Some(OperatorTrustRegionStopReason::Converged),
            });
        }
        // Radius collapsed to the floor: stop without convergence.
        if trust_radius <= OPERATOR_TRUST_RADIUS_REJECT_FLOOR {
            log::info!(
                "[ARC-timing] iter={iter} status=reject_floor cost={:.6e} grad_norm={:.3e} \
                 trust_radius={:.3e}",
                eval_k.cost,
                g_norm,
                trust_radius,
            );
            return Ok(OuterResult {
                rho: x_k,
                final_value: eval_k.cost,
                iterations: iter,
                final_grad_norm: g_norm,
                final_gradient: Some(eval_k.gradient),
                final_hessian: None,
                converged: false,
                plan_used: plan,
                operator_trust_radius: Some(trust_radius),
                operator_stop_reason: Some(OperatorTrustRegionStopReason::RejectFloor),
            });
        }
        // This loop requires the Operator Hessian shape throughout; if the
        // family switched shapes mid-run, return the current point best-effort.
        let HessianResult::Operator(op_arc) = &eval_k.hessian else {
            let observed_shape = match &eval_k.hessian {
                HessianResult::Operator(_) => "Operator",
                HessianResult::Analytic(_) => "Analytic(dense)",
                HessianResult::Unavailable => "Unavailable",
            };
            log::warn!(
                "[ARC] iter={iter}: family returned non-operator Hessian mid-flight \
                 (planner expected Operator throughout, got {observed_shape}); \
                 returning best-effort x_k cost={:.6e} grad_norm={:.3e}. \
                 Routing decision was made on seed_eval.hessian shape; family \
                 should declare a consistent shape via its capability surface.",
                eval_k.cost,
                g_norm
            );
            return Ok(OuterResult {
                rho: x_k,
                final_value: eval_k.cost,
                iterations: iter,
                final_grad_norm: g_norm,
                final_gradient: Some(eval_k.gradient),
                final_hessian: None,
                converged: false,
                plan_used: plan,
                operator_trust_radius: Some(trust_radius),
                operator_stop_reason: Some(OperatorTrustRegionStopReason::RoutingMismatch),
            });
        };
        // After a rejection, optionally cache a dense copy of the operator so
        // subsequent CG solves avoid repeated Hv applications (small dims,
        // finite entries only; failures merely skip the cache).
        if materialize_dense_after_rejection
            && dense_model.is_none()
            && op_arc.dim() <= OPERATOR_DENSE_REUSE_MAX_DIM
            && op_arc.materialization_capability().is_available()
        {
            let materialize_start = std::time::Instant::now();
            match op_arc.materialize_dense() {
                Ok(dense) => {
                    if !dense.iter().all(|value| value.is_finite()) {
                        log::debug!(
                            "[ARC-timing] iter={iter} dense outer Hessian reuse skipped: \
                             materialization produced non-finite entries"
                        );
                    } else {
                        log::info!(
                            "[ARC-timing] iter={iter} status=dense_model_ready dim={} bytes={} \
                             elapsed={:.3}s",
                            dense.nrows(),
                            dense.len() * std::mem::size_of::<f64>(),
                            materialize_start.elapsed().as_secs_f64(),
                        );
                        dense_model = Some(dense);
                    }
                }
                Err(message) => {
                    log::debug!(
                        "[ARC-timing] iter={iter} dense outer Hessian reuse skipped: {message}"
                    );
                }
            }
            materialize_dense_after_rejection = false;
        }
        // Route CG through the cached dense matrix when available; otherwise
        // wrap the live operator in a counter so Hv applications can be logged.
        let counter = HvApplyCounter::new(op_arc.as_ref());
        let dense_borrowed;
        let step_op: &dyn OuterHessianOperator = if let Some(dense) = dense_model.as_ref() {
            dense_borrowed = BorrowedDenseOuterHessian { matrix: dense };
            &dense_borrowed
        } else {
            &counter
        };
        let active = active_mask(&x_k, &eval_k.gradient, bounds);
        let cg_result = steihaug_toint_step_operator(step_op, &g_proj, trust_radius, &active)
            .map_err(EstimationError::RemlOptimizationFailed)?;
        // CG produced no usable step: halve the radius and retry.
        let Some((trial_step, pred_dec_free)) = cg_result else {
            let elapsed = iter_start.elapsed().as_secs_f64();
            log::info!(
                "[ARC-timing] iter={iter} status=cg_no_step cost={:.6e} grad_norm={:.3e} \
                 trust_radius={:.3e} hv_applies={} elapsed={:.3}s",
                eval_k.cost,
                g_norm,
                trust_radius,
                counter.count(),
                elapsed,
            );
            trust_radius = (trust_radius * 0.5).max(1e-12);
            continue;
        };
        let x_trial_raw = &x_k + &trial_step;
        let x_trial = project_to_bounds(&x_trial_raw, bounds);
        let s_trial = &x_trial - &x_k;
        let s_norm = s_trial.dot(&s_trial).sqrt();
        // Projection collapsed the step to (numerically) nothing.
        if !s_norm.is_finite() || s_norm <= 1e-16 {
            let elapsed = iter_start.elapsed().as_secs_f64();
            log::info!(
                "[ARC-timing] iter={iter} status=zero_step cost={:.6e} grad_norm={:.3e} \
                 trust_radius={:.3e} hv_applies={} elapsed={:.3}s",
                eval_k.cost,
                g_norm,
                trust_radius,
                counter.count(),
                elapsed,
            );
            trust_radius = (trust_radius * 0.5).max(1e-12);
            continue;
        }
        // If projection changed the step materially, re-evaluate the model
        // decrease along the projected step; otherwise reuse the free-space one.
        let pred_dec = if (&s_trial - &trial_step)
            .dot(&(&s_trial - &trial_step))
            .sqrt()
            > 1e-8 * (1.0 + trial_step.dot(&trial_step).sqrt())
        {
            predicted_decrease_from_operator(step_op, &g_proj, &s_trial, &active)
                .map_err(EstimationError::RemlOptimizationFailed)?
        } else {
            pred_dec_free
        };
        if !pred_dec.is_finite() || pred_dec <= 0.0 {
            let elapsed = iter_start.elapsed().as_secs_f64();
            log::info!(
                "[ARC-timing] iter={iter} status=bad_pred_dec cost={:.6e} grad_norm={:.3e} \
                 trust_radius={:.3e} hv_applies={} elapsed={:.3}s",
                eval_k.cost,
                g_norm,
                trust_radius,
                counter.count(),
                elapsed,
            );
            trust_radius = (trust_radius * 0.5).max(1e-12);
            continue;
        }
        // Value-only evaluation at the trial point; recoverable failures
        // quarter the radius and retry, fatal ones abort the run.
        let trial_cost = match obj.eval_cost(&x_trial) {
            Ok(cost) => match finite_cost_or_error("outer operator trial cost failed", cost) {
                Ok(cost) => cost,
                Err(ObjectiveEvalError::Recoverable { message }) => {
                    let elapsed = iter_start.elapsed().as_secs_f64();
                    log::info!(
                        "[ARC-timing] iter={iter} status=infeasible_trial cost={:.6e} \
                         grad_norm={:.3e} pred_dec={:.3e} trust_radius={:.3e}->{:.3e} \
                         hv_applies={} elapsed={:.3}s reason={}",
                        eval_k.cost,
                        g_norm,
                        pred_dec,
                        trust_radius,
                        (trust_radius * 0.25).max(1e-12),
                        counter.count(),
                        elapsed,
                        message,
                    );
                    trust_radius = (trust_radius * 0.25).max(1e-12);
                    continue;
                }
                Err(ObjectiveEvalError::Fatal { message }) => {
                    return Err(EstimationError::RemlOptimizationFailed(message));
                }
            },
            Err(err) => {
                let elapsed = iter_start.elapsed().as_secs_f64();
                log::info!(
                    "[ARC-timing] iter={iter} status=eval_error cost={:.6e} grad_norm={:.3e} \
                     pred_dec={:.3e} trust_radius={:.3e}->{:.3e} hv_applies={} \
                     elapsed={:.3}s reason={}",
                    eval_k.cost,
                    g_norm,
                    pred_dec,
                    trust_radius,
                    (trust_radius * 0.25).max(1e-12),
                    counter.count(),
                    elapsed,
                    err,
                );
                trust_radius = (trust_radius * 0.25).max(1e-12);
                continue;
            }
        };
        // Standard trust-region acceptance ratio and radius update: grow when
        // the model fit well and the step hit the boundary, shrink on poor fit.
        let act_dec = eval_k.cost - trial_cost;
        let rho = act_dec / pred_dec;
        let accepted_by_ratio = rho > OPERATOR_ETA_ACCEPT;
        let accepted = accepted_by_ratio;
        let new_trust_radius = if rho > 0.75 && s_norm > 0.99 * trust_radius {
            (trust_radius * 2.0).min(OPERATOR_TRUST_RADIUS_MAX)
        } else if rho < 0.25 {
            (trust_radius * 0.5).max(1e-12)
        } else {
            trust_radius
        };
        let hv_applies = counter.count();
        if accepted {
            let trust_radius_before_trial_eval = trust_radius;
            // Full (value+gradient+Hessian) evaluation at the accepted point;
            // if it fails recoverably the step is effectively rejected with a
            // quartered radius.
            let eval_trial = match obj
                .eval_with_order(&x_trial, OuterEvalOrder::ValueGradientHessian)
            {
                Ok(eval) => {
                    match finite_outer_eval_or_error("outer operator eval failed", layout, eval) {
                        Ok(eval) => eval,
                        Err(ObjectiveEvalError::Recoverable { message }) => {
                            let elapsed = iter_start.elapsed().as_secs_f64();
                            let rejected_trust_radius = (new_trust_radius * 0.25).max(1e-12);
                            log::info!(
                                "[ARC-timing] iter={iter} status=accepted_eval_error \
                                 cost={:.6e}->{:.6e} grad_norm={:.3e} rho={:.3} \
                                 pred_dec={:.3e} act_dec={:.3e} \
                                 trust_radius={:.3e}->{:.3e} hv_applies={} elapsed={:.3}s \
                                 reason={}",
                                eval_k.cost,
                                trial_cost,
                                g_norm,
                                rho,
                                pred_dec,
                                act_dec,
                                trust_radius_before_trial_eval,
                                rejected_trust_radius,
                                hv_applies,
                                elapsed,
                                message,
                            );
                            trust_radius = rejected_trust_radius;
                            continue;
                        }
                        Err(ObjectiveEvalError::Fatal { message }) => {
                            return Err(EstimationError::RemlOptimizationFailed(message));
                        }
                    }
                }
                Err(err) => {
                    let elapsed = iter_start.elapsed().as_secs_f64();
                    let rejected_trust_radius = (new_trust_radius * 0.25).max(1e-12);
                    log::info!(
                        "[ARC-timing] iter={iter} status=accepted_eval_error \
                         cost={:.6e}->{:.6e} grad_norm={:.3e} rho={:.3} \
                         pred_dec={:.3e} act_dec={:.3e} \
                         trust_radius={:.3e}->{:.3e} hv_applies={} elapsed={:.3}s \
                         reason={}",
                        eval_k.cost,
                        trial_cost,
                        g_norm,
                        rho,
                        pred_dec,
                        act_dec,
                        trust_radius_before_trial_eval,
                        rejected_trust_radius,
                        hv_applies,
                        elapsed,
                        err,
                    );
                    trust_radius = rejected_trust_radius;
                    continue;
                }
            };
            let elapsed = iter_start.elapsed().as_secs_f64();
            log::info!(
                "[ARC-timing] iter={iter} status=accepted cost={:.6e}->{:.6e} \
                 grad_norm={:.3e} rho={:.3} pred_dec={:.3e} act_dec={:.3e} \
                 trust_radius={:.3e}->{:.3e} hv_applies={} elapsed={:.3}s",
                eval_k.cost,
                trial_cost,
                g_norm,
                rho,
                pred_dec,
                act_dec,
                trust_radius_before_trial_eval,
                new_trust_radius,
                hv_applies,
                elapsed,
            );
            // Commit the step and reset rejection/stall bookkeeping; the cached
            // dense model is invalid at the new point.
            trust_radius = new_trust_radius;
            x_k = x_trial;
            eval_k = eval_trial;
            dense_model = None;
            materialize_dense_after_rejection = false;
            last_rejection_act_dec = None;
            consecutive_corner_clamps = 0;
        } else {
            // Rejected: possibly snap the recovery radius down when the model
            // decrease is grossly out of scale with the cost.
            let derived_recovery_radius =
                derive_initial_trust_radius_from_seed(eval_k.cost, g_norm);
            let snapped_radius = snap_recovery_radius_on_rejection(
                eval_k.cost,
                pred_dec,
                derived_recovery_radius,
                new_trust_radius,
            );
            let elapsed = iter_start.elapsed().as_secs_f64();
            log::info!(
                "[ARC-timing] iter={iter} status=rejected cost={:.6e}->{:.6e} \
                 grad_norm={:.3e} rho={:.3} pred_dec={:.3e} act_dec={:.3e} \
                 trust_radius={:.3e}->{:.3e} hv_applies={} elapsed={:.3}s",
                eval_k.cost,
                eval_k.cost,
                g_norm,
                rho,
                pred_dec,
                act_dec,
                trust_radius,
                snapped_radius,
                hv_applies,
                elapsed,
            );
            if snapped_radius < new_trust_radius {
                let cost_scale_for_log = eval_k.cost.abs().max(1.0);
                let mismatch_ratio_for_log = if pred_dec.is_finite() && pred_dec > 0.0 {
                    pred_dec / cost_scale_for_log
                } else {
                    0.0
                };
                log::info!(
                    "[ARC-timing] iter={iter} snap_recovery: pred_dec/|cost|={:.3e} >> 1; \
                     snapped trust_radius {:.3e} -> {:.3e} (skipping ~{} halvings)",
                    mismatch_ratio_for_log,
                    new_trust_radius,
                    snapped_radius,
                    (new_trust_radius / snapped_radius).log2().ceil() as i64,
                );
            }
            // Corner-clamp detection: an act_dec that stays (relatively)
            // identical across consecutive rejections means projection keeps
            // pinning the trial point to the same box corner.
            let mut corner_clamp_just_detected = false;
            if act_dec.is_finite() {
                let cost_floor = eval_k.cost.abs().max(1.0);
                let near_equal = match last_rejection_act_dec {
                    Some(prev) => {
                        let scale = prev.abs().max(act_dec.abs()).max(cost_floor) * 1e-9;
                        (act_dec - prev).abs() <= scale
                    }
                    None => false,
                };
                if near_equal {
                    consecutive_corner_clamps += 1;
                    if consecutive_corner_clamps >= CORNER_CLAMP_REPORT_THRESHOLD {
                        corner_clamp_just_detected = true;
                    }
                    if !corner_clamp_reported_for_run
                        && consecutive_corner_clamps >= CORNER_CLAMP_REPORT_THRESHOLD
                    {
                        log::warn!(
                            "[ARC-timing] iter={iter} corner_clamp_detected: act_dec={:.6e} \
                             stable across {} consecutive rejected halvings (trust_radius now \
                             {:.3e}); the unconstrained cubic-model step direction is crossing \
                             one or more box bounds, projection clamps the trial point to a \
                             constant corner regardless of step magnitude, so cost(trial) is \
                             constant and act_dec stops carrying useful information. \
                             Attempting active-set escape (reduced-subspace step).",
                            act_dec,
                            consecutive_corner_clamps + 1,
                            snapped_radius,
                        );
                        corner_clamp_reported_for_run = true;
                    }
                } else {
                    consecutive_corner_clamps = 0;
                }
                last_rejection_act_dec = Some(act_dec);
            }
            if corner_clamp_just_detected {
                // Active-set escape: freeze the clipped/pinned axes and try a
                // reduced-subspace trust-region step in the remaining ones.
                let clipped = clipped_axes_mask(&x_trial_raw, &x_trial);
                let pinned = pinned_axes_mask(&x_k, bounds);
                let extra_active: Vec<bool> = clipped
                    .iter()
                    .zip(pinned.iter())
                    .map(|(c, p)| *c || *p)
                    .collect();
                match compute_active_set_step(
                    step_op,
                    &g_proj,
                    &active,
                    &extra_active,
                    snapped_radius.max(trust_radius),
                ) {
                    Ok(Some((reduced_step, reduced_pred, extended_active))) => {
                        let x_trial_red_raw = &x_k + &reduced_step;
                        let x_trial_red = project_to_bounds(&x_trial_red_raw, bounds);
                        let s_red = &x_trial_red - &x_k;
                        let s_red_norm = s_red.dot(&s_red).sqrt();
                        if s_red_norm.is_finite() && s_red_norm > 1e-16 {
                            // Re-evaluate the model decrease if projection
                            // moved the reduced step materially.
                            let red_pred = if (&s_red - &reduced_step)
                                .dot(&(&s_red - &reduced_step))
                                .sqrt()
                                > 1e-8 * (1.0 + reduced_step.dot(&reduced_step).sqrt())
                            {
                                predicted_decrease_from_operator(
                                    step_op,
                                    &g_proj,
                                    &s_red,
                                    &extended_active,
                                )
                                .unwrap_or(reduced_pred)
                            } else {
                                reduced_pred
                            };
                            if red_pred.is_finite() && red_pred > 0.0 {
                                let red_trial_cost = obj.eval_cost(&x_trial_red);
                                if let Ok(red_trial_cost) = red_trial_cost {
                                    if red_trial_cost.is_finite() {
                                        let red_act_dec = eval_k.cost - red_trial_cost;
                                        let red_rho = red_act_dec / red_pred;
                                        if red_rho > OPERATOR_ETA_ACCEPT {
                                            match obj.eval_with_order(
                                                &x_trial_red,
                                                OuterEvalOrder::ValueGradientHessian,
                                            ) {
                                                Ok(eval_red) => {
                                                    if let Ok(eval_red) = finite_outer_eval_or_error(
                                                        "outer operator active-set eval failed",
                                                        layout,
                                                        eval_red,
                                                    ) {
                                                        log::warn!(
                                                            "[ARC-timing] iter={iter} \
                                                             active_set_escape_accepted: \
                                                             cost={:.6e}->{:.6e} red_rho={:.3} \
                                                             red_pred={:.3e} red_act_dec={:.3e} \
                                                             axes_pinned={}/{} \
                                                             trust_radius={:.3e}",
                                                            eval_k.cost,
                                                            red_trial_cost,
                                                            red_rho,
                                                            red_pred,
                                                            red_act_dec,
                                                            extended_active
                                                                .iter()
                                                                .filter(|f| **f)
                                                                .count(),
                                                            extended_active.len(),
                                                            snapped_radius,
                                                        );
                                                        // Accept the escape step and fully reset
                                                        // the stall bookkeeping.
                                                        x_k = x_trial_red;
                                                        eval_k = eval_red;
                                                        last_rejection_act_dec = None;
                                                        consecutive_corner_clamps = 0;
                                                        corner_clamp_reported_for_run = false;
                                                        trust_radius = snapped_radius.max(
                                                            OPERATOR_TRUST_RADIUS_REJECT_FLOOR
                                                                * 10.0,
                                                        );
                                                        dense_model = None;
                                                        materialize_dense_after_rejection = false;
                                                        continue;
                                                    }
                                                }
                                                Err(_) => {
                                                    // Best-effort: fall through to termination.
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    Ok(None) => {
                        // No distinct reduced subspace to try.
                    }
                    Err(_) => {
                        // Operator failure during escape; fall through.
                    }
                }
                // Escape failed: terminate at the corner (best effort).
                trust_radius = snapped_radius.max(OPERATOR_TRUST_RADIUS_REJECT_FLOOR * 10.0);
                let final_grad = projected_gradient(&x_k, &eval_k.gradient, bounds);
                let final_grad_norm = final_grad.dot(&final_grad).sqrt();
                log::warn!(
                    "[ARC-timing] iter={iter} active_set_escape_unavailable: \
                     terminating at corner with trust_radius={:.3e} \
                     grad_norm={:.3e}",
                    trust_radius,
                    final_grad_norm,
                );
                return Ok(OuterResult {
                    rho: x_k,
                    final_value: eval_k.cost,
                    iterations: iter + 1,
                    final_grad_norm,
                    final_gradient: Some(eval_k.gradient),
                    final_hessian: None,
                    converged: false,
                    plan_used: plan,
                    operator_trust_radius: Some(trust_radius),
                    operator_stop_reason: Some(OperatorTrustRegionStopReason::RejectFloor),
                });
            }
            trust_radius = snapped_radius;
            if trust_radius <= OPERATOR_TRUST_RADIUS_REJECT_FLOOR {
                let final_grad = projected_gradient(&x_k, &eval_k.gradient, bounds);
                let final_grad_norm = final_grad.dot(&final_grad).sqrt();
                return Ok(OuterResult {
                    rho: x_k,
                    final_value: eval_k.cost,
                    iterations: iter + 1,
                    final_grad_norm,
                    final_gradient: Some(eval_k.gradient),
                    final_hessian: None,
                    converged: false,
                    plan_used: plan,
                    operator_trust_radius: Some(trust_radius),
                    operator_stop_reason: Some(OperatorTrustRegionStopReason::RejectFloor),
                });
            } else if dense_model.is_none() {
                // Next iteration may cache the dense Hessian for cheap retries.
                materialize_dense_after_rejection = true;
            }
        }
    }
    // Iteration budget exhausted without convergence.
    let final_grad = projected_gradient(&x_k, &eval_k.gradient, bounds);
    let final_grad_norm = final_grad.dot(&final_grad).sqrt();
    Ok(OuterResult {
        rho: x_k,
        final_value: eval_k.cost,
        iterations: max_iter,
        final_grad_norm,
        final_gradient: Some(eval_k.gradient),
        final_hessian: None,
        converged: false,
        plan_used: plan,
        operator_trust_radius: Some(trust_radius),
        operator_stop_reason: Some(OperatorTrustRegionStopReason::IterationBudget),
    })
}
/// Top-level outer (rho/smoothing-parameter) optimization driver: resolves
/// the capability and plan, runs the primary plan, retries exhausted ARC
/// attempts with a doubled iteration budget, and walks automatic fallback
/// plans when an attempt fails or does not converge.
///
/// `context` is a human-readable label used only in log/error messages.
/// Returns the first converged result, the last attempt's result, or the
/// last error when every attempt failed.
fn run_outer(
    obj: &mut dyn OuterObjective,
    config: &OuterConfig,
    context: &str,
) -> Result<OuterResult, EstimationError> {
    let cap = primary_capability_for_config(obj.capability(), config, context);
    cap.validate_layout(context)?;
    // Validate any caller-supplied warm-start point against the layout.
    if let Some(initial_rho) = config.initial_rho.as_ref() {
        cap.theta_layout()
            .validate_point_len(initial_rho, "initial outer seed")
            .map_err(|err| match err {
                ObjectiveEvalError::Recoverable { message }
                | ObjectiveEvalError::Fatal { message } => {
                    EstimationError::RemlOptimizationFailed(format!("{context}: {message}"))
                }
            })?;
    }
    // Degenerate case: nothing to optimize — evaluate once and return.
    if cap.n_params == 0 {
        let cost = obj.eval_cost(&Array1::zeros(0))?;
        let the_plan = plan_with_class(&cap, config.solver_class);
        return Ok(OuterResult {
            rho: Array1::zeros(0),
            final_value: cost,
            iterations: 0,
            final_grad_norm: 0.0,
            final_gradient: None,
            final_hessian: None,
            converged: true,
            plan_used: the_plan,
            operator_trust_radius: None,
            operator_stop_reason: None,
        });
    }
    // Attempt list: the primary capability first, then (policy permitting)
    // progressively degraded fallback capabilities.
    let fallback_attempts = match (config.fallback_policy, config.solver_class) {
        (FallbackPolicy::Automatic, SolverClass::Primary) => automatic_fallback_attempts(&cap),
        (FallbackPolicy::Automatic, SolverClass::AuxiliaryGradientFree)
        | (FallbackPolicy::Disabled, _) => Vec::new(),
    };
    let mut attempts: Vec<OuterCapability> = Vec::with_capacity(1 + fallback_attempts.len());
    attempts.push(cap.clone());
    for degraded in fallback_attempts {
        attempts.push(degraded);
    }
    let mut last_error: Option<EstimationError> = None;
    for (attempt_idx, attempt_cap) in attempts.iter().enumerate() {
        let the_plan = plan_with_class(attempt_cap, config.solver_class);
        if attempt_idx > 0 {
            log::debug!("[OUTER] {context}: primary plan failed; falling back to {the_plan}");
        }
        log_plan(context, attempt_cap, &the_plan);
        obj.reset();
        // ARC attempts get up to two warm-started, budget-doubling retries;
        // other solvers get none.
        let mut arc_retries_left: u32 = if matches!(the_plan.solver, Solver::Arc) {
            2
        } else {
            0
        };
        let mut retry_config: Option<OuterConfig> = None;
        let outcome = loop {
            let active_config: &OuterConfig = retry_config.as_ref().unwrap_or(config);
            match run_outer_with_plan(obj, active_config, context, attempt_cap, &the_plan) {
                Ok(result) => {
                    // Stop retrying on convergence, exhausted retries, or a
                    // collapsed trust radius (more iterations would not help).
                    if result.converged
                        || arc_retries_left == 0
                        || matches!(
                            result.operator_stop_reason,
                            Some(OperatorTrustRegionStopReason::RejectFloor)
                        )
                    {
                        break Ok(result);
                    }
                    let prev_max_iter = active_config.max_iter;
                    let bumped_max_iter = prev_max_iter.saturating_mul(2);
                    let next_trust_radius = result.operator_trust_radius;
                    log::info!(
                        "[OUTER] {context}: ARC attempt exhausted budget at \
                         iter={} cost={:.6e} |g|={:.6e}; retrying ARC with \
                         max_iter {} -> {} warm-started from last rho \
                         and trust_radius {:?} (analytic-Hessian preservation)",
                        result.iterations,
                        result.final_value,
                        result.final_grad_norm,
                        prev_max_iter,
                        bumped_max_iter,
                        next_trust_radius,
                    );
                    // Warm-start the retry from where the last run stopped:
                    // doubled budget, last rho as seed, last trust radius.
                    let mut next = active_config.clone();
                    next.max_iter = bumped_max_iter;
                    next.initial_rho = Some(result.rho.clone());
                    next.operator_initial_trust_radius = next_trust_radius;
                    retry_config = Some(next);
                    arc_retries_left -= 1;
                    obj.reset();
                }
                Err(e) => break Err(e),
            }
        };
        match outcome {
            Ok(result) => {
                // Return the result on convergence or when no fallback remains;
                // otherwise record a soft failure and try the next plan.
                if result.converged || attempt_idx + 1 == attempts.len() {
                    return Ok(result);
                }
                let message = format!(
                    "{context}: attempt {} (plan={the_plan}) exhausted without convergence",
                    attempt_idx + 1
                );
                log::debug!("[OUTER] {message}; trying degraded fallback plan");
                last_error = Some(EstimationError::RemlOptimizationFailed(message));
            }
            Err(e) => {
                log::debug!(
                    "[OUTER] {context}: attempt {} (plan={the_plan}) failed: {e}",
                    attempt_idx + 1
                );
                last_error = Some(e);
            }
        }
    }
    // All attempts consumed: surface the last recorded error.
    Err(last_error.unwrap_or_else(|| {
        EstimationError::RemlOptimizationFailed(format!("all plan attempts exhausted ({context})"))
    }))
}
fn run_outer_with_plan(
obj: &mut dyn OuterObjective,
config: &OuterConfig,
context: &str,
cap: &OuterCapability,
the_plan: &OuterPlan,
) -> Result<OuterResult, EstimationError> {
let mut seeds = {
let generated = crate::seeding::generate_rho_candidates(
cap.n_params,
config.heuristic_lambdas.as_deref(),
&config.seed_config,
);
if generated.is_empty() {
Vec::new()
} else {
generated
}
};
if let Some(initial_rho) = config.initial_rho.as_ref()
&& !seeds.iter().any(|seed| seed == initial_rho)
{
seeds.insert(0, initial_rho.clone());
}
if seeds.is_empty() {
return Err(EstimationError::RemlOptimizationFailed(format!(
"no seeds generated for outer optimization ({context})"
)));
}
let screening_enabled = config.screening_cap.is_some();
let seed_budget = effective_seed_budget(
config.seed_config.seed_budget,
the_plan.solver,
config.seed_config.risk_profile,
screening_enabled,
)
.min(seeds.len());
if should_screen_seeds(config, the_plan.solver, seeds.len(), seed_budget) {
seeds = rank_seeds_with_screening(obj, config, context, &seeds);
}
log::debug!(
"[OUTER] {context}: trying generated seeds directly (generated={}, budget={})",
seeds.len(),
seed_budget,
);
if seed_budget < config.seed_config.seed_budget.max(1) {
log::debug!(
"[OUTER] {context}: capped requested seed budget {} -> {} for {:?} ({:?})",
config.seed_config.seed_budget.max(1),
seed_budget,
the_plan.solver,
config.seed_config.risk_profile,
);
}
if seeds.len() > seed_budget {
log::debug!(
"[OUTER] {context}: trying up to {seed_budget}/{} generated seeds in heuristic order",
seeds.len(),
);
}
let (lower, upper) = config.bounds.clone().unwrap_or_else(|| {
(
Array1::<f64>::from_elem(cap.n_params, -config.rho_bound),
Array1::<f64>::from_elem(cap.n_params, config.rho_bound),
)
});
let bounds_template = (lower, upper);
let mut best: Option<OuterResult> = None;
let mut rejection_reasons: Vec<(usize, &'static str, String)> = Vec::new();
let layout = cap.theta_layout();
let mut started_seeds = 0usize;
let expensive_seed_limit =
expensive_unsuccessful_seed_limit(the_plan.solver, config.seed_config.risk_profile);
let mut unsuccessful_expensive_seeds = 0usize;
let mut stopped_early_due_to_limit = false;
'seed_attempts: for (seed_idx, seed) in seeds.iter().enumerate() {
if started_seeds == seed_budget {
break;
}
obj.reset();
let t_seed_start = std::time::Instant::now();
let seed_slot;
let result: Result<OuterResult, EstimationError> = match the_plan.solver {
Solver::Arc => {
let seed_eval = obj
.eval_with_order(&seed, OuterEvalOrder::ValueGradientHessian)
.map_err(|err| into_objective_error("outer eval failed", err));
let seed_eval = match seed_eval {
Ok(seed_eval) => seed_eval,
Err(err) => {
let err = match err {
ObjectiveEvalError::Recoverable { message }
| ObjectiveEvalError::Fatal { message } => {
EstimationError::RemlOptimizationFailed(message)
}
};
if requests_immediate_first_order_fallback(&err.to_string()) {
return Err(err);
}
log::warn!(
"[OUTER] {context}: rejecting seed {seed_idx} before solver start: {err}"
);
rejection_reasons.push((seed_idx, "validation", err.to_string()));
continue 'seed_attempts;
}
};
let seed_eval = finite_outer_eval_or_error("outer eval failed", layout, seed_eval)
.map_err(|err| match err {
ObjectiveEvalError::Recoverable { message }
| ObjectiveEvalError::Fatal { message } => {
EstimationError::RemlOptimizationFailed(message)
}
});
let mut seed_eval = match seed_eval {
Ok(seed_eval) => seed_eval,
Err(err) => {
log::warn!(
"[OUTER] {context}: rejecting seed {seed_idx} before solver start: {err}"
);
rejection_reasons.push((seed_idx, "validation", err.to_string()));
continue 'seed_attempts;
}
};
validate_second_order_seed_hessian(context, layout, &seed_eval).map_err(|err| {
match err {
ObjectiveEvalError::Recoverable { message }
| ObjectiveEvalError::Fatal { message } => {
EstimationError::RemlOptimizationFailed(message)
}
}
})?;
started_seeds += 1;
seed_slot = started_seeds;
let cheap_materializable_operator = matches!(
seed_eval.hessian,
HessianResult::Operator(ref op)
if op.materialization_capability().is_available()
&& op.dim() <= OUTER_HVP_MATERIALIZE_MAX_DIM
);
if cheap_materializable_operator {
if let HessianResult::Operator(op) = &seed_eval.hessian {
match op.materialize_dense() {
Ok(dense) => {
seed_eval.hessian = HessianResult::Analytic(dense);
}
Err(message) => {
let err = EstimationError::RemlOptimizationFailed(format!(
"outer Hessian operator materialization failed: {message}"
));
log::warn!(
"[OUTER] {context}: rejecting seed {seed_idx} before solver start: {err}"
);
rejection_reasons.push((seed_idx, "validation", err.to_string()));
continue 'seed_attempts;
}
}
}
}
if matches!(seed_eval.hessian, HessianResult::Operator(_)) {
log::debug!(
"[OUTER] {context}: analytic Hessian provided as Hv operator; \
routing to internal trust-region CG"
);
run_operator_trust_region(
obj,
&seed,
layout,
Some(&bounds_template),
config.tolerance,
config.max_iter,
seed_eval,
*the_plan,
config.operator_initial_trust_radius,
)
} else {
let hessian_source = the_plan.hessian_source;
let objective = OuterSecondOrderBridge {
obj,
layout,
hessian_source,
materialize_operator_max_dim: OUTER_HVP_MATERIALIZE_MAX_DIM,
eval_count: 0,
outer_inner_cap: config.outer_inner_cap.clone(),
g_norm_initial: None,
last_g_norm: None,
};
let (lo, hi) = &bounds_template;
let bounds = Bounds::new(lo.clone(), hi.clone(), 1e-6)
.expect("outer rho bounds must be valid");
let scaled_tol = outer_scaled_tolerance(config.tolerance, seed_eval.cost);
let tol = Tolerance::new(scaled_tol).expect("outer tolerance must be valid");
let max_iter =
MaxIterations::new(config.max_iter).expect("outer max_iter must be valid");
let mut optimizer = ArcOptimizer::new(seed.clone(), objective)
.with_bounds(bounds)
.with_tolerance(tol)
.with_max_iterations(max_iter);
match optimizer.run() {
Ok(sol) => Ok(solution_into_outer_result(sol, true, *the_plan)),
Err(ArcError::MaxIterationsReached { last_solution, .. }) => {
Ok(solution_into_outer_result(*last_solution, false, *the_plan))
}
Err(e) => Err(EstimationError::RemlOptimizationFailed(format!(
"Arc solver failed: {e:?}"
))),
}
}
}
Solver::Bfgs => {
if cap.gradient != Derivative::Analytic {
return Err(EstimationError::RemlOptimizationFailed(format!(
"{context}: outer BFGS requires an analytic gradient capability; \
no non-analytic fallback is available (plan={the_plan}, \
declared gradient={:?})",
cap.gradient,
)));
}
let seed_eval = obj
.eval_with_order(&seed, OuterEvalOrder::ValueAndGradient)
.map_err(|err| into_objective_error("outer eval failed", err));
let seed_eval = match seed_eval {
Ok(seed_eval) => seed_eval,
Err(err) => {
let err = match err {
ObjectiveEvalError::Recoverable { message }
| ObjectiveEvalError::Fatal { message } => {
EstimationError::RemlOptimizationFailed(message)
}
};
log::warn!(
"[OUTER] {context}: rejecting seed {seed_idx} before solver start: {err}"
);
rejection_reasons.push((seed_idx, "validation", err.to_string()));
continue 'seed_attempts;
}
};
let seed_eval = match finite_outer_first_order_eval_or_error(
"outer eval failed",
layout,
seed_eval,
)
.map_err(|err| match err {
ObjectiveEvalError::Recoverable { message }
| ObjectiveEvalError::Fatal { message } => {
EstimationError::RemlOptimizationFailed(message)
}
}) {
Ok(eval) => eval,
Err(err) => {
log::warn!(
"[OUTER] {context}: rejecting seed {seed_idx} before solver start: {err}"
);
rejection_reasons.push((seed_idx, "validation", err.to_string()));
continue 'seed_attempts;
}
};
started_seeds += 1;
seed_slot = started_seeds;
let (lo, hi) = &bounds_template;
let bounds = Bounds::new(lo.clone(), hi.clone(), 1e-6)
.expect("outer rho bounds must be valid");
let scaled_tol = outer_scaled_tolerance(config.tolerance, seed_eval.cost);
let tol = Tolerance::new(scaled_tol).expect("outer tolerance must be valid");
let max_iter =
MaxIterations::new(config.max_iter).expect("outer max_iter must be valid");
let objective = OuterFirstOrderBridge {
obj,
layout,
outer_inner_cap: config.outer_inner_cap.clone(),
iter_count: 0,
g_norm_initial: None,
last_g_norm: None,
};
let mut optimizer = Bfgs::new(seed.clone(), objective)
.with_bounds(bounds)
.with_tolerance(tol)
.with_max_iterations(max_iter);
let bfgs_start = std::time::Instant::now();
let outcome = optimizer.run();
let bfgs_elapsed = bfgs_start.elapsed().as_secs_f64();
match &outcome {
Ok(sol) => log::info!(
"[OUTER summary] BFGS converged in {} iters elapsed={:.3}s final_value={:.6e}",
sol.iterations,
bfgs_elapsed,
sol.final_value
),
Err(BfgsError::MaxIterationsReached { last_solution }) => log::info!(
"[OUTER summary] BFGS hit max_iter in {} iters elapsed={:.3}s final_value={:.6e}",
last_solution.iterations,
bfgs_elapsed,
last_solution.final_value
),
Err(BfgsError::LineSearchFailed { last_solution, .. }) => log::info!(
"[OUTER summary] BFGS line-search failed in {} iters elapsed={:.3}s final_value={:.6e}",
last_solution.iterations,
bfgs_elapsed,
last_solution.final_value
),
Err(e) => log::info!(
"[OUTER summary] BFGS failed elapsed={:.3}s err={:?}",
bfgs_elapsed,
e
),
}
match outcome {
Ok(sol) => Ok(solution_into_outer_result(sol, true, *the_plan)),
Err(BfgsError::MaxIterationsReached { last_solution }) => {
Ok(solution_into_outer_result(*last_solution, false, *the_plan))
}
Err(BfgsError::LineSearchFailed { last_solution, .. }) => {
Ok(solution_into_outer_result(*last_solution, false, *the_plan))
}
Err(e) => Err(EstimationError::RemlOptimizationFailed(format!(
"BFGS solver failed: {e:?}"
))),
}
}
Solver::Efs => {
let seed_eval = obj
.eval_efs(&seed)
.map_err(|err| into_objective_error("outer EFS eval failed", err));
let seed_eval = match seed_eval {
Ok(seed_eval) => seed_eval,
Err(err) => {
let err = match err {
ObjectiveEvalError::Recoverable { message }
| ObjectiveEvalError::Fatal { message } => {
EstimationError::RemlOptimizationFailed(message)
}
};
if requests_immediate_first_order_fallback(&err.to_string()) {
return Err(err);
}
log::warn!(
"[OUTER] {context}: rejecting seed {seed_idx} before solver start: {err}"
);
rejection_reasons.push((seed_idx, "validation", err.to_string()));
continue 'seed_attempts;
}
};
let seed_eval =
finite_efs_eval_or_error("outer EFS eval failed", layout, seed_eval).map_err(
|err| match err {
ObjectiveEvalError::Recoverable { message }
| ObjectiveEvalError::Fatal { message } => {
EstimationError::RemlOptimizationFailed(message)
}
},
);
if let Err(err) = seed_eval {
if requests_immediate_first_order_fallback(&err.to_string()) {
return Err(err);
}
log::warn!(
"[OUTER] {context}: rejecting seed {seed_idx} before solver start: {err}"
);
rejection_reasons.push((seed_idx, "validation", err.to_string()));
continue 'seed_attempts;
}
started_seeds += 1;
seed_slot = started_seeds;
let (lo, hi) = &bounds_template;
let bounds = Bounds::new(lo.clone(), hi.clone(), 1e-6)
.expect("outer rho bounds must be valid");
let tol = Tolerance::new(config.tolerance).expect("outer tolerance must be valid");
let max_iter =
MaxIterations::new(config.max_iter).expect("outer max_iter must be valid");
let objective = OuterFixedPointBridge {
obj,
layout,
barrier_config: cap.barrier_config.clone(),
consecutive_psi_zero_iters: 0,
};
let mut optimizer = FixedPoint::new(seed.clone(), objective)
.with_bounds(bounds)
.with_tolerance(tol)
.with_max_iterations(max_iter);
match optimizer.run() {
Ok(sol) => Ok(solution_into_outer_result(sol, true, *the_plan)),
Err(FixedPointError::MaxIterationsReached { last_solution }) => {
Ok(solution_into_outer_result(*last_solution, false, *the_plan))
}
Err(e) => Err(EstimationError::RemlOptimizationFailed(format!(
"fixed-point solver failed: {e:?}"
))),
}
}
Solver::HybridEfs => {
let seed_eval = obj
.eval_efs(&seed)
.map_err(|err| into_objective_error("outer EFS eval failed", err));
let seed_eval = match seed_eval {
Ok(seed_eval) => seed_eval,
Err(err) => {
let err = match err {
ObjectiveEvalError::Recoverable { message }
| ObjectiveEvalError::Fatal { message } => {
EstimationError::RemlOptimizationFailed(message)
}
};
if requests_immediate_first_order_fallback(&err.to_string()) {
return Err(err);
}
log::warn!(
"[OUTER] {context}: rejecting seed {seed_idx} before solver start: {err}"
);
rejection_reasons.push((seed_idx, "validation", err.to_string()));
continue 'seed_attempts;
}
};
let seed_eval =
finite_efs_eval_or_error("outer EFS eval failed", layout, seed_eval).map_err(
|err| match err {
ObjectiveEvalError::Recoverable { message }
| ObjectiveEvalError::Fatal { message } => {
EstimationError::RemlOptimizationFailed(message)
}
},
);
if let Err(err) = seed_eval {
if requests_immediate_first_order_fallback(&err.to_string()) {
return Err(err);
}
log::warn!(
"[OUTER] {context}: rejecting seed {seed_idx} before solver start: {err}"
);
rejection_reasons.push((seed_idx, "validation", err.to_string()));
continue 'seed_attempts;
}
started_seeds += 1;
seed_slot = started_seeds;
let (lo, hi) = &bounds_template;
let bounds = Bounds::new(lo.clone(), hi.clone(), 1e-6)
.expect("outer rho bounds must be valid");
let tol = Tolerance::new(config.tolerance).expect("outer tolerance must be valid");
let max_iter =
MaxIterations::new(config.max_iter).expect("outer max_iter must be valid");
let objective = OuterFixedPointBridge {
obj,
layout,
barrier_config: cap.barrier_config.clone(),
consecutive_psi_zero_iters: 0,
};
let mut optimizer = FixedPoint::new(seed.clone(), objective)
.with_bounds(bounds)
.with_tolerance(tol)
.with_max_iterations(max_iter);
match optimizer.run() {
Ok(sol) => Ok(solution_into_outer_result(sol, true, *the_plan)),
Err(FixedPointError::MaxIterationsReached { last_solution }) => {
Ok(solution_into_outer_result(*last_solution, false, *the_plan))
}
Err(e) => Err(EstimationError::RemlOptimizationFailed(format!(
"hybrid EFS solver failed: {e:?}"
))),
}
}
Solver::CompassSearch => {
let projected_seed = project_to_bounds(seed, Some(&bounds_template));
let seed_cost = obj.eval_cost(&projected_seed).map_err(|err| {
EstimationError::RemlOptimizationFailed(format!(
"aux direct-search seed cost failed ({context}): {err}"
))
})?;
if !seed_cost.is_finite() {
rejection_reasons.push((
seed_idx,
"validation",
format!("aux direct-search rejects non-finite seed cost ({seed_cost})"),
));
continue 'seed_attempts;
}
started_seeds += 1;
seed_slot = started_seeds;
let (lo, hi) = &bounds_template;
let outcome = compass_search_outer(
obj,
projected_seed,
seed_cost,
lo.view(),
hi.view(),
1.0,
config.tolerance,
config.max_iter,
);
match outcome {
CompassSearchOutcome::Converged { point, cost, polls } => Ok(OuterResult {
rho: point,
final_value: cost,
iterations: polls,
final_grad_norm: 0.0,
final_gradient: None,
final_hessian: None,
converged: true,
plan_used: *the_plan,
operator_trust_radius: None,
operator_stop_reason: None,
}),
CompassSearchOutcome::BudgetExhausted { point, cost, polls } => {
Ok(OuterResult {
rho: point,
final_value: cost,
iterations: polls,
final_grad_norm: 0.0,
final_gradient: None,
final_hessian: None,
converged: false,
plan_used: *the_plan,
operator_trust_radius: None,
operator_stop_reason: None,
})
}
}
}
};
let seed_elapsed = t_seed_start.elapsed().as_secs_f64();
match result {
Ok(candidate) => {
let candidate_converged = candidate.converged;
log::debug!(
"[outer-timing] seed {}/{} ({:?}): {:.3}s cost={:.6e} converged={}",
seed_slot,
seed_budget,
the_plan.solver,
seed_elapsed,
candidate.final_value,
candidate.converged,
);
if candidate_improves_best(&candidate, best.as_ref()) {
best = Some(candidate);
}
if best.as_ref().is_some_and(|b| b.converged) {
break;
}
if !candidate_converged && matches!(expensive_seed_limit, Some(limit) if limit > 0)
{
unsuccessful_expensive_seeds += 1;
if let Some(limit) = expensive_seed_limit
&& unsuccessful_expensive_seeds >= limit
{
log::info!(
"[OUTER] {context}: stopping expensive multi-start after {} non-converged {:?} seed(s)",
unsuccessful_expensive_seeds,
the_plan.solver,
);
stopped_early_due_to_limit = true;
break;
}
}
}
Err(e) => {
if requests_immediate_first_order_fallback(&e.to_string()) {
return Err(e);
}
log::debug!(
"[outer-timing] seed {}/{} ({:?}): {:.3}s FAILED: {}",
seed_slot,
seed_budget,
the_plan.solver,
seed_elapsed,
e,
);
rejection_reasons.push((seed_idx, "solver", e.to_string()));
if let Some(limit) = expensive_seed_limit {
unsuccessful_expensive_seeds += 1;
if unsuccessful_expensive_seeds >= limit {
log::info!(
"[OUTER] {context}: stopping expensive multi-start after {} failed {:?} seed(s)",
unsuccessful_expensive_seeds,
the_plan.solver,
);
stopped_early_due_to_limit = true;
break;
}
}
}
}
}
best.ok_or_else(|| {
let n_generated = seeds.len();
let n_attempted = n_generated.min(seed_budget);
let n_rejected = rejection_reasons.len();
let breakdown = if rejection_reasons.is_empty() {
String::new()
} else {
let joined = rejection_reasons
.iter()
.map(|(idx, phase, msg)| format!("seed {idx} ({phase}): {msg}"))
.collect::<Vec<_>>()
.join("; ");
format!("; reasons: [{joined}]")
};
let early_stop_note = if stopped_early_due_to_limit {
format!(
"; stopped early after {unsuccessful_expensive_seeds} consecutive \
non-converged {:?} seed(s) (expensive_unsuccessful_seed_limit)",
the_plan.solver
)
} else {
String::new()
};
if started_seeds == 0 {
EstimationError::RemlOptimizationFailed(format!(
"no candidate seeds passed outer startup validation ({context}); \
generated={n_generated}, attempted={n_attempted}, rejected={n_rejected}{breakdown}"
))
} else {
EstimationError::RemlOptimizationFailed(format!(
"all {started_seeds} seed candidates failed ({context}); \
generated={n_generated}, attempted={n_attempted}, \
started_in_solver={started_seeds}, rejected={n_rejected}\
{early_stop_note}{breakdown}"
))
}
})
}
#[cfg(test)]
mod tests {
use super::*;
use ::opt::FixedPointObjective;
use ndarray::array;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
#[test]
fn derive_initial_trust_radius_clamps_huge_gradient_to_floor() {
    // A catastrophic gradient norm must pull the seeded radius down to
    // |cost| / ‖g‖ while staying between the reject floor and the default cap.
    let cost = -1.1e10;
    let grad_norm = 3.3e17;
    let radius = derive_initial_trust_radius_from_seed(cost, grad_norm);
    let want = 1.1e10 / 3.3e17;
    assert!(
        (radius - want).abs() < want * 1e-10,
        "expected {want:.3e}, got {radius:.3e}"
    );
    assert!(radius >= OPERATOR_TRUST_RADIUS_REJECT_FLOOR * 10.0);
    assert!(radius <= OPERATOR_TRUST_RADIUS_INIT);
    // An absurdly steep gradient clamps all the way down to the floor.
    let floored = derive_initial_trust_radius_from_seed(1.0, 1e30);
    assert!((floored - OPERATOR_TRUST_RADIUS_REJECT_FLOOR * 10.0).abs() < 1e-15);
}
#[test]
fn derive_initial_trust_radius_preserves_default_for_small_gradient() {
    // A mild gradient leaves the default initial radius untouched.
    let relaxed = derive_initial_trust_radius_from_seed(10.0, 0.1);
    assert!((relaxed - OPERATOR_TRUST_RADIUS_INIT).abs() < 1e-15);
    // |cost| equal to ‖g‖ yields exactly a unit radius.
    let unit = derive_initial_trust_radius_from_seed(5.0, 5.0);
    assert!((unit - 1.0).abs() < 1e-15);
}
#[test]
fn derive_initial_trust_radius_handles_degenerate_gradient() {
    // Zero, NaN, and infinite gradient norms must all fall back to the
    // default initial radius instead of producing a degenerate value.
    for bad_norm in [0.0, f64::NAN, f64::INFINITY] {
        let radius = derive_initial_trust_radius_from_seed(100.0, bad_norm);
        assert!((radius - OPERATOR_TRUST_RADIUS_INIT).abs() < 1e-15);
    }
}
#[test]
fn snap_on_rejection_fires_for_catastrophic_pred_dec() {
    // When the predicted decrease is wildly out of proportion to the cost
    // (~30 orders of magnitude here), recovery must snap straight to the
    // derived radius instead of merely halving the default.
    // (Reformatted: the original packed three `let` bindings on one line.)
    let cost = 3.124e6_f64;
    let pred_dec = 2.548e36_f64;
    let derived_radius = 1e-7_f64;
    let halve_default = 0.15_f64;
    let r = snap_recovery_radius_on_rejection(cost, pred_dec, derived_radius, halve_default);
    assert_eq!(
        r, derived_radius,
        "snap should jump to derived radius when mismatch is extreme"
    );
}
#[test]
fn snap_on_rejection_no_op_for_normal_rejection() {
    // A predicted decrease of ordinary magnitude is a routine rejection:
    // the recovery radius must stay at the plain halved default.
    let halve_default = 0.5_f64;
    let r = snap_recovery_radius_on_rejection(1.0, 100.0, 1e-3, halve_default);
    assert_eq!(
        r, halve_default,
        "snap must NOT fire for normal-magnitude rejections"
    );
}
#[test]
fn snap_on_rejection_no_op_when_derived_close_to_halve() {
    // Even with an extreme predicted-decrease mismatch, the snap only fires
    // when the derived radius is well below the halved default.
    // (Reformatted: the original packed multiple `let` bindings on one line.)
    let cost = 1e6_f64;
    let pred_dec = 1e20_f64;
    let derived_radius = 0.4_f64;
    let halve_default = 0.5_f64;
    let r = snap_recovery_radius_on_rejection(cost, pred_dec, derived_radius, halve_default);
    assert_eq!(
        r, halve_default,
        "snap requires derived < default*0.5 to fire"
    );
}
#[test]
fn snap_on_rejection_handles_degenerate_pred_dec() {
    // NaN or negative predicted decreases are not trustworthy evidence of a
    // catastrophic mismatch; the halved default must be kept as-is.
    let derived_radius = 1e-6_f64;
    let halve_default = 0.5_f64;
    for bad_pred in [f64::NAN, -1e10] {
        let r = snap_recovery_radius_on_rejection(100.0, bad_pred, derived_radius, halve_default);
        assert_eq!(r, halve_default);
    }
}
#[test]
fn outer_scaled_tolerance_matches_textbook_form() {
    // Expected form: tol * (1 + |cost|). Identity at zero cost...
    assert!((outer_scaled_tolerance(1e-6, 0.0) - 1e-6).abs() < 1e-18);
    // ...mild inflation for a small cost...
    assert!((outer_scaled_tolerance(1e-6, 0.5) - 1.5e-6).abs() < 1e-18);
    // ...and substantial inflation at biobank-scale objective values.
    let biobank = outer_scaled_tolerance(1e-6, -1.6e5);
    let expected = 1e-6 * (1.0 + 1.6e5);
    assert!((biobank - expected).abs() < 1e-12);
    assert!(
        biobank > 0.1,
        "biobank tolerance must be substantially larger than the absolute 1e-6: got {biobank:.6e}"
    );
}
#[test]
fn outer_scaled_tolerance_is_scale_invariant_at_large_cost() {
    // Dividing out (1 + |v|) must yield the same ratio whatever the
    // objective's absolute scale — i.e. the scaling is purely multiplicative.
    let base_tol = 1e-7;
    let v_base: f64 = 1.0e6;
    let v_scaled: f64 = v_base * 1000.0;
    let ratio_base = outer_scaled_tolerance(base_tol, v_base) / (1.0 + v_base);
    let ratio_scaled = outer_scaled_tolerance(base_tol, v_scaled) / (1.0 + v_scaled);
    assert!(
        (ratio_base - ratio_scaled).abs() < 1e-15,
        "scale ratio must be invariant: base={ratio_base:.3e} scaled={ratio_scaled:.3e}"
    );
}
#[test]
fn pinned_axes_mask_detects_corner_points() {
    // Axes sitting on (or within tolerance of) a bound are pinned; the
    // interior axis is not. Without bounds nothing can be pinned.
    let bounds = (array![0.0, 0.0, -10.0, 0.0], array![1.0, 1.0, 10.0, 1.0]);
    let x = array![0.0, 1.0, 0.0, 1.0 - 1e-15];
    assert_eq!(
        pinned_axes_mask(&x, Some(&bounds)),
        vec![true, true, false, true]
    );
    assert_eq!(pinned_axes_mask(&x, None), vec![false; 4]);
}
#[test]
fn clipped_axes_mask_marks_projection_changes() {
    // Exactly the axes that projection actually moved are flagged.
    let raw = array![1.5, 0.5, -0.2, 0.7];
    let projected = array![1.0, 0.5, 0.0, 0.7];
    assert_eq!(
        clipped_axes_mask(&raw, &projected),
        vec![true, false, true, false]
    );
    // A point already inside the box is untouched, so nothing is flagged.
    assert_eq!(clipped_axes_mask(&projected, &projected), vec![false; 4]);
}
#[test]
fn union_active_with_pinned_is_logical_or() {
    // Merging is a plain element-wise logical OR of the two masks.
    let base = vec![false, true, false, false];
    let pinned = vec![false, false, true, false];
    assert_eq!(
        union_active_with_pinned(&base, &pinned),
        vec![false, true, true, false]
    );
}
#[test]
fn active_set_step_unblocks_corner_clamp_on_4d_quadratic() {
    // Scenario: an interior iterate whose trial step clips axis 0 at its
    // upper bound. The base KKT mask is empty, so only the clip-derived
    // "extra" mask pins axis 0; the active-set retry must then make real
    // progress in the remaining unpinned subspace.
    // Identity Hv operator — the model is a plain quadratic.
    struct IdHess(usize);
    impl OuterHessianOperator for IdHess {
        fn dim(&self) -> usize {
            self.0
        }
        fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
            Ok(v.clone())
        }
    }
    let n = 4;
    let op = IdHess(n);
    // Only axis 0 has a tight box; the others are effectively free.
    let lower = array![0.0, -10.0, -10.0, -10.0];
    let upper = array![1.0, 10.0, 10.0, 10.0];
    let bounds = (lower.clone(), upper.clone());
    let x_k = array![0.5, 0.0, 0.0, 0.0];
    let g = array![-0.3, 0.5, -0.5, 0.3];
    // x_k is strictly interior, so projection must not alter the gradient.
    let g_proj = projected_gradient(&x_k, &g, Some(&bounds));
    for idx in 0..n {
        assert!((g_proj[idx] - g[idx]).abs() < 1e-15);
    }
    let base_active = active_mask(&x_k, &g, Some(&bounds));
    assert_eq!(
        base_active,
        vec![false; n],
        "interior x_k must yield empty KKT active mask"
    );
    // Trial step pushes axis 0 from 0.5 to 1.5 — past the upper bound of 1.
    let s_unclipped = array![1.0, -0.2, 0.2, -0.1];
    let x_raw = &x_k + &s_unclipped;
    let x_proj = project_to_bounds(&x_raw, Some(&bounds));
    let clipped = clipped_axes_mask(&x_raw, &x_proj);
    let pinned = pinned_axes_mask(&x_k, Some(&bounds));
    // Extra mask = clipped ∪ pinned; here only axis 0 should be flagged.
    let extra: Vec<bool> = clipped
        .iter()
        .zip(pinned.iter())
        .map(|(c, p)| *c || *p)
        .collect();
    assert_eq!(extra, vec![true, false, false, false]);
    let trust_radius = 0.5_f64;
    let result = compute_active_set_step(&op, &g_proj, &base_active, &extra, trust_radius)
        .expect("compute_active_set_step must not error");
    let (step, pred, extended) = result.expect(
        "active-set step must produce a feasible reduced-space step on this 4D quadratic",
    );
    // The extended mask must reflect the one pinned axis, the step must stay
    // zero there, predict a decrease, respect the trust radius, and still
    // move in the free subspace.
    assert_eq!(extended, vec![true, false, false, false]);
    assert_eq!(step[0], 0.0, "active-set step must be zero on pinned axis");
    assert!(
        pred > 0.0,
        "expected positive predicted decrease, got {pred}"
    );
    let s_norm = step.dot(&step).sqrt();
    assert!(
        s_norm <= trust_radius * (1.0 + 1e-9),
        "active-set step must respect trust radius: ‖s‖={s_norm:.3e} > Δ={trust_radius:.3e}"
    );
    let reduced_norm = (1..n).map(|i| step[i] * step[i]).sum::<f64>().sqrt();
    assert!(
        reduced_norm > 0.0,
        "active-set step must produce non-zero motion in the unpinned subspace"
    );
}
#[test]
fn active_set_step_returns_none_at_true_corner() {
    // With every axis active there is no free subspace left to move in, so
    // the active-set retry must decline to produce a step.
    struct IdHess(usize);
    impl OuterHessianOperator for IdHess {
        fn dim(&self) -> usize {
            self.0
        }
        fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
            Ok(v.clone())
        }
    }
    let op = IdHess(2);
    let g = array![1.0, -1.0];
    let all_active = vec![true, true];
    let outcome =
        compute_active_set_step(&op, &g, &all_active, &all_active, 0.5).expect("must not error");
    assert!(outcome.is_none());
}
#[test]
fn active_set_step_returns_none_when_extended_matches_base() {
    // If the extra mask pins nothing beyond the base KKT mask, the retry
    // would just repeat the rejected step, so it must return None.
    struct IdHess(usize);
    impl OuterHessianOperator for IdHess {
        fn dim(&self) -> usize {
            self.0
        }
        fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
            Ok(v.clone())
        }
    }
    let op = IdHess(3);
    let g = array![0.1, -0.2, 0.3];
    let base = vec![true, false, false];
    let extra = vec![false, false, false];
    let outcome = compute_active_set_step(&op, &g, &base, &extra, 0.5).expect("must not error");
    assert!(outcome.is_none());
}
// Test double: advertises cheap dense materialization but always fails it,
// so tests can exercise the materialization-failure path at seed validation.
struct FailingSeedMaterializationOperator {
    dim: usize,
}
impl OuterHessianOperator for FailingSeedMaterializationOperator {
    fn dim(&self) -> usize {
        self.dim
    }
    // Identity Hv product — materialization is the only faulty operation.
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        Ok(v.clone())
    }
    fn is_cheap_to_materialize(&self) -> bool {
        true
    }
    fn materialize_dense(&self) -> Result<Array2<f64>, String> {
        Err("seed materialization failed".to_string())
    }
}
// Minimal well-behaved 1-D operator: Hv = v, used where a trivially valid
// Hessian operator is needed.
struct IdentityOuterHessianOperator;
impl OuterHessianOperator for IdentityOuterHessianOperator {
    fn dim(&self) -> usize {
        1
    }
    fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
        Ok(v.clone())
    }
}
#[test]
fn materialize_dense_uses_single_batched_mul_mat() {
    // Operator that counts how it is probed: per-column matvec calls vs.
    // batched mul_mat calls (and how many RHS columns each batch carried).
    struct BatchedOnlyHessian {
        matrix: Array2<f64>,
        matvec_calls: Arc<AtomicUsize>,
        mul_mat_calls: Arc<AtomicUsize>,
        rhs_columns: Arc<AtomicUsize>,
    }
    impl OuterHessianOperator for BatchedOnlyHessian {
        fn dim(&self) -> usize {
            self.matrix.nrows()
        }
        fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
            self.matvec_calls.fetch_add(1, Ordering::Relaxed);
            Ok(self.matrix.dot(v))
        }
        fn mul_mat(&self, factor: ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
            self.mul_mat_calls.fetch_add(1, Ordering::Relaxed);
            self.rhs_columns
                .fetch_add(factor.ncols(), Ordering::Relaxed);
            Ok(self.matrix.dot(&factor))
        }
    }
    let matvec_calls = Arc::new(AtomicUsize::new(0));
    let mul_mat_calls = Arc::new(AtomicUsize::new(0));
    let rhs_columns = Arc::new(AtomicUsize::new(0));
    // Deliberately non-symmetric input matrix.
    let op = BatchedOnlyHessian {
        matrix: array![[2.0, 0.25, -0.5], [0.5, 3.0, 1.0], [-0.25, 2.0, 4.0]],
        matvec_calls: Arc::clone(&matvec_calls),
        mul_mat_calls: Arc::clone(&mul_mat_calls),
        rhs_columns: Arc::clone(&rhs_columns),
    };
    let dense = op
        .materialize_dense()
        .expect("batched dense materialization");
    // Expected output is the symmetrized matrix (A + Aᵀ) / 2 — e.g. the
    // (0,1)/(1,0) entries 0.25 and 0.5 average to 0.375.
    let expected = array![[2.0, 0.375, -0.375], [0.375, 3.0, 1.5], [-0.375, 1.5, 4.0]];
    assert_eq!(dense, expected);
    assert_eq!(
        mul_mat_calls.load(Ordering::Relaxed),
        1,
        "dense materialization must batch all identity columns into one mul_mat call"
    );
    assert_eq!(
        rhs_columns.load(Ordering::Relaxed),
        3,
        "the single batched materialization call must include every identity RHS"
    );
    assert_eq!(
        matvec_calls.load(Ordering::Relaxed),
        0,
        "operators with batched mul_mat must not be probed column-by-column"
    );
}
#[test]
fn plan_analytic_hessian_selects_arc() {
    // Analytic gradient + analytic Hessian routes to the ARC solver with
    // the analytic Hessian source.
    let capability = OuterCapability {
        n_params: 3,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Arc);
    assert_eq!(chosen.hessian_source, HessianSource::Analytic);
}
#[test]
fn plan_prefer_gradient_only_does_not_hide_analytic_hessian() {
    // The prefer_gradient_only hint must not override an available analytic
    // Hessian: ARC is still the right choice.
    let capability = OuterCapability {
        n_params: 3,
        psi_dim: 1,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: true,
        disable_fixed_point: false,
        prefer_gradient_only: true,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Arc);
    assert_eq!(chosen.hessian_source, HessianSource::Analytic);
}
#[test]
fn plan_survival_baseline_exact_hessian_selects_arc() {
    // Survival-baseline capability with an exact Hessian also lands on ARC.
    // NOTE(review): this capability is identical to the one in
    // plan_analytic_hessian_selects_arc — kept for the named scenario.
    let capability = OuterCapability {
        n_params: 3,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Arc);
    assert_eq!(chosen.hessian_source, HessianSource::Analytic);
}
#[test]
fn plan_no_hessian_few_params_selects_bfgs() {
    // Gradient-only with few parameters falls back to BFGS with its
    // approximate Hessian.
    let capability = OuterCapability {
        n_params: 3,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Bfgs);
    assert_eq!(chosen.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn plan_no_hessian_many_params_selects_bfgs() {
    // Without a fixed-point option, even a larger parameter count stays on
    // BFGS when no Hessian is available.
    let capability = OuterCapability {
        n_params: 12,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Bfgs);
    assert_eq!(chosen.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn plan_boundary_8_params_uses_bfgs() {
    // Exactly at the small-problem threshold, BFGS is still selected.
    let capability = OuterCapability {
        n_params: SMALL_OUTER_BFGS_MAX_PARAMS,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Bfgs);
    assert_eq!(chosen.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn plan_boundary_9_params_uses_bfgs() {
    // One past the small-problem threshold still selects BFGS when no
    // fixed-point alternative exists.
    let capability = OuterCapability {
        n_params: SMALL_OUTER_BFGS_MAX_PARAMS + 1,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Bfgs);
    assert_eq!(chosen.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn plan_efs_selected_for_penalty_like_many_params() {
    // Many parameters + fixed-point availability: EFS wins, with the
    // fixed-point Hessian source.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Efs);
    assert_eq!(chosen.hessian_source, HessianSource::EfsFixedPoint);
}
#[test]
fn plan_penalty_like_without_fixed_point_stays_bfgs() {
    // Same shape as the EFS case but with the fixed-point machinery
    // unavailable — the planner must keep BFGS.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Bfgs);
    assert_eq!(chosen.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn plan_efs_not_selected_few_params_even_if_penalty_like() {
    // With only a handful of parameters, BFGS beats EFS even when the
    // fixed-point path is available.
    let capability = OuterCapability {
        n_params: 5,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Bfgs);
    assert_eq!(chosen.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn plan_efs_not_selected_with_analytic_hessian() {
    // An analytic Hessian trumps the fixed-point path: ARC is selected even
    // for a large penalty-like problem.
    let capability = OuterCapability {
        n_params: 20,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: true,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Arc);
}
#[test]
fn plan_efs_with_no_gradient_penalty_like_many_params() {
    // Even without any gradient, a large problem with the fixed-point path
    // available should go to EFS.
    let capability = OuterCapability {
        n_params: 20,
        psi_dim: 0,
        gradient: Derivative::Unavailable,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Efs);
    assert_eq!(chosen.hessian_source, HessianSource::EfsFixedPoint);
}
#[test]
fn plan_efs_allowed_with_barrier_config() {
    // A barrier on some coordinates does not veto EFS at planning time.
    let barrier = BarrierConfig {
        tau: 1e-6,
        constrained_indices: vec![0, 1],
        lower_bounds: vec![0.0, 0.0],
    };
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: Some(barrier),
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Efs);
    assert_eq!(chosen.hessian_source, HessianSource::EfsFixedPoint);
}
#[test]
fn plan_efs_allowed_with_barrier_config_no_gradient() {
    // Barrier present and no gradient at all: EFS is still the plan.
    let barrier = BarrierConfig {
        tau: 1e-6,
        constrained_indices: vec![0],
        lower_bounds: vec![0.0],
    };
    let capability = OuterCapability {
        n_params: 20,
        psi_dim: 0,
        gradient: Derivative::Unavailable,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: Some(barrier),
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Efs);
    assert_eq!(chosen.hessian_source, HessianSource::EfsFixedPoint);
}
#[test]
fn barrier_curvature_significant_blocks_efs_at_runtime() {
    // Near the constrained lower bound the barrier curvature is significant;
    // far from it the barrier contribution is negligible.
    let barrier = BarrierConfig {
        tau: 1e-6,
        constrained_indices: vec![0],
        lower_bounds: vec![0.0],
    };
    let beta_near = array![0.001];
    assert!(barrier.barrier_curvature_is_significant(&beta_near, 1.0, 0.01));
    let beta_far = array![10.0];
    assert!(!barrier.barrier_curvature_is_significant(&beta_far, 1.0, 0.01));
}
#[test]
fn hessian_result_unwrap_analytic() {
    // Unwrapping an analytic result hands back the stored matrix intact.
    let identity = Array2::<f64>::eye(3);
    let wrapped = HessianResult::Analytic(identity.clone());
    assert!(wrapped.is_analytic());
    assert_eq!(wrapped.unwrap_analytic(), identity);
}
#[test]
#[should_panic(expected = "expected analytic Hessian")]
fn hessian_result_unwrap_unavailable_panics() {
    // Unwrapping the Unavailable variant is a programmer error and panics.
    HessianResult::Unavailable.unwrap_analytic();
}
#[test]
fn zero_params_selects_arc() {
    // The degenerate zero-parameter case still plans cleanly onto ARC.
    let capability = OuterCapability {
        n_params: 0,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let chosen = plan(&capability);
    assert_eq!(chosen.solver, Solver::Arc);
    assert_eq!(chosen.hessian_source, HessianSource::Analytic);
}
#[test]
fn hessian_result_into_option() {
    // Analytic maps to Some(matrix); Unavailable maps to None.
    let identity = Array2::<f64>::eye(2);
    assert_eq!(
        HessianResult::Analytic(identity.clone()).into_option(),
        Some(identity)
    );
    assert_eq!(HessianResult::Unavailable.into_option(), None);
}
#[test]
fn closure_objective_delegates() {
    // A ClosureObjective built from plain closures must forward capability()
    // and eval_cost() to the supplied pieces.
    let mut obj = ClosureObjective {
        state: 42_i32,
        cap: OuterCapability {
            gradient: Derivative::Analytic,
            hessian: Derivative::Unavailable,
            n_params: 1,
            psi_dim: 0,
            fixed_point_available: false,
            barrier_config: None,
            prefer_gradient_only: false,
            disable_fixed_point: false,
        },
        // Cost is a constant 1.0 regardless of state or point.
        cost_fn: |_: &mut i32, _: &Array1<f64>| Ok(1.0),
        eval_fn: |_: &mut i32, _: &Array1<f64>| {
            Ok(OuterEval {
                cost: 1.0,
                gradient: Array1::zeros(1),
                hessian: HessianResult::Unavailable,
            })
        },
        // Optional hooks left unset; explicit fn-pointer types satisfy the
        // generic parameters without providing implementations.
        eval_order_fn: None::<
            fn(&mut i32, &Array1<f64>, OuterEvalOrder) -> Result<OuterEval, EstimationError>,
        >,
        reset_fn: Some(|st: &mut i32| {
            *st = 42;
        }),
        efs_fn: None::<fn(&mut i32, &Array1<f64>) -> Result<EfsEval, EstimationError>>,
        screening_proxy_fn: None::<fn(&mut i32, &Array1<f64>) -> Result<f64, EstimationError>>,
    };
    assert_eq!(obj.capability().n_params, 1);
    assert_eq!(obj.eval_cost(&Array1::zeros(1)).unwrap(), 1.0);
}
#[test]
fn hybrid_efs_backtracking_uses_half_step_after_first_rejection() {
    // Scenario: the EFS hook proposes a full unit step on psi (index 11),
    // but the cost function makes psi=1 worse (2.0) than psi=0 (1.0) while
    // psi=0.5 improves (0.5). The bridge must reject the full step and
    // accept the halved one.
    let cap = OuterCapability {
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        n_params: 12,
        psi_dim: 1,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    let mut obj = ClosureObjective {
        state: (),
        cap: cap.clone(),
        // Piecewise cost over psi: 1.0 at 0, 0.5 at 0.5, 2.0 elsewhere.
        cost_fn: |_: &mut (), theta: &Array1<f64>| {
            let psi = theta[11];
            let cost = if (psi - 0.0).abs() < 1e-12 {
                1.0
            } else if (psi - 0.5).abs() < 1e-12 {
                0.5
            } else {
                2.0
            };
            Ok(cost)
        },
        eval_fn: |_: &mut (), theta: &Array1<f64>| {
            Ok(OuterEval {
                cost: theta[11].abs(),
                gradient: Array1::zeros(theta.len()),
                hessian: HessianResult::Unavailable,
            })
        },
        eval_order_fn: None::<
            fn(&mut (), &Array1<f64>, OuterEvalOrder) -> Result<OuterEval, EstimationError>,
        >,
        reset_fn: None::<fn(&mut ())>,
        // EFS hook: propose a full +1.0 step on psi only.
        efs_fn: Some(|_: &mut (), theta: &Array1<f64>| {
            let mut steps = vec![0.0; theta.len()];
            steps[11] = 1.0;
            Ok(EfsEval {
                cost: 1.0,
                steps,
                beta: None,
                psi_gradient: Some(array![1.0]),
                psi_indices: Some(vec![11]),
            })
        }),
        screening_proxy_fn: None::<fn(&mut (), &Array1<f64>) -> Result<f64, EstimationError>>,
    };
    let mut bridge = OuterFixedPointBridge {
        obj: &mut obj,
        layout: cap.theta_layout(),
        barrier_config: None,
        consecutive_psi_zero_iters: 0,
    };
    let sample = bridge
        .eval_step(&Array1::zeros(cap.n_params))
        .expect("hybrid EFS step should backtrack cleanly");
    assert_eq!(sample.status, FixedPointStatus::Continue);
    assert_eq!(sample.step.len(), cap.n_params);
    // The accepted step is the halved proposal on psi...
    assert_eq!(sample.step[11], 0.5);
    // ...and every other coordinate remains untouched.
    assert!(
        sample
            .step
            .iter()
            .enumerate()
            .all(|(idx, &value)| idx == 11 || value == 0.0)
    );
}
#[test]
fn run_bfgs_mode_aware_eval_skips_hessian_work() {
    // BFGS has no use for a Hessian, so the order-aware eval hook must be
    // invoked with ValueAndGradient only — never the Hessian order — and the
    // legacy eager eval must never be reached.
    let seen_orders = Arc::new(Mutex::new(Vec::new()));
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Unavailable)
        .with_initial_rho(array![1.0])
        .with_max_iter(1);
    let mut obj = problem.build_objective_with_eval_order(
        (),
        |_: &mut (), theta: &Array1<f64>| Ok(theta[0] * theta[0]),
        // Legacy eager eval path: erroring here proves it is never called.
        |_: &mut (), _: &Array1<f64>| {
            Err(EstimationError::InvalidInput(
                "legacy eager eval should not run on BFGS".to_string(),
            ))
        },
        // Order-aware eval: record each requested order for later assertion.
        {
            let seen_orders = Arc::clone(&seen_orders);
            move |_: &mut (), theta: &Array1<f64>, order: OuterEvalOrder| {
                seen_orders.lock().unwrap().push(order);
                Ok(OuterEval {
                    cost: theta[0] * theta[0],
                    gradient: array![2.0 * theta[0]],
                    hessian: HessianResult::Unavailable,
                })
            }
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let result = problem
        .run(&mut obj, "mode-aware bfgs first order")
        .expect("BFGS should use the order-aware first-order bridge");
    assert_eq!(result.plan_used.solver, Solver::Bfgs);
    let seen_orders = seen_orders.lock().unwrap();
    assert!(
        !seen_orders.is_empty(),
        "mode-aware eval hook should have been used"
    );
    assert!(
        seen_orders
            .iter()
            .all(|order| *order == OuterEvalOrder::ValueAndGradient),
        "BFGS should request only value+gradient, saw {seen_orders:?}"
    );
}
#[test]
fn outer_second_order_bridge_separates_first_and_second_order_requests() {
    // Records the OuterEvalOrder of every eval so we can assert the bridge
    // only escalates to Hessian work on the explicit second-order entry point.
    let seen_orders = Arc::new(Mutex::new(Vec::new()));
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Analytic);
    let mut obj = problem.build_objective_with_eval_order(
        (),
        |_: &mut (), theta: &Array1<f64>| Ok(theta[0] * theta[0]),
        // Legacy eager hook must be bypassed when the order-aware hook exists.
        |_: &mut (), _: &Array1<f64>| {
            Err(EstimationError::InvalidInput(
                "legacy eager eval should not run".to_string(),
            ))
        },
        {
            let seen_orders = Arc::clone(&seen_orders);
            move |_: &mut (), theta: &Array1<f64>, order: OuterEvalOrder| {
                seen_orders.lock().unwrap().push(order);
                Ok(OuterEval {
                    cost: theta[0] * theta[0],
                    gradient: array![2.0 * theta[0]],
                    // Materialize the analytic Hessian only when requested.
                    hessian: match order {
                        OuterEvalOrder::ValueAndGradient => HessianResult::Unavailable,
                        OuterEvalOrder::ValueGradientHessian => {
                            HessianResult::Analytic(array![[2.0]])
                        }
                    },
                })
            }
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let mut bridge = OuterSecondOrderBridge {
        obj: &mut obj,
        layout: OuterThetaLayout::new(1, 0),
        hessian_source: HessianSource::Analytic,
        materialize_operator_max_dim: OUTER_HVP_MATERIALIZE_MAX_DIM,
        eval_count: 0,
        outer_inner_cap: None,
        g_norm_initial: None,
        last_g_norm: None,
    };
    // First-order call at theta = 1: f = 1, f' = 2, no Hessian requested.
    let grad_sample =
        FirstOrderObjective::eval_grad(&mut bridge, &array![1.0]).expect("grad eval");
    assert_eq!(grad_sample.value, 1.0);
    assert_eq!(grad_sample.gradient, array![2.0]);
    // Second-order call at the same point: the Hessian is now populated.
    let hess_sample =
        SecondOrderObjective::eval_hessian(&mut bridge, &array![1.0]).expect("hessian eval");
    assert_eq!(hess_sample.value, 1.0);
    assert_eq!(hess_sample.gradient, array![2.0]);
    assert_eq!(hess_sample.hessian, Some(array![[2.0]]));
    let seen_orders = seen_orders.lock().unwrap();
    assert!(
        *seen_orders
            == vec![
                OuterEvalOrder::ValueAndGradient,
                OuterEvalOrder::ValueGradientHessian
            ],
        "second-order bridge should split first-order and second-order requests, saw {seen_orders:?}"
    );
}
#[test]
fn outer_config_default() {
    // Pin the documented defaults so accidental changes are caught in review.
    let defaults = OuterConfig::default();
    assert_eq!(defaults.tolerance, 1e-5);
    assert_eq!(defaults.max_iter, 200);
    assert_eq!(defaults.rho_bound, 30.0);
}
#[test]
fn plan_hybrid_efs_selected_for_psi_coords_many_params() {
    // A psi coordinate plus many parameters and a usable fixed point should
    // route the planner onto the hybrid EFS arm.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 1,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::HybridEfs);
    assert_eq!(planned.hessian_source, HessianSource::HybridEfsFixedPoint);
}
#[test]
fn plan_psi_without_fixed_point_stays_bfgs() {
    // Same shape as the hybrid-EFS case but with no fixed point available:
    // the planner has to stay on BFGS with its approximate Hessian.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 1,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: false,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Bfgs);
    assert_eq!(planned.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn plan_hybrid_efs_no_gradient_selected_for_psi_coords() {
    // Even without a gradient, psi coordinates plus an available fixed point
    // keep the planner on the hybrid EFS solver.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 1,
        gradient: Derivative::Unavailable,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::HybridEfs);
    assert_eq!(planned.hessian_source, HessianSource::HybridEfsFixedPoint);
}
/// Builds a routing-test capability: no psi coordinates, no fixed point,
/// and every optional planner hint switched off.
fn cap_for_routing(
    gradient: Derivative,
    hessian: Derivative,
    n_params: usize,
) -> OuterCapability {
    OuterCapability {
        n_params,
        psi_dim: 0,
        gradient,
        hessian,
        fixed_point_available: false,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    }
}
#[test]
fn routing_analytic_analytic_stays_arc_at_biobank_scale() {
    // Fully analytic derivatives at biobank scale stay on ARC + exact Hessian.
    let capability = cap_for_routing(Derivative::Analytic, Derivative::Analytic, 6);
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Arc);
    assert_eq!(planned.hessian_source, HessianSource::Analytic);
}
#[test]
fn routing_analytic_analytic_stays_arc_at_dense_work_scale() {
    // A small dense-work parameter count does not change the ARC routing.
    let capability = cap_for_routing(Derivative::Analytic, Derivative::Analytic, 3);
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Arc);
    assert_eq!(planned.hessian_source, HessianSource::Analytic);
}
#[test]
fn routing_unavailable_hessian_routes_to_bfgs() {
    // No Hessian and no fixed point leaves BFGS as the only viable solver.
    let capability = cap_for_routing(Derivative::Analytic, Derivative::Unavailable, 8);
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Bfgs);
    assert_eq!(planned.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn routing_explicit_prefer_gradient_only_does_not_override_exact_hessian() {
    // The prefer_gradient_only hint must not demote an exact-Hessian plan.
    let mut capability = cap_for_routing(Derivative::Analytic, Derivative::Analytic, 6);
    capability.prefer_gradient_only = true;
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Arc);
    assert_eq!(planned.hessian_source, HessianSource::Analytic);
}
#[test]
fn routing_log_line_arc_analytic_advertises_matrix_free() {
    // ARC with an analytic Hessian is the matrix-free configuration and its
    // routing log line should advertise all three facts.
    let arc_plan = OuterPlan {
        solver: Solver::Arc,
        hessian_source: HessianSource::Analytic,
    };
    let line = arc_plan.routing_log_line();
    for needle in ["solver=Arc", "hessian=Analytic", "matrix-free=true"] {
        assert!(line.contains(needle), "got {line}");
    }
}
#[test]
fn routing_log_line_bfgs_reports_no_matrix_free() {
    // BFGS with an approximate Hessian must not advertise matrix-free mode.
    let bfgs_plan = OuterPlan {
        solver: Solver::Bfgs,
        hessian_source: HessianSource::BfgsApprox,
    };
    let line = bfgs_plan.routing_log_line();
    for needle in ["solver=Bfgs", "hessian=BfgsApprox", "matrix-free=false"] {
        assert!(line.contains(needle), "got {line}");
    }
}
#[test]
fn routing_log_line_efs_reports_no_matrix_free() {
    // Neither EFS Hessian source may advertise matrix-free operation.
    let sources = [
        HessianSource::EfsFixedPoint,
        HessianSource::HybridEfsFixedPoint,
    ];
    for source in sources {
        let efs_plan = OuterPlan {
            solver: Solver::Efs,
            hessian_source: source,
        };
        assert!(
            efs_plan.routing_log_line().contains("matrix-free=false"),
            "{:?} should not advertise matrix-free",
            source
        );
    }
}
#[test]
fn routing_custom_family_gamlss_stays_on_arc_when_both_derivs_analytic() {
    // GAMLSS-shaped capability (4 params, both derivatives analytic) → ARC.
    let capability = cap_for_routing(Derivative::Analytic, Derivative::Analytic, 4);
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Arc);
    assert_eq!(planned.hessian_source, HessianSource::Analytic);
}
#[test]
fn routing_matern_iso_kappa_stays_on_arc_when_both_derivs_analytic() {
    // Matern-iso-kappa-shaped capability (5 params, fully analytic) → ARC.
    let capability = cap_for_routing(Derivative::Analytic, Derivative::Analytic, 5);
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Arc);
    assert_eq!(planned.hessian_source, HessianSource::Analytic);
}
#[test]
fn routing_matern_iso_large_kappa_dim_stays_on_arc_with_analytic_hessian() {
    // A large psi dimension with a usable fixed point must still lose to an
    // exact analytic Hessian: ARC keeps priority over the EFS arms.
    let capability = OuterCapability {
        n_params: 37,
        psi_dim: 31,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Arc);
    assert_eq!(planned.hessian_source, HessianSource::Analytic);
}
#[test]
fn routing_marginal_slope_stays_on_arc_when_both_derivs_analytic() {
    // Marginal-slope-shaped capability (3 params, fully analytic) → ARC.
    let capability = cap_for_routing(Derivative::Analytic, Derivative::Analytic, 3);
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Arc);
    assert_eq!(planned.hessian_source, HessianSource::Analytic);
}
#[test]
fn plan_hybrid_efs_not_selected_few_params() {
    // With only a handful of parameters the hybrid EFS arm is not worth it;
    // the planner should keep plain BFGS.
    let capability = OuterCapability {
        n_params: 5,
        psi_dim: 1,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Bfgs);
    assert_eq!(planned.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn plan_exact_hvp_capability_selects_arc_even_when_fixed_point_is_available() {
    // Exact second-order information beats the fixed-point arm even when the
    // caller hints prefer_gradient_only and a fixed point exists.
    let capability = OuterCapability {
        n_params: 64,
        psi_dim: 16,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: true,
        disable_fixed_point: false,
    };
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Arc);
    assert_eq!(planned.hessian_source, HessianSource::Analytic);
}
#[test]
fn plan_hybrid_efs_not_selected_with_analytic_hessian() {
    // An analytic Hessian keeps ARC in charge even when the shape would
    // otherwise qualify for hybrid EFS.
    let capability = OuterCapability {
        n_params: 20,
        psi_dim: 1,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Arc);
}
#[test]
fn plan_pure_efs_not_hybrid_when_all_penalty_like() {
    // psi_dim == 0 means all coordinates are penalty-like: the pure EFS
    // solver is selected rather than the hybrid variant.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Efs);
    assert_eq!(planned.hessian_source, HessianSource::EfsFixedPoint);
}
#[test]
fn automatic_fallbacks_preserve_analytic_hessian_for_arc_primary() {
    // Fully analytic capability: ARC is primary and the automatic fallback
    // ladder must stay empty — ARC budget retries are the runner's job.
    let capability = OuterCapability {
        n_params: 12,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: false,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    assert_eq!(plan(&capability).solver, Solver::Arc);
    let attempts = automatic_fallback_attempts(&capability);
    assert!(
        attempts.is_empty(),
        "ARC primary must not lateral-demote to BFGS+BfgsApprox; \
        ARC budget retries live in the runner",
    );
}
#[test]
fn automatic_fallbacks_from_efs_prefer_analytic_bfgs_over_fd() {
    // Pure-EFS primary: its first fallback must keep the analytic gradient
    // and disable the fixed point, landing on BFGS — never finite differences.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    assert_eq!(plan(&capability).solver, Solver::Efs);
    let attempts = automatic_fallback_attempts(&capability);
    assert!(!attempts.is_empty(), "EFS failure must have a fallback");
    let first = &attempts[0];
    assert_eq!(first.gradient, Derivative::Analytic);
    assert_eq!(first.hessian, Derivative::Unavailable);
    assert!(first.disable_fixed_point);
    assert_eq!(plan(first).solver, Solver::Bfgs);
    assert!(
        attempts.iter().all(|c| c.gradient == Derivative::Analytic),
        "fallback cascade must stay on analytic-gradient attempts",
    );
}
#[test]
fn automatic_fallbacks_from_hybrid_efs_prefer_analytic_bfgs_over_fd() {
    // Hybrid-EFS primary: the first fallback keeps the analytic gradient and
    // disables the fixed point, landing on BFGS.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 2,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    assert_eq!(plan(&capability).solver, Solver::HybridEfs);
    let attempts = automatic_fallback_attempts(&capability);
    assert!(!attempts.is_empty());
    let first = &attempts[0];
    assert_eq!(first.gradient, Derivative::Analytic);
    assert!(first.disable_fixed_point);
    assert_eq!(plan(first).solver, Solver::Bfgs);
}
#[test]
fn disabled_fallback_hybrid_efs_capability_routes_to_bfgs_primary() {
    // A capability that would plan HybridEfs but whose failure escape hatch
    // is EFS→BFGS; with FallbackPolicy::Disabled that escape must be applied
    // up front by primary_capability_for_config.
    let trapped_cap = OuterCapability {
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        n_params: 9,
        psi_dim: 6,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    assert_eq!(plan(&trapped_cap).solver, Solver::HybridEfs);
    let disabled_config = OuterConfig {
        fallback_policy: FallbackPolicy::Disabled,
        ..OuterConfig::default()
    };
    // Disabled policy: the HybridEfs shape is canonicalized to BFGS directly.
    let primary_cap = primary_capability_for_config(
        trapped_cap.clone(),
        &disabled_config,
        "biobank exact adaptive",
    );
    assert!(primary_cap.disable_fixed_point);
    assert_eq!(plan(&primary_cap).solver, Solver::Bfgs);
    // Pure EFS (psi_dim == 0) is left untouched even under Disabled policy.
    let pure_efs_cap = OuterCapability {
        psi_dim: 0,
        ..trapped_cap.clone()
    };
    assert_eq!(plan(&pure_efs_cap).solver, Solver::Efs);
    let pure_primary_cap =
        primary_capability_for_config(pure_efs_cap.clone(), &disabled_config, "pure EFS");
    assert!(!pure_primary_cap.disable_fixed_point);
    assert_eq!(plan(&pure_primary_cap).solver, Solver::Efs);
    // Without a gradient there is no BFGS to canonicalize to: HybridEfs stays.
    let no_gradient_cap = OuterCapability {
        gradient: Derivative::Unavailable,
        ..trapped_cap.clone()
    };
    assert_eq!(plan(&no_gradient_cap).solver, Solver::HybridEfs);
    let no_gradient_primary_cap = primary_capability_for_config(
        no_gradient_cap.clone(),
        &disabled_config,
        "gradient-unavailable hybrid EFS",
    );
    assert!(!no_gradient_primary_cap.disable_fixed_point);
    assert_eq!(plan(&no_gradient_primary_cap).solver, Solver::HybridEfs);
    // Automatic policy keeps the HybridEfs primary and defers the BFGS
    // escape to the fallback-attempt ladder instead.
    let automatic_config = OuterConfig::default();
    let automatic_cap = primary_capability_for_config(
        trapped_cap.clone(),
        &automatic_config,
        "biobank exact adaptive",
    );
    assert!(!automatic_cap.disable_fixed_point);
    assert_eq!(plan(&automatic_cap).solver, Solver::HybridEfs);
    let automatic_attempts = automatic_fallback_attempts(&trapped_cap);
    assert!(!automatic_attempts.is_empty());
    assert!(automatic_attempts[0].disable_fixed_point);
    assert_eq!(plan(&automatic_attempts[0]).solver, Solver::Bfgs);
}
#[test]
fn disabled_fallback_hybrid_efs_problem_uses_bfgs_without_calling_efs() {
    // Counts EFS hook invocations; with FallbackPolicy::Disabled the
    // HybridEFS-shaped problem must be canonicalized to BFGS up front, so
    // this counter must stay at zero.
    let efs_calls = Arc::new(AtomicUsize::new(0));
    let problem = OuterProblem::new(9)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Unavailable)
        .with_psi_dim(6)
        .with_fallback_policy(FallbackPolicy::Disabled)
        .with_initial_rho(Array1::zeros(9))
        .with_max_iter(5);
    let mut obj = problem.build_objective(
        (),
        |_: &mut (), theta: &Array1<f64>| Ok(0.5 * theta.dot(theta)),
        |_: &mut (), theta: &Array1<f64>| {
            Ok(OuterEval {
                cost: 0.5 * theta.dot(theta),
                gradient: theta.clone(),
                hessian: HessianResult::Unavailable,
            })
        },
        None::<fn(&mut ())>,
        {
            let efs_calls = Arc::clone(&efs_calls);
            // EFS hook: record the call, then fail with the marker that would
            // normally trigger the first-order fallback path.
            Some(move |_: &mut (), _: &Array1<f64>| {
                efs_calls.fetch_add(1, Ordering::Relaxed);
                Err(EstimationError::RemlOptimizationFailed(format!(
                    "{} synthetic biobank adaptive HybridEFS escape",
                    EFS_FIRST_ORDER_FALLBACK_MARKER,
                )))
            })
        },
    );
    let result = problem
        .run(&mut obj, "disabled fallback marker")
        .expect("disabled-fallback HybridEFS-shaped problem should route directly to BFGS");
    assert_eq!(result.plan_used.solver, Solver::Bfgs);
    assert_eq!(
        efs_calls.load(Ordering::Relaxed),
        0,
        "central primary-capability canonicalization should avoid the EFS hook entirely"
    );
}
#[test]
fn automatic_fallbacks_without_gradient_stop_at_fixed_point_status() {
    // Without a gradient there is nothing for BFGS to run on, so neither the
    // pure-EFS nor the hybrid-EFS shape may synthesize a fallback attempt.
    let cases = [(0, Solver::Efs), (2, Solver::HybridEfs)];
    for (psi_dim, expected_solver) in cases {
        let capability = OuterCapability {
            n_params: 15,
            psi_dim,
            gradient: Derivative::Unavailable,
            hessian: Derivative::Unavailable,
            fixed_point_available: true,
            barrier_config: None,
            prefer_gradient_only: false,
            disable_fixed_point: false,
        };
        assert_eq!(plan(&capability).solver, expected_solver);
        assert!(
            automatic_fallback_attempts(&capability).is_empty(),
            "gradient-unavailable fixed-point capabilities must not fabricate a BFGS fallback",
        );
    }
}
#[test]
fn automatic_fallbacks_do_not_repeat_arc_when_fixed_point_is_irrelevant() {
    // ARC primary with an incidental fixed point: the fallback ladder stays
    // empty rather than re-running ARC or demoting through the EFS arm.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: false,
    };
    assert_eq!(plan(&capability).solver, Solver::Arc);
    let attempts = automatic_fallback_attempts(&capability);
    assert!(
        attempts.is_empty(),
        "ARC primary with incidental fixed_point_available must not \
        cascade through the EFS arm or lateral-demote to BFGS",
    );
}
#[test]
fn plan_disable_fixed_point_forces_bfgs_even_when_efs_eligible() {
    // The disable_fixed_point override wins even when the shape would
    // otherwise qualify for the pure EFS arm.
    let capability = OuterCapability {
        n_params: 15,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        barrier_config: None,
        prefer_gradient_only: false,
        disable_fixed_point: true,
    };
    let planned = plan(&capability);
    assert_eq!(planned.solver, Solver::Bfgs);
    assert_eq!(planned.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn run_malformed_gradient_seed_surfaces_as_error() {
    let problem = OuterProblem::new(2)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Unavailable)
        .with_initial_rho(Array1::zeros(2))
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        |_: &mut (), _: &Array1<f64>| Ok(0.0),
        // Deliberately malformed: a 2-parameter problem returning a
        // length-1 gradient.
        |_: &mut (), _: &Array1<f64>| {
            Ok(OuterEval {
                cost: 0.0,
                gradient: Array1::zeros(1),
                hessian: HessianResult::Unavailable,
            })
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    // The mismatch must surface as a hard error, not be silently tolerated.
    let err = problem
        .run(&mut obj, "test gradient mismatch")
        .expect_err("malformed analytic gradient must surface as error");
    assert!(
        matches!(err, EstimationError::RemlOptimizationFailed(_)),
        "unexpected error variant: {err:?}",
    );
}
#[test]
fn run_bfgs_ignores_malformed_hessian_payload() {
    // Hessian is declared Unavailable, so the plan is BFGS; the eval hook
    // nevertheless returns a garbage (NaN, wrong-sized) Hessian payload,
    // which a first-order solver must simply ignore.
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Unavailable)
        .with_initial_rho(array![0.0])
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        |_: &mut (), theta: &Array1<f64>| Ok(theta[0] * theta[0]),
        |_: &mut (), theta: &Array1<f64>| {
            Ok(OuterEval {
                cost: theta[0] * theta[0],
                gradient: array![2.0 * theta[0]],
                // 2x2 NaN-bearing payload on a 1-parameter problem.
                hessian: HessianResult::Analytic(array![[f64::NAN, 0.0], [0.0, 1.0]]),
            })
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let result = problem
        .run(&mut obj, "bfgs should ignore malformed hessian payload")
        .expect("valid first-order data should be enough for BFGS");
    assert_eq!(result.plan_used.solver, Solver::Bfgs);
    assert_eq!(result.plan_used.hessian_source, HessianSource::BfgsApprox);
}
#[test]
fn finite_outer_eval_reports_gradient_length_mismatch() {
    // A 2-parameter layout paired with a length-1 gradient must be rejected.
    let err = finite_outer_eval_or_error(
        "test gradient mismatch",
        OuterThetaLayout::new(2, 0),
        OuterEval {
            cost: 0.0,
            gradient: Array1::zeros(1),
            hessian: HessianResult::Unavailable,
        },
    )
    .expect_err("gradient mismatch should be rejected");
    // Both error classes carry the same payload; extract it either way.
    let (ObjectiveEvalError::Recoverable { message }
    | ObjectiveEvalError::Fatal { message }) = err;
    assert!(
        message.contains("outer gradient length mismatch"),
        "unexpected error: {message}"
    );
}
#[test]
fn run_with_initial_seed_still_considers_generated_candidates() {
    // The objective is finite only at the first generator-produced seed; the
    // explicit (bad) initial seed must not crowd that candidate out.
    let generated = crate::seeding::generate_rho_candidates(
        1,
        None,
        &crate::seeding::SeedConfig::default(),
    );
    let valid_seed = generated
        .first()
        .expect("seed generator should yield at least one candidate")
        .clone();
    let expected_seed = valid_seed.clone();
    // Deliberately far from the only finite-cost point.
    let initial_seed = array![9.0];
    let mut seed_config = crate::seeding::SeedConfig::default();
    seed_config.seed_budget = 1;
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Unavailable)
        .with_seed_config(seed_config)
        .with_initial_rho(initial_seed)
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        {
            let valid_seed = valid_seed.clone();
            move |_: &mut (), theta: &Array1<f64>| {
                if theta == &valid_seed {
                    Ok(0.0)
                } else {
                    Ok(f64::INFINITY)
                }
            }
        },
        move |_: &mut (), theta: &Array1<f64>| {
            if theta == &valid_seed {
                Ok(OuterEval {
                    cost: 0.0,
                    gradient: Array1::zeros(1),
                    hessian: HessianResult::Unavailable,
                })
            } else {
                Ok(OuterEval::infeasible(theta.len()))
            }
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let result = problem
        .run(&mut obj, "generated seed should remain reachable")
        .expect("generated seed should still be eligible when an initial seed is provided");
    assert_eq!(result.rho, expected_seed);
}
#[test]
fn run_indefinite_analytic_seed_stays_on_arc() {
    let mut seed_config = crate::seeding::SeedConfig::default();
    seed_config.seed_budget = 1;
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Analytic)
        .with_seed_config(seed_config)
        .with_initial_rho(array![0.0])
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        |_: &mut (), theta: &Array1<f64>| Ok(theta[0] * theta[0]),
        |_: &mut (), _: &Array1<f64>| {
            Ok(OuterEval {
                cost: 0.0,
                gradient: array![0.0],
                // Indefinite (negative) curvature at the seed point.
                hessian: HessianResult::Analytic(array![[-1.0]]),
            })
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    // Indefinite seed geometry must not knock the runner off the
    // second-order (ARC + analytic Hessian) plan.
    let result = problem
        .run(&mut obj, "indefinite seed geometry")
        .expect("indefinite analytic seed geometry should stay on the second-order plan");
    assert_eq!(result.plan_used.solver, Solver::Arc);
    assert_eq!(result.plan_used.hessian_source, HessianSource::Analytic);
}
#[test]
fn run_seed_materialization_failure_surfaces_arc_error_verbatim() {
    let mut seed_config = crate::seeding::SeedConfig::default();
    seed_config.seed_budget = 1;
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Analytic)
        .with_seed_config(seed_config)
        .with_initial_rho(array![0.0])
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        |_: &mut (), theta: &Array1<f64>| Ok(theta[0] * theta[0]),
        |_: &mut (), _: &Array1<f64>| {
            Ok(OuterEval {
                cost: 0.0,
                gradient: array![0.0],
                // Operator whose dense materialization always fails.
                hessian: HessianResult::Operator(Arc::new(
                    FailingSeedMaterializationOperator { dim: 1 },
                )),
            })
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let err = problem
        .run(&mut obj, "seed materialization failure")
        .expect_err(
            "ARC primary must surface the materialization failure verbatim — \
            no lateral demote to BFGS+BfgsApprox",
        );
    // The operator's own message must be preserved through the error chain.
    let msg = err.to_string();
    assert!(
        msg.contains("seed materialization failed"),
        "error must propagate the underlying materialization message; got: {msg}"
    );
}
#[test]
fn run_nonconverged_arc_stays_on_arc_after_budget_retry_ladder() {
    let mut seed_config = crate::seeding::SeedConfig::default();
    seed_config.seed_budget = 1;
    // Quartic objective far from its minimum with max_iter = 1: the ARC
    // budget ladder cannot converge within the allotted iterations.
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Analytic)
        .with_seed_config(seed_config)
        .with_initial_rho(array![5.0])
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        |_: &mut (), theta: &Array1<f64>| Ok(theta[0].powi(4)),
        |_: &mut (), theta: &Array1<f64>| {
            let x = theta[0];
            Ok(OuterEval {
                cost: x.powi(4),
                gradient: array![4.0 * x.powi(3)],
                hessian: HessianResult::Analytic(array![[12.0 * x.powi(2)]]),
            })
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let result = problem
        .run(&mut obj, "nonconverged arc should stay on arc")
        .expect(
            "ARC ladder must surface the last non-converged ARC result rather than \
            demoting to BFGS+BfgsApprox",
        );
    assert_eq!(
        result.plan_used.solver,
        Solver::Arc,
        "ARC primary must not lateral-demote after budget exhaustion"
    );
    assert_eq!(
        result.plan_used.hessian_source,
        HessianSource::Analytic,
        "analytic outer Hessian must be preserved across the budget-bump retry ladder"
    );
    assert!(
        !result.converged,
        "test fixture is engineered so the ladder cannot converge; \
        converged=true would mean the fixture stopped exercising the ladder"
    );
}
#[test]
fn arc_budget_retry_preserves_operator_trust_radius() {
    // Objective whose cost-only eval records |theta| of every non-seed trial
    // point, so trial step sizes expose the trust radius in effect.
    // cost = -theta[0] with gradient fixed at +1 means every trial is
    // rejected, shrinking the trust radius.
    struct RejectingArcObjective {
        trial_abs: Arc<std::sync::Mutex<Vec<f64>>>,
    }
    impl OuterObjective for RejectingArcObjective {
        fn capability(&self) -> OuterCapability {
            OuterCapability {
                gradient: Derivative::Analytic,
                hessian: Derivative::Analytic,
                n_params: 1,
                psi_dim: 0,
                fixed_point_available: false,
                barrier_config: None,
                prefer_gradient_only: false,
                disable_fixed_point: false,
            }
        }
        fn eval_cost(&mut self, theta: &Array1<f64>) -> Result<f64, EstimationError> {
            // Skip the seed point (theta == 0) so only trial steps are logged.
            if theta[0] != 0.0 {
                self.trial_abs
                    .lock()
                    .expect("trial recorder mutex poisoned")
                    .push(theta[0].abs());
            }
            Ok(-theta[0])
        }
        fn eval(&mut self, theta: &Array1<f64>) -> Result<OuterEval, EstimationError> {
            self.eval_with_order(theta, OuterEvalOrder::ValueGradientHessian)
        }
        fn eval_with_order(
            &mut self,
            theta: &Array1<f64>,
            _: OuterEvalOrder,
        ) -> Result<OuterEval, EstimationError> {
            Ok(OuterEval {
                cost: -theta[0],
                gradient: array![1.0],
                hessian: HessianResult::Operator(Arc::new(IdentityOuterHessianOperator)),
            })
        }
        fn reset(&mut self) {}
    }
    let trial_abs = Arc::new(std::sync::Mutex::new(Vec::<f64>::new()));
    let mut obj = RejectingArcObjective {
        trial_abs: Arc::clone(&trial_abs),
    };
    let seed = array![0.0];
    let layout = OuterThetaLayout::new(1, 0);
    let plan = OuterPlan {
        solver: Solver::Arc,
        hessian_source: HessianSource::Analytic,
    };
    let initial_eval = obj
        .eval_with_order(&seed, OuterEvalOrder::ValueGradientHessian)
        .expect("initial eval");
    // First attempt: 1-iteration budget, default trust radius (no resume).
    let first = run_operator_trust_region(
        &mut obj,
        &seed,
        layout,
        None,
        1e-12,
        1,
        initial_eval,
        plan,
        None,
    )
    .expect("first operator ARC attempt");
    assert_eq!(
        first.operator_stop_reason,
        Some(OperatorTrustRegionStopReason::IterationBudget)
    );
    let retry_eval = obj
        .eval_with_order(&first.rho, OuterEvalOrder::ValueGradientHessian)
        .expect("retry eval");
    // Retry: resume from the (shrunken) radius carried in the first result.
    let result = run_operator_trust_region(
        &mut obj,
        &first.rho,
        layout,
        None,
        1e-12,
        1,
        retry_eval,
        plan,
        first.operator_trust_radius,
    )
    .expect("retry operator ARC attempt");
    assert_eq!(
        result.operator_stop_reason,
        Some(OperatorTrustRegionStopReason::IterationBudget)
    );
    // Exactly one trial per attempt; the second trial must be half-length,
    // proving the shrunken radius was resumed rather than reset to 1.
    let trials = trial_abs.lock().expect("trial recorder mutex poisoned");
    assert_eq!(trials.len(), 2, "unexpected trial evaluations: {trials:?}");
    assert!(
        (trials[0] - 1.0).abs() < 1e-12,
        "first ARC attempt should use the default trust radius, got {trials:?}"
    );
    assert!(
        (trials[1] - 0.5).abs() < 1e-12,
        "retry must resume from the shrunken trust radius instead of replaying radius=1, got {trials:?}"
    );
}
#[test]
fn operator_arc_rejected_trial_uses_cost_only() {
    // Counts cost-only evals separately from full (gradient+Hessian) evals
    // so we can prove a rejected trial never pays for derivative work.
    // cost = |theta[0]| with gradient fixed at +1 guarantees rejection.
    struct CountingRejectedTrialObjective {
        cost_calls: Arc<std::sync::atomic::AtomicUsize>,
        full_calls: Arc<std::sync::atomic::AtomicUsize>,
    }
    impl OuterObjective for CountingRejectedTrialObjective {
        fn capability(&self) -> OuterCapability {
            OuterCapability {
                gradient: Derivative::Analytic,
                hessian: Derivative::Analytic,
                n_params: 1,
                psi_dim: 0,
                fixed_point_available: false,
                barrier_config: None,
                prefer_gradient_only: false,
                disable_fixed_point: false,
            }
        }
        fn eval_cost(&mut self, theta: &Array1<f64>) -> Result<f64, EstimationError> {
            self.cost_calls
                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            Ok(theta[0].abs())
        }
        fn eval(&mut self, theta: &Array1<f64>) -> Result<OuterEval, EstimationError> {
            self.eval_with_order(theta, OuterEvalOrder::ValueGradientHessian)
        }
        fn eval_with_order(
            &mut self,
            theta: &Array1<f64>,
            _: OuterEvalOrder,
        ) -> Result<OuterEval, EstimationError> {
            self.full_calls
                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            Ok(OuterEval {
                cost: theta[0].abs(),
                gradient: array![1.0],
                hessian: HessianResult::Operator(Arc::new(IdentityOuterHessianOperator)),
            })
        }
        fn reset(&mut self) {}
    }
    let cost_calls = Arc::new(std::sync::atomic::AtomicUsize::new(0));
    let full_calls = Arc::new(std::sync::atomic::AtomicUsize::new(0));
    let mut obj = CountingRejectedTrialObjective {
        cost_calls: Arc::clone(&cost_calls),
        full_calls: Arc::clone(&full_calls),
    };
    let seed = array![0.0];
    let initial_eval = obj
        .eval_with_order(&seed, OuterEvalOrder::ValueGradientHessian)
        .expect("initial eval");
    // One-iteration budget: a single rejected trial, then budget exhaustion.
    let result = run_operator_trust_region(
        &mut obj,
        &seed,
        OuterThetaLayout::new(1, 0),
        None,
        1e-12,
        1,
        initial_eval,
        OuterPlan {
            solver: Solver::Arc,
            hessian_source: HessianSource::Analytic,
        },
        None,
    )
    .expect("operator ARC attempt");
    assert!(
        !result.converged,
        "single-iteration fixture should reject and exhaust its budget"
    );
    assert_eq!(
        cost_calls.load(std::sync::atomic::Ordering::Relaxed),
        1,
        "one rejected trial should require exactly one cost-only evaluation"
    );
    assert_eq!(
        full_calls.load(std::sync::atomic::Ordering::Relaxed),
        1,
        "rejected trial must not request gradient/Hessian; only initial eval should be full"
    );
}
#[test]
fn operator_arc_reuses_dense_model_after_rejection() {
    // Identity-operator Hessian that counts per-vector HVPs separately from
    // batched dense materializations (mul_mat), so we can verify which path
    // the trust-region loop uses after a rejected step.
    struct CountingIdentityOuterHessianOperator {
        matvec_calls: Arc<std::sync::atomic::AtomicUsize>,
        batched_mul_mat_calls: Arc<std::sync::atomic::AtomicUsize>,
    }
    impl OuterHessianOperator for CountingIdentityOuterHessianOperator {
        fn dim(&self) -> usize {
            1
        }
        fn matvec(&self, v: &Array1<f64>) -> Result<Array1<f64>, String> {
            self.matvec_calls
                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            Ok(v.clone())
        }
        fn mul_mat(&self, factor: ArrayView2<'_, f64>) -> Result<Array2<f64>, String> {
            self.batched_mul_mat_calls
                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            Ok(factor.to_owned())
        }
        fn materialization_capability(&self) -> OuterHessianMaterialization {
            OuterHessianMaterialization::BatchedHvp
        }
    }
    // Objective engineered so every trial step is rejected
    // (cost = |theta[0]|, gradient fixed at +1).
    struct RejectingObjective {
        matvec_calls: Arc<std::sync::atomic::AtomicUsize>,
        batched_mul_mat_calls: Arc<std::sync::atomic::AtomicUsize>,
    }
    impl OuterObjective for RejectingObjective {
        fn capability(&self) -> OuterCapability {
            OuterCapability {
                gradient: Derivative::Analytic,
                hessian: Derivative::Analytic,
                n_params: 1,
                psi_dim: 0,
                fixed_point_available: false,
                barrier_config: None,
                prefer_gradient_only: false,
                disable_fixed_point: false,
            }
        }
        fn eval_cost(&mut self, theta: &Array1<f64>) -> Result<f64, EstimationError> {
            Ok(theta[0].abs())
        }
        fn eval(&mut self, theta: &Array1<f64>) -> Result<OuterEval, EstimationError> {
            self.eval_with_order(theta, OuterEvalOrder::ValueGradientHessian)
        }
        fn eval_with_order(
            &mut self,
            theta: &Array1<f64>,
            _: OuterEvalOrder,
        ) -> Result<OuterEval, EstimationError> {
            Ok(OuterEval {
                cost: theta[0].abs(),
                gradient: array![1.0],
                hessian: HessianResult::Operator(Arc::new(
                    CountingIdentityOuterHessianOperator {
                        matvec_calls: Arc::clone(&self.matvec_calls),
                        batched_mul_mat_calls: Arc::clone(&self.batched_mul_mat_calls),
                    },
                )),
            })
        }
        fn reset(&mut self) {}
    }
    let matvec_calls = Arc::new(std::sync::atomic::AtomicUsize::new(0));
    let batched_mul_mat_calls = Arc::new(std::sync::atomic::AtomicUsize::new(0));
    let mut obj = RejectingObjective {
        matvec_calls: Arc::clone(&matvec_calls),
        batched_mul_mat_calls: Arc::clone(&batched_mul_mat_calls),
    };
    let seed = array![0.0];
    let initial_eval = obj
        .eval_with_order(&seed, OuterEvalOrder::ValueGradientHessian)
        .expect("initial eval");
    // Three iterations: first rejection, then dense-model reuse on retries.
    let result = run_operator_trust_region(
        &mut obj,
        &seed,
        OuterThetaLayout::new(1, 0),
        None,
        1e-12,
        3,
        initial_eval,
        OuterPlan {
            solver: Solver::Arc,
            hessian_source: HessianSource::Analytic,
        },
        None,
    )
    .expect("operator ARC attempt");
    assert!(!result.converged);
    assert_eq!(
        matvec_calls.load(std::sync::atomic::Ordering::Relaxed),
        2,
        "first rejected step uses two HVPs; batched dense materialization must not \
        fall back to an extra per-column matvec"
    );
    assert_eq!(
        batched_mul_mat_calls.load(std::sync::atomic::Ordering::Relaxed),
        1,
        "post-rejection dense ARC reuse must materialize through the operator's \
        batched mul_mat path exactly once"
    );
}
#[test]
fn candidate_selection_prefers_lower_cost_within_same_convergence_class() {
    // Builds a BFGS-plan result with the given rho / cost / gradient norm /
    // convergence flag; every other field is held constant.
    let make_result = |rho: Array1<f64>, final_value: f64, final_grad_norm: f64, converged: bool| {
        OuterResult {
            rho,
            final_value,
            iterations: 1,
            final_grad_norm,
            final_gradient: None,
            final_hessian: None,
            converged,
            plan_used: OuterPlan {
                solver: Solver::Bfgs,
                hessian_source: HessianSource::BfgsApprox,
            },
            operator_trust_radius: None,
            operator_stop_reason: None,
        }
    };
    let nonconverged_hi = make_result(array![0.0], 9.0, 1.0, false);
    let nonconverged_lo = make_result(array![1.0], 1.0, 1.0, false);
    let converged = make_result(array![2.0], 5.0, 0.0, true);
    // Any candidate beats no incumbent at all.
    assert!(candidate_improves_best(&nonconverged_hi, None));
    // Within the non-converged class, lower cost wins (and only lower cost).
    assert!(candidate_improves_best(
        &nonconverged_lo,
        Some(&nonconverged_hi)
    ));
    assert!(!candidate_improves_best(
        &nonconverged_hi,
        Some(&nonconverged_lo)
    ));
    // Convergence trumps cost across classes.
    assert!(candidate_improves_best(&converged, Some(&nonconverged_lo)));
    assert!(!candidate_improves_best(&nonconverged_lo, Some(&converged)));
}
#[test]
fn run_starts_solver_with_direct_startup_eval() {
    let mut seed_config = crate::seeding::SeedConfig::default();
    seed_config.seed_budget = 1;
    // Records whether each hook invocation was a cost-only screening call
    // ("cost") or a full evaluation ("eval"), in order.
    let calls = Arc::new(Mutex::new(Vec::new()));
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Analytic)
        .with_seed_config(seed_config)
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        {
            let calls = Arc::clone(&calls);
            move |_: &mut (), theta: &Array1<f64>| {
                calls.lock().unwrap().push("cost");
                Ok(theta[0] * theta[0])
            }
        },
        {
            let calls = Arc::clone(&calls);
            move |_: &mut (), theta: &Array1<f64>| {
                calls.lock().unwrap().push("eval");
                Ok(OuterEval {
                    cost: theta[0] * theta[0],
                    gradient: array![2.0 * theta[0]],
                    hessian: HessianResult::Analytic(array![[2.0]]),
                })
            }
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    problem
        .run(&mut obj, "solver should start from a direct startup eval")
        .expect("analytic plans should start with a direct full evaluation");
    // The very first recorded call must be the full eval: no cost-only
    // screening pass may precede it.
    let calls = calls.lock().unwrap();
    let first_eval_idx = calls
        .iter()
        .position(|call| *call == "eval")
        .expect("solver should eventually request a full eval");
    assert!(
        first_eval_idx == 0,
        "startup should not perform a separate cost-screening pass first: {calls:?}"
    );
}
#[test]
fn run_screening_reorders_expensive_seeds_before_full_startup_eval() {
    // The GeneralizedLinear risk profile enables cost screening of the seed
    // list: the cheap generated seed must be moved ahead of the expensive
    // explicit initial seed before the first full startup eval runs.
    let mut seed_config = crate::seeding::SeedConfig::default();
    seed_config.seed_budget = 1;
    seed_config.risk_profile = crate::seeding::SeedRiskProfile::GeneralizedLinear;
    let screening_cap = Arc::new(AtomicUsize::new(0));
    // Explicit seed the cost closure below penalizes heavily.
    let initial_seed = array![9.0];
    let valid_seed = crate::seeding::generate_rho_candidates(1, None, &seed_config)
        .first()
        .expect("seed generator should yield at least one candidate")
        .clone();
    // Records every theta handed to the full eval closure, in order.
    let started = Arc::new(Mutex::new(Vec::new()));
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Analytic)
        .with_seed_config(seed_config)
        .with_screening_cap(Arc::clone(&screening_cap))
        .with_initial_rho(initial_seed.clone())
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        {
            let valid_seed = valid_seed.clone();
            // Cheap for the generated seed, very expensive for anything else.
            move |_: &mut (), theta: &Array1<f64>| {
                if theta == &valid_seed { Ok(0.0) } else { Ok(1000.0) }
            }
        },
        {
            let valid_seed = valid_seed.clone();
            let started = Arc::clone(&started);
            move |_: &mut (), theta: &Array1<f64>| {
                started.lock().unwrap().push(theta.clone());
                if theta == &valid_seed {
                    Ok(OuterEval {
                        cost: 0.0,
                        gradient: array![0.0],
                        hessian: HessianResult::Analytic(array![[1.0]]),
                    })
                } else {
                    Ok(OuterEval::infeasible(theta.len()))
                }
            }
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let result = problem
        .run(&mut obj, "screening should reorder expensive seeds")
        .expect("screened startup should reach the best generated seed");
    assert_eq!(result.rho, valid_seed);
    assert_eq!(
        started.lock().unwrap().first().cloned(),
        Some(valid_seed),
        "screening should move the lowest-cost seed to the front before full startup eval",
    );
    // Use the module-wide `Ordering` import for consistency with the sibling
    // tests instead of the fully-qualified path. Cap 0 means "no cap active".
    assert_eq!(screening_cap.load(Ordering::Relaxed), 0);
}
#[test]
fn rank_seeds_cascade_escalates_when_initial_cap_collapses_all() {
    // Screening ranks seeds under a capped inner-iteration budget. This test
    // forces every cost evaluated under a cap below 12 to NaN, so the ranking
    // cascade must escalate past the early stages and stop at the 4x stage
    // (4 * screen_max_inner_iterations = 12) where a finite cost first
    // appears. NOTE(review): the exact stage multipliers (1x/2x/4x?) are
    // inferred from this test's expectations — confirm against rank_seeds.
    let mut seed_config = crate::seeding::SeedConfig::default();
    seed_config.seed_budget = 1;
    seed_config.screen_max_inner_iterations = 3;
    // Shared cap cell observed by the cost closure; 0 means "no cap active".
    let screening_cap = Arc::new(AtomicUsize::new(0));
    // Deliberately bad explicit seed so the generated candidate must win.
    let initial_seed = array![5.0];
    let valid_seed = crate::seeding::generate_rho_candidates(1, None, &seed_config)
        .first()
        .expect("seed generator should yield at least one candidate")
        .clone();
    // Tracks the largest cap value the cost closure ever observed.
    let max_cap_seen = Arc::new(AtomicUsize::new(0));
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Analytic)
        .with_seed_config(seed_config)
        .with_screening_cap(Arc::clone(&screening_cap))
        .with_initial_rho(initial_seed.clone())
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        {
            let screening_cap = Arc::clone(&screening_cap);
            let max_cap_seen = Arc::clone(&max_cap_seen);
            let valid_seed = valid_seed.clone();
            move |_: &mut (), theta: &Array1<f64>| {
                let cap = screening_cap.load(Ordering::Relaxed);
                max_cap_seen.fetch_max(cap, Ordering::Relaxed);
                // Collapse every capped stage below 12 to NaN so the cascade
                // is forced to keep escalating; uncapped (0) and cap >= 12
                // evaluate normally.
                if cap > 0 && cap < 12 {
                    return Ok(f64::NAN);
                }
                if theta == &valid_seed {
                    Ok(0.0)
                } else {
                    Ok(1000.0)
                }
            }
        },
        {
            let valid_seed = valid_seed.clone();
            move |_: &mut (), theta: &Array1<f64>| {
                if theta == &valid_seed {
                    Ok(OuterEval {
                        cost: 0.0,
                        gradient: array![0.0],
                        hessian: HessianResult::Analytic(array![[1.0]]),
                    })
                } else {
                    Ok(OuterEval::infeasible(theta.len()))
                }
            }
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let _ = problem
        .run(&mut obj, "cascade should escalate")
        .expect("cascade should reach a finite cost at the 4× cap stage");
    let max_cap = max_cap_seen.load(Ordering::Relaxed);
    assert_eq!(
        max_cap, 12,
        "cascade should stop at the 4× cap stage; observed max cap = {max_cap}"
    );
    // The cascade temporarily mutates the shared cap; it must restore the
    // pre-cascade value (0 = uncapped) before returning.
    assert_eq!(
        screening_cap.load(Ordering::Relaxed),
        0,
        "screening cap must be restored to its previous value after cascade"
    );
}
#[test]
fn run_efs_skips_global_cost_screening() {
    // EFS plans order their own seeds, so startup must not spend eval_cost
    // calls on a global cost-screening pass. The cost closure counts calls;
    // the test expects exactly zero.
    let mut seed_config = crate::seeding::SeedConfig::default();
    seed_config.max_seeds = 6;
    seed_config.seed_budget = 1;
    let screening_calls = Arc::new(AtomicUsize::new(0));
    // 15 params with no derivatives available routes planning to EFS.
    let problem = OuterProblem::new(15)
        .with_gradient(Derivative::Unavailable)
        .with_hessian(Derivative::Unavailable)
        .with_seed_config(seed_config)
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        {
            let screening_calls = Arc::clone(&screening_calls);
            move |_: &mut (), _: &Array1<f64>| {
                // Bare `Ordering` for consistency with the module-wide import
                // used by the sibling tests.
                screening_calls.fetch_add(1, Ordering::Relaxed);
                Ok(0.0)
            }
        },
        |_: &mut (), theta: &Array1<f64>| Ok(OuterEval::infeasible(theta.len())),
        None::<fn(&mut ())>,
        Some(|_: &mut (), theta: &Array1<f64>| {
            // Trivially successful EFS step so the first seed suffices.
            Ok(EfsEval {
                cost: 0.0,
                steps: vec![0.0; theta.len()],
                beta: None,
                psi_gradient: None,
                psi_indices: None,
            })
        }),
    );
    problem
        .run(
            &mut obj,
            "EFS should not use a separate global cost-screening pass",
        )
        .expect("first generated EFS seed should be sufficient");
    assert_eq!(
        screening_calls.load(Ordering::Relaxed),
        0,
        "EFS startup should not call eval_cost just to screen seeds"
    );
}
#[test]
fn run_efs_skips_invalid_leading_seed_without_spending_budget() {
    // With an EFS seed budget of exactly 1, an invalid explicit initial seed
    // must be skipped without consuming the single budget slot — otherwise
    // the valid generated seed would never be reached.
    let generated = crate::seeding::generate_rho_candidates(
        15,
        None,
        &crate::seeding::SeedConfig::default(),
    );
    let valid_seed = generated
        .first()
        .expect("seed generator should yield at least one candidate")
        .clone();
    // A constant-9 vector is assumed not to collide with any generated
    // candidate, so the EFS closure below rejects it.
    let invalid_seed = Array1::from_elem(15, 9.0);
    let mut seed_config = crate::seeding::SeedConfig::default();
    seed_config.seed_budget = 1;
    let problem = OuterProblem::new(15)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Unavailable)
        .with_seed_config(seed_config)
        .with_initial_rho(invalid_seed)
        .with_max_iter(1);
    let mut obj = problem.build_objective(
        (),
        |_: &mut (), _: &Array1<f64>| Ok(0.0),
        // Full eval is always infeasible, so success can only come via EFS.
        |_: &mut (), theta: &Array1<f64>| Ok(OuterEval::infeasible(theta.len())),
        None::<fn(&mut ())>,
        {
            let valid_seed = valid_seed.clone();
            Some(move |_: &mut (), theta: &Array1<f64>| {
                if theta == &valid_seed {
                    Ok(EfsEval {
                        cost: 0.0,
                        steps: vec![0.0; theta.len()],
                        beta: None,
                        psi_gradient: None,
                        psi_indices: None,
                    })
                } else {
                    // Any other seed (notably the invalid explicit one) fails.
                    Err(EstimationError::RemlOptimizationFailed(
                        "invalid EFS seed".to_string(),
                    ))
                }
            })
        },
    );
    let result = problem
        .run(&mut obj, "efs generated seed should remain reachable")
        .expect("invalid startup seeds should not consume the only EFS seed slot");
    // The run must land on the generated seed, and via the EFS solver —
    // not via a fallback plan.
    assert_eq!(result.rho, valid_seed);
    assert_eq!(result.plan_used.solver, Solver::Efs);
}
#[test]
fn run_efs_runtime_fallback_marker_degrades_to_bfgs_immediately() {
    // An EFS error message carrying EFS_FIRST_ORDER_FALLBACK_MARKER is a
    // runtime escape hatch: the run must abort EFS after a single attempt
    // (not retry remaining seeds) and degrade to first-order BFGS.
    let mut seed_config = crate::seeding::SeedConfig::default();
    // Two seeds available — the marker must still stop EFS after one call.
    seed_config.seed_budget = 2;
    let efs_calls = Arc::new(AtomicUsize::new(0));
    let problem = OuterProblem::new(12)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Unavailable)
        .with_seed_config(seed_config)
        .with_initial_rho(Array1::zeros(12))
        .with_max_iter(5);
    let mut obj = problem.build_objective(
        (),
        // Smooth convex cost 0.5 * |theta|^2 so BFGS can make progress.
        |_: &mut (), theta: &Array1<f64>| Ok(0.5 * theta.dot(theta)),
        |_: &mut (), theta: &Array1<f64>| {
            Ok(OuterEval {
                cost: 0.5 * theta.dot(theta),
                gradient: theta.clone(),
                hessian: HessianResult::Unavailable,
            })
        },
        None::<fn(&mut ())>,
        {
            let efs_calls = Arc::clone(&efs_calls);
            Some(move |_: &mut (), _: &Array1<f64>| {
                // Bare `Ordering` for consistency with the module-wide import.
                efs_calls.fetch_add(1, Ordering::Relaxed);
                Err(EstimationError::RemlOptimizationFailed(format!(
                    "{} synthetic runtime escape hatch",
                    EFS_FIRST_ORDER_FALLBACK_MARKER,
                )))
            })
        },
    );
    let result = problem
        .run(&mut obj, "efs runtime fallback marker")
        .expect("runtime EFS escape hatch should degrade to BFGS");
    assert_eq!(result.plan_used.solver, Solver::Bfgs);
    assert_eq!(
        efs_calls.load(Ordering::Relaxed),
        1,
        "runtime fallback marker should abort the EFS attempt immediately"
    );
}
#[test]
fn run_rejects_invalid_theta_layout() {
    // A psi dimension of 2 is incompatible with a 1-dimensional theta, so
    // `run` must fail up-front with a layout error rather than proceeding.
    let problem = OuterProblem::new(1)
        .with_gradient(Derivative::Analytic)
        .with_hessian(Derivative::Unavailable)
        .with_psi_dim(2)
        .with_initial_rho(Array1::zeros(1))
        .with_max_iter(1);
    // Trivial closures: they should never be reached once the layout check fires.
    let cost_fn = |_: &mut (), _: &Array1<f64>| Ok(0.0);
    let eval_fn = |_: &mut (), _: &Array1<f64>| {
        Ok(OuterEval {
            cost: 0.0,
            gradient: Array1::zeros(1),
            hessian: HessianResult::Unavailable,
        })
    };
    let mut obj = problem.build_objective(
        (),
        cost_fn,
        eval_fn,
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let err = problem
        .run(&mut obj, "test invalid layout")
        .expect_err("invalid theta layout should fail cleanly");
    assert!(
        err.to_string().contains("invalid outer theta layout"),
        "unexpected error: {err}"
    );
}
#[test]
fn effective_seed_budget_caps_expensive_solver_retries() {
    // Expensive solvers (EFS, hybrid EFS, ARC under risk or the flag) are
    // clamped to a single seed; cheap BFGS keeps the requested budget.
    use crate::seeding::SeedRiskProfile;
    let cases = [
        (4, Solver::Efs, SeedRiskProfile::GeneralizedLinear, false, 1),
        (4, Solver::HybridEfs, SeedRiskProfile::Survival, false, 1),
        (3, Solver::Arc, SeedRiskProfile::GeneralizedLinear, true, 1),
        (3, Solver::Arc, SeedRiskProfile::Survival, false, 1),
        (3, Solver::Bfgs, SeedRiskProfile::Survival, false, 3),
    ];
    for (requested, solver, risk, flag, expected) in cases {
        assert_eq!(effective_seed_budget(requested, solver, risk, flag), expected);
    }
}
// Capability fixture with no derivatives and every optional feature off,
// used by the auxiliary-plan routing tests below.
fn aux_cap_unavailable(n_params: usize) -> OuterCapability {
    OuterCapability {
        n_params,
        psi_dim: 0,
        gradient: Derivative::Unavailable,
        hessian: Derivative::Unavailable,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    }
}
#[test]
fn plan_with_class_primary_is_identical_to_plan_for_unavailable_grad() {
    // The Primary class must behave as a pure alias for the default planner.
    let capability = aux_cap_unavailable(3);
    let via_class = plan_with_class(&capability, SolverClass::Primary);
    assert_eq!(via_class, plan(&capability));
}
#[test]
fn plan_with_class_aux_unavailable_routes_to_compass_search() {
    // With no derivatives at all, the auxiliary gradient-free class can only
    // fall back to compass search.
    let capability = aux_cap_unavailable(3);
    let planned = plan_with_class(&capability, SolverClass::AuxiliaryGradientFree);
    assert_eq!(planned.solver, Solver::CompassSearch);
}
#[test]
fn plan_with_class_aux_analytic_grad_defers_to_primary_plan() {
    // Analytic gradient + Hessian: the auxiliary class should defer to the
    // primary plan, which selects ARC for this capability.
    let capability = OuterCapability {
        n_params: 3,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let planned = plan_with_class(&capability, SolverClass::AuxiliaryGradientFree);
    assert_eq!(planned.solver, Solver::Arc);
}
#[test]
fn plan_with_class_aux_efs_eligible_defers_to_primary() {
    // Gradient-only capability with a fixed point available is EFS-eligible;
    // the auxiliary class should still defer to the primary plan's choice.
    let capability = OuterCapability {
        n_params: 12,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Unavailable,
        fixed_point_available: true,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let planned = plan_with_class(&capability, SolverClass::AuxiliaryGradientFree);
    assert_eq!(planned.solver, Solver::Efs);
}
#[test]
fn automatic_fallback_never_includes_compass_search() {
    // Fully analytic capability: every automatic fallback attempt must still
    // plan a gradient-based solver, never the gradient-free compass search.
    let cap = OuterCapability {
        n_params: 5,
        psi_dim: 0,
        gradient: Derivative::Analytic,
        hessian: Derivative::Analytic,
        fixed_point_available: false,
        disable_fixed_point: false,
        prefer_gradient_only: false,
        barrier_config: None,
    };
    let solvers: Vec<_> = automatic_fallback_attempts(&cap)
        .iter()
        .map(|attempt_cap| plan_with_class(attempt_cap, SolverClass::Primary).solver)
        .collect();
    for solver in &solvers {
        assert_ne!(*solver, Solver::CompassSearch);
    }
}
#[test]
fn compass_search_budget_accounts_for_single_seed() {
    // Compass search is expensive enough that only a single seed is ever
    // attempted, regardless of the configured budget.
    assert_eq!(
        effective_seed_budget(
            8,
            Solver::CompassSearch,
            crate::seeding::SeedRiskProfile::Survival,
            false,
        ),
        1
    );
}
#[test]
fn run_aux_compass_projects_seed_before_seed_cost() {
    // The explicit seed (2.0) lies outside the [0, 1] bounds; the auxiliary
    // direct-search path must project it onto the feasible box (to 1.0)
    // BEFORE evaluating its cost. With cost (theta - 2)^2 the feasible
    // optimum is the boundary point 1.0 with value 1.0.
    let seen = Arc::new(Mutex::new(Vec::new()));
    let problem = OuterProblem::new(1)
        .with_solver_class(SolverClass::AuxiliaryGradientFree)
        .with_bounds(array![0.0], array![1.0])
        .with_initial_rho(array![2.0])
        .with_max_iter(64);
    let mut obj = problem.build_objective(
        (),
        {
            let seen = Arc::clone(&seen);
            // Records every theta whose cost is requested, in order, so the
            // assertion below can inspect the very first evaluation.
            move |_: &mut (), theta: &Array1<f64>| {
                seen.lock().unwrap().push(theta.clone());
                Ok((theta[0] - 2.0).powi(2))
            }
        },
        // The gradient-free path must never request a full eval.
        |_: &mut (), _: &Array1<f64>| {
            Err(EstimationError::InvalidInput(
                "aux direct-search test should not call eval".to_string(),
            ))
        },
        None::<fn(&mut ())>,
        None::<fn(&mut (), &Array1<f64>) -> Result<EfsEval, EstimationError>>,
    );
    let result = problem
        .run(&mut obj, "aux direct-search seed projection")
        .expect("aux direct-search should evaluate the projected seed");
    assert_eq!(result.plan_used.solver, Solver::CompassSearch);
    assert_eq!(result.rho, array![1.0]);
    assert_eq!(result.final_value, 1.0);
    // First cost evaluation must already be the projected point, proving the
    // projection happened before any cost was computed.
    assert_eq!(
        seen.lock().unwrap().first().cloned(),
        Some(array![1.0]),
        "aux direct-search must project the seed before evaluating its cost",
    );
}
}